├── baselines ├── __init__.py ├── a2c │ ├── __init__.py │ ├── README.md │ ├── run_atari.py │ ├── policies.py │ ├── a2c.py │ └── utils.py ├── acktr │ ├── __init__.py │ ├── README.md │ ├── running_stat.py │ ├── run_mujoco.py │ ├── run_atari.py │ ├── value_functions.py │ ├── filters.py │ ├── policies.py │ ├── kfac_utils.py │ ├── acktr_cont.py │ └── utils.py ├── ddpg │ ├── __init__.py │ ├── README.md │ ├── util.py │ ├── noise.py │ ├── models.py │ ├── memory.py │ ├── main.py │ └── training.py ├── ppo1 │ ├── __init__.py │ ├── README.md │ ├── run_mujoco.py │ ├── run_atari.py │ ├── cnn_policy.py │ └── mlp_policy.py ├── trpo_mpi │ ├── __init__.py │ ├── README.md │ ├── run_mujoco.py │ ├── run_atari.py │ └── nosharing_cnn_policy.py ├── deepq │ ├── experiments │ │ ├── __init__.py │ │ ├── atari │ │ │ ├── __init__.py │ │ │ ├── download_model.py │ │ │ ├── enjoy.py │ │ │ ├── model.py │ │ │ └── wang2015_eval.py │ │ ├── enjoy_cartpole.py │ │ ├── enjoy_mountaincar.py │ │ ├── enjoy_pong.py │ │ ├── train_mountaincar.py │ │ ├── train_cartpole.py │ │ ├── train_pong.py │ │ └── custom_cartpole.py │ ├── __init__.py │ ├── README.md │ ├── models.py │ └── replay_buffer.py ├── bench │ ├── __init__.py │ ├── monitor.py │ └── benchmarks.py ├── common │ ├── __init__.py │ ├── vec_env │ │ ├── __init__.py │ │ └── subproc_vec_env.py │ ├── mpi_fork.py │ ├── tests │ │ ├── test_schedules.py │ │ ├── test_tf_util.py │ │ └── test_segment_tree.py │ ├── cg.py │ ├── console_util.py │ ├── mpi_moments.py │ ├── dataset.py │ ├── math_util.py │ ├── mpi_adam.py │ ├── mpi_running_mean_std.py │ ├── schedules.py │ ├── segment_tree.py │ ├── azure_utils.py │ ├── atari_wrappers.py │ └── atari_wrappers_deprecated.py └── logger.py ├── data ├── logo.jpg └── cartpole.gif ├── .gitignore ├── README.md ├── setup.py └── LICENSE /baselines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/a2c/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/acktr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/ddpg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/ppo1/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/trpo_mpi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/atari/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/logo.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/mimoralea/baselines/master/data/logo.jpg -------------------------------------------------------------------------------- /data/cartpole.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mimoralea/baselines/master/data/cartpole.gif -------------------------------------------------------------------------------- /baselines/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.bench.benchmarks import * 2 | from baselines.bench.monitor import * 3 | 4 | -------------------------------------------------------------------------------- /baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.common.console_util import * 2 | from baselines.common.dataset import Dataset 3 | from baselines.common.math_util import * 4 | from baselines.common.misc_util import * 5 | -------------------------------------------------------------------------------- /baselines/a2c/README.md: -------------------------------------------------------------------------------- 1 | # A2C 2 | 3 | - Original paper: https://arxiv.org/abs/1602.01783 4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ 5 | - `python -m baselines.a2c.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. -------------------------------------------------------------------------------- /baselines/deepq/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.deepq import models # noqa 2 | from baselines.deepq.build_graph import build_act, build_train # noqa 3 | 4 | from baselines.deepq.simple import learn, load # noqa 5 | from baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa 6 | -------------------------------------------------------------------------------- /baselines/acktr/README.md: -------------------------------------------------------------------------------- 1 | # ACKTR 2 | 3 | - Original paper: https://arxiv.org/abs/1708.05144 4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ 5 | - `python -m baselines.acktr.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. -------------------------------------------------------------------------------- /baselines/ddpg/README.md: -------------------------------------------------------------------------------- 1 | # DDPG 2 | 3 | - Original paper: https://arxiv.org/abs/1509.02971 4 | - Baselines post: https://blog.openai.com/better-exploration-with-parameter-noise/ 5 | - `python -m baselines.ddpg.main` runs the algorithm for 1M frames = 10M timesteps on a Mujoco environment. See help (`-h`) for more options. -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | *.pyc 3 | *.py~ 4 | .DS_Store 5 | .idea 6 | 7 | # Setuptools distribution and build folders. 
8 | /dist/ 9 | /build 10 | keys/ 11 | 12 | # Virtualenv 13 | /env 14 | 15 | 16 | *.sublime-project 17 | *.sublime-workspace 18 | 19 | .idea 20 | 21 | logs/ 22 | 23 | .ipynb_checkpoints 24 | ghostdriver.log 25 | 26 | htmlcov 27 | 28 | junk 29 | src 30 | 31 | *.egg-info 32 | .cache 33 | -------------------------------------------------------------------------------- /baselines/ppo1/README.md: -------------------------------------------------------------------------------- 1 | # PPOSGD 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | - `mpirun -np 8 python -m baselines.ppo1.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. 6 | - `python -m baselines.ppo1.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment. 7 | 8 | -------------------------------------------------------------------------------- /baselines/trpo_mpi/README.md: -------------------------------------------------------------------------------- 1 | # trpo_mpi 2 | 3 | - Original paper: https://arxiv.org/abs/1502.05477 4 | - Baselines blog post https://blog.openai.com/openai-baselines-ppo/ 5 | - `mpirun -np 16 python -m baselines.trpo_mpi.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. 6 | - `python -m baselines.trpo_mpi.run_mujoco` runs the algorithm for 1M timesteps on a Mujoco environment. 7 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("CartPole-v0") 8 | act = deepq.load("cartpole_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_mountaincar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("MountainCar-v0") 8 | act = deepq.load("mountaincar_model.pkl") 9 | 10 | while True: 11 | obs, done = env.reset(), False 12 | episode_rew = 0 13 | while not done: 14 | env.render() 15 | obs, rew, done, _ = env.step(act(obs[None])[0]) 16 | episode_rew += rew 17 | print("Episode reward", episode_rew) 18 | 19 | 20 | if __name__ == '__main__': 21 | main() 22 | -------------------------------------------------------------------------------- /baselines/common/vec_env/__init__.py: -------------------------------------------------------------------------------- 1 | class VecEnv(object): 2 | """ 3 | Vectorized environment base class 4 | """ 5 | def step(self, vac): 6 | """ 7 | Apply sequence of actions to sequence of environments 8 | actions -> (observations, rewards, news) 9 | 10 | where 'news' is a boolean vector indicating whether each element is new. 
11 | """ 12 | raise NotImplementedError 13 | def reset(self): 14 | """ 15 | Reset all environments 16 | """ 17 | raise NotImplementedError 18 | def close(self): 19 | pass -------------------------------------------------------------------------------- /baselines/deepq/experiments/enjoy_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame 5 | 6 | 7 | def main(): 8 | env = gym.make("PongNoFrameskip-v4") 9 | env = ScaledFloatFrame(wrap_dqn(env)) 10 | act = deepq.load("pong_model.pkl") 11 | 12 | while True: 13 | obs, done = env.reset(), False 14 | episode_rew = 0 15 | while not done: 16 | env.render() 17 | obs, rew, done, _ = env.step(act(obs[None])[0]) 18 | episode_rew += rew 19 | print("Episode reward", episode_rew) 20 | 21 | 22 | if __name__ == '__main__': 23 | main() 24 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_mountaincar.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def main(): 7 | env = gym.make("MountainCar-v0") 8 | # Enabling layer_norm here is import for parameter space noise! 9 | model = deepq.models.mlp([64], layer_norm=True) 10 | act = deepq.learn( 11 | env, 12 | q_func=model, 13 | lr=1e-3, 14 | max_timesteps=100000, 15 | buffer_size=50000, 16 | exploration_fraction=0.1, 17 | exploration_final_eps=0.1, 18 | print_freq=10, 19 | param_noise=True 20 | ) 21 | print("Saving model to mountaincar_model.pkl") 22 | act.save("mountaincar_model.pkl") 23 | 24 | 25 | if __name__ == '__main__': 26 | main() 27 | -------------------------------------------------------------------------------- /baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_cartpole.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | 5 | 6 | def callback(lcl, glb): 7 | # stop training if reward exceeds 199 8 | is_solved = lcl['t'] > 100 and sum(lcl['episode_rewards'][-101:-1]) / 100 >= 199 9 | return is_solved 10 | 11 | 12 | def main(): 13 | env = gym.make("CartPole-v0") 14 | model = deepq.models.mlp([64]) 15 | act = deepq.learn( 16 | env, 17 | q_func=model, 18 | lr=1e-3, 19 | max_timesteps=100000, 20 | buffer_size=50000, 21 | exploration_fraction=0.1, 22 | exploration_final_eps=0.02, 23 | print_freq=10, 24 | callback=callback 25 | ) 26 | print("Saving model to cartpole_model.pkl") 27 | act.save("cartpole_model.pkl") 28 | 29 | 30 | if __name__ == '__main__': 31 | main() 32 | 
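
The `callback` hook in `train_cartpole.py` above receives the training loop's locals and globals (`lcl`, `glb`) on each step, and returning `True` stops training — that is how the script halts once CartPole is considered solved. Below is a minimal sketch (not a file from the repo) of a variant callback; it assumes only the same `lcl['t']` and `lcl['episode_rewards']` entries the script already relies on, and the one-hour wall-clock cap is an illustrative choice:

```python
import time

TRAIN_START = time.time()

def callback(lcl, glb):
    # Solved: mean reward over the last 100 completed episodes is at least 199.
    rewards = lcl['episode_rewards']
    solved = lcl['t'] > 100 and sum(rewards[-101:-1]) / 100 >= 199
    # Safety net: also stop after one hour of wall-clock training time.
    out_of_time = time.time() - TRAIN_START > 3600
    return solved or out_of_time
```
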
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Baselines 4 | 5 | OpenAI Baselines is a set of high-quality implementations of reinforcement learning algorithms. 6 | 7 | These algorithms will make it easier for the research community to replicate, refine, and identify new ideas, and will create good baselines to build research on top of. Our DQN implementation and its variants are roughly on par with the scores in published papers. We expect they will be used as a base around which new ideas can be added, and as a tool for comparing a new approach against existing ones. 8 | 9 | You can install it by typing: 10 | 11 | ```bash 12 | git clone https://github.com/openai/baselines.git 13 | cd baselines 14 | pip install -e . 15 | ``` 16 | 17 | - [A2C](baselines/a2c) 18 | - [ACKTR](baselines/acktr) 19 | - [DDPG](baselines/ddpg) 20 | - [DQN](baselines/deepq) 21 | - [PPO](baselines/ppo1) 22 | - [TRPO](baselines/trpo_mpi) 23 | -------------------------------------------------------------------------------- /baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9), 80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /baselines/deepq/experiments/train_pong.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from baselines import deepq 4 | from baselines.common.atari_wrappers_deprecated import wrap_dqn, ScaledFloatFrame 5 | 6 | 7 | def main(): 8 | env = gym.make("PongNoFrameskip-v4") 9 | env = ScaledFloatFrame(wrap_dqn(env)) 10 | model = deepq.models.cnn_to_mlp( 11 | convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], 12 | hiddens=[256], 13 | dueling=True 14 | ) 15 | act = deepq.learn( 16 | env, 17 | q_func=model, 18 | lr=1e-4, 19 | max_timesteps=2000000, 20 | buffer_size=10000, 21 | exploration_fraction=0.1, 22 | exploration_final_eps=0.01, 23 | train_freq=4, 24 | learning_starts=10000, 25 | target_network_update_freq=1000, 26 | gamma=0.99, 27 | prioritized_replay=True 28 | ) 29 | act.save("pong_model.pkl") 30 | env.close() 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import sys 3 | 4 | if sys.version_info.major != 3: 5 | print("This Python is only compatible with Python 3, but you are running " 6 | "Python {}. 
The installation will likely fail.".format(sys.version_info.major)) 7 | 8 | 9 | setup(name='baselines', 10 | packages=[package for package in find_packages() 11 | if package.startswith('baselines')], 12 | install_requires=[ 13 | 'gym[mujoco,atari,classic_control]', 14 | 'scipy', 15 | 'tqdm', 16 | 'joblib', 17 | 'zmq', 18 | 'dill', 19 | 'tensorflow >= 1.0.0', 20 | 'azure==1.0.3', 21 | 'progressbar2', 22 | 'mpi4py', 23 | ], 24 | description="OpenAI baselines: high quality implementations of reinforcement learning algorithms", 25 | author="OpenAI", 26 | url='https://github.com/openai/baselines', 27 | author_email="gym@openai.com", 28 | version="0.1.4") 29 | -------------------------------------------------------------------------------- /baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 
22 | -------------------------------------------------------------------------------- /baselines/ddpg/util.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | import gym 4 | import numpy as np 5 | import tensorflow as tf 6 | from mpi4py import MPI 7 | from baselines.common.mpi_moments import mpi_moments 8 | 9 | 10 | def reduce_var(x, axis=None, keepdims=False): 11 | m = tf.reduce_mean(x, axis=axis, keep_dims=True) 12 | devs_squared = tf.square(x - m) 13 | return tf.reduce_mean(devs_squared, axis=axis, keep_dims=keepdims) 14 | 15 | 16 | def reduce_std(x, axis=None, keepdims=False): 17 | return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims)) 18 | 19 | 20 | def mpi_mean(value): 21 | if value == []: 22 | value = [0.] 23 | if not isinstance(value, list): 24 | value = [value] 25 | return mpi_moments(np.array(value))[0][0] 26 | 27 | 28 | def mpi_std(value): 29 | if value == []: 30 | value = [0.] 31 | if not isinstance(value, list): 32 | value = [value] 33 | return mpi_moments(np.array(value))[1][0] 34 | 35 | 36 | def mpi_max(value): 37 | global_max = np.zeros(1, dtype='float64') 38 | local_max = np.max(value).astype('float64') 39 | MPI.COMM_WORLD.Reduce(local_max, global_max, op=MPI.MAX) 40 | return global_max[0] 41 | 42 | 43 | def mpi_sum(value): 44 | global_sum = np.zeros(1, dtype='float64') 45 | local_sum = np.sum(np.array(value)).astype('float64') 46 | MPI.COMM_WORLD.Reduce(local_sum, global_sum, op=MPI.SUM) 47 | return global_sum[0] 48 | -------------------------------------------------------------------------------- /baselines/acktr/running_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # http://www.johndcook.com/blog/standard_deviation/ 4 | class RunningStat(object): 5 | def __init__(self, shape): 6 | self._n = 0 7 | self._M = np.zeros(shape) 8 | self._S = np.zeros(shape) 9 | def push(self, x): 10 | x = np.asarray(x) 11 | assert x.shape == self._M.shape 12 | self._n += 1 13 | if self._n == 1: 14 | self._M[...] = x 15 | else: 16 | oldM = self._M.copy() 17 | self._M[...] = oldM + (x - oldM)/self._n 18 | self._S[...] 
= self._S + (x - oldM)*(x - self._M) 19 | @property 20 | def n(self): 21 | return self._n 22 | @property 23 | def mean(self): 24 | return self._M 25 | @property 26 | def var(self): 27 | return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) 28 | @property 29 | def std(self): 30 | return np.sqrt(self.var) 31 | @property 32 | def shape(self): 33 | return self._M.shape 34 | 35 | def test_running_stat(): 36 | for shp in ((), (3,), (3,4)): 37 | li = [] 38 | rs = RunningStat(shp) 39 | for _ in range(5): 40 | val = np.random.randn(*shp) 41 | rs.push(val) 42 | li.append(val) 43 | m = np.mean(li, axis=0) 44 | assert np.allclose(rs.mean, m) 45 | v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) 46 | assert np.allclose(rs.var, v) 47 | -------------------------------------------------------------------------------- /baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | def fmt_row(width, row, header=False): 11 | out = " | ".join(fmt_item(x, width) for x in row) 12 | if header: out = out + "\n" + "-"*len(out) 13 | return out 14 | 15 | def fmt_item(x, l): 16 | if isinstance(x, np.ndarray): 17 | assert x.ndim==0 18 | x = x.item() 19 | if isinstance(x, float): rep = "%g"%x 20 | else: rep = str(x) 21 | return " "*(l - len(rep)) + rep 22 | 23 | color2num = dict( 24 | gray=30, 25 | red=31, 26 | green=32, 27 | yellow=33, 28 | blue=34, 29 | magenta=35, 30 | cyan=36, 31 | white=37, 32 | crimson=38 33 | ) 34 | 35 | def colorize(string, color, bold=False, highlight=False): 36 | attr = [] 37 | num = color2num[color] 38 | if highlight: num += 10 39 | attr.append(str(num)) 40 | if bold: attr.append('1') 41 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 42 | 43 | 44 | MESSAGE_DEPTH = 0 45 | 46 | @contextmanager 47 | def timed(msg): 48 | global MESSAGE_DEPTH #pylint: disable=W0603 49 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 50 | tstart = time.time() 51 | MESSAGE_DEPTH += 1 52 | yield 53 | MESSAGE_DEPTH -= 1 54 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 55 | -------------------------------------------------------------------------------- /baselines/ppo1/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from baselines.common import set_global_seeds, tf_util as U 3 | from baselines import bench 4 | import os.path as osp 5 | import gym, logging 6 | from baselines import logger 7 | import sys 8 | 9 | def train(env_id, num_timesteps, seed): 10 | from baselines.ppo1 import mlp_policy, pposgd_simple 11 | U.make_session(num_cpu=1).__enter__() 12 | set_global_seeds(seed) 13 | env = gym.make(env_id) 14 | def policy_fn(name, ob_space, ac_space): 15 | return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space, 16 | hid_size=64, num_hid_layers=2) 17 | env = bench.Monitor(env, logger.get_dir() and 18 | osp.join(logger.get_dir(), "monitor.json")) 19 | env.seed(seed) 20 | gym.logger.setLevel(logging.WARN) 21 | pposgd_simple.learn(env, policy_fn, 22 | max_timesteps=num_timesteps, 23 | timesteps_per_batch=2048, 24 | clip_param=0.2, entcoeff=0.0, 25 | optim_epochs=10, 
optim_stepsize=3e-4, optim_batchsize=64, 26 | gamma=0.99, lam=0.95, schedule='linear', 27 | ) 28 | env.close() 29 | 30 | def main(): 31 | import argparse 32 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 33 | parser.add_argument('--env', help='environment ID', default='Hopper-v1') 34 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 35 | args = parser.parse_args() 36 | train(args.env, num_timesteps=1e6, seed=args.seed) 37 | 38 | 39 | if __name__ == '__main__': 40 | main() 41 | -------------------------------------------------------------------------------- /baselines/acktr/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import logging 4 | import os 5 | import tensorflow as tf 6 | import gym 7 | from baselines import logger 8 | from baselines.common import set_global_seeds 9 | from baselines import bench 10 | from baselines.acktr.acktr_cont import learn 11 | from baselines.acktr.policies import GaussianMlpPolicy 12 | from baselines.acktr.value_functions import NeuralNetValueFunction 13 | 14 | def train(env_id, num_timesteps, seed): 15 | env=gym.make(env_id) 16 | if logger.get_dir(): 17 | env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json")) 18 | set_global_seeds(seed) 19 | env.seed(seed) 20 | gym.logger.setLevel(logging.WARN) 21 | 22 | with tf.Session(config=tf.ConfigProto()) as session: 23 | ob_dim = env.observation_space.shape[0] 24 | ac_dim = env.action_space.shape[0] 25 | with tf.variable_scope("vf"): 26 | vf = NeuralNetValueFunction(ob_dim, ac_dim) 27 | with tf.variable_scope("pi"): 28 | policy = GaussianMlpPolicy(ob_dim, ac_dim) 29 | 30 | learn(env, policy=policy, vf=vf, 31 | gamma=0.99, lam=0.97, timesteps_per_batch=2500, 32 | desired_kl=0.002, 33 | num_timesteps=num_timesteps, animate=False) 34 | 35 | env.close() 36 | 37 | if __name__ == "__main__": 38 | parser = argparse.ArgumentParser(description='Run Mujoco benchmark.') 39 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 40 | parser.add_argument('--env', help='environment ID', type=str, default="Reacher-v1") 41 | args = parser.parse_args() 42 | train(args.env, num_timesteps=1e6, seed=args.seed) 43 | -------------------------------------------------------------------------------- /baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from baselines.common import zipsame 4 | 5 | def mpi_moments(x, axis=0): 6 | x = np.asarray(x, dtype='float64') 7 | newshape = list(x.shape) 8 | newshape.pop(axis) 9 | n = np.prod(newshape,dtype=int) 10 | totalvec = np.zeros(n*2+1, 'float64') 11 | addvec = np.concatenate([x.sum(axis=axis).ravel(), 12 | np.square(x).sum(axis=axis).ravel(), 13 | np.array([x.shape[axis]],dtype='float64')]) 14 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 15 | sum = totalvec[:n] 16 | sumsq = totalvec[n:2*n] 17 | count = totalvec[2*n] 18 | if count == 0: 19 | mean = np.empty(newshape); mean[:] = np.nan 20 | std = np.empty(newshape); std[:] = np.nan 21 | else: 22 | mean = sum/count 23 | std = np.sqrt(np.maximum(sumsq/count - np.square(mean),0)) 24 | return mean, std, count 25 | 26 | 27 | def test_runningmeanstd(): 28 | comm = MPI.COMM_WORLD 29 | np.random.seed(0) 30 | for (triple,axis) in [ 31 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 32 | ((np.random.randn(3,2), 
np.random.randn(4,2), np.random.randn(5,2)),0), 33 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 34 | ]: 35 | 36 | 37 | x = np.concatenate(triple, axis=axis) 38 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 39 | 40 | 41 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 42 | 43 | for (a1,a2) in zipsame(ms1, ms2): 44 | print(a1, a2) 45 | assert np.allclose(a1, a2) 46 | print("ok!") 47 | 48 | if __name__ == "__main__": 49 | #mpirun -np 3 python