├── gym-control ├── rl │ ├── __init__.py │ ├── agents │ │ └── __init__.py │ ├── random.py │ ├── processors.py │ └── util.py ├── utils.py ├── README.md ├── collect.py ├── scripts │ ├── train-qlearn.sh │ ├── train-cem.sh │ ├── train-dqn.sh │ ├── train-sarsa.sh │ ├── train-duel-dqn.sh │ └── train-naf.sh ├── cem_cartpole.py ├── sarsa_cartpole.py ├── dqn_cartpole.py └── duel_dqn_cartpole.py ├── gym-atari ├── baselines │ ├── baselines │ │ ├── __init__.py │ │ ├── a2c │ │ │ ├── __init__.py │ │ │ ├── README.md │ │ │ └── runner.py │ │ ├── ppo2 │ │ │ ├── __init__.py │ │ │ ├── README.md │ │ │ └── defaults.py │ │ ├── common │ │ │ ├── tests │ │ │ │ ├── __init__.py │ │ │ │ ├── envs │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── fixed_sequence_env.py │ │ │ │ │ ├── identity_env.py │ │ │ │ │ └── mnist_env.py │ │ │ │ ├── test_schedules.py │ │ │ │ ├── test_cartpole.py │ │ │ │ ├── test_tf_util.py │ │ │ │ ├── test_fixed_sequence.py │ │ │ │ ├── test_mnist.py │ │ │ │ ├── test_identity.py │ │ │ │ ├── util.py │ │ │ │ ├── test_segment_tree.py │ │ │ │ └── test_serialization.py │ │ │ ├── __init__.py │ │ │ ├── runners.py │ │ │ ├── mpi_fork.py │ │ │ ├── identity_env.py │ │ │ ├── tile_images.py │ │ │ ├── cg.py │ │ │ ├── mpi_adam_optimizer.py │ │ │ ├── vec_env │ │ │ │ ├── vec_frame_stack.py │ │ │ │ ├── vec_normalize.py │ │ │ │ ├── dummy_vec_env.py │ │ │ │ ├── __init__.py │ │ │ │ └── subproc_vec_env.py │ │ │ ├── running_stat.py │ │ │ ├── console_util.py │ │ │ ├── input.py │ │ │ ├── mpi_moments.py │ │ │ ├── dataset.py │ │ │ ├── math_util.py │ │ │ ├── mpi_adam.py │ │ │ ├── filters.py │ │ │ ├── mpi_util.py │ │ │ ├── mpi_running_mean_std.py │ │ │ ├── schedules.py │ │ │ ├── segment_tree.py │ │ │ ├── cmd_util.py │ │ │ └── models.py │ │ ├── bench │ │ │ ├── __init__.py │ │ │ ├── benchmarks.py │ │ │ └── monitor.py │ │ ├── results_single.py │ │ └── results_plotter.py │ ├── LICENSE │ └── setup.py ├── .gitignore ├── scripts │ ├── train-normal.sh │ ├── visualize.py │ ├── train-alien.sh │ ├── train-phoenix.sh │ ├── train-carnival.sh │ ├── train-mspacman.sh │ └── train-seaquest.sh └── README.md ├── requirements.txt ├── LICENSE ├── .gitignore └── README.md /gym-control/rl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gym-atari/.gitignore: -------------------------------------------------------------------------------- 1 | logs* 2 | backup* 3 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/a2c/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/ppo2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/envs/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | matplotlib 3 | keras==2.1.0 4 | h5py 5 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.bench.benchmarks import * 2 | from baselines.bench.monitor import * -------------------------------------------------------------------------------- /gym-control/rl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .dqn import DQNAgent, NAFAgent, ContinuousDQNAgent 3 | from .ddpg import DDPGAgent 4 | from .cem import CEMAgent 5 | from .sarsa import SarsaAgent, SARSAAgent 6 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from baselines.common.console_util import * 3 | from baselines.common.dataset import Dataset 4 | from baselines.common.math_util import * 5 | from baselines.common.misc_util import * 6 | -------------------------------------------------------------------------------- /gym-control/utils.py: -------------------------------------------------------------------------------- 1 | def str2bool(v): 2 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 3 | return True 4 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 5 | return False 6 | else: 7 | raise argparse.ArgumentTypeError('Boolean value expected.') -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/a2c/README.md: -------------------------------------------------------------------------------- 1 | # A2C 2 | 3 | - Original paper: https://arxiv.org/abs/1602.01783 4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ 5 | - `python -m baselines.a2c.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/ppo2/README.md: -------------------------------------------------------------------------------- 1 | # PPO2 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | - `python -m baselines.ppo2.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. 6 | - `python -m baselines.ppo2.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment. 
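- Default hyperparameters for the Atari and MuJoCo configurations are defined in `defaults.py` in this directory.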
7 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/ppo2/defaults.py: -------------------------------------------------------------------------------- 1 | def mujoco(): 2 | return dict( 3 | nsteps=2048, 4 | nminibatches=32, 5 | lam=0.95, 6 | gamma=0.99, 7 | noptepochs=10, 8 | log_interval=1, 9 | ent_coef=0.0, 10 | lr=lambda f: 3e-4 * f, 11 | cliprange=0.2, 12 | value_network='copy' 13 | ) 14 | 15 | def atari(): 16 | return dict( 17 | nsteps=128, nminibatches=4, 18 | lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, 19 | ent_coef=.01, 20 | lr=lambda f : f * 2.5e-4, 21 | cliprange=lambda f : f * 0.1, 22 | ) 23 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/runners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | class AbstractEnvRunner(ABC): 5 | def __init__(self, *, env, model, nsteps): 6 | self.env = env 7 | self.model = model 8 | self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1 9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape 10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) 11 | self.obs[:] = env.reset() 12 | self.nsteps = nsteps 13 | self.states = model.initial_state 14 | self.dones = [False for _ in range(nenv)] 15 | 16 | @abstractmethod 17 | def run(self): 18 | raise NotImplementedError 19 | 20 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/identity_env.py: -------------------------------------------------------------------------------- 1 | from gym import Env 2 | from gym.spaces import Discrete 3 | 4 | 5 | class IdentityEnv(Env): 6 | def __init__( 7 | self, 8 | dim, 9 | ep_length=100, 10 | ): 11 | 12 | self.action_space = Discrete(dim) 13 | self.reset() 14 | 15 | def reset(self): 16 | self._choose_next_state() 17 | self.observation_space = self.action_space 18 | 19 | return self.state 20 | 21 | def step(self, actions): 22 | rew = self._get_reward(actions) 23 | self._choose_next_state() 24 | return self.state, rew, False, {} 25 | 26 | def _choose_next_state(self): 27 | self.state = self.action_space.sample() 28 | 29 | def _get_reward(self, actions): 30 | return 1 if self.state == actions else 0 31 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tile_images.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def 
tile_images(img_nhwc): 4 | """ 5 | Tile N images into one big PxQ image 6 | (P,Q) are chosen to be as close as possible, and if N 7 | is square, then P=Q. 8 | 9 | input: img_nhwc, list or array of images, ndim=4 once turned into array 10 | n = batch index, h = height, w = width, c = channel 11 | returns: 12 | bigim_HWc, ndarray with ndim=3 13 | """ 14 | img_nhwc = np.asarray(img_nhwc) 15 | N, h, w, c = img_nhwc.shape 16 | H = int(np.ceil(np.sqrt(N))) 17 | W = int(np.ceil(float(N)/H)) 18 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) 19 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c) 20 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) 21 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) 22 | return img_Hh_Ww_c 23 | 24 | -------------------------------------------------------------------------------- /gym-control/README.md: -------------------------------------------------------------------------------- 1 | # gym-control 2 | Reinforcement Learning with Perturbed Reward (control games) 3 | 4 | ## Usage 5 | ### Training 6 | ``` 7 | sh scripts/train-qlearn.sh (Cartpole) 8 | sh scripts/train-dqn.sh (Cartpole) 9 | sh scripts/train-ddpg.sh (Pendulum) 10 | ``` 11 | ### Visualizing 12 | ``` 13 | sh scripts/visualize.sh 14 | ``` 15 | ## References 16 | 1. *Q-Learning* Watkins et al., 1989 17 | 2. *Playing Atari with Deep Reinforcement Learning*, Mnih et al., 2013 18 | 3. *Human-level control through deep reinforcement learning*, Mnih et al., 2015 19 | 4. *Reinforcement learning: An introduction*, Sutton and Barto, 2011 20 | 5. *Learning Tetris Using the Noisy Cross-Entropy Method*, Szita et al., 2006 21 | 6. *Deep Reinforcement Learning (MLSS lecture notes)*, Schulman, 2016 22 | 7. *Continuous control with deep reinforcement learning*, Lillicrap et al., 2015 23 | 8. 
*Continuous Deep Q-Learning with Model-based Acceleration*, Gu et al., 2016 24 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9), 80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 wangjksjtu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /gym-control/collect.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--log_dir', default='logs/ddpg_pendulum/norm_one', 8 | help='Log dir [default: logs/ddpg_pendulum/norm_one]') 9 | parser.add_argument('--save_dir', default='docs/ddpg_pendulum/norm_one', 10 | help='Path of directory to saved [default: docs/ddpg_pendulum/norm_one]') 11 | FLAGS = parser.parse_args() 12 | 13 | LOG_DIR = FLAGS.log_dir 14 | SAVE_DIR = FLAGS.save_dir 15 | 16 | assert (os.path.exists(LOG_DIR)) 17 | if not os.path.exists(SAVE_DIR): 18 | os.makedirs(SAVE_DIR) 19 | 20 | def collect(): 21 | for j in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]: 22 | input_dir = os.path.join(LOG_DIR, str(j)) 23 | files = glob.glob(os.path.join(input_dir, "*.png")) 24 | for fin in files: 25 | filename = fin[fin.rindex("/")+1:] 26 | fout = os.path.join(SAVE_DIR, filename) 27 | print "cp '%s' '%s'" % (fin, fout) 28 | os.system("cp '%s' '%s'" % (fin, fout)) 29 | 30 | 31 | if __name__ == "__main__": 32 | collect() 33 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_cartpole.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | 4 | from baselines.run import get_learn_function 5 | from baselines.common.tests.util import reward_per_episode_test 6 | 7 | common_kwargs = dict( 8 | total_timesteps=30000, 9 | network='mlp', 10 | gamma=1.0, 11 | seed=0, 12 | ) 13 | 14 | learn_kwargs = { 15 | 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05), 16 | 'acktr': dict(nsteps=32, value_network='copy'), 17 | 'deepq': {}, 18 | 'ppo2': dict(value_network='copy'), 19 | 'trpo_mpi': {} 20 | } 21 | 22 | @pytest.mark.slow 23 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 24 | def test_cartpole(alg): 25 | ''' 26 | Test if the algorithm (with an mlp policy) 27 | can learn to balance the cartpole 28 | ''' 29 | 30 | kwargs = common_kwargs.copy() 31 | kwargs.update(learn_kwargs[alg]) 32 | 33 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 34 | def env_fn(): 35 | 36 | env = gym.make('CartPole-v0') 37 | env.seed(0) 38 | return env 39 | 40 | reward_per_episode_test(env_fn, learn_fn, 100) 41 | -------------------------------------------------------------------------------- /gym-atari/scripts/train-normal.sh: -------------------------------------------------------------------------------- 1 | cd baselines 2 | 3 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-normal/pong/ppo2_50M_normal --normal=True)& 4 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-normal/carnival/ppo2_50M_normal --normal=True)& 5 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-normal/mspacman/ppo2_50M_normal --normal=True)& 6 | (export CUDA_VISIBLE_DEVICES=3 
&& python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-normal/phoenix/ppo2_50M_normal --normal=True)& 7 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-normal/pong/ppo2_50M_normal --normal=True)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-normal/seaquest/ppo2_50M_normal --normal=True)& 9 | 10 | cd .. 11 | -------------------------------------------------------------------------------- /gym-atari/baselines/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_tf_util.py: -------------------------------------------------------------------------------- 1 | # tests for tf_util 2 | import tensorflow as tf 3 | from baselines.common.tf_util import ( 4 | function, 5 | initialize, 6 | single_threaded_session 7 | ) 8 | 9 | 10 | def test_function(): 11 | with tf.Graph().as_default(): 12 | x = tf.placeholder(tf.int32, (), name="x") 13 | y = tf.placeholder(tf.int32, (), name="y") 14 | z = 3 * x + 2 * y 15 | lin = function([x, y], z, givens={y: 0}) 16 | 17 | with single_threaded_session(): 18 | initialize() 19 | 20 | assert lin(2) == 6 21 | assert lin(2, 2) == 10 22 | 23 | 24 | def test_multikwargs(): 25 | with tf.Graph().as_default(): 26 | x = tf.placeholder(tf.int32, (), name="x") 27 | with tf.variable_scope("other"): 28 | x2 = tf.placeholder(tf.int32, (), name="x") 29 | z = 3 * x + 2 * x2 30 | 31 | lin = function([x, x2], z, givens={x2: 0}) 32 | with single_threaded_session(): 33 | initialize() 34 | assert lin(2) == 6 35 | assert lin(2, 2) == 10 36 | 37 | 38 | if __name__ == '__main__': 39 | test_function() 40 | test_multikwargs() 41 | -------------------------------------------------------------------------------- /gym-atari/baselines/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import sys 3 | 4 | if sys.version_info.major != 3: 5 | print('This Python is only compatible with Python 3, but you are running ' 6 | 'Python {}. 
The installation will likely fail.'.format(sys.version_info.major)) 7 | 8 | 9 | setup(name='baselines', 10 | packages=[package for package in find_packages() 11 | if package.startswith('baselines')], 12 | install_requires=[ 13 | 'gym[atari,classic_control]', 14 | 'scipy', 15 | 'tqdm', 16 | 'joblib', 17 | 'dill', 18 | 'progressbar2', 19 | 'mpi4py', 20 | 'cloudpickle', 21 | 'tensorflow-gpu==1.10.0', 22 | 'click', 23 | 'opencv-python', 24 | ], 25 | extras_require={ 26 | 'test': [ 27 | 'filelock', 28 | 'pytest' 29 | ] 30 | }, 31 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms', 32 | author='OpenAI', 33 | url='https://github.com/openai/baselines', 34 | author_email='gym@openai.com', 35 | version='0.1.5') 36 | -------------------------------------------------------------------------------- /gym-atari/README.md: -------------------------------------------------------------------------------- 1 | # gym-atari 2 | Reinforcement Learning with Perturbed Reward (Atari Games) 3 | 4 | ## Usage 5 | ### Training 6 | To train models with different noisy or surrogate rewards: 7 | ``` 8 | sh scripts/train-pong.sh (Pong-v4) 9 | sh scripts/train-breakout.sh (Breakout-v4) 10 | ``` 11 | If you want to train the models with specific hyper-parameters by yourself: 12 | ``` 13 | cd baselines 14 | python -m baselines.run --alg= --env= [additional arguments] 15 | ``` 16 | #### Example 1. PPO with Pong 17 | ``` 18 | python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=3e7 \ 19 | --save_path=logs-pong/pong/anti_iden/ppo2_30M_noisy_0.1 --weight=0.1 \ 20 | --normal=False --surrogate=False --noise_type=anti_iden 21 | ``` 22 | 23 | ### Visualizing 24 | ``` 25 | cd baselines 26 | python ../scripts/visualize.py --env_name Breakout --log_dir logs-breakout/ --num_timesteps 50000000 --noise_type anti_iden --all True 27 | ``` 28 | To see HELP for the visualizing script: 29 | ``` 30 | python ../scripts/visualize.py -h 31 | ``` 32 | 33 | ## References 34 | 1. 
*Proximal Policy Optimization Algorithms* John Schulman et al., 2017 35 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/envs/fixed_sequence_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import Env 3 | from gym.spaces import Discrete 4 | 5 | 6 | class FixedSequenceEnv(Env): 7 | def __init__( 8 | self, 9 | n_actions=10, 10 | seed=0, 11 | episode_len=100 12 | ): 13 | self.np_random = np.random.RandomState() 14 | self.np_random.seed(seed) 15 | self.sequence = [self.np_random.randint(0, n_actions-1) for _ in range(episode_len)] 16 | 17 | self.action_space = Discrete(n_actions) 18 | self.observation_space = Discrete(1) 19 | 20 | self.episode_len = episode_len 21 | self.time = 0 22 | self.reset() 23 | 24 | def reset(self): 25 | self.time = 0 26 | return 0 27 | 28 | def step(self, actions): 29 | rew = self._get_reward(actions) 30 | self._choose_next_state() 31 | done = False 32 | if self.episode_len and self.time >= self.episode_len: 33 | rew = 0 34 | done = True 35 | 36 | return 0, rew, done, {} 37 | 38 | def _choose_next_state(self): 39 | self.time += 1 40 | 41 | def _get_reward(self, actions): 42 | return 1 if actions == self.sequence[self.time] else 0 43 | 44 | 45 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/mpi_adam_optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | 5 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 6 | """Adam optimizer that averages gradients across mpi processes.""" 7 | def __init__(self, comm, **kwargs): 8 | self.comm = comm 9 | tf.train.AdamOptimizer.__init__(self, **kwargs) 10 | def compute_gradients(self, loss, var_list, **kwargs): 11 | grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs) 12 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 13 | flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) 14 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 15 | sizes = [int(np.prod(s)) for s in shapes] 16 | 17 | num_tasks = self.comm.Get_size() 18 | buf = np.zeros(sum(sizes), np.float32) 19 | 20 | def _collect_grads(flat_grad): 21 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 22 | np.divide(buf, float(num_tasks), out=buf) 23 | return buf 24 | 25 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 26 | avg_flat_grad.set_shape(flat_grad.shape) 27 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 28 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 29 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 30 | 31 | return avg_grads_and_vars 32 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/vec_env/vec_frame_stack.py: -------------------------------------------------------------------------------- 1 | from baselines.common.vec_env import VecEnvWrapper 2 | import numpy as np 3 | from gym import spaces 4 | 5 | class VecFrameStack(VecEnvWrapper): 6 | """ 7 | Vectorized environment base class 8 | """ 9 | def __init__(self, venv, nstack): 10 | self.venv = venv 11 | self.nstack = nstack 12 | wos = venv.observation_space # wrapped ob space 13 | low = np.repeat(wos.low, self.nstack, axis=-1) 14 | high = np.repeat(wos.high, self.nstack, axis=-1) 
15 | self.stackedobs = np.zeros((venv.num_envs,)+low.shape, low.dtype) 16 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 17 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 18 | 19 | def step_wait(self): 20 | obs, rews, news, infos = self.venv.step_wait() 21 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 22 | for (i, new) in enumerate(news): 23 | if new: 24 | self.stackedobs[i] = 0 25 | self.stackedobs[..., -obs.shape[-1]:] = obs 26 | return self.stackedobs, rews, news, infos 27 | 28 | def reset(self): 29 | """ 30 | Reset all environments 31 | """ 32 | obs = self.venv.reset() 33 | self.stackedobs[...] = 0 34 | self.stackedobs[..., -obs.shape[-1]:] = obs 35 | return self.stackedobs 36 | 37 | def close(self): 38 | self.venv.close() 39 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/running_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # http://www.johndcook.com/blog/standard_deviation/ 4 | class RunningStat(object): 5 | def __init__(self, shape): 6 | self._n = 0 7 | self._M = np.zeros(shape) 8 | self._S = np.zeros(shape) 9 | def push(self, x): 10 | x = np.asarray(x) 11 | assert x.shape == self._M.shape 12 | self._n += 1 13 | if self._n == 1: 14 | self._M[...] = x 15 | else: 16 | oldM = self._M.copy() 17 | self._M[...] = oldM + (x - oldM)/self._n 18 | self._S[...] = self._S + (x - oldM)*(x - self._M) 19 | @property 20 | def n(self): 21 | return self._n 22 | @property 23 | def mean(self): 24 | return self._M 25 | @property 26 | def var(self): 27 | return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) 28 | @property 29 | def std(self): 30 | return np.sqrt(self.var) 31 | @property 32 | def shape(self): 33 | return self._M.shape 34 | 35 | def test_running_stat(): 36 | for shp in ((), (3,), (3,4)): 37 | li = [] 38 | rs = RunningStat(shp) 39 | for _ in range(5): 40 | val = np.random.randn(*shp) 41 | rs.push(val) 42 | li.append(val) 43 | m = np.mean(li, axis=0) 44 | assert np.allclose(rs.mean, m) 45 | v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) 46 | assert np.allclose(rs.var, v) 47 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_fixed_sequence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv 3 | 4 | from baselines.common.tests.util import simple_test 5 | from baselines.run import get_learn_function 6 | 7 | common_kwargs = dict( 8 | seed=0, 9 | total_timesteps=50000, 10 | ) 11 | 12 | learn_kwargs = { 13 | 'a2c': {}, 14 | 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1), 15 | # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps) 16 | # github issue: https://github.com/openai/baselines/issues/188 17 | # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001) 18 | } 19 | 20 | 21 | alg_list = learn_kwargs.keys() 22 | rnn_list = ['lstm'] 23 | 24 | @pytest.mark.slow 25 | @pytest.mark.parametrize("alg", alg_list) 26 | @pytest.mark.parametrize("rnn", rnn_list) 27 | def test_fixed_sequence(alg, rnn): 28 | ''' 29 | Test if the algorithm (with a given policy) 30 | can learn an identity 
transformation (i.e. return observation as an action) 31 | ''' 32 | 33 | kwargs = learn_kwargs[alg] 34 | kwargs.update(common_kwargs) 35 | 36 | episode_len = 5 37 | env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len) 38 | learn = lambda e: get_learn_function(alg)( 39 | env=e, 40 | network=rnn, 41 | **kwargs 42 | ) 43 | 44 | simple_test(env_fn, learn, 0.7) 45 | 46 | 47 | if __name__ == '__main__': 48 | test_fixed_sequence('ppo2', 'lstm') 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | def fmt_row(width, row, header=False): 11 | out = " | ".join(fmt_item(x, width) for x in row) 12 | if header: out = out + "\n" + "-"*len(out) 13 | return out 14 | 15 | def fmt_item(x, l): 16 | if isinstance(x, np.ndarray): 17 | assert x.ndim==0 18 | x = x.item() 19 | if isinstance(x, (float, np.float32, np.float64)): 20 | v = abs(x) 21 | if (v < 1e-4 or v > 1e+4) and v > 0: 22 | rep = "%7.2e" % x 23 | else: 24 | rep = "%7.5f" % x 25 | else: rep = str(x) 26 | return " "*(l - len(rep)) + rep 27 | 28 | color2num = dict( 29 | gray=30, 30 | red=31, 31 | green=32, 32 | yellow=33, 33 | blue=34, 34 | magenta=35, 35 | cyan=36, 36 | white=37, 37 | crimson=38 38 | ) 39 | 40 | def colorize(string, color, bold=False, highlight=False): 41 | attr = [] 42 | num = color2num[color] 43 | if highlight: num += 10 44 | attr.append(str(num)) 45 | if bold: attr.append('1') 46 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 47 | 48 | 49 | MESSAGE_DEPTH = 0 50 | 51 | @contextmanager 52 | def timed(msg): 53 | global MESSAGE_DEPTH #pylint: disable=W0603 54 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 55 | tstart = time.time() 56 | MESSAGE_DEPTH += 1 57 | yield 58 | MESSAGE_DEPTH -= 1 59 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 60 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_mnist.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # from baselines.acer import acer_simple as acer 4 | from baselines.common.tests.envs.mnist_env import MnistEnv 5 | from baselines.common.tests.util import simple_test 6 | from baselines.run import get_learn_function 7 | 8 | 9 | # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem? 
10 | # GitHub issue https://github.com/openai/baselines/issues/189 11 | common_kwargs = { 12 | 'seed': 0, 13 | 'network':'cnn', 14 | 'gamma':0.9, 15 | 'pad':'SAME' 16 | } 17 | 18 | learn_args = { 19 | 'a2c': dict(total_timesteps=50000), 20 | # TODO need to resolve inference (step) API differences for acer; also slow 21 | # 'acer': dict(seed=0, total_timesteps=1000), 22 | 'deepq': dict(total_timesteps=5000), 23 | 'acktr': dict(total_timesteps=30000), 24 | 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0), 25 | 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001) 26 | } 27 | 28 | 29 | #tests pass, but are too slow on travis. Same algorithms are covered 30 | # by other tests with less compute-hungry nn's and by benchmarks 31 | @pytest.mark.skip 32 | @pytest.mark.slow 33 | @pytest.mark.parametrize("alg", learn_args.keys()) 34 | def test_mnist(alg): 35 | ''' 36 | Test if the algorithm can learn to classify MNIST digits. 37 | Uses CNN policy. 38 | ''' 39 | 40 | learn_kwargs = learn_args[alg] 41 | learn_kwargs.update(common_kwargs) 42 | 43 | learn = get_learn_function(alg) 44 | learn_fn = lambda e: learn(env=e, **learn_kwargs) 45 | env_fn = lambda: MnistEnv(seed=0, episode_len=100) 46 | 47 | simple_test(env_fn, learn_fn, 0.6) 48 | 49 | if __name__ == '__main__': 50 | test_mnist('deepq') 51 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_identity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv 3 | from baselines.run import get_learn_function 4 | from baselines.common.tests.util import simple_test 5 | 6 | common_kwargs = dict( 7 | total_timesteps=30000, 8 | network='mlp', 9 | gamma=0.9, 10 | seed=0, 11 | ) 12 | 13 | learn_kwargs = { 14 | 'a2c' : {}, 15 | 'acktr': {}, 16 | 'deepq': {}, 17 | 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0), 18 | 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01) 19 | } 20 | 21 | 22 | @pytest.mark.slow 23 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 24 | def test_discrete_identity(alg): 25 | ''' 26 | Test if the algorithm (with an mlp policy) 27 | can learn an identity transformation (i.e. return observation as an action) 28 | ''' 29 | 30 | kwargs = learn_kwargs[alg] 31 | kwargs.update(common_kwargs) 32 | 33 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 34 | env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) 35 | simple_test(env_fn, learn_fn, 0.9) 36 | 37 | @pytest.mark.slow 38 | @pytest.mark.parametrize("alg", ['a2c', 'ppo2', 'trpo_mpi']) 39 | def test_continuous_identity(alg): 40 | ''' 41 | Test if the algorithm (with an mlp policy) 42 | can learn an identity transformation (i.e. 
return observation as an action) 43 | to a required precision 44 | ''' 45 | 46 | kwargs = learn_kwargs[alg] 47 | kwargs.update(common_kwargs) 48 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 49 | 50 | env_fn = lambda: BoxIdentityEnv((1,), episode_len=100) 51 | simple_test(env_fn, learn_fn, -0.1) 52 | 53 | if __name__ == '__main__': 54 | test_continuous_identity('a2c') 55 | 56 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/input.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from gym.spaces import Discrete, Box 3 | 4 | def observation_placeholder(ob_space, batch_size=None, name='Ob'): 5 | ''' 6 | Create placeholder to feed observations into of the size appropriate to the observation space 7 | 8 | Parameters: 9 | ---------- 10 | 11 | ob_space: gym.Space observation space 12 | 13 | batch_size: int size of the batch to be fed into input. Can be left None in most cases. 14 | 15 | name: str name of the placeholder 16 | 17 | Returns: 18 | ------- 19 | 20 | tensorflow placeholder tensor 21 | ''' 22 | 23 | assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box), \ 24 | 'Can only deal with Discrete and Box observation spaces for now' 25 | 26 | return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name) 27 | 28 | 29 | def observation_input(ob_space, batch_size=None, name='Ob'): 30 | ''' 31 | Create placeholder to feed observations into of the size appropriate to the observation space, and add input 32 | encoder of the appropriate type. 33 | ''' 34 | 35 | placeholder = observation_placeholder(ob_space, batch_size, name) 36 | return placeholder, encode_observation(ob_space, placeholder) 37 | 38 | def encode_observation(ob_space, placeholder): 39 | ''' 40 | Encode input in the way that is appropriate to the observation space 41 | 42 | Parameters: 43 | ---------- 44 | 45 | ob_space: gym.Space observation space 46 | 47 | placeholder: tf.placeholder observation input placeholder 48 | ''' 49 | if isinstance(ob_space, Discrete): 50 | return tf.to_float(tf.one_hot(placeholder, ob_space.n)) 51 | 52 | elif isinstance(ob_space, Box): 53 | return tf.to_float(placeholder) 54 | else: 55 | raise NotImplementedError 56 | 57 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/envs/identity_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import abstractmethod 3 | from gym import Env 4 | from gym.spaces import Discrete, Box 5 | 6 | 7 | class IdentityEnv(Env): 8 | def __init__( 9 | self, 10 | episode_len=None 11 | ): 12 | 13 | self.episode_len = episode_len 14 | self.time = 0 15 | self.reset() 16 | 17 | def reset(self): 18 | self._choose_next_state() 19 | self.time = 0 20 | self.observation_space = self.action_space 21 | 22 | return self.state 23 | 24 | def step(self, actions): 25 | rew = self._get_reward(actions) 26 | self._choose_next_state() 27 | done = False 28 | if self.episode_len and self.time >= self.episode_len: 29 | rew = 0 30 | done = True 31 | 32 | return self.state, rew, done, {} 33 | 34 | def _choose_next_state(self): 35 | self.state = self.action_space.sample() 36 | self.time += 1 37 | 38 | @abstractmethod 39 | def _get_reward(self, actions): 40 | raise NotImplementedError 41 | 42 | 43 | class DiscreteIdentityEnv(IdentityEnv): 44 | def __init__( 45 | 
self, 46 | dim, 47 | episode_len=None, 48 | ): 49 | 50 | self.action_space = Discrete(dim) 51 | super().__init__(episode_len=episode_len) 52 | 53 | def _get_reward(self, actions): 54 | return 1 if self.state == actions else 0 55 | 56 | 57 | class BoxIdentityEnv(IdentityEnv): 58 | def __init__( 59 | self, 60 | shape, 61 | episode_len=None, 62 | ): 63 | 64 | self.action_space = Box(low=-1.0, high=1.0, shape=shape) 65 | super().__init__(episode_len=episode_len) 66 | 67 | def _get_reward(self, actions): 68 | diff = actions - self.state 69 | diff = diff[:] 70 | return -0.5 * np.dot(diff, diff) 71 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/vec_env/vec_normalize.py: -------------------------------------------------------------------------------- 1 | from baselines.common.vec_env import VecEnvWrapper 2 | from baselines.common.running_mean_std import RunningMeanStd 3 | import numpy as np 4 | 5 | class VecNormalize(VecEnvWrapper): 6 | """ 7 | Vectorized environment base class 8 | """ 9 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): 10 | VecEnvWrapper.__init__(self, venv) 11 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None 12 | self.ret_rms = RunningMeanStd(shape=()) if ret else None 13 | #self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='observation_running_mean_std') if ob else None 14 | #self.ret_rms = TfRunningMeanStd(shape=(), scope='return_running_mean_std') if ret else None 15 | self.clipob = clipob 16 | self.cliprew = cliprew 17 | self.ret = np.zeros(self.num_envs) 18 | self.gamma = gamma 19 | self.epsilon = epsilon 20 | 21 | def step_wait(self): 22 | """ 23 | Apply sequence of actions to sequence of environments 24 | actions -> (observations, rewards, news) 25 | 26 | where 'news' is a boolean vector indicating whether each element is new. 
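
        Observations are normalized by a running mean/std estimate and rewards
        by a running std of the discounted returns; both are clipped to
        `clipob` / `cliprew` respectively.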
27 | """ 28 | obs, rews, news, infos = self.venv.step_wait() 29 | self.ret = self.ret * self.gamma + rews 30 | obs = self._obfilt(obs) 31 | if self.ret_rms: 32 | self.ret_rms.update(self.ret) 33 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) 34 | return obs, rews, news, infos 35 | 36 | def _obfilt(self, obs): 37 | if self.ob_rms: 38 | self.ob_rms.update(obs) 39 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 40 | return obs 41 | else: 42 | return obs 43 | 44 | def reset(self): 45 | """ 46 | Reset all environments 47 | """ 48 | obs = self.venv.reset() 49 | return self._obfilt(obs) 50 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from baselines.common import zipsame 4 | 5 | 6 | def mpi_mean(x, axis=0, comm=None, keepdims=False): 7 | x = np.asarray(x) 8 | assert x.ndim > 0 9 | if comm is None: comm = MPI.COMM_WORLD 10 | xsum = x.sum(axis=axis, keepdims=keepdims) 11 | n = xsum.size 12 | localsum = np.zeros(n+1, x.dtype) 13 | localsum[:n] = xsum.ravel() 14 | localsum[n] = x.shape[axis] 15 | globalsum = np.zeros_like(localsum) 16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM) 17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] 18 | 19 | def mpi_moments(x, axis=0, comm=None, keepdims=False): 20 | x = np.asarray(x) 21 | assert x.ndim > 0 22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) 23 | sqdiffs = np.square(x - mean) 24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) 25 | assert count1 == count 26 | std = np.sqrt(meansqdiff) 27 | if not keepdims: 28 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 29 | mean = mean.reshape(newshape) 30 | std = std.reshape(newshape) 31 | return mean, std, count 32 | 33 | 34 | def test_runningmeanstd(): 35 | import subprocess 36 | subprocess.check_call(['mpirun', '-np', '3', 37 | 'python','-c', 38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) 39 | 40 | def _helper_runningmeanstd(): 41 | comm = MPI.COMM_WORLD 42 | np.random.seed(0) 43 | for (triple,axis) in [ 44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 47 | ]: 48 | 49 | 50 | x = np.concatenate(triple, axis=axis) 51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 52 | 53 | 54 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 55 | 56 | for (a1,a2) in zipsame(ms1, ms2): 57 | print(a1, a2) 58 | assert np.allclose(a1, a2) 59 | print("ok!") 60 | 61 | -------------------------------------------------------------------------------- /gym-control/rl/random.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | 4 | 5 | class RandomProcess(object): 6 | def reset_states(self): 7 | pass 8 | 9 | 10 | class AnnealedGaussianProcess(RandomProcess): 11 | def __init__(self, mu, sigma, sigma_min, n_steps_annealing): 12 | self.mu = mu 13 | self.sigma = sigma 14 | self.n_steps = 0 15 | 16 | if sigma_min is not None: 17 | self.m = -float(sigma - sigma_min) / float(n_steps_annealing) 18 | self.c 
= sigma 19 | self.sigma_min = sigma_min 20 | else: 21 | self.m = 0. 22 | self.c = sigma 23 | self.sigma_min = sigma 24 | 25 | @property 26 | def current_sigma(self): 27 | sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c) 28 | return sigma 29 | 30 | 31 | class GaussianWhiteNoiseProcess(AnnealedGaussianProcess): 32 | def __init__(self, mu=0., sigma=1., sigma_min=None, n_steps_annealing=1000, size=1): 33 | super(GaussianWhiteNoiseProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing) 34 | self.size = size 35 | 36 | def sample(self): 37 | sample = np.random.normal(self.mu, self.current_sigma, self.size) 38 | self.n_steps += 1 39 | return sample 40 | 41 | # Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 42 | class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess): 43 | def __init__(self, theta, mu=0., sigma=1., dt=1e-2, size=1, sigma_min=None, n_steps_annealing=1000): 44 | super(OrnsteinUhlenbeckProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing) 45 | self.theta = theta 46 | self.mu = mu 47 | self.dt = dt 48 | self.size = size 49 | self.reset_states() 50 | 51 | def sample(self): 52 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size) 53 | self.x_prev = x 54 | self.n_steps += 1 55 | return x 56 | 57 | def reset_states(self): 58 | self.x_prev = np.random.normal(self.mu,self.current_sigma,self.size) 59 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = 
np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/envs/mnist_env.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import numpy as np 3 | import tempfile 4 | import filelock 5 | from gym import Env 6 | from gym.spaces import Discrete, Box 7 | 8 | 9 | 10 | class MnistEnv(Env): 11 | def __init__( 12 | self, 13 | seed=0, 14 | episode_len=None, 15 | no_images=None 16 | ): 17 | from tensorflow.examples.tutorials.mnist import input_data 18 | # we could use temporary directory for this with a context manager and 19 | # TemporaryDirecotry, but then each test that uses mnist would re-download the data 20 | # this way the data is not cleaned up, but we only download it once per machine 21 | mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data') 22 | with filelock.FileLock(mnist_path + '.lock'): 23 | self.mnist = input_data.read_data_sets(mnist_path) 24 | 25 | self.np_random = np.random.RandomState() 26 | self.np_random.seed(seed) 27 | 28 | self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1)) 29 | self.action_space = Discrete(10) 30 | self.episode_len = episode_len 31 | self.time = 0 32 | self.no_images = no_images 33 | 34 | self.train_mode() 35 | self.reset() 36 | 37 | def reset(self): 38 | self._choose_next_state() 39 | self.time = 0 40 | 41 | return self.state[0] 42 | 43 | def step(self, actions): 44 | rew = self._get_reward(actions) 45 | self._choose_next_state() 46 | done = False 47 | if self.episode_len and self.time >= self.episode_len: 48 | rew = 0 49 | done = True 50 | 51 | return self.state[0], rew, done, {} 52 | 53 | def train_mode(self): 54 | self.dataset = self.mnist.train 55 | 56 | def test_mode(self): 57 | self.dataset = self.mnist.test 58 | 59 | def _choose_next_state(self): 60 | max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1 61 | index = self.np_random.randint(0, max_index) 62 | image = self.dataset.images[index].reshape(28,28,1)*255 63 | label = self.dataset.labels[index] 64 | self.state = (image, label) 65 | self.time += 1 66 | 67 | def _get_reward(self, actions): 68 | return 1 if self.state[1] == actions else 0 69 | 70 | 71 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 
28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /gym-control/rl/processors.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | 5 | from rl.core import Processor 6 | from rl.util import WhiteningNormalizer 7 | 8 | 9 | class MultiInputProcessor(Processor): 10 | """Converts observations from an environment with multiple observations for use in a neural network 11 | policy. 12 | 13 | In some cases, you have environments that return multiple different observations per timestep 14 | (in a robotics context, for example, a camera may be used to view the scene and a joint encoder may 15 | be used to report the angles for each joint). Usually, this can be handled by a policy that has 16 | multiple inputs, one for each modality. However, observations are returned by the environment 17 | in the form of a tuple `[(modality1_t, modality2_t, ..., modalityn_t) for t in T]` but the neural network 18 | expects them in per-modality batches like so: `[[modality1_1, ..., modality1_T], ..., [[modalityn_1, ..., modalityn_T]]`. 19 | This processor converts observations appropriate for this use case. 20 | 21 | # Arguments 22 | nb_inputs (integer): The number of inputs, that is different modalities, to be used. 23 | Your neural network that you use for the policy must have a corresponding number of 24 | inputs. 25 | """ 26 | def __init__(self, nb_inputs): 27 | self.nb_inputs = nb_inputs 28 | 29 | def process_state_batch(self, state_batch): 30 | input_batches = [[] for x in range(self.nb_inputs)] 31 | for state in state_batch: 32 | processed_state = [[] for x in range(self.nb_inputs)] 33 | for observation in state: 34 | assert len(observation) == self.nb_inputs 35 | for o, s in zip(observation, processed_state): 36 | s.append(o) 37 | for idx, s in enumerate(processed_state): 38 | input_batches[idx].append(s) 39 | return [np.array(x) for x in input_batches] 40 | 41 | 42 | class WhiteningNormalizerProcessor(Processor): 43 | """Normalizes the observations to have zero mean and standard deviation of one, 44 | i.e. it applies whitening to the inputs. 45 | 46 | This typically helps significantly with learning, especially if different dimensions are 47 | on different scales. However, it complicates training in the sense that you will have to store 48 | these weights alongside the policy if you intend to load it later. It is the responsibility of 49 | the user to do so. 
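
    The underlying `WhiteningNormalizer` is created lazily from the shape and
    dtype of the first batch seen, and its running statistics are updated with
    every batch passed through `process_state_batch`.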
50 | """ 51 | def __init__(self): 52 | self.normalizer = None 53 | 54 | def process_state_batch(self, batch): 55 | if self.normalizer is None: 56 | self.normalizer = WhiteningNormalizer(shape=batch.shape[1:], dtype=batch.dtype) 57 | self.normalizer.update(batch) 58 | return self.normalizer.normalize(batch) 59 | -------------------------------------------------------------------------------- /gym-atari/scripts/visualize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | def str2bool(v): 5 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 6 | return True 7 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 8 | return False 9 | else: 10 | raise argparse.ArgumentTypeError('Boolean value expected.') 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--log_dir', type=str, default='baselines/logs', 14 | help='The path of log directory [default: baselines/logs') 15 | parser.add_argument('--all', type=str2bool, default=False, 16 | help='Plot all the curves (diff errs) [default: False]') 17 | parser.add_argument('--weight', type=float, default=0.2, 18 | help='Weight of noise [default: 0.2]') 19 | parser.add_argument('--noise_type', type=str, default='anti_iden', 20 | help='Type of additional noise [default: anti_iden]') 21 | parser.add_argument('--save_dir', type=str, default='../results', 22 | help='Path of root directory to save plots [default: save_dir]') 23 | parser.add_argument('--env_name', type=str, default='Pong', 24 | help='Name of Atari game') 25 | parser.add_argument('--num_timesteps', type=int, default=5e7, 26 | help='Number of timesteps') 27 | 28 | FLAGS = parser.parse_args() 29 | 30 | LOG_DIR = FLAGS.log_dir 31 | ALL = FLAGS.all 32 | WEIGHT = FLAGS.weight 33 | NOISE_TYPE = FLAGS.noise_type 34 | SAVE_DIR = FLAGS.save_dir 35 | ENV = FLAGS.env_name 36 | NUM_TIMESTEPS = FLAGS.num_timesteps 37 | 38 | assert (os.path.exists(LOG_DIR)) 39 | assert (NOISE_TYPE in ['norm_one', 'norm_all', 'anti_iden']) 40 | 41 | SAVE_DIR = os.path.join(SAVE_DIR, ENV) 42 | if not os.path.exists(SAVE_DIR): 43 | os.makedirs(SAVE_DIR) 44 | 45 | def visualize(): 46 | if ALL: 47 | weights_list = [0.1, 0.2, 0.3, 0.4, 48 | 0.6, 0.7, 0.8, 0.9] 49 | if NOISE_TYPE != "anti_iden": 50 | weights_list.append(0.5) 51 | else: 52 | weights_list = [WEIGHT] 53 | 54 | for weight in weights_list: 55 | print ("python -m baselines.results_compare --log_dir %s --task_name %s \ 56 | --weight %s --noise_type %s --num_timesteps %s --save_dir %s" % \ 57 | (LOG_DIR, ENV, str(weight), NOISE_TYPE, str(NUM_TIMESTEPS), SAVE_DIR)) 58 | os.system("python -m baselines.results_compare --log_dir %s --task_name %s \ 59 | --weight %s --noise_type %s --num_timesteps %s --save_dir %s" % \ 60 | (LOG_DIR, ENV, str(weight), NOISE_TYPE, str(NUM_TIMESTEPS), SAVE_DIR)) 61 | print (LOG_DIR, ENV, str(weight), NOISE_TYPE, str(NUM_TIMESTEPS), SAVE_DIR) 62 | #os.system("cd ..") 63 | 64 | if __name__ == "__main__": 65 | visualize() -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/a2c/runner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from baselines.a2c.utils import discount_with_dones 3 | from baselines.common.runners import AbstractEnvRunner 4 | 5 | class Runner(AbstractEnvRunner): 6 | 7 | def __init__(self, env, model, nsteps=5, gamma=0.99): 8 | super().__init__(env=env, model=model, nsteps=nsteps) 9 | self.gamma = gamma 10 | 
self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()] 11 | self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype 12 | 13 | def run(self): 14 | mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] 15 | mb_states = self.states 16 | for n in range(self.nsteps): 17 | actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones) 18 | mb_obs.append(np.copy(self.obs)) 19 | mb_actions.append(actions) 20 | mb_values.append(values) 21 | mb_dones.append(self.dones) 22 | obs, rewards, dones, _ = self.env.step(actions) 23 | # TODO: surrogate reward 24 | self.states = states 25 | self.dones = dones 26 | for n, done in enumerate(dones): 27 | if done: 28 | self.obs[n] = self.obs[n]*0 29 | self.obs = obs 30 | mb_rewards.append(rewards) 31 | mb_dones.append(self.dones) 32 | #batch of steps to batch of rollouts 33 | 34 | mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape) 35 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) 36 | mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0) 37 | mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) 38 | mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) 39 | mb_masks = mb_dones[:, :-1] 40 | mb_dones = mb_dones[:, 1:] 41 | 42 | 43 | if self.gamma > 0.0: 44 | #discount/bootstrap off value fn 45 | last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist() 46 | for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): 47 | rewards = rewards.tolist() 48 | dones = dones.tolist() 49 | if dones[-1] == 0: 50 | rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1] 51 | else: 52 | rewards = discount_with_dones(rewards, dones, self.gamma) 53 | 54 | mb_rewards[n] = rewards 55 | 56 | mb_actions = mb_actions.reshape(self.batch_action_shape) 57 | 58 | mb_rewards = mb_rewards.flatten() 59 | mb_values = mb_values.flatten() 60 | mb_masks = mb_masks.flatten() 61 | return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values 62 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from gym.spaces import np_random 4 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 5 | 6 | N_TRIALS = 10000 7 | N_EPISODES = 100 8 | 9 | def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): 10 | np.random.seed(0) 11 | np_random.seed(0) 12 | 13 | env = DummyVecEnv([env_fn]) 14 | 15 | 16 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): 17 | tf.set_random_seed(0) 18 | 19 | model = learn_fn(env) 20 | 21 | sum_rew = 0 22 | done = True 23 | 24 | for i in range(n_trials): 25 | if done: 26 | obs = env.reset() 27 | state = model.initial_state 28 | 29 | if state is not None: 30 | a, v, state, _ = model.step(obs, S=state, M=[False]) 31 | else: 32 | a, v, _, _ = model.step(obs) 33 | 34 | obs, rew, done, _ = env.step(a) 35 | sum_rew += float(rew) 36 | 37 | print("Reward in {} trials is {}".format(n_trials, sum_rew)) 38 | assert sum_rew > min_reward_fraction * n_trials, \ 39 | 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials) 40 | 41 | 42 | 43 | def 
reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES): 44 | env = DummyVecEnv([env_fn]) 45 | 46 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): 47 | model = learn_fn(env) 48 | 49 | N_TRIALS = 100 50 | 51 | observations, actions, rewards = rollout(env, model, N_TRIALS) 52 | rewards = [sum(r) for r in rewards] 53 | 54 | avg_rew = sum(rewards) / N_TRIALS 55 | print("Average reward in {} episodes is {}".format(n_trials, avg_rew)) 56 | assert avg_rew > min_avg_reward, \ 57 | 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward) 58 | 59 | def rollout(env, model, n_trials): 60 | rewards = [] 61 | actions = [] 62 | observations = [] 63 | 64 | for i in range(n_trials): 65 | obs = env.reset() 66 | state = model.initial_state 67 | episode_rew = [] 68 | episode_actions = [] 69 | episode_obs = [] 70 | 71 | while True: 72 | if state is not None: 73 | a, v, state, _ = model.step(obs, S=state, M=[False]) 74 | else: 75 | a,v, _, _ = model.step(obs) 76 | 77 | obs, rew, done, _ = env.step(a) 78 | 79 | episode_rew.append(rew) 80 | episode_actions.append(a) 81 | episode_obs.append(obs) 82 | 83 | if done: 84 | break 85 | 86 | rewards.append(episode_rew) 87 | actions.append(episode_actions) 88 | observations.append(episode_obs) 89 | 90 | return observations, actions, rewards 91 | 92 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import baselines.common.tf_util as U 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | class MpiAdam(object): 7 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 8 | self.var_list = var_list 9 | self.beta1 = beta1 10 | self.beta2 = beta2 11 | self.epsilon = epsilon 12 | self.scale_grad_by_procs = scale_grad_by_procs 13 | size = sum(U.numel(v) for v in var_list) 14 | self.m = np.zeros(size, 'float32') 15 | self.v = np.zeros(size, 'float32') 16 | self.t = 0 17 | self.setfromflat = U.SetFromFlat(var_list) 18 | self.getflat = U.GetFlat(var_list) 19 | self.comm = MPI.COMM_WORLD if comm is None else comm 20 | 21 | def update(self, localg, stepsize): 22 | if self.t % 100 == 0: 23 | self.check_synced() 24 | localg = localg.astype('float32') 25 | globalg = np.zeros_like(localg) 26 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 27 | if self.scale_grad_by_procs: 28 | globalg /= self.comm.Get_size() 29 | 30 | self.t += 1 31 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 32 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 33 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 34 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 35 | self.setfromflat(self.getflat() + step) 36 | 37 | def sync(self): 38 | theta = self.getflat() 39 | self.comm.Bcast(theta, root=0) 40 | self.setfromflat(theta) 41 | 42 | def check_synced(self): 43 | if self.comm.Get_rank() == 0: # this is root 44 | theta = self.getflat() 45 | self.comm.Bcast(theta, root=0) 46 | else: 47 | thetalocal = self.getflat() 48 | thetaroot = np.empty_like(thetalocal) 49 | self.comm.Bcast(thetaroot, root=0) 50 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 51 | 52 | @U.in_session 53 | def test_MpiAdam(): 54 | np.random.seed(0) 55 | tf.set_random_seed(0) 56 | 57 | a = 
tf.Variable(np.random.randn(3).astype('float32')) 58 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 59 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 60 | 61 | stepsize = 1e-2 62 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 63 | do_update = U.function([], loss, updates=[update_op]) 64 | 65 | tf.get_default_session().run(tf.global_variables_initializer()) 66 | for i in range(10): 67 | print(i,do_update()) 68 | 69 | tf.set_random_seed(0) 70 | tf.get_default_session().run(tf.global_variables_initializer()) 71 | 72 | var_list = [a,b] 73 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) 74 | adam = MpiAdam(var_list) 75 | 76 | for i in range(10): 77 | l,g = lossandgrad() 78 | adam.update(g, stepsize) 79 | print(i,l) -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/vec_env/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from collections import OrderedDict 4 | from . import VecEnv 5 | 6 | class DummyVecEnv(VecEnv): 7 | def __init__(self, env_fns): 8 | self.envs = [fn() for fn in env_fns] 9 | env = self.envs[0] 10 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 11 | shapes, dtypes = {}, {} 12 | self.keys = [] 13 | obs_space = env.observation_space 14 | 15 | if isinstance(obs_space, spaces.Dict): 16 | assert isinstance(obs_space.spaces, OrderedDict) 17 | subspaces = obs_space.spaces 18 | else: 19 | subspaces = {None: obs_space} 20 | 21 | for key, box in subspaces.items(): 22 | shapes[key] = box.shape 23 | dtypes[key] = box.dtype 24 | self.keys.append(key) 25 | 26 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } 27 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 28 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 29 | self.buf_infos = [{} for _ in range(self.num_envs)] 30 | self.actions = None 31 | 32 | def step_async(self, actions): 33 | listify = True 34 | try: 35 | if len(actions) == self.num_envs: 36 | listify = False 37 | except TypeError: 38 | pass 39 | 40 | if not listify: 41 | self.actions = actions 42 | else: 43 | assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs) 44 | self.actions = [actions] 45 | 46 | def step_wait(self): 47 | for e in range(self.num_envs): 48 | action = self.actions[e] 49 | if isinstance(self.envs[e].action_space, spaces.Discrete): 50 | action = int(action) 51 | 52 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action) 53 | if self.buf_dones[e]: 54 | obs = self.envs[e].reset() 55 | self._save_obs(e, obs) 56 | return (np.copy(self._obs_from_buf()), np.copy(self.buf_rews), np.copy(self.buf_dones), 57 | self.buf_infos.copy()) 58 | 59 | def reset(self): 60 | for e in range(self.num_envs): 61 | obs = self.envs[e].reset() 62 | self._save_obs(e, obs) 63 | return self._obs_from_buf() 64 | 65 | def close(self): 66 | return 67 | 68 | def render(self, mode='human'): 69 | return [e.render(mode=mode) for e in self.envs] 70 | 71 | def _save_obs(self, e, obs): 72 | for k in self.keys: 73 | if k is None: 74 | self.buf_obs[k][e] = obs 75 | else: 76 | self.buf_obs[k][e] = obs[k] 77 | 78 | def _obs_from_buf(self): 79 | if self.keys==[None]: 80 | return self.buf_obs[None] 81 | else: 82 | return 
self.buf_obs 83 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/filters.py: -------------------------------------------------------------------------------- 1 | from .running_stat import RunningStat 2 | from collections import deque 3 | import numpy as np 4 | 5 | class Filter(object): 6 | def __call__(self, x, update=True): 7 | raise NotImplementedError 8 | def reset(self): 9 | pass 10 | 11 | class IdentityFilter(Filter): 12 | def __call__(self, x, update=True): 13 | return x 14 | 15 | class CompositionFilter(Filter): 16 | def __init__(self, fs): 17 | self.fs = fs 18 | def __call__(self, x, update=True): 19 | for f in self.fs: 20 | x = f(x) 21 | return x 22 | def output_shape(self, input_space): 23 | out = input_space.shape 24 | for f in self.fs: 25 | out = f.output_shape(out) 26 | return out 27 | 28 | class ZFilter(Filter): 29 | """ 30 | y = (x-mean)/std 31 | using running estimates of mean,std 32 | """ 33 | 34 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 35 | self.demean = demean 36 | self.destd = destd 37 | self.clip = clip 38 | 39 | self.rs = RunningStat(shape) 40 | 41 | def __call__(self, x, update=True): 42 | if update: self.rs.push(x) 43 | if self.demean: 44 | x = x - self.rs.mean 45 | if self.destd: 46 | x = x / (self.rs.std+1e-8) 47 | if self.clip: 48 | x = np.clip(x, -self.clip, self.clip) 49 | return x 50 | def output_shape(self, input_space): 51 | return input_space.shape 52 | 53 | class AddClock(Filter): 54 | def __init__(self): 55 | self.count = 0 56 | def reset(self): 57 | self.count = 0 58 | def __call__(self, x, update=True): 59 | return np.append(x, self.count/100.0) 60 | def output_shape(self, input_space): 61 | return (input_space.shape[0]+1,) 62 | 63 | class FlattenFilter(Filter): 64 | def __call__(self, x, update=True): 65 | return x.ravel() 66 | def output_shape(self, input_space): 67 | return (int(np.prod(input_space.shape)),) 68 | 69 | class Ind2OneHotFilter(Filter): 70 | def __init__(self, n): 71 | self.n = n 72 | def __call__(self, x, update=True): 73 | out = np.zeros(self.n) 74 | out[x] = 1 75 | return out 76 | def output_shape(self, input_space): 77 | return (input_space.n,) 78 | 79 | class DivFilter(Filter): 80 | def __init__(self, divisor): 81 | self.divisor = divisor 82 | def __call__(self, x, update=True): 83 | return x / self.divisor 84 | def output_shape(self, input_space): 85 | return input_space.shape 86 | 87 | class StackFilter(Filter): 88 | def __init__(self, length): 89 | self.stack = deque(maxlen=length) 90 | def reset(self): 91 | self.stack.clear() 92 | def __call__(self, x, update=True): 93 | self.stack.append(x) 94 | while len(self.stack) < self.stack.maxlen: 95 | self.stack.append(x) 96 | return np.concatenate(self.stack, axis=-1) 97 | def output_shape(self, input_space): 98 | return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) 99 | -------------------------------------------------------------------------------- /gym-control/scripts/train-qlearn.sh: -------------------------------------------------------------------------------- 1 | for i in $(seq 1 3); 2 | do 3 | for log_dir in qlearn/$i 4 | do 5 | (python qlearn_cartpole.py --log_dir $log_dir)& 6 | 7 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy)& 8 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy)& 9 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy)& 10 | (python 
qlearn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy)& 11 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy)& 12 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy)& 13 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy)& 14 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy)& 15 | 16 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy --smooth True)& 17 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy --smooth True)& 18 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy --smooth True)& 19 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy --smooth True)& 20 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy --smooth True)& 21 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy --smooth True)& 22 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy --smooth True)& 23 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy --smooth True)& 24 | 25 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate)& 26 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate)& 27 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate)& 28 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate)& 29 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate)& 30 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate)& 31 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate)& 32 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate)& 33 | 34 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate --smooth True)& 35 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate --smooth True)& 36 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate --smooth True)& 37 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate --smooth True)& 38 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate --smooth True)& 39 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate --smooth True)& 40 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate --smooth True)& 41 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate --smooth True)& 42 | done 43 | done 44 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_segment_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 4 | 5 | 6 | def test_tree_set(): 7 | tree = SumSegmentTree(4) 8 | 9 | tree[2] = 1.0 10 | tree[3] = 3.0 11 | 12 | assert np.isclose(tree.sum(), 4.0) 13 | assert np.isclose(tree.sum(0, 2), 0.0) 14 | assert np.isclose(tree.sum(0, 3), 1.0) 15 | assert np.isclose(tree.sum(2, 3), 1.0) 16 | assert np.isclose(tree.sum(2, -1), 1.0) 
17 | assert np.isclose(tree.sum(2, 4), 4.0) 18 | 19 | 20 | def test_tree_set_overlap(): 21 | tree = SumSegmentTree(4) 22 | 23 | tree[2] = 1.0 24 | tree[2] = 3.0 25 | 26 | assert np.isclose(tree.sum(), 3.0) 27 | assert np.isclose(tree.sum(2, 3), 3.0) 28 | assert np.isclose(tree.sum(2, -1), 3.0) 29 | assert np.isclose(tree.sum(2, 4), 3.0) 30 | assert np.isclose(tree.sum(1, 2), 0.0) 31 | 32 | 33 | def test_prefixsum_idx(): 34 | tree = SumSegmentTree(4) 35 | 36 | tree[2] = 1.0 37 | tree[3] = 3.0 38 | 39 | assert tree.find_prefixsum_idx(0.0) == 2 40 | assert tree.find_prefixsum_idx(0.5) == 2 41 | assert tree.find_prefixsum_idx(0.99) == 2 42 | assert tree.find_prefixsum_idx(1.01) == 3 43 | assert tree.find_prefixsum_idx(3.00) == 3 44 | assert tree.find_prefixsum_idx(4.00) == 3 45 | 46 | 47 | def test_prefixsum_idx2(): 48 | tree = SumSegmentTree(4) 49 | 50 | tree[0] = 0.5 51 | tree[1] = 1.0 52 | tree[2] = 1.0 53 | tree[3] = 3.0 54 | 55 | assert tree.find_prefixsum_idx(0.00) == 0 56 | assert tree.find_prefixsum_idx(0.55) == 1 57 | assert tree.find_prefixsum_idx(0.99) == 1 58 | assert tree.find_prefixsum_idx(1.51) == 2 59 | assert tree.find_prefixsum_idx(3.00) == 3 60 | assert tree.find_prefixsum_idx(5.50) == 3 61 | 62 | 63 | def test_max_interval_tree(): 64 | tree = MinSegmentTree(4) 65 | 66 | tree[0] = 1.0 67 | tree[2] = 0.5 68 | tree[3] = 3.0 69 | 70 | assert np.isclose(tree.min(), 0.5) 71 | assert np.isclose(tree.min(0, 2), 1.0) 72 | assert np.isclose(tree.min(0, 3), 0.5) 73 | assert np.isclose(tree.min(0, -1), 0.5) 74 | assert np.isclose(tree.min(2, 4), 0.5) 75 | assert np.isclose(tree.min(3, 4), 3.0) 76 | 77 | tree[2] = 0.7 78 | 79 | assert np.isclose(tree.min(), 0.7) 80 | assert np.isclose(tree.min(0, 2), 1.0) 81 | assert np.isclose(tree.min(0, 3), 0.7) 82 | assert np.isclose(tree.min(0, -1), 0.7) 83 | assert np.isclose(tree.min(2, 4), 0.7) 84 | assert np.isclose(tree.min(3, 4), 3.0) 85 | 86 | tree[2] = 4.0 87 | 88 | assert np.isclose(tree.min(), 1.0) 89 | assert np.isclose(tree.min(0, 2), 1.0) 90 | assert np.isclose(tree.min(0, 3), 1.0) 91 | assert np.isclose(tree.min(0, -1), 1.0) 92 | assert np.isclose(tree.min(2, 4), 3.0) 93 | assert np.isclose(tree.min(2, 3), 4.0) 94 | assert np.isclose(tree.min(2, -1), 4.0) 95 | assert np.isclose(tree.min(3, 4), 3.0) 96 | 97 | 98 | if __name__ == '__main__': 99 | test_tree_set() 100 | test_tree_set_overlap() 101 | test_prefixsum_idx() 102 | test_prefixsum_idx2() 103 | test_max_interval_tree() 104 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_serialization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import pytest 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | from baselines.common.tests.envs.mnist_env import MnistEnv 8 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 9 | from baselines.run import get_learn_function 10 | from baselines.common.tf_util import make_session, get_session 11 | 12 | from functools import partial 13 | 14 | 15 | learn_kwargs = { 16 | 'deepq': {}, 17 | 'a2c': {}, 18 | 'acktr': {}, 19 | 'ppo2': {'nminibatches': 1, 'nsteps': 10}, 20 | 'trpo_mpi': {}, 21 | } 22 | 23 | network_kwargs = { 24 | 'mlp': {}, 25 | 'cnn': {'pad': 'SAME'}, 26 | 'lstm': {}, 27 | 'cnn_lnlstm': {'pad': 'SAME'} 28 | } 29 | 30 | 31 | @pytest.mark.parametrize("learn_fn", learn_kwargs.keys()) 32 | @pytest.mark.parametrize("network_fn", network_kwargs.keys()) 33 | def 
test_serialization(learn_fn, network_fn): 34 | ''' 35 | Test if the trained model can be serialized 36 | ''' 37 | 38 | 39 | if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']: 40 | # TODO make acktr work with recurrent policies 41 | # and test 42 | # github issue: https://github.com/openai/baselines/issues/194 43 | return 44 | 45 | env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)]) 46 | ob = env.reset().copy() 47 | learn = get_learn_function(learn_fn) 48 | 49 | kwargs = {} 50 | kwargs.update(network_kwargs[network_fn]) 51 | kwargs.update(learn_kwargs[learn_fn]) 52 | 53 | 54 | learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs) 55 | 56 | with tempfile.TemporaryDirectory() as td: 57 | model_path = os.path.join(td, 'serialization_test_model') 58 | 59 | with tf.Graph().as_default(), make_session().as_default(): 60 | model = learn(total_timesteps=100) 61 | model.save(model_path) 62 | mean1, std1 = _get_action_stats(model, ob) 63 | variables_dict1 = _serialize_variables() 64 | 65 | with tf.Graph().as_default(), make_session().as_default(): 66 | model = learn(total_timesteps=0, load_path=model_path) 67 | mean2, std2 = _get_action_stats(model, ob) 68 | variables_dict2 = _serialize_variables() 69 | 70 | for k, v in variables_dict1.items(): 71 | np.testing.assert_allclose(v, variables_dict2[k], atol=0.01, 72 | err_msg='saved and loaded variable {} value mismatch'.format(k)) 73 | 74 | np.testing.assert_allclose(mean1, mean2, atol=0.5) 75 | np.testing.assert_allclose(std1, std2, atol=0.5) 76 | 77 | 78 | 79 | def _serialize_variables(): 80 | sess = get_session() 81 | variables = tf.trainable_variables() 82 | values = sess.run(variables) 83 | return {var.name: value for var, value in zip(variables, values)} 84 | 85 | 86 | def _get_action_stats(model, ob): 87 | ntrials = 1000 88 | if model.initial_state is None or model.initial_state == []: 89 | actions = np.array([model.step(ob)[0] for _ in range(ntrials)]) 90 | else: 91 | actions = np.array([model.step(ob, S=model.initial_state, M=[False])[0] for _ in range(ntrials)]) 92 | 93 | mean = np.mean(actions, axis=0) 94 | std = np.std(actions, axis=0) 95 | 96 | return mean, std 97 | 98 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/mpi_util.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from mpi4py import MPI 3 | import os, numpy as np 4 | import platform 5 | import shutil 6 | import subprocess 7 | 8 | def sync_from_root(sess, variables, comm=None): 9 | """ 10 | Send the root node's parameters to every worker. 11 | Arguments: 12 | sess: the TensorFlow session. 13 | variables: all parameter variables including optimizer's 14 | """ 15 | if comm is None: comm = MPI.COMM_WORLD 16 | rank = comm.Get_rank() 17 | for var in variables: 18 | if rank == 0: 19 | comm.Bcast(sess.run(var)) 20 | else: 21 | import tensorflow as tf 22 | returned_var = np.empty(var.shape, dtype='float32') 23 | comm.Bcast(returned_var) 24 | sess.run(tf.assign(var, returned_var)) 25 | 26 | def gpu_count(): 27 | """ 28 | Count the GPUs on this machine. 29 | """ 30 | if shutil.which('nvidia-smi') is None: 31 | return 0 32 | output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']) 33 | return max(0, len(output.split(b'\n')) - 2) 34 | 35 | def setup_mpi_gpus(): 36 | """ 37 | Set CUDA_VISIBLE_DEVICES using MPI. 
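    For example (editor's note), with 4 GPUs on a node and 8 MPI ranks running on it,
    local ranks 0-7 are mapped to CUDA devices 0, 1, 2, 3, 0, 1, 2, 3 (local_rank % num_gpus).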
38 | """ 39 | num_gpus = gpu_count() 40 | if num_gpus == 0: 41 | return 42 | local_rank, _ = get_local_rank_size(MPI.COMM_WORLD) 43 | os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus) 44 | 45 | def get_local_rank_size(comm): 46 | """ 47 | Returns the rank of each process on its machine 48 | The processes on a given machine will be assigned ranks 49 | 0, 1, 2, ..., N-1, 50 | where N is the number of processes on this machine. 51 | 52 | Useful if you want to assign one gpu per machine 53 | """ 54 | this_node = platform.node() 55 | ranks_nodes = comm.allgather((comm.Get_rank(), this_node)) 56 | node2rankssofar = defaultdict(int) 57 | local_rank = None 58 | for (rank, node) in ranks_nodes: 59 | if rank == comm.Get_rank(): 60 | local_rank = node2rankssofar[node] 61 | node2rankssofar[node] += 1 62 | assert local_rank is not None 63 | return local_rank, node2rankssofar[this_node] 64 | 65 | def share_file(comm, path): 66 | """ 67 | Copies the file from rank 0 to all other ranks 68 | Puts it in the same place on all machines 69 | """ 70 | localrank, _ = get_local_rank_size(comm) 71 | if comm.Get_rank() == 0: 72 | with open(path, 'rb') as fh: 73 | data = fh.read() 74 | comm.bcast(data) 75 | else: 76 | data = comm.bcast(None) 77 | if localrank == 0: 78 | os.makedirs(os.path.dirname(path), exist_ok=True) 79 | with open(path, 'wb') as fh: 80 | fh.write(data) 81 | comm.Barrier() 82 | 83 | def dict_gather(comm, d, op='mean', assert_all_have_data=True): 84 | if comm is None: return d 85 | alldicts = comm.allgather(d) 86 | size = comm.size 87 | k2li = defaultdict(list) 88 | for d in alldicts: 89 | for (k,v) in d.items(): 90 | k2li[k].append(v) 91 | result = {} 92 | for (k,li) in k2li.items(): 93 | if assert_all_have_data: 94 | assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k) 95 | if op=='mean': 96 | result[k] = np.mean(li, axis=0) 97 | elif op=='sum': 98 | result[k] = np.sum(li, axis=0) 99 | else: 100 | assert 0, op 101 | return result 102 | -------------------------------------------------------------------------------- /gym-atari/scripts/train-alien.sh: -------------------------------------------------------------------------------- 1 | cd baselines 2 | 3 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.1 --weight=0.1 --normal=False --surrogate=False --noise_type=anti_iden)& 4 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=False --noise_type=anti_iden)& 5 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.3 --weight=0.3 --normal=False --surrogate=False --noise_type=anti_iden)& 6 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.4 --weight=0.4 --normal=False --surrogate=False --noise_type=anti_iden)& 7 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.6 --weight=0.6 --normal=False --surrogate=False --noise_type=anti_iden)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 
--save_path=logs-alien/alien/ppo2_50M_noisy_0.7 --weight=0.7 --normal=False --surrogate=False --noise_type=anti_iden)& 9 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.8 --weight=0.8 --normal=False --surrogate=False --noise_type=anti_iden)& 10 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.9 --weight=0.9 --normal=False --surrogate=False --noise_type=anti_iden)& 11 | 12 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.1 --weight=0.1 --normal=False --surrogate=True --noise_type=anti_iden)& 13 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.2 --weight=0.2 --normal=False --surrogate=True --noise_type=anti_iden)& 14 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.3 --weight=0.3 --normal=False --surrogate=True --noise_type=anti_iden)& 15 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.4 --weight=0.4 --normal=False --surrogate=True --noise_type=anti_iden)& 16 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.6 --weight=0.6 --normal=False --surrogate=True --noise_type=anti_iden)& 17 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.7 --weight=0.7 --normal=False --surrogate=True --noise_type=anti_iden)& 18 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.8 --weight=0.8 --normal=False --surrogate=True --noise_type=anti_iden)& 19 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.9 --weight=0.9 --normal=False --surrogate=True --noise_type=anti_iden)& 20 | 21 | cd .. 
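# Editor's note (illustrative, not part of the original script): with a single GPU you can
# run one configuration at a time instead of launching all jobs in parallel, e.g.
#   cd baselines
#   export CUDA_VISIBLE_DEVICES=0
#   python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 \
#     --save_path=logs-alien/alien/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False \
#     --surrogate=False --noise_type=anti_iden
#   cd ..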
22 | -------------------------------------------------------------------------------- /gym-atari/scripts/train-phoenix.sh: -------------------------------------------------------------------------------- 1 | cd baselines 2 | 3 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.1 --weight=0.1 --normal=False --surrogate=False --noise_type=anti_iden)& 4 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=False --noise_type=anti_iden)& 5 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.3 --weight=0.3 --normal=False --surrogate=False --noise_type=anti_iden)& 6 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.4 --weight=0.4 --normal=False --surrogate=False --noise_type=anti_iden)& 7 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.6 --weight=0.6 --normal=False --surrogate=False --noise_type=anti_iden)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.7 --weight=0.7 --normal=False --surrogate=False --noise_type=anti_iden)& 9 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.8 --weight=0.8 --normal=False --surrogate=False --noise_type=anti_iden)& 10 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.9 --weight=0.9 --normal=False --surrogate=False --noise_type=anti_iden)& 11 | 12 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.1 --weight=0.1 --normal=False --surrogate=True --noise_type=anti_iden)& 13 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.2 --weight=0.2 --normal=False --surrogate=True --noise_type=anti_iden)& 14 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.3 --weight=0.3 --normal=False --surrogate=True --noise_type=anti_iden)& 15 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.4 --weight=0.4 --normal=False --surrogate=True --noise_type=anti_iden)& 16 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.6 --weight=0.6 --normal=False --surrogate=True --noise_type=anti_iden)& 17 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 
--save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.7 --weight=0.7 --normal=False --surrogate=True --noise_type=anti_iden)&
18 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.8 --weight=0.8 --normal=False --surrogate=True --noise_type=anti_iden)&
19 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.9 --weight=0.9 --normal=False --surrogate=True --noise_type=anti_iden)&
20 | 
21 | cd ..
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RL with Perturbed Rewards
2 | 
3 | This is the TensorFlow implementation of [Reinforcement Learning with Perturbed Rewards](https://arxiv.org/abs/1810.01032) as described in the following AAAI 2020 paper (__Spotlight__):
4 | 
5 | ```
6 | @inproceedings{wang2020rlnoisy,
7 |   title={Reinforcement Learning with Perturbed Rewards},
8 |   author={Wang, Jingkang and Liu, Yang and Li, Bo},
9 |   booktitle={AAAI},
10 |   year={2020}
11 | }
12 | ```
13 | 
14 | The implementation is based on the [keras-rl](https://github.com/keras-rl/keras-rl) and [OpenAI baselines](https://github.com/openai/baselines) frameworks. Thanks to the original authors!
15 | 
16 | - `gym-control`: Classic control games
17 | - `gym-atari`: Atari-2600 games
18 | 
19 | ## Dependencies
20 | - python 3.5
21 | - tensorflow 1.10.0, keras 2.1.0
22 | - gym, scipy, joblib
23 | - progressbar2, mpi4py, cloudpickle, opencv-python, h5py, pandas
24 | 
25 | Note: make sure that you have successfully installed the baselines package and the other dependencies by following the steps below (we use [virtualenvwrapper](https://virtualenvwrapper.readthedocs.io/en/latest/) to create a virtual environment):
26 | ```
27 | mkvirtualenv rl-noisy --python=/usr/bin/python3
28 | pip install -r requirements.txt
29 | cd gym-atari/baselines
30 | pip install -e .
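# (editor's note) "pip install -e ." installs the bundled copy of OpenAI baselines in editable
# mode, so that the "python -m baselines.run" commands below pick up the modified code under
# gym-atari/baselines rather than a system-wide installation.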
31 | ```
32 | 
33 | ## Examples
34 | - Classic control (DQN on Cartpole)
35 | ```
36 | cd gym-control
37 | python dqn_cartpole.py                                          # true reward
38 | python dqn_cartpole.py --error_positive 0.1 --reward noisy      # perturbed reward
39 | python dqn_cartpole.py --error_positive 0.1 --reward surrogate  # surrogate reward (estimated)
40 | ```
41 | - Atari-2600 (PPO on Phoenix)
42 | ```
43 | cd gym-atari/baselines
44 | # true reward
45 | python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --normal=True
46 | # noisy reward
47 | python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 \
48 |     --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=False --noise_type=anti_iden
49 | # surrogate reward (estimated)
50 | python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 \
51 |     --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=True --noise_type=anti_iden
52 | ```
53 | 
54 | ## Reproduce the Results
55 | To reproduce all the results reported in the paper, please refer to the `scripts/` folders in `gym-control` and `gym-atari`:
56 | - `gym-control/scripts`
57 |   - Cartpole
58 |     - `train-cem.sh` (CEM)
59 |     - `train-dqn.sh` (DQN)
60 |     - `train-duel-dqn.sh` (Dueling-DQN)
61 |     - `train-qlearn.sh` (Q-Learning)
62 |     - `train-sarsa.sh` (Deep SARSA)
63 |   - Pendulum
64 |     - `train-ddpg.sh` (DDPG)
65 |     - `train-naf.sh` (NAF)
66 | - `gym-atari/scripts`
67 |   - `train-alien.sh` (Alien)
68 |   - `train-carnival.sh` (Carnival)
69 |   - `train-mspacman.sh` (MsPacman)
70 |   - `train-phoenix.sh` (Phoenix)
71 |   - `train-pong.sh` (Pong)
72 |   - `train-seaquest.sh` (Seaquest)
73 |   - `train-normal.sh` (Training with true rewards)
74 | 
75 | 
76 | If you have eight available GPUs (memory > 8 GB), you can directly run the `*.sh` scripts one at a time. Otherwise, follow the instructions in the scripts and launch the experiments individually. It usually takes one or two days (GTX-1080 Ti) to train a policy.
77 | ```
78 | cd gym-atari
79 | sh scripts/train-alien.sh
80 | ```
81 | The logs and models will be saved automatically. We provide `results_single.py` for getting the averaged scores:
82 | ```
83 | python -m baselines.results_single --log_dir logs-alien
84 | ```
85 | 
86 | ## Citation
87 | Please cite our paper if you use this code in your research work.
88 | 
89 | ## Questions/Bugs
90 | Please submit a Github issue or contact wangjk@cs.toronto.edu if you have any questions or find any bugs.
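## Appendix: Reward Perturbation Sketch
For intuition, the snippet below sketches the binary-reward special case of the noise model and the surrogate (unbiased) reward estimator described in the paper. This is an editor's illustration, not the repository's implementation: `e_plus` and `e_minus` stand for the flip probabilities that flags such as `--error_positive` and `--weight` configure (an assumption made for this example), and the actual code handles general confusion matrices.
```python
import numpy as np

def perturb(r, e_plus, e_minus, rng=np.random):
    """Flip a binary reward r in {-1, +1}: +1 is flipped w.p. e_plus, -1 w.p. e_minus."""
    flip = rng.rand() < (e_plus if r > 0 else e_minus)
    return -r if flip else r

def surrogate(r_obs, e_plus, e_minus):
    """Unbiased estimate of the true reward from the observed (possibly flipped) reward."""
    denom = 1.0 - e_plus - e_minus  # requires e_plus + e_minus < 1
    if r_obs > 0:
        return ((1.0 - e_minus) * r_obs - e_plus * (-r_obs)) / denom
    return ((1.0 - e_plus) * r_obs - e_minus * (-r_obs)) / denom
```
In expectation over the noise, `surrogate(perturb(r, ...), ...)` equals the true reward `r`, which is why training on surrogate rewards can approximate training on the clean rewards.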
91 | -------------------------------------------------------------------------------- /gym-atari/scripts/train-carnival.sh: -------------------------------------------------------------------------------- 1 | cd baselines 2 | 3 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.1 --weight=0.1 --normal=False --surrogate=False --noise_type=anti_iden)& 4 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=False --noise_type=anti_iden)& 5 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.3 --weight=0.3 --normal=False --surrogate=False --noise_type=anti_iden)& 6 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.4 --weight=0.4 --normal=False --surrogate=False --noise_type=anti_iden)& 7 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.6 --weight=0.6 --normal=False --surrogate=False --noise_type=anti_iden)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.7 --weight=0.7 --normal=False --surrogate=False --noise_type=anti_iden)& 9 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.8 --weight=0.8 --normal=False --surrogate=False --noise_type=anti_iden)& 10 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.9 --weight=0.9 --normal=False --surrogate=False --noise_type=anti_iden)& 11 | 12 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.1 --weight=0.1 --normal=False --surrogate=True --noise_type=anti_iden)& 13 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.2 --weight=0.2 --normal=False --surrogate=True --noise_type=anti_iden)& 14 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.3 --weight=0.3 --normal=False --surrogate=True --noise_type=anti_iden)& 15 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.4 --weight=0.4 --normal=False --surrogate=True --noise_type=anti_iden)& 16 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.6 --weight=0.6 --normal=False --surrogate=True --noise_type=anti_iden)& 17 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 
--env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.7 --weight=0.7 --normal=False --surrogate=True --noise_type=anti_iden)& 18 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.8 --weight=0.8 --normal=False --surrogate=True --noise_type=anti_iden)& 19 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.9 --weight=0.9 --normal=False --surrogate=True --noise_type=anti_iden)& 20 | 21 | cd .. 22 | -------------------------------------------------------------------------------- /gym-atari/scripts/train-mspacman.sh: -------------------------------------------------------------------------------- 1 | cd baselines 2 | 3 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.1 --weight=0.1 --normal=False --surrogate=False --noise_type=anti_iden)& 4 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=False --noise_type=anti_iden)& 5 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.3 --weight=0.3 --normal=False --surrogate=False --noise_type=anti_iden)& 6 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.4 --weight=0.4 --normal=False --surrogate=False --noise_type=anti_iden)& 7 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.6 --weight=0.6 --normal=False --surrogate=False --noise_type=anti_iden)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.7 --weight=0.7 --normal=False --surrogate=False --noise_type=anti_iden)& 9 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.8 --weight=0.8 --normal=False --surrogate=False --noise_type=anti_iden)& 10 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.9 --weight=0.9 --normal=False --surrogate=False --noise_type=anti_iden)& 11 | 12 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.1 --weight=0.1 --normal=False --surrogate=True --noise_type=anti_iden)& 13 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.2 --weight=0.2 --normal=False --surrogate=True --noise_type=anti_iden)& 14 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 
--save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.3 --weight=0.3 --normal=False --surrogate=True --noise_type=anti_iden)& 15 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.4 --weight=0.4 --normal=False --surrogate=True --noise_type=anti_iden)& 16 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.6 --weight=0.6 --normal=False --surrogate=True --noise_type=anti_iden)& 17 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.7 --weight=0.7 --normal=False --surrogate=True --noise_type=anti_iden)& 18 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.8 --weight=0.8 --normal=False --surrogate=True --noise_type=anti_iden)& 19 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.9 --weight=0.9 --normal=False --surrogate=True --noise_type=anti_iden)& 20 | 21 | cd .. 22 | -------------------------------------------------------------------------------- /gym-atari/scripts/train-seaquest.sh: -------------------------------------------------------------------------------- 1 | cd baselines 2 | 3 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.1 --weight=0.1 --normal=False --surrogate=False --noise_type=anti_iden)& 4 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=False --noise_type=anti_iden)& 5 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.3 --weight=0.3 --normal=False --surrogate=False --noise_type=anti_iden)& 6 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.4 --weight=0.4 --normal=False --surrogate=False --noise_type=anti_iden)& 7 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.6 --weight=0.6 --normal=False --surrogate=False --noise_type=anti_iden)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.7 --weight=0.7 --normal=False --surrogate=False --noise_type=anti_iden)& 9 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.8 --weight=0.8 --normal=False --surrogate=False --noise_type=anti_iden)& 10 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.9 --weight=0.9 
--normal=False --surrogate=False --noise_type=anti_iden)& 11 | 12 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.1 --weight=0.1 --normal=False --surrogate=True --noise_type=anti_iden)& 13 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.2 --weight=0.2 --normal=False --surrogate=True --noise_type=anti_iden)& 14 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.3 --weight=0.3 --normal=False --surrogate=True --noise_type=anti_iden)& 15 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.4 --weight=0.4 --normal=False --surrogate=True --noise_type=anti_iden)& 16 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.6 --weight=0.6 --normal=False --surrogate=True --noise_type=anti_iden)& 17 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.7 --weight=0.7 --normal=False --surrogate=True --noise_type=anti_iden)& 18 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.8 --weight=0.8 --normal=False --surrogate=True --noise_type=anti_iden)& 19 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.9 --weight=0.9 --normal=False --surrogate=True --noise_type=anti_iden)& 20 | 21 | cd .. 22 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/vec_env/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from baselines import logger 3 | 4 | class AlreadySteppingError(Exception): 5 | """ 6 | Raised when an asynchronous step is running while 7 | step_async() is called again. 8 | """ 9 | def __init__(self): 10 | msg = 'already running an async step' 11 | Exception.__init__(self, msg) 12 | 13 | class NotSteppingError(Exception): 14 | """ 15 | Raised when an asynchronous step is not running but 16 | step_wait() is called. 17 | """ 18 | def __init__(self): 19 | msg = 'not running an async step' 20 | Exception.__init__(self, msg) 21 | 22 | class VecEnv(ABC): 23 | """ 24 | An abstract asynchronous, vectorized environment. 25 | """ 26 | def __init__(self, num_envs, observation_space, action_space): 27 | self.num_envs = num_envs 28 | self.observation_space = observation_space 29 | self.action_space = action_space 30 | 31 | @abstractmethod 32 | def reset(self): 33 | """ 34 | Reset all the environments and return an array of 35 | observations, or a tuple of observation arrays. 36 | 37 | If step_async is still doing work, that work will 38 | be cancelled and step_wait() should not be called 39 | until step_async() is invoked again. 
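        For example (editor's note), a DummyVecEnv wrapping a single CartPole environment
        returns an array of shape (1, 4) here, i.e. (num_envs,) + observation_space.shape.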
40 | """ 41 | pass 42 | 43 | @abstractmethod 44 | def step_async(self, actions): 45 | """ 46 | Tell all the environments to start taking a step 47 | with the given actions. 48 | Call step_wait() to get the results of the step. 49 | 50 | You should not call this if a step_async run is 51 | already pending. 52 | """ 53 | pass 54 | 55 | @abstractmethod 56 | def step_wait(self): 57 | """ 58 | Wait for the step taken with step_async(). 59 | 60 | Returns (obs, rews, dones, infos): 61 | - obs: an array of observations, or a tuple of 62 | arrays of observations. 63 | - rews: an array of rewards 64 | - dones: an array of "episode done" booleans 65 | - infos: a sequence of info objects 66 | """ 67 | pass 68 | 69 | @abstractmethod 70 | def close(self): 71 | """ 72 | Clean up the environments' resources. 73 | """ 74 | pass 75 | 76 | def step(self, actions): 77 | self.step_async(actions) 78 | return self.step_wait() 79 | 80 | def render(self, mode='human'): 81 | logger.warn('Render not defined for %s'%self) 82 | 83 | @property 84 | def unwrapped(self): 85 | if isinstance(self, VecEnvWrapper): 86 | return self.venv.unwrapped 87 | else: 88 | return self 89 | 90 | class VecEnvWrapper(VecEnv): 91 | def __init__(self, venv, observation_space=None, action_space=None): 92 | self.venv = venv 93 | VecEnv.__init__(self, 94 | num_envs=venv.num_envs, 95 | observation_space=observation_space or venv.observation_space, 96 | action_space=action_space or venv.action_space) 97 | 98 | def step_async(self, actions): 99 | self.venv.step_async(actions) 100 | 101 | @abstractmethod 102 | def reset(self): 103 | pass 104 | 105 | @abstractmethod 106 | def step_wait(self): 107 | pass 108 | 109 | def close(self): 110 | return self.venv.close() 111 | 112 | def render(self): 113 | self.venv.render() 114 | 115 | class CloudpickleWrapper(object): 116 | """ 117 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 118 | """ 119 | def __init__(self, x): 120 | self.x = x 121 | def __getstate__(self): 122 | import cloudpickle 123 | return cloudpickle.dumps(self.x) 124 | def __setstate__(self, ob): 125 | import pickle 126 | self.x = pickle.loads(ob) 127 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/vec_env/subproc_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiprocessing import Process, Pipe 3 | from baselines.common.vec_env import VecEnv, CloudpickleWrapper 4 | from baselines.common.tile_images import tile_images 5 | 6 | 7 | def worker(remote, parent_remote, env_fn_wrapper): 8 | parent_remote.close() 9 | env = env_fn_wrapper.x() 10 | try: 11 | while True: 12 | cmd, data = remote.recv() 13 | if cmd == 'step': 14 | ob, reward, done, info = env.step(data) 15 | if done: 16 | ob = env.reset() 17 | remote.send((ob, reward, done, info)) 18 | elif cmd == 'reset': 19 | ob = env.reset() 20 | remote.send(ob) 21 | elif cmd == 'render': 22 | remote.send(env.render(mode='rgb_array')) 23 | elif cmd == 'close': 24 | remote.close() 25 | break 26 | elif cmd == 'get_spaces': 27 | remote.send((env.observation_space, env.action_space)) 28 | else: 29 | raise NotImplementedError 30 | except KeyboardInterrupt: 31 | print('SubprocVecEnv worker: got KeyboardInterrupt') 32 | finally: 33 | env.close() 34 | 35 | class SubprocVecEnv(VecEnv): 36 | def __init__(self, env_fns, spaces=None): 37 | """ 38 | envs: list of gym environments to run in subprocesses 39 | """ 40 | 
self.waiting = False 41 | self.closed = False 42 | nenvs = len(env_fns) 43 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 44 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 45 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 46 | for p in self.ps: 47 | p.daemon = True # if the main process crashes, we should not cause things to hang 48 | p.start() 49 | for remote in self.work_remotes: 50 | remote.close() 51 | 52 | self.remotes[0].send(('get_spaces', None)) 53 | observation_space, action_space = self.remotes[0].recv() 54 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 55 | 56 | def step_async(self, actions): 57 | for remote, action in zip(self.remotes, actions): 58 | remote.send(('step', action)) 59 | self.waiting = True 60 | 61 | def step_wait(self): 62 | results = [remote.recv() for remote in self.remotes] 63 | self.waiting = False 64 | obs, rews, dones, infos = zip(*results) 65 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 66 | 67 | def reset(self): 68 | for remote in self.remotes: 69 | remote.send(('reset', None)) 70 | return np.stack([remote.recv() for remote in self.remotes]) 71 | 72 | def reset_task(self): 73 | for remote in self.remotes: 74 | remote.send(('reset_task', None)) 75 | return np.stack([remote.recv() for remote in self.remotes]) 76 | 77 | def close(self): 78 | if self.closed: 79 | return 80 | if self.waiting: 81 | for remote in self.remotes: 82 | remote.recv() 83 | for remote in self.remotes: 84 | remote.send(('close', None)) 85 | for p in self.ps: 86 | p.join() 87 | self.closed = True 88 | 89 | def render(self, mode='human'): 90 | for pipe in self.remotes: 91 | pipe.send(('render', None)) 92 | imgs = [pipe.recv() for pipe in self.remotes] 93 | bigimg = tile_images(imgs) 94 | if mode == 'human': 95 | import cv2 96 | cv2.imshow('vecenv', bigimg[:,:,::-1]) 97 | cv2.waitKey(1) 98 | elif mode == 'rgb_array': 99 | return bigimg 100 | else: 101 | raise NotImplementedError -------------------------------------------------------------------------------- /gym-control/scripts/train-cem.sh: -------------------------------------------------------------------------------- 1 | for log_dir in logs_01 logs_02 logs_03 2 | do 3 | (export CUDA_VISIBLE_DEVICES=0 && python cem_cartpole.py --log_dir $log_dir)& 4 | 5 | (export CUDA_VISIBLE_DEVICES=0 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy)& 6 | (export CUDA_VISIBLE_DEVICES=1 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy)& 7 | (export CUDA_VISIBLE_DEVICES=2 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy)& 8 | (export CUDA_VISIBLE_DEVICES=3 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy)& 9 | (export CUDA_VISIBLE_DEVICES=4 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy)& 10 | (export CUDA_VISIBLE_DEVICES=5 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy)& 11 | (export CUDA_VISIBLE_DEVICES=6 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy)& 12 | (export CUDA_VISIBLE_DEVICES=7 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy)& 13 | 14 | (export CUDA_VISIBLE_DEVICES=0 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy --smooth True)& 15 | (export CUDA_VISIBLE_DEVICES=1 && python cem_cartpole.py --log_dir 
$log_dir --error_positive 0.2 --reward noisy --smooth True)& 16 | (export CUDA_VISIBLE_DEVICES=2 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy --smooth True)& 17 | (export CUDA_VISIBLE_DEVICES=3 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy --smooth True)& 18 | (export CUDA_VISIBLE_DEVICES=4 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy --smooth True)& 19 | (export CUDA_VISIBLE_DEVICES=5 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy --smooth True)& 20 | (export CUDA_VISIBLE_DEVICES=6 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy --smooth True)& 21 | (export CUDA_VISIBLE_DEVICES=7 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy --smooth True)& 22 | 23 | (export CUDA_VISIBLE_DEVICES=0 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate)& 24 | (export CUDA_VISIBLE_DEVICES=1 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate)& 25 | (export CUDA_VISIBLE_DEVICES=2 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate)& 26 | (export CUDA_VISIBLE_DEVICES=3 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate)& 27 | (export CUDA_VISIBLE_DEVICES=4 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate)& 28 | (export CUDA_VISIBLE_DEVICES=5 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate)& 29 | (export CUDA_VISIBLE_DEVICES=6 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate)& 30 | (export CUDA_VISIBLE_DEVICES=7 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate)& 31 | 32 | (export CUDA_VISIBLE_DEVICES=0 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate --smooth True)& 33 | (export CUDA_VISIBLE_DEVICES=1 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate --smooth True)& 34 | (export CUDA_VISIBLE_DEVICES=2 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate --smooth True)& 35 | (export CUDA_VISIBLE_DEVICES=3 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate --smooth True)& 36 | (export CUDA_VISIBLE_DEVICES=4 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate --smooth True)& 37 | (export CUDA_VISIBLE_DEVICES=5 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate --smooth True)& 38 | (export CUDA_VISIBLE_DEVICES=6 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate --smooth True)& 39 | (export CUDA_VISIBLE_DEVICES=7 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate --smooth True)& 40 | done 41 | -------------------------------------------------------------------------------- /gym-control/scripts/train-dqn.sh: -------------------------------------------------------------------------------- 1 | for log_dir in logs_01 logs_02 logs_03 2 | do 3 | (export CUDA_VISIBLE_DEVICES=0 && python dqn_cartpole.py --log_dir $log_dir)& 4 | 5 | (export CUDA_VISIBLE_DEVICES=0 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy)& 6 | (export CUDA_VISIBLE_DEVICES=1 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy)& 7 | (export 
CUDA_VISIBLE_DEVICES=2 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy)& 8 | (export CUDA_VISIBLE_DEVICES=3 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy)& 9 | (export CUDA_VISIBLE_DEVICES=4 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy)& 10 | (export CUDA_VISIBLE_DEVICES=5 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy)& 11 | (export CUDA_VISIBLE_DEVICES=6 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy)& 12 | (export CUDA_VISIBLE_DEVICES=7 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy)& 13 | 14 | (export CUDA_VISIBLE_DEVICES=0 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy --smooth True)& 15 | (export CUDA_VISIBLE_DEVICES=1 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy --smooth True)& 16 | (export CUDA_VISIBLE_DEVICES=2 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy --smooth True)& 17 | (export CUDA_VISIBLE_DEVICES=3 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy --smooth True)& 18 | (export CUDA_VISIBLE_DEVICES=4 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy --smooth True)& 19 | (export CUDA_VISIBLE_DEVICES=5 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy --smooth True)& 20 | (export CUDA_VISIBLE_DEVICES=6 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy --smooth True)& 21 | (export CUDA_VISIBLE_DEVICES=7 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy --smooth True)& 22 | 23 | (export CUDA_VISIBLE_DEVICES=0 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate)& 24 | (export CUDA_VISIBLE_DEVICES=1 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate)& 25 | (export CUDA_VISIBLE_DEVICES=2 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate)& 26 | (export CUDA_VISIBLE_DEVICES=3 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate)& 27 | (export CUDA_VISIBLE_DEVICES=4 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate)& 28 | (export CUDA_VISIBLE_DEVICES=5 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate)& 29 | (export CUDA_VISIBLE_DEVICES=6 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate)& 30 | (export CUDA_VISIBLE_DEVICES=7 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate)& 31 | 32 | (export CUDA_VISIBLE_DEVICES=0 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate --smooth True)& 33 | (export CUDA_VISIBLE_DEVICES=1 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate --smooth True)& 34 | (export CUDA_VISIBLE_DEVICES=2 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate --smooth True)& 35 | (export CUDA_VISIBLE_DEVICES=3 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate --smooth True)& 36 | (export CUDA_VISIBLE_DEVICES=4 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate --smooth True)& 37 | (export CUDA_VISIBLE_DEVICES=5 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 
--reward surrogate --smooth True)& 38 | (export CUDA_VISIBLE_DEVICES=6 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate --smooth True)& 39 | (export CUDA_VISIBLE_DEVICES=7 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate --smooth True)& 40 | done 41 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/mpi_running_mean_std.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import tensorflow as tf, baselines.common.tf_util as U, numpy as np 3 | 4 | class RunningMeanStd(object): 5 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 6 | def __init__(self, epsilon=1e-2, shape=()): 7 | 8 | self._sum = tf.get_variable( 9 | dtype=tf.float64, 10 | shape=shape, 11 | initializer=tf.constant_initializer(0.0), 12 | name="runningsum", trainable=False) 13 | self._sumsq = tf.get_variable( 14 | dtype=tf.float64, 15 | shape=shape, 16 | initializer=tf.constant_initializer(epsilon), 17 | name="runningsumsq", trainable=False) 18 | self._count = tf.get_variable( 19 | dtype=tf.float64, 20 | shape=(), 21 | initializer=tf.constant_initializer(epsilon), 22 | name="count", trainable=False) 23 | self.shape = shape 24 | 25 | self.mean = tf.to_float(self._sum / self._count) 26 | self.std = tf.sqrt( tf.maximum( tf.to_float(self._sumsq / self._count) - tf.square(self.mean) , 1e-2 )) 27 | 28 | newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') 29 | newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') 30 | newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') 31 | self.incfiltparams = U.function([newsum, newsumsq, newcount], [], 32 | updates=[tf.assign_add(self._sum, newsum), 33 | tf.assign_add(self._sumsq, newsumsq), 34 | tf.assign_add(self._count, newcount)]) 35 | 36 | 37 | def update(self, x): 38 | x = x.astype('float64') 39 | n = int(np.prod(self.shape)) 40 | totalvec = np.zeros(n*2+1, 'float64') 41 | addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')]) 42 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 43 | self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n]) 44 | 45 | @U.in_session 46 | def test_runningmeanstd(): 47 | for (x1, x2, x3) in [ 48 | (np.random.randn(3), np.random.randn(4), np.random.randn(5)), 49 | (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), 50 | ]: 51 | 52 | rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) 53 | U.initialize() 54 | 55 | x = np.concatenate([x1, x2, x3], axis=0) 56 | ms1 = [x.mean(axis=0), x.std(axis=0)] 57 | rms.update(x1) 58 | rms.update(x2) 59 | rms.update(x3) 60 | ms2 = [rms.mean.eval(), rms.std.eval()] 61 | 62 | assert np.allclose(ms1, ms2) 63 | 64 | @U.in_session 65 | def test_dist(): 66 | np.random.seed(0) 67 | p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) 68 | q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) 69 | 70 | # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) 71 | # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) 72 | 73 | comm = MPI.COMM_WORLD 74 | assert comm.Get_size()==2 75 | if comm.Get_rank()==0: 76 | x1,x2,x3 = p1,p2,p3 77 | elif comm.Get_rank()==1: 78 | x1,x2,x3 = q1,q2,q3 79 | else: 80 | assert False 81 | 82 | rms = 
RunningMeanStd(epsilon=0.0, shape=(1,)) 83 | U.initialize() 84 | 85 | rms.update(x1) 86 | rms.update(x2) 87 | rms.update(x3) 88 | 89 | bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) 90 | 91 | def checkallclose(x,y): 92 | print(x,y) 93 | return np.allclose(x,y) 94 | 95 | assert checkallclose( 96 | bigvec.mean(axis=0), 97 | rms.mean.eval(), 98 | ) 99 | assert checkallclose( 100 | bigvec.std(axis=0), 101 | rms.std.eval(), 102 | ) 103 | 104 | 105 | if __name__ == "__main__": 106 | # Run with mpirun -np 2 python 107 | test_dist() 108 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon greedy exploration strategy 5 | - beta parameter for beta parameter in prioritized replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meanining that schedule should output 44 | `value` when `t==time`. All the values for time must be sorted in 45 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes value to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals sepecified in 55 | `endpoints` this value is returned. If None then AssertionError is 56 | raised when outside value is requested. 57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t and t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 
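        # i.e. t lies outside every endpoint interval, so fall back to the configured outside_value.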
72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps pass final_p is 80 | returned. 81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /gym-control/scripts/train-sarsa.sh: -------------------------------------------------------------------------------- 1 | for log_dir in logs_01 logs_02 logs_03 2 | do 3 | (export CUDA_VISIBLE_DEVICES=0 && python sarsa_cartpole.py --log_dir $log_dir)& 4 | 5 | (export CUDA_VISIBLE_DEVICES=0 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy)& 6 | (export CUDA_VISIBLE_DEVICES=1 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy)& 7 | (export CUDA_VISIBLE_DEVICES=2 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy)& 8 | (export CUDA_VISIBLE_DEVICES=3 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy)& 9 | (export CUDA_VISIBLE_DEVICES=4 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy)& 10 | (export CUDA_VISIBLE_DEVICES=5 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy)& 11 | (export CUDA_VISIBLE_DEVICES=6 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy)& 12 | (export CUDA_VISIBLE_DEVICES=7 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy)& 13 | 14 | (export CUDA_VISIBLE_DEVICES=0 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy --smooth True)& 15 | (export CUDA_VISIBLE_DEVICES=1 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy --smooth True)& 16 | (export CUDA_VISIBLE_DEVICES=2 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy --smooth True)& 17 | (export CUDA_VISIBLE_DEVICES=3 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy --smooth True)& 18 | (export CUDA_VISIBLE_DEVICES=4 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy --smooth True)& 19 | (export CUDA_VISIBLE_DEVICES=5 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy --smooth True)& 20 | (export CUDA_VISIBLE_DEVICES=6 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy --smooth True)& 21 | (export CUDA_VISIBLE_DEVICES=7 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy --smooth True)& 22 | 23 | (export CUDA_VISIBLE_DEVICES=0 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate)& 24 | (export CUDA_VISIBLE_DEVICES=1 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate)& 25 | (export 
CUDA_VISIBLE_DEVICES=2 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate)& 26 | (export CUDA_VISIBLE_DEVICES=3 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate)& 27 | (export CUDA_VISIBLE_DEVICES=4 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate)& 28 | (export CUDA_VISIBLE_DEVICES=5 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate)& 29 | (export CUDA_VISIBLE_DEVICES=6 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate)& 30 | (export CUDA_VISIBLE_DEVICES=7 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate)& 31 | 32 | (export CUDA_VISIBLE_DEVICES=0 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate --smooth True)& 33 | (export CUDA_VISIBLE_DEVICES=1 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate --smooth True)& 34 | (export CUDA_VISIBLE_DEVICES=2 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate --smooth True)& 35 | (export CUDA_VISIBLE_DEVICES=3 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate --smooth True)& 36 | (export CUDA_VISIBLE_DEVICES=4 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate --smooth True)& 37 | (export CUDA_VISIBLE_DEVICES=5 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate --smooth True)& 38 | (export CUDA_VISIBLE_DEVICES=6 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate --smooth True)& 39 | (export CUDA_VISIBLE_DEVICES=7 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate --smooth True)& 40 | done 41 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/results_single.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import glob 4 | import numpy as np 5 | import matplotlib 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | sns.set() 9 | sns.set_color_codes() 10 | 11 | from baselines.bench.monitor import load_results 12 | 13 | matplotlib.rcParams.update({'font.size': 30}) 14 | 15 | X_TIMESTEPS = 'timesteps' 16 | X_EPISODES = 'episodes' 17 | X_WALLTIME = 'walltime_hrs' 18 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 19 | EPISODES_WINDOW = 100 20 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 21 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 22 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] 23 | 24 | def rolling_window(a, window): 25 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 26 | strides = a.strides + (a.strides[-1],) 27 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 28 | 29 | def window_func(x, y, window, func): 30 | yw = rolling_window(y, window) 31 | yw_func = func(yw, axis=-1) 32 | return x[window-1:], yw_func 33 | 34 | def ts2xy(ts, xaxis): 35 | if xaxis == X_TIMESTEPS: 36 | x = np.cumsum(ts.l.values) 37 | y = ts.r.values 38 | elif xaxis == X_EPISODES: 39 | x = np.arange(len(ts)) 40 | y = ts.r.values 41 | elif xaxis == X_WALLTIME: 42 | x = ts.t.values / 3600. 
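        # the monitor's 't' column holds elapsed seconds, so dividing by 3600 plots wall-clock hours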
43 | y = ts.r.values 44 | else: 45 | raise NotImplementedError 46 | return x, y 47 | 48 | 49 | def plot_results_single(ax, input_dir, num_timesteps, xaxis): 50 | ts = load_results(input_dir) 51 | ts = ts[ts.l.cumsum() <= num_timesteps] 52 | xy_list = ts2xy(ts, xaxis) 53 | 54 | x = xy_list[0] 55 | y = xy_list[1] 56 | ax.plot(x, y, alpha=0.4, linewidth=0.8, c=sns.color_palette()[0]) 57 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes 58 | print ("avg_100: %.1f" % np.mean(y_mean[-100:])) 59 | ax.plot(x, y_mean, linewidth=0.8, c=sns.color_palette()[0], label='normal') 60 | 61 | # plt.set_title(title) 62 | # ax.set_ylabel("Episode Rewards") 63 | # ax.legend() 64 | # plt.tight_layout() 65 | 66 | 67 | def main(): 68 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 69 | parser.add_argument('--log_dir', help='Path of log directory', default='logs') 70 | parser.add_argument('--num_timesteps', type=int, default=int(5e7)) 71 | parser.add_argument('--xaxis', help='Varible on X-axis', default = X_TIMESTEPS) 72 | parser.add_argument('--task_name', help='Name of atari game', default='Pong') 73 | parser.add_argument('--save_dir', help = 'Directory of output plots', default='../results') 74 | parser.add_argument('--noise_type', type=str, help='noise type (norm_one/norm_all/anti_iden)', 75 | default='anti_iden') 76 | parser.add_argument('--plot_normal', type=str, help='whether to plot baseline with normal rewards') 77 | args = parser.parse_args() 78 | 79 | args.save_dir = os.path.join(args.save_dir, "paper") 80 | if not os.path.exists(args.save_dir): 81 | os.makedirs(args.save_dir) 82 | 83 | dirs = glob.glob(os.path.join(args.log_dir, "openai*")) 84 | dirs = sorted(dirs) 85 | 86 | for input_dir in dirs: 87 | 88 | with open(os.path.join(input_dir, "setting.txt"), "r") as f: 89 | line = f.readlines()[-1].rstrip() 90 | # normal = line.split()[1][0:-1].split(',')[0] 91 | weight = float(line.split()[3][0:-1].split(',')[0]) 92 | surrogate = line.split()[5][0:-1].split(',')[0] 93 | # noise_type = line.split()[7][0:-1].split(')')[0] 94 | if weight in [0.1, 0.3, 0.7, 0.9] and surrogate == 'True': 95 | print ("-" * 20) 96 | print (line) 97 | plot_results_single(plt, input_dir, args.num_timesteps, args.xaxis) 98 | print ("-" * 20) 99 | 100 | if __name__ == '__main__': 101 | main() 102 | -------------------------------------------------------------------------------- /gym-control/scripts/train-duel-dqn.sh: -------------------------------------------------------------------------------- 1 | for log_dir in logs_01 logs_02 logs_03 2 | do 3 | (export CUDA_VISIBLE_DEVICES=0 && python duel_dqn_cartpole.py --log_dir $log_dir)& 4 | 5 | (export CUDA_VISIBLE_DEVICES=0 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy)& 6 | (export CUDA_VISIBLE_DEVICES=1 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy)& 7 | (export CUDA_VISIBLE_DEVICES=2 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy)& 8 | (export CUDA_VISIBLE_DEVICES=3 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy)& 9 | (export CUDA_VISIBLE_DEVICES=4 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy)& 10 | (export CUDA_VISIBLE_DEVICES=5 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy)& 11 | (export CUDA_VISIBLE_DEVICES=6 && python duel_dqn_cartpole.py 
--log_dir $log_dir --error_positive 0.8 --reward noisy)& 12 | (export CUDA_VISIBLE_DEVICES=7 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy)& 13 | 14 | (export CUDA_VISIBLE_DEVICES=0 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy --smooth True)& 15 | (export CUDA_VISIBLE_DEVICES=1 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy --smooth True)& 16 | (export CUDA_VISIBLE_DEVICES=2 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy --smooth True)& 17 | (export CUDA_VISIBLE_DEVICES=3 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy --smooth True)& 18 | (export CUDA_VISIBLE_DEVICES=4 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy --smooth True)& 19 | (export CUDA_VISIBLE_DEVICES=5 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy --smooth True)& 20 | (export CUDA_VISIBLE_DEVICES=6 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy --smooth True)& 21 | (export CUDA_VISIBLE_DEVICES=7 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy --smooth True)& 22 | 23 | (export CUDA_VISIBLE_DEVICES=0 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate)& 24 | (export CUDA_VISIBLE_DEVICES=1 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate)& 25 | (export CUDA_VISIBLE_DEVICES=2 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate)& 26 | (export CUDA_VISIBLE_DEVICES=3 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate)& 27 | (export CUDA_VISIBLE_DEVICES=4 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate)& 28 | (export CUDA_VISIBLE_DEVICES=5 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate)& 29 | (export CUDA_VISIBLE_DEVICES=6 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate)& 30 | (export CUDA_VISIBLE_DEVICES=7 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate)& 31 | 32 | (export CUDA_VISIBLE_DEVICES=0 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate --smooth True)& 33 | (export CUDA_VISIBLE_DEVICES=1 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate --smooth True)& 34 | (export CUDA_VISIBLE_DEVICES=2 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate --smooth True)& 35 | (export CUDA_VISIBLE_DEVICES=3 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate --smooth True)& 36 | (export CUDA_VISIBLE_DEVICES=4 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate --smooth True)& 37 | (export CUDA_VISIBLE_DEVICES=5 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate --smooth True)& 38 | (export CUDA_VISIBLE_DEVICES=6 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate --smooth True)& 39 | (export CUDA_VISIBLE_DEVICES=7 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate --smooth True)& 40 | done 41 | 
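# Note: each per-GPU block above enumerates the error rates 0.1-0.4 and 0.6-0.9 by hand.
# An equivalent loop (a sketch only, reusing the same duel_dqn_cartpole.py flags as above) would be:
#   gpu=0
#   for err in 0.1 0.2 0.3 0.4 0.6 0.7 0.8 0.9; do
#     (export CUDA_VISIBLE_DEVICES=$gpu && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive $err --reward surrogate --smooth True)&
#     gpu=$(( (gpu + 1) % 8 ))
#   done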
-------------------------------------------------------------------------------- /gym-atari/baselines/baselines/results_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode 4 | 5 | import matplotlib.pyplot as plt 6 | plt.rcParams['svg.fonttype'] = 'none' 7 | 8 | from baselines.bench.monitor import load_results 9 | 10 | X_TIMESTEPS = 'timesteps' 11 | X_EPISODES = 'episodes' 12 | X_WALLTIME = 'walltime_hrs' 13 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 14 | EPISODES_WINDOW = 100 15 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 16 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 17 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] 18 | 19 | def rolling_window(a, window): 20 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 21 | strides = a.strides + (a.strides[-1],) 22 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 23 | 24 | def window_func(x, y, window, func): 25 | yw = rolling_window(y, window) 26 | yw_func = func(yw, axis=-1) 27 | return x[window-1:], yw_func 28 | 29 | def ts2xy(ts, xaxis): 30 | if xaxis == X_TIMESTEPS: 31 | x = np.cumsum(ts.l.values) 32 | y = ts.r.values 33 | elif xaxis == X_EPISODES: 34 | x = np.arange(len(ts)) 35 | y = ts.r.values 36 | elif xaxis == X_WALLTIME: 37 | x = ts.t.values / 3600. 38 | y = ts.r.values 39 | else: 40 | raise NotImplementedError 41 | return x, y 42 | 43 | def plot_curves(xy_list, xaxis, title): 44 | plt.figure(figsize=(8,2)) 45 | maxx = max(xy[0][-1] for xy in xy_list) 46 | minx = 0 47 | for (i, (x, y)) in enumerate(xy_list): 48 | color = COLORS[i] 49 | plt.scatter(x, y, s=2) 50 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes 51 | plt.plot(x, y_mean, color=color) 52 | plt.xlim(minx, maxx) 53 | plt.title(title) 54 | plt.xlabel(xaxis) 55 | plt.ylabel("Episode Rewards") 56 | plt.tight_layout() 57 | 58 | def plot_results(dirs, num_timesteps, xaxis, task_name): 59 | tslist = [] 60 | for dir in dirs: 61 | ts = load_results(dir) 62 | ts = ts[ts.l.cumsum() <= num_timesteps] 63 | tslist.append(ts) 64 | xy_list = [ts2xy(ts, xaxis) for ts in tslist] 65 | plot_curves(xy_list, xaxis, task_name) 66 | 67 | # Example usage in jupyter-notebook 68 | # from baselines import log_viewer 69 | # %matplotlib inline 70 | # log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout") 71 | # Here ./log is a directory containing the monitor.csv files 72 | 73 | def main(): 74 | import argparse 75 | import os 76 | import glob 77 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 78 | parser.add_argument('--log_dir', help='Path of log directory', default='logs') 79 | parser.add_argument('--num_timesteps', type=int, default=int(10e6)) 80 | parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS) 81 | parser.add_argument('--task_name', help = 'Title of plot', default = 'PongNoFrameskip-v4') 82 | parser.add_argument('--weight', help = 'Weight of noise', default = 0.2, type=float) 83 | parser.add_argument('--save_dir', help = 'Didrectory of output plots', default = 'results') 84 | args = parser.parse_args() 85 | 86 | if not os.path.exists(args.save_dir): 87 | os.makedirs(args.save_dir) 88 | 89 | dirs = 
glob.glob(os.path.join(args.log_dir, "*")) 90 | dirs = sorted(dirs) 91 | cnt = 0 92 | for directory in dirs: 93 | print (directory) 94 | with open(os.path.join(directory, "setting.txt"), "r") as f: 95 | line = f.readlines()[-1].rstrip() 96 | print (line.split()) 97 | normal = line.split()[1][0:-1].split(',')[0] 98 | weight = float(line.split()[3][0:-1].split(',')[0]) 99 | surrogate = line.split()[5][0:-1].split(',')[0] 100 | noise_type = line.split()[7][0:-1].split(')')[0] 101 | print (normal, weight, surrogate, noise_type) 102 | if normal == 'True': 103 | title = args.task_name + " (normal)" 104 | elif surrogate == 'False': 105 | title = args.task_name + " (noisy-" + str(weight) + "-" + noise_type + ")" 106 | else: 107 | title = args.task_name + " (surrogate-" + str(weight) + "-" + noise_type + ")" 108 | 109 | print (weight, args.weight) 110 | if weight == args.weight: 111 | print (args.weight) 112 | plot_results([directory], args.num_timesteps, args.xaxis, title) 113 | plt.savefig(os.path.join(args.save_dir, title + ".png")) 114 | cnt += 1 115 | print (cnt) 116 | 117 | 118 | if __name__ == '__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /gym-control/rl/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from keras.models import model_from_config, Sequential, Model 4 | import keras.optimizers as optimizers 5 | import keras.backend as K 6 | 7 | 8 | def clone_model(model, custom_objects={}): 9 | # Requires Keras 1.0.7 since get_config has breaking changes. 10 | config = { 11 | 'class_name': model.__class__.__name__, 12 | 'config': model.get_config(), 13 | } 14 | clone = model_from_config(config, custom_objects=custom_objects) 15 | clone.set_weights(model.get_weights()) 16 | return clone 17 | 18 | 19 | def clone_optimizer(optimizer): 20 | if type(optimizer) is str: 21 | return optimizers.get(optimizer) 22 | # Requires Keras 1.0.7 since get_config has breaking changes. 23 | params = dict([(k, v) for k, v in optimizer.get_config().items()]) 24 | config = { 25 | 'class_name': optimizer.__class__.__name__, 26 | 'config': params, 27 | } 28 | if hasattr(optimizers, 'optimizer_from_config'): 29 | # COMPATIBILITY: Keras < 2.0 30 | clone = optimizers.optimizer_from_config(config) 31 | else: 32 | clone = optimizers.deserialize(config) 33 | return clone 34 | 35 | 36 | def get_soft_target_model_updates(target, source, tau): 37 | target_weights = target.trainable_weights + sum([l.non_trainable_weights for l in target.layers], []) 38 | source_weights = source.trainable_weights + sum([l.non_trainable_weights for l in source.layers], []) 39 | assert len(target_weights) == len(source_weights) 40 | 41 | # Create updates. 42 | updates = [] 43 | for tw, sw in zip(target_weights, source_weights): 44 | updates.append((tw, tau * sw + (1. - tau) * tw)) 45 | return updates 46 | 47 | 48 | def get_object_config(o): 49 | if o is None: 50 | return None 51 | 52 | config = { 53 | 'class_name': o.__class__.__name__, 54 | 'config': o.get_config() 55 | } 56 | return config 57 | 58 | 59 | def huber_loss(y_true, y_pred, clip_value): 60 | # Huber loss, see https://en.wikipedia.org/wiki/Huber_loss and 61 | # https://medium.com/@karpathy/yes-you-should-understand-backprop-e2f06eab496b 62 | # for details. 63 | assert clip_value > 0.
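    # The Huber loss is quadratic for small errors and linear for large ones:
    #   0.5 * x**2                              if |x| <= clip_value
    #   clip_value * (|x| - 0.5 * clip_value)   otherwise
    # which is what the squared_loss / linear_loss branches below compute.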
64 | 65 | x = y_true - y_pred 66 | if np.isinf(clip_value): 67 | # Special case for infinity since Tensorflow does have problems 68 | # if we compare `K.abs(x) < np.inf`. 69 | return .5 * K.square(x) 70 | 71 | condition = K.abs(x) < clip_value 72 | squared_loss = .5 * K.square(x) 73 | linear_loss = clip_value * (K.abs(x) - .5 * clip_value) 74 | if K.backend() == 'tensorflow': 75 | import tensorflow as tf 76 | if hasattr(tf, 'select'): 77 | return tf.select(condition, squared_loss, linear_loss) # condition, true, false 78 | else: 79 | return tf.where(condition, squared_loss, linear_loss) # condition, true, false 80 | elif K.backend() == 'theano': 81 | from theano import tensor as T 82 | return T.switch(condition, squared_loss, linear_loss) 83 | else: 84 | raise RuntimeError('Unknown backend "{}".'.format(K.backend())) 85 | 86 | 87 | class AdditionalUpdatesOptimizer(optimizers.Optimizer): 88 | def __init__(self, optimizer, additional_updates): 89 | super(AdditionalUpdatesOptimizer, self).__init__() 90 | self.optimizer = optimizer 91 | self.additional_updates = additional_updates 92 | 93 | def get_updates(self, params, loss): 94 | updates = self.optimizer.get_updates(params=params, loss=loss) 95 | updates += self.additional_updates 96 | self.updates = updates 97 | return self.updates 98 | 99 | def get_config(self): 100 | return self.optimizer.get_config() 101 | 102 | 103 | # Based on https://github.com/openai/baselines/blob/master/baselines/common/mpi_running_mean_std.py 104 | class WhiteningNormalizer(object): 105 | def __init__(self, shape, eps=1e-2, dtype=np.float64): 106 | self.eps = eps 107 | self.shape = shape 108 | self.dtype = dtype 109 | 110 | self._sum = np.zeros(shape, dtype=dtype) 111 | self._sumsq = np.zeros(shape, dtype=dtype) 112 | self._count = 0 113 | 114 | self.mean = np.zeros(shape, dtype=dtype) 115 | self.std = np.ones(shape, dtype=dtype) 116 | 117 | def normalize(self, x): 118 | return (x - self.mean) / self.std 119 | 120 | def denormalize(self, x): 121 | return self.std * x + self.mean 122 | 123 | def update(self, x): 124 | if x.ndim == len(self.shape): 125 | x = x.reshape(-1, *self.shape) 126 | assert x.shape[1:] == self.shape 127 | 128 | self._count += x.shape[0] 129 | self._sum += np.sum(x, axis=0) 130 | self._sumsq += np.sum(np.square(x), axis=0) 131 | 132 | self.mean = self._sum / float(self._count) 133 | self.std = np.sqrt(np.maximum(np.square(self.eps), self._sumsq / float(self._count) - np.square(self.mean))) 134 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/segment_tree.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | 4 | class SegmentTree(object): 5 | def __init__(self, capacity, operation, neutral_element): 6 | """Build a Segment Tree data structure. 7 | 8 | https://en.wikipedia.org/wiki/Segment_tree 9 | 10 | Can be used as a regular array, but with two 11 | important differences: 12 | 13 | a) setting item's value is slightly slower. 14 | It is O(lg capacity) instead of O(1). 15 | b) user has access to an efficient ( O(log segment size) ) 16 | `reduce` operation which reduces `operation` over 17 | a contiguous subsequence of items in the array. 18 | 19 | Parameters 20 | ---------- 21 | capacity: int 22 | Total size of the array - must be a power of two. 23 | operation: lambda obj, obj -> obj 24 | an operation for combining elements (eg. 
sum, max) 25 | must form a mathematical group together with the set of 26 | possible values for array elements (i.e. be associative) 27 | neutral_element: obj 28 | neutral element for the operation above. eg. float('-inf') 29 | for max and 0 for sum. 30 | """ 31 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 32 | self._capacity = capacity 33 | self._value = [neutral_element for _ in range(2 * capacity)] 34 | self._operation = operation 35 | 36 | def _reduce_helper(self, start, end, node, node_start, node_end): 37 | if start == node_start and end == node_end: 38 | return self._value[node] 39 | mid = (node_start + node_end) // 2 40 | if end <= mid: 41 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 42 | else: 43 | if mid + 1 <= start: 44 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 45 | else: 46 | return self._operation( 47 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 48 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 49 | ) 50 | 51 | def reduce(self, start=0, end=None): 52 | """Returns result of applying `self.operation` 53 | to a contiguous subsequence of the array. 54 | 55 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 56 | 57 | Parameters 58 | ---------- 59 | start: int 60 | beginning of the subsequence 61 | end: int 62 | end of the subsequence 63 | 64 | Returns 65 | ------- 66 | reduced: obj 67 | result of reducing self.operation over the specified range of array elements. 68 | """ 69 | if end is None: 70 | end = self._capacity 71 | if end < 0: 72 | end += self._capacity 73 | end -= 1 74 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 75 | 76 | def __setitem__(self, idx, val): 77 | # index of the leaf 78 | idx += self._capacity 79 | self._value[idx] = val 80 | idx //= 2 81 | while idx >= 1: 82 | self._value[idx] = self._operation( 83 | self._value[2 * idx], 84 | self._value[2 * idx + 1] 85 | ) 86 | idx //= 2 87 | 88 | def __getitem__(self, idx): 89 | assert 0 <= idx < self._capacity 90 | return self._value[self._capacity + idx] 91 | 92 | 93 | class SumSegmentTree(SegmentTree): 94 | def __init__(self, capacity): 95 | super(SumSegmentTree, self).__init__( 96 | capacity=capacity, 97 | operation=operator.add, 98 | neutral_element=0.0 99 | ) 100 | 101 | def sum(self, start=0, end=None): 102 | """Returns arr[start] + ... + arr[end]""" 103 | return super(SumSegmentTree, self).reduce(start, end) 104 | 105 | def find_prefixsum_idx(self, prefixsum): 106 | """Find the highest index `i` in the array such that 107 | sum(arr[0] + arr[1] + ... + arr[i - 1]) <= prefixsum 108 | 109 | if array values are probabilities, this function 110 | allows sampling indexes according to the discrete 111 | probability efficiently. 
112 | 113 | Parameters 114 | ---------- 115 | perfixsum: float 116 | upperbound on the sum of array prefix 117 | 118 | Returns 119 | ------- 120 | idx: int 121 | highest index satisfying the prefixsum constraint 122 | """ 123 | assert 0 <= prefixsum <= self.sum() + 1e-5 124 | idx = 1 125 | while idx < self._capacity: # while non-leaf 126 | if self._value[2 * idx] > prefixsum: 127 | idx = 2 * idx 128 | else: 129 | prefixsum -= self._value[2 * idx] 130 | idx = 2 * idx + 1 131 | return idx - self._capacity 132 | 133 | 134 | class MinSegmentTree(SegmentTree): 135 | def __init__(self, capacity): 136 | super(MinSegmentTree, self).__init__( 137 | capacity=capacity, 138 | operation=min, 139 | neutral_element=float('inf') 140 | ) 141 | 142 | def min(self, start=0, end=None): 143 | """Returns min(arr[start], ..., arr[end])""" 144 | 145 | return super(MinSegmentTree, self).reduce(start, end) 146 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/cmd_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for scripts like run_atari.py. 3 | """ 4 | 5 | import os 6 | try: 7 | from mpi4py import MPI 8 | except ImportError: 9 | MPI = None 10 | 11 | import gym 12 | from gym.wrappers import FlattenDictWrapper 13 | from baselines import logger 14 | from baselines.bench import Monitor 15 | from baselines.common import set_global_seeds 16 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 17 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 18 | 19 | def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0): 20 | """ 21 | Create a wrapped, monitored SubprocVecEnv for Atari. 22 | """ 23 | if wrapper_kwargs is None: wrapper_kwargs = {} 24 | mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 25 | def make_env(rank): # pylint: disable=C0111 26 | def _thunk(): 27 | env = make_atari(env_id) 28 | env.seed(seed + 10000*mpi_rank + rank if seed is not None else None) 29 | env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank))) 30 | return wrap_deepmind(env, **wrapper_kwargs) 31 | return _thunk 32 | set_global_seeds(seed) 33 | return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) 34 | 35 | def make_mujoco_env(env_id, seed, reward_scale=1.0): 36 | """ 37 | Create a wrapped, monitored gym.Env for MuJoCo. 38 | """ 39 | rank = MPI.COMM_WORLD.Get_rank() 40 | myseed = seed + 1000 * rank if seed is not None else None 41 | set_global_seeds(myseed) 42 | env = gym.make(env_id) 43 | env = Monitor(env, os.path.join(logger.get_dir(), str(rank)), allow_early_resets=True) 44 | env.seed(seed) 45 | 46 | if reward_scale != 1.0: 47 | from baselines.common.retro_wrappers import RewardScaler 48 | env = RewardScaler(env, reward_scale) 49 | 50 | return env 51 | 52 | def make_robotics_env(env_id, seed, rank=0): 53 | """ 54 | Create a wrapped, monitored gym.Env for MuJoCo. 55 | """ 56 | set_global_seeds(seed) 57 | env = gym.make(env_id) 58 | env = FlattenDictWrapper(env, ['observation', 'desired_goal']) 59 | env = Monitor( 60 | env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 61 | info_keywords=('is_success',)) 62 | env.seed(seed) 63 | return env 64 | 65 | def arg_parser(): 66 | """ 67 | Create an empty argparse.ArgumentParser. 
68 | """ 69 | import argparse 70 | return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 71 | 72 | def atari_arg_parser(): 73 | """ 74 | Create an argparse.ArgumentParser for run_atari.py. 75 | """ 76 | print('Obsolete - use common_arg_parser instead') 77 | return common_arg_parser() 78 | 79 | def mujoco_arg_parser(): 80 | print('Obsolete - use common_arg_parser instead') 81 | return common_arg_parser() 82 | 83 | def str2bool(v): 84 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 85 | return True 86 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 87 | return False 88 | else: 89 | raise argparse.ArgumentTypeError('Boolean value expected.') 90 | 91 | def common_arg_parser(): 92 | """ 93 | Create an argparse.ArgumentParser for run_mujoco.py. 94 | """ 95 | parser = arg_parser() 96 | parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') 97 | parser.add_argument('--seed', help='RNG seed', type=int, default=2019) 98 | parser.add_argument('--alg', help='Algorithm', type=str, default='ppo2') 99 | parser.add_argument('--num_timesteps', type=float, default=1e6), 100 | parser.add_argument('--weight', help='weight of noise', type=float, default=0.1) 101 | parser.add_argument('--normal', help='no noise', type=str2bool, default=True) 102 | parser.add_argument('--surrogate', help='surrogate reward', type=str2bool, default=False) 103 | parser.add_argument('--noise_type', help='noise type (norm_one, norm_all, max_one, anti_iden)', type=str, default='norm_one') 104 | parser.add_argument('--network', help='network type (mlp, cnn, lstm, cnn_lstm, conv_only)', default=None) 105 | parser.add_argument('--gamestate', help='game state to load (so far only used in retro games)', default=None) 106 | parser.add_argument('--num_env', help='Number of environment copies being run in parallel. When not specified, set to number of cpus for Atari, and to 1 for Mujoco', default=None, type=int) 107 | parser.add_argument('--reward_scale', help='Reward scale factor. Default: 1.0', default=1.0, type=float) 108 | parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str) 109 | parser.add_argument('--play', default=False, action='store_true') 110 | return parser 111 | 112 | def robotics_arg_parser(): 113 | """ 114 | Create an argparse.ArgumentParser for run_mujoco.py. 
115 | """ 116 | parser = arg_parser() 117 | parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0') 118 | parser.add_argument('--seed', help='RNG seed', type=int, default=None) 119 | parser.add_argument('--num-timesteps', type=int, default=int(1e6)) 120 | return parser 121 | 122 | 123 | def parse_unknown_args(args): 124 | """ 125 | Parse arguments not consumed by arg parser into a dicitonary 126 | """ 127 | retval = {} 128 | for arg in args: 129 | assert arg.startswith('--') 130 | assert '=' in arg, 'cannot parse arg {}'.format(arg) 131 | key = arg.split('=')[0][2:] 132 | value = arg.split('=')[1] 133 | retval[key] = value 134 | 135 | return retval 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/bench/benchmarks.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os.path as osp 3 | import os 4 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | 6 | _atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders'] 7 | _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture'] 8 | 9 | _BENCHMARKS = [] 10 | 11 | remove_version_re = re.compile(r'-v\d+$') 12 | 13 | 14 | def register_benchmark(benchmark): 15 | for b in _BENCHMARKS: 16 | if b['name'] == benchmark['name']: 17 | raise ValueError('Benchmark with name %s already registered!' % b['name']) 18 | 19 | # automatically add a description if it is not present 20 | if 'tasks' in benchmark: 21 | for t in benchmark['tasks']: 22 | if 'desc' not in t: 23 | t['desc'] = remove_version_re.sub('', t['env_id']) 24 | _BENCHMARKS.append(benchmark) 25 | 26 | 27 | def list_benchmarks(): 28 | return [b['name'] for b in _BENCHMARKS] 29 | 30 | 31 | def get_benchmark(benchmark_name): 32 | for b in _BENCHMARKS: 33 | if b['name'] == benchmark_name: 34 | return b 35 | raise ValueError('%s not found! Known benchmarks: %s' % (benchmark_name, list_benchmarks())) 36 | 37 | 38 | def get_task(benchmark, env_id): 39 | """Get a task by env_id. Return None if the benchmark doesn't have the env""" 40 | return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None) 41 | 42 | 43 | def find_task_for_env_id_in_any_benchmark(env_id): 44 | for bm in _BENCHMARKS: 45 | for task in bm["tasks"]: 46 | if task["env_id"] == env_id: 47 | return bm, task 48 | return None, None 49 | 50 | 51 | _ATARI_SUFFIX = 'NoFrameskip-v4' 52 | 53 | register_benchmark({ 54 | 'name': 'Atari50M', 55 | 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps', 56 | 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7] 57 | }) 58 | 59 | register_benchmark({ 60 | 'name': 'Atari10M', 61 | 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps', 62 | 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 6, 'num_timesteps': int(10e6)} for _game in _atari7] 63 | }) 64 | 65 | register_benchmark({ 66 | 'name': 'Atari1Hr', 67 | 'description': '7 Atari games from Mnih et al. 
(2013), with pixel observations, 1 hour of walltime', 68 | 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7] 69 | }) 70 | 71 | register_benchmark({ 72 | 'name': 'AtariExploration10M', 73 | 'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps', 74 | 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7] 75 | }) 76 | 77 | 78 | # MuJoCo 79 | 80 | _mujocosmall = [ 81 | 'InvertedDoublePendulum-v2', 'InvertedPendulum-v2', 82 | 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2', 83 | 'Reacher-v2', 'Swimmer-v2'] 84 | register_benchmark({ 85 | 'name': 'Mujoco1M', 86 | 'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps', 87 | 'tasks': [{'env_id': _envid, 'trials': 6, 'num_timesteps': int(1e6)} for _envid in _mujocosmall] 88 | }) 89 | 90 | register_benchmark({ 91 | 'name': 'MujocoWalkers', 92 | 'description': 'MuJoCo forward walkers, run for 8M, humanoid 100M', 93 | 'tasks': [ 94 | {'env_id': "Hopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, 95 | {'env_id': "Walker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, 96 | {'env_id': "Humanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000}, 97 | ] 98 | }) 99 | 100 | # Roboschool 101 | 102 | register_benchmark({ 103 | 'name': 'Roboschool8M', 104 | 'description': 'Small 2D tasks, up to 30 minutes to complete on 8 cores', 105 | 'tasks': [ 106 | {'env_id': "RoboschoolReacher-v1", 'trials': 4, 'num_timesteps': 2 * 1000000}, 107 | {'env_id': "RoboschoolAnt-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, 108 | {'env_id': "RoboschoolHalfCheetah-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, 109 | {'env_id': "RoboschoolHopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, 110 | {'env_id': "RoboschoolWalker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, 111 | ] 112 | }) 113 | register_benchmark({ 114 | 'name': 'RoboschoolHarder', 115 | 'description': 'Test your might!!! Up to 12 hours on 32 cores', 116 | 'tasks': [ 117 | {'env_id': "RoboschoolHumanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000}, 118 | {'env_id': "RoboschoolHumanoidFlagrun-v1", 'trials': 4, 'num_timesteps': 200 * 1000000}, 119 | {'env_id': "RoboschoolHumanoidFlagrunHarder-v1", 'trials': 4, 'num_timesteps': 400 * 1000000}, 120 | ] 121 | }) 122 | 123 | # Other 124 | 125 | _atari50 = [ # actually 47 126 | 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 127 | 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling', 128 | 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber', 129 | 'DemonAttack', 'DoubleDunk', 'Enduro', 'FishingDerby', 'Freeway', 130 | 'Frostbite', 'Gopher', 'Gravitar', 'IceHockey', 'Jamesbond', 131 | 'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman', 132 | 'NameThisGame', 'Pitfall', 'Pong', 'PrivateEye', 'Qbert', 133 | 'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders', 'StarGunner', 134 | 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', 135 | 'VideoPinball', 'WizardOfWor', 'Zaxxon', 136 | ] 137 | 138 | register_benchmark({ 139 | 'name': 'Atari50_10M', 140 | 'description': '47 Atari games from Mnih et al. 
(2013), with pixel observations, 10M timesteps', 141 | 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50] 142 | }) 143 | 144 | # HER DDPG 145 | 146 | register_benchmark({ 147 | 'name': 'HerDdpg', 148 | 'description': 'Smoke-test only benchmark of HER', 149 | 'tasks': [{'trials': 1, 'env_id': 'FetchReach-v1'}] 150 | }) 151 | 152 | -------------------------------------------------------------------------------- /gym-control/scripts/train-naf.sh: -------------------------------------------------------------------------------- 1 | for log_dir in naf/1 2 | do 3 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.1 --reward noisy --noise_type norm_all --log_dir $log_dir)& 4 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.2 --reward noisy --noise_type norm_all --log_dir $log_dir)& 5 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.3 --reward noisy --noise_type norm_all --log_dir $log_dir)& 6 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.4 --reward noisy --noise_type norm_all --log_dir $log_dir)& 7 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.9 --reward noisy --noise_type norm_all --log_dir $log_dir)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.6 --reward noisy --noise_type norm_all --log_dir $log_dir)& 9 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.7 --reward noisy --noise_type norm_all --log_dir $log_dir)& 10 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.8 --reward noisy --noise_type norm_all --log_dir $log_dir)& 11 | 12 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.1 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 13 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.2 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 14 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.3 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 15 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.4 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 16 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.9 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 17 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.6 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 18 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.7 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 19 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.8 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 20 | 21 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.1 --reward noisy --noise_type norm_one --log_dir $log_dir)& 22 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.2 --reward noisy --noise_type norm_one --log_dir $log_dir)& 23 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.3 --reward noisy --noise_type norm_one --log_dir $log_dir)& 24 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.4 --reward noisy --noise_type norm_one --log_dir $log_dir)& 25 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.9 --reward noisy --noise_type norm_one --log_dir $log_dir)& 26 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 
0.6 --reward noisy --noise_type norm_one --log_dir $log_dir)& 27 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.7 --reward noisy --noise_type norm_one --log_dir $log_dir)& 28 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.8 --reward noisy --noise_type norm_one --log_dir $log_dir)& 29 | 30 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.1 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 31 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.2 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 32 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.3 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 33 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.4 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 34 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.9 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 35 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.6 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 36 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.7 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 37 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.8 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 38 | 39 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.1 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 40 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.2 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 41 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.3 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 42 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.4 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 43 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.9 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 44 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.6 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 45 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.7 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 46 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.8 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 47 | 48 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.1 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 49 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.2 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 50 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.3 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 51 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.4 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 52 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.9 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 53 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.6 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 54 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.7 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 55 | (export CUDA_VISIBLE_DEVICES=6 && python 
naf_pendulum2.py --weight 0.8 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 56 | 57 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --log_dir $log_dir)& 58 | done 59 | -------------------------------------------------------------------------------- /gym-control/cem_cartpole.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import pandas 4 | import numpy as np 5 | import os 6 | import gym 7 | 8 | from keras.models import Sequential 9 | from keras.layers import Dense, Activation, Flatten 10 | import tensorflow as tf 11 | 12 | from rl.agents.cem import CEMAgent 13 | from rl.memory import EpisodeParameterMemory 14 | from noise_estimator import CartpoleProcessor, CartpoleSurrogateProcessor 15 | from utils import * 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--error_positive', type=float, default=0.2, 19 | help='Error positive rate [default: 0.2]') 20 | parser.add_argument('--error_negative', type=float, default=0.0, 21 | help='Error negative rate [default: 0.0]') 22 | parser.add_argument('--log_dir', default='logs', 23 | help='Log dir [default: logs]') 24 | parser.add_argument('--reward', default='normal', 25 | help='reward choice: normal/noisy/surrogate [default: normal]') 26 | parser.add_argument('--smooth', type=str2bool, default=False, 27 | help='Add smoothing to rewards [default: False]') 28 | FLAGS = parser.parse_args() 29 | 30 | ERR_P = FLAGS.error_positive 31 | ERR_N = FLAGS.error_negative 32 | REWARD = FLAGS.reward 33 | SMOOTH = FLAGS.smooth 34 | 35 | if REWARD == "normal": 36 | LOG_DIR = os.path.join(FLAGS.log_dir, "cem_cartpole") 37 | else: 38 | LOG_DIR = os.path.join(os.path.join(FLAGS.log_dir, "cem_cartpole"), str(ERR_P)) 39 | ENV_NAME = 'CartPole-v0' 40 | 41 | if not os.path.exists(LOG_DIR): 42 | os.makedirs(LOG_DIR) 43 | os.system('cp cem_cartpole.py %s' % (LOG_DIR)) # bkp of train procedure 44 | LOG_FOUT = open(os.path.join(LOG_DIR, 'setting.txt'), 'w') 45 | LOG_FOUT.write(str(FLAGS)+'\n') 46 | 47 | def train(): 48 | # Get the environment and extract the number of actions. 49 | env = gym.make(ENV_NAME) 50 | np.random.seed(123) 51 | env.seed(123) 52 | 53 | nb_actions = env.action_space.n 54 | obs_dim = env.observation_space.shape[0] 55 | 56 | config = tf.ConfigProto() 57 | config.gpu_options.allow_growth = True 58 | sess = tf.Session(config=config) 59 | from keras import backend as K 60 | K.set_session(sess) 61 | 62 | # Option 1 : Simple model 63 | # model = Sequential() 64 | # model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 65 | # model.add(Dense(nb_actions)) 66 | # model.add(Activation('softmax')) 67 | 68 | # Option 2: deep network 69 | model = Sequential() 70 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 71 | model.add(Dense(16)) 72 | model.add(Activation('relu')) 73 | model.add(Dense(16)) 74 | model.add(Activation('relu')) 75 | model.add(Dense(16)) 76 | model.add(Activation('relu')) 77 | model.add(Dense(nb_actions)) 78 | model.add(Activation('softmax')) 79 | 80 | model.summary() 81 | 82 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 83 | # even the metrics! 
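    # Descriptive note (not in the original file): EpisodeParameterMemory keeps one
    # entry per episode -- the flattened network weights that produced it and the
    # total reward it earned. Roughly, every `train_interval` episodes CEMAgent
    # refits its Gaussian sampling distribution to the top `elite_frac` of the last
    # `batch_size` episodes (here the best ~3 of 50); see keras-rl's CEMAgent for
    # the exact update rule.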
84 | memory = EpisodeParameterMemory(limit=1000, window_length=1) 85 | 86 | if REWARD == "normal": 87 | cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, 88 | batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05) 89 | cem.compile() 90 | history_normal = cem.fit(env, nb_steps=100000, visualize=False, verbose=2) 91 | cem.save_weights(os.path.join(LOG_DIR, 'cem_normal_{}_params.h5f'.format(ENV_NAME)), overwrite=True) 92 | cem.test(env, nb_episodes=5, visualize=False) 93 | 94 | pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv")) 95 | 96 | elif REWARD == "noisy": 97 | if not SMOOTH: 98 | processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=False) 99 | else: 100 | processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False) 101 | 102 | # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=False) 103 | cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, 104 | batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05, 105 | processor=processor_noisy) 106 | cem.compile() 107 | history_noisy = cem.fit(env, nb_steps=100000, visualize=False, verbose=2) 108 | if not SMOOTH: 109 | cem.save_weights(os.path.join(LOG_DIR, 'cem_noisy_{}_params.h5f'.format(ENV_NAME)), overwrite=True) 110 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv")) 111 | 112 | else: 113 | cem.save_weights(os.path.join(LOG_DIR, 'cem_noisy_smooth_{}_params.h5f'.format(ENV_NAME)), overwrite=True) 114 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv")) 115 | 116 | cem.test(env, nb_episodes=5, visualize=False) 117 | 118 | elif REWARD == "surrogate": 119 | if not SMOOTH: 120 | processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True) 121 | else: 122 | processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True) 123 | 124 | # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=True) 125 | cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, 126 | batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05, 127 | processor=processor_surrogate) 128 | cem.compile() 129 | history_surrogate = cem.fit(env, nb_steps=100000, visualize=False, verbose=2) 130 | if not SMOOTH: 131 | cem.save_weights(os.path.join(LOG_DIR, 'cem_surrogate_{}_params.h5f'.format(ENV_NAME)), overwrite=True) 132 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv")) 133 | else: 134 | cem.save_weights(os.path.join(LOG_DIR, 'cem_surrogate_smooth_{}_params.h5f'.format(ENV_NAME)), overwrite=True) 135 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv")) 136 | 137 | cem.test(env, nb_episodes=5, visualize=False) 138 | 139 | else: 140 | raise NotImplementedError 141 | 142 | 143 | if __name__ == "__main__": 144 | train() 145 | -------------------------------------------------------------------------------- /gym-control/sarsa_cartpole.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import pandas 4 | import numpy as np 5 | import os 6 | import gym 7 | 8 | from keras.layers import Activation, Dense, Flatten 9 | from keras.models import Sequential 10 | from keras.optimizers import Adam 11 | import tensorflow as tf 12 | 13 | from rl.agents import SARSAAgent 14 | from rl.core 
import Processor 15 | from rl.policy import BoltzmannQPolicy 16 | from noise_estimator import * 17 | from utils import * 18 | 19 | 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--error_positive', type=float, default=0.2, 22 | help='Error positive rate [default: 0.2]') 23 | parser.add_argument('--error_negative', type=float, default=0.0, 24 | help='Error negative rate [default: 0.0]') 25 | parser.add_argument('--log_dir', default='logs', 26 | help='Log dir [default: logs]') 27 | parser.add_argument('--reward', default='normal', 28 | help='reward choice: normal/noisy/surrogate [default: normal]') 29 | parser.add_argument('--smooth', type=str2bool, default=False, 30 | help='Add smoothing to rewards [default: False]') 31 | FLAGS = parser.parse_args() 32 | 33 | ERR_P = FLAGS.error_positive 34 | ERR_N = FLAGS.error_negative 35 | REWARD = FLAGS.reward 36 | SMOOTH = FLAGS.smooth 37 | 38 | if REWARD == "normal": 39 | LOG_DIR = os.path.join(FLAGS.log_dir, "sarsa_cartpole") 40 | else: 41 | LOG_DIR = os.path.join(os.path.join(FLAGS.log_dir, "sarsa_cartpole"), str(ERR_P)) 42 | ENV_NAME = 'CartPole-v0' 43 | 44 | if not os.path.exists(LOG_DIR): 45 | os.makedirs(LOG_DIR) 46 | os.system('cp sarsa_cartpole.py %s' % (LOG_DIR)) # bkp of train procedure 47 | print ('cp sarsa_cartpole.py %s' % (LOG_DIR)) 48 | LOG_FOUT = open(os.path.join(LOG_DIR, 'setting.txt'), 'w') 49 | LOG_FOUT.write(str(FLAGS)+'\n') 50 | 51 | 52 | def log_string(out_str): 53 | LOG_FOUT.write(out_str+'\n') 54 | LOG_FOUT.flush() 55 | print(out_str) 56 | 57 | def build_state(features): 58 | return int("".join(map(lambda feature: str(int(feature)), features))) 59 | 60 | def to_bin(value, bins): 61 | return np.digitize(x=[value], bins=bins)[0] 62 | 63 | 64 | def train(): 65 | # Get the environment and extract the number of actions. 66 | env = gym.make(ENV_NAME) 67 | np.random.seed(123) 68 | env.seed(123) 69 | nb_actions = env.action_space.n 70 | 71 | config = tf.ConfigProto() 72 | config.gpu_options.allow_growth = True 73 | sess = tf.Session(config=config) 74 | from keras import backend as K 75 | K.set_session(sess) 76 | 77 | # Next, we build a very simple model. 78 | model = Sequential() 79 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 80 | model.add(Dense(16)) 81 | model.add(Activation('relu')) 82 | model.add(Dense(16)) 83 | model.add(Activation('relu')) 84 | model.add(Dense(16)) 85 | model.add(Activation('relu')) 86 | model.add(Dense(nb_actions)) 87 | model.add(Activation('linear')) 88 | print(model.summary()) 89 | 90 | # SARSA does not require a memory. 
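    # Descriptive note (not in the original file): SARSA is on-policy, so each
    # update uses only the latest (s, a, r, s', a') transition generated by the
    # current policy and no replay buffer is needed. BoltzmannQPolicy samples
    # actions with probability proportional to exp(Q(s, a) / tau) rather than
    # acting epsilon-greedily.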
91 | policy = BoltzmannQPolicy() 92 | # processor_noisy = CartpoleSurrogateProcessor(e_= ERR_N, e=ERR_P, surrogate=False) 93 | # processor_surrogate = CartpoleSurrogateProcessor(e_= ERR_N, e=ERR_P, surrogate=True) 94 | if not SMOOTH: 95 | processor_noisy = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=False, surrogate=False) 96 | processor_surrogate = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=False, surrogate=True) 97 | else: 98 | processor_noisy = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=True, surrogate=False) 99 | processor_surrogate = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=True, surrogate=True) 100 | 101 | if REWARD == "normal": 102 | sarsa_normal = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, 103 | policy=policy) 104 | sarsa_normal.compile(Adam(lr=1e-3), metrics=['mae']) 105 | history_normal = sarsa_normal.fit(env, nb_steps=50000, visualize=False, verbose=2) 106 | sarsa_normal.save_weights(os.path.join(LOG_DIR, 'sarsa_normal_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 107 | sarsa_normal.test(env, nb_episodes=10, visualize=False, verbose=2) 108 | 109 | pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv")) 110 | 111 | 112 | elif REWARD == "noisy": 113 | sarsa_noisy = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, 114 | policy=policy, processor=processor_noisy) 115 | sarsa_noisy.compile(Adam(lr=1e-3), metrics=['mae']) 116 | history_noisy = sarsa_noisy.fit(env, nb_steps=50000, visualize=False, verbose=2) 117 | if not SMOOTH: 118 | sarsa_noisy.save_weights(os.path.join(LOG_DIR, 'sarsa_noisy_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 119 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv")) 120 | else: 121 | sarsa_noisy.save_weights(os.path.join(LOG_DIR, 'sarsa_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 122 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv")) 123 | 124 | sarsa_noisy.test(env, nb_episodes=10, visualize=False) 125 | 126 | 127 | elif REWARD == "surrogate": 128 | sarsa_surrogate = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, 129 | policy=policy, processor=processor_surrogate) 130 | sarsa_surrogate.compile(Adam(lr=1e-3), metrics=['mae']) 131 | history_surrogate = sarsa_surrogate.fit(env, nb_steps=50000, visualize=False, verbose=2) 132 | if not SMOOTH: 133 | sarsa_surrogate.save_weights(os.path.join(LOG_DIR, 'sarsa_surrogate_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 134 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv")) 135 | 136 | else: 137 | sarsa_surrogate.save_weights(os.path.join(LOG_DIR, 'sarsa_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 138 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv")) 139 | 140 | sarsa_surrogate.test(env, nb_episodes=10, visualize=False) 141 | 142 | 143 | 144 | if __name__ == "__main__": 145 | train() -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from baselines.a2c import utils 4 | from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch 5 | from baselines.common.mpi_running_mean_std import RunningMeanStd 6 | import tensorflow.contrib.layers as layers 7 | 8 | 9 | def 
nature_cnn(unscaled_images, **conv_kwargs): 10 | """ 11 | CNN from Nature paper. 12 | """ 13 | scaled_images = tf.cast(unscaled_images, tf.float32) / 255. 14 | activ = tf.nn.relu 15 | h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), 16 | **conv_kwargs)) 17 | h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) 18 | h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) 19 | h3 = conv_to_fc(h3) 20 | return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) 21 | 22 | 23 | def mlp(num_layers=2, num_hidden=64, activation=tf.tanh): 24 | """ 25 | Simple fully connected layer policy. Separate stacks of fully-connected layers are used for policy and value function estimation. 26 | More customized fully-connected policies can be obtained by using PolicyWithV class directly. 27 | 28 | Parameters: 29 | ---------- 30 | 31 | num_layers: int number of fully-connected layers (default: 2) 32 | 33 | num_hidden: int size of fully-connected layers (default: 64) 34 | 35 | activation: activation function (default: tf.tanh) 36 | 37 | Returns: 38 | ------- 39 | 40 | function that builds fully connected network with a given input placeholder 41 | """ 42 | def network_fn(X): 43 | h = tf.layers.flatten(X) 44 | for i in range(num_layers): 45 | h = activation(fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2))) 46 | return h, None 47 | 48 | return network_fn 49 | 50 | 51 | def cnn(**conv_kwargs): 52 | def network_fn(X): 53 | return nature_cnn(X, **conv_kwargs), None 54 | return network_fn 55 | 56 | def cnn_small(**conv_kwargs): 57 | def network_fn(X): 58 | h = tf.cast(X, tf.float32) / 255. 59 | 60 | activ = tf.nn.relu 61 | h = activ(conv(h, 'c1', nf=8, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs)) 62 | h = activ(conv(h, 'c2', nf=16, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) 63 | h = conv_to_fc(h) 64 | h = activ(fc(h, 'fc1', nh=128, init_scale=np.sqrt(2))) 65 | return h, None 66 | return network_fn 67 | 68 | 69 | 70 | def lstm(nlstm=128, layer_norm=False): 71 | def network_fn(X, nenv=1): 72 | nbatch = X.shape[0] 73 | nsteps = nbatch // nenv 74 | 75 | h = tf.layers.flatten(X) 76 | 77 | M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) 78 | S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states 79 | 80 | xs = batch_to_seq(h, nenv, nsteps) 81 | ms = batch_to_seq(M, nenv, nsteps) 82 | 83 | if layer_norm: 84 | h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm) 85 | else: 86 | h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm) 87 | 88 | h = seq_to_batch(h5) 89 | initial_state = np.zeros(S.shape.as_list(), dtype=float) 90 | 91 | return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state} 92 | 93 | return network_fn 94 | 95 | 96 | def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs): 97 | def network_fn(X, nenv=1): 98 | nbatch = X.shape[0] 99 | nsteps = nbatch // nenv 100 | 101 | h = nature_cnn(X, **conv_kwargs) 102 | 103 | M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) 104 | S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states 105 | 106 | xs = batch_to_seq(h, nenv, nsteps) 107 | ms = batch_to_seq(M, nenv, nsteps) 108 | 109 | if layer_norm: 110 | h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm) 111 | else: 112 | h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm) 113 | 114 | h = seq_to_batch(h5) 115 | initial_state = np.zeros(S.shape.as_list(), dtype=float) 116 | 117 | return h, {'S':S, 'M':M, 'state':snew, 
'initial_state':initial_state} 118 | 119 | return network_fn 120 | 121 | def cnn_lnlstm(nlstm=128, **conv_kwargs): 122 | return cnn_lstm(nlstm, layer_norm=True, **conv_kwargs) 123 | 124 | 125 | def conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], **conv_kwargs): 126 | ''' 127 | convolutions-only net 128 | 129 | Parameters: 130 | ---------- 131 | 132 | conv: list of triples (filter_number, filter_size, stride) specifying parameters for each layer. 133 | 134 | Returns: 135 | 136 | function that takes tensorflow tensor as input and returns the output of the last convolutional layer 137 | 138 | ''' 139 | 140 | def network_fn(X): 141 | out = tf.cast(X, tf.float32) / 255. 142 | with tf.variable_scope("convnet"): 143 | for num_outputs, kernel_size, stride in convs: 144 | out = layers.convolution2d(out, 145 | num_outputs=num_outputs, 146 | kernel_size=kernel_size, 147 | stride=stride, 148 | activation_fn=tf.nn.relu, 149 | **conv_kwargs) 150 | 151 | return out, None 152 | return network_fn 153 | 154 | def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]): 155 | rms = RunningMeanStd(shape=x.shape[1:]) 156 | norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range)) 157 | return norm_x, rms 158 | 159 | 160 | def get_network_builder(name): 161 | # TODO: replace with reflection? 162 | if name == 'cnn': 163 | return cnn 164 | elif name == 'cnn_small': 165 | return cnn_small 166 | elif name == 'conv_only': 167 | return conv_only 168 | elif name == 'mlp': 169 | return mlp 170 | elif name == 'lstm': 171 | return lstm 172 | elif name == 'cnn_lstm': 173 | return cnn_lstm 174 | elif name == 'cnn_lnlstm': 175 | return cnn_lnlstm 176 | else: 177 | raise ValueError('Unknown network type: {}'.format(name)) 178 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/bench/monitor.py: -------------------------------------------------------------------------------- 1 | __all__ = ['Monitor', 'get_monitor_files', 'load_results'] 2 | 3 | import gym 4 | from gym.core import Wrapper 5 | import time 6 | from glob import glob 7 | import csv 8 | import os.path as osp 9 | import json 10 | import numpy as np 11 | 12 | class Monitor(Wrapper): 13 | EXT = "monitor.csv" 14 | f = None 15 | 16 | def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()): 17 | Wrapper.__init__(self, env=env) 18 | self.tstart = time.time() 19 | if filename is None: 20 | self.f = None 21 | self.logger = None 22 | else: 23 | if not filename.endswith(Monitor.EXT): 24 | if osp.isdir(filename): 25 | filename = osp.join(filename, Monitor.EXT) 26 | else: 27 | filename = filename + "." 
+ Monitor.EXT 28 | self.f = open(filename, "wt") 29 | self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, 'env_id' : env.spec and env.spec.id})) 30 | self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords+info_keywords) 31 | self.logger.writeheader() 32 | self.f.flush() 33 | 34 | self.reset_keywords = reset_keywords 35 | self.info_keywords = info_keywords 36 | self.allow_early_resets = allow_early_resets 37 | self.rewards = None 38 | self.needs_reset = True 39 | self.episode_rewards = [] 40 | self.episode_lengths = [] 41 | self.episode_times = [] 42 | self.total_steps = 0 43 | self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() 44 | 45 | def reset(self, **kwargs): 46 | if not self.allow_early_resets and not self.needs_reset: 47 | raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)") 48 | self.rewards = [] 49 | self.needs_reset = False 50 | for k in self.reset_keywords: 51 | v = kwargs.get(k) 52 | if v is None: 53 | raise ValueError('Expected you to pass kwarg %s into reset'%k) 54 | self.current_reset_info[k] = v 55 | return self.env.reset(**kwargs) 56 | 57 | def step(self, action): 58 | if self.needs_reset: 59 | raise RuntimeError("Tried to step environment that needs reset") 60 | ob, rew, done, info = self.env.step(action) 61 | self.rewards.append(rew) 62 | if done: 63 | self.needs_reset = True 64 | eprew = sum(self.rewards) 65 | eplen = len(self.rewards) 66 | epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)} 67 | for k in self.info_keywords: 68 | epinfo[k] = info[k] 69 | self.episode_rewards.append(eprew) 70 | self.episode_lengths.append(eplen) 71 | self.episode_times.append(time.time() - self.tstart) 72 | epinfo.update(self.current_reset_info) 73 | if self.logger: 74 | self.logger.writerow(epinfo) 75 | self.f.flush() 76 | info['episode'] = epinfo 77 | self.total_steps += 1 78 | return (ob, rew, done, info) 79 | 80 | def close(self): 81 | if self.f is not None: 82 | self.f.close() 83 | 84 | def get_total_steps(self): 85 | return self.total_steps 86 | 87 | def get_episode_rewards(self): 88 | return self.episode_rewards 89 | 90 | def get_episode_lengths(self): 91 | return self.episode_lengths 92 | 93 | def get_episode_times(self): 94 | return self.episode_times 95 | 96 | class LoadMonitorResultsError(Exception): 97 | pass 98 | 99 | def get_monitor_files(dir): 100 | return glob(osp.join(dir, "*" + Monitor.EXT)) 101 | 102 | def load_results(dir): 103 | import pandas 104 | monitor_files = ( 105 | glob(osp.join(dir, "*monitor.json")) + 106 | glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files 107 | if not monitor_files: 108 | raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir)) 109 | dfs = [] 110 | headers = [] 111 | for fname in monitor_files: 112 | with open(fname, 'rt') as fh: 113 | if fname.endswith('csv'): 114 | firstline = fh.readline() 115 | if not firstline: 116 | continue 117 | assert firstline[0] == '#' 118 | header = json.loads(firstline[1:]) 119 | df = pandas.read_csv(fh, index_col=None) 120 | headers.append(header) 121 | elif fname.endswith('json'): # Deprecated json format 122 | episodes = [] 123 | lines = fh.readlines() 124 | header = json.loads(lines[0]) 125 | headers.append(header) 126 | for line in lines[1:]: 127 | episode = json.loads(line) 128 | episodes.append(episode) 129 | df = 
pandas.DataFrame(episodes) 130 | else: 131 | assert 0, 'unreachable' 132 | df['t'] += header['t_start'] 133 | dfs.append(df) 134 | df = pandas.concat(dfs) 135 | df.sort_values('t', inplace=True) 136 | df.reset_index(inplace=True) 137 | df['t'] -= min(header['t_start'] for header in headers) 138 | df.headers = headers # HACK to preserve backwards compatibility 139 | return df 140 | 141 | def test_monitor(): 142 | env = gym.make("CartPole-v1") 143 | env.seed(0) 144 | mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4() 145 | menv = Monitor(env, mon_file) 146 | menv.reset() 147 | for _ in range(1000): 148 | _, _, done, _ = menv.step(0) 149 | if done: 150 | menv.reset() 151 | 152 | f = open(mon_file, 'rt') 153 | 154 | firstline = f.readline() 155 | assert firstline.startswith('#') 156 | metadata = json.loads(firstline[1:]) 157 | assert metadata['env_id'] == "CartPole-v1" 158 | assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata" 159 | 160 | last_logline = pandas.read_csv(f, index_col=None) 161 | assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" 162 | f.close() 163 | os.remove(mon_file) 164 | -------------------------------------------------------------------------------- /gym-control/dqn_cartpole.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import pandas 4 | import numpy as np 5 | import os 6 | import gym 7 | 8 | from keras.layers import Activation, Dense, Flatten 9 | from keras.models import Sequential 10 | from keras.optimizers import Adam 11 | import tensorflow as tf 12 | 13 | from rl.agents.dqn import DQNAgent 14 | from rl.core import Processor 15 | from rl.memory import SequentialMemory 16 | from rl.policy import BoltzmannQPolicy 17 | from noise_estimator import * 18 | from utils import * 19 | 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--error_positive', type=float, default=0.2, 23 | help='Error positive rate [default: 0.2]') 24 | parser.add_argument('--error_negative', type=float, default=0.0, 25 | help='Error negative rate [default: 0.0]') 26 | parser.add_argument('--log_dir', default='logs', 27 | help='Log dir [default: logs]') 28 | parser.add_argument('--reward', default='normal', 29 | help='Reward choice: normal/noisy/surrogate [default: normal]') 30 | parser.add_argument('--smooth', type=str2bool, default=False, 31 | help='Add smoothing to rewards [default: False]') 32 | FLAGS = parser.parse_args() 33 | 34 | ERR_P = FLAGS.error_positive 35 | ERR_N = FLAGS.error_negative 36 | REWARD = FLAGS.reward 37 | SMOOTH = FLAGS.smooth 38 | 39 | if REWARD == "normal": 40 | LOG_DIR = os.path.join(FLAGS.log_dir, "dqn_cartpole") 41 | else: 42 | LOG_DIR = os.path.join(os.path.join(FLAGS.log_dir, "dqn_cartpole"), str(ERR_P)) 43 | ENV_NAME = 'CartPole-v0' 44 | 45 | if not os.path.exists(LOG_DIR): 46 | os.makedirs(LOG_DIR) 47 | os.system('cp dqn_cartpole.py %s' % (LOG_DIR)) # bkp of train procedure 48 | LOG_FOUT = open(os.path.join(LOG_DIR, 'setting.txt'), 'w') 49 | LOG_FOUT.write(str(FLAGS)+'\n') 50 | 51 | 52 | def train(): 53 | # Get the environment and extract the number of actions. 
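    # Descriptive note (not in the original file): CartPole-v0 has a 4-dimensional
    # observation (cart position/velocity, pole angle/angular velocity), two
    # discrete actions, and a reward of +1 per surviving step. The noisy/surrogate
    # processors built below perturb that reward according to the
    # --error_positive / --error_negative rates (see noise_estimator.py).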
54 | env = gym.make(ENV_NAME) 55 | np.random.seed(123) 56 | env.seed(123) 57 | nb_actions = env.action_space.n 58 | 59 | config = tf.ConfigProto() 60 | config.gpu_options.allow_growth = True 61 | sess = tf.Session(config=config) 62 | from keras import backend as K 63 | K.set_session(sess) 64 | 65 | # Next, we build a very simple model. 66 | model = Sequential() 67 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 68 | model.add(Dense(16)) 69 | model.add(Activation('relu')) 70 | model.add(Dense(16)) 71 | model.add(Activation('relu')) 72 | model.add(Dense(16)) 73 | model.add(Activation('relu')) 74 | model.add(Dense(nb_actions)) 75 | model.add(Activation('linear')) 76 | model.summary() 77 | 78 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 79 | # even the metrics! 80 | memory = SequentialMemory(limit=50000, window_length=1) 81 | policy = BoltzmannQPolicy() 82 | 83 | # Okay, now it's time to learn something! We visualize the training here for show, but this 84 | # slows down training quite a lot. You can always safely abort the training prematurely using 85 | # Ctrl + C. 86 | if REWARD == "normal": 87 | dqn_normal = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 88 | target_model_update=1e-2, policy=policy) 89 | dqn_normal.compile(Adam(lr=1e-3), metrics=['mae']) 90 | history_normal = dqn_normal.fit(env, nb_steps=10000, visualize=False, verbose=2) 91 | dqn_normal.save_weights(os.path.join(LOG_DIR, 'dqn_normal_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 92 | dqn_normal.test(env, nb_episodes=10, visualize=False, verbose=2) 93 | 94 | pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv")) 95 | 96 | elif REWARD == "noisy": 97 | if not SMOOTH: 98 | processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, surrogate=False) 99 | else: 100 | processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False) 101 | 102 | # processor_noisy = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=False) 103 | dqn_noisy = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 104 | target_model_update=1e-2, policy=policy, processor=processor_noisy) 105 | dqn_noisy.compile(Adam(lr=1e-3), metrics=['mae']) 106 | history_noisy = dqn_noisy.fit(env, nb_steps=10000, visualize=False, verbose=2) 107 | if not SMOOTH: 108 | dqn_noisy.save_weights(os.path.join(LOG_DIR, 'dqn_noisy_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 109 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv")) 110 | else: 111 | dqn_noisy.save_weights(os.path.join(LOG_DIR, 'dqn_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 112 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv")) 113 | 114 | dqn_noisy.test(env, nb_episodes=10, visualize=False, verbose=2) 115 | 116 | 117 | elif REWARD == "surrogate": 118 | if not SMOOTH: 119 | processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True) 120 | else: 121 | processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True) 122 | 123 | # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=True) 124 | dqn_surrogate = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 125 | target_model_update=1e-2, policy=policy, processor=processor_surrogate) 126 | dqn_surrogate.compile(Adam(lr=1e-3), metrics=['mae']) 127 | history_surrogate = 
dqn_surrogate.fit(env, nb_steps=10000, visualize=False, verbose=2) 128 | if not SMOOTH: 129 | dqn_surrogate.save_weights(os.path.join(LOG_DIR, 'dqn_surrogate_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 130 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv")) 131 | else: 132 | dqn_surrogate.save_weights(os.path.join(LOG_DIR, 'dqn_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 133 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv")) 134 | 135 | dqn_surrogate.test(env, nb_episodes=10, visualize=False, verbose=2) 136 | 137 | else: 138 | raise NotImplementedError 139 | 140 | if __name__ == "__main__": 141 | train() 142 | -------------------------------------------------------------------------------- /gym-control/duel_dqn_cartpole.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import pandas 4 | import numpy as np 5 | import os 6 | import gym 7 | 8 | from keras.layers import Activation, Dense, Flatten 9 | from keras.models import Sequential 10 | from keras.optimizers import Adam 11 | import tensorflow as tf 12 | 13 | from rl.agents.dqn import DQNAgent 14 | from rl.core import Processor 15 | from rl.memory import SequentialMemory 16 | from rl.policy import BoltzmannQPolicy 17 | from noise_estimator import * 18 | from utils import * 19 | 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--error_positive', type=float, default=0.2, 23 | help='Error positive rate [default: 0.2]') 24 | parser.add_argument('--error_negative', type=float, default=0.0, 25 | help='Error negative rate [default: 0.0]') 26 | parser.add_argument('--log_dir', default='logs', 27 | help='Log dir [default: logs]') 28 | parser.add_argument('--reward', default='normal', 29 | help='Reward choice: normal/noisy/surrogate [default: normal]') 30 | parser.add_argument('--smooth', type=str2bool, default=False, 31 | help='Add smoothing to rewards [default: False]') 32 | FLAGS = parser.parse_args() 33 | 34 | ERR_P = FLAGS.error_positive 35 | ERR_N = FLAGS.error_negative 36 | REWARD = FLAGS.reward 37 | SMOOTH = FLAGS.smooth 38 | 39 | if REWARD == "normal": 40 | LOG_DIR = os.path.join(FLAGS.log_dir, "duel_dqn_cartpole") 41 | else: 42 | LOG_DIR = os.path.join(os.path.join(FLAGS.log_dir, "duel_dqn_cartpole"), str(ERR_P)) 43 | ENV_NAME = 'CartPole-v0' 44 | 45 | if not os.path.exists(LOG_DIR): 46 | os.makedirs(LOG_DIR) 47 | os.system('cp duel_dqn_cartpole.py %s' % (LOG_DIR)) # bkp of train procedure 48 | LOG_FOUT = open(os.path.join(LOG_DIR, 'setting.txt'), 'w') 49 | LOG_FOUT.write(str(FLAGS)+'\n') 50 | 51 | 52 | def train(): 53 | # Get the environment and extract the number of actions. 54 | env = gym.make(ENV_NAME) 55 | np.random.seed(123) 56 | env.seed(123) 57 | nb_actions = env.action_space.n 58 | 59 | config = tf.ConfigProto() 60 | config.gpu_options.allow_growth = True 61 | sess = tf.Session(config=config) 62 | from keras import backend as K 63 | K.set_session(sess) 64 | 65 | # Next, we build a very simple model. 
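    # Descriptive note (not in the original file): the network below outputs one
    # Q-value per action. With enable_dueling_network=True, keras-rl internally
    # replaces the output layer with separate state-value and advantage streams
    # and recombines them; dueling_type='avg' corresponds to
    # Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a').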
66 | model = Sequential() 67 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 68 | model.add(Dense(16)) 69 | model.add(Activation('relu')) 70 | model.add(Dense(16)) 71 | model.add(Activation('relu')) 72 | model.add(Dense(16)) 73 | model.add(Activation('relu')) 74 | model.add(Dense(nb_actions)) 75 | model.add(Activation('linear')) 76 | model.summary() 77 | 78 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 79 | # even the metrics! 80 | memory = SequentialMemory(limit=50000, window_length=1) 81 | policy = BoltzmannQPolicy() 82 | 83 | # Okay, now it's time to learn something! We visualize the training here for show, but this 84 | # slows down training quite a lot. You can always safely abort the training prematurely using 85 | # Ctrl + C. 86 | if REWARD == "normal": 87 | dqn_normal = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 88 | enable_dueling_network=True, dueling_type='avg', 89 | target_model_update=1e-2, policy=policy) 90 | dqn_normal.compile(Adam(lr=1e-3), metrics=['mae']) 91 | history_normal = dqn_normal.fit(env, nb_steps=10000, visualize=False, verbose=2) 92 | dqn_normal.save_weights(os.path.join(LOG_DIR, 'duel_dqn_normal_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 93 | dqn_normal.test(env, nb_episodes=10, visualize=False, verbose=2) 94 | 95 | pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv")) 96 | 97 | elif REWARD == "noisy": 98 | if not SMOOTH: 99 | processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=False) 100 | else: 101 | processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False) 102 | 103 | # processor_noisy = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=False) 104 | dqn_noisy = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 105 | enable_dueling_network=True, dueling_type='avg', 106 | target_model_update=1e-2, policy=policy, processor=processor_noisy) 107 | dqn_noisy.compile(Adam(lr=1e-3), metrics=['mae']) 108 | history_noisy = dqn_noisy.fit(env, nb_steps=10000, visualize=False, verbose=2) 109 | if not SMOOTH: 110 | dqn_noisy.save_weights(os.path.join(LOG_DIR, 'duel_dqn_noisy_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 111 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv")) 112 | else: 113 | dqn_noisy.save_weights(os.path.join(LOG_DIR, 'duel_dqn_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 114 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv")) 115 | 116 | dqn_noisy.test(env, nb_episodes=10, visualize=False, verbose=2) 117 | 118 | 119 | elif REWARD == "surrogate": 120 | if not SMOOTH: 121 | processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True) 122 | else: 123 | processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True) 124 | 125 | # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=True) 126 | dqn_surrogate = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 127 | enable_dueling_network=True, dueling_type='avg', 128 | target_model_update=1e-2, policy=policy, processor=processor_surrogate) 129 | dqn_surrogate.compile(Adam(lr=1e-3), metrics=['mae']) 130 | history_surrogate = dqn_surrogate.fit(env, nb_steps=10000, visualize=False, verbose=2) 131 | if not SMOOTH: 132 | dqn_surrogate.save_weights(os.path.join(LOG_DIR, 
'duel_dqn_surrogate_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 133 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv")) 134 | else: 135 | dqn_surrogate.save_weights(os.path.join(LOG_DIR, 'duel_dqn_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 136 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv")) 137 | 138 | dqn_surrogate.test(env, nb_episodes=10, visualize=False, verbose=2) 139 | 140 | else: 141 | raise NotImplementedError 142 | 143 | if __name__ == "__main__": 144 | train() --------------------------------------------------------------------------------
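The CartPole scripts above all dump their keras-rl training histories as CSV files (normal.csv, noisy.csv, surrogate.csv, and the *_smooth.csv variants) under LOG_DIR. Below is a minimal sketch for comparing the resulting reward curves. It is not part of the repository: it assumes the default log layout of dqn_cartpole.py, a hypothetical error rate of 0.2, and that the history CSVs contain an episode_reward column (the per-episode return recorded by keras-rl's fit()); adjust the paths and column name to match your own runs.

```python
# Minimal sketch: overlay the reward curves written by dqn_cartpole.py.
# normal.csv lives directly under logs/dqn_cartpole/, while noisy.csv and
# surrogate.csv live under logs/dqn_cartpole/<error_positive>/ (see LOG_DIR above).
import os
import pandas as pd
import matplotlib.pyplot as plt

LOG_ROOT = 'logs/dqn_cartpole'   # hypothetical path, matching the script defaults
ERR_P = 0.2                      # the --error_positive used for noisy/surrogate runs

curves = {
    'normal': os.path.join(LOG_ROOT, 'normal.csv'),
    'noisy': os.path.join(LOG_ROOT, str(ERR_P), 'noisy.csv'),
    'surrogate': os.path.join(LOG_ROOT, str(ERR_P), 'surrogate.csv'),
}

for label, path in curves.items():
    if not os.path.exists(path):
        continue  # skip configurations that have not been trained yet
    df = pd.read_csv(path)
    # smooth the per-episode returns with a rolling mean for readability
    plt.plot(df['episode_reward'].rolling(20, min_periods=1).mean(), label=label)

plt.xlabel('episode')
plt.ylabel('episode reward (rolling mean)')
plt.legend()
plt.savefig('cartpole_reward_curves.png')
```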