├── gym-control ├── rl │ ├── __init__.py │ ├── agents │ │ └── __init__.py │ ├── random.py │ ├── processors.py │ └── util.py ├── utils.py ├── README.md ├── collect.py ├── scripts │ ├── train-qlearn.sh │ ├── train-cem.sh │ ├── train-dqn.sh │ ├── train-sarsa.sh │ ├── train-duel-dqn.sh │ └── train-naf.sh ├── cem_cartpole.py ├── sarsa_cartpole.py ├── dqn_cartpole.py └── duel_dqn_cartpole.py ├── gym-atari ├── baselines │ ├── baselines │ │ ├── __init__.py │ │ ├── a2c │ │ │ ├── __init__.py │ │ │ ├── README.md │ │ │ └── runner.py │ │ ├── ppo2 │ │ │ ├── __init__.py │ │ │ ├── README.md │ │ │ └── defaults.py │ │ ├── common │ │ │ ├── tests │ │ │ │ ├── __init__.py │ │ │ │ ├── envs │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── fixed_sequence_env.py │ │ │ │ │ ├── identity_env.py │ │ │ │ │ └── mnist_env.py │ │ │ │ ├── test_schedules.py │ │ │ │ ├── test_cartpole.py │ │ │ │ ├── test_tf_util.py │ │ │ │ ├── test_fixed_sequence.py │ │ │ │ ├── test_mnist.py │ │ │ │ ├── test_identity.py │ │ │ │ ├── util.py │ │ │ │ ├── test_segment_tree.py │ │ │ │ └── test_serialization.py │ │ │ ├── __init__.py │ │ │ ├── runners.py │ │ │ ├── mpi_fork.py │ │ │ ├── identity_env.py │ │ │ ├── tile_images.py │ │ │ ├── cg.py │ │ │ ├── mpi_adam_optimizer.py │ │ │ ├── vec_env │ │ │ │ ├── vec_frame_stack.py │ │ │ │ ├── vec_normalize.py │ │ │ │ ├── dummy_vec_env.py │ │ │ │ ├── __init__.py │ │ │ │ └── subproc_vec_env.py │ │ │ ├── running_stat.py │ │ │ ├── console_util.py │ │ │ ├── input.py │ │ │ ├── mpi_moments.py │ │ │ ├── dataset.py │ │ │ ├── math_util.py │ │ │ ├── mpi_adam.py │ │ │ ├── filters.py │ │ │ ├── mpi_util.py │ │ │ ├── mpi_running_mean_std.py │ │ │ ├── schedules.py │ │ │ ├── segment_tree.py │ │ │ ├── cmd_util.py │ │ │ └── models.py │ │ ├── bench │ │ │ ├── __init__.py │ │ │ ├── benchmarks.py │ │ │ └── monitor.py │ │ ├── results_single.py │ │ └── results_plotter.py │ ├── LICENSE │ └── setup.py ├── .gitignore ├── scripts │ ├── train-normal.sh │ ├── visualize.py │ ├── train-alien.sh │ ├── train-phoenix.sh │ ├── train-carnival.sh │ ├── train-mspacman.sh │ └── train-seaquest.sh └── README.md ├── requirements.txt ├── LICENSE ├── .gitignore └── README.md /gym-control/rl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gym-atari/.gitignore: -------------------------------------------------------------------------------- 1 | logs* 2 | backup* 3 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/a2c/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/ppo2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/envs/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | matplotlib 3 | keras==2.1.0 4 | h5py 5 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from baselines.bench.benchmarks import * 2 | from baselines.bench.monitor import * -------------------------------------------------------------------------------- /gym-control/rl/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from .dqn import DQNAgent, NAFAgent, ContinuousDQNAgent 3 | from .ddpg import DDPGAgent 4 | from .cem import CEMAgent 5 | from .sarsa import SarsaAgent, SARSAAgent 6 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from baselines.common.console_util import * 3 | from baselines.common.dataset import Dataset 4 | from baselines.common.math_util import * 5 | from baselines.common.misc_util import * 6 | -------------------------------------------------------------------------------- /gym-control/utils.py: -------------------------------------------------------------------------------- 1 | def str2bool(v): 2 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 3 | return True 4 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 5 | return False 6 | else: 7 | raise argparse.ArgumentTypeError('Boolean value expected.') -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/a2c/README.md: -------------------------------------------------------------------------------- 1 | # A2C 2 | 3 | - Original paper: https://arxiv.org/abs/1602.01783 4 | - Baselines blog post: https://blog.openai.com/baselines-acktr-a2c/ 5 | - `python -m baselines.a2c.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/ppo2/README.md: -------------------------------------------------------------------------------- 1 | # PPO2 2 | 3 | - Original paper: https://arxiv.org/abs/1707.06347 4 | - Baselines blog post: https://blog.openai.com/openai-baselines-ppo/ 5 | - `python -m baselines.ppo2.run_atari` runs the algorithm for 40M frames = 10M timesteps on an Atari game. See help (`-h`) for more options. 6 | - `python -m baselines.ppo2.run_mujoco` runs the algorithm for 1M frames on a Mujoco environment. 
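- Default hyperparameters for the Atari and MuJoCo configurations are defined in `defaults.py` in this directory.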
7 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/ppo2/defaults.py: -------------------------------------------------------------------------------- 1 | def mujoco(): 2 | return dict( 3 | nsteps=2048, 4 | nminibatches=32, 5 | lam=0.95, 6 | gamma=0.99, 7 | noptepochs=10, 8 | log_interval=1, 9 | ent_coef=0.0, 10 | lr=lambda f: 3e-4 * f, 11 | cliprange=0.2, 12 | value_network='copy' 13 | ) 14 | 15 | def atari(): 16 | return dict( 17 | nsteps=128, nminibatches=4, 18 | lam=0.95, gamma=0.99, noptepochs=4, log_interval=1, 19 | ent_coef=.01, 20 | lr=lambda f : f * 2.5e-4, 21 | cliprange=lambda f : f * 0.1, 22 | ) 23 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/runners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | class AbstractEnvRunner(ABC): 5 | def __init__(self, *, env, model, nsteps): 6 | self.env = env 7 | self.model = model 8 | self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1 9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape 10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) 11 | self.obs[:] = env.reset() 12 | self.nsteps = nsteps 13 | self.states = model.initial_state 14 | self.dones = [False for _ in range(nenv)] 15 | 16 | @abstractmethod 17 | def run(self): 18 | raise NotImplementedError 19 | 20 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/identity_env.py: -------------------------------------------------------------------------------- 1 | from gym import Env 2 | from gym.spaces import Discrete 3 | 4 | 5 | class IdentityEnv(Env): 6 | def __init__( 7 | self, 8 | dim, 9 | ep_length=100, 10 | ): 11 | 12 | self.action_space = Discrete(dim) 13 | self.reset() 14 | 15 | def reset(self): 16 | self._choose_next_state() 17 | self.observation_space = self.action_space 18 | 19 | return self.state 20 | 21 | def step(self, actions): 22 | rew = self._get_reward(actions) 23 | self._choose_next_state() 24 | return self.state, rew, False, {} 25 | 26 | def _choose_next_state(self): 27 | self.state = self.action_space.sample() 28 | 29 | def _get_reward(self, actions): 30 | return 1 if self.state == actions else 0 31 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tile_images.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def 
tile_images(img_nhwc): 4 | """ 5 | Tile N images into one big PxQ image 6 | (P,Q) are chosen to be as close as possible, and if N 7 | is square, then P=Q. 8 | 9 | input: img_nhwc, list or array of images, ndim=4 once turned into array 10 | n = batch index, h = height, w = width, c = channel 11 | returns: 12 | bigim_HWc, ndarray with ndim=3 13 | """ 14 | img_nhwc = np.asarray(img_nhwc) 15 | N, h, w, c = img_nhwc.shape 16 | H = int(np.ceil(np.sqrt(N))) 17 | W = int(np.ceil(float(N)/H)) 18 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) 19 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c) 20 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) 21 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) 22 | return img_Hh_Ww_c 23 | 24 | -------------------------------------------------------------------------------- /gym-control/README.md: -------------------------------------------------------------------------------- 1 | # gym-control 2 | Reinforcement Learning with Perturbed Reward (control games) 3 | 4 | ## Usage 5 | ### Training 6 | ``` 7 | sh scripts/train-qlearn.sh (Cartpole) 8 | sh scripts/train-dqn.sh (Cartpole) 9 | sh scripts/train-ddpg.sh (Pendulum) 10 | ``` 11 | ### Visualizing 12 | ``` 13 | sh scripts/visualize.sh 14 | ``` 15 | ## References 16 | 1. *Q-Learning* Watkins et al., 1989 17 | 2. *Playing Atari with Deep Reinforcement Learning*, Mnih et al., 2013 18 | 3. *Human-level control through deep reinforcement learning*, Mnih et al., 2015 19 | 4. *Reinforcement learning: An introduction*, Sutton and Barto, 2011 20 | 5. *Learning Tetris Using the Noisy Cross-Entropy Method*, Szita et al., 2006 21 | 6. *Deep Reinforcement Learning (MLSS lecture notes)*, Schulman, 2016 22 | 7. *Continuous control with deep reinforcement learning*, Lillicrap et al., 2015 23 | 8. 
*Continuous Deep Q-Learning with Model-based Acceleration*, Gu et al., 2016 24 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9), 80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 wangjksjtu 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /gym-control/collect.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | 5 | 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--log_dir', default='logs/ddpg_pendulum/norm_one', 8 | help='Log dir [default: logs/ddpg_pendulum/norm_one]') 9 | parser.add_argument('--save_dir', default='docs/ddpg_pendulum/norm_one', 10 | help='Path of directory to saved [default: docs/ddpg_pendulum/norm_one]') 11 | FLAGS = parser.parse_args() 12 | 13 | LOG_DIR = FLAGS.log_dir 14 | SAVE_DIR = FLAGS.save_dir 15 | 16 | assert (os.path.exists(LOG_DIR)) 17 | if not os.path.exists(SAVE_DIR): 18 | os.makedirs(SAVE_DIR) 19 | 20 | def collect(): 21 | for j in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8]: 22 | input_dir = os.path.join(LOG_DIR, str(j)) 23 | files = glob.glob(os.path.join(input_dir, "*.png")) 24 | for fin in files: 25 | filename = fin[fin.rindex("/")+1:] 26 | fout = os.path.join(SAVE_DIR, filename) 27 | print "cp '%s' '%s'" % (fin, fout) 28 | os.system("cp '%s' '%s'" % (fin, fout)) 29 | 30 | 31 | if __name__ == "__main__": 32 | collect() 33 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_cartpole.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | 4 | from baselines.run import get_learn_function 5 | from baselines.common.tests.util import reward_per_episode_test 6 | 7 | common_kwargs = dict( 8 | total_timesteps=30000, 9 | network='mlp', 10 | gamma=1.0, 11 | seed=0, 12 | ) 13 | 14 | learn_kwargs = { 15 | 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05), 16 | 'acktr': dict(nsteps=32, value_network='copy'), 17 | 'deepq': {}, 18 | 'ppo2': dict(value_network='copy'), 19 | 'trpo_mpi': {} 20 | } 21 | 22 | @pytest.mark.slow 23 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 24 | def test_cartpole(alg): 25 | ''' 26 | Test if the algorithm (with an mlp policy) 27 | can learn to balance the cartpole 28 | ''' 29 | 30 | kwargs = common_kwargs.copy() 31 | kwargs.update(learn_kwargs[alg]) 32 | 33 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 34 | def env_fn(): 35 | 36 | env = gym.make('CartPole-v0') 37 | env.seed(0) 38 | return env 39 | 40 | reward_per_episode_test(env_fn, learn_fn, 100) 41 | -------------------------------------------------------------------------------- /gym-atari/scripts/train-normal.sh: -------------------------------------------------------------------------------- 1 | cd baselines 2 | 3 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-normal/pong/ppo2_50M_normal --normal=True)& 4 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-normal/carnival/ppo2_50M_normal --normal=True)& 5 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-normal/mspacman/ppo2_50M_normal --normal=True)& 6 | (export CUDA_VISIBLE_DEVICES=3 
&& python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-normal/phoenix/ppo2_50M_normal --normal=True)& 7 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-normal/pong/ppo2_50M_normal --normal=True)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-normal/seaquest/ppo2_50M_normal --normal=True)& 9 | 10 | cd .. 11 | -------------------------------------------------------------------------------- /gym-atari/baselines/LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License 2 | 3 | Copyright (c) 2017 OpenAI (http://openai.com) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_tf_util.py: -------------------------------------------------------------------------------- 1 | # tests for tf_util 2 | import tensorflow as tf 3 | from baselines.common.tf_util import ( 4 | function, 5 | initialize, 6 | single_threaded_session 7 | ) 8 | 9 | 10 | def test_function(): 11 | with tf.Graph().as_default(): 12 | x = tf.placeholder(tf.int32, (), name="x") 13 | y = tf.placeholder(tf.int32, (), name="y") 14 | z = 3 * x + 2 * y 15 | lin = function([x, y], z, givens={y: 0}) 16 | 17 | with single_threaded_session(): 18 | initialize() 19 | 20 | assert lin(2) == 6 21 | assert lin(2, 2) == 10 22 | 23 | 24 | def test_multikwargs(): 25 | with tf.Graph().as_default(): 26 | x = tf.placeholder(tf.int32, (), name="x") 27 | with tf.variable_scope("other"): 28 | x2 = tf.placeholder(tf.int32, (), name="x") 29 | z = 3 * x + 2 * x2 30 | 31 | lin = function([x, x2], z, givens={x2: 0}) 32 | with single_threaded_session(): 33 | initialize() 34 | assert lin(2) == 6 35 | assert lin(2, 2) == 10 36 | 37 | 38 | if __name__ == '__main__': 39 | test_function() 40 | test_multikwargs() 41 | -------------------------------------------------------------------------------- /gym-atari/baselines/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | import sys 3 | 4 | if sys.version_info.major != 3: 5 | print('This Python is only compatible with Python 3, but you are running ' 6 | 'Python {}. 
The installation will likely fail.'.format(sys.version_info.major)) 7 | 8 | 9 | setup(name='baselines', 10 | packages=[package for package in find_packages() 11 | if package.startswith('baselines')], 12 | install_requires=[ 13 | 'gym[atari,classic_control]', 14 | 'scipy', 15 | 'tqdm', 16 | 'joblib', 17 | 'dill', 18 | 'progressbar2', 19 | 'mpi4py', 20 | 'cloudpickle', 21 | 'tensorflow-gpu==1.10.0', 22 | 'click', 23 | 'opencv-python', 24 | ], 25 | extras_require={ 26 | 'test': [ 27 | 'filelock', 28 | 'pytest' 29 | ] 30 | }, 31 | description='OpenAI baselines: high quality implementations of reinforcement learning algorithms', 32 | author='OpenAI', 33 | url='https://github.com/openai/baselines', 34 | author_email='gym@openai.com', 35 | version='0.1.5') 36 | -------------------------------------------------------------------------------- /gym-atari/README.md: -------------------------------------------------------------------------------- 1 | # gym-atari 2 | Reinforcement Learning with Perturbed Reward (Atari Games) 3 | 4 | ## Usage 5 | ### Training 6 | To train models with different noisy or surrogate rewards: 7 | ``` 8 | sh scripts/train-pong.sh (Pong-v4) 9 | sh scripts/train-breakout.sh (Breakout-v4) 10 | ``` 11 | If you want to train the models with specific hyper-parameters by yourself: 12 | ``` 13 | cd baselines 14 | python -m baselines.run --alg= --env= [additional arguments] 15 | ``` 16 | #### Example 1. PPO with Pong 17 | ``` 18 | python -m baselines.run --alg=ppo2 --env=PongNoFrameskip-v4 --num_timesteps=3e7 \ 19 | --save_path=logs-pong/pong/anti_iden/ppo2_30M_noisy_0.1 --weight=0.1 \ 20 | --normal=False --surrogate=False --noise_type=anti_iden 21 | ``` 22 | 23 | ### Visualizing 24 | ``` 25 | cd baselines 26 | python ../scripts/visualize.py --env_name Breakout --log_dir logs-breakout/ --num_timesteps 50000000 --noise_type anti_iden --all True 27 | ``` 28 | To see HELP for the visualizing script: 29 | ``` 30 | python ../scripts/visualize.py -h 31 | ``` 32 | 33 | ## References 34 | 1. 
*Proximal Policy Optimization Algorithms* John Schulman et al., 2017 35 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/envs/fixed_sequence_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import Env 3 | from gym.spaces import Discrete 4 | 5 | 6 | class FixedSequenceEnv(Env): 7 | def __init__( 8 | self, 9 | n_actions=10, 10 | seed=0, 11 | episode_len=100 12 | ): 13 | self.np_random = np.random.RandomState() 14 | self.np_random.seed(seed) 15 | self.sequence = [self.np_random.randint(0, n_actions-1) for _ in range(episode_len)] 16 | 17 | self.action_space = Discrete(n_actions) 18 | self.observation_space = Discrete(1) 19 | 20 | self.episode_len = episode_len 21 | self.time = 0 22 | self.reset() 23 | 24 | def reset(self): 25 | self.time = 0 26 | return 0 27 | 28 | def step(self, actions): 29 | rew = self._get_reward(actions) 30 | self._choose_next_state() 31 | done = False 32 | if self.episode_len and self.time >= self.episode_len: 33 | rew = 0 34 | done = True 35 | 36 | return 0, rew, done, {} 37 | 38 | def _choose_next_state(self): 39 | self.time += 1 40 | 41 | def _get_reward(self, actions): 42 | return 1 if actions == self.sequence[self.time] else 0 43 | 44 | 45 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/mpi_adam_optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mpi4py import MPI 4 | 5 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 6 | """Adam optimizer that averages gradients across mpi processes.""" 7 | def __init__(self, comm, **kwargs): 8 | self.comm = comm 9 | tf.train.AdamOptimizer.__init__(self, **kwargs) 10 | def compute_gradients(self, loss, var_list, **kwargs): 11 | grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs) 12 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 13 | flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) 14 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 15 | sizes = [int(np.prod(s)) for s in shapes] 16 | 17 | num_tasks = self.comm.Get_size() 18 | buf = np.zeros(sum(sizes), np.float32) 19 | 20 | def _collect_grads(flat_grad): 21 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 22 | np.divide(buf, float(num_tasks), out=buf) 23 | return buf 24 | 25 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad], tf.float32) 26 | avg_flat_grad.set_shape(flat_grad.shape) 27 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 28 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 29 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 30 | 31 | return avg_grads_and_vars 32 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/vec_env/vec_frame_stack.py: -------------------------------------------------------------------------------- 1 | from baselines.common.vec_env import VecEnvWrapper 2 | import numpy as np 3 | from gym import spaces 4 | 5 | class VecFrameStack(VecEnvWrapper): 6 | """ 7 | Vectorized environment base class 8 | """ 9 | def __init__(self, venv, nstack): 10 | self.venv = venv 11 | self.nstack = nstack 12 | wos = venv.observation_space # wrapped ob space 13 | low = np.repeat(wos.low, self.nstack, axis=-1) 14 | high = np.repeat(wos.high, self.nstack, axis=-1) 
15 | self.stackedobs = np.zeros((venv.num_envs,)+low.shape, low.dtype) 16 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 17 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 18 | 19 | def step_wait(self): 20 | obs, rews, news, infos = self.venv.step_wait() 21 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 22 | for (i, new) in enumerate(news): 23 | if new: 24 | self.stackedobs[i] = 0 25 | self.stackedobs[..., -obs.shape[-1]:] = obs 26 | return self.stackedobs, rews, news, infos 27 | 28 | def reset(self): 29 | """ 30 | Reset all environments 31 | """ 32 | obs = self.venv.reset() 33 | self.stackedobs[...] = 0 34 | self.stackedobs[..., -obs.shape[-1]:] = obs 35 | return self.stackedobs 36 | 37 | def close(self): 38 | self.venv.close() 39 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/running_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | # http://www.johndcook.com/blog/standard_deviation/ 4 | class RunningStat(object): 5 | def __init__(self, shape): 6 | self._n = 0 7 | self._M = np.zeros(shape) 8 | self._S = np.zeros(shape) 9 | def push(self, x): 10 | x = np.asarray(x) 11 | assert x.shape == self._M.shape 12 | self._n += 1 13 | if self._n == 1: 14 | self._M[...] = x 15 | else: 16 | oldM = self._M.copy() 17 | self._M[...] = oldM + (x - oldM)/self._n 18 | self._S[...] = self._S + (x - oldM)*(x - self._M) 19 | @property 20 | def n(self): 21 | return self._n 22 | @property 23 | def mean(self): 24 | return self._M 25 | @property 26 | def var(self): 27 | return self._S/(self._n - 1) if self._n > 1 else np.square(self._M) 28 | @property 29 | def std(self): 30 | return np.sqrt(self.var) 31 | @property 32 | def shape(self): 33 | return self._M.shape 34 | 35 | def test_running_stat(): 36 | for shp in ((), (3,), (3,4)): 37 | li = [] 38 | rs = RunningStat(shp) 39 | for _ in range(5): 40 | val = np.random.randn(*shp) 41 | rs.push(val) 42 | li.append(val) 43 | m = np.mean(li, axis=0) 44 | assert np.allclose(rs.mean, m) 45 | v = np.square(m) if (len(li) == 1) else np.var(li, ddof=1, axis=0) 46 | assert np.allclose(rs.var, v) 47 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_fixed_sequence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.fixed_sequence_env import FixedSequenceEnv 3 | 4 | from baselines.common.tests.util import simple_test 5 | from baselines.run import get_learn_function 6 | 7 | common_kwargs = dict( 8 | seed=0, 9 | total_timesteps=50000, 10 | ) 11 | 12 | learn_kwargs = { 13 | 'a2c': {}, 14 | 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1), 15 | # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps) 16 | # github issue: https://github.com/openai/baselines/issues/188 17 | # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001) 18 | } 19 | 20 | 21 | alg_list = learn_kwargs.keys() 22 | rnn_list = ['lstm'] 23 | 24 | @pytest.mark.slow 25 | @pytest.mark.parametrize("alg", alg_list) 26 | @pytest.mark.parametrize("rnn", rnn_list) 27 | def test_fixed_sequence(alg, rnn): 28 | ''' 29 | Test if the algorithm (with a given policy) 30 | can learn an identity 
transformation (i.e. return observation as an action) 31 | ''' 32 | 33 | kwargs = learn_kwargs[alg] 34 | kwargs.update(common_kwargs) 35 | 36 | episode_len = 5 37 | env_fn = lambda: FixedSequenceEnv(10, episode_len=episode_len) 38 | learn = lambda e: get_learn_function(alg)( 39 | env=e, 40 | network=rnn, 41 | **kwargs 42 | ) 43 | 44 | simple_test(env_fn, learn, 0.7) 45 | 46 | 47 | if __name__ == '__main__': 48 | test_fixed_sequence('ppo2', 'lstm') 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | def fmt_row(width, row, header=False): 11 | out = " | ".join(fmt_item(x, width) for x in row) 12 | if header: out = out + "\n" + "-"*len(out) 13 | return out 14 | 15 | def fmt_item(x, l): 16 | if isinstance(x, np.ndarray): 17 | assert x.ndim==0 18 | x = x.item() 19 | if isinstance(x, (float, np.float32, np.float64)): 20 | v = abs(x) 21 | if (v < 1e-4 or v > 1e+4) and v > 0: 22 | rep = "%7.2e" % x 23 | else: 24 | rep = "%7.5f" % x 25 | else: rep = str(x) 26 | return " "*(l - len(rep)) + rep 27 | 28 | color2num = dict( 29 | gray=30, 30 | red=31, 31 | green=32, 32 | yellow=33, 33 | blue=34, 34 | magenta=35, 35 | cyan=36, 36 | white=37, 37 | crimson=38 38 | ) 39 | 40 | def colorize(string, color, bold=False, highlight=False): 41 | attr = [] 42 | num = color2num[color] 43 | if highlight: num += 10 44 | attr.append(str(num)) 45 | if bold: attr.append('1') 46 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 47 | 48 | 49 | MESSAGE_DEPTH = 0 50 | 51 | @contextmanager 52 | def timed(msg): 53 | global MESSAGE_DEPTH #pylint: disable=W0603 54 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 55 | tstart = time.time() 56 | MESSAGE_DEPTH += 1 57 | yield 58 | MESSAGE_DEPTH -= 1 59 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 60 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_mnist.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # from baselines.acer import acer_simple as acer 4 | from baselines.common.tests.envs.mnist_env import MnistEnv 5 | from baselines.common.tests.util import simple_test 6 | from baselines.run import get_learn_function 7 | 8 | 9 | # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem? 
10 | # GitHub issue https://github.com/openai/baselines/issues/189 11 | common_kwargs = { 12 | 'seed': 0, 13 | 'network':'cnn', 14 | 'gamma':0.9, 15 | 'pad':'SAME' 16 | } 17 | 18 | learn_args = { 19 | 'a2c': dict(total_timesteps=50000), 20 | # TODO need to resolve inference (step) API differences for acer; also slow 21 | # 'acer': dict(seed=0, total_timesteps=1000), 22 | 'deepq': dict(total_timesteps=5000), 23 | 'acktr': dict(total_timesteps=30000), 24 | 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0), 25 | 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001) 26 | } 27 | 28 | 29 | #tests pass, but are too slow on travis. Same algorithms are covered 30 | # by other tests with less compute-hungry nn's and by benchmarks 31 | @pytest.mark.skip 32 | @pytest.mark.slow 33 | @pytest.mark.parametrize("alg", learn_args.keys()) 34 | def test_mnist(alg): 35 | ''' 36 | Test if the algorithm can learn to classify MNIST digits. 37 | Uses CNN policy. 38 | ''' 39 | 40 | learn_kwargs = learn_args[alg] 41 | learn_kwargs.update(common_kwargs) 42 | 43 | learn = get_learn_function(alg) 44 | learn_fn = lambda e: learn(env=e, **learn_kwargs) 45 | env_fn = lambda: MnistEnv(seed=0, episode_len=100) 46 | 47 | simple_test(env_fn, learn_fn, 0.6) 48 | 49 | if __name__ == '__main__': 50 | test_mnist('deepq') 51 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_identity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from baselines.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv 3 | from baselines.run import get_learn_function 4 | from baselines.common.tests.util import simple_test 5 | 6 | common_kwargs = dict( 7 | total_timesteps=30000, 8 | network='mlp', 9 | gamma=0.9, 10 | seed=0, 11 | ) 12 | 13 | learn_kwargs = { 14 | 'a2c' : {}, 15 | 'acktr': {}, 16 | 'deepq': {}, 17 | 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0), 18 | 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01) 19 | } 20 | 21 | 22 | @pytest.mark.slow 23 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 24 | def test_discrete_identity(alg): 25 | ''' 26 | Test if the algorithm (with an mlp policy) 27 | can learn an identity transformation (i.e. return observation as an action) 28 | ''' 29 | 30 | kwargs = learn_kwargs[alg] 31 | kwargs.update(common_kwargs) 32 | 33 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 34 | env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) 35 | simple_test(env_fn, learn_fn, 0.9) 36 | 37 | @pytest.mark.slow 38 | @pytest.mark.parametrize("alg", ['a2c', 'ppo2', 'trpo_mpi']) 39 | def test_continuous_identity(alg): 40 | ''' 41 | Test if the algorithm (with an mlp policy) 42 | can learn an identity transformation (i.e. 
return observation as an action) 43 | to a required precision 44 | ''' 45 | 46 | kwargs = learn_kwargs[alg] 47 | kwargs.update(common_kwargs) 48 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 49 | 50 | env_fn = lambda: BoxIdentityEnv((1,), episode_len=100) 51 | simple_test(env_fn, learn_fn, -0.1) 52 | 53 | if __name__ == '__main__': 54 | test_continuous_identity('a2c') 55 | 56 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/input.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from gym.spaces import Discrete, Box 3 | 4 | def observation_placeholder(ob_space, batch_size=None, name='Ob'): 5 | ''' 6 | Create placeholder to feed observations into of the size appropriate to the observation space 7 | 8 | Parameters: 9 | ---------- 10 | 11 | ob_space: gym.Space observation space 12 | 13 | batch_size: int size of the batch to be fed into input. Can be left None in most cases. 14 | 15 | name: str name of the placeholder 16 | 17 | Returns: 18 | ------- 19 | 20 | tensorflow placeholder tensor 21 | ''' 22 | 23 | assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box), \ 24 | 'Can only deal with Discrete and Box observation spaces for now' 25 | 26 | return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name) 27 | 28 | 29 | def observation_input(ob_space, batch_size=None, name='Ob'): 30 | ''' 31 | Create placeholder to feed observations into of the size appropriate to the observation space, and add input 32 | encoder of the appropriate type. 33 | ''' 34 | 35 | placeholder = observation_placeholder(ob_space, batch_size, name) 36 | return placeholder, encode_observation(ob_space, placeholder) 37 | 38 | def encode_observation(ob_space, placeholder): 39 | ''' 40 | Encode input in the way that is appropriate to the observation space 41 | 42 | Parameters: 43 | ---------- 44 | 45 | ob_space: gym.Space observation space 46 | 47 | placeholder: tf.placeholder observation input placeholder 48 | ''' 49 | if isinstance(ob_space, Discrete): 50 | return tf.to_float(tf.one_hot(placeholder, ob_space.n)) 51 | 52 | elif isinstance(ob_space, Box): 53 | return tf.to_float(placeholder) 54 | else: 55 | raise NotImplementedError 56 | 57 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/envs/identity_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import abstractmethod 3 | from gym import Env 4 | from gym.spaces import Discrete, Box 5 | 6 | 7 | class IdentityEnv(Env): 8 | def __init__( 9 | self, 10 | episode_len=None 11 | ): 12 | 13 | self.episode_len = episode_len 14 | self.time = 0 15 | self.reset() 16 | 17 | def reset(self): 18 | self._choose_next_state() 19 | self.time = 0 20 | self.observation_space = self.action_space 21 | 22 | return self.state 23 | 24 | def step(self, actions): 25 | rew = self._get_reward(actions) 26 | self._choose_next_state() 27 | done = False 28 | if self.episode_len and self.time >= self.episode_len: 29 | rew = 0 30 | done = True 31 | 32 | return self.state, rew, done, {} 33 | 34 | def _choose_next_state(self): 35 | self.state = self.action_space.sample() 36 | self.time += 1 37 | 38 | @abstractmethod 39 | def _get_reward(self, actions): 40 | raise NotImplementedError 41 | 42 | 43 | class DiscreteIdentityEnv(IdentityEnv): 44 | def __init__( 45 | 
self, 46 | dim, 47 | episode_len=None, 48 | ): 49 | 50 | self.action_space = Discrete(dim) 51 | super().__init__(episode_len=episode_len) 52 | 53 | def _get_reward(self, actions): 54 | return 1 if self.state == actions else 0 55 | 56 | 57 | class BoxIdentityEnv(IdentityEnv): 58 | def __init__( 59 | self, 60 | shape, 61 | episode_len=None, 62 | ): 63 | 64 | self.action_space = Box(low=-1.0, high=1.0, shape=shape) 65 | super().__init__(episode_len=episode_len) 66 | 67 | def _get_reward(self, actions): 68 | diff = actions - self.state 69 | diff = diff[:] 70 | return -0.5 * np.dot(diff, diff) 71 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/vec_env/vec_normalize.py: -------------------------------------------------------------------------------- 1 | from baselines.common.vec_env import VecEnvWrapper 2 | from baselines.common.running_mean_std import RunningMeanStd 3 | import numpy as np 4 | 5 | class VecNormalize(VecEnvWrapper): 6 | """ 7 | Vectorized environment base class 8 | """ 9 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8): 10 | VecEnvWrapper.__init__(self, venv) 11 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None 12 | self.ret_rms = RunningMeanStd(shape=()) if ret else None 13 | #self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='observation_running_mean_std') if ob else None 14 | #self.ret_rms = TfRunningMeanStd(shape=(), scope='return_running_mean_std') if ret else None 15 | self.clipob = clipob 16 | self.cliprew = cliprew 17 | self.ret = np.zeros(self.num_envs) 18 | self.gamma = gamma 19 | self.epsilon = epsilon 20 | 21 | def step_wait(self): 22 | """ 23 | Apply sequence of actions to sequence of environments 24 | actions -> (observations, rewards, news) 25 | 26 | where 'news' is a boolean vector indicating whether each element is new. 
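
        Observations are normalized by a running mean/std estimate and rewards
        by a running std of the discounted returns; both are clipped to
        `clipob` / `cliprew` respectively.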
27 | """ 28 | obs, rews, news, infos = self.venv.step_wait() 29 | self.ret = self.ret * self.gamma + rews 30 | obs = self._obfilt(obs) 31 | if self.ret_rms: 32 | self.ret_rms.update(self.ret) 33 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) 34 | return obs, rews, news, infos 35 | 36 | def _obfilt(self, obs): 37 | if self.ob_rms: 38 | self.ob_rms.update(obs) 39 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 40 | return obs 41 | else: 42 | return obs 43 | 44 | def reset(self): 45 | """ 46 | Reset all environments 47 | """ 48 | obs = self.venv.reset() 49 | return self._obfilt(obs) 50 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from baselines.common import zipsame 4 | 5 | 6 | def mpi_mean(x, axis=0, comm=None, keepdims=False): 7 | x = np.asarray(x) 8 | assert x.ndim > 0 9 | if comm is None: comm = MPI.COMM_WORLD 10 | xsum = x.sum(axis=axis, keepdims=keepdims) 11 | n = xsum.size 12 | localsum = np.zeros(n+1, x.dtype) 13 | localsum[:n] = xsum.ravel() 14 | localsum[n] = x.shape[axis] 15 | globalsum = np.zeros_like(localsum) 16 | comm.Allreduce(localsum, globalsum, op=MPI.SUM) 17 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] 18 | 19 | def mpi_moments(x, axis=0, comm=None, keepdims=False): 20 | x = np.asarray(x) 21 | assert x.ndim > 0 22 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) 23 | sqdiffs = np.square(x - mean) 24 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) 25 | assert count1 == count 26 | std = np.sqrt(meansqdiff) 27 | if not keepdims: 28 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 29 | mean = mean.reshape(newshape) 30 | std = std.reshape(newshape) 31 | return mean, std, count 32 | 33 | 34 | def test_runningmeanstd(): 35 | import subprocess 36 | subprocess.check_call(['mpirun', '-np', '3', 37 | 'python','-c', 38 | 'from baselines.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) 39 | 40 | def _helper_runningmeanstd(): 41 | comm = MPI.COMM_WORLD 42 | np.random.seed(0) 43 | for (triple,axis) in [ 44 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 45 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 46 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 47 | ]: 48 | 49 | 50 | x = np.concatenate(triple, axis=axis) 51 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 52 | 53 | 54 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 55 | 56 | for (a1,a2) in zipsame(ms1, ms2): 57 | print(a1, a2) 58 | assert np.allclose(a1, a2) 59 | print("ok!") 60 | 61 | -------------------------------------------------------------------------------- /gym-control/rl/random.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | import numpy as np 3 | 4 | 5 | class RandomProcess(object): 6 | def reset_states(self): 7 | pass 8 | 9 | 10 | class AnnealedGaussianProcess(RandomProcess): 11 | def __init__(self, mu, sigma, sigma_min, n_steps_annealing): 12 | self.mu = mu 13 | self.sigma = sigma 14 | self.n_steps = 0 15 | 16 | if sigma_min is not None: 17 | self.m = -float(sigma - sigma_min) / float(n_steps_annealing) 18 | self.c 
= sigma 19 | self.sigma_min = sigma_min 20 | else: 21 | self.m = 0. 22 | self.c = sigma 23 | self.sigma_min = sigma 24 | 25 | @property 26 | def current_sigma(self): 27 | sigma = max(self.sigma_min, self.m * float(self.n_steps) + self.c) 28 | return sigma 29 | 30 | 31 | class GaussianWhiteNoiseProcess(AnnealedGaussianProcess): 32 | def __init__(self, mu=0., sigma=1., sigma_min=None, n_steps_annealing=1000, size=1): 33 | super(GaussianWhiteNoiseProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing) 34 | self.size = size 35 | 36 | def sample(self): 37 | sample = np.random.normal(self.mu, self.current_sigma, self.size) 38 | self.n_steps += 1 39 | return sample 40 | 41 | # Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 42 | class OrnsteinUhlenbeckProcess(AnnealedGaussianProcess): 43 | def __init__(self, theta, mu=0., sigma=1., dt=1e-2, size=1, sigma_min=None, n_steps_annealing=1000): 44 | super(OrnsteinUhlenbeckProcess, self).__init__(mu=mu, sigma=sigma, sigma_min=sigma_min, n_steps_annealing=n_steps_annealing) 45 | self.theta = theta 46 | self.mu = mu 47 | self.dt = dt 48 | self.size = size 49 | self.reset_states() 50 | 51 | def sample(self): 52 | x = self.x_prev + self.theta * (self.mu - self.x_prev) * self.dt + self.current_sigma * np.sqrt(self.dt) * np.random.normal(size=self.size) 53 | self.x_prev = x 54 | self.n_steps += 1 55 | return x 56 | 57 | def reset_states(self): 58 | self.x_prev = np.random.normal(self.mu,self.current_sigma,self.size) 59 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = 
np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/envs/mnist_env.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import numpy as np 3 | import tempfile 4 | import filelock 5 | from gym import Env 6 | from gym.spaces import Discrete, Box 7 | 8 | 9 | 10 | class MnistEnv(Env): 11 | def __init__( 12 | self, 13 | seed=0, 14 | episode_len=None, 15 | no_images=None 16 | ): 17 | from tensorflow.examples.tutorials.mnist import input_data 18 | # we could use temporary directory for this with a context manager and 19 | # TemporaryDirecotry, but then each test that uses mnist would re-download the data 20 | # this way the data is not cleaned up, but we only download it once per machine 21 | mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data') 22 | with filelock.FileLock(mnist_path + '.lock'): 23 | self.mnist = input_data.read_data_sets(mnist_path) 24 | 25 | self.np_random = np.random.RandomState() 26 | self.np_random.seed(seed) 27 | 28 | self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1)) 29 | self.action_space = Discrete(10) 30 | self.episode_len = episode_len 31 | self.time = 0 32 | self.no_images = no_images 33 | 34 | self.train_mode() 35 | self.reset() 36 | 37 | def reset(self): 38 | self._choose_next_state() 39 | self.time = 0 40 | 41 | return self.state[0] 42 | 43 | def step(self, actions): 44 | rew = self._get_reward(actions) 45 | self._choose_next_state() 46 | done = False 47 | if self.episode_len and self.time >= self.episode_len: 48 | rew = 0 49 | done = True 50 | 51 | return self.state[0], rew, done, {} 52 | 53 | def train_mode(self): 54 | self.dataset = self.mnist.train 55 | 56 | def test_mode(self): 57 | self.dataset = self.mnist.test 58 | 59 | def _choose_next_state(self): 60 | max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1 61 | index = self.np_random.randint(0, max_index) 62 | image = self.dataset.images[index].reshape(28,28,1)*255 63 | label = self.dataset.labels[index] 64 | self.state = (image, label) 65 | self.time += 1 66 | 67 | def _get_reward(self, actions): 68 | return 1 if self.state[1] == actions else 0 69 | 70 | 71 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 
28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 
92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /gym-control/rl/processors.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import numpy as np 4 | 5 | from rl.core import Processor 6 | from rl.util import WhiteningNormalizer 7 | 8 | 9 | class MultiInputProcessor(Processor): 10 | """Converts observations from an environment with multiple observations for use in a neural network 11 | policy. 12 | 13 | In some cases, you have environments that return multiple different observations per timestep 14 | (in a robotics context, for example, a camera may be used to view the scene and a joint encoder may 15 | be used to report the angles for each joint). Usually, this can be handled by a policy that has 16 | multiple inputs, one for each modality. However, observations are returned by the environment 17 | in the form of a tuple `[(modality1_t, modality2_t, ..., modalityn_t) for t in T]` but the neural network 18 | expects them in per-modality batches like so: `[[modality1_1, ..., modality1_T], ..., [[modalityn_1, ..., modalityn_T]]`. 19 | This processor converts observations appropriate for this use case. 20 | 21 | # Arguments 22 | nb_inputs (integer): The number of inputs, that is different modalities, to be used. 23 | Your neural network that you use for the policy must have a corresponding number of 24 | inputs. 25 | """ 26 | def __init__(self, nb_inputs): 27 | self.nb_inputs = nb_inputs 28 | 29 | def process_state_batch(self, state_batch): 30 | input_batches = [[] for x in range(self.nb_inputs)] 31 | for state in state_batch: 32 | processed_state = [[] for x in range(self.nb_inputs)] 33 | for observation in state: 34 | assert len(observation) == self.nb_inputs 35 | for o, s in zip(observation, processed_state): 36 | s.append(o) 37 | for idx, s in enumerate(processed_state): 38 | input_batches[idx].append(s) 39 | return [np.array(x) for x in input_batches] 40 | 41 | 42 | class WhiteningNormalizerProcessor(Processor): 43 | """Normalizes the observations to have zero mean and standard deviation of one, 44 | i.e. it applies whitening to the inputs. 45 | 46 | This typically helps significantly with learning, especially if different dimensions are 47 | on different scales. However, it complicates training in the sense that you will have to store 48 | these weights alongside the policy if you intend to load it later. It is the responsibility of 49 | the user to do so. 
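
    The underlying `WhiteningNormalizer` is created lazily from the shape and
    dtype of the first batch seen, and its running statistics are updated with
    every batch passed through `process_state_batch`.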
50 | """ 51 | def __init__(self): 52 | self.normalizer = None 53 | 54 | def process_state_batch(self, batch): 55 | if self.normalizer is None: 56 | self.normalizer = WhiteningNormalizer(shape=batch.shape[1:], dtype=batch.dtype) 57 | self.normalizer.update(batch) 58 | return self.normalizer.normalize(batch) 59 | -------------------------------------------------------------------------------- /gym-atari/scripts/visualize.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | 4 | def str2bool(v): 5 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 6 | return True 7 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 8 | return False 9 | else: 10 | raise argparse.ArgumentTypeError('Boolean value expected.') 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--log_dir', type=str, default='baselines/logs', 14 | help='The path of log directory [default: baselines/logs') 15 | parser.add_argument('--all', type=str2bool, default=False, 16 | help='Plot all the curves (diff errs) [default: False]') 17 | parser.add_argument('--weight', type=float, default=0.2, 18 | help='Weight of noise [default: 0.2]') 19 | parser.add_argument('--noise_type', type=str, default='anti_iden', 20 | help='Type of additional noise [default: anti_iden]') 21 | parser.add_argument('--save_dir', type=str, default='../results', 22 | help='Path of root directory to save plots [default: save_dir]') 23 | parser.add_argument('--env_name', type=str, default='Pong', 24 | help='Name of Atari game') 25 | parser.add_argument('--num_timesteps', type=int, default=5e7, 26 | help='Number of timesteps') 27 | 28 | FLAGS = parser.parse_args() 29 | 30 | LOG_DIR = FLAGS.log_dir 31 | ALL = FLAGS.all 32 | WEIGHT = FLAGS.weight 33 | NOISE_TYPE = FLAGS.noise_type 34 | SAVE_DIR = FLAGS.save_dir 35 | ENV = FLAGS.env_name 36 | NUM_TIMESTEPS = FLAGS.num_timesteps 37 | 38 | assert (os.path.exists(LOG_DIR)) 39 | assert (NOISE_TYPE in ['norm_one', 'norm_all', 'anti_iden']) 40 | 41 | SAVE_DIR = os.path.join(SAVE_DIR, ENV) 42 | if not os.path.exists(SAVE_DIR): 43 | os.makedirs(SAVE_DIR) 44 | 45 | def visualize(): 46 | if ALL: 47 | weights_list = [0.1, 0.2, 0.3, 0.4, 48 | 0.6, 0.7, 0.8, 0.9] 49 | if NOISE_TYPE != "anti_iden": 50 | weights_list.append(0.5) 51 | else: 52 | weights_list = [WEIGHT] 53 | 54 | for weight in weights_list: 55 | print ("python -m baselines.results_compare --log_dir %s --task_name %s \ 56 | --weight %s --noise_type %s --num_timesteps %s --save_dir %s" % \ 57 | (LOG_DIR, ENV, str(weight), NOISE_TYPE, str(NUM_TIMESTEPS), SAVE_DIR)) 58 | os.system("python -m baselines.results_compare --log_dir %s --task_name %s \ 59 | --weight %s --noise_type %s --num_timesteps %s --save_dir %s" % \ 60 | (LOG_DIR, ENV, str(weight), NOISE_TYPE, str(NUM_TIMESTEPS), SAVE_DIR)) 61 | print (LOG_DIR, ENV, str(weight), NOISE_TYPE, str(NUM_TIMESTEPS), SAVE_DIR) 62 | #os.system("cd ..") 63 | 64 | if __name__ == "__main__": 65 | visualize() -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/a2c/runner.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from baselines.a2c.utils import discount_with_dones 3 | from baselines.common.runners import AbstractEnvRunner 4 | 5 | class Runner(AbstractEnvRunner): 6 | 7 | def __init__(self, env, model, nsteps=5, gamma=0.99): 8 | super().__init__(env=env, model=model, nsteps=nsteps) 9 | self.gamma = gamma 10 | 
self.batch_action_shape = [x if x is not None else -1 for x in model.train_model.action.shape.as_list()] 11 | self.ob_dtype = model.train_model.X.dtype.as_numpy_dtype 12 | 13 | def run(self): 14 | mb_obs, mb_rewards, mb_actions, mb_values, mb_dones = [],[],[],[],[] 15 | mb_states = self.states 16 | for n in range(self.nsteps): 17 | actions, values, states, _ = self.model.step(self.obs, S=self.states, M=self.dones) 18 | mb_obs.append(np.copy(self.obs)) 19 | mb_actions.append(actions) 20 | mb_values.append(values) 21 | mb_dones.append(self.dones) 22 | obs, rewards, dones, _ = self.env.step(actions) 23 | # TODO: surrogate reward 24 | self.states = states 25 | self.dones = dones 26 | for n, done in enumerate(dones): 27 | if done: 28 | self.obs[n] = self.obs[n]*0 29 | self.obs = obs 30 | mb_rewards.append(rewards) 31 | mb_dones.append(self.dones) 32 | #batch of steps to batch of rollouts 33 | 34 | mb_obs = np.asarray(mb_obs, dtype=self.ob_dtype).swapaxes(1, 0).reshape(self.batch_ob_shape) 35 | mb_rewards = np.asarray(mb_rewards, dtype=np.float32).swapaxes(1, 0) 36 | mb_actions = np.asarray(mb_actions, dtype=self.model.train_model.action.dtype.name).swapaxes(1, 0) 37 | mb_values = np.asarray(mb_values, dtype=np.float32).swapaxes(1, 0) 38 | mb_dones = np.asarray(mb_dones, dtype=np.bool).swapaxes(1, 0) 39 | mb_masks = mb_dones[:, :-1] 40 | mb_dones = mb_dones[:, 1:] 41 | 42 | 43 | if self.gamma > 0.0: 44 | #discount/bootstrap off value fn 45 | last_values = self.model.value(self.obs, S=self.states, M=self.dones).tolist() 46 | for n, (rewards, dones, value) in enumerate(zip(mb_rewards, mb_dones, last_values)): 47 | rewards = rewards.tolist() 48 | dones = dones.tolist() 49 | if dones[-1] == 0: 50 | rewards = discount_with_dones(rewards+[value], dones+[0], self.gamma)[:-1] 51 | else: 52 | rewards = discount_with_dones(rewards, dones, self.gamma) 53 | 54 | mb_rewards[n] = rewards 55 | 56 | mb_actions = mb_actions.reshape(self.batch_action_shape) 57 | 58 | mb_rewards = mb_rewards.flatten() 59 | mb_values = mb_values.flatten() 60 | mb_masks = mb_masks.flatten() 61 | return mb_obs, mb_states, mb_rewards, mb_masks, mb_actions, mb_values 62 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from gym.spaces import np_random 4 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 5 | 6 | N_TRIALS = 10000 7 | N_EPISODES = 100 8 | 9 | def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): 10 | np.random.seed(0) 11 | np_random.seed(0) 12 | 13 | env = DummyVecEnv([env_fn]) 14 | 15 | 16 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): 17 | tf.set_random_seed(0) 18 | 19 | model = learn_fn(env) 20 | 21 | sum_rew = 0 22 | done = True 23 | 24 | for i in range(n_trials): 25 | if done: 26 | obs = env.reset() 27 | state = model.initial_state 28 | 29 | if state is not None: 30 | a, v, state, _ = model.step(obs, S=state, M=[False]) 31 | else: 32 | a, v, _, _ = model.step(obs) 33 | 34 | obs, rew, done, _ = env.step(a) 35 | sum_rew += float(rew) 36 | 37 | print("Reward in {} trials is {}".format(n_trials, sum_rew)) 38 | assert sum_rew > min_reward_fraction * n_trials, \ 39 | 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials) 40 | 41 | 42 | 43 | def 
reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES): 44 | env = DummyVecEnv([env_fn]) 45 | 46 | with tf.Graph().as_default(), tf.Session(config=tf.ConfigProto(allow_soft_placement=True)).as_default(): 47 | model = learn_fn(env) 48 | 49 | N_TRIALS = 100 50 | 51 | observations, actions, rewards = rollout(env, model, N_TRIALS) 52 | rewards = [sum(r) for r in rewards] 53 | 54 | avg_rew = sum(rewards) / N_TRIALS 55 | print("Average reward in {} episodes is {}".format(n_trials, avg_rew)) 56 | assert avg_rew > min_avg_reward, \ 57 | 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward) 58 | 59 | def rollout(env, model, n_trials): 60 | rewards = [] 61 | actions = [] 62 | observations = [] 63 | 64 | for i in range(n_trials): 65 | obs = env.reset() 66 | state = model.initial_state 67 | episode_rew = [] 68 | episode_actions = [] 69 | episode_obs = [] 70 | 71 | while True: 72 | if state is not None: 73 | a, v, state, _ = model.step(obs, S=state, M=[False]) 74 | else: 75 | a,v, _, _ = model.step(obs) 76 | 77 | obs, rew, done, _ = env.step(a) 78 | 79 | episode_rew.append(rew) 80 | episode_actions.append(a) 81 | episode_obs.append(obs) 82 | 83 | if done: 84 | break 85 | 86 | rewards.append(episode_rew) 87 | actions.append(episode_actions) 88 | observations.append(episode_obs) 89 | 90 | return observations, actions, rewards 91 | 92 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import baselines.common.tf_util as U 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | class MpiAdam(object): 7 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 8 | self.var_list = var_list 9 | self.beta1 = beta1 10 | self.beta2 = beta2 11 | self.epsilon = epsilon 12 | self.scale_grad_by_procs = scale_grad_by_procs 13 | size = sum(U.numel(v) for v in var_list) 14 | self.m = np.zeros(size, 'float32') 15 | self.v = np.zeros(size, 'float32') 16 | self.t = 0 17 | self.setfromflat = U.SetFromFlat(var_list) 18 | self.getflat = U.GetFlat(var_list) 19 | self.comm = MPI.COMM_WORLD if comm is None else comm 20 | 21 | def update(self, localg, stepsize): 22 | if self.t % 100 == 0: 23 | self.check_synced() 24 | localg = localg.astype('float32') 25 | globalg = np.zeros_like(localg) 26 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 27 | if self.scale_grad_by_procs: 28 | globalg /= self.comm.Get_size() 29 | 30 | self.t += 1 31 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 32 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 33 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 34 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 35 | self.setfromflat(self.getflat() + step) 36 | 37 | def sync(self): 38 | theta = self.getflat() 39 | self.comm.Bcast(theta, root=0) 40 | self.setfromflat(theta) 41 | 42 | def check_synced(self): 43 | if self.comm.Get_rank() == 0: # this is root 44 | theta = self.getflat() 45 | self.comm.Bcast(theta, root=0) 46 | else: 47 | thetalocal = self.getflat() 48 | thetaroot = np.empty_like(thetalocal) 49 | self.comm.Bcast(thetaroot, root=0) 50 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 51 | 52 | @U.in_session 53 | def test_MpiAdam(): 54 | np.random.seed(0) 55 | tf.set_random_seed(0) 56 | 57 | a = 
tf.Variable(np.random.randn(3).astype('float32')) 58 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 59 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 60 | 61 | stepsize = 1e-2 62 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 63 | do_update = U.function([], loss, updates=[update_op]) 64 | 65 | tf.get_default_session().run(tf.global_variables_initializer()) 66 | for i in range(10): 67 | print(i,do_update()) 68 | 69 | tf.set_random_seed(0) 70 | tf.get_default_session().run(tf.global_variables_initializer()) 71 | 72 | var_list = [a,b] 73 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)], updates=[update_op]) 74 | adam = MpiAdam(var_list) 75 | 76 | for i in range(10): 77 | l,g = lossandgrad() 78 | adam.update(g, stepsize) 79 | print(i,l) -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/vec_env/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | from collections import OrderedDict 4 | from . import VecEnv 5 | 6 | class DummyVecEnv(VecEnv): 7 | def __init__(self, env_fns): 8 | self.envs = [fn() for fn in env_fns] 9 | env = self.envs[0] 10 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 11 | shapes, dtypes = {}, {} 12 | self.keys = [] 13 | obs_space = env.observation_space 14 | 15 | if isinstance(obs_space, spaces.Dict): 16 | assert isinstance(obs_space.spaces, OrderedDict) 17 | subspaces = obs_space.spaces 18 | else: 19 | subspaces = {None: obs_space} 20 | 21 | for key, box in subspaces.items(): 22 | shapes[key] = box.shape 23 | dtypes[key] = box.dtype 24 | self.keys.append(key) 25 | 26 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } 27 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 28 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 29 | self.buf_infos = [{} for _ in range(self.num_envs)] 30 | self.actions = None 31 | 32 | def step_async(self, actions): 33 | listify = True 34 | try: 35 | if len(actions) == self.num_envs: 36 | listify = False 37 | except TypeError: 38 | pass 39 | 40 | if not listify: 41 | self.actions = actions 42 | else: 43 | assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs) 44 | self.actions = [actions] 45 | 46 | def step_wait(self): 47 | for e in range(self.num_envs): 48 | action = self.actions[e] 49 | if isinstance(self.envs[e].action_space, spaces.Discrete): 50 | action = int(action) 51 | 52 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action) 53 | if self.buf_dones[e]: 54 | obs = self.envs[e].reset() 55 | self._save_obs(e, obs) 56 | return (np.copy(self._obs_from_buf()), np.copy(self.buf_rews), np.copy(self.buf_dones), 57 | self.buf_infos.copy()) 58 | 59 | def reset(self): 60 | for e in range(self.num_envs): 61 | obs = self.envs[e].reset() 62 | self._save_obs(e, obs) 63 | return self._obs_from_buf() 64 | 65 | def close(self): 66 | return 67 | 68 | def render(self, mode='human'): 69 | return [e.render(mode=mode) for e in self.envs] 70 | 71 | def _save_obs(self, e, obs): 72 | for k in self.keys: 73 | if k is None: 74 | self.buf_obs[k][e] = obs 75 | else: 76 | self.buf_obs[k][e] = obs[k] 77 | 78 | def _obs_from_buf(self): 79 | if self.keys==[None]: 80 | return self.buf_obs[None] 81 | else: 82 | return 
self.buf_obs 83 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/filters.py: -------------------------------------------------------------------------------- 1 | from .running_stat import RunningStat 2 | from collections import deque 3 | import numpy as np 4 | 5 | class Filter(object): 6 | def __call__(self, x, update=True): 7 | raise NotImplementedError 8 | def reset(self): 9 | pass 10 | 11 | class IdentityFilter(Filter): 12 | def __call__(self, x, update=True): 13 | return x 14 | 15 | class CompositionFilter(Filter): 16 | def __init__(self, fs): 17 | self.fs = fs 18 | def __call__(self, x, update=True): 19 | for f in self.fs: 20 | x = f(x) 21 | return x 22 | def output_shape(self, input_space): 23 | out = input_space.shape 24 | for f in self.fs: 25 | out = f.output_shape(out) 26 | return out 27 | 28 | class ZFilter(Filter): 29 | """ 30 | y = (x-mean)/std 31 | using running estimates of mean,std 32 | """ 33 | 34 | def __init__(self, shape, demean=True, destd=True, clip=10.0): 35 | self.demean = demean 36 | self.destd = destd 37 | self.clip = clip 38 | 39 | self.rs = RunningStat(shape) 40 | 41 | def __call__(self, x, update=True): 42 | if update: self.rs.push(x) 43 | if self.demean: 44 | x = x - self.rs.mean 45 | if self.destd: 46 | x = x / (self.rs.std+1e-8) 47 | if self.clip: 48 | x = np.clip(x, -self.clip, self.clip) 49 | return x 50 | def output_shape(self, input_space): 51 | return input_space.shape 52 | 53 | class AddClock(Filter): 54 | def __init__(self): 55 | self.count = 0 56 | def reset(self): 57 | self.count = 0 58 | def __call__(self, x, update=True): 59 | return np.append(x, self.count/100.0) 60 | def output_shape(self, input_space): 61 | return (input_space.shape[0]+1,) 62 | 63 | class FlattenFilter(Filter): 64 | def __call__(self, x, update=True): 65 | return x.ravel() 66 | def output_shape(self, input_space): 67 | return (int(np.prod(input_space.shape)),) 68 | 69 | class Ind2OneHotFilter(Filter): 70 | def __init__(self, n): 71 | self.n = n 72 | def __call__(self, x, update=True): 73 | out = np.zeros(self.n) 74 | out[x] = 1 75 | return out 76 | def output_shape(self, input_space): 77 | return (input_space.n,) 78 | 79 | class DivFilter(Filter): 80 | def __init__(self, divisor): 81 | self.divisor = divisor 82 | def __call__(self, x, update=True): 83 | return x / self.divisor 84 | def output_shape(self, input_space): 85 | return input_space.shape 86 | 87 | class StackFilter(Filter): 88 | def __init__(self, length): 89 | self.stack = deque(maxlen=length) 90 | def reset(self): 91 | self.stack.clear() 92 | def __call__(self, x, update=True): 93 | self.stack.append(x) 94 | while len(self.stack) < self.stack.maxlen: 95 | self.stack.append(x) 96 | return np.concatenate(self.stack, axis=-1) 97 | def output_shape(self, input_space): 98 | return input_space.shape[:-1] + (input_space.shape[-1] * self.stack.maxlen,) 99 | -------------------------------------------------------------------------------- /gym-control/scripts/train-qlearn.sh: -------------------------------------------------------------------------------- 1 | for i in $(seq 1 3); 2 | do 3 | for log_dir in qlearn/$i 4 | do 5 | (python qlearn_cartpole.py --log_dir $log_dir)& 6 | 7 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy)& 8 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy)& 9 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy)& 10 | (python 
qlearn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy)& 11 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy)& 12 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy)& 13 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy)& 14 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy)& 15 | 16 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy --smooth True)& 17 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy --smooth True)& 18 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy --smooth True)& 19 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy --smooth True)& 20 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy --smooth True)& 21 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy --smooth True)& 22 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy --smooth True)& 23 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy --smooth True)& 24 | 25 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate)& 26 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate)& 27 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate)& 28 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate)& 29 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate)& 30 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate)& 31 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate)& 32 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate)& 33 | 34 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate --smooth True)& 35 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate --smooth True)& 36 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate --smooth True)& 37 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate --smooth True)& 38 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate --smooth True)& 39 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate --smooth True)& 40 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate --smooth True)& 41 | (python qlearn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate --smooth True)& 42 | done 43 | done 44 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_segment_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from baselines.common.segment_tree import SumSegmentTree, MinSegmentTree 4 | 5 | 6 | def test_tree_set(): 7 | tree = SumSegmentTree(4) 8 | 9 | tree[2] = 1.0 10 | tree[3] = 3.0 11 | 12 | assert np.isclose(tree.sum(), 4.0) 13 | assert np.isclose(tree.sum(0, 2), 0.0) 14 | assert np.isclose(tree.sum(0, 3), 1.0) 15 | assert np.isclose(tree.sum(2, 3), 1.0) 16 | assert np.isclose(tree.sum(2, -1), 1.0) 
17 | assert np.isclose(tree.sum(2, 4), 4.0) 18 | 19 | 20 | def test_tree_set_overlap(): 21 | tree = SumSegmentTree(4) 22 | 23 | tree[2] = 1.0 24 | tree[2] = 3.0 25 | 26 | assert np.isclose(tree.sum(), 3.0) 27 | assert np.isclose(tree.sum(2, 3), 3.0) 28 | assert np.isclose(tree.sum(2, -1), 3.0) 29 | assert np.isclose(tree.sum(2, 4), 3.0) 30 | assert np.isclose(tree.sum(1, 2), 0.0) 31 | 32 | 33 | def test_prefixsum_idx(): 34 | tree = SumSegmentTree(4) 35 | 36 | tree[2] = 1.0 37 | tree[3] = 3.0 38 | 39 | assert tree.find_prefixsum_idx(0.0) == 2 40 | assert tree.find_prefixsum_idx(0.5) == 2 41 | assert tree.find_prefixsum_idx(0.99) == 2 42 | assert tree.find_prefixsum_idx(1.01) == 3 43 | assert tree.find_prefixsum_idx(3.00) == 3 44 | assert tree.find_prefixsum_idx(4.00) == 3 45 | 46 | 47 | def test_prefixsum_idx2(): 48 | tree = SumSegmentTree(4) 49 | 50 | tree[0] = 0.5 51 | tree[1] = 1.0 52 | tree[2] = 1.0 53 | tree[3] = 3.0 54 | 55 | assert tree.find_prefixsum_idx(0.00) == 0 56 | assert tree.find_prefixsum_idx(0.55) == 1 57 | assert tree.find_prefixsum_idx(0.99) == 1 58 | assert tree.find_prefixsum_idx(1.51) == 2 59 | assert tree.find_prefixsum_idx(3.00) == 3 60 | assert tree.find_prefixsum_idx(5.50) == 3 61 | 62 | 63 | def test_max_interval_tree(): 64 | tree = MinSegmentTree(4) 65 | 66 | tree[0] = 1.0 67 | tree[2] = 0.5 68 | tree[3] = 3.0 69 | 70 | assert np.isclose(tree.min(), 0.5) 71 | assert np.isclose(tree.min(0, 2), 1.0) 72 | assert np.isclose(tree.min(0, 3), 0.5) 73 | assert np.isclose(tree.min(0, -1), 0.5) 74 | assert np.isclose(tree.min(2, 4), 0.5) 75 | assert np.isclose(tree.min(3, 4), 3.0) 76 | 77 | tree[2] = 0.7 78 | 79 | assert np.isclose(tree.min(), 0.7) 80 | assert np.isclose(tree.min(0, 2), 1.0) 81 | assert np.isclose(tree.min(0, 3), 0.7) 82 | assert np.isclose(tree.min(0, -1), 0.7) 83 | assert np.isclose(tree.min(2, 4), 0.7) 84 | assert np.isclose(tree.min(3, 4), 3.0) 85 | 86 | tree[2] = 4.0 87 | 88 | assert np.isclose(tree.min(), 1.0) 89 | assert np.isclose(tree.min(0, 2), 1.0) 90 | assert np.isclose(tree.min(0, 3), 1.0) 91 | assert np.isclose(tree.min(0, -1), 1.0) 92 | assert np.isclose(tree.min(2, 4), 3.0) 93 | assert np.isclose(tree.min(2, 3), 4.0) 94 | assert np.isclose(tree.min(2, -1), 4.0) 95 | assert np.isclose(tree.min(3, 4), 3.0) 96 | 97 | 98 | if __name__ == '__main__': 99 | test_tree_set() 100 | test_tree_set_overlap() 101 | test_prefixsum_idx() 102 | test_prefixsum_idx2() 103 | test_max_interval_tree() 104 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/tests/test_serialization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | import pytest 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | from baselines.common.tests.envs.mnist_env import MnistEnv 8 | from baselines.common.vec_env.dummy_vec_env import DummyVecEnv 9 | from baselines.run import get_learn_function 10 | from baselines.common.tf_util import make_session, get_session 11 | 12 | from functools import partial 13 | 14 | 15 | learn_kwargs = { 16 | 'deepq': {}, 17 | 'a2c': {}, 18 | 'acktr': {}, 19 | 'ppo2': {'nminibatches': 1, 'nsteps': 10}, 20 | 'trpo_mpi': {}, 21 | } 22 | 23 | network_kwargs = { 24 | 'mlp': {}, 25 | 'cnn': {'pad': 'SAME'}, 26 | 'lstm': {}, 27 | 'cnn_lnlstm': {'pad': 'SAME'} 28 | } 29 | 30 | 31 | @pytest.mark.parametrize("learn_fn", learn_kwargs.keys()) 32 | @pytest.mark.parametrize("network_fn", network_kwargs.keys()) 33 | def 
test_serialization(learn_fn, network_fn): 34 | ''' 35 | Test if the trained model can be serialized 36 | ''' 37 | 38 | 39 | if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']: 40 | # TODO make acktr work with recurrent policies 41 | # and test 42 | # github issue: https://github.com/openai/baselines/issues/194 43 | return 44 | 45 | env = DummyVecEnv([lambda: MnistEnv(10, episode_len=100)]) 46 | ob = env.reset().copy() 47 | learn = get_learn_function(learn_fn) 48 | 49 | kwargs = {} 50 | kwargs.update(network_kwargs[network_fn]) 51 | kwargs.update(learn_kwargs[learn_fn]) 52 | 53 | 54 | learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs) 55 | 56 | with tempfile.TemporaryDirectory() as td: 57 | model_path = os.path.join(td, 'serialization_test_model') 58 | 59 | with tf.Graph().as_default(), make_session().as_default(): 60 | model = learn(total_timesteps=100) 61 | model.save(model_path) 62 | mean1, std1 = _get_action_stats(model, ob) 63 | variables_dict1 = _serialize_variables() 64 | 65 | with tf.Graph().as_default(), make_session().as_default(): 66 | model = learn(total_timesteps=0, load_path=model_path) 67 | mean2, std2 = _get_action_stats(model, ob) 68 | variables_dict2 = _serialize_variables() 69 | 70 | for k, v in variables_dict1.items(): 71 | np.testing.assert_allclose(v, variables_dict2[k], atol=0.01, 72 | err_msg='saved and loaded variable {} value mismatch'.format(k)) 73 | 74 | np.testing.assert_allclose(mean1, mean2, atol=0.5) 75 | np.testing.assert_allclose(std1, std2, atol=0.5) 76 | 77 | 78 | 79 | def _serialize_variables(): 80 | sess = get_session() 81 | variables = tf.trainable_variables() 82 | values = sess.run(variables) 83 | return {var.name: value for var, value in zip(variables, values)} 84 | 85 | 86 | def _get_action_stats(model, ob): 87 | ntrials = 1000 88 | if model.initial_state is None or model.initial_state == []: 89 | actions = np.array([model.step(ob)[0] for _ in range(ntrials)]) 90 | else: 91 | actions = np.array([model.step(ob, S=model.initial_state, M=[False])[0] for _ in range(ntrials)]) 92 | 93 | mean = np.mean(actions, axis=0) 94 | std = np.std(actions, axis=0) 95 | 96 | return mean, std 97 | 98 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/mpi_util.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from mpi4py import MPI 3 | import os, numpy as np 4 | import platform 5 | import shutil 6 | import subprocess 7 | 8 | def sync_from_root(sess, variables, comm=None): 9 | """ 10 | Send the root node's parameters to every worker. 11 | Arguments: 12 | sess: the TensorFlow session. 13 | variables: all parameter variables including optimizer's 14 | """ 15 | if comm is None: comm = MPI.COMM_WORLD 16 | rank = comm.Get_rank() 17 | for var in variables: 18 | if rank == 0: 19 | comm.Bcast(sess.run(var)) 20 | else: 21 | import tensorflow as tf 22 | returned_var = np.empty(var.shape, dtype='float32') 23 | comm.Bcast(returned_var) 24 | sess.run(tf.assign(var, returned_var)) 25 | 26 | def gpu_count(): 27 | """ 28 | Count the GPUs on this machine. 29 | """ 30 | if shutil.which('nvidia-smi') is None: 31 | return 0 32 | output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']) 33 | return max(0, len(output.split(b'\n')) - 2) 34 | 35 | def setup_mpi_gpus(): 36 | """ 37 | Set CUDA_VISIBLE_DEVICES using MPI. 
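    For example (editor's note), with 4 GPUs on a node and 8 MPI ranks running on it,
    local ranks 0-7 are mapped to CUDA devices 0, 1, 2, 3, 0, 1, 2, 3 (local_rank % num_gpus).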
38 | """ 39 | num_gpus = gpu_count() 40 | if num_gpus == 0: 41 | return 42 | local_rank, _ = get_local_rank_size(MPI.COMM_WORLD) 43 | os.environ['CUDA_VISIBLE_DEVICES'] = str(local_rank % num_gpus) 44 | 45 | def get_local_rank_size(comm): 46 | """ 47 | Returns the rank of each process on its machine 48 | The processes on a given machine will be assigned ranks 49 | 0, 1, 2, ..., N-1, 50 | where N is the number of processes on this machine. 51 | 52 | Useful if you want to assign one gpu per machine 53 | """ 54 | this_node = platform.node() 55 | ranks_nodes = comm.allgather((comm.Get_rank(), this_node)) 56 | node2rankssofar = defaultdict(int) 57 | local_rank = None 58 | for (rank, node) in ranks_nodes: 59 | if rank == comm.Get_rank(): 60 | local_rank = node2rankssofar[node] 61 | node2rankssofar[node] += 1 62 | assert local_rank is not None 63 | return local_rank, node2rankssofar[this_node] 64 | 65 | def share_file(comm, path): 66 | """ 67 | Copies the file from rank 0 to all other ranks 68 | Puts it in the same place on all machines 69 | """ 70 | localrank, _ = get_local_rank_size(comm) 71 | if comm.Get_rank() == 0: 72 | with open(path, 'rb') as fh: 73 | data = fh.read() 74 | comm.bcast(data) 75 | else: 76 | data = comm.bcast(None) 77 | if localrank == 0: 78 | os.makedirs(os.path.dirname(path), exist_ok=True) 79 | with open(path, 'wb') as fh: 80 | fh.write(data) 81 | comm.Barrier() 82 | 83 | def dict_gather(comm, d, op='mean', assert_all_have_data=True): 84 | if comm is None: return d 85 | alldicts = comm.allgather(d) 86 | size = comm.size 87 | k2li = defaultdict(list) 88 | for d in alldicts: 89 | for (k,v) in d.items(): 90 | k2li[k].append(v) 91 | result = {} 92 | for (k,li) in k2li.items(): 93 | if assert_all_have_data: 94 | assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k) 95 | if op=='mean': 96 | result[k] = np.mean(li, axis=0) 97 | elif op=='sum': 98 | result[k] = np.sum(li, axis=0) 99 | else: 100 | assert 0, op 101 | return result 102 | -------------------------------------------------------------------------------- /gym-atari/scripts/train-alien.sh: -------------------------------------------------------------------------------- 1 | cd baselines 2 | 3 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.1 --weight=0.1 --normal=False --surrogate=False --noise_type=anti_iden)& 4 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=False --noise_type=anti_iden)& 5 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.3 --weight=0.3 --normal=False --surrogate=False --noise_type=anti_iden)& 6 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.4 --weight=0.4 --normal=False --surrogate=False --noise_type=anti_iden)& 7 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.6 --weight=0.6 --normal=False --surrogate=False --noise_type=anti_iden)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 
--save_path=logs-alien/alien/ppo2_50M_noisy_0.7 --weight=0.7 --normal=False --surrogate=False --noise_type=anti_iden)& 9 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.8 --weight=0.8 --normal=False --surrogate=False --noise_type=anti_iden)& 10 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_noisy_0.9 --weight=0.9 --normal=False --surrogate=False --noise_type=anti_iden)& 11 | 12 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.1 --weight=0.1 --normal=False --surrogate=True --noise_type=anti_iden)& 13 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.2 --weight=0.2 --normal=False --surrogate=True --noise_type=anti_iden)& 14 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.3 --weight=0.3 --normal=False --surrogate=True --noise_type=anti_iden)& 15 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.4 --weight=0.4 --normal=False --surrogate=True --noise_type=anti_iden)& 16 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.6 --weight=0.6 --normal=False --surrogate=True --noise_type=anti_iden)& 17 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.7 --weight=0.7 --normal=False --surrogate=True --noise_type=anti_iden)& 18 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.8 --weight=0.8 --normal=False --surrogate=True --noise_type=anti_iden)& 19 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-alien/alien/ppo2_50M_surrogate_0.9 --weight=0.9 --normal=False --surrogate=True --noise_type=anti_iden)& 20 | 21 | cd .. 
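# Editor's note (illustrative, not part of the original script): with a single GPU you can
# run one configuration at a time instead of launching all jobs in parallel, e.g.
#   cd baselines
#   export CUDA_VISIBLE_DEVICES=0
#   python -m baselines.run --alg=ppo2 --env=AlienNoFrameskip-v4 --num_timesteps=5e7 \
#     --save_path=logs-alien/alien/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False \
#     --surrogate=False --noise_type=anti_iden
#   cd ..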
22 | -------------------------------------------------------------------------------- /gym-atari/scripts/train-phoenix.sh: -------------------------------------------------------------------------------- 1 | cd baselines 2 | 3 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.1 --weight=0.1 --normal=False --surrogate=False --noise_type=anti_iden)& 4 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=False --noise_type=anti_iden)& 5 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.3 --weight=0.3 --normal=False --surrogate=False --noise_type=anti_iden)& 6 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.4 --weight=0.4 --normal=False --surrogate=False --noise_type=anti_iden)& 7 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.6 --weight=0.6 --normal=False --surrogate=False --noise_type=anti_iden)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.7 --weight=0.7 --normal=False --surrogate=False --noise_type=anti_iden)& 9 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.8 --weight=0.8 --normal=False --surrogate=False --noise_type=anti_iden)& 10 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.9 --weight=0.9 --normal=False --surrogate=False --noise_type=anti_iden)& 11 | 12 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.1 --weight=0.1 --normal=False --surrogate=True --noise_type=anti_iden)& 13 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.2 --weight=0.2 --normal=False --surrogate=True --noise_type=anti_iden)& 14 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.3 --weight=0.3 --normal=False --surrogate=True --noise_type=anti_iden)& 15 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.4 --weight=0.4 --normal=False --surrogate=True --noise_type=anti_iden)& 16 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.6 --weight=0.6 --normal=False --surrogate=True --noise_type=anti_iden)& 17 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 
--save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.7 --weight=0.7 --normal=False --surrogate=True --noise_type=anti_iden)&
18 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.8 --weight=0.8 --normal=False --surrogate=True --noise_type=anti_iden)&
19 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-phoenix/phoenix/ppo2_50M_surrogate_0.9 --weight=0.9 --normal=False --surrogate=True --noise_type=anti_iden)&
20 | 
21 | cd ..
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RL with Perturbed Rewards
2 | 
3 | This is the TensorFlow implementation of [Reinforcement Learning with Perturbed Rewards](https://arxiv.org/abs/1810.01032) as described in the following AAAI 2020 paper (__Spotlight__):
4 | 
5 | ```
6 | @inproceedings{wang2020rlnoisy,
7 |   title={Reinforcement Learning with Perturbed Rewards},
8 |   author={Wang, Jingkang and Liu, Yang and Li, Bo},
9 |   booktitle={AAAI},
10 |   year={2020}
11 | }
12 | ```
13 | 
14 | The implementation is based on the [keras-rl](https://github.com/keras-rl/keras-rl) and [OpenAI baselines](https://github.com/openai/baselines) frameworks. Thanks to the original authors!
15 | 
16 | - `gym-control`: Classic control games
17 | - `gym-atari`: Atari-2600 games
18 | 
19 | ## Dependencies
20 | - python 3.5
21 | - tensorflow 1.10.0, keras 2.1.0
22 | - gym, scipy, joblib
23 | - progressbar2, mpi4py, cloudpickle, opencv-python, h5py, pandas
24 | 
25 | Note: make sure that you have successfully installed the baselines package and the other dependencies by following the steps below (we use [virtualenvwrapper](https://virtualenvwrapper.readthedocs.io/en/latest/) to create a virtual environment):
26 | ```
27 | mkvirtualenv rl-noisy --python=/usr/bin/python3
28 | pip install -r requirements.txt
29 | cd gym-atari/baselines
30 | pip install -e .
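# (editor's note) "pip install -e ." installs the bundled copy of OpenAI baselines in editable
# mode, so that the "python -m baselines.run" commands below pick up the modified code under
# gym-atari/baselines rather than a system-wide installation.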
31 | ```
32 | 
33 | ## Examples
34 | - Classic control (DQN on Cartpole)
35 | ```
36 | cd gym-control
37 | python dqn_cartpole.py                                          # true reward
38 | python dqn_cartpole.py --error_positive 0.1 --reward noisy      # perturbed reward
39 | python dqn_cartpole.py --error_positive 0.1 --reward surrogate  # surrogate reward (estimated)
40 | ```
41 | - Atari-2600 (PPO on Phoenix)
42 | ```
43 | cd gym-atari/baselines
44 | # true reward
45 | python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 --normal=True
46 | # noisy reward
47 | python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 \
48 |     --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=False --noise_type=anti_iden
49 | # surrogate reward (estimated)
50 | python -m baselines.run --alg=ppo2 --env=PhoenixNoFrameskip-v4 --num_timesteps=5e7 \
51 |     --save_path=logs-phoenix/phoenix/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=True --noise_type=anti_iden
52 | ```
53 | 
54 | ## Reproduce the Results
55 | To reproduce all the results reported in the paper, please refer to the `scripts/` folders in `gym-control` and `gym-atari`:
56 | - `gym-control/scripts`
57 |   - Cartpole
58 |     - `train-cem.sh` (CEM)
59 |     - `train-dqn.sh` (DQN)
60 |     - `train-duel-dqn.sh` (Dueling-DQN)
61 |     - `train-qlearn.sh` (Q-Learning)
62 |     - `train-sarsa.sh` (Deep SARSA)
63 |   - Pendulum
64 |     - `train-ddpg.sh` (DDPG)
65 |     - `train-naf.sh` (NAF)
66 | - `gym-atari/scripts`
67 |   - `train-alien.sh` (Alien)
68 |   - `train-carnival.sh` (Carnival)
69 |   - `train-mspacman.sh` (MsPacman)
70 |   - `train-phoenix.sh` (Phoenix)
71 |   - `train-pong.sh` (Pong)
72 |   - `train-seaquest.sh` (Seaquest)
73 |   - `train-normal.sh` (Training with true rewards)
74 | 
75 | 
76 | If you have eight available GPUs (memory > 8 GB), you can directly run the `*.sh` scripts one at a time. Otherwise, follow the instructions in the scripts and launch the experiments individually. It usually takes one or two days (GTX-1080 Ti) to train a policy.
77 | ```
78 | cd gym-atari
79 | sh scripts/train-alien.sh
80 | ```
81 | The logs and models will be saved automatically. We provide `results_single.py` for getting the averaged scores:
82 | ```
83 | python -m baselines.results_single --log_dir logs-alien
84 | ```
85 | 
86 | ## Citation
87 | Please cite our paper if you use this code in your research work.
88 | 
89 | ## Questions/Bugs
90 | Please submit a Github issue or contact wangjk@cs.toronto.edu if you have any questions or find any bugs.
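## Appendix: Reward Perturbation Sketch
For intuition, the snippet below sketches the binary-reward special case of the noise model and the surrogate (unbiased) reward estimator described in the paper. This is an editor's illustration, not the repository's implementation: `e_plus` and `e_minus` stand for the flip probabilities that flags such as `--error_positive` and `--weight` configure (an assumption made for this example), and the actual code handles general confusion matrices.
```python
import numpy as np

def perturb(r, e_plus, e_minus, rng=np.random):
    """Flip a binary reward r in {-1, +1}: +1 is flipped w.p. e_plus, -1 w.p. e_minus."""
    flip = rng.rand() < (e_plus if r > 0 else e_minus)
    return -r if flip else r

def surrogate(r_obs, e_plus, e_minus):
    """Unbiased estimate of the true reward from the observed (possibly flipped) reward."""
    denom = 1.0 - e_plus - e_minus  # requires e_plus + e_minus < 1
    if r_obs > 0:
        return ((1.0 - e_minus) * r_obs - e_plus * (-r_obs)) / denom
    return ((1.0 - e_plus) * r_obs - e_minus * (-r_obs)) / denom
```
In expectation over the noise, `surrogate(perturb(r, ...), ...)` equals the true reward `r`, which is why training on surrogate rewards can approximate training on the clean rewards.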
91 | -------------------------------------------------------------------------------- /gym-atari/scripts/train-carnival.sh: -------------------------------------------------------------------------------- 1 | cd baselines 2 | 3 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.1 --weight=0.1 --normal=False --surrogate=False --noise_type=anti_iden)& 4 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=False --noise_type=anti_iden)& 5 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.3 --weight=0.3 --normal=False --surrogate=False --noise_type=anti_iden)& 6 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.4 --weight=0.4 --normal=False --surrogate=False --noise_type=anti_iden)& 7 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.6 --weight=0.6 --normal=False --surrogate=False --noise_type=anti_iden)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.7 --weight=0.7 --normal=False --surrogate=False --noise_type=anti_iden)& 9 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.8 --weight=0.8 --normal=False --surrogate=False --noise_type=anti_iden)& 10 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_noisy_0.9 --weight=0.9 --normal=False --surrogate=False --noise_type=anti_iden)& 11 | 12 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.1 --weight=0.1 --normal=False --surrogate=True --noise_type=anti_iden)& 13 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.2 --weight=0.2 --normal=False --surrogate=True --noise_type=anti_iden)& 14 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.3 --weight=0.3 --normal=False --surrogate=True --noise_type=anti_iden)& 15 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.4 --weight=0.4 --normal=False --surrogate=True --noise_type=anti_iden)& 16 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.6 --weight=0.6 --normal=False --surrogate=True --noise_type=anti_iden)& 17 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 
--env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.7 --weight=0.7 --normal=False --surrogate=True --noise_type=anti_iden)& 18 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.8 --weight=0.8 --normal=False --surrogate=True --noise_type=anti_iden)& 19 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=CarnivalNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-carnival/carnival/ppo2_50M_surrogate_0.9 --weight=0.9 --normal=False --surrogate=True --noise_type=anti_iden)& 20 | 21 | cd .. 22 | -------------------------------------------------------------------------------- /gym-atari/scripts/train-mspacman.sh: -------------------------------------------------------------------------------- 1 | cd baselines 2 | 3 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.1 --weight=0.1 --normal=False --surrogate=False --noise_type=anti_iden)& 4 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=False --noise_type=anti_iden)& 5 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.3 --weight=0.3 --normal=False --surrogate=False --noise_type=anti_iden)& 6 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.4 --weight=0.4 --normal=False --surrogate=False --noise_type=anti_iden)& 7 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.6 --weight=0.6 --normal=False --surrogate=False --noise_type=anti_iden)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.7 --weight=0.7 --normal=False --surrogate=False --noise_type=anti_iden)& 9 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.8 --weight=0.8 --normal=False --surrogate=False --noise_type=anti_iden)& 10 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_noisy_0.9 --weight=0.9 --normal=False --surrogate=False --noise_type=anti_iden)& 11 | 12 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.1 --weight=0.1 --normal=False --surrogate=True --noise_type=anti_iden)& 13 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.2 --weight=0.2 --normal=False --surrogate=True --noise_type=anti_iden)& 14 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 
--save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.3 --weight=0.3 --normal=False --surrogate=True --noise_type=anti_iden)& 15 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.4 --weight=0.4 --normal=False --surrogate=True --noise_type=anti_iden)& 16 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.6 --weight=0.6 --normal=False --surrogate=True --noise_type=anti_iden)& 17 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.7 --weight=0.7 --normal=False --surrogate=True --noise_type=anti_iden)& 18 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.8 --weight=0.8 --normal=False --surrogate=True --noise_type=anti_iden)& 19 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=MsPacmanNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-mspacman/mspacman/ppo2_50M_surrogate_0.9 --weight=0.9 --normal=False --surrogate=True --noise_type=anti_iden)& 20 | 21 | cd .. 22 | -------------------------------------------------------------------------------- /gym-atari/scripts/train-seaquest.sh: -------------------------------------------------------------------------------- 1 | cd baselines 2 | 3 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.1 --weight=0.1 --normal=False --surrogate=False --noise_type=anti_iden)& 4 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.2 --weight=0.2 --normal=False --surrogate=False --noise_type=anti_iden)& 5 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.3 --weight=0.3 --normal=False --surrogate=False --noise_type=anti_iden)& 6 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.4 --weight=0.4 --normal=False --surrogate=False --noise_type=anti_iden)& 7 | (export CUDA_VISIBLE_DEVICES=4 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.6 --weight=0.6 --normal=False --surrogate=False --noise_type=anti_iden)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.7 --weight=0.7 --normal=False --surrogate=False --noise_type=anti_iden)& 9 | (export CUDA_VISIBLE_DEVICES=6 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.8 --weight=0.8 --normal=False --surrogate=False --noise_type=anti_iden)& 10 | (export CUDA_VISIBLE_DEVICES=7 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_noisy_0.9 --weight=0.9 
--normal=False --surrogate=False --noise_type=anti_iden)& 11 | 12 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.1 --weight=0.1 --normal=False --surrogate=True --noise_type=anti_iden)& 13 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.2 --weight=0.2 --normal=False --surrogate=True --noise_type=anti_iden)& 14 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.3 --weight=0.3 --normal=False --surrogate=True --noise_type=anti_iden)& 15 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.4 --weight=0.4 --normal=False --surrogate=True --noise_type=anti_iden)& 16 | (export CUDA_VISIBLE_DEVICES=0 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.6 --weight=0.6 --normal=False --surrogate=True --noise_type=anti_iden)& 17 | (export CUDA_VISIBLE_DEVICES=1 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.7 --weight=0.7 --normal=False --surrogate=True --noise_type=anti_iden)& 18 | (export CUDA_VISIBLE_DEVICES=2 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.8 --weight=0.8 --normal=False --surrogate=True --noise_type=anti_iden)& 19 | (export CUDA_VISIBLE_DEVICES=3 && python -m baselines.run --alg=ppo2 --env=SeaquestNoFrameskip-v4 --num_timesteps=5e7 --save_path=logs-seaquest/seaquest/ppo2_50M_surrogate_0.9 --weight=0.9 --normal=False --surrogate=True --noise_type=anti_iden)& 20 | 21 | cd .. 22 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/vec_env/__init__.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from baselines import logger 3 | 4 | class AlreadySteppingError(Exception): 5 | """ 6 | Raised when an asynchronous step is running while 7 | step_async() is called again. 8 | """ 9 | def __init__(self): 10 | msg = 'already running an async step' 11 | Exception.__init__(self, msg) 12 | 13 | class NotSteppingError(Exception): 14 | """ 15 | Raised when an asynchronous step is not running but 16 | step_wait() is called. 17 | """ 18 | def __init__(self): 19 | msg = 'not running an async step' 20 | Exception.__init__(self, msg) 21 | 22 | class VecEnv(ABC): 23 | """ 24 | An abstract asynchronous, vectorized environment. 25 | """ 26 | def __init__(self, num_envs, observation_space, action_space): 27 | self.num_envs = num_envs 28 | self.observation_space = observation_space 29 | self.action_space = action_space 30 | 31 | @abstractmethod 32 | def reset(self): 33 | """ 34 | Reset all the environments and return an array of 35 | observations, or a tuple of observation arrays. 36 | 37 | If step_async is still doing work, that work will 38 | be cancelled and step_wait() should not be called 39 | until step_async() is invoked again. 
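        For example (editor's note), a DummyVecEnv wrapping a single CartPole environment
        returns an array of shape (1, 4) here, i.e. (num_envs,) + observation_space.shape.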
40 | """ 41 | pass 42 | 43 | @abstractmethod 44 | def step_async(self, actions): 45 | """ 46 | Tell all the environments to start taking a step 47 | with the given actions. 48 | Call step_wait() to get the results of the step. 49 | 50 | You should not call this if a step_async run is 51 | already pending. 52 | """ 53 | pass 54 | 55 | @abstractmethod 56 | def step_wait(self): 57 | """ 58 | Wait for the step taken with step_async(). 59 | 60 | Returns (obs, rews, dones, infos): 61 | - obs: an array of observations, or a tuple of 62 | arrays of observations. 63 | - rews: an array of rewards 64 | - dones: an array of "episode done" booleans 65 | - infos: a sequence of info objects 66 | """ 67 | pass 68 | 69 | @abstractmethod 70 | def close(self): 71 | """ 72 | Clean up the environments' resources. 73 | """ 74 | pass 75 | 76 | def step(self, actions): 77 | self.step_async(actions) 78 | return self.step_wait() 79 | 80 | def render(self, mode='human'): 81 | logger.warn('Render not defined for %s'%self) 82 | 83 | @property 84 | def unwrapped(self): 85 | if isinstance(self, VecEnvWrapper): 86 | return self.venv.unwrapped 87 | else: 88 | return self 89 | 90 | class VecEnvWrapper(VecEnv): 91 | def __init__(self, venv, observation_space=None, action_space=None): 92 | self.venv = venv 93 | VecEnv.__init__(self, 94 | num_envs=venv.num_envs, 95 | observation_space=observation_space or venv.observation_space, 96 | action_space=action_space or venv.action_space) 97 | 98 | def step_async(self, actions): 99 | self.venv.step_async(actions) 100 | 101 | @abstractmethod 102 | def reset(self): 103 | pass 104 | 105 | @abstractmethod 106 | def step_wait(self): 107 | pass 108 | 109 | def close(self): 110 | return self.venv.close() 111 | 112 | def render(self): 113 | self.venv.render() 114 | 115 | class CloudpickleWrapper(object): 116 | """ 117 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 118 | """ 119 | def __init__(self, x): 120 | self.x = x 121 | def __getstate__(self): 122 | import cloudpickle 123 | return cloudpickle.dumps(self.x) 124 | def __setstate__(self, ob): 125 | import pickle 126 | self.x = pickle.loads(ob) 127 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/vec_env/subproc_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from multiprocessing import Process, Pipe 3 | from baselines.common.vec_env import VecEnv, CloudpickleWrapper 4 | from baselines.common.tile_images import tile_images 5 | 6 | 7 | def worker(remote, parent_remote, env_fn_wrapper): 8 | parent_remote.close() 9 | env = env_fn_wrapper.x() 10 | try: 11 | while True: 12 | cmd, data = remote.recv() 13 | if cmd == 'step': 14 | ob, reward, done, info = env.step(data) 15 | if done: 16 | ob = env.reset() 17 | remote.send((ob, reward, done, info)) 18 | elif cmd == 'reset': 19 | ob = env.reset() 20 | remote.send(ob) 21 | elif cmd == 'render': 22 | remote.send(env.render(mode='rgb_array')) 23 | elif cmd == 'close': 24 | remote.close() 25 | break 26 | elif cmd == 'get_spaces': 27 | remote.send((env.observation_space, env.action_space)) 28 | else: 29 | raise NotImplementedError 30 | except KeyboardInterrupt: 31 | print('SubprocVecEnv worker: got KeyboardInterrupt') 32 | finally: 33 | env.close() 34 | 35 | class SubprocVecEnv(VecEnv): 36 | def __init__(self, env_fns, spaces=None): 37 | """ 38 | envs: list of gym environments to run in subprocesses 39 | """ 40 | 
self.waiting = False 41 | self.closed = False 42 | nenvs = len(env_fns) 43 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(nenvs)]) 44 | self.ps = [Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 45 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 46 | for p in self.ps: 47 | p.daemon = True # if the main process crashes, we should not cause things to hang 48 | p.start() 49 | for remote in self.work_remotes: 50 | remote.close() 51 | 52 | self.remotes[0].send(('get_spaces', None)) 53 | observation_space, action_space = self.remotes[0].recv() 54 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 55 | 56 | def step_async(self, actions): 57 | for remote, action in zip(self.remotes, actions): 58 | remote.send(('step', action)) 59 | self.waiting = True 60 | 61 | def step_wait(self): 62 | results = [remote.recv() for remote in self.remotes] 63 | self.waiting = False 64 | obs, rews, dones, infos = zip(*results) 65 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 66 | 67 | def reset(self): 68 | for remote in self.remotes: 69 | remote.send(('reset', None)) 70 | return np.stack([remote.recv() for remote in self.remotes]) 71 | 72 | def reset_task(self): 73 | for remote in self.remotes: 74 | remote.send(('reset_task', None)) 75 | return np.stack([remote.recv() for remote in self.remotes]) 76 | 77 | def close(self): 78 | if self.closed: 79 | return 80 | if self.waiting: 81 | for remote in self.remotes: 82 | remote.recv() 83 | for remote in self.remotes: 84 | remote.send(('close', None)) 85 | for p in self.ps: 86 | p.join() 87 | self.closed = True 88 | 89 | def render(self, mode='human'): 90 | for pipe in self.remotes: 91 | pipe.send(('render', None)) 92 | imgs = [pipe.recv() for pipe in self.remotes] 93 | bigimg = tile_images(imgs) 94 | if mode == 'human': 95 | import cv2 96 | cv2.imshow('vecenv', bigimg[:,:,::-1]) 97 | cv2.waitKey(1) 98 | elif mode == 'rgb_array': 99 | return bigimg 100 | else: 101 | raise NotImplementedError -------------------------------------------------------------------------------- /gym-control/scripts/train-cem.sh: -------------------------------------------------------------------------------- 1 | for log_dir in logs_01 logs_02 logs_03 2 | do 3 | (export CUDA_VISIBLE_DEVICES=0 && python cem_cartpole.py --log_dir $log_dir)& 4 | 5 | (export CUDA_VISIBLE_DEVICES=0 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy)& 6 | (export CUDA_VISIBLE_DEVICES=1 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy)& 7 | (export CUDA_VISIBLE_DEVICES=2 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy)& 8 | (export CUDA_VISIBLE_DEVICES=3 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy)& 9 | (export CUDA_VISIBLE_DEVICES=4 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy)& 10 | (export CUDA_VISIBLE_DEVICES=5 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy)& 11 | (export CUDA_VISIBLE_DEVICES=6 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy)& 12 | (export CUDA_VISIBLE_DEVICES=7 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy)& 13 | 14 | (export CUDA_VISIBLE_DEVICES=0 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy --smooth True)& 15 | (export CUDA_VISIBLE_DEVICES=1 && python cem_cartpole.py --log_dir 
$log_dir --error_positive 0.2 --reward noisy --smooth True)& 16 | (export CUDA_VISIBLE_DEVICES=2 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy --smooth True)& 17 | (export CUDA_VISIBLE_DEVICES=3 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy --smooth True)& 18 | (export CUDA_VISIBLE_DEVICES=4 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy --smooth True)& 19 | (export CUDA_VISIBLE_DEVICES=5 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy --smooth True)& 20 | (export CUDA_VISIBLE_DEVICES=6 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy --smooth True)& 21 | (export CUDA_VISIBLE_DEVICES=7 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy --smooth True)& 22 | 23 | (export CUDA_VISIBLE_DEVICES=0 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate)& 24 | (export CUDA_VISIBLE_DEVICES=1 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate)& 25 | (export CUDA_VISIBLE_DEVICES=2 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate)& 26 | (export CUDA_VISIBLE_DEVICES=3 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate)& 27 | (export CUDA_VISIBLE_DEVICES=4 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate)& 28 | (export CUDA_VISIBLE_DEVICES=5 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate)& 29 | (export CUDA_VISIBLE_DEVICES=6 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate)& 30 | (export CUDA_VISIBLE_DEVICES=7 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate)& 31 | 32 | (export CUDA_VISIBLE_DEVICES=0 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate --smooth True)& 33 | (export CUDA_VISIBLE_DEVICES=1 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate --smooth True)& 34 | (export CUDA_VISIBLE_DEVICES=2 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate --smooth True)& 35 | (export CUDA_VISIBLE_DEVICES=3 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate --smooth True)& 36 | (export CUDA_VISIBLE_DEVICES=4 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate --smooth True)& 37 | (export CUDA_VISIBLE_DEVICES=5 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate --smooth True)& 38 | (export CUDA_VISIBLE_DEVICES=6 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate --smooth True)& 39 | (export CUDA_VISIBLE_DEVICES=7 && python cem_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate --smooth True)& 40 | done 41 | -------------------------------------------------------------------------------- /gym-control/scripts/train-dqn.sh: -------------------------------------------------------------------------------- 1 | for log_dir in logs_01 logs_02 logs_03 2 | do 3 | (export CUDA_VISIBLE_DEVICES=0 && python dqn_cartpole.py --log_dir $log_dir)& 4 | 5 | (export CUDA_VISIBLE_DEVICES=0 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy)& 6 | (export CUDA_VISIBLE_DEVICES=1 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy)& 7 | (export 
CUDA_VISIBLE_DEVICES=2 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy)& 8 | (export CUDA_VISIBLE_DEVICES=3 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy)& 9 | (export CUDA_VISIBLE_DEVICES=4 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy)& 10 | (export CUDA_VISIBLE_DEVICES=5 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy)& 11 | (export CUDA_VISIBLE_DEVICES=6 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy)& 12 | (export CUDA_VISIBLE_DEVICES=7 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy)& 13 | 14 | (export CUDA_VISIBLE_DEVICES=0 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy --smooth True)& 15 | (export CUDA_VISIBLE_DEVICES=1 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy --smooth True)& 16 | (export CUDA_VISIBLE_DEVICES=2 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy --smooth True)& 17 | (export CUDA_VISIBLE_DEVICES=3 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy --smooth True)& 18 | (export CUDA_VISIBLE_DEVICES=4 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy --smooth True)& 19 | (export CUDA_VISIBLE_DEVICES=5 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy --smooth True)& 20 | (export CUDA_VISIBLE_DEVICES=6 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy --smooth True)& 21 | (export CUDA_VISIBLE_DEVICES=7 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy --smooth True)& 22 | 23 | (export CUDA_VISIBLE_DEVICES=0 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate)& 24 | (export CUDA_VISIBLE_DEVICES=1 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate)& 25 | (export CUDA_VISIBLE_DEVICES=2 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate)& 26 | (export CUDA_VISIBLE_DEVICES=3 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate)& 27 | (export CUDA_VISIBLE_DEVICES=4 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate)& 28 | (export CUDA_VISIBLE_DEVICES=5 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate)& 29 | (export CUDA_VISIBLE_DEVICES=6 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate)& 30 | (export CUDA_VISIBLE_DEVICES=7 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate)& 31 | 32 | (export CUDA_VISIBLE_DEVICES=0 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate --smooth True)& 33 | (export CUDA_VISIBLE_DEVICES=1 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate --smooth True)& 34 | (export CUDA_VISIBLE_DEVICES=2 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate --smooth True)& 35 | (export CUDA_VISIBLE_DEVICES=3 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate --smooth True)& 36 | (export CUDA_VISIBLE_DEVICES=4 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate --smooth True)& 37 | (export CUDA_VISIBLE_DEVICES=5 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 
--reward surrogate --smooth True)& 38 | (export CUDA_VISIBLE_DEVICES=6 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate --smooth True)& 39 | (export CUDA_VISIBLE_DEVICES=7 && python dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate --smooth True)& 40 | done 41 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/mpi_running_mean_std.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import tensorflow as tf, baselines.common.tf_util as U, numpy as np 3 | 4 | class RunningMeanStd(object): 5 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 6 | def __init__(self, epsilon=1e-2, shape=()): 7 | 8 | self._sum = tf.get_variable( 9 | dtype=tf.float64, 10 | shape=shape, 11 | initializer=tf.constant_initializer(0.0), 12 | name="runningsum", trainable=False) 13 | self._sumsq = tf.get_variable( 14 | dtype=tf.float64, 15 | shape=shape, 16 | initializer=tf.constant_initializer(epsilon), 17 | name="runningsumsq", trainable=False) 18 | self._count = tf.get_variable( 19 | dtype=tf.float64, 20 | shape=(), 21 | initializer=tf.constant_initializer(epsilon), 22 | name="count", trainable=False) 23 | self.shape = shape 24 | 25 | self.mean = tf.to_float(self._sum / self._count) 26 | self.std = tf.sqrt( tf.maximum( tf.to_float(self._sumsq / self._count) - tf.square(self.mean) , 1e-2 )) 27 | 28 | newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') 29 | newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') 30 | newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') 31 | self.incfiltparams = U.function([newsum, newsumsq, newcount], [], 32 | updates=[tf.assign_add(self._sum, newsum), 33 | tf.assign_add(self._sumsq, newsumsq), 34 | tf.assign_add(self._count, newcount)]) 35 | 36 | 37 | def update(self, x): 38 | x = x.astype('float64') 39 | n = int(np.prod(self.shape)) 40 | totalvec = np.zeros(n*2+1, 'float64') 41 | addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')]) 42 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 43 | self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n]) 44 | 45 | @U.in_session 46 | def test_runningmeanstd(): 47 | for (x1, x2, x3) in [ 48 | (np.random.randn(3), np.random.randn(4), np.random.randn(5)), 49 | (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), 50 | ]: 51 | 52 | rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) 53 | U.initialize() 54 | 55 | x = np.concatenate([x1, x2, x3], axis=0) 56 | ms1 = [x.mean(axis=0), x.std(axis=0)] 57 | rms.update(x1) 58 | rms.update(x2) 59 | rms.update(x3) 60 | ms2 = [rms.mean.eval(), rms.std.eval()] 61 | 62 | assert np.allclose(ms1, ms2) 63 | 64 | @U.in_session 65 | def test_dist(): 66 | np.random.seed(0) 67 | p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) 68 | q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) 69 | 70 | # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) 71 | # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) 72 | 73 | comm = MPI.COMM_WORLD 74 | assert comm.Get_size()==2 75 | if comm.Get_rank()==0: 76 | x1,x2,x3 = p1,p2,p3 77 | elif comm.Get_rank()==1: 78 | x1,x2,x3 = q1,q2,q3 79 | else: 80 | assert False 81 | 82 | rms = 
RunningMeanStd(epsilon=0.0, shape=(1,)) 83 | U.initialize() 84 | 85 | rms.update(x1) 86 | rms.update(x2) 87 | rms.update(x3) 88 | 89 | bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) 90 | 91 | def checkallclose(x,y): 92 | print(x,y) 93 | return np.allclose(x,y) 94 | 95 | assert checkallclose( 96 | bigvec.mean(axis=0), 97 | rms.mean.eval(), 98 | ) 99 | assert checkallclose( 100 | bigvec.std(axis=0), 101 | rms.std.eval(), 102 | ) 103 | 104 | 105 | if __name__ == "__main__": 106 | # Run with mpirun -np 2 python 107 | test_dist() 108 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon greedy exploration strategy 5 | - beta parameter for beta parameter in prioritized replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meanining that schedule should output 44 | `value` when `t==time`. All the values for time must be sorted in 45 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes value to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals sepecified in 55 | `endpoints` this value is returned. If None then AssertionError is 56 | raised when outside value is requested. 57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t and t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 
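        # i.e. t lies outside every endpoint interval, so fall back to the configured outside_value.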
72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps pass final_p is 80 | returned. 81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /gym-control/scripts/train-sarsa.sh: -------------------------------------------------------------------------------- 1 | for log_dir in logs_01 logs_02 logs_03 2 | do 3 | (export CUDA_VISIBLE_DEVICES=0 && python sarsa_cartpole.py --log_dir $log_dir)& 4 | 5 | (export CUDA_VISIBLE_DEVICES=0 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy)& 6 | (export CUDA_VISIBLE_DEVICES=1 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy)& 7 | (export CUDA_VISIBLE_DEVICES=2 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy)& 8 | (export CUDA_VISIBLE_DEVICES=3 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy)& 9 | (export CUDA_VISIBLE_DEVICES=4 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy)& 10 | (export CUDA_VISIBLE_DEVICES=5 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy)& 11 | (export CUDA_VISIBLE_DEVICES=6 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy)& 12 | (export CUDA_VISIBLE_DEVICES=7 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy)& 13 | 14 | (export CUDA_VISIBLE_DEVICES=0 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy --smooth True)& 15 | (export CUDA_VISIBLE_DEVICES=1 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy --smooth True)& 16 | (export CUDA_VISIBLE_DEVICES=2 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy --smooth True)& 17 | (export CUDA_VISIBLE_DEVICES=3 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy --smooth True)& 18 | (export CUDA_VISIBLE_DEVICES=4 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy --smooth True)& 19 | (export CUDA_VISIBLE_DEVICES=5 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy --smooth True)& 20 | (export CUDA_VISIBLE_DEVICES=6 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy --smooth True)& 21 | (export CUDA_VISIBLE_DEVICES=7 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy --smooth True)& 22 | 23 | (export CUDA_VISIBLE_DEVICES=0 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate)& 24 | (export CUDA_VISIBLE_DEVICES=1 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate)& 25 | (export 
CUDA_VISIBLE_DEVICES=2 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate)& 26 | (export CUDA_VISIBLE_DEVICES=3 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate)& 27 | (export CUDA_VISIBLE_DEVICES=4 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate)& 28 | (export CUDA_VISIBLE_DEVICES=5 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate)& 29 | (export CUDA_VISIBLE_DEVICES=6 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate)& 30 | (export CUDA_VISIBLE_DEVICES=7 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate)& 31 | 32 | (export CUDA_VISIBLE_DEVICES=0 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate --smooth True)& 33 | (export CUDA_VISIBLE_DEVICES=1 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate --smooth True)& 34 | (export CUDA_VISIBLE_DEVICES=2 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate --smooth True)& 35 | (export CUDA_VISIBLE_DEVICES=3 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate --smooth True)& 36 | (export CUDA_VISIBLE_DEVICES=4 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate --smooth True)& 37 | (export CUDA_VISIBLE_DEVICES=5 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate --smooth True)& 38 | (export CUDA_VISIBLE_DEVICES=6 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate --smooth True)& 39 | (export CUDA_VISIBLE_DEVICES=7 && python sarsa_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate --smooth True)& 40 | done 41 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/results_single.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import os 3 | import glob 4 | import numpy as np 5 | import matplotlib 6 | import matplotlib.pyplot as plt 7 | import seaborn as sns 8 | sns.set() 9 | sns.set_color_codes() 10 | 11 | from baselines.bench.monitor import load_results 12 | 13 | matplotlib.rcParams.update({'font.size': 30}) 14 | 15 | X_TIMESTEPS = 'timesteps' 16 | X_EPISODES = 'episodes' 17 | X_WALLTIME = 'walltime_hrs' 18 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 19 | EPISODES_WINDOW = 100 20 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 21 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 22 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] 23 | 24 | def rolling_window(a, window): 25 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 26 | strides = a.strides + (a.strides[-1],) 27 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 28 | 29 | def window_func(x, y, window, func): 30 | yw = rolling_window(y, window) 31 | yw_func = func(yw, axis=-1) 32 | return x[window-1:], yw_func 33 | 34 | def ts2xy(ts, xaxis): 35 | if xaxis == X_TIMESTEPS: 36 | x = np.cumsum(ts.l.values) 37 | y = ts.r.values 38 | elif xaxis == X_EPISODES: 39 | x = np.arange(len(ts)) 40 | y = ts.r.values 41 | elif xaxis == X_WALLTIME: 42 | x = ts.t.values / 3600. 
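        # the monitor's 't' column holds elapsed seconds, so dividing by 3600 plots wall-clock hours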
43 | y = ts.r.values 44 | else: 45 | raise NotImplementedError 46 | return x, y 47 | 48 | 49 | def plot_results_single(ax, input_dir, num_timesteps, xaxis): 50 | ts = load_results(input_dir) 51 | ts = ts[ts.l.cumsum() <= num_timesteps] 52 | xy_list = ts2xy(ts, xaxis) 53 | 54 | x = xy_list[0] 55 | y = xy_list[1] 56 | ax.plot(x, y, alpha=0.4, linewidth=0.8, c=sns.color_palette()[0]) 57 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes 58 | print ("avg_100: %.1f" % np.mean(y_mean[-100:])) 59 | ax.plot(x, y_mean, linewidth=0.8, c=sns.color_palette()[0], label='normal') 60 | 61 | # plt.set_title(title) 62 | # ax.set_ylabel("Episode Rewards") 63 | # ax.legend() 64 | # plt.tight_layout() 65 | 66 | 67 | def main(): 68 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 69 | parser.add_argument('--log_dir', help='Path of log directory', default='logs') 70 | parser.add_argument('--num_timesteps', type=int, default=int(5e7)) 71 | parser.add_argument('--xaxis', help='Varible on X-axis', default = X_TIMESTEPS) 72 | parser.add_argument('--task_name', help='Name of atari game', default='Pong') 73 | parser.add_argument('--save_dir', help = 'Directory of output plots', default='../results') 74 | parser.add_argument('--noise_type', type=str, help='noise type (norm_one/norm_all/anti_iden)', 75 | default='anti_iden') 76 | parser.add_argument('--plot_normal', type=str, help='whether to plot baseline with normal rewards') 77 | args = parser.parse_args() 78 | 79 | args.save_dir = os.path.join(args.save_dir, "paper") 80 | if not os.path.exists(args.save_dir): 81 | os.makedirs(args.save_dir) 82 | 83 | dirs = glob.glob(os.path.join(args.log_dir, "openai*")) 84 | dirs = sorted(dirs) 85 | 86 | for input_dir in dirs: 87 | 88 | with open(os.path.join(input_dir, "setting.txt"), "r") as f: 89 | line = f.readlines()[-1].rstrip() 90 | # normal = line.split()[1][0:-1].split(',')[0] 91 | weight = float(line.split()[3][0:-1].split(',')[0]) 92 | surrogate = line.split()[5][0:-1].split(',')[0] 93 | # noise_type = line.split()[7][0:-1].split(')')[0] 94 | if weight in [0.1, 0.3, 0.7, 0.9] and surrogate == 'True': 95 | print ("-" * 20) 96 | print (line) 97 | plot_results_single(plt, input_dir, args.num_timesteps, args.xaxis) 98 | print ("-" * 20) 99 | 100 | if __name__ == '__main__': 101 | main() 102 | -------------------------------------------------------------------------------- /gym-control/scripts/train-duel-dqn.sh: -------------------------------------------------------------------------------- 1 | for log_dir in logs_01 logs_02 logs_03 2 | do 3 | (export CUDA_VISIBLE_DEVICES=0 && python duel_dqn_cartpole.py --log_dir $log_dir)& 4 | 5 | (export CUDA_VISIBLE_DEVICES=0 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy)& 6 | (export CUDA_VISIBLE_DEVICES=1 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy)& 7 | (export CUDA_VISIBLE_DEVICES=2 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy)& 8 | (export CUDA_VISIBLE_DEVICES=3 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy)& 9 | (export CUDA_VISIBLE_DEVICES=4 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy)& 10 | (export CUDA_VISIBLE_DEVICES=5 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy)& 11 | (export CUDA_VISIBLE_DEVICES=6 && python duel_dqn_cartpole.py 
--log_dir $log_dir --error_positive 0.8 --reward noisy)& 12 | (export CUDA_VISIBLE_DEVICES=7 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy)& 13 | 14 | (export CUDA_VISIBLE_DEVICES=0 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward noisy --smooth True)& 15 | (export CUDA_VISIBLE_DEVICES=1 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward noisy --smooth True)& 16 | (export CUDA_VISIBLE_DEVICES=2 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward noisy --smooth True)& 17 | (export CUDA_VISIBLE_DEVICES=3 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward noisy --smooth True)& 18 | (export CUDA_VISIBLE_DEVICES=4 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward noisy --smooth True)& 19 | (export CUDA_VISIBLE_DEVICES=5 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward noisy --smooth True)& 20 | (export CUDA_VISIBLE_DEVICES=6 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward noisy --smooth True)& 21 | (export CUDA_VISIBLE_DEVICES=7 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward noisy --smooth True)& 22 | 23 | (export CUDA_VISIBLE_DEVICES=0 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate)& 24 | (export CUDA_VISIBLE_DEVICES=1 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate)& 25 | (export CUDA_VISIBLE_DEVICES=2 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate)& 26 | (export CUDA_VISIBLE_DEVICES=3 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate)& 27 | (export CUDA_VISIBLE_DEVICES=4 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate)& 28 | (export CUDA_VISIBLE_DEVICES=5 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate)& 29 | (export CUDA_VISIBLE_DEVICES=6 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate)& 30 | (export CUDA_VISIBLE_DEVICES=7 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate)& 31 | 32 | (export CUDA_VISIBLE_DEVICES=0 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.1 --reward surrogate --smooth True)& 33 | (export CUDA_VISIBLE_DEVICES=1 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.2 --reward surrogate --smooth True)& 34 | (export CUDA_VISIBLE_DEVICES=2 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.3 --reward surrogate --smooth True)& 35 | (export CUDA_VISIBLE_DEVICES=3 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.4 --reward surrogate --smooth True)& 36 | (export CUDA_VISIBLE_DEVICES=4 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.6 --reward surrogate --smooth True)& 37 | (export CUDA_VISIBLE_DEVICES=5 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.7 --reward surrogate --smooth True)& 38 | (export CUDA_VISIBLE_DEVICES=6 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.8 --reward surrogate --smooth True)& 39 | (export CUDA_VISIBLE_DEVICES=7 && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive 0.9 --reward surrogate --smooth True)& 40 | done 41 | 
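# Note: each per-GPU block above enumerates the error rates 0.1-0.4 and 0.6-0.9 by hand.
# An equivalent loop (a sketch only, reusing the same duel_dqn_cartpole.py flags as above) would be:
#   gpu=0
#   for err in 0.1 0.2 0.3 0.4 0.6 0.7 0.8 0.9; do
#     (export CUDA_VISIBLE_DEVICES=$gpu && python duel_dqn_cartpole.py --log_dir $log_dir --error_positive $err --reward surrogate --smooth True)&
#     gpu=$(( (gpu + 1) % 8 ))
#   done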
-------------------------------------------------------------------------------- /gym-atari/baselines/baselines/results_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode 4 | 5 | import matplotlib.pyplot as plt 6 | plt.rcParams['svg.fonttype'] = 'none' 7 | 8 | from baselines.bench.monitor import load_results 9 | 10 | X_TIMESTEPS = 'timesteps' 11 | X_EPISODES = 'episodes' 12 | X_WALLTIME = 'walltime_hrs' 13 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 14 | EPISODES_WINDOW = 100 15 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 16 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 17 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] 18 | 19 | def rolling_window(a, window): 20 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 21 | strides = a.strides + (a.strides[-1],) 22 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 23 | 24 | def window_func(x, y, window, func): 25 | yw = rolling_window(y, window) 26 | yw_func = func(yw, axis=-1) 27 | return x[window-1:], yw_func 28 | 29 | def ts2xy(ts, xaxis): 30 | if xaxis == X_TIMESTEPS: 31 | x = np.cumsum(ts.l.values) 32 | y = ts.r.values 33 | elif xaxis == X_EPISODES: 34 | x = np.arange(len(ts)) 35 | y = ts.r.values 36 | elif xaxis == X_WALLTIME: 37 | x = ts.t.values / 3600. 38 | y = ts.r.values 39 | else: 40 | raise NotImplementedError 41 | return x, y 42 | 43 | def plot_curves(xy_list, xaxis, title): 44 | plt.figure(figsize=(8,2)) 45 | maxx = max(xy[0][-1] for xy in xy_list) 46 | minx = 0 47 | for (i, (x, y)) in enumerate(xy_list): 48 | color = COLORS[i] 49 | plt.scatter(x, y, s=2) 50 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes 51 | plt.plot(x, y_mean, color=color) 52 | plt.xlim(minx, maxx) 53 | plt.title(title) 54 | plt.xlabel(xaxis) 55 | plt.ylabel("Episode Rewards") 56 | plt.tight_layout() 57 | 58 | def plot_results(dirs, num_timesteps, xaxis, task_name): 59 | tslist = [] 60 | for dir in dirs: 61 | ts = load_results(dir) 62 | ts = ts[ts.l.cumsum() <= num_timesteps] 63 | tslist.append(ts) 64 | xy_list = [ts2xy(ts, xaxis) for ts in tslist] 65 | plot_curves(xy_list, xaxis, task_name) 66 | 67 | # Example usage in jupyter-notebook 68 | # from baselines import log_viewer 69 | # %matplotlib inline 70 | # log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout") 71 | # Here ./log is a directory containing the monitor.csv files 72 | 73 | def main(): 74 | import argparse 75 | import os 76 | import glob 77 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 78 | parser.add_argument('--log_dir', help='Path of log directory', default='logs') 79 | parser.add_argument('--num_timesteps', type=int, default=int(10e6)) 80 | parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS) 81 | parser.add_argument('--task_name', help = 'Title of plot', default = 'PongNoFrameskip-v4') 82 | parser.add_argument('--weight', help = 'Weight of noise', default = 0.2, type=float) 83 | parser.add_argument('--save_dir', help = 'Didrectory of output plots', default = 'results') 84 | args = parser.parse_args() 85 | 86 | if not os.path.exists(args.save_dir): 87 | os.makedirs(args.save_dir) 88 | 89 | dirs = 
glob.glob(os.path.join(args.log_dir, "*")) 90 | dirs = sorted(dirs) 91 | cnt = 0 92 | for directory in dirs: 93 | print (directory) 94 | with open(os.path.join(directory, "setting.txt"), "r") as f: 95 | line = f.readlines()[-1].rstrip() 96 | print (line.split()) 97 | normal = line.split()[1][0:-1].split(',')[0] 98 | weight = float(line.split()[3][0:-1].split(',')[0]) 99 | surrogate = line.split()[5][0:-1].split(',')[0] 100 | noise_type = line.split()[7][0:-1].split(')')[0] 101 | print (normal, weight, surrogate, noise_type) 102 | if normal == 'True': 103 | title = args.task_name + " (normal)" 104 | elif surrogate == 'False': 105 | title = args.task_name + " (noisy-" + str(weight) + "-" + noise_type + ")" 106 | else: 107 | title = args.task_name + " (surrogate-" + str(weight) + "-" + noise_type + ")" 108 | 109 | print (weight, args.weight) 110 | if weight == args.weight: 111 | print (args.weight) 112 | plot_results([directory], args.num_timesteps, args.xaxis, title) 113 | plt.savefig(os.path.join(args.save_dir, title + ".png")) 114 | cnt += 1 115 | print (cnt) 116 | 117 | 118 | if __name__ == '__main__': 119 | main() 120 | -------------------------------------------------------------------------------- /gym-control/rl/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from keras.models import model_from_config, Sequential, Model 4 | import keras.optimizers as optimizers 5 | import keras.backend as K 6 | 7 | 8 | def clone_model(model, custom_objects={}): 9 | # Requires Keras 1.0.7 since get_config has breaking changes. 10 | config = { 11 | 'class_name': model.__class__.__name__, 12 | 'config': model.get_config(), 13 | } 14 | clone = model_from_config(config, custom_objects=custom_objects) 15 | clone.set_weights(model.get_weights()) 16 | return clone 17 | 18 | 19 | def clone_optimizer(optimizer): 20 | if type(optimizer) is str: 21 | return optimizers.get(optimizer) 22 | # Requires Keras 1.0.7 since get_config has breaking changes. 23 | params = dict([(k, v) for k, v in optimizer.get_config().items()]) 24 | config = { 25 | 'class_name': optimizer.__class__.__name__, 26 | 'config': params, 27 | } 28 | if hasattr(optimizers, 'optimizer_from_config'): 29 | # COMPATIBILITY: Keras < 2.0 30 | clone = optimizers.optimizer_from_config(config) 31 | else: 32 | clone = optimizers.deserialize(config) 33 | return clone 34 | 35 | 36 | def get_soft_target_model_updates(target, source, tau): 37 | target_weights = target.trainable_weights + sum([l.non_trainable_weights for l in target.layers], []) 38 | source_weights = source.trainable_weights + sum([l.non_trainable_weights for l in source.layers], []) 39 | assert len(target_weights) == len(source_weights) 40 | 41 | # Create updates. 42 | updates = [] 43 | for tw, sw in zip(target_weights, source_weights): 44 | updates.append((tw, tau * sw + (1. - tau) * tw)) 45 | return updates 46 | 47 | 48 | def get_object_config(o): 49 | if o is None: 50 | return None 51 | 52 | config = { 53 | 'class_name': o.__class__.__name__, 54 | 'config': o.get_config() 55 | } 56 | return config 57 | 58 | 59 | def huber_loss(y_true, y_pred, clip_value): 60 | # Huber loss, see https://en.wikipedia.org/wiki/Huber_loss and 61 | # https://medium.com/@karpathy/yes-you-should-understand-backprop-e2f06eab496b 62 | # for details. 63 | assert clip_value > 0.
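    # The Huber loss is quadratic for small errors and linear for large ones:
    #   0.5 * x**2                              if |x| <= clip_value
    #   clip_value * (|x| - 0.5 * clip_value)   otherwise
    # which is what the squared_loss / linear_loss branches below compute.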
64 | 65 | x = y_true - y_pred 66 | if np.isinf(clip_value): 67 | # Special case for infinity since Tensorflow does have problems 68 | # if we compare `K.abs(x) < np.inf`. 69 | return .5 * K.square(x) 70 | 71 | condition = K.abs(x) < clip_value 72 | squared_loss = .5 * K.square(x) 73 | linear_loss = clip_value * (K.abs(x) - .5 * clip_value) 74 | if K.backend() == 'tensorflow': 75 | import tensorflow as tf 76 | if hasattr(tf, 'select'): 77 | return tf.select(condition, squared_loss, linear_loss) # condition, true, false 78 | else: 79 | return tf.where(condition, squared_loss, linear_loss) # condition, true, false 80 | elif K.backend() == 'theano': 81 | from theano import tensor as T 82 | return T.switch(condition, squared_loss, linear_loss) 83 | else: 84 | raise RuntimeError('Unknown backend "{}".'.format(K.backend())) 85 | 86 | 87 | class AdditionalUpdatesOptimizer(optimizers.Optimizer): 88 | def __init__(self, optimizer, additional_updates): 89 | super(AdditionalUpdatesOptimizer, self).__init__() 90 | self.optimizer = optimizer 91 | self.additional_updates = additional_updates 92 | 93 | def get_updates(self, params, loss): 94 | updates = self.optimizer.get_updates(params=params, loss=loss) 95 | updates += self.additional_updates 96 | self.updates = updates 97 | return self.updates 98 | 99 | def get_config(self): 100 | return self.optimizer.get_config() 101 | 102 | 103 | # Based on https://github.com/openai/baselines/blob/master/baselines/common/mpi_running_mean_std.py 104 | class WhiteningNormalizer(object): 105 | def __init__(self, shape, eps=1e-2, dtype=np.float64): 106 | self.eps = eps 107 | self.shape = shape 108 | self.dtype = dtype 109 | 110 | self._sum = np.zeros(shape, dtype=dtype) 111 | self._sumsq = np.zeros(shape, dtype=dtype) 112 | self._count = 0 113 | 114 | self.mean = np.zeros(shape, dtype=dtype) 115 | self.std = np.ones(shape, dtype=dtype) 116 | 117 | def normalize(self, x): 118 | return (x - self.mean) / self.std 119 | 120 | def denormalize(self, x): 121 | return self.std * x + self.mean 122 | 123 | def update(self, x): 124 | if x.ndim == len(self.shape): 125 | x = x.reshape(-1, *self.shape) 126 | assert x.shape[1:] == self.shape 127 | 128 | self._count += x.shape[0] 129 | self._sum += np.sum(x, axis=0) 130 | self._sumsq += np.sum(np.square(x), axis=0) 131 | 132 | self.mean = self._sum / float(self._count) 133 | self.std = np.sqrt(np.maximum(np.square(self.eps), self._sumsq / float(self._count) - np.square(self.mean))) 134 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/segment_tree.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | 4 | class SegmentTree(object): 5 | def __init__(self, capacity, operation, neutral_element): 6 | """Build a Segment Tree data structure. 7 | 8 | https://en.wikipedia.org/wiki/Segment_tree 9 | 10 | Can be used as a regular array, but with two 11 | important differences: 12 | 13 | a) setting item's value is slightly slower. 14 | It is O(lg capacity) instead of O(1). 15 | b) user has access to an efficient ( O(log segment size) ) 16 | `reduce` operation which reduces `operation` over 17 | a contiguous subsequence of items in the array. 18 | 19 | Parameters 20 | ---------- 21 | capacity: int 22 | Total size of the array - must be a power of two. 23 | operation: lambda obj, obj -> obj 24 | an operation for combining elements (eg. 
sum, max) 25 | must form a mathematical group together with the set of 26 | possible values for array elements (i.e. be associative) 27 | neutral_element: obj 28 | neutral element for the operation above. eg. float('-inf') 29 | for max and 0 for sum. 30 | """ 31 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 32 | self._capacity = capacity 33 | self._value = [neutral_element for _ in range(2 * capacity)] 34 | self._operation = operation 35 | 36 | def _reduce_helper(self, start, end, node, node_start, node_end): 37 | if start == node_start and end == node_end: 38 | return self._value[node] 39 | mid = (node_start + node_end) // 2 40 | if end <= mid: 41 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 42 | else: 43 | if mid + 1 <= start: 44 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 45 | else: 46 | return self._operation( 47 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 48 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 49 | ) 50 | 51 | def reduce(self, start=0, end=None): 52 | """Returns result of applying `self.operation` 53 | to a contiguous subsequence of the array. 54 | 55 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 56 | 57 | Parameters 58 | ---------- 59 | start: int 60 | beginning of the subsequence 61 | end: int 62 | end of the subsequence 63 | 64 | Returns 65 | ------- 66 | reduced: obj 67 | result of reducing self.operation over the specified range of array elements. 68 | """ 69 | if end is None: 70 | end = self._capacity 71 | if end < 0: 72 | end += self._capacity 73 | end -= 1 74 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 75 | 76 | def __setitem__(self, idx, val): 77 | # index of the leaf 78 | idx += self._capacity 79 | self._value[idx] = val 80 | idx //= 2 81 | while idx >= 1: 82 | self._value[idx] = self._operation( 83 | self._value[2 * idx], 84 | self._value[2 * idx + 1] 85 | ) 86 | idx //= 2 87 | 88 | def __getitem__(self, idx): 89 | assert 0 <= idx < self._capacity 90 | return self._value[self._capacity + idx] 91 | 92 | 93 | class SumSegmentTree(SegmentTree): 94 | def __init__(self, capacity): 95 | super(SumSegmentTree, self).__init__( 96 | capacity=capacity, 97 | operation=operator.add, 98 | neutral_element=0.0 99 | ) 100 | 101 | def sum(self, start=0, end=None): 102 | """Returns arr[start] + ... + arr[end]""" 103 | return super(SumSegmentTree, self).reduce(start, end) 104 | 105 | def find_prefixsum_idx(self, prefixsum): 106 | """Find the highest index `i` in the array such that 107 | sum(arr[0] + arr[1] + ... + arr[i - 1]) <= prefixsum 108 | 109 | if array values are probabilities, this function 110 | allows sampling indexes according to the discrete 111 | probability efficiently. 
112 | 113 | Parameters 114 | ---------- 115 | perfixsum: float 116 | upperbound on the sum of array prefix 117 | 118 | Returns 119 | ------- 120 | idx: int 121 | highest index satisfying the prefixsum constraint 122 | """ 123 | assert 0 <= prefixsum <= self.sum() + 1e-5 124 | idx = 1 125 | while idx < self._capacity: # while non-leaf 126 | if self._value[2 * idx] > prefixsum: 127 | idx = 2 * idx 128 | else: 129 | prefixsum -= self._value[2 * idx] 130 | idx = 2 * idx + 1 131 | return idx - self._capacity 132 | 133 | 134 | class MinSegmentTree(SegmentTree): 135 | def __init__(self, capacity): 136 | super(MinSegmentTree, self).__init__( 137 | capacity=capacity, 138 | operation=min, 139 | neutral_element=float('inf') 140 | ) 141 | 142 | def min(self, start=0, end=None): 143 | """Returns min(arr[start], ..., arr[end])""" 144 | 145 | return super(MinSegmentTree, self).reduce(start, end) 146 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/cmd_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for scripts like run_atari.py. 3 | """ 4 | 5 | import os 6 | try: 7 | from mpi4py import MPI 8 | except ImportError: 9 | MPI = None 10 | 11 | import gym 12 | from gym.wrappers import FlattenDictWrapper 13 | from baselines import logger 14 | from baselines.bench import Monitor 15 | from baselines.common import set_global_seeds 16 | from baselines.common.atari_wrappers import make_atari, wrap_deepmind 17 | from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 18 | 19 | def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0): 20 | """ 21 | Create a wrapped, monitored SubprocVecEnv for Atari. 22 | """ 23 | if wrapper_kwargs is None: wrapper_kwargs = {} 24 | mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 25 | def make_env(rank): # pylint: disable=C0111 26 | def _thunk(): 27 | env = make_atari(env_id) 28 | env.seed(seed + 10000*mpi_rank + rank if seed is not None else None) 29 | env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(rank))) 30 | return wrap_deepmind(env, **wrapper_kwargs) 31 | return _thunk 32 | set_global_seeds(seed) 33 | return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) 34 | 35 | def make_mujoco_env(env_id, seed, reward_scale=1.0): 36 | """ 37 | Create a wrapped, monitored gym.Env for MuJoCo. 38 | """ 39 | rank = MPI.COMM_WORLD.Get_rank() 40 | myseed = seed + 1000 * rank if seed is not None else None 41 | set_global_seeds(myseed) 42 | env = gym.make(env_id) 43 | env = Monitor(env, os.path.join(logger.get_dir(), str(rank)), allow_early_resets=True) 44 | env.seed(seed) 45 | 46 | if reward_scale != 1.0: 47 | from baselines.common.retro_wrappers import RewardScaler 48 | env = RewardScaler(env, reward_scale) 49 | 50 | return env 51 | 52 | def make_robotics_env(env_id, seed, rank=0): 53 | """ 54 | Create a wrapped, monitored gym.Env for MuJoCo. 55 | """ 56 | set_global_seeds(seed) 57 | env = gym.make(env_id) 58 | env = FlattenDictWrapper(env, ['observation', 'desired_goal']) 59 | env = Monitor( 60 | env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 61 | info_keywords=('is_success',)) 62 | env.seed(seed) 63 | return env 64 | 65 | def arg_parser(): 66 | """ 67 | Create an empty argparse.ArgumentParser. 
68 | """ 69 | import argparse 70 | return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 71 | 72 | def atari_arg_parser(): 73 | """ 74 | Create an argparse.ArgumentParser for run_atari.py. 75 | """ 76 | print('Obsolete - use common_arg_parser instead') 77 | return common_arg_parser() 78 | 79 | def mujoco_arg_parser(): 80 | print('Obsolete - use common_arg_parser instead') 81 | return common_arg_parser() 82 | 83 | def str2bool(v): 84 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 85 | return True 86 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 87 | return False 88 | else: 89 | raise argparse.ArgumentTypeError('Boolean value expected.') 90 | 91 | def common_arg_parser(): 92 | """ 93 | Create an argparse.ArgumentParser for run_mujoco.py. 94 | """ 95 | parser = arg_parser() 96 | parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') 97 | parser.add_argument('--seed', help='RNG seed', type=int, default=2019) 98 | parser.add_argument('--alg', help='Algorithm', type=str, default='ppo2') 99 | parser.add_argument('--num_timesteps', type=float, default=1e6), 100 | parser.add_argument('--weight', help='weight of noise', type=float, default=0.1) 101 | parser.add_argument('--normal', help='no noise', type=str2bool, default=True) 102 | parser.add_argument('--surrogate', help='surrogate reward', type=str2bool, default=False) 103 | parser.add_argument('--noise_type', help='noise type (norm_one, norm_all, max_one, anti_iden)', type=str, default='norm_one') 104 | parser.add_argument('--network', help='network type (mlp, cnn, lstm, cnn_lstm, conv_only)', default=None) 105 | parser.add_argument('--gamestate', help='game state to load (so far only used in retro games)', default=None) 106 | parser.add_argument('--num_env', help='Number of environment copies being run in parallel. When not specified, set to number of cpus for Atari, and to 1 for Mujoco', default=None, type=int) 107 | parser.add_argument('--reward_scale', help='Reward scale factor. Default: 1.0', default=1.0, type=float) 108 | parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str) 109 | parser.add_argument('--play', default=False, action='store_true') 110 | return parser 111 | 112 | def robotics_arg_parser(): 113 | """ 114 | Create an argparse.ArgumentParser for run_mujoco.py. 
115 | """ 116 | parser = arg_parser() 117 | parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0') 118 | parser.add_argument('--seed', help='RNG seed', type=int, default=None) 119 | parser.add_argument('--num-timesteps', type=int, default=int(1e6)) 120 | return parser 121 | 122 | 123 | def parse_unknown_args(args): 124 | """ 125 | Parse arguments not consumed by arg parser into a dicitonary 126 | """ 127 | retval = {} 128 | for arg in args: 129 | assert arg.startswith('--') 130 | assert '=' in arg, 'cannot parse arg {}'.format(arg) 131 | key = arg.split('=')[0][2:] 132 | value = arg.split('=')[1] 133 | retval[key] = value 134 | 135 | return retval 136 | 137 | 138 | 139 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/bench/benchmarks.py: -------------------------------------------------------------------------------- 1 | import re 2 | import os.path as osp 3 | import os 4 | SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) 5 | 6 | _atari7 = ['BeamRider', 'Breakout', 'Enduro', 'Pong', 'Qbert', 'Seaquest', 'SpaceInvaders'] 7 | _atariexpl7 = ['Freeway', 'Gravitar', 'MontezumaRevenge', 'Pitfall', 'PrivateEye', 'Solaris', 'Venture'] 8 | 9 | _BENCHMARKS = [] 10 | 11 | remove_version_re = re.compile(r'-v\d+$') 12 | 13 | 14 | def register_benchmark(benchmark): 15 | for b in _BENCHMARKS: 16 | if b['name'] == benchmark['name']: 17 | raise ValueError('Benchmark with name %s already registered!' % b['name']) 18 | 19 | # automatically add a description if it is not present 20 | if 'tasks' in benchmark: 21 | for t in benchmark['tasks']: 22 | if 'desc' not in t: 23 | t['desc'] = remove_version_re.sub('', t['env_id']) 24 | _BENCHMARKS.append(benchmark) 25 | 26 | 27 | def list_benchmarks(): 28 | return [b['name'] for b in _BENCHMARKS] 29 | 30 | 31 | def get_benchmark(benchmark_name): 32 | for b in _BENCHMARKS: 33 | if b['name'] == benchmark_name: 34 | return b 35 | raise ValueError('%s not found! Known benchmarks: %s' % (benchmark_name, list_benchmarks())) 36 | 37 | 38 | def get_task(benchmark, env_id): 39 | """Get a task by env_id. Return None if the benchmark doesn't have the env""" 40 | return next(filter(lambda task: task['env_id'] == env_id, benchmark['tasks']), None) 41 | 42 | 43 | def find_task_for_env_id_in_any_benchmark(env_id): 44 | for bm in _BENCHMARKS: 45 | for task in bm["tasks"]: 46 | if task["env_id"] == env_id: 47 | return bm, task 48 | return None, None 49 | 50 | 51 | _ATARI_SUFFIX = 'NoFrameskip-v4' 52 | 53 | register_benchmark({ 54 | 'name': 'Atari50M', 55 | 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 50M timesteps', 56 | 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(50e6)} for _game in _atari7] 57 | }) 58 | 59 | register_benchmark({ 60 | 'name': 'Atari10M', 61 | 'description': '7 Atari games from Mnih et al. (2013), with pixel observations, 10M timesteps', 62 | 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 6, 'num_timesteps': int(10e6)} for _game in _atari7] 63 | }) 64 | 65 | register_benchmark({ 66 | 'name': 'Atari1Hr', 67 | 'description': '7 Atari games from Mnih et al. 
(2013), with pixel observations, 1 hour of walltime', 68 | 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_seconds': 60 * 60} for _game in _atari7] 69 | }) 70 | 71 | register_benchmark({ 72 | 'name': 'AtariExploration10M', 73 | 'description': '7 Atari games emphasizing exploration, with pixel observations, 10M timesteps', 74 | 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atariexpl7] 75 | }) 76 | 77 | 78 | # MuJoCo 79 | 80 | _mujocosmall = [ 81 | 'InvertedDoublePendulum-v2', 'InvertedPendulum-v2', 82 | 'HalfCheetah-v2', 'Hopper-v2', 'Walker2d-v2', 83 | 'Reacher-v2', 'Swimmer-v2'] 84 | register_benchmark({ 85 | 'name': 'Mujoco1M', 86 | 'description': 'Some small 2D MuJoCo tasks, run for 1M timesteps', 87 | 'tasks': [{'env_id': _envid, 'trials': 6, 'num_timesteps': int(1e6)} for _envid in _mujocosmall] 88 | }) 89 | 90 | register_benchmark({ 91 | 'name': 'MujocoWalkers', 92 | 'description': 'MuJoCo forward walkers, run for 8M, humanoid 100M', 93 | 'tasks': [ 94 | {'env_id': "Hopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, 95 | {'env_id': "Walker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, 96 | {'env_id': "Humanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000}, 97 | ] 98 | }) 99 | 100 | # Roboschool 101 | 102 | register_benchmark({ 103 | 'name': 'Roboschool8M', 104 | 'description': 'Small 2D tasks, up to 30 minutes to complete on 8 cores', 105 | 'tasks': [ 106 | {'env_id': "RoboschoolReacher-v1", 'trials': 4, 'num_timesteps': 2 * 1000000}, 107 | {'env_id': "RoboschoolAnt-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, 108 | {'env_id': "RoboschoolHalfCheetah-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, 109 | {'env_id': "RoboschoolHopper-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, 110 | {'env_id': "RoboschoolWalker2d-v1", 'trials': 4, 'num_timesteps': 8 * 1000000}, 111 | ] 112 | }) 113 | register_benchmark({ 114 | 'name': 'RoboschoolHarder', 115 | 'description': 'Test your might!!! Up to 12 hours on 32 cores', 116 | 'tasks': [ 117 | {'env_id': "RoboschoolHumanoid-v1", 'trials': 4, 'num_timesteps': 100 * 1000000}, 118 | {'env_id': "RoboschoolHumanoidFlagrun-v1", 'trials': 4, 'num_timesteps': 200 * 1000000}, 119 | {'env_id': "RoboschoolHumanoidFlagrunHarder-v1", 'trials': 4, 'num_timesteps': 400 * 1000000}, 120 | ] 121 | }) 122 | 123 | # Other 124 | 125 | _atari50 = [ # actually 47 126 | 'Alien', 'Amidar', 'Assault', 'Asterix', 'Asteroids', 127 | 'Atlantis', 'BankHeist', 'BattleZone', 'BeamRider', 'Bowling', 128 | 'Breakout', 'Centipede', 'ChopperCommand', 'CrazyClimber', 129 | 'DemonAttack', 'DoubleDunk', 'Enduro', 'FishingDerby', 'Freeway', 130 | 'Frostbite', 'Gopher', 'Gravitar', 'IceHockey', 'Jamesbond', 131 | 'Kangaroo', 'Krull', 'KungFuMaster', 'MontezumaRevenge', 'MsPacman', 132 | 'NameThisGame', 'Pitfall', 'Pong', 'PrivateEye', 'Qbert', 133 | 'RoadRunner', 'Robotank', 'Seaquest', 'SpaceInvaders', 'StarGunner', 134 | 'Tennis', 'TimePilot', 'Tutankham', 'UpNDown', 'Venture', 135 | 'VideoPinball', 'WizardOfWor', 'Zaxxon', 136 | ] 137 | 138 | register_benchmark({ 139 | 'name': 'Atari50_10M', 140 | 'description': '47 Atari games from Mnih et al. 
(2013), with pixel observations, 10M timesteps', 141 | 'tasks': [{'desc': _game, 'env_id': _game + _ATARI_SUFFIX, 'trials': 2, 'num_timesteps': int(10e6)} for _game in _atari50] 142 | }) 143 | 144 | # HER DDPG 145 | 146 | register_benchmark({ 147 | 'name': 'HerDdpg', 148 | 'description': 'Smoke-test only benchmark of HER', 149 | 'tasks': [{'trials': 1, 'env_id': 'FetchReach-v1'}] 150 | }) 151 | 152 | -------------------------------------------------------------------------------- /gym-control/scripts/train-naf.sh: -------------------------------------------------------------------------------- 1 | for log_dir in naf/1 2 | do 3 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.1 --reward noisy --noise_type norm_all --log_dir $log_dir)& 4 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.2 --reward noisy --noise_type norm_all --log_dir $log_dir)& 5 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.3 --reward noisy --noise_type norm_all --log_dir $log_dir)& 6 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.4 --reward noisy --noise_type norm_all --log_dir $log_dir)& 7 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.9 --reward noisy --noise_type norm_all --log_dir $log_dir)& 8 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.6 --reward noisy --noise_type norm_all --log_dir $log_dir)& 9 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.7 --reward noisy --noise_type norm_all --log_dir $log_dir)& 10 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.8 --reward noisy --noise_type norm_all --log_dir $log_dir)& 11 | 12 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.1 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 13 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.2 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 14 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.3 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 15 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.4 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 16 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.9 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 17 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.6 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 18 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.7 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 19 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.8 --reward surrogate --noise_type norm_all --log_dir $log_dir)& 20 | 21 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.1 --reward noisy --noise_type norm_one --log_dir $log_dir)& 22 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.2 --reward noisy --noise_type norm_one --log_dir $log_dir)& 23 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.3 --reward noisy --noise_type norm_one --log_dir $log_dir)& 24 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.4 --reward noisy --noise_type norm_one --log_dir $log_dir)& 25 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.9 --reward noisy --noise_type norm_one --log_dir $log_dir)& 26 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 
0.6 --reward noisy --noise_type norm_one --log_dir $log_dir)& 27 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.7 --reward noisy --noise_type norm_one --log_dir $log_dir)& 28 | (export CUDA_VISIBLE_DEVICES=5 && python naf_pendulum2.py --weight 0.8 --reward noisy --noise_type norm_one --log_dir $log_dir)& 29 | 30 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.1 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 31 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.2 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 32 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.3 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 33 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.4 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 34 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.9 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 35 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.6 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 36 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.7 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 37 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.8 --reward surrogate --noise_type norm_one --log_dir $log_dir)& 38 | 39 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.1 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 40 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.2 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 41 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.3 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 42 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.4 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 43 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.9 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 44 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.6 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 45 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.7 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 46 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.8 --reward noisy --noise_type anti_iden --log_dir $log_dir)& 47 | 48 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.1 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 49 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.2 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 50 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.3 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 51 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.4 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 52 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.9 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 53 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.6 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 54 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --weight 0.7 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 55 | (export CUDA_VISIBLE_DEVICES=6 && python 
naf_pendulum2.py --weight 0.8 --reward surrogate --noise_type anti_iden --log_dir $log_dir)& 56 | 57 | (export CUDA_VISIBLE_DEVICES=6 && python naf_pendulum2.py --log_dir $log_dir)& 58 | done 59 | -------------------------------------------------------------------------------- /gym-control/cem_cartpole.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import pandas 4 | import numpy as np 5 | import os 6 | import gym 7 | 8 | from keras.models import Sequential 9 | from keras.layers import Dense, Activation, Flatten 10 | import tensorflow as tf 11 | 12 | from rl.agents.cem import CEMAgent 13 | from rl.memory import EpisodeParameterMemory 14 | from noise_estimator import CartpoleProcessor, CartpoleSurrogateProcessor 15 | from utils import * 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument('--error_positive', type=float, default=0.2, 19 | help='Error positive rate [default: 0.2]') 20 | parser.add_argument('--error_negative', type=float, default=0.0, 21 | help='Error negative rate [default: 0.0]') 22 | parser.add_argument('--log_dir', default='logs', 23 | help='Log dir [default: logs]') 24 | parser.add_argument('--reward', default='normal', 25 | help='reward choice: normal/noisy/surrogate [default: normal]') 26 | parser.add_argument('--smooth', type=str2bool, default=False, 27 | help='Add smoothing to rewards [default: False]') 28 | FLAGS = parser.parse_args() 29 | 30 | ERR_P = FLAGS.error_positive 31 | ERR_N = FLAGS.error_negative 32 | REWARD = FLAGS.reward 33 | SMOOTH = FLAGS.smooth 34 | 35 | if REWARD == "normal": 36 | LOG_DIR = os.path.join(FLAGS.log_dir, "cem_cartpole") 37 | else: 38 | LOG_DIR = os.path.join(os.path.join(FLAGS.log_dir, "cem_cartpole"), str(ERR_P)) 39 | ENV_NAME = 'CartPole-v0' 40 | 41 | if not os.path.exists(LOG_DIR): 42 | os.makedirs(LOG_DIR) 43 | os.system('cp cem_cartpole.py %s' % (LOG_DIR)) # bkp of train procedure 44 | LOG_FOUT = open(os.path.join(LOG_DIR, 'setting.txt'), 'w') 45 | LOG_FOUT.write(str(FLAGS)+'\n') 46 | 47 | def train(): 48 | # Get the environment and extract the number of actions. 49 | env = gym.make(ENV_NAME) 50 | np.random.seed(123) 51 | env.seed(123) 52 | 53 | nb_actions = env.action_space.n 54 | obs_dim = env.observation_space.shape[0] 55 | 56 | config = tf.ConfigProto() 57 | config.gpu_options.allow_growth = True 58 | sess = tf.Session(config=config) 59 | from keras import backend as K 60 | K.set_session(sess) 61 | 62 | # Option 1 : Simple model 63 | # model = Sequential() 64 | # model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 65 | # model.add(Dense(nb_actions)) 66 | # model.add(Activation('softmax')) 67 | 68 | # Option 2: deep network 69 | model = Sequential() 70 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 71 | model.add(Dense(16)) 72 | model.add(Activation('relu')) 73 | model.add(Dense(16)) 74 | model.add(Activation('relu')) 75 | model.add(Dense(16)) 76 | model.add(Activation('relu')) 77 | model.add(Dense(nb_actions)) 78 | model.add(Activation('softmax')) 79 | 80 | model.summary() 81 | 82 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 83 | # even the metrics! 
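    # Descriptive note (not in the original file): EpisodeParameterMemory keeps one
    # entry per episode -- the flattened network weights that produced it and the
    # total reward it earned. Roughly, every `train_interval` episodes CEMAgent
    # refits its Gaussian sampling distribution to the top `elite_frac` of the last
    # `batch_size` episodes (here the best ~3 of 50); see keras-rl's CEMAgent for
    # the exact update rule.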
84 | memory = EpisodeParameterMemory(limit=1000, window_length=1) 85 | 86 | if REWARD == "normal": 87 | cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, 88 | batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05) 89 | cem.compile() 90 | history_normal = cem.fit(env, nb_steps=100000, visualize=False, verbose=2) 91 | cem.save_weights(os.path.join(LOG_DIR, 'cem_normal_{}_params.h5f'.format(ENV_NAME)), overwrite=True) 92 | cem.test(env, nb_episodes=5, visualize=False) 93 | 94 | pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv")) 95 | 96 | elif REWARD == "noisy": 97 | if not SMOOTH: 98 | processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=False) 99 | else: 100 | processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False) 101 | 102 | # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=False) 103 | cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, 104 | batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05, 105 | processor=processor_noisy) 106 | cem.compile() 107 | history_noisy = cem.fit(env, nb_steps=100000, visualize=False, verbose=2) 108 | if not SMOOTH: 109 | cem.save_weights(os.path.join(LOG_DIR, 'cem_noisy_{}_params.h5f'.format(ENV_NAME)), overwrite=True) 110 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv")) 111 | 112 | else: 113 | cem.save_weights(os.path.join(LOG_DIR, 'cem_noisy_smooth_{}_params.h5f'.format(ENV_NAME)), overwrite=True) 114 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv")) 115 | 116 | cem.test(env, nb_episodes=5, visualize=False) 117 | 118 | elif REWARD == "surrogate": 119 | if not SMOOTH: 120 | processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True) 121 | else: 122 | processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True) 123 | 124 | # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=True) 125 | cem = CEMAgent(model=model, nb_actions=nb_actions, memory=memory, 126 | batch_size=50, nb_steps_warmup=2000, train_interval=50, elite_frac=0.05, 127 | processor=processor_surrogate) 128 | cem.compile() 129 | history_surrogate = cem.fit(env, nb_steps=100000, visualize=False, verbose=2) 130 | if not SMOOTH: 131 | cem.save_weights(os.path.join(LOG_DIR, 'cem_surrogate_{}_params.h5f'.format(ENV_NAME)), overwrite=True) 132 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv")) 133 | else: 134 | cem.save_weights(os.path.join(LOG_DIR, 'cem_surrogate_smooth_{}_params.h5f'.format(ENV_NAME)), overwrite=True) 135 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv")) 136 | 137 | cem.test(env, nb_episodes=5, visualize=False) 138 | 139 | else: 140 | raise NotImplementedError 141 | 142 | 143 | if __name__ == "__main__": 144 | train() 145 | -------------------------------------------------------------------------------- /gym-control/sarsa_cartpole.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import pandas 4 | import numpy as np 5 | import os 6 | import gym 7 | 8 | from keras.layers import Activation, Dense, Flatten 9 | from keras.models import Sequential 10 | from keras.optimizers import Adam 11 | import tensorflow as tf 12 | 13 | from rl.agents import SARSAAgent 14 | from rl.core 
import Processor 15 | from rl.policy import BoltzmannQPolicy 16 | from noise_estimator import * 17 | from utils import * 18 | 19 | 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument('--error_positive', type=float, default=0.2, 22 | help='Error positive rate [default: 0.2]') 23 | parser.add_argument('--error_negative', type=float, default=0.0, 24 | help='Error negative rate [default: 0.0]') 25 | parser.add_argument('--log_dir', default='logs', 26 | help='Log dir [default: logs]') 27 | parser.add_argument('--reward', default='normal', 28 | help='reward choice: normal/noisy/surrogate [default: normal]') 29 | parser.add_argument('--smooth', type=str2bool, default=False, 30 | help='Add smoothing to rewards [default: False]') 31 | FLAGS = parser.parse_args() 32 | 33 | ERR_P = FLAGS.error_positive 34 | ERR_N = FLAGS.error_negative 35 | REWARD = FLAGS.reward 36 | SMOOTH = FLAGS.smooth 37 | 38 | if REWARD == "normal": 39 | LOG_DIR = os.path.join(FLAGS.log_dir, "sarsa_cartpole") 40 | else: 41 | LOG_DIR = os.path.join(os.path.join(FLAGS.log_dir, "sarsa_cartpole"), str(ERR_P)) 42 | ENV_NAME = 'CartPole-v0' 43 | 44 | if not os.path.exists(LOG_DIR): 45 | os.makedirs(LOG_DIR) 46 | os.system('cp sarsa_cartpole.py %s' % (LOG_DIR)) # bkp of train procedure 47 | print ('cp sarsa_cartpole.py %s' % (LOG_DIR)) 48 | LOG_FOUT = open(os.path.join(LOG_DIR, 'setting.txt'), 'w') 49 | LOG_FOUT.write(str(FLAGS)+'\n') 50 | 51 | 52 | def log_string(out_str): 53 | LOG_FOUT.write(out_str+'\n') 54 | LOG_FOUT.flush() 55 | print(out_str) 56 | 57 | def build_state(features): 58 | return int("".join(map(lambda feature: str(int(feature)), features))) 59 | 60 | def to_bin(value, bins): 61 | return np.digitize(x=[value], bins=bins)[0] 62 | 63 | 64 | def train(): 65 | # Get the environment and extract the number of actions. 66 | env = gym.make(ENV_NAME) 67 | np.random.seed(123) 68 | env.seed(123) 69 | nb_actions = env.action_space.n 70 | 71 | config = tf.ConfigProto() 72 | config.gpu_options.allow_growth = True 73 | sess = tf.Session(config=config) 74 | from keras import backend as K 75 | K.set_session(sess) 76 | 77 | # Next, we build a very simple model. 78 | model = Sequential() 79 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 80 | model.add(Dense(16)) 81 | model.add(Activation('relu')) 82 | model.add(Dense(16)) 83 | model.add(Activation('relu')) 84 | model.add(Dense(16)) 85 | model.add(Activation('relu')) 86 | model.add(Dense(nb_actions)) 87 | model.add(Activation('linear')) 88 | print(model.summary()) 89 | 90 | # SARSA does not require a memory. 
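    # Descriptive note (not in the original file): SARSA is on-policy, so each
    # update uses only the latest (s, a, r, s', a') transition generated by the
    # current policy and no replay buffer is needed. BoltzmannQPolicy samples
    # actions with probability proportional to exp(Q(s, a) / tau) rather than
    # acting epsilon-greedily.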
91 | policy = BoltzmannQPolicy() 92 | # processor_noisy = CartpoleSurrogateProcessor(e_= ERR_N, e=ERR_P, surrogate=False) 93 | # processor_surrogate = CartpoleSurrogateProcessor(e_= ERR_N, e=ERR_P, surrogate=True) 94 | if not SMOOTH: 95 | processor_noisy = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=False, surrogate=False) 96 | processor_surrogate = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=False, surrogate=True) 97 | else: 98 | processor_noisy = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=True, surrogate=False) 99 | processor_surrogate = CartpoleProcessor(e_= ERR_N, e=ERR_P, smooth=True, surrogate=True) 100 | 101 | if REWARD == "normal": 102 | sarsa_normal = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, 103 | policy=policy) 104 | sarsa_normal.compile(Adam(lr=1e-3), metrics=['mae']) 105 | history_normal = sarsa_normal.fit(env, nb_steps=50000, visualize=False, verbose=2) 106 | sarsa_normal.save_weights(os.path.join(LOG_DIR, 'sarsa_normal_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 107 | sarsa_normal.test(env, nb_episodes=10, visualize=False, verbose=2) 108 | 109 | pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv")) 110 | 111 | 112 | elif REWARD == "noisy": 113 | sarsa_noisy = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, 114 | policy=policy, processor=processor_noisy) 115 | sarsa_noisy.compile(Adam(lr=1e-3), metrics=['mae']) 116 | history_noisy = sarsa_noisy.fit(env, nb_steps=50000, visualize=False, verbose=2) 117 | if not SMOOTH: 118 | sarsa_noisy.save_weights(os.path.join(LOG_DIR, 'sarsa_noisy_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 119 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv")) 120 | else: 121 | sarsa_noisy.save_weights(os.path.join(LOG_DIR, 'sarsa_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 122 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv")) 123 | 124 | sarsa_noisy.test(env, nb_episodes=10, visualize=False) 125 | 126 | 127 | elif REWARD == "surrogate": 128 | sarsa_surrogate = SARSAAgent(model=model, nb_actions=nb_actions, nb_steps_warmup=10, 129 | policy=policy, processor=processor_surrogate) 130 | sarsa_surrogate.compile(Adam(lr=1e-3), metrics=['mae']) 131 | history_surrogate = sarsa_surrogate.fit(env, nb_steps=50000, visualize=False, verbose=2) 132 | if not SMOOTH: 133 | sarsa_surrogate.save_weights(os.path.join(LOG_DIR, 'sarsa_surrogate_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 134 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv")) 135 | 136 | else: 137 | sarsa_surrogate.save_weights(os.path.join(LOG_DIR, 'sarsa_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 138 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv")) 139 | 140 | sarsa_surrogate.test(env, nb_episodes=10, visualize=False) 141 | 142 | 143 | 144 | if __name__ == "__main__": 145 | train() -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/common/models.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from baselines.a2c import utils 4 | from baselines.a2c.utils import conv, fc, conv_to_fc, batch_to_seq, seq_to_batch 5 | from baselines.common.mpi_running_mean_std import RunningMeanStd 6 | import tensorflow.contrib.layers as layers 7 | 8 | 9 | def 
nature_cnn(unscaled_images, **conv_kwargs): 10 | """ 11 | CNN from Nature paper. 12 | """ 13 | scaled_images = tf.cast(unscaled_images, tf.float32) / 255. 14 | activ = tf.nn.relu 15 | h = activ(conv(scaled_images, 'c1', nf=32, rf=8, stride=4, init_scale=np.sqrt(2), 16 | **conv_kwargs)) 17 | h2 = activ(conv(h, 'c2', nf=64, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) 18 | h3 = activ(conv(h2, 'c3', nf=64, rf=3, stride=1, init_scale=np.sqrt(2), **conv_kwargs)) 19 | h3 = conv_to_fc(h3) 20 | return activ(fc(h3, 'fc1', nh=512, init_scale=np.sqrt(2))) 21 | 22 | 23 | def mlp(num_layers=2, num_hidden=64, activation=tf.tanh): 24 | """ 25 | Simple fully connected layer policy. Separate stacks of fully-connected layers are used for policy and value function estimation. 26 | More customized fully-connected policies can be obtained by using PolicyWithV class directly. 27 | 28 | Parameters: 29 | ---------- 30 | 31 | num_layers: int number of fully-connected layers (default: 2) 32 | 33 | num_hidden: int size of fully-connected layers (default: 64) 34 | 35 | activation: activation function (default: tf.tanh) 36 | 37 | Returns: 38 | ------- 39 | 40 | function that builds fully connected network with a given input placeholder 41 | """ 42 | def network_fn(X): 43 | h = tf.layers.flatten(X) 44 | for i in range(num_layers): 45 | h = activation(fc(h, 'mlp_fc{}'.format(i), nh=num_hidden, init_scale=np.sqrt(2))) 46 | return h, None 47 | 48 | return network_fn 49 | 50 | 51 | def cnn(**conv_kwargs): 52 | def network_fn(X): 53 | return nature_cnn(X, **conv_kwargs), None 54 | return network_fn 55 | 56 | def cnn_small(**conv_kwargs): 57 | def network_fn(X): 58 | h = tf.cast(X, tf.float32) / 255. 59 | 60 | activ = tf.nn.relu 61 | h = activ(conv(h, 'c1', nf=8, rf=8, stride=4, init_scale=np.sqrt(2), **conv_kwargs)) 62 | h = activ(conv(h, 'c2', nf=16, rf=4, stride=2, init_scale=np.sqrt(2), **conv_kwargs)) 63 | h = conv_to_fc(h) 64 | h = activ(fc(h, 'fc1', nh=128, init_scale=np.sqrt(2))) 65 | return h, None 66 | return network_fn 67 | 68 | 69 | 70 | def lstm(nlstm=128, layer_norm=False): 71 | def network_fn(X, nenv=1): 72 | nbatch = X.shape[0] 73 | nsteps = nbatch // nenv 74 | 75 | h = tf.layers.flatten(X) 76 | 77 | M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) 78 | S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states 79 | 80 | xs = batch_to_seq(h, nenv, nsteps) 81 | ms = batch_to_seq(M, nenv, nsteps) 82 | 83 | if layer_norm: 84 | h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm) 85 | else: 86 | h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm) 87 | 88 | h = seq_to_batch(h5) 89 | initial_state = np.zeros(S.shape.as_list(), dtype=float) 90 | 91 | return h, {'S':S, 'M':M, 'state':snew, 'initial_state':initial_state} 92 | 93 | return network_fn 94 | 95 | 96 | def cnn_lstm(nlstm=128, layer_norm=False, **conv_kwargs): 97 | def network_fn(X, nenv=1): 98 | nbatch = X.shape[0] 99 | nsteps = nbatch // nenv 100 | 101 | h = nature_cnn(X, **conv_kwargs) 102 | 103 | M = tf.placeholder(tf.float32, [nbatch]) #mask (done t-1) 104 | S = tf.placeholder(tf.float32, [nenv, 2*nlstm]) #states 105 | 106 | xs = batch_to_seq(h, nenv, nsteps) 107 | ms = batch_to_seq(M, nenv, nsteps) 108 | 109 | if layer_norm: 110 | h5, snew = utils.lnlstm(xs, ms, S, scope='lnlstm', nh=nlstm) 111 | else: 112 | h5, snew = utils.lstm(xs, ms, S, scope='lstm', nh=nlstm) 113 | 114 | h = seq_to_batch(h5) 115 | initial_state = np.zeros(S.shape.as_list(), dtype=float) 116 | 117 | return h, {'S':S, 'M':M, 'state':snew, 
'initial_state':initial_state} 118 | 119 | return network_fn 120 | 121 | def cnn_lnlstm(nlstm=128, **conv_kwargs): 122 | return cnn_lstm(nlstm, layer_norm=True, **conv_kwargs) 123 | 124 | 125 | def conv_only(convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)], **conv_kwargs): 126 | ''' 127 | convolutions-only net 128 | 129 | Parameters: 130 | ---------- 131 | 132 | conv: list of triples (filter_number, filter_size, stride) specifying parameters for each layer. 133 | 134 | Returns: 135 | 136 | function that takes tensorflow tensor as input and returns the output of the last convolutional layer 137 | 138 | ''' 139 | 140 | def network_fn(X): 141 | out = tf.cast(X, tf.float32) / 255. 142 | with tf.variable_scope("convnet"): 143 | for num_outputs, kernel_size, stride in convs: 144 | out = layers.convolution2d(out, 145 | num_outputs=num_outputs, 146 | kernel_size=kernel_size, 147 | stride=stride, 148 | activation_fn=tf.nn.relu, 149 | **conv_kwargs) 150 | 151 | return out, None 152 | return network_fn 153 | 154 | def _normalize_clip_observation(x, clip_range=[-5.0, 5.0]): 155 | rms = RunningMeanStd(shape=x.shape[1:]) 156 | norm_x = tf.clip_by_value((x - rms.mean) / rms.std, min(clip_range), max(clip_range)) 157 | return norm_x, rms 158 | 159 | 160 | def get_network_builder(name): 161 | # TODO: replace with reflection? 162 | if name == 'cnn': 163 | return cnn 164 | elif name == 'cnn_small': 165 | return cnn_small 166 | elif name == 'conv_only': 167 | return conv_only 168 | elif name == 'mlp': 169 | return mlp 170 | elif name == 'lstm': 171 | return lstm 172 | elif name == 'cnn_lstm': 173 | return cnn_lstm 174 | elif name == 'cnn_lnlstm': 175 | return cnn_lnlstm 176 | else: 177 | raise ValueError('Unknown network type: {}'.format(name)) 178 | -------------------------------------------------------------------------------- /gym-atari/baselines/baselines/bench/monitor.py: -------------------------------------------------------------------------------- 1 | __all__ = ['Monitor', 'get_monitor_files', 'load_results'] 2 | 3 | import gym 4 | from gym.core import Wrapper 5 | import time 6 | from glob import glob 7 | import csv 8 | import os.path as osp 9 | import json 10 | import numpy as np 11 | 12 | class Monitor(Wrapper): 13 | EXT = "monitor.csv" 14 | f = None 15 | 16 | def __init__(self, env, filename, allow_early_resets=False, reset_keywords=(), info_keywords=()): 17 | Wrapper.__init__(self, env=env) 18 | self.tstart = time.time() 19 | if filename is None: 20 | self.f = None 21 | self.logger = None 22 | else: 23 | if not filename.endswith(Monitor.EXT): 24 | if osp.isdir(filename): 25 | filename = osp.join(filename, Monitor.EXT) 26 | else: 27 | filename = filename + "." 
+ Monitor.EXT 28 | self.f = open(filename, "wt") 29 | self.f.write('#%s\n'%json.dumps({"t_start": self.tstart, 'env_id' : env.spec and env.spec.id})) 30 | self.logger = csv.DictWriter(self.f, fieldnames=('r', 'l', 't')+reset_keywords+info_keywords) 31 | self.logger.writeheader() 32 | self.f.flush() 33 | 34 | self.reset_keywords = reset_keywords 35 | self.info_keywords = info_keywords 36 | self.allow_early_resets = allow_early_resets 37 | self.rewards = None 38 | self.needs_reset = True 39 | self.episode_rewards = [] 40 | self.episode_lengths = [] 41 | self.episode_times = [] 42 | self.total_steps = 0 43 | self.current_reset_info = {} # extra info about the current episode, that was passed in during reset() 44 | 45 | def reset(self, **kwargs): 46 | if not self.allow_early_resets and not self.needs_reset: 47 | raise RuntimeError("Tried to reset an environment before done. If you want to allow early resets, wrap your env with Monitor(env, path, allow_early_resets=True)") 48 | self.rewards = [] 49 | self.needs_reset = False 50 | for k in self.reset_keywords: 51 | v = kwargs.get(k) 52 | if v is None: 53 | raise ValueError('Expected you to pass kwarg %s into reset'%k) 54 | self.current_reset_info[k] = v 55 | return self.env.reset(**kwargs) 56 | 57 | def step(self, action): 58 | if self.needs_reset: 59 | raise RuntimeError("Tried to step environment that needs reset") 60 | ob, rew, done, info = self.env.step(action) 61 | self.rewards.append(rew) 62 | if done: 63 | self.needs_reset = True 64 | eprew = sum(self.rewards) 65 | eplen = len(self.rewards) 66 | epinfo = {"r": round(eprew, 6), "l": eplen, "t": round(time.time() - self.tstart, 6)} 67 | for k in self.info_keywords: 68 | epinfo[k] = info[k] 69 | self.episode_rewards.append(eprew) 70 | self.episode_lengths.append(eplen) 71 | self.episode_times.append(time.time() - self.tstart) 72 | epinfo.update(self.current_reset_info) 73 | if self.logger: 74 | self.logger.writerow(epinfo) 75 | self.f.flush() 76 | info['episode'] = epinfo 77 | self.total_steps += 1 78 | return (ob, rew, done, info) 79 | 80 | def close(self): 81 | if self.f is not None: 82 | self.f.close() 83 | 84 | def get_total_steps(self): 85 | return self.total_steps 86 | 87 | def get_episode_rewards(self): 88 | return self.episode_rewards 89 | 90 | def get_episode_lengths(self): 91 | return self.episode_lengths 92 | 93 | def get_episode_times(self): 94 | return self.episode_times 95 | 96 | class LoadMonitorResultsError(Exception): 97 | pass 98 | 99 | def get_monitor_files(dir): 100 | return glob(osp.join(dir, "*" + Monitor.EXT)) 101 | 102 | def load_results(dir): 103 | import pandas 104 | monitor_files = ( 105 | glob(osp.join(dir, "*monitor.json")) + 106 | glob(osp.join(dir, "*monitor.csv"))) # get both csv and (old) json files 107 | if not monitor_files: 108 | raise LoadMonitorResultsError("no monitor files of the form *%s found in %s" % (Monitor.EXT, dir)) 109 | dfs = [] 110 | headers = [] 111 | for fname in monitor_files: 112 | with open(fname, 'rt') as fh: 113 | if fname.endswith('csv'): 114 | firstline = fh.readline() 115 | if not firstline: 116 | continue 117 | assert firstline[0] == '#' 118 | header = json.loads(firstline[1:]) 119 | df = pandas.read_csv(fh, index_col=None) 120 | headers.append(header) 121 | elif fname.endswith('json'): # Deprecated json format 122 | episodes = [] 123 | lines = fh.readlines() 124 | header = json.loads(lines[0]) 125 | headers.append(header) 126 | for line in lines[1:]: 127 | episode = json.loads(line) 128 | episodes.append(episode) 129 | df = 
pandas.DataFrame(episodes) 130 | else: 131 | assert 0, 'unreachable' 132 | df['t'] += header['t_start'] 133 | dfs.append(df) 134 | df = pandas.concat(dfs) 135 | df.sort_values('t', inplace=True) 136 | df.reset_index(inplace=True) 137 | df['t'] -= min(header['t_start'] for header in headers) 138 | df.headers = headers # HACK to preserve backwards compatibility 139 | return df 140 | 141 | def test_monitor(): 142 | env = gym.make("CartPole-v1") 143 | env.seed(0) 144 | mon_file = "/tmp/baselines-test-%s.monitor.csv" % uuid.uuid4() 145 | menv = Monitor(env, mon_file) 146 | menv.reset() 147 | for _ in range(1000): 148 | _, _, done, _ = menv.step(0) 149 | if done: 150 | menv.reset() 151 | 152 | f = open(mon_file, 'rt') 153 | 154 | firstline = f.readline() 155 | assert firstline.startswith('#') 156 | metadata = json.loads(firstline[1:]) 157 | assert metadata['env_id'] == "CartPole-v1" 158 | assert set(metadata.keys()) == {'env_id', 'gym_version', 't_start'}, "Incorrect keys in monitor metadata" 159 | 160 | last_logline = pandas.read_csv(f, index_col=None) 161 | assert set(last_logline.keys()) == {'l', 't', 'r'}, "Incorrect keys in monitor logline" 162 | f.close() 163 | os.remove(mon_file) 164 | -------------------------------------------------------------------------------- /gym-control/dqn_cartpole.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import pandas 4 | import numpy as np 5 | import os 6 | import gym 7 | 8 | from keras.layers import Activation, Dense, Flatten 9 | from keras.models import Sequential 10 | from keras.optimizers import Adam 11 | import tensorflow as tf 12 | 13 | from rl.agents.dqn import DQNAgent 14 | from rl.core import Processor 15 | from rl.memory import SequentialMemory 16 | from rl.policy import BoltzmannQPolicy 17 | from noise_estimator import * 18 | from utils import * 19 | 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--error_positive', type=float, default=0.2, 23 | help='Error positive rate [default: 0.2]') 24 | parser.add_argument('--error_negative', type=float, default=0.0, 25 | help='Error negative rate [default: 0.0]') 26 | parser.add_argument('--log_dir', default='logs', 27 | help='Log dir [default: logs]') 28 | parser.add_argument('--reward', default='normal', 29 | help='Reward choice: normal/noisy/surrogate [default: normal]') 30 | parser.add_argument('--smooth', type=str2bool, default=False, 31 | help='Add smoothing to rewards [default: False]') 32 | FLAGS = parser.parse_args() 33 | 34 | ERR_P = FLAGS.error_positive 35 | ERR_N = FLAGS.error_negative 36 | REWARD = FLAGS.reward 37 | SMOOTH = FLAGS.smooth 38 | 39 | if REWARD == "normal": 40 | LOG_DIR = os.path.join(FLAGS.log_dir, "dqn_cartpole") 41 | else: 42 | LOG_DIR = os.path.join(os.path.join(FLAGS.log_dir, "dqn_cartpole"), str(ERR_P)) 43 | ENV_NAME = 'CartPole-v0' 44 | 45 | if not os.path.exists(LOG_DIR): 46 | os.makedirs(LOG_DIR) 47 | os.system('cp dqn_cartpole.py %s' % (LOG_DIR)) # bkp of train procedure 48 | LOG_FOUT = open(os.path.join(LOG_DIR, 'setting.txt'), 'w') 49 | LOG_FOUT.write(str(FLAGS)+'\n') 50 | 51 | 52 | def train(): 53 | # Get the environment and extract the number of actions. 
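    # Descriptive note (not in the original file): CartPole-v0 has a 4-dimensional
    # observation (cart position/velocity, pole angle/angular velocity), two
    # discrete actions, and a reward of +1 per surviving step. The noisy/surrogate
    # processors built below perturb that reward according to the
    # --error_positive / --error_negative rates (see noise_estimator.py).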
54 | env = gym.make(ENV_NAME) 55 | np.random.seed(123) 56 | env.seed(123) 57 | nb_actions = env.action_space.n 58 | 59 | config = tf.ConfigProto() 60 | config.gpu_options.allow_growth = True 61 | sess = tf.Session(config=config) 62 | from keras import backend as K 63 | K.set_session(sess) 64 | 65 | # Next, we build a very simple model. 66 | model = Sequential() 67 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 68 | model.add(Dense(16)) 69 | model.add(Activation('relu')) 70 | model.add(Dense(16)) 71 | model.add(Activation('relu')) 72 | model.add(Dense(16)) 73 | model.add(Activation('relu')) 74 | model.add(Dense(nb_actions)) 75 | model.add(Activation('linear')) 76 | model.summary() 77 | 78 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 79 | # even the metrics! 80 | memory = SequentialMemory(limit=50000, window_length=1) 81 | policy = BoltzmannQPolicy() 82 | 83 | # Okay, now it's time to learn something! We visualize the training here for show, but this 84 | # slows down training quite a lot. You can always safely abort the training prematurely using 85 | # Ctrl + C. 86 | if REWARD == "normal": 87 | dqn_normal = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 88 | target_model_update=1e-2, policy=policy) 89 | dqn_normal.compile(Adam(lr=1e-3), metrics=['mae']) 90 | history_normal = dqn_normal.fit(env, nb_steps=10000, visualize=False, verbose=2) 91 | dqn_normal.save_weights(os.path.join(LOG_DIR, 'dqn_normal_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 92 | dqn_normal.test(env, nb_episodes=10, visualize=False, verbose=2) 93 | 94 | pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv")) 95 | 96 | elif REWARD == "noisy": 97 | if not SMOOTH: 98 | processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, surrogate=False) 99 | else: 100 | processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False) 101 | 102 | # processor_noisy = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=False) 103 | dqn_noisy = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 104 | target_model_update=1e-2, policy=policy, processor=processor_noisy) 105 | dqn_noisy.compile(Adam(lr=1e-3), metrics=['mae']) 106 | history_noisy = dqn_noisy.fit(env, nb_steps=10000, visualize=False, verbose=2) 107 | if not SMOOTH: 108 | dqn_noisy.save_weights(os.path.join(LOG_DIR, 'dqn_noisy_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 109 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv")) 110 | else: 111 | dqn_noisy.save_weights(os.path.join(LOG_DIR, 'dqn_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 112 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv")) 113 | 114 | dqn_noisy.test(env, nb_episodes=10, visualize=False, verbose=2) 115 | 116 | 117 | elif REWARD == "surrogate": 118 | if not SMOOTH: 119 | processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True) 120 | else: 121 | processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True) 122 | 123 | # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=True) 124 | dqn_surrogate = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 125 | target_model_update=1e-2, policy=policy, processor=processor_surrogate) 126 | dqn_surrogate.compile(Adam(lr=1e-3), metrics=['mae']) 127 | history_surrogate = 
dqn_surrogate.fit(env, nb_steps=10000, visualize=False, verbose=2) 128 | if not SMOOTH: 129 | dqn_surrogate.save_weights(os.path.join(LOG_DIR, 'dqn_surrogate_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 130 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv")) 131 | else: 132 | dqn_surrogate.save_weights(os.path.join(LOG_DIR, 'dqn_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 133 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv")) 134 | 135 | dqn_surrogate.test(env, nb_episodes=10, visualize=False, verbose=2) 136 | 137 | else: 138 | raise NotImplementedError 139 | 140 | if __name__ == "__main__": 141 | train() 142 | -------------------------------------------------------------------------------- /gym-control/duel_dqn_cartpole.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import pandas 4 | import numpy as np 5 | import os 6 | import gym 7 | 8 | from keras.layers import Activation, Dense, Flatten 9 | from keras.models import Sequential 10 | from keras.optimizers import Adam 11 | import tensorflow as tf 12 | 13 | from rl.agents.dqn import DQNAgent 14 | from rl.core import Processor 15 | from rl.memory import SequentialMemory 16 | from rl.policy import BoltzmannQPolicy 17 | from noise_estimator import * 18 | from utils import * 19 | 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--error_positive', type=float, default=0.2, 23 | help='Error positive rate [default: 0.2]') 24 | parser.add_argument('--error_negative', type=float, default=0.0, 25 | help='Error negative rate [default: 0.0]') 26 | parser.add_argument('--log_dir', default='logs', 27 | help='Log dir [default: logs]') 28 | parser.add_argument('--reward', default='normal', 29 | help='Reward choice: normal/noisy/surrogate [default: normal]') 30 | parser.add_argument('--smooth', type=str2bool, default=False, 31 | help='Add smoothing to rewards [default: False]') 32 | FLAGS = parser.parse_args() 33 | 34 | ERR_P = FLAGS.error_positive 35 | ERR_N = FLAGS.error_negative 36 | REWARD = FLAGS.reward 37 | SMOOTH = FLAGS.smooth 38 | 39 | if REWARD == "normal": 40 | LOG_DIR = os.path.join(FLAGS.log_dir, "duel_dqn_cartpole") 41 | else: 42 | LOG_DIR = os.path.join(os.path.join(FLAGS.log_dir, "duel_dqn_cartpole"), str(ERR_P)) 43 | ENV_NAME = 'CartPole-v0' 44 | 45 | if not os.path.exists(LOG_DIR): 46 | os.makedirs(LOG_DIR) 47 | os.system('cp duel_dqn_cartpole.py %s' % (LOG_DIR)) # bkp of train procedure 48 | LOG_FOUT = open(os.path.join(LOG_DIR, 'setting.txt'), 'w') 49 | LOG_FOUT.write(str(FLAGS)+'\n') 50 | 51 | 52 | def train(): 53 | # Get the environment and extract the number of actions. 54 | env = gym.make(ENV_NAME) 55 | np.random.seed(123) 56 | env.seed(123) 57 | nb_actions = env.action_space.n 58 | 59 | config = tf.ConfigProto() 60 | config.gpu_options.allow_growth = True 61 | sess = tf.Session(config=config) 62 | from keras import backend as K 63 | K.set_session(sess) 64 | 65 | # Next, we build a very simple model. 
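    # Descriptive note (not in the original file): the network below outputs one
    # Q-value per action. With enable_dueling_network=True, keras-rl internally
    # replaces the output layer with separate state-value and advantage streams
    # and recombines them; dueling_type='avg' corresponds to
    # Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a').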
66 | model = Sequential() 67 | model.add(Flatten(input_shape=(1,) + env.observation_space.shape)) 68 | model.add(Dense(16)) 69 | model.add(Activation('relu')) 70 | model.add(Dense(16)) 71 | model.add(Activation('relu')) 72 | model.add(Dense(16)) 73 | model.add(Activation('relu')) 74 | model.add(Dense(nb_actions)) 75 | model.add(Activation('linear')) 76 | model.summary() 77 | 78 | # Finally, we configure and compile our agent. You can use every built-in Keras optimizer and 79 | # even the metrics! 80 | memory = SequentialMemory(limit=50000, window_length=1) 81 | policy = BoltzmannQPolicy() 82 | 83 | # Okay, now it's time to learn something! We visualize the training here for show, but this 84 | # slows down training quite a lot. You can always safely abort the training prematurely using 85 | # Ctrl + C. 86 | if REWARD == "normal": 87 | dqn_normal = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 88 | enable_dueling_network=True, dueling_type='avg', 89 | target_model_update=1e-2, policy=policy) 90 | dqn_normal.compile(Adam(lr=1e-3), metrics=['mae']) 91 | history_normal = dqn_normal.fit(env, nb_steps=10000, visualize=False, verbose=2) 92 | dqn_normal.save_weights(os.path.join(LOG_DIR, 'duel_dqn_normal_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 93 | dqn_normal.test(env, nb_episodes=10, visualize=False, verbose=2) 94 | 95 | pandas.DataFrame(history_normal.history).to_csv(os.path.join(LOG_DIR, "normal.csv")) 96 | 97 | elif REWARD == "noisy": 98 | if not SMOOTH: 99 | processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=False) 100 | else: 101 | processor_noisy = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=False) 102 | 103 | # processor_noisy = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=False) 104 | dqn_noisy = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 105 | enable_dueling_network=True, dueling_type='avg', 106 | target_model_update=1e-2, policy=policy, processor=processor_noisy) 107 | dqn_noisy.compile(Adam(lr=1e-3), metrics=['mae']) 108 | history_noisy = dqn_noisy.fit(env, nb_steps=10000, visualize=False, verbose=2) 109 | if not SMOOTH: 110 | dqn_noisy.save_weights(os.path.join(LOG_DIR, 'duel_dqn_noisy_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 111 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy.csv")) 112 | else: 113 | dqn_noisy.save_weights(os.path.join(LOG_DIR, 'duel_dqn_noisy_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 114 | pandas.DataFrame(history_noisy.history).to_csv(os.path.join(LOG_DIR, "noisy_smooth.csv")) 115 | 116 | dqn_noisy.test(env, nb_episodes=10, visualize=False, verbose=2) 117 | 118 | 119 | elif REWARD == "surrogate": 120 | if not SMOOTH: 121 | processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=False, surrogate=True) 122 | else: 123 | processor_surrogate = CartpoleProcessor(e_=ERR_N, e=ERR_P, smooth=True, surrogate=True) 124 | 125 | # processor_surrogate = CartpoleSurrogateProcessor(e_=ERR_N, e=ERR_P, surrogate=True) 126 | dqn_surrogate = DQNAgent(model=model, nb_actions=nb_actions, memory=memory, nb_steps_warmup=10, 127 | enable_dueling_network=True, dueling_type='avg', 128 | target_model_update=1e-2, policy=policy, processor=processor_surrogate) 129 | dqn_surrogate.compile(Adam(lr=1e-3), metrics=['mae']) 130 | history_surrogate = dqn_surrogate.fit(env, nb_steps=10000, visualize=False, verbose=2) 131 | if not SMOOTH: 132 | dqn_surrogate.save_weights(os.path.join(LOG_DIR, 
'duel_dqn_surrogate_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 133 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate.csv")) 134 | else: 135 | dqn_surrogate.save_weights(os.path.join(LOG_DIR, 'duel_dqn_surrogate_smooth_{}_weights.h5f'.format(ENV_NAME)), overwrite=True) 136 | pandas.DataFrame(history_surrogate.history).to_csv(os.path.join(LOG_DIR, "surrogate_smooth.csv")) 137 | 138 | dqn_surrogate.test(env, nb_episodes=10, visualize=False, verbose=2) 139 | 140 | else: 141 | raise NotImplementedError 142 | 143 | if __name__ == "__main__": 144 | train() --------------------------------------------------------------------------------
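The CartPole scripts above all dump their keras-rl training histories as CSV files (normal.csv, noisy.csv, surrogate.csv, and the *_smooth.csv variants) under LOG_DIR. Below is a minimal sketch for comparing the resulting reward curves. It is not part of the repository: it assumes the default log layout of dqn_cartpole.py, a hypothetical error rate of 0.2, and that the history CSVs contain an episode_reward column (the per-episode return recorded by keras-rl's fit()); adjust the paths and column name to match your own runs.

```python
# Minimal sketch: overlay the reward curves written by dqn_cartpole.py.
# normal.csv lives directly under logs/dqn_cartpole/, while noisy.csv and
# surrogate.csv live under logs/dqn_cartpole/<error_positive>/ (see LOG_DIR above).
import os
import pandas as pd
import matplotlib.pyplot as plt

LOG_ROOT = 'logs/dqn_cartpole'   # hypothetical path, matching the script defaults
ERR_P = 0.2                      # the --error_positive used for noisy/surrogate runs

curves = {
    'normal': os.path.join(LOG_ROOT, 'normal.csv'),
    'noisy': os.path.join(LOG_ROOT, str(ERR_P), 'noisy.csv'),
    'surrogate': os.path.join(LOG_ROOT, str(ERR_P), 'surrogate.csv'),
}

for label, path in curves.items():
    if not os.path.exists(path):
        continue  # skip configurations that have not been trained yet
    df = pd.read_csv(path)
    # smooth the per-episode returns with a rolling mean for readability
    plt.plot(df['episode_reward'].rolling(20, min_periods=1).mean(), label=label)

plt.xlabel('episode')
plt.ylabel('episode reward (rolling mean)')
plt.legend()
plt.savefig('cartpole_reward_curves.png')
```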