├── .DS_Store ├── .gitignore ├── 1_sb_ppo_agent.py ├── 2_supermario_dqn.ipynb ├── 4_pong_dqn (1).ipynb ├── README.md └── stable_baselines ├── __init__.py ├── a2c ├── __init__.py ├── a2c.py ├── run_atari.py └── utils.py ├── acer ├── __init__.py ├── acer_simple.py ├── buffer.py └── run_atari.py ├── acktr ├── __init__.py ├── acktr_cont.py ├── acktr_disc.py ├── kfac.py ├── kfac_utils.py ├── policies.py ├── run_atari.py ├── run_mujoco.py ├── utils.py └── value_functions.py ├── bench ├── __init__.py ├── benchmarks.py └── monitor.py ├── common ├── __init__.py ├── atari_wrappers.py ├── base_class.py ├── cg.py ├── cmd_util.py ├── console_util.py ├── dataset.py ├── distributions.py ├── filters.py ├── identity_env.py ├── input.py ├── math_util.py ├── misc_util.py ├── mpi_adam.py ├── mpi_fork.py ├── mpi_moments.py ├── mpi_running_mean_std.py ├── policies.py ├── runners.py ├── running_mean_std.py ├── running_stat.py ├── schedules.py ├── segment_tree.py ├── tf_util.py ├── tile_images.py └── vec_env │ ├── __init__.py │ ├── base_vec_env.py │ ├── dummy_vec_env.py │ ├── subproc_vec_env.py │ ├── vec_frame_stack.py │ └── vec_normalize.py ├── ddpg ├── __init__.py ├── ddpg.py ├── main.py ├── memory.py ├── noise.py └── policies.py ├── deepq ├── __init__.py ├── build_graph.py ├── dqn.py ├── dqn_10_sb_dqn_supermari_decay_resolution_exploration.py ├── dqn_12_sb_dqn_supermari_decay_resolution_exploration_step_10.py ├── dqn_13_sb_dqn_supermari_decay_resolution_exploration_step_20_exploration_term_5000.py ├── dqn_14_resoultion_network.py ├── dqn_9_sb_dqn_supermari_resolution_exploration.py ├── experiments │ ├── __init__.py │ ├── custom_cartpole.py │ ├── enjoy_cartpole.py │ ├── enjoy_mountaincar.py │ ├── enjoy_pong.py │ ├── run_atari.py │ ├── train_cartpole.py │ └── train_mountaincar.py ├── policies.py ├── replay_buffer.py └── utils.py ├── gail ├── __init__.py ├── adversary.py ├── behavior_clone.py ├── dataset │ ├── __init__.py │ └── mujocodset.py ├── gail_eval.py ├── mlp_policy.py ├── model.py ├── result │ ├── HalfCheetah-normalized-deterministic-scores.png │ ├── HalfCheetah-normalized-stochastic-scores.png │ ├── HalfCheetah-unnormalized-deterministic-scores.png │ ├── HalfCheetah-unnormalized-stochastic-scores.png │ ├── Hopper-normalized-deterministic-scores.png │ ├── Hopper-normalized-stochastic-scores.png │ ├── Hopper-unnormalized-deterministic-scores.png │ ├── Hopper-unnormalized-stochastic-scores.png │ ├── Humanoid-normalized-deterministic-scores.png │ ├── Humanoid-normalized-stochastic-scores.png │ ├── Humanoid-unnormalized-deterministic-scores.png │ ├── Humanoid-unnormalized-stochastic-scores.png │ ├── HumanoidStandup-normalized-deterministic-scores.png │ ├── HumanoidStandup-normalized-stochastic-scores.png │ ├── HumanoidStandup-unnormalized-deterministic-scores.png │ ├── HumanoidStandup-unnormalized-stochastic-scores.png │ ├── Walker2d-normalized-deterministic-scores.png │ ├── Walker2d-normalized-stochastic-scores.png │ ├── Walker2d-unnormalized-deterministic-scores.png │ ├── Walker2d-unnormalized-stochastic-scores.png │ ├── gail-result.md │ ├── halfcheetah-training.png │ ├── hopper-training.png │ ├── humanoid-training.png │ ├── humanoidstandup-training.png │ └── walker2d-training.png ├── run_mujoco.py └── statistics.py ├── her ├── __init__.py ├── actor_critic.py ├── ddpg.py ├── experiment │ ├── __init__.py │ ├── config.py │ ├── play.py │ ├── plot.py │ └── train.py ├── her.py ├── normalizer.py ├── replay_buffer.py ├── rollout.py └── util.py ├── logger.py ├── ppo1 ├── __init__.py ├── experiments │ └── 
train_cartpole.py ├── mlp_policy.py ├── pposgd_simple.py ├── run_atari.py ├── run_humanoid.py ├── run_mujoco.py └── run_robotics.py ├── ppo2 ├── __init__.py ├── ppo2.py ├── ppo2_2_sb_ppo_action_resolution.py ├── ppo2_3_sb_ppo_resolution_exploration_decay.py ├── run_atari.py └── run_mujoco.py ├── results_plotter.py └── trpo_mpi ├── __init__.py ├── run_atari.py ├── run_mujoco.py ├── trpo_mpi.py └── utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/python 2 | 3 | ### Python ### 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # celery beat schedule file 87 | celerybeat-schedule 88 | 89 | # SageMath parsed files 90 | *.sage.py 91 | 92 | # Environments 93 | .env 94 | .venv 95 | env/ 96 | venv/ 97 | ENV/ 98 | env.bak/ 99 | venv.bak/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | .dmypy.json 114 | dmypy.json 115 | 116 | # Pyre type checker 117 | .pyre/ 118 | 119 | ### Python Patch ### 120 | .venv/ 121 | 122 | ### Python.VirtualEnv Stack ### 123 | # Virtualenv 124 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 125 | [Bb]in 126 | [Ii]nclude 127 | [Ll]ib 128 | [Ll]ib64 129 | [Ll]ocal 130 | [Ss]cripts 131 | pyvenv.cfg 132 | pip-selfcheck.json 133 | 134 | 135 | # End of https://www.gitignore.io/api/python 136 | -------------------------------------------------------------------------------- /1_sb_ppo_agent.py: -------------------------------------------------------------------------------- 1 | #import retro 2 | 3 | import gym_super_mario_bros 4 | import logging 5 | import gym 6 | import gym_super_mario_bros 7 | import numpy as np 8 | import sys 9 | from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv 10 | import random 11 | from 
stable_baselines.ppo2.ppo2 import PPO2 12 | 13 | from stable_baselines import PPO2 14 | from stable_baselines.common.policies import CnnPolicy 15 | from stable_baselines.common.vec_env import DummyVecEnv 16 | 17 | movements = [ 18 | ['NOP'], 19 | ['A'], 20 | ['B'], 21 | ['right'], 22 | ['right', 'A'], 23 | ['right', 'B'], 24 | ['right', 'A', 'B'], 25 | ['left'], 26 | ['left', 'A'], 27 | ['left', 'B'], 28 | ['left', 'A', 'B'], 29 | # ['down'], 30 | # ['up'] 31 | ] 32 | 33 | 34 | _env = gym_super_mario_bros.make('SuperMarioBros-v0') 35 | #_env = gym_super_mario_bros.SuperMarioBrosEnv(frames_per_step=1, rom_mode='rectangle') 36 | env = BinarySpaceToDiscreteSpaceEnv(_env, movements) 37 | env = DummyVecEnv([lambda: env]) 38 | model = PPO2(policy=CnnPolicy, env=env, verbose=1) 39 | model.learn(total_timesteps=10000) 40 | 41 | obs = env.reset() 42 | 43 | while True: 44 | action, _info = model.predict(obs) 45 | 46 | obs, rewards, dones, info = env.step(action) 47 | print("Training finished") 48 | print(rewards) 49 | env.render() 50 | -------------------------------------------------------------------------------- /stable_baselines/__init__.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | from stable_baselines.a2c import A2C 5 | from stable_baselines.acer import ACER 6 | from stable_baselines.acktr import ACKTR 7 | from stable_baselines.ddpg import DDPG 8 | from stable_baselines.deepq import DQN 9 | from stable_baselines.gail import GAIL 10 | from stable_baselines.ppo1 import PPO1 11 | from stable_baselines.ppo2 import PPO2 12 | from stable_baselines.trpo_mpi import TRPO 13 | 14 | __version__ = "2.1.1.a0" 15 | 16 | 17 | # patch Gym spaces to add equality functions, if not implemented 18 | # See https://github.com/openai/gym/issues/1171 19 | if gym.spaces.MultiBinary.__eq__ == object.__eq__: # by default, all classes have the __eq__ function from object. 20 | def _eq(self, other): 21 | return self.n == other.n 22 | 23 | gym.spaces.MultiBinary.__eq__ = _eq 24 | 25 | if gym.spaces.MultiDiscrete.__eq__ == object.__eq__: 26 | def _eq(self, other): 27 | return np.all(self.nvec == other.nvec) 28 | 29 | gym.spaces.MultiDiscrete.__eq__ = _eq 30 | -------------------------------------------------------------------------------- /stable_baselines/a2c/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.a2c.a2c import A2C 2 | -------------------------------------------------------------------------------- /stable_baselines/a2c/run_atari.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from stable_baselines import logger 4 | from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser 5 | from stable_baselines.common.vec_env.vec_frame_stack import VecFrameStack 6 | from stable_baselines.a2c import A2C 7 | from stable_baselines.common.policies import CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy 8 | 9 | 10 | def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env): 11 | """ 12 | Train A2C model for atari environment, for testing purposes 13 | 14 | :param env_id: (str) Environment ID 15 | :param num_timesteps: (int) The total number of samples 16 | :param seed: (int) The initial seed for training 17 | :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
18 | :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', 19 | 'double_linear_con', 'middle_drop' or 'double_middle_drop') 20 | :param num_env: (int) The number of environments 21 | """ 22 | policy_fn = None 23 | if policy == 'cnn': 24 | policy_fn = CnnPolicy 25 | elif policy == 'lstm': 26 | policy_fn = CnnLstmPolicy 27 | elif policy == 'lnlstm': 28 | policy_fn = CnnLnLstmPolicy 29 | if policy_fn is None: 30 | raise ValueError("Error: policy {} not implemented".format(policy)) 31 | 32 | env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4) 33 | 34 | model = A2C(policy_fn, env, lr_schedule=lr_schedule) 35 | model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed) 36 | env.close() 37 | 38 | 39 | def main(): 40 | """ 41 | Runs the test 42 | """ 43 | parser = atari_arg_parser() 44 | parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn', help='Policy architecture') 45 | parser.add_argument('--lr_schedule', choices=['constant', 'linear'], default='constant', 46 | help='Learning rate schedule') 47 | args = parser.parse_args() 48 | logger.configure() 49 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, policy=args.policy, lr_schedule=args.lr_schedule, 50 | num_env=16) 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /stable_baselines/acer/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.acer.acer_simple import ACER 2 | -------------------------------------------------------------------------------- /stable_baselines/acer/run_atari.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from stable_baselines import logger 4 | from stable_baselines.acer import ACER 5 | from stable_baselines.common.policies import CnnPolicy, CnnLstmPolicy 6 | from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser 7 | from stable_baselines.common.vec_env import VecFrameStack 8 | 9 | 10 | def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu): 11 | """ 12 | train an ACER model on atari 13 | 14 | :param env_id: (str) Environment ID 15 | :param num_timesteps: (int) The total number of samples 16 | :param seed: (int) The initial seed for training 17 | :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) 
18 | :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', 19 | 'double_linear_con', 'middle_drop' or 'double_middle_drop') 20 | :param num_cpu: (int) The number of cpu to train on 21 | """ 22 | env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) 23 | if policy == 'cnn': 24 | policy_fn = CnnPolicy 25 | elif policy == 'lstm': 26 | policy_fn = CnnLstmPolicy 27 | else: 28 | print("Policy {} not implemented".format(policy)) 29 | return 30 | 31 | model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000) 32 | model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed) 33 | env.close() 34 | 35 | 36 | def main(): 37 | """ 38 | Runs the test 39 | """ 40 | parser = atari_arg_parser() 41 | parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn', help='Policy architecture') 42 | parser.add_argument('--lr_schedule', choices=['constant', 'linear'], default='constant', 43 | help='Learning rate schedule') 44 | parser.add_argument('--logdir', help='Directory for logging') 45 | args = parser.parse_args() 46 | logger.configure(args.logdir) 47 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, 48 | policy=args.policy, lr_schedule=args.lr_schedule, num_cpu=16) 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /stable_baselines/acktr/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.acktr.acktr_disc import ACKTR 2 | -------------------------------------------------------------------------------- /stable_baselines/acktr/kfac_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def gmatmul(tensor_a, tensor_b, transpose_a=False, transpose_b=False, reduce_dim=None): 5 | """ 6 | Do a matrix multiplication with tensor 'a' and 'b', even when their shape do not match 7 | 8 | :param tensor_a: (TensorFlow Tensor) 9 | :param tensor_b: (TensorFlow Tensor) 10 | :param transpose_a: (bool) If 'a' needs transposing 11 | :param transpose_b: (bool) If 'b' needs transposing 12 | :param reduce_dim: (int) the multiplication over the dim 13 | :return: (TensorFlow Tensor) a * b 14 | """ 15 | assert reduce_dim is not None 16 | 17 | # weird batch matmul 18 | if len(tensor_a.get_shape()) == 2 and len(tensor_b.get_shape()) > 2: 19 | # reshape reduce_dim to the left most dim in b 20 | b_shape = tensor_b.get_shape() 21 | if reduce_dim != 0: 22 | b_dims = list(range(len(b_shape))) 23 | b_dims.remove(reduce_dim) 24 | b_dims.insert(0, reduce_dim) 25 | tensor_b = tf.transpose(tensor_b, b_dims) 26 | b_t_shape = tensor_b.get_shape() 27 | tensor_b = tf.reshape(tensor_b, [int(b_shape[reduce_dim]), -1]) 28 | result = tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, 29 | transpose_b=transpose_b) 30 | result = tf.reshape(result, b_t_shape) 31 | if reduce_dim != 0: 32 | b_dims = list(range(len(b_shape))) 33 | b_dims.remove(0) 34 | b_dims.insert(reduce_dim, 0) 35 | result = tf.transpose(result, b_dims) 36 | return result 37 | 38 | elif len(tensor_a.get_shape()) > 2 and len(tensor_b.get_shape()) == 2: 39 | # reshape reduce_dim to the right most dim in a 40 | a_shape = tensor_a.get_shape() 41 | outter_dim = len(a_shape) - 1 42 | reduce_dim = len(a_shape) - reduce_dim - 1 43 | if reduce_dim != outter_dim: 44 | a_dims = list(range(len(a_shape))) 45 | a_dims.remove(reduce_dim) 46 | 
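# Note on the branch being built here (editorial comment, hedged): the permutation assembled
# around this point moves the contracted axis of `tensor_a` to the last position; the code that
# follows then flattens `tensor_a` to 2-D, performs an ordinary matmul with the 2-D `tensor_b`,
# reshapes the result back, and restores the original axis order. This appears to rely on
# `tensor_b` being square along the contracted dimension (true for the K-FAC factor matrices
# this helper is used with), so reshaping back to the transposed shape of `tensor_a` is valid.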
a_dims.insert(outter_dim, reduce_dim) 47 | tensor_a = tf.transpose(tensor_a, a_dims) 48 | a_t_shape = tensor_a.get_shape() 49 | tensor_a = tf.reshape(tensor_a, [-1, int(a_shape[reduce_dim])]) 50 | result = tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, 51 | transpose_b=transpose_b) 52 | result = tf.reshape(result, a_t_shape) 53 | if reduce_dim != outter_dim: 54 | a_dims = list(range(len(a_shape))) 55 | a_dims.remove(outter_dim) 56 | a_dims.insert(reduce_dim, outter_dim) 57 | result = tf.transpose(result, a_dims) 58 | return result 59 | 60 | elif len(tensor_a.get_shape()) == 2 and len(tensor_b.get_shape()) == 2: 61 | return tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b) 62 | 63 | assert False, 'something went wrong' 64 | 65 | 66 | def clipout_neg(vec, threshold=1e-6): 67 | """ 68 | clip to 0 if input lower than threshold value 69 | 70 | :param vec: (TensorFlow Tensor) 71 | :param threshold: (float) the cutoff threshold 72 | :return: (TensorFlow Tensor) clipped input 73 | """ 74 | mask = tf.cast(vec > threshold, tf.float32) 75 | return mask * vec 76 | 77 | 78 | def detect_min_val(input_mat, var, threshold=1e-6, name='', debug=False): 79 | """ 80 | If debug is not set, will run clipout_neg. Else, will clip and print out odd eigen values 81 | 82 | :param input_mat: (TensorFlow Tensor) 83 | :param var: (TensorFlow Tensor) variable 84 | :param threshold: (float) the cutoff threshold 85 | :param name: (str) the name of the variable 86 | :param debug: (bool) debug function 87 | :return: (TensorFlow Tensor) clipped tensor 88 | """ 89 | eigen_min = tf.reduce_min(input_mat) 90 | eigen_max = tf.reduce_max(input_mat) 91 | eigen_ratio = eigen_max / eigen_min 92 | input_mat_clipped = clipout_neg(input_mat, threshold) 93 | 94 | if debug: 95 | input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), 96 | lambda: input_mat_clipped, lambda: tf.Print( 97 | input_mat_clipped, 98 | [tf.convert_to_tensor('odd ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), 99 | eigen_min, eigen_max, eigen_ratio])) 100 | 101 | return input_mat_clipped 102 | 103 | 104 | def factor_reshape(eigen_vectors, eigen_values, grad, fac_idx=0, f_type='act'): 105 | """ 106 | factor and reshape input eigen values 107 | 108 | :param eigen_vectors: ([TensorFlow Tensor]) eigen vectors 109 | :param eigen_values: ([TensorFlow Tensor]) eigen values 110 | :param grad: ([TensorFlow Tensor]) gradient 111 | :param fac_idx: (int) index that should be factored 112 | :param f_type: (str) function type to factor and reshape 113 | :return: ([TensorFlow Tensor], [TensorFlow Tensor]) factored and reshaped eigen vectors 114 | and eigen values 115 | """ 116 | grad_shape = grad.get_shape() 117 | if f_type == 'act': 118 | assert eigen_values.get_shape()[0] == grad_shape[fac_idx] 119 | expanded_shape = [1, ] * len(grad_shape) 120 | expanded_shape[fac_idx] = -1 121 | eigen_values = tf.reshape(eigen_values, expanded_shape) 122 | if f_type == 'grad': 123 | assert eigen_values.get_shape()[0] == grad_shape[len(grad_shape) - fac_idx - 1] 124 | expanded_shape = [1, ] * len(grad_shape) 125 | expanded_shape[len(grad_shape) - fac_idx - 1] = -1 126 | eigen_values = tf.reshape(eigen_values, expanded_shape) 127 | 128 | return eigen_vectors, eigen_values 129 | -------------------------------------------------------------------------------- /stable_baselines/acktr/policies.py: -------------------------------------------------------------------------------- 1 | import numpy 
as np 2 | import tensorflow as tf 3 | 4 | from stable_baselines.acktr.utils import dense, kl_div 5 | import stable_baselines.common.tf_util as tf_util 6 | 7 | 8 | class GaussianMlpPolicy(object): 9 | def __init__(self, ob_dim, ac_dim): 10 | """ 11 | Create a gaussian MLP policy 12 | 13 | :param ob_dim: (int) Observation dimention 14 | :param ac_dim: (int) action dimention 15 | """ 16 | # Here we'll construct a bunch of expressions, which will be used in two places: 17 | # (1) When sampling actions 18 | # (2) When computing loss functions, for the policy update 19 | # Variables specific to (1) have the word "sampled" in them, 20 | # whereas variables specific to (2) have the word "old" in them 21 | ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob") # batch of observations 22 | oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions 23 | # batch of actions previous action distributions 24 | oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim * 2], name="oldac_dist") 25 | adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate 26 | wd_dict = {} 27 | layer_1 = tf.nn.tanh(dense(ob_no, 64, "h1", 28 | weight_init=tf_util.normc_initializer(1.0), 29 | bias_init=0.0, weight_loss_dict=wd_dict)) 30 | layer_2 = tf.nn.tanh(dense(layer_1, 64, "h2", 31 | weight_init=tf_util.normc_initializer(1.0), 32 | bias_init=0.0, weight_loss_dict=wd_dict)) 33 | mean_na = dense(layer_2, ac_dim, "mean", weight_init=tf_util.normc_initializer(0.1), 34 | bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output 35 | self.wd_dict = wd_dict 36 | # Variance on outputs 37 | self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) 38 | logstd_1a = tf.expand_dims(logstd_1a, 0) 39 | std_1a = tf.exp(logstd_1a) 40 | std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1]) 41 | ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1) 42 | # This is the sampled action we'll perform. 
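# Sampling below uses the reparameterization a = mean + std * eps with eps ~ N(0, I):
# ac_dist[:, :ac_dim] holds the mean and ac_dist[:, ac_dim:] the standard deviation.
# The log-probabilities that follow are the diagonal-Gaussian log-density
#   log p(a) = - sum_i log(std_i) - (ac_dim / 2) * log(2 * pi)
#              - 0.5 * sum_i ((a_i - mean_i) / std_i) ** 2
# evaluated once for the freshly sampled action and once for the previous actions.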
43 | sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim] 44 | logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log( 45 | 2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum( 46 | tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:, ac_dim:])), 47 | axis=1) # Logprob of sampled action 48 | logprob_n = - tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log( 49 | 2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum( 50 | tf.square(ac_dist[:, :ac_dim] - oldac_na) / (tf.square(ac_dist[:, ac_dim:])), 51 | axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) 52 | kl_loss = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim)) 53 | # kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) 54 | # Approximation of KL divergence between old policy used to generate actions, 55 | # and new policy used to compute logprob_n 56 | surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient 57 | surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy 58 | # Generate a new action and its logprob 59 | self._act = tf_util.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) 60 | # self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) 61 | # Compute (approximate) KL divergence between old policy and new policy 62 | self.compute_kl = tf_util.function([ob_no, oldac_dist], kl_loss) 63 | # Input and output variables needed for computing loss 64 | self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) 65 | tf_util.initialize() # Initialize uninitialized TF variables 66 | 67 | def act(self, obs): 68 | """ 69 | get the action from an observation 70 | 71 | :param obs: ([float]) observation 72 | :return: ([float], [float], [float]) action, action_proba, logp 73 | """ 74 | action, ac_dist, logp = self._act(obs[None]) 75 | return action[0], ac_dist[0], logp[0] 76 | -------------------------------------------------------------------------------- /stable_baselines/acktr/run_atari.py: -------------------------------------------------------------------------------- 1 | from stable_baselines import logger 2 | from stable_baselines.acktr import ACKTR 3 | from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser 4 | from stable_baselines.common.vec_env.vec_frame_stack import VecFrameStack 5 | from stable_baselines.common.policies import CnnPolicy 6 | 7 | 8 | def train(env_id, num_timesteps, seed, num_cpu): 9 | """ 10 | train an ACKTR model on atari 11 | 12 | :param env_id: (str) Environment ID 13 | :param num_timesteps: (int) The total number of samples 14 | :param seed: (int) The initial seed for training 15 | :param num_cpu: (int) The number of cpu to train on 16 | """ 17 | env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) 18 | model = ACKTR(CnnPolicy, env, nprocs=num_cpu) 19 | model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed) 20 | env.close() 21 | 22 | 23 | def main(): 24 | """ 25 | Runs the test 26 | """ 27 | args = atari_arg_parser().parse_args() 28 | logger.configure() 29 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32) 30 | 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /stable_baselines/acktr/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python3 2 | 3 | import tensorflow as tf 4 | 5 | from stable_baselines import logger 6 | from stable_baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 7 | from stable_baselines.acktr.acktr_cont import learn 8 | from stable_baselines.acktr.policies import GaussianMlpPolicy 9 | from stable_baselines.acktr.value_functions import NeuralNetValueFunction 10 | 11 | 12 | def train(env_id, num_timesteps, seed): 13 | """ 14 | train an ACKTR model on atari 15 | 16 | :param env_id: (str) Environment ID 17 | :param num_timesteps: (int) The total number of samples 18 | :param seed: (int) The initial seed for training 19 | """ 20 | env = make_mujoco_env(env_id, seed) 21 | 22 | with tf.Session(config=tf.ConfigProto()): 23 | ob_dim = env.observation_space.shape[0] 24 | ac_dim = env.action_space.shape[0] 25 | with tf.variable_scope("vf"): 26 | value_fn = NeuralNetValueFunction(ob_dim, ac_dim) 27 | with tf.variable_scope("pi"): 28 | policy = GaussianMlpPolicy(ob_dim, ac_dim) 29 | 30 | learn(env, policy=policy, value_fn=value_fn, gamma=0.99, lam=0.97, timesteps_per_batch=2500, desired_kl=0.002, 31 | num_timesteps=num_timesteps, animate=False) 32 | 33 | env.close() 34 | 35 | 36 | def main(): 37 | """ 38 | Runs the test 39 | """ 40 | args = mujoco_arg_parser().parse_args() 41 | logger.configure() 42 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /stable_baselines/acktr/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def dense(input_tensor, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None): 5 | """ 6 | A dense Layer 7 | 8 | :param input_tensor: ([TensorFlow Tensor]) input 9 | :param size: (int) number of hidden neurons 10 | :param name: (str) layer name 11 | :param weight_init: (function or int or float) initialize the weight 12 | :param bias_init: (function or int or float) initialize the weight 13 | :param weight_loss_dict: (dict) store the weight loss if not None 14 | :param reuse: (bool) if can be reused 15 | :return: ([TensorFlow Tensor]) the output of the dense Layer 16 | """ 17 | with tf.variable_scope(name, reuse=reuse): 18 | assert len(tf.get_variable_scope().name.split('/')) == 2 19 | 20 | weight = tf.get_variable("w", [input_tensor.get_shape()[1], size], initializer=weight_init) 21 | bias = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init)) 22 | weight_decay_fc = 3e-4 23 | 24 | if weight_loss_dict is not None: 25 | weight_decay = tf.multiply(tf.nn.l2_loss(weight), weight_decay_fc, name='weight_decay_loss') 26 | weight_loss_dict[weight] = weight_decay_fc 27 | weight_loss_dict[bias] = 0.0 28 | 29 | tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay) 30 | 31 | return tf.nn.bias_add(tf.matmul(input_tensor, weight), bias) 32 | 33 | 34 | def kl_div(action_dist1, action_dist2, action_size): 35 | """ 36 | Kullback leiber divergence 37 | 38 | :param action_dist1: ([TensorFlow Tensor]) action distribution 1 39 | :param action_dist2: ([TensorFlow Tensor]) action distribution 2 40 | :param action_size: (int) the shape of an action 41 | :return: (float) Kullback leiber divergence 42 | """ 43 | mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:] 44 | mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:] 45 | 46 | 
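# The lines below implement the closed-form KL divergence between two diagonal Gaussians
# N(mean1, std1^2) and N(mean2, std2^2), summed over the action dimensions:
#   KL = sum_i [ ((mean1_i - mean2_i)^2 + std1_i^2 - std2_i^2) / (2 * std2_i^2)
#                + log(std2_i) - log(std1_i) ]
# which is algebraically the textbook form
#   sum_i [ (std1_i^2 + (mean1_i - mean2_i)^2) / (2 * std2_i^2) - 1/2 + log(std2_i / std1_i) ];
# the 1e-8 term only guards against division by zero.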
numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2) 47 | denominator = 2 * tf.square(std2) + 1e-8 48 | return tf.reduce_sum( 49 | numerator / denominator + tf.log(std2) - tf.log(std1), reduction_indices=-1) 50 | -------------------------------------------------------------------------------- /stable_baselines/acktr/value_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from stable_baselines import logger 5 | import stable_baselines.common as common 6 | from stable_baselines.common import tf_util 7 | from stable_baselines.acktr import kfac 8 | from stable_baselines.acktr.utils import dense 9 | 10 | 11 | class NeuralNetValueFunction(object): 12 | def __init__(self, ob_dim, ac_dim, verbose=1): 13 | """ 14 | Create an MLP policy for a value function 15 | 16 | :param ob_dim: (int) Observation dimention 17 | :param ac_dim: (int) action dimention 18 | :param verbose: (int) verbosity level 19 | """ 20 | obs_ph = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + ac_dim * 2 + 2]) # batch of observations 21 | vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg') 22 | wd_dict = {} 23 | layer_1 = tf.nn.elu(dense(obs_ph, 64, "h1", 24 | weight_init=tf_util.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) 25 | layer_2 = tf.nn.elu(dense(layer_1, 64, "h2", 26 | weight_init=tf_util.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) 27 | vpred_n = dense(layer_2, 1, "hfinal", 28 | weight_init=tf_util.normc_initializer(1.0), bias_init=0, 29 | weight_loss_dict=wd_dict)[:, 0] 30 | sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n)) 31 | wd_loss = tf.get_collection("vf_losses", None) 32 | loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss) 33 | loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n))) 34 | 35 | self._predict = tf_util.function([obs_ph], vpred_n) 36 | 37 | optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1 - 0.9), momentum=0.9, 38 | clip_kl=0.3, epsilon=0.1, stats_decay=0.95, 39 | async_eigen_decomp=True, kfac_update=2, cold_iter=50, 40 | weight_decay_dict=wd_dict, max_grad_norm=None, verbose=verbose) 41 | vf_var_list = [] 42 | for var in tf.trainable_variables(): 43 | if "vf" in var.name: 44 | vf_var_list.append(var) 45 | 46 | update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list) 47 | self.do_update = tf_util.function([obs_ph, vtarg_n], update_op) # pylint: disable=E1101 48 | tf_util.initialize() # Initialize uninitialized TF variables 49 | 50 | @classmethod 51 | def _preproc(cls, path): 52 | """ 53 | preprocess path 54 | 55 | :param path: ({TensorFlow Tensor}) the history of the network 56 | :return: ([TensorFlow Tensor]) processed input 57 | """ 58 | length = path["reward"].shape[0] 59 | # used to be named 'al', unfortunalty we cant seem to know why it was called 'al' or what it means. 60 | # Feel free to fix it if you know what is meant here. 61 | # Could mean 'array_length', but even then we are not sure how this array is useful for the network. 
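# Whatever the original name meant, the column built below behaves as a normalized timestep
# feature (t / 10). Per row, the concatenation returned by _preproc is therefore
#   [ observation (ob_dim * 2) | action-distribution mean and std (ac_dim * 2)
#     | timestep / 10 (1 column) | constant bias of ones (1 column) ]
# which appears to match the obs_ph placeholder width of ob_dim * 2 + ac_dim * 2 + 2
# declared in __init__ above.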
62 | al_capone = np.arange(length).reshape(-1, 1) / 10.0 63 | act = path["action_dist"].astype('float32') 64 | return np.concatenate([path['observation'], act, al_capone, np.ones((length, 1))], axis=1) 65 | 66 | def predict(self, path): 67 | """ 68 | predict value from history 69 | 70 | :param path: ({TensorFlow Tensor}) the history of the network 71 | :return: ([TensorFlow Tensor]) value function output 72 | """ 73 | return self._predict(self._preproc(path)) 74 | 75 | def fit(self, paths, targvals): 76 | """ 77 | fit paths to target values 78 | 79 | :param paths: ({TensorFlow Tensor}) the history of the network 80 | :param targvals: ([TensorFlow Tensor]) the expected value 81 | """ 82 | _input = np.concatenate([self._preproc(p) for p in paths]) 83 | targets = np.concatenate(targvals) 84 | logger.record_tabular("EVBefore", common.explained_variance(self._predict(_input), targets)) 85 | for _ in range(25): 86 | self.do_update(_input, targets) 87 | logger.record_tabular("EVAfter", common.explained_variance(self._predict(_input), targets)) 88 | -------------------------------------------------------------------------------- /stable_baselines/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.bench.monitor import Monitor, load_results 2 | -------------------------------------------------------------------------------- /stable_baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from stable_baselines.common.console_util import fmt_row, fmt_item, colorize 3 | from stable_baselines.common.dataset import Dataset 4 | from stable_baselines.common.math_util import discount, discount_with_boundaries, explained_variance, \ 5 | explained_variance_2d, flatten_arrays, unflatten_vector 6 | from stable_baselines.common.misc_util import zipsame, unpack, EzPickle, set_global_seeds, pretty_eta, RunningAvg,\ 7 | boolean_flag, get_wrapper_by_name, relatively_safe_pickle_dump, pickle_load 8 | from stable_baselines.common.base_class import BaseRLModel, ActorCriticRLModel, OffPolicyRLModel, SetVerbosity, \ 9 | TensorboardWriter 10 | -------------------------------------------------------------------------------- /stable_baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def conjugate_gradient(f_ax, b_vec, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 5 | """ 6 | conjugate gradient calculation (Ax = b), bases on 7 | https://epubs.siam.org/doi/book/10.1137/1.9781611971446 Demmel p 312 8 | 9 | :param f_ax: (function) The function describing the Matrix A dot the vector x 10 | (x being the input parameter of the function) 11 | :param b_vec: (numpy float) vector b, where Ax = b 12 | :param cg_iters: (int) the maximum number of iterations for converging 13 | :param callback: (function) callback the values of x while converging 14 | :param verbose: (bool) print extra information 15 | :param residual_tol: (float) the break point if the residual is below this value 16 | :return: (numpy float) vector x, where Ax = b 17 | """ 18 | first_basis_vect = b_vec.copy() # the first basis vector 19 | residual = b_vec.copy() # the residual 20 | x_var = np.zeros_like(b_vec) # vector x, where Ax = b 21 | residual_dot_residual = residual.dot(residual) # L2 norm of the residual 22 | 23 | fmt_str = "%10i %10.3g %10.3g" 24 | title_str = "%10s %10s %10s" 25 | if verbose: 26 | 
print(title_str % ("iter", "residual norm", "soln norm")) 27 | 28 | for i in range(cg_iters): 29 | if callback is not None: 30 | callback(x_var) 31 | if verbose: 32 | print(fmt_str % (i, residual_dot_residual, np.linalg.norm(x_var))) 33 | z_var = f_ax(first_basis_vect) 34 | v_var = residual_dot_residual / first_basis_vect.dot(z_var) 35 | x_var += v_var * first_basis_vect 36 | residual -= v_var * z_var 37 | new_residual_dot_residual = residual.dot(residual) 38 | mu_val = new_residual_dot_residual / residual_dot_residual 39 | first_basis_vect = residual + mu_val * first_basis_vect 40 | 41 | residual_dot_residual = new_residual_dot_residual 42 | if residual_dot_residual < residual_tol: 43 | break 44 | 45 | if callback is not None: 46 | callback(x_var) 47 | if verbose: 48 | print(fmt_str % (i + 1, residual_dot_residual, np.linalg.norm(x_var))) 49 | return x_var 50 | -------------------------------------------------------------------------------- /stable_baselines/common/cmd_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for scripts like run_atari.py. 3 | """ 4 | 5 | import os 6 | 7 | from mpi4py import MPI 8 | import gym 9 | from gym.wrappers import FlattenDictWrapper 10 | 11 | from stable_baselines import logger 12 | from stable_baselines.bench import Monitor 13 | from stable_baselines.common import set_global_seeds 14 | from stable_baselines.common.atari_wrappers import make_atari, wrap_deepmind 15 | from stable_baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 16 | 17 | 18 | def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0, allow_early_resets=True): 19 | """ 20 | Create a wrapped, monitored SubprocVecEnv for Atari. 21 | 22 | :param env_id: (str) the environment ID 23 | :param num_env: (int) the number of environment you wish to have in subprocesses 24 | :param seed: (int) the inital seed for RNG 25 | :param wrapper_kwargs: (dict) the parameters for wrap_deepmind function 26 | :param start_index: (int) start rank index 27 | :param allow_early_resets: (bool) allows early reset of the environment 28 | :return: (Gym Environment) The atari environment 29 | """ 30 | if wrapper_kwargs is None: 31 | wrapper_kwargs = {} 32 | 33 | def make_env(rank): 34 | def _thunk(): 35 | env = make_atari(env_id) 36 | env.seed(seed + rank) 37 | env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 38 | allow_early_resets=allow_early_resets) 39 | return wrap_deepmind(env, **wrapper_kwargs) 40 | return _thunk 41 | set_global_seeds(seed) 42 | return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) 43 | 44 | 45 | def make_mujoco_env(env_id, seed, allow_early_resets=True): 46 | """ 47 | Create a wrapped, monitored gym.Env for MuJoCo. 48 | 49 | :param env_id: (str) the environment ID 50 | :param seed: (int) the inital seed for RNG 51 | :param allow_early_resets: (bool) allows early reset of the environment 52 | :return: (Gym Environment) The mujoco environment 53 | """ 54 | rank = MPI.COMM_WORLD.Get_rank() 55 | set_global_seeds(seed + 10000 * rank) 56 | env = gym.make(env_id) 57 | env = Monitor(env, os.path.join(logger.get_dir(), str(rank)), allow_early_resets=allow_early_resets) 58 | env.seed(seed) 59 | return env 60 | 61 | 62 | def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True): 63 | """ 64 | Create a wrapped, monitored gym.Env for MuJoCo. 
65 | 66 | :param env_id: (str) the environment ID 67 | :param seed: (int) the inital seed for RNG 68 | :param rank: (int) the rank of the environment (for logging) 69 | :param allow_early_resets: (bool) allows early reset of the environment 70 | :return: (Gym Environment) The robotic environment 71 | """ 72 | set_global_seeds(seed) 73 | env = gym.make(env_id) 74 | env = FlattenDictWrapper(env, ['observation', 'desired_goal']) 75 | env = Monitor( 76 | env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 77 | info_keywords=('is_success',), allow_early_resets=allow_early_resets) 78 | env.seed(seed) 79 | return env 80 | 81 | 82 | def arg_parser(): 83 | """ 84 | Create an empty argparse.ArgumentParser. 85 | 86 | :return: (ArgumentParser) 87 | """ 88 | import argparse 89 | return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 90 | 91 | 92 | def atari_arg_parser(): 93 | """ 94 | Create an argparse.ArgumentParser for run_atari.py. 95 | 96 | :return: (ArgumentParser) parser {'--env': 'BreakoutNoFrameskip-v4', '--seed': 0, '--num-timesteps': int(1e7)} 97 | """ 98 | parser = arg_parser() 99 | parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') 100 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 101 | parser.add_argument('--num-timesteps', type=int, default=int(1e7)) 102 | return parser 103 | 104 | 105 | def mujoco_arg_parser(): 106 | """ 107 | Create an argparse.ArgumentParser for run_mujoco.py. 108 | 109 | :return: (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False} 110 | """ 111 | parser = arg_parser() 112 | parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') 113 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 114 | parser.add_argument('--num-timesteps', type=int, default=int(1e6)) 115 | parser.add_argument('--play', default=False, action='store_true') 116 | return parser 117 | 118 | 119 | def robotics_arg_parser(): 120 | """ 121 | Create an argparse.ArgumentParser for run_mujoco.py. 
122 | 123 | :return: (ArgumentParser) parser {'--env': 'FetchReach-v0', '--seed': 0, '--num-timesteps': int(1e6)} 124 | """ 125 | parser = arg_parser() 126 | parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0') 127 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 128 | parser.add_argument('--num-timesteps', type=int, default=int(1e6)) 129 | return parser 130 | -------------------------------------------------------------------------------- /stable_baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | 11 | def fmt_row(width, row, header=False): 12 | """ 13 | fits a list of items to at least a certain length 14 | 15 | :param width: (int) the minimum width of the string 16 | :param row: ([Any]) a list of object you wish to get the string representation 17 | :param header: (bool) whether or not to return the string as a header 18 | :return: (str) the string representation of all the elements in 'row', of length >= 'width' 19 | """ 20 | out = " | ".join(fmt_item(x, width) for x in row) 21 | if header: 22 | out = out + "\n" + "-" * len(out) 23 | return out 24 | 25 | 26 | def fmt_item(item, min_width): 27 | """ 28 | fits items to a given string length 29 | 30 | :param item: (Any) the item you wish to get the string representation 31 | :param min_width: (int) the minimum width of the string 32 | :return: (str) the string representation of 'x' of length >= 'l' 33 | """ 34 | if isinstance(item, np.ndarray): 35 | assert item.ndim == 0 36 | item = item.item() 37 | if isinstance(item, (float, np.float32, np.float64)): 38 | value = abs(item) 39 | if (value < 1e-4 or value > 1e+4) and value > 0: 40 | rep = "%7.2e" % item 41 | else: 42 | rep = "%7.5f" % item 43 | else: 44 | rep = str(item) 45 | return " " * (min_width - len(rep)) + rep 46 | 47 | 48 | COLOR_TO_NUM = dict( 49 | gray=30, 50 | red=31, 51 | green=32, 52 | yellow=33, 53 | blue=34, 54 | magenta=35, 55 | cyan=36, 56 | white=37, 57 | crimson=38 58 | ) 59 | 60 | 61 | def colorize(string, color, bold=False, highlight=False): 62 | """ 63 | Colorize, bold and/or highlight a string for terminal print 64 | 65 | :param string: (str) input string 66 | :param color: (str) the color, the lookup table is the dict at console_util.color2num 67 | :param bold: (bool) if the string should be bold or not 68 | :param highlight: (bool) if the string should be highlighted or not 69 | :return: (str) the stylized output string 70 | """ 71 | attr = [] 72 | num = COLOR_TO_NUM[color] 73 | if highlight: 74 | num += 10 75 | attr.append(str(num)) 76 | if bold: 77 | attr.append('1') 78 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 79 | -------------------------------------------------------------------------------- /stable_baselines/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Dataset(object): 5 | def __init__(self, data_map, deterministic=False, shuffle=True): 6 | """ 7 | Data loader that handles batches and shuffling. 
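A minimal usage sketch (the arrays here are illustrative, not taken from the project):

    >>> import numpy as np
    >>> dataset = Dataset({'ob': np.arange(10).reshape(5, 2), 'ac': np.arange(5)})
    >>> batch = dataset.next_batch(2)  # dict with 'ob' of shape (2, 2) and 'ac' of shape (2,)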
8 | WARNING: this will alter the given data_map ordering, as dicts are mutable 9 | 10 | :param data_map: (dict) the input data, where every column is a key 11 | :param deterministic: (bool) disables the shuffle function 12 | :param shuffle: (bool) enable auto shuffle 13 | """ 14 | self.data_map = data_map 15 | self.deterministic = deterministic 16 | self.enable_shuffle = shuffle 17 | self.n_samples = next(iter(data_map.values())).shape[0] 18 | self._next_id = 0 19 | self.shuffle() 20 | 21 | def shuffle(self): 22 | """ 23 | shuffles the data_map 24 | """ 25 | if self.deterministic: 26 | return 27 | perm = np.arange(self.n_samples) 28 | np.random.shuffle(perm) 29 | 30 | for key in self.data_map: 31 | self.data_map[key] = self.data_map[key][perm] 32 | 33 | def next_batch(self, batch_size): 34 | """ 35 | returns a batch of data of a given size 36 | 37 | :param batch_size: (int) the size of the batch 38 | :return: (dict) a batch of the input data of size 'batch_size' 39 | """ 40 | if self._next_id >= self.n_samples: 41 | self._next_id = 0 42 | if self.enable_shuffle: 43 | self.shuffle() 44 | 45 | cur_id = self._next_id 46 | cur_batch_size = min(batch_size, self.n_samples - self._next_id) 47 | self._next_id += cur_batch_size 48 | 49 | data_map = dict() 50 | for key in self.data_map: 51 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 52 | return data_map 53 | 54 | def iterate_once(self, batch_size): 55 | """ 56 | generator that iterates over the dataset 57 | 58 | :param batch_size: (int) the size of the batch 59 | :return: (dict) a batch of the input data of size 'batch_size' 60 | """ 61 | if self.enable_shuffle: 62 | self.shuffle() 63 | 64 | while self._next_id <= self.n_samples - batch_size: 65 | yield self.next_batch(batch_size) 66 | self._next_id = 0 67 | 68 | def subset(self, num_elements, deterministic=True): 69 | """ 70 | Return a subset of the current dataset 71 | 72 | :param num_elements: (int) the number of element you wish to have in the subset 73 | :param deterministic: (bool) disables the shuffle function 74 | :return: (Dataset) a new subset of the current Dataset object 75 | """ 76 | data_map = dict() 77 | for key in self.data_map: 78 | data_map[key] = self.data_map[key][:num_elements] 79 | return Dataset(data_map, deterministic) 80 | 81 | 82 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 83 | """ 84 | Iterates over arrays in batches, must provide either num_batches or batch_size, the other must be None. 
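A minimal usage sketch (illustrative values only):

    >>> import numpy as np
    >>> xs, ys = np.arange(6), np.arange(6) * 10
    >>> for batch_x, batch_y in iterbatches((xs, ys), batch_size=4, shuffle=False):
    ...     print(batch_x.shape, batch_y.shape)  # (4,) (4,) then the final partial batch (2,) (2,)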
85 | 86 | :param arrays: (tuple) a tuple of arrays 87 | :param num_batches: (int) the number of batches, must be None is batch_size is defined 88 | :param batch_size: (int) the size of the batch, must be None is num_batches is defined 89 | :param shuffle: (bool) enable auto shuffle 90 | :param include_final_partial_batch: (bool) add the last batch if not the same size as the batch_size 91 | :return: (tuples) a tuple of a batch of the arrays 92 | """ 93 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 94 | arrays = tuple(map(np.asarray, arrays)) 95 | n_samples = arrays[0].shape[0] 96 | assert all(a.shape[0] == n_samples for a in arrays[1:]) 97 | inds = np.arange(n_samples) 98 | if shuffle: 99 | np.random.shuffle(inds) 100 | sections = np.arange(0, n_samples, batch_size)[1:] if num_batches is None else num_batches 101 | for batch_inds in np.array_split(inds, sections): 102 | if include_final_partial_batch or len(batch_inds) == batch_size: 103 | yield tuple(a[batch_inds] for a in arrays) 104 | -------------------------------------------------------------------------------- /stable_baselines/common/identity_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym import Env 4 | from gym.spaces import Discrete, MultiDiscrete, MultiBinary, Box 5 | 6 | 7 | class IdentityEnv(Env): 8 | def __init__(self, dim, ep_length=100): 9 | """ 10 | Identity environment for testing purposes 11 | 12 | :param dim: (int) the size of the dimensions you want to learn 13 | :param ep_length: (int) the length of each episodes in timesteps 14 | """ 15 | self.action_space = Discrete(dim) 16 | self.observation_space = self.action_space 17 | self.ep_length = ep_length 18 | self.current_step = 0 19 | self.dim = dim 20 | self.reset() 21 | 22 | def reset(self): 23 | self.current_step = 0 24 | self._choose_next_state() 25 | return self.state 26 | 27 | def step(self, action): 28 | reward = self._get_reward(action) 29 | self._choose_next_state() 30 | self.current_step += 1 31 | done = self.current_step >= self.ep_length 32 | return self.state, reward, done, {} 33 | 34 | def _choose_next_state(self): 35 | self.state = self.action_space.sample() 36 | 37 | def _get_reward(self, action): 38 | return 1 if np.all(self.state == action) else 0 39 | 40 | def render(self, mode='human'): 41 | pass 42 | 43 | 44 | class IdentityEnvBox(IdentityEnv): 45 | def __init__(self, low=-1, high=1, eps=0.05, ep_length=100): 46 | """ 47 | Identity environment for testing purposes 48 | 49 | :param dim: (int) the size of the dimensions you want to learn 50 | :param low: (float) the lower bound of the box dim 51 | :param high: (float) the upper bound of the box dim 52 | :param eps: (float) the epsilon bound for correct value 53 | :param ep_length: (int) the length of each episodes in timesteps 54 | """ 55 | super(IdentityEnvBox, self).__init__(1, ep_length) 56 | self.action_space = Box(low=low, high=high, shape=(1,), dtype=np.float32) 57 | self.observation_space = self.action_space 58 | self.eps = eps 59 | self.reset() 60 | 61 | def reset(self): 62 | self.current_step = 0 63 | self._choose_next_state() 64 | return self.state 65 | 66 | def step(self, action): 67 | reward = self._get_reward(action) 68 | self._choose_next_state() 69 | self.current_step += 1 70 | done = self.current_step >= self.ep_length 71 | return self.state, reward, done, {} 72 | 73 | def _choose_next_state(self): 74 | self.state = 
self.observation_space.sample() 75 | 76 | def _get_reward(self, action): 77 | return 1 if (self.state - self.eps) <= action <= (self.state + self.eps) else 0 78 | 79 | 80 | class IdentityEnvMultiDiscrete(IdentityEnv): 81 | def __init__(self, dim, ep_length=100): 82 | """ 83 | Identity environment for testing purposes 84 | 85 | :param dim: (int) the size of the dimensions you want to learn 86 | :param ep_length: (int) the length of each episodes in timesteps 87 | """ 88 | super(IdentityEnvMultiDiscrete, self).__init__(dim, ep_length) 89 | self.action_space = MultiDiscrete([dim, dim]) 90 | self.observation_space = self.action_space 91 | self.reset() 92 | 93 | 94 | class IdentityEnvMultiBinary(IdentityEnv): 95 | def __init__(self, dim, ep_length=100): 96 | """ 97 | Identity environment for testing purposes 98 | 99 | :param dim: (int) the size of the dimensions you want to learn 100 | :param ep_length: (int) the length of each episodes in timesteps 101 | """ 102 | super(IdentityEnvMultiBinary, self).__init__(dim, ep_length) 103 | self.action_space = MultiBinary(dim) 104 | self.observation_space = self.action_space 105 | self.reset() 106 | -------------------------------------------------------------------------------- /stable_baselines/common/input.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym.spaces import Discrete, Box, MultiBinary, MultiDiscrete 4 | 5 | 6 | def observation_input(ob_space, batch_size=None, name='Ob', scale=False): 7 | """ 8 | Build observation input with encoding depending on the observation space type 9 | 10 | When using Box ob_space, the input will be normalized between [1, 0] on the bounds ob_space.low and ob_space.high. 11 | 12 | :param ob_space: (Gym Space) The observation space 13 | :param batch_size: (int) batch size for input 14 | (default is None, so that resulting input placeholder can take tensors with any batch size) 15 | :param name: (str) tensorflow variable name for input placeholder 16 | :param scale: (bool) whether or not to scale the input 17 | :return: (TensorFlow Tensor, TensorFlow Tensor) input_placeholder, processed_input_tensor 18 | """ 19 | if isinstance(ob_space, Discrete): 20 | input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name) 21 | processed_x = tf.to_float(tf.one_hot(input_x, ob_space.n)) 22 | return input_x, processed_x 23 | 24 | elif isinstance(ob_space, Box): 25 | input_x = tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name) 26 | processed_x = tf.to_float(input_x) 27 | # rescale to [1, 0] if the bounds are defined 28 | if (scale and 29 | not np.any(np.isinf(ob_space.low)) and not np.any(np.isinf(ob_space.high)) and 30 | np.any((ob_space.high - ob_space.low) != 0)): 31 | 32 | # equivalent to processed_x / 255.0 when bounds are set to [255, 0] 33 | processed_x = ((processed_x - ob_space.low) / (ob_space.high - ob_space.low)) 34 | return input_x, processed_x 35 | 36 | elif isinstance(ob_space, MultiBinary): 37 | input_x = tf.placeholder(shape=(batch_size, ob_space.n), dtype=tf.int32, name=name) 38 | processed_x = tf.to_float(input_x) 39 | return input_x, processed_x 40 | 41 | elif isinstance(ob_space, MultiDiscrete): 42 | input_x = tf.placeholder(shape=(batch_size, len(ob_space.nvec)), dtype=tf.int32, name=name) 43 | processed_x = tf.concat([tf.to_float(tf.one_hot(input_split, ob_space.nvec[i])) 44 | for i, input_split in enumerate(tf.split(input_x, len(ob_space.nvec), axis=-1))], 45 | 
axis=-1) 46 | return input_x, processed_x 47 | 48 | else: 49 | raise NotImplementedError("Error: the model does not support input space of type {}".format( 50 | type(ob_space).__name__)) 51 | -------------------------------------------------------------------------------- /stable_baselines/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(vector, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of vector x. 8 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], 9 | where k = len(x) - t - 1 10 | 11 | :param vector: (np.ndarray) the input vector 12 | :param gamma: (float) the discount value 13 | :return: (np.ndarray) the output vector 14 | """ 15 | assert vector.ndim >= 1 16 | return scipy.signal.lfilter([1], [1, -gamma], vector[::-1], axis=0)[::-1] 17 | 18 | 19 | def explained_variance(y_pred, y_true): 20 | """ 21 | Computes fraction of variance that ypred explains about y. 22 | Returns 1 - Var[y-ypred] / Var[y] 23 | 24 | interpretation: 25 | ev=0 => might as well have predicted zero 26 | ev=1 => perfect prediction 27 | ev<0 => worse than just predicting zero 28 | 29 | :param y_pred: (np.ndarray) the prediction 30 | :param y_true: (np.ndarray) the expected value 31 | :return: (float) explained variance of ypred and y 32 | """ 33 | assert y_true.ndim == 1 and y_pred.ndim == 1 34 | var_y = np.var(y_true) 35 | return np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y 36 | 37 | 38 | def explained_variance_2d(y_pred, y_true): 39 | """ 40 | Computes fraction of variance that ypred explains about y, for 2D arrays. 41 | Returns 1 - Var[y-ypred] / Var[y] 42 | 43 | interpretation: 44 | ev=0 => might as well have predicted zero 45 | ev=1 => perfect prediction 46 | ev<0 => worse than just predicting zero 47 | 48 | :param y_pred: (np.ndarray) the prediction 49 | :param y_true: (np.ndarray) the expected value 50 | :return: (float) explained variance of ypred and y 51 | """ 52 | assert y_true.ndim == 2 and y_pred.ndim == 2 53 | var_y = np.var(y_true, axis=0) 54 | explained_var = 1 - np.var(y_true - y_pred) / var_y 55 | explained_var[var_y < 1e-10] = 0 56 | return explained_var 57 | 58 | 59 | def flatten_arrays(arrs): 60 | """ 61 | flattens a list of arrays down to 1D 62 | 63 | :param arrs: ([np.ndarray]) arrays 64 | :return: (np.ndarray) 1D flattend array 65 | """ 66 | return np.concatenate([arr.flat for arr in arrs]) 67 | 68 | 69 | def unflatten_vector(vec, shapes): 70 | """ 71 | reshape a flattened array 72 | 73 | :param vec: (np.ndarray) 1D arrays 74 | :param shapes: (tuple) 75 | :return: ([np.ndarray]) reshaped array 76 | """ 77 | i = 0 78 | arrs = [] 79 | for shape in shapes: 80 | size = np.prod(shape) 81 | arr = vec[i:i + size].reshape(shape) 82 | arrs.append(arr) 83 | i += size 84 | return arrs 85 | 86 | 87 | def discount_with_boundaries(rewards, episode_starts, gamma): 88 | """ 89 | computes discounted sums along 0th dimension of x (reward), while taking into account the start of each episode. 90 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... 
+ gamma^k x[t+k], 91 | where k = len(x) - t - 1 92 | 93 | :param rewards: (np.ndarray) the input vector (rewards) 94 | :param episode_starts: (np.ndarray) 2d array of bools, indicating when a new episode has started 95 | :param gamma: (float) the discount factor 96 | :return: (np.ndarray) the output vector (discounted rewards) 97 | """ 98 | discounted_rewards = np.zeros_like(rewards) 99 | n_samples = rewards.shape[0] 100 | discounted_rewards[n_samples - 1] = rewards[n_samples - 1] 101 | for step in range(n_samples - 2, -1, -1): 102 | discounted_rewards[step] = rewards[step] + gamma * discounted_rewards[step + 1] * (1 - episode_starts[step + 1]) 103 | return discounted_rewards 104 | -------------------------------------------------------------------------------- /stable_baselines/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | #from mpi4py import MPI 4 | 5 | import stable_baselines.common.tf_util as tf_utils 6 | 7 | 8 | class MpiAdam(object): 9 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None, 10 | sess=None): 11 | """ 12 | A parallel MPI implementation of the Adam optimizer for TensorFlow 13 | https://arxiv.org/abs/1412.6980 14 | 15 | :param var_list: ([TensorFlow Tensor]) the variables 16 | :param beta1: (float) Adam beta1 parameter 17 | :param beta2: (float) Adam beta1 parameter 18 | :param epsilon: (float) to help with preventing arithmetic issues 19 | :param scale_grad_by_procs: (bool) if the scaling should be done by processes 20 | :param comm: (MPI Communicators) if None, MPI.COMM_WORLD 21 | :param sess: (TensorFlow Session) if None, tf.get_default_session() 22 | """ 23 | self.var_list = var_list 24 | self.beta1 = beta1 25 | self.beta2 = beta2 26 | self.epsilon = epsilon 27 | self.scale_grad_by_procs = scale_grad_by_procs 28 | size = sum(tf_utils.numel(v) for v in var_list) 29 | # Exponential moving average of gradient values 30 | # "first moment estimate" m in the paper 31 | self.exp_avg = np.zeros(size, 'float32') 32 | # Exponential moving average of squared gradient values 33 | # "second raw moment estimate" v in the paper 34 | self.exp_avg_sq = np.zeros(size, 'float32') 35 | self.step = 0 36 | self.setfromflat = tf_utils.SetFromFlat(var_list, sess=sess) 37 | self.getflat = tf_utils.GetFlat(var_list, sess=sess) 38 | self.comm = MPI.COMM_WORLD if comm is None else comm 39 | 40 | def update(self, local_grad, learning_rate): 41 | """ 42 | update the values of the graph 43 | 44 | :param local_grad: (numpy float) the gradient 45 | :param learning_rate: (float) the learning_rate for the update 46 | """ 47 | if self.step % 100 == 0: 48 | self.check_synced() 49 | local_grad = local_grad.astype('float32') 50 | global_grad = np.zeros_like(local_grad) 51 | self.comm.Allreduce(local_grad, global_grad, op=MPI.SUM) 52 | if self.scale_grad_by_procs: 53 | global_grad /= self.comm.Get_size() 54 | 55 | self.step += 1 56 | # Learning rate with bias correction 57 | step_size = learning_rate * np.sqrt(1 - self.beta2 ** self.step) / (1 - self.beta1 ** self.step) 58 | # Decay the first and second moment running average coefficient 59 | self.exp_avg = self.beta1 * self.exp_avg + (1 - self.beta1) * global_grad 60 | self.exp_avg_sq = self.beta2 * self.exp_avg_sq + (1 - self.beta2) * (global_grad * global_grad) 61 | step = (- step_size) * self.exp_avg / (np.sqrt(self.exp_avg_sq) + self.epsilon) 62 | self.setfromflat(self.getflat() + step) 63 
| 64 | def sync(self): 65 | """ 66 | syncronize the MPI threads 67 | """ 68 | theta = self.getflat() 69 | self.comm.Bcast(theta, root=0) 70 | self.setfromflat(theta) 71 | 72 | def check_synced(self): 73 | """ 74 | confirm the MPI threads are synced 75 | """ 76 | if self.comm.Get_rank() == 0: # this is root 77 | theta = self.getflat() 78 | self.comm.Bcast(theta, root=0) 79 | else: 80 | thetalocal = self.getflat() 81 | thetaroot = np.empty_like(thetalocal) 82 | self.comm.Bcast(thetaroot, root=0) 83 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 84 | 85 | 86 | @tf_utils.in_session 87 | def test_mpi_adam(): 88 | """ 89 | tests the MpiAdam object's functionality 90 | """ 91 | np.random.seed(0) 92 | tf.set_random_seed(0) 93 | 94 | a_var = tf.Variable(np.random.randn(3).astype('float32')) 95 | b_var = tf.Variable(np.random.randn(2, 5).astype('float32')) 96 | loss = tf.reduce_sum(tf.square(a_var)) + tf.reduce_sum(tf.sin(b_var)) 97 | 98 | learning_rate = 1e-2 99 | update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) 100 | do_update = tf_utils.function([], loss, updates=[update_op]) 101 | 102 | tf.get_default_session().run(tf.global_variables_initializer()) 103 | for step in range(10): 104 | print(step, do_update()) 105 | 106 | tf.set_random_seed(0) 107 | tf.get_default_session().run(tf.global_variables_initializer()) 108 | 109 | var_list = [a_var, b_var] 110 | lossandgrad = tf_utils.function([], [loss, tf_utils.flatgrad(loss, var_list)], updates=[update_op]) 111 | adam = MpiAdam(var_list) 112 | 113 | for step in range(10): 114 | loss, grad = lossandgrad() 115 | adam.update(grad, learning_rate) 116 | print(step, loss) 117 | 118 | 119 | if __name__ == "__main__": 120 | # Run with mpirun -np 2 python 121 | test_mpi_adam() 122 | -------------------------------------------------------------------------------- /stable_baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | 5 | 6 | def mpi_fork(rank, bind_to_core=False): 7 | """ 8 | Re-launches the current script with workers 9 | Returns "parent" for original parent, "child" for MPI children 10 | 11 | :param rank: (int) the rank 12 | :param bind_to_core: (bool) enables binding to core 13 | :return: (str) the correct type of thread name 14 | """ 15 | if rank <= 1: 16 | return "child" 17 | if os.getenv("IN_MPI") is None: 18 | env = os.environ.copy() 19 | env.update( 20 | MKL_NUM_THREADS="1", 21 | OMP_NUM_THREADS="1", 22 | IN_MPI="1" 23 | ) 24 | args = ["mpirun", "-np", str(rank)] 25 | if bind_to_core: 26 | args += ["-bind-to", "core"] 27 | args += [sys.executable] + sys.argv 28 | subprocess.check_call(args, env=env) 29 | return "parent" 30 | else: 31 | return "child" 32 | -------------------------------------------------------------------------------- /stable_baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | 4 | from stable_baselines.common import zipsame 5 | 6 | 7 | def mpi_mean(arr, axis=0, comm=None, keepdims=False): 8 | """ 9 | calculates the mean of an array, using MPI 10 | 11 | :param arr: (np.ndarray) 12 | :param axis: (int or tuple or list) the axis to run the means over 13 | :param comm: (MPI Communicators) if None, MPI.COMM_WORLD 14 | :param keepdims: (bool) keep the other dimensions intact 15 | :return: (np.ndarray or Number) the result of the sum 16 | """ 17 | arr = np.asarray(arr) 18 | 
assert arr.ndim > 0 19 | if comm is None: 20 | comm = MPI.COMM_WORLD 21 | xsum = arr.sum(axis=axis, keepdims=keepdims) 22 | size = xsum.size 23 | localsum = np.zeros(size + 1, arr.dtype) 24 | localsum[:size] = xsum.ravel() 25 | localsum[size] = arr.shape[axis] 26 | globalsum = np.zeros_like(localsum) 27 | comm.Allreduce(localsum, globalsum, op=MPI.SUM) 28 | return globalsum[:size].reshape(xsum.shape) / globalsum[size], globalsum[size] 29 | 30 | 31 | def mpi_moments(arr, axis=0, comm=None, keepdims=False): 32 | """ 33 | calculates the mean and std of an array, using MPI 34 | 35 | :param arr: (np.ndarray) 36 | :param axis: (int or tuple or list) the axis to run the moments over 37 | :param comm: (MPI Communicators) if None, MPI.COMM_WORLD 38 | :param keepdims: (bool) keep the other dimensions intact 39 | :return: (np.ndarray or Number) the result of the moments 40 | """ 41 | arr = np.asarray(arr) 42 | assert arr.ndim > 0 43 | mean, count = mpi_mean(arr, axis=axis, comm=comm, keepdims=True) 44 | sqdiffs = np.square(arr - mean) 45 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) 46 | assert count1 == count 47 | std = np.sqrt(meansqdiff) 48 | if not keepdims: 49 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 50 | mean = mean.reshape(newshape) 51 | std = std.reshape(newshape) 52 | return mean, std, count 53 | 54 | 55 | def _helper_runningmeanstd(): 56 | comm = MPI.COMM_WORLD 57 | np.random.seed(0) 58 | for (triple, axis) in [ 59 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)), 0), 60 | ((np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)), 0), 61 | ((np.random.randn(2, 3), np.random.randn(2, 4), np.random.randn(2, 4)), 1)]: 62 | 63 | arr = np.concatenate(triple, axis=axis) 64 | ms1 = [arr.mean(axis=axis), arr.std(axis=axis), arr.shape[axis]] 65 | 66 | ms2 = mpi_moments(triple[comm.Get_rank()], axis=axis) 67 | 68 | for (res_1, res_2) in zipsame(ms1, ms2): 69 | print(res_1, res_2) 70 | assert np.allclose(res_1, res_2) 71 | print("ok!") 72 | -------------------------------------------------------------------------------- /stable_baselines/common/mpi_running_mean_std.py: -------------------------------------------------------------------------------- 1 | #from mpi4py import MPI 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | import stable_baselines.common.tf_util as tf_util 6 | 7 | 8 | class RunningMeanStd(object): 9 | def __init__(self, epsilon=1e-2, shape=()): 10 | """ 11 | calulates the running mean and std of a data stream 12 | https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 13 | 14 | :param epsilon: (float) helps with arithmetic issues 15 | :param shape: (tuple) the shape of the data stream's output 16 | """ 17 | self._sum = tf.get_variable( 18 | dtype=tf.float64, 19 | shape=shape, 20 | initializer=tf.constant_initializer(0.0), 21 | name="runningsum", trainable=False) 22 | self._sumsq = tf.get_variable( 23 | dtype=tf.float64, 24 | shape=shape, 25 | initializer=tf.constant_initializer(epsilon), 26 | name="runningsumsq", trainable=False) 27 | self._count = tf.get_variable( 28 | dtype=tf.float64, 29 | shape=(), 30 | initializer=tf.constant_initializer(epsilon), 31 | name="count", trainable=False) 32 | self.shape = shape 33 | 34 | self.mean = tf.to_float(self._sum / self._count) 35 | self.std = tf.sqrt(tf.maximum(tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2)) 36 | 37 | newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') 38 | newsumsq = 
tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') 39 | newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') 40 | self.incfiltparams = tf_util.function([newsum, newsumsq, newcount], [], 41 | updates=[tf.assign_add(self._sum, newsum), 42 | tf.assign_add(self._sumsq, newsumsq), 43 | tf.assign_add(self._count, newcount)]) 44 | 45 | def update(self, data): 46 | """ 47 | update the running mean and std 48 | 49 | :param data: (np.ndarray) the data 50 | """ 51 | data = data.astype('float64') 52 | data_size = int(np.prod(self.shape)) 53 | totalvec = np.zeros(data_size * 2 + 1, 'float64') 54 | addvec = np.concatenate([data.sum(axis=0).ravel(), np.square(data).sum(axis=0).ravel(), 55 | np.array([len(data)], dtype='float64')]) 56 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 57 | self.incfiltparams(totalvec[0: data_size].reshape(self.shape), 58 | totalvec[data_size: 2 * data_size].reshape(self.shape), totalvec[2 * data_size]) 59 | 60 | 61 | @tf_util.in_session 62 | def test_dist(): 63 | """ 64 | test the running mean std 65 | """ 66 | np.random.seed(0) 67 | p_1, p_2, p_3 = (np.random.randn(3, 1), np.random.randn(4, 1), np.random.randn(5, 1)) 68 | q_1, q_2, q_3 = (np.random.randn(6, 1), np.random.randn(7, 1), np.random.randn(8, 1)) 69 | 70 | comm = MPI.COMM_WORLD 71 | assert comm.Get_size() == 2 72 | if comm.Get_rank() == 0: 73 | x_1, x_2, x_3 = p_1, p_2, p_3 74 | elif comm.Get_rank() == 1: 75 | x_1, x_2, x_3 = q_1, q_2, q_3 76 | else: 77 | assert False 78 | 79 | rms = RunningMeanStd(epsilon=0.0, shape=(1,)) 80 | tf_util.initialize() 81 | 82 | rms.update(x_1) 83 | rms.update(x_2) 84 | rms.update(x_3) 85 | 86 | bigvec = np.concatenate([p_1, p_2, p_3, q_1, q_2, q_3]) 87 | 88 | def checkallclose(var_1, var_2): 89 | print(var_1, var_2) 90 | return np.allclose(var_1, var_2) 91 | 92 | assert checkallclose( 93 | bigvec.mean(axis=0), 94 | rms.mean.eval(), 95 | ) 96 | assert checkallclose( 97 | bigvec.std(axis=0), 98 | rms.std.eval(), 99 | ) 100 | 101 | 102 | if __name__ == "__main__": 103 | # Run with mpirun -np 2 python 104 | test_dist() 105 | -------------------------------------------------------------------------------- /stable_baselines/common/runners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | 5 | class AbstractEnvRunner(ABC): 6 | def __init__(self, *, env, model, n_steps): 7 | """ 8 | A runner to learn the policy of an environment for a model 9 | 10 | :param env: (Gym environment) The environment to learn from 11 | :param model: (Model) The model to learn 12 | :param n_steps: (int) The number of steps to run for each environment 13 | """ 14 | self.env = env 15 | self.model = model 16 | n_env = env.num_envs 17 | self.batch_ob_shape = (n_env*n_steps,) + env.observation_space.shape 18 | self.obs = np.zeros((n_env,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) 19 | self.obs[:] = env.reset() 20 | self.n_steps = n_steps 21 | self.states = model.initial_state 22 | self.dones = [False for _ in range(n_env)] 23 | 24 | @abstractmethod 25 | def run(self): 26 | """ 27 | Run a learning step of the model 28 | """ 29 | raise NotImplementedError 30 | -------------------------------------------------------------------------------- /stable_baselines/common/running_mean_std.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningMeanStd(object): 5 | def __init__(self, 
epsilon=1e-4, shape=()): 6 | """ 7 | calulates the running mean and std of a data stream 8 | https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 9 | 10 | :param epsilon: (float) helps with arithmetic issues 11 | :param shape: (tuple) the shape of the data stream's output 12 | """ 13 | self.mean = np.zeros(shape, 'float64') 14 | self.var = np.ones(shape, 'float64') 15 | self.count = epsilon 16 | 17 | def update(self, arr): 18 | batch_mean = np.mean(arr, axis=0) 19 | batch_var = np.var(arr, axis=0) 20 | batch_count = arr.shape[0] 21 | self.update_from_moments(batch_mean, batch_var, batch_count) 22 | 23 | def update_from_moments(self, batch_mean, batch_var, batch_count): 24 | delta = batch_mean - self.mean 25 | tot_count = self.count + batch_count 26 | 27 | new_mean = self.mean + delta * batch_count / tot_count 28 | m_a = self.var * self.count 29 | m_b = batch_var * batch_count 30 | m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) 31 | new_var = m_2 / (self.count + batch_count) 32 | 33 | new_count = batch_count + self.count 34 | 35 | self.mean = new_mean 36 | self.var = new_var 37 | self.count = new_count 38 | -------------------------------------------------------------------------------- /stable_baselines/common/running_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningStat(object): 5 | def __init__(self, shape): 6 | """ 7 | calulates the running mean and std of a data stream 8 | http://www.johndcook.com/blog/standard_deviation/ 9 | 10 | :param shape: (tuple) the shape of the data stream's output 11 | """ 12 | self._step = 0 13 | self._mean = np.zeros(shape) 14 | self._std = np.zeros(shape) 15 | 16 | def push(self, value): 17 | """ 18 | update the running mean and std 19 | 20 | :param value: (np.ndarray) the data 21 | """ 22 | value = np.asarray(value) 23 | assert value.shape == self._mean.shape 24 | self._step += 1 25 | if self._step == 1: 26 | self._mean[...] = value 27 | else: 28 | old_m = self._mean.copy() 29 | self._mean[...] = old_m + (value - old_m) / self._step 30 | self._std[...] 
= self._std + (value - old_m) * (value - self._mean) 31 | 32 | @property 33 | def n(self): 34 | """ 35 | the number of data points 36 | 37 | :return: (int) 38 | """ 39 | return self._step 40 | 41 | @property 42 | def mean(self): 43 | """ 44 | the average value 45 | 46 | :return: (float) 47 | """ 48 | return self._mean 49 | 50 | @property 51 | def var(self): 52 | """ 53 | the variation of the data points 54 | 55 | :return: (float) 56 | """ 57 | return self._std / (self._step - 1) if self._step > 1 else np.square(self._mean) 58 | 59 | @property 60 | def std(self): 61 | """ 62 | the standard deviation of the data points 63 | 64 | :return: (float) 65 | """ 66 | return np.sqrt(self.var) 67 | 68 | @property 69 | def shape(self): 70 | """ 71 | the shape of the data points 72 | 73 | :return: (tuple) 74 | """ 75 | return self._mean.shape 76 | -------------------------------------------------------------------------------- /stable_baselines/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | 4 | - learning rate for the optimizer 5 | - exploration epsilon for the epsilon greedy exploration strategy 6 | - beta parameter for beta parameter in prioritized replay 7 | 8 | Each schedule has a function `value(t)` which returns the current value 9 | of the parameter given the timestep t of the optimization procedure. 10 | """ 11 | 12 | 13 | class Schedule(object): 14 | def value(self, step): 15 | """ 16 | Value of the schedule for a given timestep 17 | 18 | :param step: (int) the timestep 19 | :return: (float) the output value for the given timestep 20 | """ 21 | raise NotImplementedError 22 | 23 | 24 | class ConstantSchedule(Schedule): 25 | """ 26 | Value remains constant over time. 27 | 28 | :param value: (float) Constant value of the schedule 29 | """ 30 | 31 | def __init__(self, value): 32 | self._value = value 33 | 34 | def value(self, step): 35 | return self._value 36 | 37 | 38 | def linear_interpolation(left, right, alpha): 39 | """ 40 | Linear interpolation between `left` and `right`. 41 | 42 | :param left: (float) left boundary 43 | :param right: (float) right boundary 44 | :param alpha: (float) coeff in [0, 1] 45 | :return: (float) 46 | """ 47 | 48 | return left + alpha * (right - left) 49 | 50 | 51 | class PiecewiseSchedule(Schedule): 52 | """ 53 | Piecewise schedule. 54 | 55 | :param endpoints: ([(int, int)]) 56 | list of pairs `(time, value)` meanining that schedule should output 57 | `value` when `t==time`. All the values for time must be sorted in 58 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 59 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 60 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 61 | time passed between `time_a` and `time_b` for time `t`. 62 | :param interpolation: (lambda (float, float, float): float) 63 | a function that takes value to the left and to the right of t according 64 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 65 | right endpoint that t has covered. See linear_interpolation for example. 66 | :param outside_value: (float) 67 | if the value is requested outside of all the intervals sepecified in 68 | `endpoints` this value is returned. If None then AssertionError is 69 | raised when outside value is requested. 
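        (Illustrative example, not part of the original docstring: with
        endpoints=[(0, 1.0), (100, 0.1)] and linear interpolation, value(50)
        returns 0.55, while value(200) falls outside every interval and returns
        outside_value, raising an AssertionError if outside_value is None.)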
70 | """ 71 | 72 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 73 | idxes = [e[0] for e in endpoints] 74 | assert idxes == sorted(idxes) 75 | self._interpolation = interpolation 76 | self._outside_value = outside_value 77 | self._endpoints = endpoints 78 | 79 | def value(self, step): 80 | for (left_t, left), (right_t, right) in zip(self._endpoints[:-1], self._endpoints[1:]): 81 | if left_t <= step < right_t: 82 | alpha = float(step - left_t) / (right_t - left_t) 83 | return self._interpolation(left, right, alpha) 84 | 85 | # t does not belong to any of the pieces, so doom. 86 | assert self._outside_value is not None 87 | return self._outside_value 88 | 89 | 90 | class LinearSchedule(Schedule): 91 | """ 92 | Linear interpolation between initial_p and final_p over 93 | schedule_timesteps. After this many timesteps pass final_p is 94 | returned. 95 | 96 | :param schedule_timesteps: (int) Number of timesteps for which to linearly anneal initial_p to final_p 97 | :param initial_p: (float) initial output value 98 | :param final_p: (float) final output value 99 | """ 100 | 101 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 102 | self.schedule_timesteps = schedule_timesteps 103 | self.final_p = final_p 104 | self.initial_p = initial_p 105 | 106 | def value(self, step): 107 | fraction = min(float(step) / self.schedule_timesteps, 1.0) 108 | return self.initial_p + fraction * (self.final_p - self.initial_p) 109 | -------------------------------------------------------------------------------- /stable_baselines/common/segment_tree.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | 4 | class SegmentTree(object): 5 | def __init__(self, capacity, operation, neutral_element): 6 | """ 7 | Build a Segment Tree data structure. 8 | 9 | https://en.wikipedia.org/wiki/Segment_tree 10 | 11 | Can be used as regular array, but with two 12 | important differences: 13 | 14 | a) setting item's value is slightly slower. 15 | It is O(lg capacity) instead of O(1). 16 | b) user has access to an efficient ( O(log segment size) ) 17 | `reduce` operation which reduces `operation` over 18 | a contiguous subsequence of items in the array. 19 | 20 | :param capacity: (int) Total size of the array - must be a power of two. 21 | :param operation: (lambda (Any, Any): Any) operation for combining elements (eg. sum, max) must form a 22 | mathematical group together with the set of possible values for array elements (i.e. be associative) 23 | :param neutral_element: (Any) neutral element for the operation above. eg. float('-inf') for max and 0 for sum. 24 | """ 25 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 
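        # (Added note, not in the original file:) the tree is stored as a flat
        # list of length 2 * capacity: the root sits at index 1, the children of
        # node i are 2 * i and 2 * i + 1, and the leaves occupy indices
        # capacity .. 2 * capacity - 1, which is why __setitem__ below adds
        # capacity to the external index before bubbling the update up to the root.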
26 | self._capacity = capacity 27 | self._value = [neutral_element for _ in range(2 * capacity)] 28 | self._operation = operation 29 | 30 | def _reduce_helper(self, start, end, node, node_start, node_end): 31 | if start == node_start and end == node_end: 32 | return self._value[node] 33 | mid = (node_start + node_end) // 2 34 | if end <= mid: 35 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 36 | else: 37 | if mid + 1 <= start: 38 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 39 | else: 40 | return self._operation( 41 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 42 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 43 | ) 44 | 45 | def reduce(self, start=0, end=None): 46 | """ 47 | Returns result of applying `self.operation` 48 | to a contiguous subsequence of the array. 49 | 50 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 51 | 52 | :param start: (int) beginning of the subsequence 53 | :param end: (int) end of the subsequences 54 | :return: (Any) result of reducing self.operation over the specified range of array elements. 55 | """ 56 | if end is None: 57 | end = self._capacity 58 | if end < 0: 59 | end += self._capacity 60 | end -= 1 61 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 62 | 63 | def __setitem__(self, idx, val): 64 | # index of the leaf 65 | idx += self._capacity 66 | self._value[idx] = val 67 | idx //= 2 68 | while idx >= 1: 69 | self._value[idx] = self._operation( 70 | self._value[2 * idx], 71 | self._value[2 * idx + 1] 72 | ) 73 | idx //= 2 74 | 75 | def __getitem__(self, idx): 76 | assert 0 <= idx < self._capacity 77 | return self._value[self._capacity + idx] 78 | 79 | 80 | class SumSegmentTree(SegmentTree): 81 | def __init__(self, capacity): 82 | super(SumSegmentTree, self).__init__( 83 | capacity=capacity, 84 | operation=operator.add, 85 | neutral_element=0.0 86 | ) 87 | 88 | def sum(self, start=0, end=None): 89 | """ 90 | Returns arr[start] + ... + arr[end] 91 | 92 | :param start: (int) start position of the reduction (must be >= 0) 93 | :param end: (int) end position of the reduction (must be < len(arr), can be None for len(arr) - 1) 94 | :return: (Any) reduction of SumSegmentTree 95 | """ 96 | return super(SumSegmentTree, self).reduce(start, end) 97 | 98 | def find_prefixsum_idx(self, prefixsum): 99 | """ 100 | Find the highest index `i` in the array such that 101 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 102 | 103 | if array values are probabilities, this function 104 | allows to sample indexes according to the discrete 105 | probability efficiently. 
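        (Illustrative example, not part of the original docstring: for stored
        values [1, 2, 3, 0] with capacity 4, find_prefixsum_idx(2.5) returns 1,
        because arr[0] = 1 <= 2.5 while arr[0] + arr[1] = 3 > 2.5.)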
106 | 107 | :param prefixsum: (float) upperbound on the sum of array prefix 108 | :return: (int) highest index satisfying the prefixsum constraint 109 | """ 110 | assert 0 <= prefixsum <= self.sum() + 1e-5 111 | idx = 1 112 | while idx < self._capacity: # while non-leaf 113 | if self._value[2 * idx] > prefixsum: 114 | idx = 2 * idx 115 | else: 116 | prefixsum -= self._value[2 * idx] 117 | idx = 2 * idx + 1 118 | return idx - self._capacity 119 | 120 | 121 | class MinSegmentTree(SegmentTree): 122 | def __init__(self, capacity): 123 | super(MinSegmentTree, self).__init__( 124 | capacity=capacity, 125 | operation=min, 126 | neutral_element=float('inf') 127 | ) 128 | 129 | def min(self, start=0, end=None): 130 | """ 131 | Returns min(arr[start], ..., arr[end]) 132 | 133 | :param start: (int) start position of the reduction (must be >= 0) 134 | :param end: (int) end position of the reduction (must be < len(arr), can be None for len(arr) - 1) 135 | :return: (Any) reduction of MinSegmentTree 136 | """ 137 | return super(MinSegmentTree, self).reduce(start, end) 138 | -------------------------------------------------------------------------------- /stable_baselines/common/tile_images.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def tile_images(img_nhwc): 5 | """ 6 | Tile N images into one big PxQ image 7 | (P,Q) are chosen to be as close as possible, and if N 8 | is square, then P=Q. 9 | 10 | :param img_nhwc: (list) list or array of images, ndim=4 once turned into array. img nhwc 11 | n = batch index, h = height, w = width, c = channel 12 | :return: (numpy float) img_HWc, ndim=3 13 | """ 14 | img_nhwc = np.asarray(img_nhwc) 15 | n_images, height, width, n_channels = img_nhwc.shape 16 | # new_height was named H before 17 | new_height = int(np.ceil(np.sqrt(n_images))) 18 | # new_width was named W before 19 | new_width = int(np.ceil(float(n_images) / new_height)) 20 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0] * 0 for _ in range(n_images, new_height * new_width)]) 21 | # img_HWhwc 22 | out_image = img_nhwc.reshape(new_height, new_width, height, width, n_channels) 23 | # img_HhWwc 24 | out_image = out_image.transpose(0, 2, 1, 3, 4) 25 | # img_Hh_Ww_c 26 | out_image = out_image.reshape(new_height * height, new_width * width, n_channels) 27 | return out_image 28 | 29 | -------------------------------------------------------------------------------- /stable_baselines/common/vec_env/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F401 2 | from stable_baselines.common.vec_env.base_vec_env import AlreadySteppingError, NotSteppingError, VecEnv, VecEnvWrapper, \ 3 | CloudpickleWrapper 4 | from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv 5 | from stable_baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 6 | from stable_baselines.common.vec_env.vec_frame_stack import VecFrameStack 7 | from stable_baselines.common.vec_env.vec_normalize import VecNormalize 8 | -------------------------------------------------------------------------------- /stable_baselines/common/vec_env/base_vec_env.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import pickle 3 | 4 | import cloudpickle 5 | from stable_baselines import logger 6 | 7 | 8 | class AlreadySteppingError(Exception): 9 | """ 10 | Raised when an asynchronous step is running while 11 | step_async() is called again. 
12 | """ 13 | 14 | def __init__(self): 15 | msg = 'already running an async step' 16 | Exception.__init__(self, msg) 17 | 18 | 19 | class NotSteppingError(Exception): 20 | """ 21 | Raised when an asynchronous step is not running but 22 | step_wait() is called. 23 | """ 24 | 25 | def __init__(self): 26 | msg = 'not running an async step' 27 | Exception.__init__(self, msg) 28 | 29 | 30 | class VecEnv(ABC): 31 | """ 32 | An abstract asynchronous, vectorized environment. 33 | 34 | :param num_envs: (int) the number of environments 35 | :param observation_space: (Gym Space) the observation space 36 | :param action_space: (Gym Space) the action space 37 | """ 38 | 39 | def __init__(self, num_envs, observation_space, action_space): 40 | self.num_envs = num_envs 41 | self.observation_space = observation_space 42 | self.action_space = action_space 43 | 44 | @abstractmethod 45 | def reset(self): 46 | """ 47 | Reset all the environments and return an array of 48 | observations, or a tuple of observation arrays. 49 | 50 | If step_async is still doing work, that work will 51 | be cancelled and step_wait() should not be called 52 | until step_async() is invoked again. 53 | 54 | :return: ([int] or [float]) observation 55 | """ 56 | pass 57 | 58 | @abstractmethod 59 | def step_async(self, actions): 60 | """ 61 | Tell all the environments to start taking a step 62 | with the given actions. 63 | Call step_wait() to get the results of the step. 64 | 65 | You should not call this if a step_async run is 66 | already pending. 67 | """ 68 | pass 69 | 70 | @abstractmethod 71 | def step_wait(self): 72 | """ 73 | Wait for the step taken with step_async(). 74 | 75 | :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information 76 | """ 77 | pass 78 | 79 | @abstractmethod 80 | def close(self): 81 | """ 82 | Clean up the environment's resources. 
83 | """ 84 | pass 85 | 86 | def step(self, actions): 87 | """ 88 | Step the environments with the given action 89 | 90 | :param actions: ([int] or [float]) the action 91 | :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information 92 | """ 93 | self.step_async(actions) 94 | return self.step_wait() 95 | 96 | def get_images(self): 97 | """ 98 | Return RGB images from each environment 99 | """ 100 | raise NotImplementedError 101 | 102 | def render(self, *args, **kwargs): 103 | """ 104 | Gym environment rendering 105 | 106 | :param mode: (str) the rendering type 107 | """ 108 | logger.warn('Render not defined for %s' % self) 109 | 110 | @property 111 | def unwrapped(self): 112 | if isinstance(self, VecEnvWrapper): 113 | return self.venv.unwrapped 114 | else: 115 | return self 116 | 117 | 118 | class VecEnvWrapper(VecEnv): 119 | """ 120 | Vectorized environment base class 121 | 122 | :param venv: (VecEnv) the vectorized environment to wrap 123 | :param observation_space: (Gym Space) the observation space (can be None to load from venv) 124 | :param action_space: (Gym Space) the action space (can be None to load from venv) 125 | """ 126 | 127 | def __init__(self, venv, observation_space=None, action_space=None): 128 | self.venv = venv 129 | VecEnv.__init__(self, num_envs=venv.num_envs, observation_space=observation_space or venv.observation_space, 130 | action_space=action_space or venv.action_space) 131 | 132 | def step_async(self, actions): 133 | self.venv.step_async(actions) 134 | 135 | @abstractmethod 136 | def reset(self): 137 | pass 138 | 139 | @abstractmethod 140 | def step_wait(self): 141 | pass 142 | 143 | def close(self): 144 | return self.venv.close() 145 | 146 | def render(self, *args, **kwargs): 147 | return self.venv.render(*args, **kwargs) 148 | 149 | def get_images(self): 150 | return self.venv.get_images() 151 | 152 | 153 | class CloudpickleWrapper(object): 154 | def __init__(self, var): 155 | """ 156 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 157 | 158 | :param var: (Any) the variable you wish to wrap for pickling with cloudpickle 159 | """ 160 | self.var = var 161 | 162 | def __getstate__(self): 163 | return cloudpickle.dumps(self.var) 164 | 165 | def __setstate__(self, obs): 166 | self.var = pickle.loads(obs) 167 | -------------------------------------------------------------------------------- /stable_baselines/common/vec_env/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import numpy as np 4 | from gym import spaces 5 | 6 | from . 
import VecEnv 7 | 8 | 9 | class DummyVecEnv(VecEnv): 10 | """ 11 | Creates a simple vectorized wrapper for multiple environments 12 | 13 | :param env_fns: ([Gym Environment]) the list of environments to vectorize 14 | """ 15 | 16 | def __init__(self, env_fns): 17 | self.envs = [fn() for fn in env_fns] 18 | env = self.envs[0] 19 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 20 | shapes, dtypes = {}, {} 21 | self.keys = [] 22 | obs_space = env.observation_space 23 | 24 | if isinstance(obs_space, spaces.Dict): 25 | assert isinstance(obs_space.spaces, OrderedDict) 26 | subspaces = obs_space.spaces 27 | else: 28 | subspaces = {None: obs_space} 29 | 30 | for key, box in subspaces.items(): 31 | shapes[key] = box.shape 32 | dtypes[key] = box.dtype 33 | self.keys.append(key) 34 | 35 | self.buf_obs = {k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys} 36 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 37 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 38 | self.buf_infos = [{} for _ in range(self.num_envs)] 39 | self.actions = None 40 | 41 | def step_async(self, actions): 42 | self.actions = actions 43 | 44 | def step_wait(self): 45 | for env_idx in range(self.num_envs): 46 | obs, self.buf_rews[env_idx], self.buf_dones[env_idx], self.buf_infos[env_idx] =\ 47 | self.envs[env_idx].step(self.actions[env_idx]) 48 | if self.buf_dones[env_idx]: 49 | obs = self.envs[env_idx].reset() 50 | self._save_obs(env_idx, obs) 51 | return (np.copy(self._obs_from_buf()), np.copy(self.buf_rews), np.copy(self.buf_dones), 52 | self.buf_infos.copy()) 53 | 54 | def reset(self): 55 | for env_idx in range(self.num_envs): 56 | obs = self.envs[env_idx].reset() 57 | self._save_obs(env_idx, obs) 58 | return np.copy(self._obs_from_buf()) 59 | 60 | def close(self): 61 | return 62 | 63 | def get_images(self): 64 | return [env.render(mode='rgb_array') for env in self.envs] 65 | 66 | def render(self, *args, **kwargs): 67 | if self.num_envs == 1: 68 | return self.envs[0].render(*args, **kwargs) 69 | else: 70 | return super().render(*args, **kwargs) 71 | 72 | def _save_obs(self, env_idx, obs): 73 | for key in self.keys: 74 | if key is None: 75 | self.buf_obs[key][env_idx] = obs 76 | else: 77 | self.buf_obs[key][env_idx] = obs[key] 78 | 79 | def _obs_from_buf(self): 80 | if self.keys == [None]: 81 | return self.buf_obs[None] 82 | else: 83 | return self.buf_obs 84 | -------------------------------------------------------------------------------- /stable_baselines/common/vec_env/subproc_vec_env.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Pipe 2 | 3 | import numpy as np 4 | 5 | from stable_baselines.common.vec_env import VecEnv, CloudpickleWrapper 6 | from stable_baselines.common.tile_images import tile_images 7 | 8 | 9 | def _worker(remote, parent_remote, env_fn_wrapper): 10 | parent_remote.close() 11 | env = env_fn_wrapper.var() 12 | while True: 13 | try: 14 | cmd, data = remote.recv() 15 | if cmd == 'step': 16 | observation, reward, done, info = env.step(data) 17 | if done: 18 | observation = env.reset() 19 | remote.send((observation, reward, done, info)) 20 | elif cmd == 'reset': 21 | observation = env.reset() 22 | remote.send(observation) 23 | elif cmd == 'render': 24 | remote.send(env.render(*data[0], **data[1])) 25 | elif cmd == 'close': 26 | remote.close() 27 | break 28 | elif cmd == 'get_spaces': 29 | remote.send((env.observation_space, env.action_space)) 
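                # (Added note:) the parent SubprocVecEnv talks to this worker
                # through (command, data) tuples sent over the pipe; any command
                # other than 'step', 'reset', 'render', 'close' or 'get_spaces'
                # falls through to the NotImplementedError below.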
30 | else: 31 | raise NotImplementedError 32 | except EOFError: 33 | break 34 | 35 | 36 | class SubprocVecEnv(VecEnv): 37 | """ 38 | Creates a multiprocess vectorized wrapper for multiple environments 39 | 40 | :param env_fns: ([Gym Environment]) Environments to run in subprocesses 41 | """ 42 | 43 | def __init__(self, env_fns): 44 | self.waiting = False 45 | self.closed = False 46 | n_envs = len(env_fns) 47 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(n_envs)]) 48 | self.processes = [Process(target=_worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 49 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 50 | for process in self.processes: 51 | process.daemon = True # if the main process crashes, we should not cause things to hang 52 | process.start() 53 | for remote in self.work_remotes: 54 | remote.close() 55 | 56 | self.remotes[0].send(('get_spaces', None)) 57 | observation_space, action_space = self.remotes[0].recv() 58 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 59 | 60 | def step_async(self, actions): 61 | for remote, action in zip(self.remotes, actions): 62 | remote.send(('step', action)) 63 | self.waiting = True 64 | 65 | def step_wait(self): 66 | results = [remote.recv() for remote in self.remotes] 67 | self.waiting = False 68 | obs, rews, dones, infos = zip(*results) 69 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 70 | 71 | def reset(self): 72 | for remote in self.remotes: 73 | remote.send(('reset', None)) 74 | return np.stack([remote.recv() for remote in self.remotes]) 75 | 76 | def close(self): 77 | if self.closed: 78 | return 79 | if self.waiting: 80 | for remote in self.remotes: 81 | remote.recv() 82 | for remote in self.remotes: 83 | remote.send(('close', None)) 84 | for process in self.processes: 85 | process.join() 86 | self.closed = True 87 | 88 | def render(self, mode='human', *args, **kwargs): 89 | for pipe in self.remotes: 90 | # gather images from subprocesses 91 | # `mode` will be taken into account later 92 | pipe.send(('render', (args, {'mode': 'rgb_array', **kwargs}))) 93 | imgs = [pipe.recv() for pipe in self.remotes] 94 | # Create a big image by tiling images from subprocesses 95 | bigimg = tile_images(imgs) 96 | if mode == 'human': 97 | import cv2 98 | cv2.imshow('vecenv', bigimg[:, :, ::-1]) 99 | cv2.waitKey(1) 100 | elif mode == 'rgb_array': 101 | return bigimg 102 | else: 103 | raise NotImplementedError 104 | 105 | def get_images(self): 106 | for pipe in self.remotes: 107 | pipe.send(('render', {"mode": 'rgb_array'})) 108 | imgs = [pipe.recv() for pipe in self.remotes] 109 | return imgs 110 | -------------------------------------------------------------------------------- /stable_baselines/common/vec_env/vec_frame_stack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | 4 | from stable_baselines.common.vec_env import VecEnvWrapper 5 | 6 | 7 | class VecFrameStack(VecEnvWrapper): 8 | """ 9 | Frame stacking wrapper for vectorized environment 10 | 11 | :param venv: (VecEnv) the vectorized environment to wrap 12 | :param n_stack: (int) Number of frames to stack 13 | """ 14 | 15 | def __init__(self, venv, n_stack): 16 | self.venv = venv 17 | self.n_stack = n_stack 18 | wrapped_obs_space = venv.observation_space 19 | low = np.repeat(wrapped_obs_space.low, self.n_stack, axis=-1) 20 | high = np.repeat(wrapped_obs_space.high, self.n_stack, axis=-1) 21 | self.stackedobs = 
np.zeros((venv.num_envs,) + low.shape, low.dtype) 22 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 23 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 24 | 25 | def step_wait(self): 26 | observations, rewards, dones, infos = self.venv.step_wait() 27 | self.stackedobs = np.roll(self.stackedobs, shift=-observations.shape[-1], axis=-1) 28 | for i, done in enumerate(dones): 29 | if done: 30 | self.stackedobs[i] = 0 31 | self.stackedobs[..., -observations.shape[-1]:] = observations 32 | return self.stackedobs, rewards, dones, infos 33 | 34 | def reset(self): 35 | """ 36 | Reset all environments 37 | """ 38 | obs = self.venv.reset() 39 | self.stackedobs[...] = 0 40 | self.stackedobs[..., -obs.shape[-1]:] = obs 41 | return self.stackedobs 42 | 43 | def close(self): 44 | self.venv.close() 45 | -------------------------------------------------------------------------------- /stable_baselines/common/vec_env/vec_normalize.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import numpy as np 4 | 5 | from stable_baselines.common.vec_env import VecEnvWrapper 6 | from stable_baselines.common.running_mean_std import RunningMeanStd 7 | 8 | 9 | class VecNormalize(VecEnvWrapper): 10 | """ 11 | A moving average, normalizing wrapper for vectorized environment. 12 | has support for saving/loading moving average, 13 | 14 | :param venv: (VecEnv) the vectorized environment to wrap 15 | :param training: (bool) Whether to update or not the moving average 16 | :param norm_obs: (bool) Whether to normalize observation or not (default: True) 17 | :param norm_reward: (bool) Whether to normalize rewards or not (default: False) 18 | :param clip_obs: (float) Max absolute value for observation 19 | :param clip_reward: (float) Max value absolute for discounted reward 20 | :param gamma: (float) discount factor 21 | :param epsilon: (float) To avoid division by zero 22 | """ 23 | 24 | def __init__(self, venv, training=True, norm_obs=True, norm_reward=True, 25 | clip_obs=10., clip_reward=10., gamma=0.99, epsilon=1e-8): 26 | VecEnvWrapper.__init__(self, venv) 27 | self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) 28 | self.ret_rms = RunningMeanStd(shape=()) 29 | self.clip_obs = clip_obs 30 | self.clip_reward = clip_reward 31 | # Returns: discounted rewards 32 | self.ret = np.zeros(self.num_envs) 33 | self.gamma = gamma 34 | self.epsilon = epsilon 35 | self.training = training 36 | self.norm_obs = norm_obs 37 | self.norm_reward = norm_reward 38 | self.old_obs = np.array([]) 39 | 40 | def step_wait(self): 41 | """ 42 | Apply sequence of actions to sequence of environments 43 | actions -> (observations, rewards, news) 44 | 45 | where 'news' is a boolean vector indicating whether each element is new. 
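        (Added note, not in the original docstring: when norm_reward is enabled,
        rewards are scaled by the running standard deviation of the discounted
        return self.ret rather than of the raw rewards, and observations and
        rewards are clipped to [-clip_obs, clip_obs] and [-clip_reward, clip_reward].)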
46 | """ 47 | obs, rews, news, infos = self.venv.step_wait() 48 | self.ret = self.ret * self.gamma + rews 49 | self.old_obs = obs 50 | obs = self._normalize_observation(obs) 51 | if self.norm_reward: 52 | if self.training: 53 | self.ret_rms.update(self.ret) 54 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.clip_reward, self.clip_reward) 55 | self.ret[news] = 0 56 | return obs, rews, news, infos 57 | 58 | def _normalize_observation(self, obs): 59 | """ 60 | :param obs: (numpy tensor) 61 | """ 62 | if self.norm_obs: 63 | if self.training: 64 | self.obs_rms.update(obs) 65 | obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon), -self.clip_obs, 66 | self.clip_obs) 67 | return obs 68 | else: 69 | return obs 70 | 71 | def get_original_obs(self): 72 | """ 73 | returns the unnormalized observation 74 | 75 | :return: (numpy float) 76 | """ 77 | return self.old_obs 78 | 79 | def reset(self): 80 | """ 81 | Reset all environments 82 | """ 83 | obs = self.venv.reset() 84 | if len(np.array(obs).shape) == 1: # for when num_cpu is 1 85 | self.old_obs = [obs] 86 | else: 87 | self.old_obs = obs 88 | self.ret = np.zeros(self.num_envs) 89 | return self._normalize_observation(obs) 90 | 91 | def save_running_average(self, path): 92 | """ 93 | :param path: (str) path to log dir 94 | """ 95 | for rms, name in zip([self.obs_rms, self.ret_rms], ['obs_rms', 'ret_rms']): 96 | with open("{}/{}.pkl".format(path, name), 'wb') as file_handler: 97 | pickle.dump(rms, file_handler) 98 | 99 | def load_running_average(self, path): 100 | """ 101 | :param path: (str) path to log dir 102 | """ 103 | for name in ['obs_rms', 'ret_rms']: 104 | with open("{}/{}.pkl".format(path, name), 'rb') as file_handler: 105 | setattr(self, name, pickle.load(file_handler)) 106 | -------------------------------------------------------------------------------- /stable_baselines/ddpg/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.ddpg.ddpg import DDPG 2 | from stable_baselines.ddpg.policies import MlpPolicy, CnnPolicy, LnMlpPolicy, LnCnnPolicy 3 | from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise 4 | -------------------------------------------------------------------------------- /stable_baselines/ddpg/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import os 4 | 5 | import gym 6 | import tensorflow as tf 7 | import numpy as np 8 | from mpi4py import MPI 9 | 10 | from stable_baselines import logger, bench 11 | from stable_baselines.common.misc_util import set_global_seeds, boolean_flag 12 | from stable_baselines.ddpg.policies import MlpPolicy, LnMlpPolicy 13 | from stable_baselines.ddpg import DDPG 14 | from stable_baselines.ddpg.memory import Memory 15 | from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise, NormalActionNoise 16 | 17 | 18 | def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): 19 | """ 20 | run the training of DDPG 21 | 22 | :param env_id: (str) the environment ID 23 | :param seed: (int) the initial random seed 24 | :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'), can use multiple noise type by 25 | seperating them with commas 26 | :param layer_norm: (bool) use layer normalization 27 | :param evaluation: (bool) enable evaluation of DDPG training 28 | :param kwargs: (dict) extra 
keywords for the training.train function 29 | """ 30 | 31 | # Configure things. 32 | rank = MPI.COMM_WORLD.Get_rank() 33 | if rank != 0: 34 | logger.set_level(logger.DISABLED) 35 | 36 | # Create envs. 37 | env = gym.make(env_id) 38 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) 39 | 40 | if evaluation and rank == 0: 41 | eval_env = gym.make(env_id) 42 | eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) 43 | env = bench.Monitor(env, None) 44 | else: 45 | eval_env = None 46 | 47 | # Parse noise_type 48 | action_noise = None 49 | param_noise = None 50 | nb_actions = env.action_space.shape[-1] 51 | for current_noise_type in noise_type.split(','): 52 | current_noise_type = current_noise_type.strip() 53 | if current_noise_type == 'none': 54 | pass 55 | elif 'adaptive-param' in current_noise_type: 56 | _, stddev = current_noise_type.split('_') 57 | param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) 58 | elif 'normal' in current_noise_type: 59 | _, stddev = current_noise_type.split('_') 60 | action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) 61 | elif 'ou' in current_noise_type: 62 | _, stddev = current_noise_type.split('_') 63 | action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions), 64 | sigma=float(stddev) * np.ones(nb_actions)) 65 | else: 66 | raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) 67 | 68 | # Seed everything to make things reproducible. 69 | seed = seed + 1000000 * rank 70 | logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) 71 | tf.reset_default_graph() 72 | set_global_seeds(seed) 73 | env.seed(seed) 74 | if eval_env is not None: 75 | eval_env.seed(seed) 76 | 77 | # Disable logging for rank != 0 to avoid noise. 
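    # (Added note:) the logger level was already lowered for non-root ranks at
    # the top of this function; the block below only records the wall-clock
    # start time on rank 0 so the total runtime can be reported after training.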
78 | start_time = 0 79 | if rank == 0: 80 | start_time = time.time() 81 | 82 | if layer_norm: 83 | policy = LnMlpPolicy 84 | else: 85 | policy = MlpPolicy 86 | 87 | num_timesteps = kwargs['num_timesteps'] 88 | del kwargs['num_timesteps'] 89 | 90 | model = DDPG(policy=policy, env=env, memory_policy=Memory, eval_env=eval_env, param_noise=param_noise, 91 | action_noise=action_noise, memory_limit=int(1e6), verbose=2, **kwargs) 92 | model.learn(total_timesteps=num_timesteps) 93 | env.close() 94 | if eval_env is not None: 95 | eval_env.close() 96 | if rank == 0: 97 | logger.info('total runtime: {}s'.format(time.time() - start_time)) 98 | 99 | 100 | def parse_args(): 101 | """ 102 | parse the arguments for DDPG training 103 | 104 | :return: (dict) the arguments 105 | """ 106 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 107 | 108 | parser.add_argument('--env-id', type=str, default='HalfCheetah-v1') 109 | boolean_flag(parser, 'render-eval', default=False) 110 | boolean_flag(parser, 'layer-norm', default=True) 111 | boolean_flag(parser, 'render', default=False) 112 | boolean_flag(parser, 'normalize-returns', default=False) 113 | boolean_flag(parser, 'normalize-observations', default=True) 114 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 115 | parser.add_argument('--critic-l2-reg', type=float, default=1e-2) 116 | parser.add_argument('--batch-size', type=int, default=64) # per MPI worker 117 | parser.add_argument('--actor-lr', type=float, default=1e-4) 118 | parser.add_argument('--critic-lr', type=float, default=1e-3) 119 | boolean_flag(parser, 'enable-popart', default=False) 120 | parser.add_argument('--gamma', type=float, default=0.99) 121 | parser.add_argument('--reward-scale', type=float, default=1.) 122 | parser.add_argument('--clip-norm', type=float, default=None) 123 | parser.add_argument('--nb-train-steps', type=int, default=50) # per epoch cycle and MPI worker 124 | parser.add_argument('--nb-eval-steps', type=int, default=100) # per epoch cycle and MPI worker 125 | parser.add_argument('--nb-rollout-steps', type=int, default=100) # per epoch cycle and MPI worker 126 | # choices are adaptive-param_xx, ou_xx, normal_xx, none 127 | parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') 128 | parser.add_argument('--num-timesteps', type=int, default=int(1e6)) 129 | boolean_flag(parser, 'evaluation', default=False) 130 | args = parser.parse_args() 131 | dict_args = vars(args) 132 | return dict_args 133 | 134 | 135 | if __name__ == '__main__': 136 | args = parse_args() 137 | if MPI.COMM_WORLD.Get_rank() == 0: 138 | logger.configure() 139 | # Run actual script. 
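    # (Illustrative, assumed invocation; not from the original file:)
    #   mpirun -np 2 python -m stable_baselines.ddpg.main --env-id HalfCheetah-v1 --num-timesteps 1000000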
140 | run(**args) 141 | -------------------------------------------------------------------------------- /stable_baselines/ddpg/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RingBuffer(object): 5 | def __init__(self, maxlen, shape, dtype='float32'): 6 | """ 7 | A buffer object, when full restarts at the initial position 8 | 9 | :param maxlen: (int) the max number of numpy objects to store 10 | :param shape: (tuple) the shape of the numpy objects you want to store 11 | :param dtype: (str) the name of the type of the numpy object you want to store 12 | """ 13 | self.maxlen = maxlen 14 | self.start = 0 15 | self.length = 0 16 | self.data = np.zeros((maxlen,) + shape).astype(dtype) 17 | 18 | def __len__(self): 19 | return self.length 20 | 21 | def __getitem__(self, idx): 22 | if idx < 0 or idx >= self.length: 23 | raise KeyError() 24 | return self.data[(self.start + idx) % self.maxlen] 25 | 26 | def get_batch(self, idxs): 27 | """ 28 | get the value at the indexes 29 | 30 | :param idxs: (int or numpy int) the indexes 31 | :return: (np.ndarray) the stored information in the buffer at the asked positions 32 | """ 33 | return self.data[(self.start + idxs) % self.maxlen] 34 | 35 | def append(self, var): 36 | """ 37 | Append an object to the buffer 38 | 39 | :param var: (np.ndarray) the object you wish to add 40 | """ 41 | if self.length < self.maxlen: 42 | # We have space, simply increase the length. 43 | self.length += 1 44 | elif self.length == self.maxlen: 45 | # No space, "remove" the first item. 46 | self.start = (self.start + 1) % self.maxlen 47 | else: 48 | # This should never happen. 49 | raise RuntimeError() 50 | self.data[(self.start + self.length - 1) % self.maxlen] = var 51 | 52 | 53 | def array_min2d(arr): 54 | """ 55 | cast to np.ndarray, and make sure it is of 2 dim 56 | 57 | :param arr: ([Any]) the array to clean 58 | :return: (np.ndarray) the cleaned array 59 | """ 60 | arr = np.array(arr) 61 | if arr.ndim >= 2: 62 | return arr 63 | return arr.reshape(-1, 1) 64 | 65 | 66 | class Memory(object): 67 | def __init__(self, limit, action_shape, observation_shape): 68 | """ 69 | The replay buffer object 70 | 71 | :param limit: (int) the max number of transitions to store 72 | :param action_shape: (tuple) the action shape 73 | :param observation_shape: (tuple) the observation shape 74 | """ 75 | self.limit = limit 76 | 77 | self.observations0 = RingBuffer(limit, shape=observation_shape) 78 | self.actions = RingBuffer(limit, shape=action_shape) 79 | self.rewards = RingBuffer(limit, shape=(1,)) 80 | self.terminals1 = RingBuffer(limit, shape=(1,)) 81 | self.observations1 = RingBuffer(limit, shape=observation_shape) 82 | 83 | def sample(self, batch_size): 84 | """ 85 | sample a random batch from the buffer 86 | 87 | :param batch_size: (int) the number of element to sample for the batch 88 | :return: (dict) the sampled batch 89 | """ 90 | # Draw such that we always have a proceeding element. 
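        # (Added note:) indices are drawn from [1, nb_entries - 2], so every
        # sampled transition has at least one stored entry both before and
        # after it in the ring buffer.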
91 | batch_idxs = np.random.randint(low=1, high=self.nb_entries - 1, size=batch_size) 92 | 93 | obs0_batch = self.observations0.get_batch(batch_idxs) 94 | obs1_batch = self.observations1.get_batch(batch_idxs) 95 | action_batch = self.actions.get_batch(batch_idxs) 96 | reward_batch = self.rewards.get_batch(batch_idxs) 97 | terminal1_batch = self.terminals1.get_batch(batch_idxs) 98 | 99 | result = { 100 | 'obs0': array_min2d(obs0_batch), 101 | 'obs1': array_min2d(obs1_batch), 102 | 'rewards': array_min2d(reward_batch), 103 | 'actions': array_min2d(action_batch), 104 | 'terminals1': array_min2d(terminal1_batch), 105 | } 106 | return result 107 | 108 | def append(self, obs0, action, reward, obs1, terminal1, training=True): 109 | """ 110 | Append a transition to the buffer 111 | 112 | :param obs0: ([float] or [int]) the last observation 113 | :param action: ([float]) the action 114 | :param reward: (float] the reward 115 | :param obs1: ([float] or [int]) the current observation 116 | :param terminal1: (bool) is the episode done 117 | :param training: (bool) is the RL model training or not 118 | """ 119 | if not training: 120 | return 121 | 122 | self.observations0.append(obs0) 123 | self.actions.append(action) 124 | self.rewards.append(reward) 125 | self.observations1.append(obs1) 126 | self.terminals1.append(terminal1) 127 | 128 | @property 129 | def nb_entries(self): 130 | return len(self.observations0) 131 | -------------------------------------------------------------------------------- /stable_baselines/ddpg/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class AdaptiveParamNoiseSpec(object): 5 | """ 6 | Implements adaptive parameter noise 7 | 8 | :param initial_stddev: (float) the initial value for the standard deviation of the noise 9 | :param desired_action_stddev: (float) the desired value for the standard deviation of the noise 10 | :param adoption_coefficient: (float) the update coefficient for the standard deviation of the noise 11 | """ 12 | def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01): 13 | self.initial_stddev = initial_stddev 14 | self.desired_action_stddev = desired_action_stddev 15 | self.adoption_coefficient = adoption_coefficient 16 | 17 | self.current_stddev = initial_stddev 18 | 19 | def adapt(self, distance): 20 | """ 21 | update the standard deviation for the parameter noise 22 | 23 | :param distance: (float) the noise distance applied to the parameters 24 | """ 25 | if distance > self.desired_action_stddev: 26 | # Decrease stddev. 27 | self.current_stddev /= self.adoption_coefficient 28 | else: 29 | # Increase stddev. 
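            # (Added note:) the standard deviation is adapted multiplicatively in
            # both directions, so repeated calls to adapt() drive the measured
            # action-space distance toward desired_action_stddev.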
30 | self.current_stddev *= self.adoption_coefficient 31 | 32 | def get_stats(self): 33 | """ 34 | return the standard deviation for the parameter noise 35 | 36 | :return: (dict) the stats of the noise 37 | """ 38 | return {'param_noise_stddev': self.current_stddev} 39 | 40 | def __repr__(self): 41 | fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adoption_coefficient={})' 42 | return fmt.format(self.initial_stddev, self.desired_action_stddev, self.adoption_coefficient) 43 | 44 | 45 | class ActionNoise(object): 46 | """ 47 | The action noise base class 48 | """ 49 | def reset(self): 50 | """ 51 | reset the noise at the end of an episode 52 | """ 53 | pass 54 | 55 | 56 | class NormalActionNoise(ActionNoise): 57 | """ 58 | A Gaussian action noise 59 | 60 | :param mean: (float) the mean value of the noise 61 | :param sigma: (float) the scale of the noise (std here) 62 | """ 63 | def __init__(self, mean, sigma): 64 | self._mu = mean 65 | self._sigma = sigma 66 | 67 | def __call__(self): 68 | return np.random.normal(self._mu, self._sigma) 69 | 70 | def __repr__(self): 71 | return 'NormalActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma) 72 | 73 | 74 | class OrnsteinUhlenbeckActionNoise(ActionNoise): 75 | """ 76 | An Ornstein-Uhlenbeck action noise, designed to approximate Brownian motion with friction. 77 | 78 | Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 79 | 80 | :param mean: (float) the mean of the noise 81 | :param sigma: (float) the scale of the noise 82 | :param theta: (float) the rate of mean reversion 83 | :param dt: (float) the timestep for the noise 84 | :param initial_noise: ([float]) the initial value for the noise output, (if None: 0) 85 | """ 86 | 87 | def __init__(self, mean, sigma, theta=.15, dt=1e-2, initial_noise=None): 88 | self._theta = theta 89 | self._mu = mean 90 | self._sigma = sigma 91 | self._dt = dt 92 | self.initial_noise = initial_noise 93 | self.noise_prev = None 94 | self.reset() 95 | 96 | def __call__(self): 97 | noise = self.noise_prev + self._theta * (self._mu - self.noise_prev) * self._dt + \ 98 | self._sigma * np.sqrt(self._dt) * np.random.normal(size=self._mu.shape) 99 | self.noise_prev = noise 100 | return noise 101 | 102 | def reset(self): 103 | """ 104 | reset the Ornstein-Uhlenbeck noise to its initial position 105 | """ 106 | self.noise_prev = self.initial_noise if self.initial_noise is not None else np.zeros_like(self._mu) 107 | 108 | def __repr__(self): 109 | return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma) 110 | -------------------------------------------------------------------------------- /stable_baselines/deepq/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.deepq.policies import MlpPolicy, CnnPolicy, LnMlpPolicy, LnCnnPolicy 2 | from stable_baselines.deepq.build_graph import build_act, build_train # noqa 3 | from stable_baselines.deepq.dqn import DQN 4 | from stable_baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa 5 | 6 | 7 | def wrap_atari_dqn(env): 8 | """ 9 | wrap the environment in Atari wrappers for DQN 10 | 11 | :param env: (Gym Environment) the environment 12 | :return: (Gym Environment) the wrapped environment 13 | """ 14 | from stable_baselines.common.atari_wrappers import wrap_deepmind 15 | return wrap_deepmind(env, frame_stack=True, scale=False) 16 |
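A minimal usage sketch of the exports above (DQN, CnnPolicy and wrap_atari_dqn). It mirrors the settings of deepq/experiments/run_atari.py further below; the environment name and the hyperparameter values are illustrative only, not a prescribed configuration.

from stable_baselines import bench, logger
from stable_baselines.common.atari_wrappers import make_atari
from stable_baselines.deepq import DQN, CnnPolicy, wrap_atari_dqn

logger.configure()
env = make_atari("PongNoFrameskip-v4")       # raw Atari env with no-op reset / frame-skip wrappers
env = bench.Monitor(env, logger.get_dir())   # record episode rewards and lengths
env = wrap_atari_dqn(env)                    # DeepMind preprocessing + frame stacking (see above)
model = DQN(policy=CnnPolicy, env=env, learning_rate=1e-4, buffer_size=10000,
            exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4,
            learning_starts=10000, target_network_update_freq=1000, gamma=0.99)
model.learn(total_timesteps=int(1e6))
model.save("pong_dqn.pkl")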
-------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/deepq/experiments/__init__.py -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/custom_cartpole.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import argparse 3 | 4 | import gym 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | import stable_baselines.common.tf_util as tf_utils 9 | from stable_baselines import logger, deepq 10 | from stable_baselines.deepq.replay_buffer import ReplayBuffer 11 | from stable_baselines.deepq.policies import FeedForwardPolicy 12 | from stable_baselines.common.schedules import LinearSchedule 13 | 14 | 15 | class CustomPolicy(FeedForwardPolicy): 16 | def __init__(self, *args, **kwargs): 17 | super(CustomPolicy, self).__init__(*args, **kwargs, 18 | layers=[64], 19 | feature_extraction="mlp") 20 | 21 | 22 | def main(args): 23 | """ 24 | Train a DQN agent on cartpole env 25 | :param args: (Parsed Arguments) the input arguments 26 | """ 27 | with tf_utils.make_session(8) as sess: 28 | # Create the environment 29 | env = gym.make("CartPole-v0") 30 | # Create all the functions necessary to train the model 31 | act, train, update_target, _ = deepq.build_train( 32 | q_func=CustomPolicy, 33 | ob_space=env.observation_space, 34 | ac_space=env.action_space, 35 | optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), 36 | sess=sess 37 | ) 38 | # Create the replay buffer 39 | replay_buffer = ReplayBuffer(50000) 40 | # Create the schedule for exploration starting from 1 (every action is random) down to 41 | # 0.02 (98% of actions are selected according to values predicted by the model). 42 | exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) 43 | 44 | # Initialize the parameters and copy them to the target network. 45 | tf_utils.initialize() 46 | update_target() 47 | 48 | episode_rewards = [0.0] 49 | obs = env.reset() 50 | for step in itertools.count(): 51 | # Take action and update exploration to the newest value 52 | action = act(obs[None], update_eps=exploration.value(step))[0] 53 | new_obs, rew, done, _ = env.step(action) 54 | # Store transition in the replay buffer. 55 | replay_buffer.add(obs, action, rew, new_obs, float(done)) 56 | obs = new_obs 57 | 58 | episode_rewards[-1] += rew 59 | if done: 60 | obs = env.reset() 61 | episode_rewards.append(0) 62 | 63 | if len(episode_rewards[-101:-1]) == 0: 64 | mean_100ep_reward = -np.inf 65 | else: 66 | mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) 67 | 68 | is_solved = step > 100 and mean_100ep_reward >= 200 69 | 70 | if args.no_render and step > args.max_timesteps: 71 | break 72 | 73 | if is_solved: 74 | if args.no_render: 75 | break 76 | # Show off the result 77 | env.render() 78 | else: 79 | # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 80 | if step > 1000: 81 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) 82 | train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) 83 | # Update target network periodically. 
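# update_target() (built by deepq.build_train above) copies the online Q-network weights
# into the target network; here it is triggered every 1000 environment steps.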
84 | if step % 1000 == 0: 85 | update_target() 86 | 87 | if done and len(episode_rewards) % 10 == 0: 88 | logger.record_tabular("steps", step) 89 | logger.record_tabular("episodes", len(episode_rewards)) 90 | logger.record_tabular("mean episode reward", mean_100ep_reward) 91 | logger.record_tabular("% time spent exploring", int(100 * exploration.value(step))) 92 | logger.dump_tabular() 93 | 94 | 95 | if __name__ == '__main__': 96 | parser = argparse.ArgumentParser(description="Train DQN on cartpole using a custom mlp") 97 | parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") 98 | parser.add_argument('--max-timesteps', default=50000, type=int, 99 | help="Maximum number of timesteps when not rendering") 100 | args = parser.parse_args() 101 | main(args) 102 | -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/enjoy_cartpole.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import gym 4 | 5 | from stable_baselines.deepq import DQN 6 | 7 | 8 | def main(args): 9 | """ 10 | Run a trained model for the cartpole problem 11 | 12 | :param args: (ArgumentParser) the input arguments 13 | """ 14 | env = gym.make("CartPole-v0") 15 | model = DQN.load("cartpole_model.pkl", env) 16 | 17 | while True: 18 | obs, done = env.reset(), False 19 | episode_rew = 0 20 | while not done: 21 | if not args.no_render: 22 | env.render() 23 | action, _ = model.predict(obs) 24 | obs, rew, done, _ = env.step(action) 25 | episode_rew += rew 26 | print("Episode reward", episode_rew) 27 | # No render is only used for automatic testing 28 | if args.no_render: 29 | break 30 | 31 | 32 | if __name__ == '__main__': 33 | parser = argparse.ArgumentParser(description="Enjoy trained DQN on cartpole") 34 | parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") 35 | args = parser.parse_args() 36 | main(args) 37 | -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/enjoy_mountaincar.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import gym 4 | import numpy as np 5 | 6 | from stable_baselines.deepq import DQN 7 | 8 | 9 | def main(args): 10 | """ 11 | Run a trained model for the mountain car problem 12 | 13 | :param args: (ArgumentParser) the input arguments 14 | """ 15 | env = gym.make("MountainCar-v0") 16 | model = DQN.load("mountaincar_model.pkl", env) 17 | 18 | while True: 19 | obs, done = env.reset(), False 20 | episode_rew = 0 21 | while not done: 22 | if not args.no_render: 23 | env.render() 24 | # Epsilon-greedy 25 | if np.random.random() < 0.02: 26 | action = env.action_space.sample() 27 | else: 28 | action, _ = model.predict(obs, deterministic=True) 29 | obs, rew, done, _ = env.step(action) 30 | episode_rew += rew 31 | print("Episode reward", episode_rew) 32 | # No render is only used for automatic testing 33 | if args.no_render: 34 | break 35 | 36 | 37 | if __name__ == '__main__': 38 | parser = argparse.ArgumentParser(description="Enjoy trained DQN on MountainCar") 39 | parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") 40 | args = parser.parse_args() 41 | main(args) 42 | -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/enjoy_pong.py: 
-------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from stable_baselines import deepq 4 | from stable_baselines.deepq import DQN 5 | 6 | 7 | def main(): 8 | """ 9 | Run a trained model for the pong problem 10 | """ 11 | env = gym.make("PongNoFrameskip-v4") 12 | env = deepq.wrap_atari_dqn(env) 13 | model = DQN.load("pong_model.pkl", env) 14 | 15 | while True: 16 | obs, done = env.reset(), False 17 | episode_rew = 0 18 | while not done: 19 | env.render() 20 | action, _ = model.predict(obs) 21 | obs, rew, done, _ = env.step(action) 22 | episode_rew += rew 23 | print("Episode reward", episode_rew) 24 | 25 | 26 | if __name__ == '__main__': 27 | main() 28 | -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/run_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | 4 | from stable_baselines import bench, logger 5 | from stable_baselines.common import set_global_seeds 6 | from stable_baselines.common.atari_wrappers import make_atari 7 | from stable_baselines.deepq import DQN, wrap_atari_dqn, CnnPolicy 8 | 9 | 10 | def main(): 11 | """ 12 | Run the atari test 13 | """ 14 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 15 | parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') 16 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 17 | parser.add_argument('--prioritized', type=int, default=1) 18 | parser.add_argument('--dueling', type=int, default=1) 19 | parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6) 20 | parser.add_argument('--num-timesteps', type=int, default=int(10e6)) 21 | parser.add_argument('--checkpoint-freq', type=int, default=10000) 22 | parser.add_argument('--checkpoint-path', type=str, default=None) 23 | 24 | args = parser.parse_args() 25 | logger.configure() 26 | set_global_seeds(args.seed) 27 | env = make_atari(args.env) 28 | env = bench.Monitor(env, logger.get_dir()) 29 | env = wrap_atari_dqn(env) 30 | policy = partial(CnnPolicy, dueling=args.dueling == 1) 31 | 32 | model = DQN( 33 | env=env, 34 | policy=policy, 35 | learning_rate=1e-4, 36 | buffer_size=10000, 37 | exploration_fraction=0.1, 38 | exploration_final_eps=0.01, 39 | train_freq=4, 40 | learning_starts=10000, 41 | target_network_update_freq=1000, 42 | gamma=0.99, 43 | prioritized_replay=bool(args.prioritized), 44 | prioritized_replay_alpha=args.prioritized_replay_alpha, 45 | checkpoint_freq=args.checkpoint_freq, 46 | checkpoint_path=args.checkpoint_path, 47 | ) 48 | model.learn(total_timesteps=args.num_timesteps) 49 | 50 | env.close() 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/train_cartpole.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import gym 4 | import numpy as np 5 | 6 | from stable_baselines.deepq import DQN, MlpPolicy 7 | 8 | 9 | def callback(lcl, _glb): 10 | """ 11 | The callback function for logging and saving 12 | 13 | :param lcl: (dict) the local variables 14 | :param _glb: (dict) the global variables 15 | :return: (bool) is solved 16 | """ 17 | # stop training if reward exceeds 199 18 | if len(lcl['episode_rewards'][-101:-1]) == 0: 19 | mean_100ep_reward = -np.inf 20 | else: 21 | 
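# The slice [-101:-1] averages the last 100 completed episodes while ignoring
# the episode that is still in progress.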
mean_100ep_reward = round(float(np.mean(lcl['episode_rewards'][-101:-1])), 1) 22 | is_solved = lcl['step'] > 100 and mean_100ep_reward >= 199 23 | return is_solved 24 | 25 | 26 | def main(args): 27 | """ 28 | Train and save the DQN model, for the cartpole problem 29 | 30 | :param args: (ArgumentParser) the input arguments 31 | """ 32 | env = gym.make("CartPole-v0") 33 | model = DQN( 34 | env=env, 35 | policy=MlpPolicy, 36 | learning_rate=1e-3, 37 | buffer_size=50000, 38 | exploration_fraction=0.1, 39 | exploration_final_eps=0.02, 40 | ) 41 | model.learn(total_timesteps=args.max_timesteps, callback=callback) 42 | 43 | print("Saving model to cartpole_model.pkl") 44 | model.save("cartpole_model.pkl") 45 | 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser(description="Train DQN on cartpole") 49 | parser.add_argument('--max-timesteps', default=100000, type=int, help="Maximum number of timesteps") 50 | args = parser.parse_args() 51 | main(args) 52 | -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/train_mountaincar.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import gym 4 | 5 | from stable_baselines.deepq import DQN 6 | from stable_baselines.deepq.policies import FeedForwardPolicy 7 | 8 | 9 | class CustomPolicy(FeedForwardPolicy): 10 | def __init__(self, *args, **kwargs): 11 | super(CustomPolicy, self).__init__(*args, **kwargs, 12 | layers=[64], 13 | layer_norm=True, 14 | feature_extraction="mlp") 15 | 16 | 17 | def main(args): 18 | """ 19 | Train and save the DQN model, for the mountain car problem 20 | 21 | :param args: (ArgumentParser) the input arguments 22 | """ 23 | env = gym.make("MountainCar-v0") 24 | 25 | # using layer norm policy here is important for parameter space noise! 26 | model = DQN( 27 | policy=CustomPolicy, 28 | env=env, 29 | learning_rate=1e-3, 30 | buffer_size=50000, 31 | exploration_fraction=0.1, 32 | exploration_final_eps=0.1, 33 | param_noise=True 34 | ) 35 | model.learn(total_timesteps=args.max_timesteps) 36 | 37 | print("Saving model to mountaincar_model.pkl") 38 | model.save("mountaincar_model.pkl") 39 | 40 | 41 | if __name__ == '__main__': 42 | parser = argparse.ArgumentParser(description="Train DQN on cartpole") 43 | parser.add_argument('--max-timesteps', default=100000, type=int, help="Maximum number of timesteps") 44 | args = parser.parse_args() 45 | main(args) 46 | -------------------------------------------------------------------------------- /stable_baselines/deepq/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from stable_baselines.common.input import observation_input 4 | 5 | # ================================================================ 6 | # Placeholders 7 | # ================================================================ 8 | 9 | 10 | class TfInput(object): 11 | def __init__(self, name="(unnamed)"): 12 | """ 13 | Generalized Tensorflow placeholder. The main differences are: 14 | - possibly uses multiple placeholders internally and returns multiple values 15 | - can apply light postprocessing to the value feed to placeholder. 16 | 17 | :param name: (str) the input name 18 | """ 19 | self.name = name 20 | 21 | def get(self): 22 | """ 23 | Return the tf variable(s) representing the possibly postprocessed value 24 | of placeholder(s). 
25 | 26 | :return: (TensorFlow Tensor) the placeholder 27 | """ 28 | raise NotImplementedError 29 | 30 | def make_feed_dict(self, data): 31 | """ 32 | Given data input it to the placeholder(s). 33 | 34 | :return: (dict) the given data input 35 | """ 36 | raise NotImplementedError 37 | 38 | 39 | class PlaceholderTfInput(TfInput): 40 | def __init__(self, placeholder): 41 | """ 42 | Wrapper for regular tensorflow placeholder. 43 | 44 | :param placeholder: (TensorFlow Tensor) 45 | """ 46 | super().__init__(placeholder.name) 47 | self._placeholder = placeholder 48 | 49 | def get(self): 50 | return self._placeholder 51 | 52 | def make_feed_dict(self, data): 53 | return {self._placeholder: data} 54 | 55 | 56 | class Uint8Input(PlaceholderTfInput): 57 | def __init__(self, shape, name=None): 58 | """ 59 | Takes input in uint8 format which is cast to float32 and divided by 255 60 | before passing it to the model. 61 | 62 | On GPU this ensures lower data transfer times. 63 | 64 | :param shape: ([int]) shape of the tensor. 65 | :param name: (str) name of the underlying placeholder 66 | """ 67 | 68 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 69 | self._shape = shape 70 | self._output = tf.cast(super().get(), tf.float32) / 255.0 71 | 72 | def get(self): 73 | return self._output 74 | 75 | 76 | class ObservationInput(PlaceholderTfInput): 77 | def __init__(self, observation_space, name=None): 78 | """ 79 | Creates an input placeholder tailored to a specific observation space 80 | 81 | :param observation_space: (Gym Space) observation space of the environment. Should be one of the gym.spaces 82 | types 83 | :param name: (str) tensorflow name of the underlying placeholder 84 | """ 85 | is_image = len(observation_space.shape) == 3 86 | inpt, self.processed_inpt = observation_input(observation_space, name=name, scale=is_image) 87 | super().__init__(inpt) 88 | 89 | def get(self): 90 | return self.processed_inpt 91 | -------------------------------------------------------------------------------- /stable_baselines/gail/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.gail.model import GAIL 2 | -------------------------------------------------------------------------------- /stable_baselines/gail/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/dataset/__init__.py -------------------------------------------------------------------------------- /stable_baselines/gail/dataset/mujocodset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data structure of the input .npz: 3 | the data is save in python dictionary format with keys: 'acs', 'ep_rets', 'rews', 'obs' 4 | the values of each item is a list storing the expert trajectory sequentially 5 | a transition can be: (data['obs'][t], data['acs'][t], data['obs'][t+1]) and get reward data['rews'][t] 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | from stable_baselines import logger 12 | 13 | 14 | class Dset(object): 15 | def __init__(self, inputs, labels, randomize): 16 | """ 17 | Dataset object 18 | 19 | :param inputs: (np.ndarray) the input values 20 | :param labels: (np.ndarray) the target values 21 | :param randomize: (bool) if the dataset should be shuffled 22 | """ 23 | self.inputs = inputs 24 | self.labels = 
labels 25 | assert len(self.inputs) == len(self.labels) 26 | self.randomize = randomize 27 | self.num_pairs = len(inputs) 28 | self.init_pointer() 29 | 30 | def init_pointer(self): 31 | """ 32 | initialize the pointer and shuffle the dataset, if randomize the dataset 33 | """ 34 | self.pointer = 0 35 | if self.randomize: 36 | idx = np.arange(self.num_pairs) 37 | np.random.shuffle(idx) 38 | self.inputs = self.inputs[idx, :] 39 | self.labels = self.labels[idx, :] 40 | 41 | def get_next_batch(self, batch_size): 42 | """ 43 | get the batch from the dataset 44 | 45 | :param batch_size: (int) the size of the batch from the dataset 46 | :return: (np.ndarray, np.ndarray) inputs and labels 47 | """ 48 | # if batch_size is negative -> return all 49 | if batch_size < 0: 50 | return self.inputs, self.labels 51 | if self.pointer + batch_size >= self.num_pairs: 52 | self.init_pointer() 53 | end = self.pointer + batch_size 54 | inputs = self.inputs[self.pointer:end, :] 55 | labels = self.labels[self.pointer:end, :] 56 | self.pointer = end 57 | return inputs, labels 58 | 59 | 60 | class MujocoDset(object): 61 | def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomize=True): 62 | """ 63 | Dataset for mujoco 64 | 65 | :param expert_path: (str) the path to trajectory data 66 | :param train_fraction: (float) the train val split (0 to 1) 67 | :param traj_limitation: (int) the dims to load (if -1, load all) 68 | :param randomize: (bool) if the dataset should be shuffled 69 | """ 70 | traj_data = np.load(expert_path) 71 | if traj_limitation < 0: 72 | traj_limitation = len(traj_data['obs']) 73 | obs = traj_data['obs'][:traj_limitation] 74 | acs = traj_data['acs'][:traj_limitation] 75 | 76 | # obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length 77 | # and S is the environment observation/action space. 
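# For example, N = 10 episodes of L = 1024 steps with 11-dimensional observations give
# obs of shape (10, 1024, 11), which the reshape below flattens to (10240, 11).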
78 | # Flatten to (N * L, prod(S)) 79 | self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])]) 80 | self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])]) 81 | 82 | self.rets = traj_data['ep_rets'][:traj_limitation] 83 | self.avg_ret = sum(self.rets)/len(self.rets) 84 | self.std_ret = np.std(np.array(self.rets)) 85 | if len(self.acs) > 2: 86 | self.acs = np.squeeze(self.acs) 87 | assert len(self.obs) == len(self.acs) 88 | self.num_traj = min(traj_limitation, len(traj_data['obs'])) 89 | self.num_transition = len(self.obs) 90 | self.randomize = randomize 91 | self.dset = Dset(self.obs, self.acs, self.randomize) 92 | # for behavior cloning 93 | self.train_set = Dset(self.obs[:int(self.num_transition*train_fraction), :], 94 | self.acs[:int(self.num_transition*train_fraction), :], 95 | self.randomize) 96 | self.val_set = Dset(self.obs[int(self.num_transition*train_fraction):, :], 97 | self.acs[int(self.num_transition*train_fraction):, :], 98 | self.randomize) 99 | self.log_info() 100 | 101 | def log_info(self): 102 | """ 103 | log the information of the dataset 104 | """ 105 | logger.log("Total trajectorues: %d" % self.num_traj) 106 | logger.log("Total transitions: %d" % self.num_transition) 107 | logger.log("Average returns: %f" % self.avg_ret) 108 | logger.log("Std for returns: %f" % self.std_ret) 109 | 110 | def get_next_batch(self, batch_size, split=None): 111 | """ 112 | get the batch from the dataset 113 | 114 | :param batch_size: (int) the size of the batch from the dataset 115 | :param split: (str) the type of data split (can be None, 'train', 'val') 116 | :return: (np.ndarray, np.ndarray) inputs and labels 117 | """ 118 | if split is None: 119 | return self.dset.get_next_batch(batch_size) 120 | elif split == 'train': 121 | return self.train_set.get_next_batch(batch_size) 122 | elif split == 'val': 123 | return self.val_set.get_next_batch(batch_size) 124 | else: 125 | raise NotImplementedError 126 | 127 | def plot(self): 128 | """ 129 | show and save (to 'histogram_rets.png') a histogram plotting of the episode returns 130 | """ 131 | plt.hist(self.rets) 132 | plt.savefig("histogram_rets.png") 133 | plt.close() 134 | 135 | 136 | def test(expert_path, traj_limitation, plot): 137 | """ 138 | test mujoco dataset object 139 | 140 | :param expert_path: (str) the path to trajectory data 141 | :param traj_limitation: (int) the dims to load (if -1, load all) 142 | :param plot: (bool) enable plotting 143 | """ 144 | dset = MujocoDset(expert_path, traj_limitation=traj_limitation) 145 | if plot: 146 | dset.plot() 147 | 148 | 149 | if __name__ == '__main__': 150 | import argparse 151 | parser = argparse.ArgumentParser() 152 | parser.add_argument("--expert_path", type=str, default="../data/deterministic.trpo.Hopper.0.00.npz") 153 | parser.add_argument("--traj_limitation", type=int, default=None) 154 | parser.add_argument("--plot", type=bool, default=False) 155 | args = parser.parse_args() 156 | test(args.expert_path, args.traj_limitation, args.plot) 157 | -------------------------------------------------------------------------------- /stable_baselines/gail/mlp_policy.py: -------------------------------------------------------------------------------- 1 | """ 2 | from stable_baselines/ppo1/mlp_policy.py and add simple modification 3 | (1) add reuse argument 4 | (2) cache the `stochastic` placeholder 5 | """ 6 | import gym 7 | import tensorflow as tf 8 | 9 | import stable_baselines.common.tf_util as tf_util 10 | from stable_baselines.acktr.utils import dense 11 | from 
stable_baselines.common.mpi_running_mean_std import RunningMeanStd 12 | from stable_baselines.ppo1.mlp_policy import BasePolicy 13 | 14 | 15 | class MlpPolicy(BasePolicy): 16 | recurrent = False 17 | 18 | def __init__(self, name, *args, sess=None, reuse=False, placeholders=None, **kwargs): 19 | """ 20 | MLP policy for Gail 21 | 22 | :param name: (str) the variable scope name 23 | :param ob_space: (Gym Space) The observation space of the environment 24 | :param ac_space: (Gym Space) The action space of the environment 25 | :param hid_size: (int) the size of the hidden layers 26 | :param num_hid_layers: (int) the number of hidden layers 27 | :param sess: (TensorFlow session) The current TensorFlow session containing the variables. 28 | :param reuse: (bool) allow resue of the graph 29 | :param placeholders: (dict) To feed existing placeholders if needed 30 | :param gaussian_fixed_var: (bool) fix the gaussian variance 31 | """ 32 | super(MlpPolicy, self).__init__(placeholders=placeholders) 33 | self.sess = sess 34 | with tf.variable_scope(name): 35 | if reuse: 36 | tf.get_variable_scope().reuse_variables() 37 | self._init(*args, **kwargs) 38 | self.scope = tf.get_variable_scope().name 39 | 40 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): 41 | 42 | obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space) 43 | 44 | with tf.variable_scope("obfilter"): 45 | self.ob_rms = RunningMeanStd(shape=ob_space.shape) 46 | 47 | obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) 48 | last_out = obz 49 | for i in range(num_hid_layers): 50 | last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), 51 | weight_init=tf_util.normc_initializer(1.0))) 52 | self.vpred = dense(last_out, 1, "vffinal", weight_init=tf_util.normc_initializer(1.0))[:, 0] 53 | 54 | last_out = obz 55 | for i in range(num_hid_layers): 56 | last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), 57 | weight_init=tf_util.normc_initializer(1.0))) 58 | 59 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): 60 | mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", tf_util.normc_initializer(0.01)) 61 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], 62 | initializer=tf.zeros_initializer()) 63 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 64 | else: 65 | pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", tf_util.normc_initializer(0.01)) 66 | 67 | self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam) 68 | self.state_in = [] 69 | self.state_out = [] 70 | 71 | # change for BC 72 | self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic") 73 | action = tf_util.switch(self.stochastic_ph, self.proba_distribution.sample(), self.proba_distribution.mode()) 74 | self.action = action 75 | self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred]) 76 | -------------------------------------------------------------------------------- /stable_baselines/gail/model.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from stable_baselines.common import ActorCriticRLModel 4 | from stable_baselines.common.policies import ActorCriticPolicy 5 | from stable_baselines.trpo_mpi import TRPO 6 | 7 | 8 | class GAIL(ActorCriticRLModel): 9 | """ 10 | Generative Adversarial Imitation Learning (GAIL) 11 | 12 | :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, 
CnnLstmPolicy, ...) 13 | :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) 14 | :param gamma: (float) the discount value 15 | :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) 16 | :param max_kl: (float) the Kullback-Leibler loss threshold 17 | :param cg_iters: (int) the number of iterations for the conjugate gradient calculation 18 | :param lam: (float) GAE factor 19 | :param entcoeff: (float) the weight for the entropy loss 20 | :param cg_damping: (float) the conjugate gradient damping factor 21 | :param vf_stepsize: (float) the value function stepsize 22 | :param vf_iters: (int) the number of iterations for learning the value function 23 | :param pretrained_weight: (str) the save location for the pretrained weights 24 | :param hidden_size: ([int]) the hidden dimension for the MLP 25 | :param expert_dataset: (Dset) the dataset manager 26 | :param save_per_iter: (int) the number of iterations before saving 27 | :param checkpoint_dir: (str) the location for saving checkpoints 28 | :param g_step: (int) number of steps to train the policy in each epoch 29 | :param d_step: (int) number of steps to train the discriminator in each epoch 30 | :param task_name: (str) the name of the task (can be None) 31 | :param d_stepsize: (float) the reward giver (discriminator) stepsize 32 | :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug 33 | :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance 34 | """ 35 | 36 | def __init__(self, policy, env, pretrained_weight=False, hidden_size_adversary=100, adversary_entcoeff=1e-3, 37 | expert_dataset=None, save_per_iter=1, checkpoint_dir="/tmp/gail/ckpt/", g_step=1, d_step=1, 38 | task_name="task_name", d_stepsize=3e-4, verbose=0, _init_setup_model=True, **kwargs): 39 | super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False, 40 | _init_setup_model=_init_setup_model) 41 | 42 | self.trpo = TRPO(policy, env, verbose=verbose, _init_setup_model=False, **kwargs) 43 | self.trpo.using_gail = True 44 | self.trpo.pretrained_weight = pretrained_weight 45 | self.trpo.expert_dataset = expert_dataset 46 | self.trpo.save_per_iter = save_per_iter 47 | self.trpo.checkpoint_dir = checkpoint_dir 48 | self.trpo.g_step = g_step 49 | self.trpo.d_step = d_step 50 | self.trpo.task_name = task_name 51 | self.trpo.d_stepsize = d_stepsize 52 | self.trpo.hidden_size_adversary = hidden_size_adversary 53 | self.trpo.adversary_entcoeff = adversary_entcoeff 54 | 55 | if _init_setup_model: 56 | self.setup_model() 57 | 58 | def set_env(self, env): 59 | super().set_env(env) 60 | self.trpo.set_env(env) 61 | 62 | def setup_model(self): 63 | assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the GAIL model must be an " \ 64 | "instance of common.policies.ActorCriticPolicy." 65 | assert isinstance(self.action_space, gym.spaces.Box), "Error: GAIL requires a continuous action space."
66 | 67 | self.trpo.setup_model() 68 | 69 | def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="GAIL"): 70 | self.trpo.learn(total_timesteps, callback, seed, log_interval, tb_log_name) 71 | return self 72 | 73 | def predict(self, observation, state=None, mask=None, deterministic=False): 74 | return self.trpo.predict(observation, state, mask, deterministic=deterministic) 75 | 76 | def action_probability(self, observation, state=None, mask=None): 77 | return self.trpo.action_probability(observation, state, mask) 78 | 79 | def save(self, save_path): 80 | self.trpo.save(save_path) 81 | 82 | @classmethod 83 | def load(cls, load_path, env=None, **kwargs): 84 | data, params = cls._load_from_file(load_path) 85 | 86 | model = cls(policy=data["policy"], env=None, _init_setup_model=False) 87 | model.trpo.__dict__.update(data) 88 | model.trpo.__dict__.update(kwargs) 89 | model.set_env(env) 90 | model.setup_model() 91 | 92 | restores = [] 93 | for param, loaded_p in zip(model.trpo.params, params): 94 | restores.append(param.assign(loaded_p)) 95 | model.trpo.sess.run(restores) 96 | 97 | return model 98 | -------------------------------------------------------------------------------- /stable_baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Hopper-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Hopper-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Hopper-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Hopper-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Hopper-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Hopper-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Hopper-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Hopper-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Humanoid-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Humanoid-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Humanoid-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Humanoid-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png 
-------------------------------------------------------------------------------- /stable_baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Walker2d-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Walker2d-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Walker2d-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Walker2d-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/gail-result.md: -------------------------------------------------------------------------------- 1 | # Results of GAIL/BC on Mujoco 2 | 3 | Here's the extensive experimental results of applying GAIL/BC on Mujoco environments, including 4 | Hopper-v1, Walker2d-v1, HalfCheetah-v1, Humanoid-v1, HumanoidStandup-v1. Every imitator is evaluated with seed to be 0. 5 | 6 | ## Results 7 | 8 | ### Training through iterations 9 | 10 | - Hoppers-v1 11 | 12 | 13 | - HalfCheetah-v1 14 | 15 | 16 | - Walker2d-v1 17 | 18 | 19 | - Humanoid-v1 20 | 21 | 22 | - HumanoidStandup-v1 23 | 24 | 25 | For details (e.g., adversarial loss, discriminator accuracy, etc.) 
about GAIL training, please see [here](https://drive.google.com/drive/folders/1nnU8dqAV9i37-_5_vWIspyFUJFQLCsDD?usp=sharing) 26 | 27 | ### Deterministic Policy (Set std=0) 28 | | | Un-normalized | Normalized | 29 | |---|---|---| 30 | | Hopper-v1 | | | 31 | | HalfCheetah-v1 | | | 32 | | Walker2d-v1 | | | 33 | | Humanoid-v1 | | | 34 | | HumanoidStandup-v1 | | | 35 | 36 | ### Stochastic Policy 37 | | | Un-normalized | Normalized | 38 | |---|---|---| 39 | | Hopper-v1 | | | 40 | | HalfCheetah-v1 | | | 41 | | Walker2d-v1 | | | 42 | | Humanoid-v1 | | | 43 | | HumanoidStandup-v1 | | | 44 | 45 | ### Details about the GAIL imitator 46 | 47 | For all environments, the 48 | imitator is trained with 1, 5, 10, 50 trajectories, where each trajectory contains at most 49 | 1024 transitions, and seeds 0, 1, 2, 3, respectively. 50 | 51 | ### Details about the BC imitators 52 | 53 | All BC imitators are trained with seed 0. 54 | -------------------------------------------------------------------------------- /stable_baselines/gail/result/halfcheetah-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/halfcheetah-training.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/hopper-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/hopper-training.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/humanoid-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/humanoid-training.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/humanoidstandup-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/humanoidstandup-training.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/walker2d-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/walker2d-training.png -------------------------------------------------------------------------------- /stable_baselines/gail/statistics.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code is largely based on https://github.com/carpedm20/deep-rl-tensorflow/blob/master/agents/statistic.py 3 | """ 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | import stable_baselines.common.tf_util as tf_util 9 | 10 | 11 | class Stats: 12 | 13 | def __init__(self, scalar_keys=None, histogram_keys=None): 14 | """ 15 | initialize the placeholders from the input keys, for summary logging 16 | 17 | :param scalar_keys: ([str]) the name of all the scalar inputs 18 | :param histogram_keys: ([str]) the name of all the histogram inputs 19 | """ 20 | if
scalar_keys is None: 21 | scalar_keys = [] 22 | if histogram_keys is None: 23 | histogram_keys = [] 24 | self.scalar_keys = scalar_keys 25 | self.histogram_keys = histogram_keys 26 | self.scalar_summaries = [] 27 | self.scalar_summaries_ph = [] 28 | self.histogram_summaries_ph = [] 29 | self.histogram_summaries = [] 30 | with tf.variable_scope('summary'): 31 | for key in scalar_keys: 32 | place_holder = tf.placeholder('float32', None, name=key + '.scalar.summary') 33 | string_summary = tf.summary.scalar(key + '.scalar.summary', place_holder) 34 | self.scalar_summaries_ph.append(place_holder) 35 | self.scalar_summaries.append(string_summary) 36 | for key in histogram_keys: 37 | place_holder = tf.placeholder('float32', None, name=key + '.histogram.summary') 38 | string_summary = tf.summary.scalar(key + '.histogram.summary', place_holder) 39 | self.histogram_summaries_ph.append(place_holder) 40 | self.histogram_summaries.append(string_summary) 41 | 42 | self.summaries = tf.summary.merge(self.scalar_summaries + self.histogram_summaries) 43 | 44 | def add_all_summary(self, writer, values, _iter): 45 | """ 46 | Note that the order of the incoming ```values``` should be the same as the that of the 47 | ```scalar_keys``` given in ```__init__``` 48 | 49 | :param writer: (TensorFlow FileWriter) the writer 50 | :param values: (TensorFlow Tensor or np.ndarray) the input for the summary run 51 | :param _iter: (Number) the global step value 52 | """ 53 | if np.sum(np.isnan(values) + 0) != 0: 54 | return 55 | sess = tf_util.get_session() 56 | keys = self.scalar_summaries_ph + self.histogram_summaries_ph 57 | feed_dict = {} 58 | for key, value in zip(keys, values): 59 | feed_dict.update({key: value}) 60 | summaries_str = sess.run(self.summaries, feed_dict) 61 | writer.add_summary(summaries_str, _iter) 62 | -------------------------------------------------------------------------------- /stable_baselines/her/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.her.her import HER 2 | -------------------------------------------------------------------------------- /stable_baselines/her/actor_critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from stable_baselines.her.util import mlp 4 | 5 | 6 | class ActorCritic: 7 | def __init__(self, inputs_tf, dim_obs, dim_goal, dim_action, 8 | max_u, o_stats, g_stats, hidden, layers, **kwargs): 9 | """The actor-critic network and related training code. 
10 | 11 | :param inputs_tf: ({str: TensorFlow Tensor}) all necessary inputs for the network: the 12 | observation (o), the goal (g), and the action (u) 13 | :param dim_obs: (int) the dimension of the observations 14 | :param dim_goal: (int) the dimension of the goals 15 | :param dim_action: (int) the dimension of the actions 16 | :param max_u: (float) the maximum magnitude of actions; action outputs will be scaled accordingly 17 | :param o_stats: (stable_baselines.her.Normalizer) the normalizer for observations 18 | :param g_stats: (stable_baselines.her.Normalizer) the normalizer for goals 19 | :param hidden: (int) the number of hidden units in each hidden layer 20 | :param layers: (int) the number of hidden layers 21 | """ 22 | self.inputs_tf = inputs_tf 23 | self.dim_obs = dim_obs 24 | self.dim_goal = dim_goal 25 | self.dim_action = dim_action 26 | self.max_u = max_u 27 | self.o_stats = o_stats 28 | self.g_stats = g_stats 29 | self.hidden = hidden 30 | self.layers = layers 31 | 32 | self.o_tf = inputs_tf['o'] 33 | self.g_tf = inputs_tf['g'] 34 | self.u_tf = inputs_tf['u'] 35 | 36 | # Prepare inputs for actor and critic. 37 | obs = self.o_stats.normalize(self.o_tf) 38 | goals = self.g_stats.normalize(self.g_tf) 39 | input_pi = tf.concat(axis=1, values=[obs, goals]) # for actor 40 | 41 | # Networks. 42 | with tf.variable_scope('pi'): 43 | self.pi_tf = self.max_u * tf.tanh(mlp( 44 | input_pi, [self.hidden] * self.layers + [self.dim_action])) 45 | with tf.variable_scope('Q'): 46 | # for policy training 47 | input_q = tf.concat(axis=1, values=[obs, goals, self.pi_tf / self.max_u]) 48 | self.q_pi_tf = mlp(input_q, [self.hidden] * self.layers + [1]) 49 | # for critic training 50 | input_q = tf.concat(axis=1, values=[obs, goals, self.u_tf / self.max_u]) 51 | self._input_q = input_q # exposed for tests 52 | self.q_tf = mlp(input_q, [self.hidden] * self.layers + [1], reuse=True) 53 | -------------------------------------------------------------------------------- /stable_baselines/her/experiment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/her/experiment/__init__.py -------------------------------------------------------------------------------- /stable_baselines/her/experiment/play.py: -------------------------------------------------------------------------------- 1 | import click 2 | import pickle 3 | 4 | import numpy as np 5 | 6 | from stable_baselines import logger 7 | from stable_baselines.common import set_global_seeds 8 | import stable_baselines.her.experiment.config as config 9 | from stable_baselines.her.rollout import RolloutWorker 10 | 11 | 12 | @click.command() 13 | @click.argument('policy_file', type=str) 14 | @click.option('--seed', type=int, default=0) 15 | @click.option('--n_test_rollouts', type=int, default=10) 16 | @click.option('--render', type=int, default=1) 17 | def main(policy_file, seed, n_test_rollouts, render): 18 | """ 19 | run HER from a saved policy 20 | 21 | :param policy_file: (str) pickle path to a saved policy 22 | :param seed: (int) initial seed 23 | :param n_test_rollouts: (int) the number of test rollouts 24 | :param render: (bool) if rendering should be done 25 | """ 26 | set_global_seeds(seed) 27 | 28 | # Load policy. 29 | with open(policy_file, 'rb') as file_handler: 30 | policy = pickle.load(file_handler) 31 | env_name = policy.info['env_name'] 32 | 33 | # Prepare params.
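# config.DEFAULT_PARAMS is used as the base configuration; any environment-specific
# overrides from config.DEFAULT_ENV_PARAMS are merged in before prepare_params() is called.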
34 | params = config.DEFAULT_PARAMS 35 | if env_name in config.DEFAULT_ENV_PARAMS: 36 | params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in 37 | params['env_name'] = env_name 38 | params = config.prepare_params(params) 39 | config.log_params(params, logger_input=logger) 40 | 41 | dims = config.configure_dims(params) 42 | 43 | eval_params = { 44 | 'exploit': True, 45 | 'use_target_net': params['test_with_polyak'], 46 | 'compute_q': True, 47 | 'rollout_batch_size': 1, 48 | 'render': bool(render), 49 | } 50 | 51 | for name in ['time_horizon', 'gamma', 'noise_eps', 'random_eps']: 52 | eval_params[name] = params[name] 53 | 54 | evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) 55 | evaluator.seed(seed) 56 | 57 | # Run evaluation. 58 | evaluator.clear_history() 59 | for _ in range(n_test_rollouts): 60 | evaluator.generate_rollouts() 61 | 62 | # record logs 63 | for key, val in evaluator.logs('test'): 64 | logger.record_tabular(key, np.mean(val)) 65 | logger.dump_tabular() 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /stable_baselines/her/experiment/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import seaborn as sns 8 | import glob2 9 | 10 | # Initialize seaborn 11 | sns.set() 12 | 13 | def smooth_reward_curve(x, y): 14 | """ 15 | smooth the reward curve 16 | 17 | :param x: (numpy float) the x coord of the reward 18 | :param y: (numpy float) the y coord of the reward 19 | :return: (numpy float, numpy float) smoothed x, smoothed y 20 | """ 21 | halfwidth = int(np.ceil(len(x) / 60)) # Halfwidth of our smoothing convolution 22 | k = halfwidth 23 | xsmoo = x 24 | ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='same') / np.convolve(np.ones_like(y), np.ones(2 * k + 1), 25 | mode='same') 26 | return xsmoo, ysmoo 27 | 28 | 29 | def load_results(file): 30 | """ 31 | load the results from a file 32 | 33 | :param file: (str) the saved results 34 | :return: (dict) the result 35 | """ 36 | if not os.path.exists(file): 37 | return None 38 | with open(file, 'r') as file_handler: 39 | lines = [line for line in file_handler] 40 | if len(lines) < 2: 41 | return None 42 | keys = [name.strip() for name in lines[0].split(',')] 43 | data = np.genfromtxt(file, delimiter=',', skip_header=1, filling_values=0.) 44 | if data.ndim == 1: 45 | data = data.reshape(1, -1) 46 | assert data.ndim == 2 47 | assert data.shape[-1] == len(keys) 48 | result = {} 49 | for idx, key in enumerate(keys): 50 | result[key] = data[:, idx] 51 | return result 52 | 53 | 54 | def pad(xs, value=np.nan): 55 | """ 56 | 57 | 58 | :param xs: 59 | :param value: 60 | :return: 61 | """ 62 | maxlen = np.max([len(x) for x in xs]) 63 | 64 | padded_xs = [] 65 | for x in xs: 66 | if x.shape[0] >= maxlen: 67 | padded_xs.append(x) 68 | 69 | padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value 70 | x_padded = np.concatenate([x, padding], axis=0) 71 | assert x_padded.shape[1:] == x.shape[1:] 72 | assert x_padded.shape[0] == maxlen 73 | padded_xs.append(x_padded) 74 | return np.array(padded_xs) 75 | 76 | 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('dir', type=str) 79 | parser.add_argument('--smooth', type=int, default=1) 80 | args = parser.parse_args() 81 | 82 | # Load all data. 
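# Every run directory under args.dir is expected to contain a progress.csv (per-epoch
# statistics) and a params.json (run configuration); runs with an empty or missing
# progress.csv are skipped.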
83 | data = {} 84 | paths = [os.path.abspath(os.path.join(path, '..')) for path in glob2.glob(os.path.join(args.dir, '**', 'progress.csv'))] 85 | for curr_path in paths: 86 | if not os.path.isdir(curr_path): 87 | continue 88 | results = load_results(os.path.join(curr_path, 'progress.csv')) 89 | if not results: 90 | print('skipping {}'.format(curr_path)) 91 | continue 92 | print('loading {} ({})'.format(curr_path, len(results['epoch']))) 93 | with open(os.path.join(curr_path, 'params.json'), 'r') as f: 94 | params = json.load(f) 95 | 96 | success_rate = np.array(results['test/success_rate']) 97 | epoch = np.array(results['epoch']) + 1 98 | env_id = params['env_name'] 99 | replay_strategy = params['replay_strategy'] 100 | 101 | if replay_strategy == 'future': 102 | config = 'her' 103 | else: 104 | config = 'ddpg' 105 | if 'Dense' in env_id: 106 | config += '-dense' 107 | else: 108 | config += '-sparse' 109 | env_id = env_id.replace('Dense', '') 110 | 111 | # Process and smooth data. 112 | assert success_rate.shape == epoch.shape 113 | x = epoch 114 | y = success_rate 115 | if args.smooth: 116 | x, y = smooth_reward_curve(epoch, success_rate) 117 | assert x.shape == y.shape 118 | 119 | if env_id not in data: 120 | data[env_id] = {} 121 | if config not in data[env_id]: 122 | data[env_id][config] = [] 123 | data[env_id][config].append((x, y)) 124 | 125 | # Plot data. 126 | for env_id in sorted(data.keys()): 127 | print('exporting {}'.format(env_id)) 128 | plt.clf() 129 | 130 | for config in sorted(data[env_id].keys()): 131 | xs, ys = zip(*data[env_id][config]) 132 | xs, ys = pad(xs), pad(ys) 133 | assert xs.shape == ys.shape 134 | 135 | plt.plot(xs[0], np.nanmedian(ys, axis=0), label=config) 136 | plt.fill_between(xs[0], np.nanpercentile(ys, 25, axis=0), np.nanpercentile(ys, 75, axis=0), alpha=0.25) 137 | plt.title(env_id) 138 | plt.xlabel('Epoch') 139 | plt.ylabel('Median Success Rate') 140 | plt.legend() 141 | plt.savefig(os.path.join(args.dir, 'fig_{}.png'.format(env_id))) 142 | -------------------------------------------------------------------------------- /stable_baselines/her/her.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | 5 | from stable_baselines.common import BaseRLModel, SetVerbosity 6 | from stable_baselines.common.policies import LstmPolicy, ActorCriticPolicy 7 | 8 | 9 | def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): 10 | """ 11 | Creates a sample function that can be used for HER experience replay. 12 | 13 | :param replay_strategy: (str) the HER replay strategy; if set to 'none', regular DDPG experience replay is used 14 | (can be 'future' or 'none'). 15 | :param replay_k: (int) the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times 16 | as many HER replays as regular replays are used) 17 | :param reward_fun: (function (dict, dict): float) function to re-compute the reward with substituted goals 18 | """ 19 | if replay_strategy == 'future': 20 | future_p = 1 - (1. / (1 + replay_k)) 21 | else: # 'replay_strategy' == 'none' 22 | future_p = 0 23 | 24 | def _sample_her_transitions(episode_batch, batch_size_in_transitions): 25 | """episode_batch is {key: array(buffer_size x T x dim_key)} 26 | """ 27 | time_horizon = episode_batch['u'].shape[1] 28 | rollout_batch_size = episode_batch['u'].shape[0] 29 | batch_size = batch_size_in_transitions 30 | 31 | # Select which episodes and time steps to use. 
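# With the 'future' strategy and e.g. replay_k = 4, future_p = 1 - 1/(1 + 4) = 0.8, so
# roughly 80% of the transitions sampled below get their goal replaced by an achieved goal
# from a later step of the same episode, and their reward is recomputed accordingly.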
32 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) 33 | t_samples = np.random.randint(time_horizon, size=batch_size) 34 | transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() 35 | for key in episode_batch.keys()} 36 | 37 | # Select future time indexes proportional with probability future_p. These 38 | # will be used for HER replay by substituting in future goals. 39 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) 40 | future_offset = np.random.uniform(size=batch_size) * (time_horizon - t_samples) 41 | future_offset = future_offset.astype(int) 42 | future_t = (t_samples + 1 + future_offset)[her_indexes] 43 | 44 | # Replace goal with achieved goal but only for the previously-selected 45 | # HER transitions (as defined by her_indexes). For the other transitions, 46 | # keep the original goal. 47 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t] 48 | transitions['g'][her_indexes] = future_ag 49 | 50 | # Reconstruct info dictionary for reward computation. 51 | info = {} 52 | for key, value in transitions.items(): 53 | if key.startswith('info_'): 54 | info[key.replace('info_', '')] = value 55 | 56 | # Re-compute reward since we may have substituted the goal. 57 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 58 | reward_params['info'] = info 59 | transitions['r'] = reward_fun(**reward_params) 60 | 61 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) 62 | for k in transitions.keys()} 63 | 64 | assert transitions['u'].shape[0] == batch_size_in_transitions 65 | 66 | return transitions 67 | 68 | return _sample_her_transitions 69 | 70 | 71 | class HER(BaseRLModel): 72 | def __init__(self, policy, env, verbose=0, _init_setup_model=True): 73 | super().__init__(policy=policy, env=env, verbose=verbose, policy_base=ActorCriticPolicy, requires_vec_env=False) 74 | 75 | self.policy = policy 76 | 77 | self.sess = None 78 | self.graph = None 79 | 80 | if _init_setup_model: 81 | self.setup_model() 82 | 83 | def setup_model(self): 84 | with SetVerbosity(self.verbose): 85 | 86 | assert isinstance(self.action_space, gym.spaces.Box), \ 87 | "Error: HER cannot output a {} action space, only spaces.Box is supported.".format(self.action_space) 88 | assert not issubclass(self.policy, LstmPolicy), "Error: cannot use a recurrent policy for the HER model." 89 | assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the HER model must be an " \ 90 | "instance of common.policies.ActorCriticPolicy." 
91 | 92 | self.graph = tf.Graph() 93 | with self.graph.as_default(): 94 | pass 95 | 96 | def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="HER"): 97 | with SetVerbosity(self.verbose): 98 | self._setup_learn(seed) 99 | 100 | return self 101 | 102 | def predict(self, observation, state=None, mask=None, deterministic=False): 103 | pass 104 | 105 | def action_probability(self, observation, state=None, mask=None): 106 | pass 107 | 108 | def save(self, save_path): 109 | pass 110 | 111 | @classmethod 112 | def load(cls, load_path, env=None, **kwargs): 113 | pass 114 | -------------------------------------------------------------------------------- /stable_baselines/her/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import numpy as np 4 | 5 | 6 | class ReplayBuffer: 7 | def __init__(self, buffer_shapes, size_in_transitions, time_horizon, sample_transitions): 8 | """ 9 | Creates a replay buffer. 10 | 11 | :param buffer_shapes: ({str: int}) the shape for all buffers that are used in the replay buffer 12 | :param size_in_transitions: (int) the size of the buffer, measured in transitions 13 | :param time_horizon: (int) the time horizon for episodes 14 | :param sample_transitions: (function) a function that samples from the replay buffer 15 | """ 16 | self.buffer_shapes = buffer_shapes 17 | self.size = size_in_transitions // time_horizon 18 | self.time_horizon = time_horizon 19 | self.sample_transitions = sample_transitions 20 | 21 | # self.buffers is {key: array(size_in_episodes x T or T+1 x dim_key)} 22 | self.buffers = {key: np.empty([self.size, *shape]) 23 | for key, shape in buffer_shapes.items()} 24 | 25 | # memory management 26 | self.current_size = 0 27 | self.n_transitions_stored = 0 28 | 29 | self.lock = threading.Lock() 30 | 31 | @property 32 | def full(self): 33 | with self.lock: 34 | return self.current_size == self.size 35 | 36 | def sample(self, batch_size): 37 | """ 38 | sample random transitions 39 | 40 | :param batch_size: (int) How many transitions to sample. 
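Besides the stored keys, the returned dict also contains the next-step views 'o_2' and 'ag_2' and the recomputed reward 'r' (checked by the assertion below).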
41 | :return: (dict) {key: array(batch_size x shapes[key])} 42 | """ 43 | buffers = {} 44 | 45 | with self.lock: 46 | assert self.current_size > 0 47 | for key in self.buffers.keys(): 48 | buffers[key] = self.buffers[key][:self.current_size] 49 | 50 | buffers['o_2'] = buffers['o'][:, 1:, :] 51 | buffers['ag_2'] = buffers['ag'][:, 1:, :] 52 | 53 | transitions = self.sample_transitions(buffers, batch_size) 54 | 55 | for key in (['r', 'o_2', 'ag_2'] + list(self.buffers.keys())): 56 | assert key in transitions, "key %s missing from transitions" % key 57 | 58 | return transitions 59 | 60 | def store_episode(self, episode_batch): 61 | """ 62 | Store an episode in the replay buffer 63 | 64 | :param episode_batch: (np.ndarray) batch_size x (T or T+1) x dim_key 65 | """ 66 | batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] 67 | assert np.all(np.array(batch_sizes) == batch_sizes[0]) 68 | batch_size = batch_sizes[0] 69 | 70 | with self.lock: 71 | idxs = self._get_storage_idx(batch_size) 72 | 73 | # load inputs into buffers 74 | for key in self.buffers.keys(): 75 | self.buffers[key][idxs] = episode_batch[key] 76 | 77 | self.n_transitions_stored += batch_size * self.time_horizon 78 | 79 | def get_current_episode_size(self): 80 | """ 81 | get current episode size 82 | 83 | :return: (int) the current size of the episode 84 | """ 85 | with self.lock: 86 | return self.current_size 87 | 88 | def get_current_size(self): 89 | """ 90 | get current size of the buffer 91 | 92 | :return: (int) the current size of the buffer 93 | """ 94 | with self.lock: 95 | return self.current_size * self.time_horizon 96 | 97 | def get_transitions_stored(self): 98 | """ 99 | get the number of stored transitions 100 | 101 | :return: (int) the number of transitions stored 102 | """ 103 | with self.lock: 104 | return self.n_transitions_stored 105 | 106 | def clear_buffer(self): 107 | """ 108 | clear the buffer of all entries 109 | """ 110 | with self.lock: 111 | self.current_size = 0 112 | 113 | def _get_storage_idx(self, inc=None): 114 | inc = inc or 1 # size increment 115 | assert inc <= self.size, "Batch committed to replay is too large!" 116 | # go consecutively until you hit the end, and then go randomly. 117 | if self.current_size + inc <= self.size: 118 | idx = np.arange(self.current_size, self.current_size + inc) 119 | elif self.current_size < self.size: 120 | overflow = inc - (self.size - self.current_size) 121 | idx_a = np.arange(self.current_size, self.size) 122 | idx_b = np.random.randint(0, self.current_size, overflow) 123 | idx = np.concatenate([idx_a, idx_b]) 124 | else: 125 | idx = np.random.randint(0, self.size, inc) 126 | 127 | # update replay size 128 | self.current_size = min(self.size, self.current_size + inc) 129 | 130 | if inc == 1: 131 | idx = idx[0] 132 | return idx 133 | -------------------------------------------------------------------------------- /stable_baselines/her/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import importlib 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | from mpi4py import MPI 9 | 10 | from stable_baselines.common import tf_util 11 | 12 | 13 | def import_function(spec): 14 | """ 15 | Import a function identified by a string like "pkg.module:fn_name". 
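For example, import_function("numpy:mean") would return the numpy.mean function.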
16 | 17 | :param spec: (str) the function to import 18 | :return: (function) 19 | """ 20 | mod_name, fn_name = spec.split(':') 21 | module = importlib.import_module(mod_name) 22 | func = getattr(module, fn_name) 23 | return func 24 | 25 | 26 | def flatten_grads(var_list, grads): 27 | """ 28 | Flattens a variables and their gradients. 29 | 30 | :param var_list: ([TensorFlow Tensor]) the variables 31 | :param grads: ([TensorFlow Tensor]) the gradients 32 | :return: (TensorFlow Tensor) the flattend variable and gradient 33 | """ 34 | return tf.concat([tf.reshape(grad, [tf_util.numel(v)]) 35 | for (v, grad) in zip(var_list, grads)], 0) 36 | 37 | 38 | def mlp(_input, layers_sizes, reuse=None, flatten=False, name=""): 39 | """ 40 | Creates a simple fully-connected neural network 41 | 42 | :param _input: (TensorFlow Tensor) the input 43 | :param layers_sizes: ([int]) the hidden layers 44 | :param reuse: (bool) Enable reuse of the network 45 | :param flatten: (bool) flatten the network output 46 | :param name: (str) the name of the network 47 | :return: (TensorFlow Tensor) the network 48 | """ 49 | for i, size in enumerate(layers_sizes): 50 | activation = tf.nn.relu if i < len(layers_sizes) - 1 else None 51 | _input = tf.layers.dense(inputs=_input, 52 | units=size, 53 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 54 | reuse=reuse, 55 | name=name + '_' + str(i)) 56 | if activation: 57 | _input = activation(_input) 58 | if flatten: 59 | assert layers_sizes[-1] == 1 60 | _input = tf.reshape(_input, [-1]) 61 | return _input 62 | 63 | 64 | def install_mpi_excepthook(): 65 | """ 66 | setup the MPI exception hooks 67 | """ 68 | old_hook = sys.excepthook 69 | 70 | def new_hook(a, b, c): 71 | old_hook(a, b, c) 72 | sys.stdout.flush() 73 | sys.stderr.flush() 74 | MPI.COMM_WORLD.Abort() 75 | 76 | sys.excepthook = new_hook 77 | 78 | 79 | def mpi_fork(rank, extra_mpi_args=None): 80 | """ 81 | Re-launches the current script with workers 82 | Returns "parent" for original parent, "child" for MPI children 83 | 84 | :param rank: (int) the thread rank 85 | :param extra_mpi_args: (dict) extra arguments for MPI 86 | :return: (str) the correct type of thread name 87 | """ 88 | if extra_mpi_args is None: 89 | extra_mpi_args = [] 90 | 91 | if rank <= 1: 92 | return "child" 93 | if os.getenv("IN_MPI") is None: 94 | env = os.environ.copy() 95 | env.update( 96 | MKL_NUM_THREADS="1", 97 | OMP_NUM_THREADS="1", 98 | IN_MPI="1" 99 | ) 100 | # "-bind-to core" is crucial for good performance 101 | args = ["mpirun", "-np", str(rank)] + \ 102 | extra_mpi_args + \ 103 | [sys.executable] 104 | 105 | args += sys.argv 106 | subprocess.check_call(args, env=env) 107 | return "parent" 108 | else: 109 | install_mpi_excepthook() 110 | return "child" 111 | 112 | 113 | def convert_episode_to_batch_major(episode): 114 | """ 115 | Converts an episode to have the batch dimension in the major (first) dimension. 116 | 117 | :param episode: (dict) the episode batch 118 | :return: (dict) the episode batch with he batch dimension in the major (first) dimension. 119 | """ 120 | episode_batch = {} 121 | for key in episode.keys(): 122 | val = np.array(episode[key]).copy() 123 | # make inputs batch-major instead of time-major 124 | episode_batch[key] = val.swapaxes(0, 1) 125 | 126 | return episode_batch 127 | 128 | 129 | def transitions_in_episode_batch(episode_batch): 130 | """ 131 | Number of transitions in a given episode batch. 
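(Computed as the number of episodes times the episode length, i.e. the first two dimensions of the action array episode_batch['u'].)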
132 | 133 | :param episode_batch: (dict) the episode batch 134 | :return: (int) the number of transitions in episode batch 135 | """ 136 | shape = episode_batch['u'].shape 137 | return shape[0] * shape[1] 138 | 139 | 140 | def reshape_for_broadcasting(source, target): 141 | """ 142 | Reshapes a tensor (source) to have the correct shape and dtype of the target before broadcasting it with MPI. 143 | 144 | :param source: (TensorFlow Tensor) the input tensor 145 | :param target: (TensorFlow Tensor) the target tensor 146 | :return: (TensorFlow Tensor) the rehshaped tensor 147 | """ 148 | dim = len(target.get_shape()) 149 | shape = ([1] * (dim - 1)) + [-1] 150 | return tf.reshape(tf.cast(source, target.dtype), shape) 151 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.ppo1.pposgd_simple import PPO1 2 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/experiments/train_cartpole.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple test to check that PPO1 is running with no errors (see issue #50) 3 | """ 4 | from stable_baselines import PPO1 5 | 6 | 7 | if __name__ == '__main__': 8 | model = PPO1('MlpPolicy', 'CartPole-v1', schedule='linear', verbose=0) 9 | model.learn(total_timesteps=10000) 10 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/mlp_policy.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from stable_baselines.common.input import observation_input 4 | from stable_baselines.common.distributions import make_proba_dist_type 5 | 6 | 7 | class BasePolicy(object): 8 | def __init__(self, placeholders=None): 9 | """ 10 | A base policy object for PPO1 11 | 12 | :param placeholders: (dict) To feed existing placeholders if needed 13 | """ 14 | super(BasePolicy, self).__init__() 15 | self.sess = None 16 | self.pdtype = None 17 | self._act = None 18 | self.scope = None 19 | self.obs_ph = None 20 | self.stochastic_ph = None 21 | self.processed_x = None 22 | 23 | if placeholders is not None: 24 | self.obs_ph = placeholders.get("obs", None) 25 | self.processed_x = placeholders.get("processed_obs", None) 26 | self.stochastic_ph = placeholders.get("stochastic", None) 27 | 28 | def get_obs_and_pdtype(self, ob_space, ac_space): 29 | """ 30 | Initialize probability distribution and get observation placeholder. 
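Returns the observation placeholder together with the probability distribution type (pdtype) built from the action space.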
31 | 32 | :param ob_space: (Gym Spaces) the observation space 33 | :param ac_space: (Gym Spaces) the action space 34 | """ 35 | self.pdtype = pdtype = make_proba_dist_type(ac_space) 36 | 37 | if self.obs_ph is None: 38 | self.obs_ph, self.processed_x = observation_input(ob_space) 39 | else: 40 | assert self.processed_x is not None 41 | 42 | return self.obs_ph, pdtype 43 | 44 | def act(self, stochastic, obs): 45 | """ 46 | Get the action from the policy, using the observation 47 | 48 | :param stochastic: (bool) whether or not to use a stochastic or deterministic policy 49 | :param obs: (TensorFlow Tensor or np.ndarray) the observation 50 | :return: (np.ndarray, np.ndarray) the action and value function 51 | """ 52 | ac1, vpred1 = self._act(stochastic, obs[None], sess=self.sess) 53 | return ac1[0], vpred1[0] 54 | 55 | def get_variables(self): 56 | """ 57 | Get all the policy's variables 58 | 59 | :return: ([TensorFlow Tensor]) the variables of the network 60 | """ 61 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 62 | 63 | def get_trainable_variables(self): 64 | """ 65 | Get the policy's trainable variables 66 | 67 | :return: ([TensorFlow Tensor]) the trainable variables of the network 68 | """ 69 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 70 | 71 | @classmethod 72 | def get_initial_state(cls): 73 | """ 74 | Get the initial state 75 | 76 | :return: ([np.ndarray]) the initial state 77 | """ 78 | return [] 79 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/run_atari.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | from mpi4py import MPI 5 | 6 | from stable_baselines.common import set_global_seeds 7 | from stable_baselines import bench, logger 8 | from stable_baselines.common.atari_wrappers import make_atari, wrap_deepmind 9 | from stable_baselines.common.cmd_util import atari_arg_parser 10 | from stable_baselines.common.policies import CnnPolicy 11 | from stable_baselines.ppo1 import PPO1 12 | 13 | 14 | def train(env_id, num_timesteps, seed): 15 | """ 16 | Train PPO1 model for Atari environments, for testing purposes 17 | 18 | :param env_id: (str) Environment ID 19 | :param num_timesteps: (int) The total number of samples 20 | :param seed: (int) The initial seed for training 21 | """ 22 | rank = MPI.COMM_WORLD.Get_rank() 23 | 24 | if rank == 0: 25 | logger.configure() 26 | else: 27 | logger.configure(format_strs=[]) 28 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() 29 | set_global_seeds(workerseed) 30 | env = make_atari(env_id) 31 | 32 | env = bench.Monitor(env, logger.get_dir() and 33 | os.path.join(logger.get_dir(), str(rank))) 34 | env.seed(workerseed) 35 | 36 | env = wrap_deepmind(env) 37 | env.seed(workerseed) 38 | 39 | model = PPO1(CnnPolicy, env, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4, 40 | optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2) 41 | model.learn(total_timesteps=num_timesteps) 42 | env.close() 43 | 44 | 45 | def main(): 46 | """ 47 | Runs the test 48 | """ 49 | args = atari_arg_parser().parse_args() 50 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/run_humanoid.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | import gym 5 | 6 | from stable_baselines.ppo1 import PPO1 7 | from stable_baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 8 | from stable_baselines.common import tf_util 9 | from stable_baselines.common.policies import MlpPolicy 10 | from stable_baselines import logger 11 | 12 | 13 | def train(num_timesteps, seed, model_path=None): 14 | """ 15 | Train PPO1 model for the Humanoid environment, for testing purposes 16 | 17 | :param num_timesteps: (int) The total number of samples 18 | :param seed: (int) The initial seed for training 19 | :param model_path: (str) path to the model 20 | """ 21 | env_id = 'Humanoid-v2' 22 | 23 | env = make_mujoco_env(env_id, seed) 24 | 25 | # parameters below were the best found in a simple random search 26 | # these are good enough to make humanoid walk, but whether those are 27 | # an absolute best or not is not certain 28 | env = RewScale(env, 0.1) 29 | model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, 30 | optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear') 31 | model.learn(total_timesteps=num_timesteps) 32 | env.close() 33 | if model_path: 34 | tf_util.save_state(model_path) 35 | 36 | return model 37 | 38 | 39 | class RewScale(gym.RewardWrapper): 40 | def __init__(self, env, scale): 41 | gym.RewardWrapper.__init__(self, env) 42 | self.scale = scale 43 | 44 | def reward(self, _reward): 45 | return _reward * self.scale 46 | 47 | 48 | def main(): 49 | """ 50 | Runs the test 51 | """ 52 | logger.configure() 53 | parser = mujoco_arg_parser() 54 | parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) 55 | parser.set_defaults(num_timesteps=int(2e7)) 56 | 57 | args = parser.parse_args() 58 | 59 | if not args.play: 60 | # train the model 61 | train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path) 62 | else: 63 | # construct the model object, load pre-trained model and render 64 | model = train(num_timesteps=1, seed=args.seed) 65 | tf_util.load_state(args.model_path) 66 | env = make_mujoco_env('Humanoid-v2', seed=0) 67 | 68 | obs = env.reset() 69 | while True: 70 | action = model.policy.act(stochastic=False, obs=obs)[0] 71 | obs, _, done, _ = env.step(action) 72 | env.render() 73 | if done: 74 | obs = env.reset() 75 | 76 | 77 | if __name__ == '__main__': 78 | main() 79 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from stable_baselines.ppo1 import PPO1 4 | from stable_baselines.common.policies import MlpPolicy 5 | from stable_baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 6 | from stable_baselines import logger 7 | 8 | 9 | def train(env_id, num_timesteps, seed): 10 | """ 11 | Train PPO1 model for the Mujoco environment, for testing purposes 12 | 13 | :param env_id: (str) Environment ID 14 | :param num_timesteps: (int) The total number of samples 15 | :param seed: (int) The initial seed for training 16 | """ 17 | env = make_mujoco_env(env_id, seed) 18 | model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, 19 | optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear') 20 | 
model.learn(total_timesteps=num_timesteps) 21 | env.close() 22 | 23 | 24 | def main(): 25 | """ 26 | Runs the test 27 | """ 28 | args = mujoco_arg_parser().parse_args() 29 | logger.configure() 30 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/run_robotics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from mpi4py import MPI 4 | import mujoco_py 5 | 6 | from stable_baselines.common import set_global_seeds 7 | from stable_baselines.common.policies import MlpPolicy 8 | from stable_baselines.common.cmd_util import make_robotics_env, robotics_arg_parser 9 | from stable_baselines.ppo1 import PPO1 10 | 11 | 12 | def train(env_id, num_timesteps, seed): 13 | """ 14 | Train PPO1 model for Robotics environment, for testing purposes 15 | 16 | :param env_id: (str) Environment ID 17 | :param num_timesteps: (int) The total number of samples 18 | :param seed: (int) The initial seed for training 19 | """ 20 | 21 | rank = MPI.COMM_WORLD.Get_rank() 22 | with mujoco_py.ignore_mujoco_warnings(): 23 | workerseed = seed + 10000 * rank 24 | set_global_seeds(workerseed) 25 | env = make_robotics_env(env_id, workerseed, rank=rank) 26 | 27 | model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=5, 28 | optim_stepsize=3e-4, optim_batchsize=256, gamma=0.99, lam=0.95, schedule='linear') 29 | model.learn(total_timesteps=num_timesteps) 30 | env.close() 31 | 32 | 33 | def main(): 34 | """ 35 | Runs the test 36 | """ 37 | args = robotics_arg_parser().parse_args() 38 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /stable_baselines/ppo2/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.ppo2.ppo2 import PPO2 2 | -------------------------------------------------------------------------------- /stable_baselines/ppo2/run_atari.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from stable_baselines import logger 3 | from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser 4 | from stable_baselines.common.vec_env.vec_frame_stack import VecFrameStack 5 | from stable_baselines.ppo2 import PPO2 6 | from stable_baselines.common.policies import CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy, MlpPolicy 7 | 8 | 9 | def train(env_id, num_timesteps, seed, policy): 10 | """ 11 | Train PPO2 model for atari environment, for testing purposes 12 | 13 | :param env_id: (str) the environment id string 14 | :param num_timesteps: (int) the number of timesteps to run 15 | :param seed: (int) Used to seed the random generator. 16 | :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) 
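(In this script the argument is actually the string 'cnn', 'lstm', 'lnlstm' or 'mlp', which is mapped to the matching policy class just below.)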
17 | """ 18 | 19 | env = VecFrameStack(make_atari_env(env_id, 8, seed), 4) 20 | policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy, 'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy] 21 | model = PPO2(policy=policy, env=env, n_steps=128, nminibatches=4, lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01, 22 | learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1) 23 | model.learn(total_timesteps=num_timesteps) 24 | 25 | 26 | def main(): 27 | """ 28 | Runs the test 29 | """ 30 | parser = atari_arg_parser() 31 | parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn') 32 | args = parser.parse_args() 33 | logger.configure() 34 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, 35 | policy=args.policy) 36 | 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /stable_baselines/ppo2/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import numpy as np 3 | import gym 4 | 5 | from stable_baselines.common.cmd_util import mujoco_arg_parser 6 | from stable_baselines import bench, logger 7 | from stable_baselines.common import set_global_seeds 8 | from stable_baselines.common.vec_env.vec_normalize import VecNormalize 9 | from stable_baselines.ppo2 import PPO2 10 | from stable_baselines.common.policies import MlpPolicy 11 | from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv 12 | 13 | 14 | def train(env_id, num_timesteps, seed): 15 | """ 16 | Train PPO2 model for Mujoco environment, for testing purposes 17 | 18 | :param env_id: (str) the environment id string 19 | :param num_timesteps: (int) the number of timesteps to run 20 | :param seed: (int) Used to seed the random generator. 
21 | """ 22 | def make_env(): 23 | env_out = gym.make(env_id) 24 | env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) 25 | return env_out 26 | 27 | env = DummyVecEnv([make_env]) 28 | env = VecNormalize(env) 29 | 30 | set_global_seeds(seed) 31 | policy = MlpPolicy 32 | model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99, noptepochs=10, 33 | ent_coef=0.0, learning_rate=3e-4, cliprange=0.2) 34 | model.learn(total_timesteps=num_timesteps) 35 | 36 | return model, env 37 | 38 | 39 | def main(): 40 | """ 41 | Runs the test 42 | """ 43 | args = mujoco_arg_parser().parse_args() 44 | logger.configure() 45 | model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 46 | 47 | if args.play: 48 | logger.log("Running trained model") 49 | obs = np.zeros((env.num_envs,) + env.observation_space.shape) 50 | obs[:] = env.reset() 51 | while True: 52 | actions = model.step(obs)[0] 53 | obs[:] = env.step(actions)[0] 54 | env.render() 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /stable_baselines/results_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | import matplotlib.pyplot as plt 4 | 5 | from stable_baselines.bench.monitor import load_results 6 | 7 | # matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode 8 | plt.rcParams['svg.fonttype'] = 'none' 9 | 10 | X_TIMESTEPS = 'timesteps' 11 | X_EPISODES = 'episodes' 12 | X_WALLTIME = 'walltime_hrs' 13 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 14 | EPISODES_WINDOW = 100 15 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 16 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 17 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] 18 | 19 | 20 | def rolling_window(array, window): 21 | """ 22 | apply a rolling window to a np.ndarray 23 | 24 | :param array: (np.ndarray) the input Array 25 | :param window: (int) length of the rolling window 26 | :return: (np.ndarray) rolling window on the input array 27 | """ 28 | shape = array.shape[:-1] + (array.shape[-1] - window + 1, window) 29 | strides = array.strides + (array.strides[-1],) 30 | return np.lib.stride_tricks.as_strided(array, shape=shape, strides=strides) 31 | 32 | 33 | def window_func(var_1, var_2, window, func): 34 | """ 35 | apply a function to the rolling window of 2 arrays 36 | 37 | :param var_1: (np.ndarray) variable 1 38 | :param var_2: (np.ndarray) variable 2 39 | :param window: (int) length of the rolling window 40 | :param func: (numpy function) function to apply on the rolling window on variable 2 (such as np.mean) 41 | :return: (np.ndarray, np.ndarray) the rolling output with applied function 42 | """ 43 | var_2_window = rolling_window(var_2, window) 44 | function_on_var2 = func(var_2_window, axis=-1) 45 | return var_1[window - 1:], function_on_var2 46 | 47 | 48 | def ts2xy(timesteps, xaxis): 49 | """ 50 | Decompose a timesteps variable to x ans ys 51 | 52 | :param timesteps: (Pandas DataFrame) the input data 53 | :param xaxis: (str) the axis for the x and y output 54 | (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') 55 | :return: (np.ndarray, np.ndarray) the x and y output 56 | """ 57 | if xaxis == X_TIMESTEPS: 58 | x_var = np.cumsum(timesteps.l.values) 59 | y_var = 
timesteps.r.values 60 | elif xaxis == X_EPISODES: 61 | x_var = np.arange(len(timesteps)) 62 | y_var = timesteps.r.values 63 | elif xaxis == X_WALLTIME: 64 | x_var = timesteps.t.values / 3600. 65 | y_var = timesteps.r.values 66 | else: 67 | raise NotImplementedError 68 | return x_var, y_var 69 | 70 | 71 | def plot_curves(xy_list, xaxis, title): 72 | """ 73 | plot the curves 74 | 75 | :param xy_list: ([(np.ndarray, np.ndarray)]) the x and y coordinates to plot 76 | :param xaxis: (str) the axis for the x and y output 77 | (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') 78 | :param title: (str) the title of the plot 79 | """ 80 | 81 | plt.figure(figsize=(8, 2)) 82 | maxx = max(xy[0][-1] for xy in xy_list) 83 | minx = 0 84 | for (i, (x, y)) in enumerate(xy_list): 85 | color = COLORS[i] 86 | plt.scatter(x, y, s=2) 87 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) # So returns average of last EPISODE_WINDOW episodes 88 | plt.plot(x, y_mean, color=color) 89 | plt.xlim(minx, maxx) 90 | plt.title(title) 91 | plt.xlabel(xaxis) 92 | plt.ylabel("Episode Rewards") 93 | plt.tight_layout() 94 | 95 | 96 | def plot_results(dirs, num_timesteps, xaxis, task_name): 97 | """ 98 | plot the results 99 | 100 | :param dirs: ([str]) the save location of the results to plot 101 | :param num_timesteps: (int) only plot the points below this value 102 | :param xaxis: (str) the axis for the x and y output 103 | (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') 104 | :param task_name: (str) the title of the task to plot 105 | """ 106 | 107 | tslist = [] 108 | for folder in dirs: 109 | timesteps = load_results(folder) 110 | timesteps = timesteps[timesteps.l.cumsum() <= num_timesteps] 111 | tslist.append(timesteps) 112 | xy_list = [ts2xy(timesteps_item, xaxis) for timesteps_item in tslist] 113 | plot_curves(xy_list, xaxis, task_name) 114 | 115 | 116 | def main(): 117 | """ 118 | Example usage in jupyter-notebook 119 | 120 | .. 
code-block:: python 121 | 122 | from stable_baselines import log_viewer 123 | %matplotlib inline 124 | log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout") 125 | 126 | Here ./log is a directory containing the monitor.csv files 127 | """ 128 | import argparse 129 | import os 130 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 131 | parser.add_argument('--dirs', help='List of log directories', nargs='*', default=['./log']) 132 | parser.add_argument('--num_timesteps', type=int, default=int(10e6)) 133 | parser.add_argument('--xaxis', help='Varible on X-axis', default=X_TIMESTEPS) 134 | parser.add_argument('--task_name', help='Title of plot', default='Breakout') 135 | args = parser.parse_args() 136 | args.dirs = [os.path.abspath(folder) for folder in args.dirs] 137 | plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name) 138 | plt.show() 139 | 140 | 141 | if __name__ == '__main__': 142 | main() 143 | -------------------------------------------------------------------------------- /stable_baselines/trpo_mpi/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.trpo_mpi.trpo_mpi import TRPO 2 | -------------------------------------------------------------------------------- /stable_baselines/trpo_mpi/run_atari.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | from mpi4py import MPI 5 | 6 | from stable_baselines.common import set_global_seeds 7 | from stable_baselines import bench, logger 8 | from stable_baselines.common.atari_wrappers import make_atari, wrap_deepmind 9 | from stable_baselines.common.cmd_util import atari_arg_parser 10 | from stable_baselines.common.policies import CnnPolicy 11 | # from stable_baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy 12 | from stable_baselines.trpo_mpi import TRPO 13 | 14 | 15 | def train(env_id, num_timesteps, seed): 16 | """ 17 | Train TRPO model for the atari environment, for testing purposes 18 | 19 | :param env_id: (str) Environment ID 20 | :param num_timesteps: (int) The total number of samples 21 | :param seed: (int) The initial seed for training 22 | """ 23 | rank = MPI.COMM_WORLD.Get_rank() 24 | 25 | if rank == 0: 26 | logger.configure() 27 | else: 28 | logger.configure(format_strs=[]) 29 | 30 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() 31 | set_global_seeds(workerseed) 32 | env = make_atari(env_id) 33 | 34 | # def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): # pylint: disable=W0613 35 | # return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, placeholders=placeholders) 36 | 37 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) 38 | env.seed(workerseed) 39 | 40 | env = wrap_deepmind(env) 41 | env.seed(workerseed) 42 | 43 | model = TRPO(CnnPolicy, env, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, entcoeff=0.0, 44 | gamma=0.98, lam=1, vf_iters=3, vf_stepsize=1e-4) 45 | model.learn(total_timesteps=int(num_timesteps * 1.1)) 46 | env.close() 47 | 48 | 49 | def main(): 50 | """ 51 | Runs the test 52 | """ 53 | args = atari_arg_parser().parse_args() 54 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /stable_baselines/trpo_mpi/run_mujoco.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # noinspection PyUnresolvedReferences 3 | from mpi4py import MPI 4 | 5 | from stable_baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 6 | from stable_baselines.common.policies import MlpPolicy 7 | from stable_baselines import logger 8 | from stable_baselines.trpo_mpi import TRPO 9 | import stable_baselines.common.tf_util as tf_util 10 | 11 | 12 | def train(env_id, num_timesteps, seed): 13 | """ 14 | Train TRPO model for the mujoco environment, for testing purposes 15 | 16 | :param env_id: (str) Environment ID 17 | :param num_timesteps: (int) The total number of samples 18 | :param seed: (int) The initial seed for training 19 | """ 20 | with tf_util.single_threaded_session(): 21 | rank = MPI.COMM_WORLD.Get_rank() 22 | if rank == 0: 23 | logger.configure() 24 | else: 25 | logger.configure(format_strs=[]) 26 | logger.set_level(logger.DISABLED) 27 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() 28 | 29 | env = make_mujoco_env(env_id, workerseed) 30 | model = TRPO(MlpPolicy, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, entcoeff=0.0, 31 | gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) 32 | model.learn(total_timesteps=num_timesteps) 33 | env.close() 34 | 35 | 36 | def main(): 37 | """ 38 | Runs the test 39 | """ 40 | args = mujoco_arg_parser().parse_args() 41 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 42 | 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /stable_baselines/trpo_mpi/utils.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | from stable_baselines.common.vec_env import VecEnv 5 | 6 | 7 | def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False): 8 | """ 9 | Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) 10 | 11 | :param policy: (MLPPolicy) the policy 12 | :param env: (Gym Environment) the environment 13 | :param horizon: (int) the number of timesteps to run per batch 14 | :param reward_giver: (TransitionClassifier) the reward predicter from obsevation and action 15 | :param gail: (bool) Whether we are using this generator for standard trpo or with gail 16 | :return: (dict) generator that returns a dict with the following keys: 17 | 18 | - ob: (np.ndarray) observations 19 | - rew: (numpy float) rewards (if gail is used it is the predicted reward) 20 | - vpred: (numpy float) action logits 21 | - new: (numpy bool) dones (is end of episode) 22 | - ac: (np.ndarray) actions 23 | - prevac: (np.ndarray) previous actions 24 | - nextvpred: (numpy float) next action logits 25 | - ep_rets: (float) cumulated current episode reward 26 | - ep_lens: (int) the length of the current episode 27 | - ep_true_rets: (float) the real environment reward 28 | """ 29 | # Check when using GAIL 30 | assert not (gail and reward_giver is None), "You must pass a reward giver when using GAIL" 31 | 32 | # Initialize state variables 33 | step = 0 34 | action = env.action_space.sample() # not used, just so we have the datatype 35 | new = True 36 | observation = env.reset() 37 | 38 | cur_ep_ret = 0 # return in current episode 39 | cur_ep_len = 0 # len of current episode 40 | cur_ep_true_ret = 0 41 | ep_true_rets = [] 42 | ep_rets = [] # returns of completed episodes in this segment 43 | ep_lens = [] # Episode 
lengths 44 | 45 | # Initialize history arrays 46 | observations = np.array([observation for _ in range(horizon)]) 47 | true_rews = np.zeros(horizon, 'float32') 48 | rews = np.zeros(horizon, 'float32') 49 | vpreds = np.zeros(horizon, 'float32') 50 | dones = np.zeros(horizon, 'int32') 51 | actions = np.array([action for _ in range(horizon)]) 52 | prev_actions = actions.copy() 53 | states = policy.initial_state 54 | done = None 55 | 56 | while True: 57 | prevac = action 58 | action, vpred, states, _ = policy.step(observation.reshape(-1, *observation.shape), states, done) 59 | # Slight weirdness here because we need value function at time T 60 | # before returning segment [0, T-1] so we get the correct 61 | # terminal value 62 | if step > 0 and step % horizon == 0: 63 | # Fix to avoid "mean of empty slice" warning when there is only one episode 64 | if len(ep_rets) == 0: 65 | ep_rets = [cur_ep_ret] 66 | ep_lens = [cur_ep_len] 67 | ep_true_rets = [cur_ep_true_ret] 68 | total_timesteps = cur_ep_len 69 | else: 70 | total_timesteps = sum(ep_lens) + cur_ep_len 71 | 72 | yield {"ob": observations, "rew": rews, "dones": dones, "true_rew": true_rews, "vpred": vpreds, 73 | "ac": actions, "prevac": prev_actions, "nextvpred": vpred * (1 - new), "ep_rets": ep_rets, 74 | "ep_lens": ep_lens, "ep_true_rets": ep_true_rets, "total_timestep": total_timesteps} 75 | _, vpred, _, _ = policy.step(observation.reshape(-1, *observation.shape)) 76 | # Be careful!!! if you change the downstream algorithm to aggregate 77 | # several of these batches, then be sure to do a deepcopy 78 | ep_rets = [] 79 | ep_true_rets = [] 80 | ep_lens = [] 81 | i = step % horizon 82 | observations[i] = observation 83 | vpreds[i] = vpred[0] 84 | actions[i] = action[0] 85 | prev_actions[i] = prevac 86 | 87 | clipped_action = action 88 | # Clip the actions to avoid out of bound error 89 | if isinstance(env.action_space, gym.spaces.Box): 90 | clipped_action = np.clip(action, env.action_space.low, env.action_space.high) 91 | 92 | if gail: 93 | rew = reward_giver.get_reward(observation, clipped_action[0]) 94 | observation, true_rew, done, _info = env.step(clipped_action[0]) 95 | else: 96 | observation, rew, done, _info = env.step(clipped_action[0]) 97 | true_rew = rew 98 | rews[i] = rew 99 | true_rews[i] = true_rew 100 | dones[i] = done 101 | 102 | cur_ep_ret += rew 103 | cur_ep_true_ret += true_rew 104 | cur_ep_len += 1 105 | if done: 106 | ep_rets.append(cur_ep_ret) 107 | ep_true_rets.append(cur_ep_true_ret) 108 | ep_lens.append(cur_ep_len) 109 | cur_ep_ret = 0 110 | cur_ep_true_ret = 0 111 | cur_ep_len = 0 112 | if not isinstance(env, VecEnv): 113 | observation = env.reset() 114 | step += 1 115 | 116 | 117 | def add_vtarg_and_adv(seg, gamma, lam): 118 | """ 119 | Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) 120 | 121 | :param seg: (dict) the current segment of the trajectory (see traj_segment_generator return for more information) 122 | :param gamma: (float) Discount factor 123 | :param lam: (float) GAE factor 124 | """ 125 | # last element is only used for last vtarg, but we already zeroed it if last new = 1 126 | new = np.append(seg["dones"], 0) 127 | vpred = np.append(seg["vpred"], seg["nextvpred"]) 128 | rew_len = len(seg["rew"]) 129 | seg["adv"] = gaelam = np.empty(rew_len, 'float32') 130 | rew = seg["rew"] 131 | lastgaelam = 0 132 | for step in reversed(range(rew_len)): 133 | nonterminal = 1 - new[step + 1] 134 | delta = rew[step] + gamma * vpred[step + 1] * nonterminal - vpred[step] 135 | gaelam[step] 
= lastgaelam = delta + gamma * lam * nonterminal * lastgaelam 136 | seg["tdlamret"] = seg["adv"] + seg["vpred"] 137 | 138 | 139 | def flatten_lists(listoflists): 140 | """ 141 | Flatten a python list of lists 142 | 143 | :param listoflists: (list(list)) the list of lists to flatten 144 | :return: (list) the flattened list 145 | """ 146 | return [el for list_ in listoflists for el in list_] 147 | --------------------------------------------------------------------------------
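# --- Editorial addendum (illustrative sketch, not part of the repository) ---
# A minimal example of the GAE(lambda) computation implemented by
# add_vtarg_and_adv above, assuming a toy segment with the layout produced by
# traj_segment_generator ("rew", "vpred", "dones", "nextvpred"):
#
#     delta_t = r_t + gamma * V_{t+1} * nonterminal_{t+1} - V_t
#     adv_t   = delta_t + gamma * lam * nonterminal_{t+1} * adv_{t+1}
#     tdlamret = adv + vpred
#
# where nonterminal_{t+1} = 1 - dones[t+1] (with a trailing 0 appended).
import numpy as np

from stable_baselines.trpo_mpi.utils import add_vtarg_and_adv

toy_seg = {
    "rew": np.array([1.0, 1.0, 1.0], dtype="float32"),    # rewards r_t
    "vpred": np.array([0.5, 0.6, 0.7], dtype="float32"),  # value estimates V_t
    "dones": np.array([0, 0, 1], dtype="int32"),          # episode boundaries
    "nextvpred": 0.0,  # value after the segment; zero here since the last step is terminal
}
add_vtarg_and_adv(toy_seg, gamma=0.99, lam=0.95)
print(toy_seg["adv"])       # GAE advantages, shape (3,)
print(toy_seg["tdlamret"])  # TD(lambda) value targets (adv + vpred)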