├── .DS_Store ├── .gitignore ├── 1_sb_ppo_agent.py ├── 2_supermario_dqn.ipynb ├── 4_pong_dqn (1).ipynb ├── README.md └── stable_baselines ├── __init__.py ├── a2c ├── __init__.py ├── a2c.py ├── run_atari.py └── utils.py ├── acer ├── __init__.py ├── acer_simple.py ├── buffer.py └── run_atari.py ├── acktr ├── __init__.py ├── acktr_cont.py ├── acktr_disc.py ├── kfac.py ├── kfac_utils.py ├── policies.py ├── run_atari.py ├── run_mujoco.py ├── utils.py └── value_functions.py ├── bench ├── __init__.py ├── benchmarks.py └── monitor.py ├── common ├── __init__.py ├── atari_wrappers.py ├── base_class.py ├── cg.py ├── cmd_util.py ├── console_util.py ├── dataset.py ├── distributions.py ├── filters.py ├── identity_env.py ├── input.py ├── math_util.py ├── misc_util.py ├── mpi_adam.py ├── mpi_fork.py ├── mpi_moments.py ├── mpi_running_mean_std.py ├── policies.py ├── runners.py ├── running_mean_std.py ├── running_stat.py ├── schedules.py ├── segment_tree.py ├── tf_util.py ├── tile_images.py └── vec_env │ ├── __init__.py │ ├── base_vec_env.py │ ├── dummy_vec_env.py │ ├── subproc_vec_env.py │ ├── vec_frame_stack.py │ └── vec_normalize.py ├── ddpg ├── __init__.py ├── ddpg.py ├── main.py ├── memory.py ├── noise.py └── policies.py ├── deepq ├── __init__.py ├── build_graph.py ├── dqn.py ├── dqn_10_sb_dqn_supermari_decay_resolution_exploration.py ├── dqn_12_sb_dqn_supermari_decay_resolution_exploration_step_10.py ├── dqn_13_sb_dqn_supermari_decay_resolution_exploration_step_20_exploration_term_5000.py ├── dqn_14_resoultion_network.py ├── dqn_9_sb_dqn_supermari_resolution_exploration.py ├── experiments │ ├── __init__.py │ ├── custom_cartpole.py │ ├── enjoy_cartpole.py │ ├── enjoy_mountaincar.py │ ├── enjoy_pong.py │ ├── run_atari.py │ ├── train_cartpole.py │ └── train_mountaincar.py ├── policies.py ├── replay_buffer.py └── utils.py ├── gail ├── __init__.py ├── adversary.py ├── behavior_clone.py ├── dataset │ ├── __init__.py │ └── mujocodset.py ├── gail_eval.py ├── mlp_policy.py ├── model.py ├── result │ ├── HalfCheetah-normalized-deterministic-scores.png │ ├── HalfCheetah-normalized-stochastic-scores.png │ ├── HalfCheetah-unnormalized-deterministic-scores.png │ ├── HalfCheetah-unnormalized-stochastic-scores.png │ ├── Hopper-normalized-deterministic-scores.png │ ├── Hopper-normalized-stochastic-scores.png │ ├── Hopper-unnormalized-deterministic-scores.png │ ├── Hopper-unnormalized-stochastic-scores.png │ ├── Humanoid-normalized-deterministic-scores.png │ ├── Humanoid-normalized-stochastic-scores.png │ ├── Humanoid-unnormalized-deterministic-scores.png │ ├── Humanoid-unnormalized-stochastic-scores.png │ ├── HumanoidStandup-normalized-deterministic-scores.png │ ├── HumanoidStandup-normalized-stochastic-scores.png │ ├── HumanoidStandup-unnormalized-deterministic-scores.png │ ├── HumanoidStandup-unnormalized-stochastic-scores.png │ ├── Walker2d-normalized-deterministic-scores.png │ ├── Walker2d-normalized-stochastic-scores.png │ ├── Walker2d-unnormalized-deterministic-scores.png │ ├── Walker2d-unnormalized-stochastic-scores.png │ ├── gail-result.md │ ├── halfcheetah-training.png │ ├── hopper-training.png │ ├── humanoid-training.png │ ├── humanoidstandup-training.png │ └── walker2d-training.png ├── run_mujoco.py └── statistics.py ├── her ├── __init__.py ├── actor_critic.py ├── ddpg.py ├── experiment │ ├── __init__.py │ ├── config.py │ ├── play.py │ ├── plot.py │ └── train.py ├── her.py ├── normalizer.py ├── replay_buffer.py ├── rollout.py └── util.py ├── logger.py ├── ppo1 ├── __init__.py ├── experiments │ └── 
train_cartpole.py ├── mlp_policy.py ├── pposgd_simple.py ├── run_atari.py ├── run_humanoid.py ├── run_mujoco.py └── run_robotics.py ├── ppo2 ├── __init__.py ├── ppo2.py ├── ppo2_2_sb_ppo_action_resolution.py ├── ppo2_3_sb_ppo_resolution_exploration_decay.py ├── run_atari.py └── run_mujoco.py ├── results_plotter.py └── trpo_mpi ├── __init__.py ├── run_atari.py ├── run_mujoco.py ├── trpo_mpi.py └── utils.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/.DS_Store -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.gitignore.io/api/python 2 | 3 | ### Python ### 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .nox/ 45 | .coverage 46 | .coverage.* 47 | .cache 48 | nosetests.xml 49 | coverage.xml 50 | *.cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | 63 | # Flask stuff: 64 | instance/ 65 | .webassets-cache 66 | 67 | # Scrapy stuff: 68 | .scrapy 69 | 70 | # Sphinx documentation 71 | docs/_build/ 72 | 73 | # PyBuilder 74 | target/ 75 | 76 | # Jupyter Notebook 77 | .ipynb_checkpoints 78 | 79 | # IPython 80 | profile_default/ 81 | ipython_config.py 82 | 83 | # pyenv 84 | .python-version 85 | 86 | # celery beat schedule file 87 | celerybeat-schedule 88 | 89 | # SageMath parsed files 90 | *.sage.py 91 | 92 | # Environments 93 | .env 94 | .venv 95 | env/ 96 | venv/ 97 | ENV/ 98 | env.bak/ 99 | venv.bak/ 100 | 101 | # Spyder project settings 102 | .spyderproject 103 | .spyproject 104 | 105 | # Rope project settings 106 | .ropeproject 107 | 108 | # mkdocs documentation 109 | /site 110 | 111 | # mypy 112 | .mypy_cache/ 113 | .dmypy.json 114 | dmypy.json 115 | 116 | # Pyre type checker 117 | .pyre/ 118 | 119 | ### Python Patch ### 120 | .venv/ 121 | 122 | ### Python.VirtualEnv Stack ### 123 | # Virtualenv 124 | # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/ 125 | [Bb]in 126 | [Ii]nclude 127 | [Ll]ib 128 | [Ll]ib64 129 | [Ll]ocal 130 | [Ss]cripts 131 | pyvenv.cfg 132 | pip-selfcheck.json 133 | 134 | 135 | # End of https://www.gitignore.io/api/python 136 | -------------------------------------------------------------------------------- /1_sb_ppo_agent.py: -------------------------------------------------------------------------------- 1 | #import retro 2 | 3 | import gym_super_mario_bros 4 | import logging 5 | import gym 6 | import gym_super_mario_bros 7 | import numpy as np 8 | import sys 9 | from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv 10 | import random 11 | from 
stable_baselines.ppo2.ppo2 import PPO2 12 | 13 | from stable_baselines import PPO2 14 | from stable_baselines.common.policies import CnnPolicy 15 | from stable_baselines.common.vec_env import DummyVecEnv 16 | 17 | movements = [ 18 | ['NOP'], 19 | ['A'], 20 | ['B'], 21 | ['right'], 22 | ['right', 'A'], 23 | ['right', 'B'], 24 | ['right', 'A', 'B'], 25 | ['left'], 26 | ['left', 'A'], 27 | ['left', 'B'], 28 | ['left', 'A', 'B'], 29 | # ['down'], 30 | # ['up'] 31 | ] 32 | 33 | 34 | _env = gym_super_mario_bros.make('SuperMarioBros-v0') 35 | #_env = gym_super_mario_bros.SuperMarioBrosEnv(frames_per_step=1, rom_mode='rectangle') 36 | env = BinarySpaceToDiscreteSpaceEnv(_env, movements) 37 | env = DummyVecEnv([lambda: env]) 38 | model = PPO2(policy=CnnPolicy, env=env, verbose=1) 39 | model.learn(total_timesteps=10000) 40 | 41 | obs = env.reset() 42 | 43 | while True: 44 | action, _info = model.predict(obs) 45 | 46 | obs, rewards, dones, info = env.step(action) 47 | print("Training finished") 48 | print(rewards) 49 | env.render() 50 | -------------------------------------------------------------------------------- /stable_baselines/__init__.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | from stable_baselines.a2c import A2C 5 | from stable_baselines.acer import ACER 6 | from stable_baselines.acktr import ACKTR 7 | from stable_baselines.ddpg import DDPG 8 | from stable_baselines.deepq import DQN 9 | from stable_baselines.gail import GAIL 10 | from stable_baselines.ppo1 import PPO1 11 | from stable_baselines.ppo2 import PPO2 12 | from stable_baselines.trpo_mpi import TRPO 13 | 14 | __version__ = "2.1.1.a0" 15 | 16 | 17 | # patch Gym spaces to add equality functions, if not implemented 18 | # See https://github.com/openai/gym/issues/1171 19 | if gym.spaces.MultiBinary.__eq__ == object.__eq__: # by default, all classes have the __eq__ function from object. 20 | def _eq(self, other): 21 | return self.n == other.n 22 | 23 | gym.spaces.MultiBinary.__eq__ = _eq 24 | 25 | if gym.spaces.MultiDiscrete.__eq__ == object.__eq__: 26 | def _eq(self, other): 27 | return np.all(self.nvec == other.nvec) 28 | 29 | gym.spaces.MultiDiscrete.__eq__ = _eq 30 | -------------------------------------------------------------------------------- /stable_baselines/a2c/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.a2c.a2c import A2C 2 | -------------------------------------------------------------------------------- /stable_baselines/a2c/run_atari.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from stable_baselines import logger 4 | from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser 5 | from stable_baselines.common.vec_env.vec_frame_stack import VecFrameStack 6 | from stable_baselines.a2c import A2C 7 | from stable_baselines.common.policies import CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy 8 | 9 | 10 | def train(env_id, num_timesteps, seed, policy, lr_schedule, num_env): 11 | """ 12 | Train A2C model for atari environment, for testing purposes 13 | 14 | :param env_id: (str) Environment ID 15 | :param num_timesteps: (int) The total number of samples 16 | :param seed: (int) The initial seed for training 17 | :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...)
18 | :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', 19 | 'double_linear_con', 'middle_drop' or 'double_middle_drop') 20 | :param num_env: (int) The number of environments 21 | """ 22 | policy_fn = None 23 | if policy == 'cnn': 24 | policy_fn = CnnPolicy 25 | elif policy == 'lstm': 26 | policy_fn = CnnLstmPolicy 27 | elif policy == 'lnlstm': 28 | policy_fn = CnnLnLstmPolicy 29 | if policy_fn is None: 30 | raise ValueError("Error: policy {} not implemented".format(policy)) 31 | 32 | env = VecFrameStack(make_atari_env(env_id, num_env, seed), 4) 33 | 34 | model = A2C(policy_fn, env, lr_schedule=lr_schedule) 35 | model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed) 36 | env.close() 37 | 38 | 39 | def main(): 40 | """ 41 | Runs the test 42 | """ 43 | parser = atari_arg_parser() 44 | parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn', help='Policy architecture') 45 | parser.add_argument('--lr_schedule', choices=['constant', 'linear'], default='constant', 46 | help='Learning rate schedule') 47 | args = parser.parse_args() 48 | logger.configure() 49 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, policy=args.policy, lr_schedule=args.lr_schedule, 50 | num_env=16) 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /stable_baselines/acer/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.acer.acer_simple import ACER 2 | -------------------------------------------------------------------------------- /stable_baselines/acer/run_atari.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from stable_baselines import logger 4 | from stable_baselines.acer import ACER 5 | from stable_baselines.common.policies import CnnPolicy, CnnLstmPolicy 6 | from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser 7 | from stable_baselines.common.vec_env import VecFrameStack 8 | 9 | 10 | def train(env_id, num_timesteps, seed, policy, lr_schedule, num_cpu): 11 | """ 12 | train an ACER model on atari 13 | 14 | :param env_id: (str) Environment ID 15 | :param num_timesteps: (int) The total number of samples 16 | :param seed: (int) The initial seed for training 17 | :param policy: (A2CPolicy) The policy model to use (MLP, CNN, LSTM, ...) 
18 | :param lr_schedule: (str) The type of scheduler for the learning rate update ('linear', 'constant', 19 | 'double_linear_con', 'middle_drop' or 'double_middle_drop') 20 | :param num_cpu: (int) The number of cpu to train on 21 | """ 22 | env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) 23 | if policy == 'cnn': 24 | policy_fn = CnnPolicy 25 | elif policy == 'lstm': 26 | policy_fn = CnnLstmPolicy 27 | else: 28 | print("Policy {} not implemented".format(policy)) 29 | return 30 | 31 | model = ACER(policy_fn, env, lr_schedule=lr_schedule, buffer_size=5000) 32 | model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed) 33 | env.close() 34 | 35 | 36 | def main(): 37 | """ 38 | Runs the test 39 | """ 40 | parser = atari_arg_parser() 41 | parser.add_argument('--policy', choices=['cnn', 'lstm', 'lnlstm'], default='cnn', help='Policy architecture') 42 | parser.add_argument('--lr_schedule', choices=['constant', 'linear'], default='constant', 43 | help='Learning rate schedule') 44 | parser.add_argument('--logdir', help='Directory for logging') 45 | args = parser.parse_args() 46 | logger.configure(args.logdir) 47 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, 48 | policy=args.policy, lr_schedule=args.lr_schedule, num_cpu=16) 49 | 50 | 51 | if __name__ == '__main__': 52 | main() 53 | -------------------------------------------------------------------------------- /stable_baselines/acktr/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.acktr.acktr_disc import ACKTR 2 | -------------------------------------------------------------------------------- /stable_baselines/acktr/kfac_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def gmatmul(tensor_a, tensor_b, transpose_a=False, transpose_b=False, reduce_dim=None): 5 | """ 6 | Do a matrix multiplication with tensor 'a' and 'b', even when their shape do not match 7 | 8 | :param tensor_a: (TensorFlow Tensor) 9 | :param tensor_b: (TensorFlow Tensor) 10 | :param transpose_a: (bool) If 'a' needs transposing 11 | :param transpose_b: (bool) If 'b' needs transposing 12 | :param reduce_dim: (int) the multiplication over the dim 13 | :return: (TensorFlow Tensor) a * b 14 | """ 15 | assert reduce_dim is not None 16 | 17 | # weird batch matmul 18 | if len(tensor_a.get_shape()) == 2 and len(tensor_b.get_shape()) > 2: 19 | # reshape reduce_dim to the left most dim in b 20 | b_shape = tensor_b.get_shape() 21 | if reduce_dim != 0: 22 | b_dims = list(range(len(b_shape))) 23 | b_dims.remove(reduce_dim) 24 | b_dims.insert(0, reduce_dim) 25 | tensor_b = tf.transpose(tensor_b, b_dims) 26 | b_t_shape = tensor_b.get_shape() 27 | tensor_b = tf.reshape(tensor_b, [int(b_shape[reduce_dim]), -1]) 28 | result = tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, 29 | transpose_b=transpose_b) 30 | result = tf.reshape(result, b_t_shape) 31 | if reduce_dim != 0: 32 | b_dims = list(range(len(b_shape))) 33 | b_dims.remove(0) 34 | b_dims.insert(reduce_dim, 0) 35 | result = tf.transpose(result, b_dims) 36 | return result 37 | 38 | elif len(tensor_a.get_shape()) > 2 and len(tensor_b.get_shape()) == 2: 39 | # reshape reduce_dim to the right most dim in a 40 | a_shape = tensor_a.get_shape() 41 | outter_dim = len(a_shape) - 1 42 | reduce_dim = len(a_shape) - reduce_dim - 1 43 | if reduce_dim != outter_dim: 44 | a_dims = list(range(len(a_shape))) 45 | a_dims.remove(reduce_dim) 46 | 
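# Note on the branch being built here (editorial comment, hedged): the permutation assembled
# around this point moves the contracted axis of `tensor_a` to the last position; the code that
# follows then flattens `tensor_a` to 2-D, performs an ordinary matmul with the 2-D `tensor_b`,
# reshapes the result back, and restores the original axis order. This appears to rely on
# `tensor_b` being square along the contracted dimension (true for the K-FAC factor matrices
# this helper is used with), so reshaping back to the transposed shape of `tensor_a` is valid.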
a_dims.insert(outter_dim, reduce_dim) 47 | tensor_a = tf.transpose(tensor_a, a_dims) 48 | a_t_shape = tensor_a.get_shape() 49 | tensor_a = tf.reshape(tensor_a, [-1, int(a_shape[reduce_dim])]) 50 | result = tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, 51 | transpose_b=transpose_b) 52 | result = tf.reshape(result, a_t_shape) 53 | if reduce_dim != outter_dim: 54 | a_dims = list(range(len(a_shape))) 55 | a_dims.remove(outter_dim) 56 | a_dims.insert(reduce_dim, outter_dim) 57 | result = tf.transpose(result, a_dims) 58 | return result 59 | 60 | elif len(tensor_a.get_shape()) == 2 and len(tensor_b.get_shape()) == 2: 61 | return tf.matmul(tensor_a, tensor_b, transpose_a=transpose_a, transpose_b=transpose_b) 62 | 63 | assert False, 'something went wrong' 64 | 65 | 66 | def clipout_neg(vec, threshold=1e-6): 67 | """ 68 | clip to 0 if input lower than threshold value 69 | 70 | :param vec: (TensorFlow Tensor) 71 | :param threshold: (float) the cutoff threshold 72 | :return: (TensorFlow Tensor) clipped input 73 | """ 74 | mask = tf.cast(vec > threshold, tf.float32) 75 | return mask * vec 76 | 77 | 78 | def detect_min_val(input_mat, var, threshold=1e-6, name='', debug=False): 79 | """ 80 | If debug is not set, will run clipout_neg. Else, will clip and print out odd eigen values 81 | 82 | :param input_mat: (TensorFlow Tensor) 83 | :param var: (TensorFlow Tensor) variable 84 | :param threshold: (float) the cutoff threshold 85 | :param name: (str) the name of the variable 86 | :param debug: (bool) debug function 87 | :return: (TensorFlow Tensor) clipped tensor 88 | """ 89 | eigen_min = tf.reduce_min(input_mat) 90 | eigen_max = tf.reduce_max(input_mat) 91 | eigen_ratio = eigen_max / eigen_min 92 | input_mat_clipped = clipout_neg(input_mat, threshold) 93 | 94 | if debug: 95 | input_mat_clipped = tf.cond(tf.logical_or(tf.greater(eigen_ratio, 0.), tf.less(eigen_ratio, -500)), 96 | lambda: input_mat_clipped, lambda: tf.Print( 97 | input_mat_clipped, 98 | [tf.convert_to_tensor('odd ratio ' + name + ' eigen values!!!'), tf.convert_to_tensor(var.name), 99 | eigen_min, eigen_max, eigen_ratio])) 100 | 101 | return input_mat_clipped 102 | 103 | 104 | def factor_reshape(eigen_vectors, eigen_values, grad, fac_idx=0, f_type='act'): 105 | """ 106 | factor and reshape input eigen values 107 | 108 | :param eigen_vectors: ([TensorFlow Tensor]) eigen vectors 109 | :param eigen_values: ([TensorFlow Tensor]) eigen values 110 | :param grad: ([TensorFlow Tensor]) gradient 111 | :param fac_idx: (int) index that should be factored 112 | :param f_type: (str) function type to factor and reshape 113 | :return: ([TensorFlow Tensor], [TensorFlow Tensor]) factored and reshaped eigen vectors 114 | and eigen values 115 | """ 116 | grad_shape = grad.get_shape() 117 | if f_type == 'act': 118 | assert eigen_values.get_shape()[0] == grad_shape[fac_idx] 119 | expanded_shape = [1, ] * len(grad_shape) 120 | expanded_shape[fac_idx] = -1 121 | eigen_values = tf.reshape(eigen_values, expanded_shape) 122 | if f_type == 'grad': 123 | assert eigen_values.get_shape()[0] == grad_shape[len(grad_shape) - fac_idx - 1] 124 | expanded_shape = [1, ] * len(grad_shape) 125 | expanded_shape[len(grad_shape) - fac_idx - 1] = -1 126 | eigen_values = tf.reshape(eigen_values, expanded_shape) 127 | 128 | return eigen_vectors, eigen_values 129 | -------------------------------------------------------------------------------- /stable_baselines/acktr/policies.py: -------------------------------------------------------------------------------- 1 | import numpy 
as np 2 | import tensorflow as tf 3 | 4 | from stable_baselines.acktr.utils import dense, kl_div 5 | import stable_baselines.common.tf_util as tf_util 6 | 7 | 8 | class GaussianMlpPolicy(object): 9 | def __init__(self, ob_dim, ac_dim): 10 | """ 11 | Create a gaussian MLP policy 12 | 13 | :param ob_dim: (int) Observation dimention 14 | :param ac_dim: (int) action dimention 15 | """ 16 | # Here we'll construct a bunch of expressions, which will be used in two places: 17 | # (1) When sampling actions 18 | # (2) When computing loss functions, for the policy update 19 | # Variables specific to (1) have the word "sampled" in them, 20 | # whereas variables specific to (2) have the word "old" in them 21 | ob_no = tf.placeholder(tf.float32, shape=[None, ob_dim * 2], name="ob") # batch of observations 22 | oldac_na = tf.placeholder(tf.float32, shape=[None, ac_dim], name="ac") # batch of actions previous actions 23 | # batch of actions previous action distributions 24 | oldac_dist = tf.placeholder(tf.float32, shape=[None, ac_dim * 2], name="oldac_dist") 25 | adv_n = tf.placeholder(tf.float32, shape=[None], name="adv") # advantage function estimate 26 | wd_dict = {} 27 | layer_1 = tf.nn.tanh(dense(ob_no, 64, "h1", 28 | weight_init=tf_util.normc_initializer(1.0), 29 | bias_init=0.0, weight_loss_dict=wd_dict)) 30 | layer_2 = tf.nn.tanh(dense(layer_1, 64, "h2", 31 | weight_init=tf_util.normc_initializer(1.0), 32 | bias_init=0.0, weight_loss_dict=wd_dict)) 33 | mean_na = dense(layer_2, ac_dim, "mean", weight_init=tf_util.normc_initializer(0.1), 34 | bias_init=0.0, weight_loss_dict=wd_dict) # Mean control output 35 | self.wd_dict = wd_dict 36 | # Variance on outputs 37 | self.logstd_1a = logstd_1a = tf.get_variable("logstd", [ac_dim], tf.float32, tf.zeros_initializer()) 38 | logstd_1a = tf.expand_dims(logstd_1a, 0) 39 | std_1a = tf.exp(logstd_1a) 40 | std_na = tf.tile(std_1a, [tf.shape(mean_na)[0], 1]) 41 | ac_dist = tf.concat([tf.reshape(mean_na, [-1, ac_dim]), tf.reshape(std_na, [-1, ac_dim])], 1) 42 | # This is the sampled action we'll perform. 
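# Sampling below uses the reparameterization a = mean + std * eps with eps ~ N(0, I):
# ac_dist[:, :ac_dim] holds the mean and ac_dist[:, ac_dim:] the standard deviation.
# The log-probabilities that follow are the diagonal-Gaussian log-density
#   log p(a) = - sum_i log(std_i) - (ac_dim / 2) * log(2 * pi)
#              - 0.5 * sum_i ((a_i - mean_i) / std_i) ** 2
# evaluated once for the freshly sampled action and once for the previous actions.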
43 | sampled_ac_na = tf.random_normal(tf.shape(ac_dist[:, ac_dim:])) * ac_dist[:, ac_dim:] + ac_dist[:, :ac_dim] 44 | logprobsampled_n = - tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log( 45 | 2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum( 46 | tf.square(ac_dist[:, :ac_dim] - sampled_ac_na) / (tf.square(ac_dist[:, ac_dim:])), 47 | axis=1) # Logprob of sampled action 48 | logprob_n = - tf.reduce_sum(tf.log(ac_dist[:, ac_dim:]), axis=1) - 0.5 * tf.log( 49 | 2.0 * np.pi) * ac_dim - 0.5 * tf.reduce_sum( 50 | tf.square(ac_dist[:, :ac_dim] - oldac_na) / (tf.square(ac_dist[:, ac_dim:])), 51 | axis=1) # Logprob of previous actions under CURRENT policy (whereas oldlogprob_n is under OLD policy) 52 | kl_loss = tf.reduce_mean(kl_div(oldac_dist, ac_dist, ac_dim)) 53 | # kl = .5 * tf.reduce_mean(tf.square(logprob_n - oldlogprob_n)) 54 | # Approximation of KL divergence between old policy used to generate actions, 55 | # and new policy used to compute logprob_n 56 | surr = - tf.reduce_mean(adv_n * logprob_n) # Loss function that we'll differentiate to get the policy gradient 57 | surr_sampled = - tf.reduce_mean(logprob_n) # Sampled loss of the policy 58 | # Generate a new action and its logprob 59 | self._act = tf_util.function([ob_no], [sampled_ac_na, ac_dist, logprobsampled_n]) 60 | # self.compute_kl = U.function([ob_no, oldac_na, oldlogprob_n], kl) 61 | # Compute (approximate) KL divergence between old policy and new policy 62 | self.compute_kl = tf_util.function([ob_no, oldac_dist], kl_loss) 63 | # Input and output variables needed for computing loss 64 | self.update_info = ((ob_no, oldac_na, adv_n), surr, surr_sampled) 65 | tf_util.initialize() # Initialize uninitialized TF variables 66 | 67 | def act(self, obs): 68 | """ 69 | get the action from an observation 70 | 71 | :param obs: ([float]) observation 72 | :return: ([float], [float], [float]) action, action_proba, logp 73 | """ 74 | action, ac_dist, logp = self._act(obs[None]) 75 | return action[0], ac_dist[0], logp[0] 76 | -------------------------------------------------------------------------------- /stable_baselines/acktr/run_atari.py: -------------------------------------------------------------------------------- 1 | from stable_baselines import logger 2 | from stable_baselines.acktr import ACKTR 3 | from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser 4 | from stable_baselines.common.vec_env.vec_frame_stack import VecFrameStack 5 | from stable_baselines.common.policies import CnnPolicy 6 | 7 | 8 | def train(env_id, num_timesteps, seed, num_cpu): 9 | """ 10 | train an ACKTR model on atari 11 | 12 | :param env_id: (str) Environment ID 13 | :param num_timesteps: (int) The total number of samples 14 | :param seed: (int) The initial seed for training 15 | :param num_cpu: (int) The number of cpu to train on 16 | """ 17 | env = VecFrameStack(make_atari_env(env_id, num_cpu, seed), 4) 18 | model = ACKTR(CnnPolicy, env, nprocs=num_cpu) 19 | model.learn(total_timesteps=int(num_timesteps * 1.1), seed=seed) 20 | env.close() 21 | 22 | 23 | def main(): 24 | """ 25 | Runs the test 26 | """ 27 | args = atari_arg_parser().parse_args() 28 | logger.configure() 29 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, num_cpu=32) 30 | 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /stable_baselines/acktr/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python3 2 | 3 | import tensorflow as tf 4 | 5 | from stable_baselines import logger 6 | from stable_baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 7 | from stable_baselines.acktr.acktr_cont import learn 8 | from stable_baselines.acktr.policies import GaussianMlpPolicy 9 | from stable_baselines.acktr.value_functions import NeuralNetValueFunction 10 | 11 | 12 | def train(env_id, num_timesteps, seed): 13 | """ 14 | train an ACKTR model on atari 15 | 16 | :param env_id: (str) Environment ID 17 | :param num_timesteps: (int) The total number of samples 18 | :param seed: (int) The initial seed for training 19 | """ 20 | env = make_mujoco_env(env_id, seed) 21 | 22 | with tf.Session(config=tf.ConfigProto()): 23 | ob_dim = env.observation_space.shape[0] 24 | ac_dim = env.action_space.shape[0] 25 | with tf.variable_scope("vf"): 26 | value_fn = NeuralNetValueFunction(ob_dim, ac_dim) 27 | with tf.variable_scope("pi"): 28 | policy = GaussianMlpPolicy(ob_dim, ac_dim) 29 | 30 | learn(env, policy=policy, value_fn=value_fn, gamma=0.99, lam=0.97, timesteps_per_batch=2500, desired_kl=0.002, 31 | num_timesteps=num_timesteps, animate=False) 32 | 33 | env.close() 34 | 35 | 36 | def main(): 37 | """ 38 | Runs the test 39 | """ 40 | args = mujoco_arg_parser().parse_args() 41 | logger.configure() 42 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 43 | 44 | 45 | if __name__ == "__main__": 46 | main() 47 | -------------------------------------------------------------------------------- /stable_baselines/acktr/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | def dense(input_tensor, size, name, weight_init=None, bias_init=0, weight_loss_dict=None, reuse=None): 5 | """ 6 | A dense Layer 7 | 8 | :param input_tensor: ([TensorFlow Tensor]) input 9 | :param size: (int) number of hidden neurons 10 | :param name: (str) layer name 11 | :param weight_init: (function or int or float) initialize the weight 12 | :param bias_init: (function or int or float) initialize the weight 13 | :param weight_loss_dict: (dict) store the weight loss if not None 14 | :param reuse: (bool) if can be reused 15 | :return: ([TensorFlow Tensor]) the output of the dense Layer 16 | """ 17 | with tf.variable_scope(name, reuse=reuse): 18 | assert len(tf.get_variable_scope().name.split('/')) == 2 19 | 20 | weight = tf.get_variable("w", [input_tensor.get_shape()[1], size], initializer=weight_init) 21 | bias = tf.get_variable("b", [size], initializer=tf.constant_initializer(bias_init)) 22 | weight_decay_fc = 3e-4 23 | 24 | if weight_loss_dict is not None: 25 | weight_decay = tf.multiply(tf.nn.l2_loss(weight), weight_decay_fc, name='weight_decay_loss') 26 | weight_loss_dict[weight] = weight_decay_fc 27 | weight_loss_dict[bias] = 0.0 28 | 29 | tf.add_to_collection(tf.get_variable_scope().name.split('/')[0] + '_' + 'losses', weight_decay) 30 | 31 | return tf.nn.bias_add(tf.matmul(input_tensor, weight), bias) 32 | 33 | 34 | def kl_div(action_dist1, action_dist2, action_size): 35 | """ 36 | Kullback leiber divergence 37 | 38 | :param action_dist1: ([TensorFlow Tensor]) action distribution 1 39 | :param action_dist2: ([TensorFlow Tensor]) action distribution 2 40 | :param action_size: (int) the shape of an action 41 | :return: (float) Kullback leiber divergence 42 | """ 43 | mean1, std1 = action_dist1[:, :action_size], action_dist1[:, action_size:] 44 | mean2, std2 = action_dist2[:, :action_size], action_dist2[:, action_size:] 45 | 46 | 
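# The lines below implement the closed-form KL divergence between two diagonal Gaussians
# N(mean1, std1^2) and N(mean2, std2^2), summed over the action dimensions:
#   KL = sum_i [ ((mean1_i - mean2_i)^2 + std1_i^2 - std2_i^2) / (2 * std2_i^2)
#                + log(std2_i) - log(std1_i) ]
# which is algebraically the textbook form
#   sum_i [ (std1_i^2 + (mean1_i - mean2_i)^2) / (2 * std2_i^2) - 1/2 + log(std2_i / std1_i) ];
# the 1e-8 term only guards against division by zero.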
numerator = tf.square(mean1 - mean2) + tf.square(std1) - tf.square(std2) 47 | denominator = 2 * tf.square(std2) + 1e-8 48 | return tf.reduce_sum( 49 | numerator / denominator + tf.log(std2) - tf.log(std1), reduction_indices=-1) 50 | -------------------------------------------------------------------------------- /stable_baselines/acktr/value_functions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from stable_baselines import logger 5 | import stable_baselines.common as common 6 | from stable_baselines.common import tf_util 7 | from stable_baselines.acktr import kfac 8 | from stable_baselines.acktr.utils import dense 9 | 10 | 11 | class NeuralNetValueFunction(object): 12 | def __init__(self, ob_dim, ac_dim, verbose=1): 13 | """ 14 | Create an MLP policy for a value function 15 | 16 | :param ob_dim: (int) Observation dimention 17 | :param ac_dim: (int) action dimention 18 | :param verbose: (int) verbosity level 19 | """ 20 | obs_ph = tf.placeholder(tf.float32, shape=[None, ob_dim * 2 + ac_dim * 2 + 2]) # batch of observations 21 | vtarg_n = tf.placeholder(tf.float32, shape=[None], name='vtarg') 22 | wd_dict = {} 23 | layer_1 = tf.nn.elu(dense(obs_ph, 64, "h1", 24 | weight_init=tf_util.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) 25 | layer_2 = tf.nn.elu(dense(layer_1, 64, "h2", 26 | weight_init=tf_util.normc_initializer(1.0), bias_init=0, weight_loss_dict=wd_dict)) 27 | vpred_n = dense(layer_2, 1, "hfinal", 28 | weight_init=tf_util.normc_initializer(1.0), bias_init=0, 29 | weight_loss_dict=wd_dict)[:, 0] 30 | sample_vpred_n = vpred_n + tf.random_normal(tf.shape(vpred_n)) 31 | wd_loss = tf.get_collection("vf_losses", None) 32 | loss = tf.reduce_mean(tf.square(vpred_n - vtarg_n)) + tf.add_n(wd_loss) 33 | loss_sampled = tf.reduce_mean(tf.square(vpred_n - tf.stop_gradient(sample_vpred_n))) 34 | 35 | self._predict = tf_util.function([obs_ph], vpred_n) 36 | 37 | optim = kfac.KfacOptimizer(learning_rate=0.001, cold_lr=0.001 * (1 - 0.9), momentum=0.9, 38 | clip_kl=0.3, epsilon=0.1, stats_decay=0.95, 39 | async_eigen_decomp=True, kfac_update=2, cold_iter=50, 40 | weight_decay_dict=wd_dict, max_grad_norm=None, verbose=verbose) 41 | vf_var_list = [] 42 | for var in tf.trainable_variables(): 43 | if "vf" in var.name: 44 | vf_var_list.append(var) 45 | 46 | update_op, self.q_runner = optim.minimize(loss, loss_sampled, var_list=vf_var_list) 47 | self.do_update = tf_util.function([obs_ph, vtarg_n], update_op) # pylint: disable=E1101 48 | tf_util.initialize() # Initialize uninitialized TF variables 49 | 50 | @classmethod 51 | def _preproc(cls, path): 52 | """ 53 | preprocess path 54 | 55 | :param path: ({TensorFlow Tensor}) the history of the network 56 | :return: ([TensorFlow Tensor]) processed input 57 | """ 58 | length = path["reward"].shape[0] 59 | # used to be named 'al', unfortunalty we cant seem to know why it was called 'al' or what it means. 60 | # Feel free to fix it if you know what is meant here. 61 | # Could mean 'array_length', but even then we are not sure how this array is useful for the network. 
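# Whatever the original name meant, the column built below behaves as a normalized timestep
# feature (t / 10). Per row, the concatenation returned by _preproc is therefore
#   [ observation (ob_dim * 2) | action-distribution mean and std (ac_dim * 2)
#     | timestep / 10 (1 column) | constant bias of ones (1 column) ]
# which appears to match the obs_ph placeholder width of ob_dim * 2 + ac_dim * 2 + 2
# declared in __init__ above.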
62 | al_capone = np.arange(length).reshape(-1, 1) / 10.0 63 | act = path["action_dist"].astype('float32') 64 | return np.concatenate([path['observation'], act, al_capone, np.ones((length, 1))], axis=1) 65 | 66 | def predict(self, path): 67 | """ 68 | predict value from history 69 | 70 | :param path: ({TensorFlow Tensor}) the history of the network 71 | :return: ([TensorFlow Tensor]) value function output 72 | """ 73 | return self._predict(self._preproc(path)) 74 | 75 | def fit(self, paths, targvals): 76 | """ 77 | fit paths to target values 78 | 79 | :param paths: ({TensorFlow Tensor}) the history of the network 80 | :param targvals: ([TensorFlow Tensor]) the expected value 81 | """ 82 | _input = np.concatenate([self._preproc(p) for p in paths]) 83 | targets = np.concatenate(targvals) 84 | logger.record_tabular("EVBefore", common.explained_variance(self._predict(_input), targets)) 85 | for _ in range(25): 86 | self.do_update(_input, targets) 87 | logger.record_tabular("EVAfter", common.explained_variance(self._predict(_input), targets)) 88 | -------------------------------------------------------------------------------- /stable_baselines/bench/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.bench.monitor import Monitor, load_results 2 | -------------------------------------------------------------------------------- /stable_baselines/common/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from stable_baselines.common.console_util import fmt_row, fmt_item, colorize 3 | from stable_baselines.common.dataset import Dataset 4 | from stable_baselines.common.math_util import discount, discount_with_boundaries, explained_variance, \ 5 | explained_variance_2d, flatten_arrays, unflatten_vector 6 | from stable_baselines.common.misc_util import zipsame, unpack, EzPickle, set_global_seeds, pretty_eta, RunningAvg,\ 7 | boolean_flag, get_wrapper_by_name, relatively_safe_pickle_dump, pickle_load 8 | from stable_baselines.common.base_class import BaseRLModel, ActorCriticRLModel, OffPolicyRLModel, SetVerbosity, \ 9 | TensorboardWriter 10 | -------------------------------------------------------------------------------- /stable_baselines/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def conjugate_gradient(f_ax, b_vec, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 5 | """ 6 | conjugate gradient calculation (Ax = b), bases on 7 | https://epubs.siam.org/doi/book/10.1137/1.9781611971446 Demmel p 312 8 | 9 | :param f_ax: (function) The function describing the Matrix A dot the vector x 10 | (x being the input parameter of the function) 11 | :param b_vec: (numpy float) vector b, where Ax = b 12 | :param cg_iters: (int) the maximum number of iterations for converging 13 | :param callback: (function) callback the values of x while converging 14 | :param verbose: (bool) print extra information 15 | :param residual_tol: (float) the break point if the residual is below this value 16 | :return: (numpy float) vector x, where Ax = b 17 | """ 18 | first_basis_vect = b_vec.copy() # the first basis vector 19 | residual = b_vec.copy() # the residual 20 | x_var = np.zeros_like(b_vec) # vector x, where Ax = b 21 | residual_dot_residual = residual.dot(residual) # L2 norm of the residual 22 | 23 | fmt_str = "%10i %10.3g %10.3g" 24 | title_str = "%10s %10s %10s" 25 | if verbose: 26 | 
print(title_str % ("iter", "residual norm", "soln norm")) 27 | 28 | for i in range(cg_iters): 29 | if callback is not None: 30 | callback(x_var) 31 | if verbose: 32 | print(fmt_str % (i, residual_dot_residual, np.linalg.norm(x_var))) 33 | z_var = f_ax(first_basis_vect) 34 | v_var = residual_dot_residual / first_basis_vect.dot(z_var) 35 | x_var += v_var * first_basis_vect 36 | residual -= v_var * z_var 37 | new_residual_dot_residual = residual.dot(residual) 38 | mu_val = new_residual_dot_residual / residual_dot_residual 39 | first_basis_vect = residual + mu_val * first_basis_vect 40 | 41 | residual_dot_residual = new_residual_dot_residual 42 | if residual_dot_residual < residual_tol: 43 | break 44 | 45 | if callback is not None: 46 | callback(x_var) 47 | if verbose: 48 | print(fmt_str % (i + 1, residual_dot_residual, np.linalg.norm(x_var))) 49 | return x_var 50 | -------------------------------------------------------------------------------- /stable_baselines/common/cmd_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for scripts like run_atari.py. 3 | """ 4 | 5 | import os 6 | 7 | from mpi4py import MPI 8 | import gym 9 | from gym.wrappers import FlattenDictWrapper 10 | 11 | from stable_baselines import logger 12 | from stable_baselines.bench import Monitor 13 | from stable_baselines.common import set_global_seeds 14 | from stable_baselines.common.atari_wrappers import make_atari, wrap_deepmind 15 | from stable_baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 16 | 17 | 18 | def make_atari_env(env_id, num_env, seed, wrapper_kwargs=None, start_index=0, allow_early_resets=True): 19 | """ 20 | Create a wrapped, monitored SubprocVecEnv for Atari. 21 | 22 | :param env_id: (str) the environment ID 23 | :param num_env: (int) the number of environment you wish to have in subprocesses 24 | :param seed: (int) the inital seed for RNG 25 | :param wrapper_kwargs: (dict) the parameters for wrap_deepmind function 26 | :param start_index: (int) start rank index 27 | :param allow_early_resets: (bool) allows early reset of the environment 28 | :return: (Gym Environment) The atari environment 29 | """ 30 | if wrapper_kwargs is None: 31 | wrapper_kwargs = {} 32 | 33 | def make_env(rank): 34 | def _thunk(): 35 | env = make_atari(env_id) 36 | env.seed(seed + rank) 37 | env = Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 38 | allow_early_resets=allow_early_resets) 39 | return wrap_deepmind(env, **wrapper_kwargs) 40 | return _thunk 41 | set_global_seeds(seed) 42 | return SubprocVecEnv([make_env(i + start_index) for i in range(num_env)]) 43 | 44 | 45 | def make_mujoco_env(env_id, seed, allow_early_resets=True): 46 | """ 47 | Create a wrapped, monitored gym.Env for MuJoCo. 48 | 49 | :param env_id: (str) the environment ID 50 | :param seed: (int) the inital seed for RNG 51 | :param allow_early_resets: (bool) allows early reset of the environment 52 | :return: (Gym Environment) The mujoco environment 53 | """ 54 | rank = MPI.COMM_WORLD.Get_rank() 55 | set_global_seeds(seed + 10000 * rank) 56 | env = gym.make(env_id) 57 | env = Monitor(env, os.path.join(logger.get_dir(), str(rank)), allow_early_resets=allow_early_resets) 58 | env.seed(seed) 59 | return env 60 | 61 | 62 | def make_robotics_env(env_id, seed, rank=0, allow_early_resets=True): 63 | """ 64 | Create a wrapped, monitored gym.Env for MuJoCo. 
65 | 66 | :param env_id: (str) the environment ID 67 | :param seed: (int) the inital seed for RNG 68 | :param rank: (int) the rank of the environment (for logging) 69 | :param allow_early_resets: (bool) allows early reset of the environment 70 | :return: (Gym Environment) The robotic environment 71 | """ 72 | set_global_seeds(seed) 73 | env = gym.make(env_id) 74 | env = FlattenDictWrapper(env, ['observation', 'desired_goal']) 75 | env = Monitor( 76 | env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 77 | info_keywords=('is_success',), allow_early_resets=allow_early_resets) 78 | env.seed(seed) 79 | return env 80 | 81 | 82 | def arg_parser(): 83 | """ 84 | Create an empty argparse.ArgumentParser. 85 | 86 | :return: (ArgumentParser) 87 | """ 88 | import argparse 89 | return argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 90 | 91 | 92 | def atari_arg_parser(): 93 | """ 94 | Create an argparse.ArgumentParser for run_atari.py. 95 | 96 | :return: (ArgumentParser) parser {'--env': 'BreakoutNoFrameskip-v4', '--seed': 0, '--num-timesteps': int(1e7)} 97 | """ 98 | parser = arg_parser() 99 | parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') 100 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 101 | parser.add_argument('--num-timesteps', type=int, default=int(1e7)) 102 | return parser 103 | 104 | 105 | def mujoco_arg_parser(): 106 | """ 107 | Create an argparse.ArgumentParser for run_mujoco.py. 108 | 109 | :return: (ArgumentParser) parser {'--env': 'Reacher-v2', '--seed': 0, '--num-timesteps': int(1e6), '--play': False} 110 | """ 111 | parser = arg_parser() 112 | parser.add_argument('--env', help='environment ID', type=str, default='Reacher-v2') 113 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 114 | parser.add_argument('--num-timesteps', type=int, default=int(1e6)) 115 | parser.add_argument('--play', default=False, action='store_true') 116 | return parser 117 | 118 | 119 | def robotics_arg_parser(): 120 | """ 121 | Create an argparse.ArgumentParser for run_mujoco.py. 
122 | 123 | :return: (ArgumentParser) parser {'--env': 'FetchReach-v0', '--seed': 0, '--num-timesteps': int(1e6)} 124 | """ 125 | parser = arg_parser() 126 | parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v0') 127 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 128 | parser.add_argument('--num-timesteps', type=int, default=int(1e6)) 129 | return parser 130 | -------------------------------------------------------------------------------- /stable_baselines/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import numpy as np 4 | 5 | 6 | # ================================================================ 7 | # Misc 8 | # ================================================================ 9 | 10 | 11 | def fmt_row(width, row, header=False): 12 | """ 13 | fits a list of items to at least a certain length 14 | 15 | :param width: (int) the minimum width of the string 16 | :param row: ([Any]) a list of object you wish to get the string representation 17 | :param header: (bool) whether or not to return the string as a header 18 | :return: (str) the string representation of all the elements in 'row', of length >= 'width' 19 | """ 20 | out = " | ".join(fmt_item(x, width) for x in row) 21 | if header: 22 | out = out + "\n" + "-" * len(out) 23 | return out 24 | 25 | 26 | def fmt_item(item, min_width): 27 | """ 28 | fits items to a given string length 29 | 30 | :param item: (Any) the item you wish to get the string representation 31 | :param min_width: (int) the minimum width of the string 32 | :return: (str) the string representation of 'x' of length >= 'l' 33 | """ 34 | if isinstance(item, np.ndarray): 35 | assert item.ndim == 0 36 | item = item.item() 37 | if isinstance(item, (float, np.float32, np.float64)): 38 | value = abs(item) 39 | if (value < 1e-4 or value > 1e+4) and value > 0: 40 | rep = "%7.2e" % item 41 | else: 42 | rep = "%7.5f" % item 43 | else: 44 | rep = str(item) 45 | return " " * (min_width - len(rep)) + rep 46 | 47 | 48 | COLOR_TO_NUM = dict( 49 | gray=30, 50 | red=31, 51 | green=32, 52 | yellow=33, 53 | blue=34, 54 | magenta=35, 55 | cyan=36, 56 | white=37, 57 | crimson=38 58 | ) 59 | 60 | 61 | def colorize(string, color, bold=False, highlight=False): 62 | """ 63 | Colorize, bold and/or highlight a string for terminal print 64 | 65 | :param string: (str) input string 66 | :param color: (str) the color, the lookup table is the dict at console_util.color2num 67 | :param bold: (bool) if the string should be bold or not 68 | :param highlight: (bool) if the string should be highlighted or not 69 | :return: (str) the stylized output string 70 | """ 71 | attr = [] 72 | num = COLOR_TO_NUM[color] 73 | if highlight: 74 | num += 10 75 | attr.append(str(num)) 76 | if bold: 77 | attr.append('1') 78 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 79 | -------------------------------------------------------------------------------- /stable_baselines/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Dataset(object): 5 | def __init__(self, data_map, deterministic=False, shuffle=True): 6 | """ 7 | Data loader that handles batches and shuffling. 
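A minimal usage sketch (the arrays here are illustrative, not taken from the project):

    >>> import numpy as np
    >>> dataset = Dataset({'ob': np.arange(10).reshape(5, 2), 'ac': np.arange(5)})
    >>> batch = dataset.next_batch(2)  # dict with 'ob' of shape (2, 2) and 'ac' of shape (2,)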
8 | WARNING: this will alter the given data_map ordering, as dicts are mutable 9 | 10 | :param data_map: (dict) the input data, where every column is a key 11 | :param deterministic: (bool) disables the shuffle function 12 | :param shuffle: (bool) enable auto shuffle 13 | """ 14 | self.data_map = data_map 15 | self.deterministic = deterministic 16 | self.enable_shuffle = shuffle 17 | self.n_samples = next(iter(data_map.values())).shape[0] 18 | self._next_id = 0 19 | self.shuffle() 20 | 21 | def shuffle(self): 22 | """ 23 | shuffles the data_map 24 | """ 25 | if self.deterministic: 26 | return 27 | perm = np.arange(self.n_samples) 28 | np.random.shuffle(perm) 29 | 30 | for key in self.data_map: 31 | self.data_map[key] = self.data_map[key][perm] 32 | 33 | def next_batch(self, batch_size): 34 | """ 35 | returns a batch of data of a given size 36 | 37 | :param batch_size: (int) the size of the batch 38 | :return: (dict) a batch of the input data of size 'batch_size' 39 | """ 40 | if self._next_id >= self.n_samples: 41 | self._next_id = 0 42 | if self.enable_shuffle: 43 | self.shuffle() 44 | 45 | cur_id = self._next_id 46 | cur_batch_size = min(batch_size, self.n_samples - self._next_id) 47 | self._next_id += cur_batch_size 48 | 49 | data_map = dict() 50 | for key in self.data_map: 51 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 52 | return data_map 53 | 54 | def iterate_once(self, batch_size): 55 | """ 56 | generator that iterates over the dataset 57 | 58 | :param batch_size: (int) the size of the batch 59 | :return: (dict) a batch of the input data of size 'batch_size' 60 | """ 61 | if self.enable_shuffle: 62 | self.shuffle() 63 | 64 | while self._next_id <= self.n_samples - batch_size: 65 | yield self.next_batch(batch_size) 66 | self._next_id = 0 67 | 68 | def subset(self, num_elements, deterministic=True): 69 | """ 70 | Return a subset of the current dataset 71 | 72 | :param num_elements: (int) the number of element you wish to have in the subset 73 | :param deterministic: (bool) disables the shuffle function 74 | :return: (Dataset) a new subset of the current Dataset object 75 | """ 76 | data_map = dict() 77 | for key in self.data_map: 78 | data_map[key] = self.data_map[key][:num_elements] 79 | return Dataset(data_map, deterministic) 80 | 81 | 82 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 83 | """ 84 | Iterates over arrays in batches, must provide either num_batches or batch_size, the other must be None. 
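A minimal usage sketch (illustrative values only):

    >>> import numpy as np
    >>> xs, ys = np.arange(6), np.arange(6) * 10
    >>> for batch_x, batch_y in iterbatches((xs, ys), batch_size=4, shuffle=False):
    ...     print(batch_x.shape, batch_y.shape)  # (4,) (4,) then the final partial batch (2,) (2,)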
85 | 86 | :param arrays: (tuple) a tuple of arrays 87 | :param num_batches: (int) the number of batches, must be None is batch_size is defined 88 | :param batch_size: (int) the size of the batch, must be None is num_batches is defined 89 | :param shuffle: (bool) enable auto shuffle 90 | :param include_final_partial_batch: (bool) add the last batch if not the same size as the batch_size 91 | :return: (tuples) a tuple of a batch of the arrays 92 | """ 93 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 94 | arrays = tuple(map(np.asarray, arrays)) 95 | n_samples = arrays[0].shape[0] 96 | assert all(a.shape[0] == n_samples for a in arrays[1:]) 97 | inds = np.arange(n_samples) 98 | if shuffle: 99 | np.random.shuffle(inds) 100 | sections = np.arange(0, n_samples, batch_size)[1:] if num_batches is None else num_batches 101 | for batch_inds in np.array_split(inds, sections): 102 | if include_final_partial_batch or len(batch_inds) == batch_size: 103 | yield tuple(a[batch_inds] for a in arrays) 104 | -------------------------------------------------------------------------------- /stable_baselines/common/identity_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym import Env 4 | from gym.spaces import Discrete, MultiDiscrete, MultiBinary, Box 5 | 6 | 7 | class IdentityEnv(Env): 8 | def __init__(self, dim, ep_length=100): 9 | """ 10 | Identity environment for testing purposes 11 | 12 | :param dim: (int) the size of the dimensions you want to learn 13 | :param ep_length: (int) the length of each episodes in timesteps 14 | """ 15 | self.action_space = Discrete(dim) 16 | self.observation_space = self.action_space 17 | self.ep_length = ep_length 18 | self.current_step = 0 19 | self.dim = dim 20 | self.reset() 21 | 22 | def reset(self): 23 | self.current_step = 0 24 | self._choose_next_state() 25 | return self.state 26 | 27 | def step(self, action): 28 | reward = self._get_reward(action) 29 | self._choose_next_state() 30 | self.current_step += 1 31 | done = self.current_step >= self.ep_length 32 | return self.state, reward, done, {} 33 | 34 | def _choose_next_state(self): 35 | self.state = self.action_space.sample() 36 | 37 | def _get_reward(self, action): 38 | return 1 if np.all(self.state == action) else 0 39 | 40 | def render(self, mode='human'): 41 | pass 42 | 43 | 44 | class IdentityEnvBox(IdentityEnv): 45 | def __init__(self, low=-1, high=1, eps=0.05, ep_length=100): 46 | """ 47 | Identity environment for testing purposes 48 | 49 | :param dim: (int) the size of the dimensions you want to learn 50 | :param low: (float) the lower bound of the box dim 51 | :param high: (float) the upper bound of the box dim 52 | :param eps: (float) the epsilon bound for correct value 53 | :param ep_length: (int) the length of each episodes in timesteps 54 | """ 55 | super(IdentityEnvBox, self).__init__(1, ep_length) 56 | self.action_space = Box(low=low, high=high, shape=(1,), dtype=np.float32) 57 | self.observation_space = self.action_space 58 | self.eps = eps 59 | self.reset() 60 | 61 | def reset(self): 62 | self.current_step = 0 63 | self._choose_next_state() 64 | return self.state 65 | 66 | def step(self, action): 67 | reward = self._get_reward(action) 68 | self._choose_next_state() 69 | self.current_step += 1 70 | done = self.current_step >= self.ep_length 71 | return self.state, reward, done, {} 72 | 73 | def _choose_next_state(self): 74 | self.state = 
self.observation_space.sample() 75 | 76 | def _get_reward(self, action): 77 | return 1 if (self.state - self.eps) <= action <= (self.state + self.eps) else 0 78 | 79 | 80 | class IdentityEnvMultiDiscrete(IdentityEnv): 81 | def __init__(self, dim, ep_length=100): 82 | """ 83 | Identity environment for testing purposes 84 | 85 | :param dim: (int) the size of the dimensions you want to learn 86 | :param ep_length: (int) the length of each episodes in timesteps 87 | """ 88 | super(IdentityEnvMultiDiscrete, self).__init__(dim, ep_length) 89 | self.action_space = MultiDiscrete([dim, dim]) 90 | self.observation_space = self.action_space 91 | self.reset() 92 | 93 | 94 | class IdentityEnvMultiBinary(IdentityEnv): 95 | def __init__(self, dim, ep_length=100): 96 | """ 97 | Identity environment for testing purposes 98 | 99 | :param dim: (int) the size of the dimensions you want to learn 100 | :param ep_length: (int) the length of each episodes in timesteps 101 | """ 102 | super(IdentityEnvMultiBinary, self).__init__(dim, ep_length) 103 | self.action_space = MultiBinary(dim) 104 | self.observation_space = self.action_space 105 | self.reset() 106 | -------------------------------------------------------------------------------- /stable_baselines/common/input.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym.spaces import Discrete, Box, MultiBinary, MultiDiscrete 4 | 5 | 6 | def observation_input(ob_space, batch_size=None, name='Ob', scale=False): 7 | """ 8 | Build observation input with encoding depending on the observation space type 9 | 10 | When using Box ob_space, the input will be normalized between [1, 0] on the bounds ob_space.low and ob_space.high. 11 | 12 | :param ob_space: (Gym Space) The observation space 13 | :param batch_size: (int) batch size for input 14 | (default is None, so that resulting input placeholder can take tensors with any batch size) 15 | :param name: (str) tensorflow variable name for input placeholder 16 | :param scale: (bool) whether or not to scale the input 17 | :return: (TensorFlow Tensor, TensorFlow Tensor) input_placeholder, processed_input_tensor 18 | """ 19 | if isinstance(ob_space, Discrete): 20 | input_x = tf.placeholder(shape=(batch_size,), dtype=tf.int32, name=name) 21 | processed_x = tf.to_float(tf.one_hot(input_x, ob_space.n)) 22 | return input_x, processed_x 23 | 24 | elif isinstance(ob_space, Box): 25 | input_x = tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=ob_space.dtype, name=name) 26 | processed_x = tf.to_float(input_x) 27 | # rescale to [1, 0] if the bounds are defined 28 | if (scale and 29 | not np.any(np.isinf(ob_space.low)) and not np.any(np.isinf(ob_space.high)) and 30 | np.any((ob_space.high - ob_space.low) != 0)): 31 | 32 | # equivalent to processed_x / 255.0 when bounds are set to [255, 0] 33 | processed_x = ((processed_x - ob_space.low) / (ob_space.high - ob_space.low)) 34 | return input_x, processed_x 35 | 36 | elif isinstance(ob_space, MultiBinary): 37 | input_x = tf.placeholder(shape=(batch_size, ob_space.n), dtype=tf.int32, name=name) 38 | processed_x = tf.to_float(input_x) 39 | return input_x, processed_x 40 | 41 | elif isinstance(ob_space, MultiDiscrete): 42 | input_x = tf.placeholder(shape=(batch_size, len(ob_space.nvec)), dtype=tf.int32, name=name) 43 | processed_x = tf.concat([tf.to_float(tf.one_hot(input_split, ob_space.nvec[i])) 44 | for i, input_split in enumerate(tf.split(input_x, len(ob_space.nvec), axis=-1))], 45 | 
axis=-1) 46 | return input_x, processed_x 47 | 48 | else: 49 | raise NotImplementedError("Error: the model does not support input space of type {}".format( 50 | type(ob_space).__name__)) 51 | -------------------------------------------------------------------------------- /stable_baselines/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(vector, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of vector x. 8 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], 9 | where k = len(x) - t - 1 10 | 11 | :param vector: (np.ndarray) the input vector 12 | :param gamma: (float) the discount value 13 | :return: (np.ndarray) the output vector 14 | """ 15 | assert vector.ndim >= 1 16 | return scipy.signal.lfilter([1], [1, -gamma], vector[::-1], axis=0)[::-1] 17 | 18 | 19 | def explained_variance(y_pred, y_true): 20 | """ 21 | Computes fraction of variance that ypred explains about y. 22 | Returns 1 - Var[y-ypred] / Var[y] 23 | 24 | interpretation: 25 | ev=0 => might as well have predicted zero 26 | ev=1 => perfect prediction 27 | ev<0 => worse than just predicting zero 28 | 29 | :param y_pred: (np.ndarray) the prediction 30 | :param y_true: (np.ndarray) the expected value 31 | :return: (float) explained variance of ypred and y 32 | """ 33 | assert y_true.ndim == 1 and y_pred.ndim == 1 34 | var_y = np.var(y_true) 35 | return np.nan if var_y == 0 else 1 - np.var(y_true - y_pred) / var_y 36 | 37 | 38 | def explained_variance_2d(y_pred, y_true): 39 | """ 40 | Computes fraction of variance that ypred explains about y, for 2D arrays. 41 | Returns 1 - Var[y-ypred] / Var[y] 42 | 43 | interpretation: 44 | ev=0 => might as well have predicted zero 45 | ev=1 => perfect prediction 46 | ev<0 => worse than just predicting zero 47 | 48 | :param y_pred: (np.ndarray) the prediction 49 | :param y_true: (np.ndarray) the expected value 50 | :return: (float) explained variance of ypred and y 51 | """ 52 | assert y_true.ndim == 2 and y_pred.ndim == 2 53 | var_y = np.var(y_true, axis=0) 54 | explained_var = 1 - np.var(y_true - y_pred) / var_y 55 | explained_var[var_y < 1e-10] = 0 56 | return explained_var 57 | 58 | 59 | def flatten_arrays(arrs): 60 | """ 61 | flattens a list of arrays down to 1D 62 | 63 | :param arrs: ([np.ndarray]) arrays 64 | :return: (np.ndarray) 1D flattend array 65 | """ 66 | return np.concatenate([arr.flat for arr in arrs]) 67 | 68 | 69 | def unflatten_vector(vec, shapes): 70 | """ 71 | reshape a flattened array 72 | 73 | :param vec: (np.ndarray) 1D arrays 74 | :param shapes: (tuple) 75 | :return: ([np.ndarray]) reshaped array 76 | """ 77 | i = 0 78 | arrs = [] 79 | for shape in shapes: 80 | size = np.prod(shape) 81 | arr = vec[i:i + size].reshape(shape) 82 | arrs.append(arr) 83 | i += size 84 | return arrs 85 | 86 | 87 | def discount_with_boundaries(rewards, episode_starts, gamma): 88 | """ 89 | computes discounted sums along 0th dimension of x (reward), while taking into account the start of each episode. 90 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... 
+ gamma^k x[t+k], 91 | where k = len(x) - t - 1 92 | 93 | :param rewards: (np.ndarray) the input vector (rewards) 94 | :param episode_starts: (np.ndarray) 2d array of bools, indicating when a new episode has started 95 | :param gamma: (float) the discount factor 96 | :return: (np.ndarray) the output vector (discounted rewards) 97 | """ 98 | discounted_rewards = np.zeros_like(rewards) 99 | n_samples = rewards.shape[0] 100 | discounted_rewards[n_samples - 1] = rewards[n_samples - 1] 101 | for step in range(n_samples - 2, -1, -1): 102 | discounted_rewards[step] = rewards[step] + gamma * discounted_rewards[step + 1] * (1 - episode_starts[step + 1]) 103 | return discounted_rewards 104 | -------------------------------------------------------------------------------- /stable_baselines/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | #from mpi4py import MPI 4 | 5 | import stable_baselines.common.tf_util as tf_utils 6 | 7 | 8 | class MpiAdam(object): 9 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None, 10 | sess=None): 11 | """ 12 | A parallel MPI implementation of the Adam optimizer for TensorFlow 13 | https://arxiv.org/abs/1412.6980 14 | 15 | :param var_list: ([TensorFlow Tensor]) the variables 16 | :param beta1: (float) Adam beta1 parameter 17 | :param beta2: (float) Adam beta1 parameter 18 | :param epsilon: (float) to help with preventing arithmetic issues 19 | :param scale_grad_by_procs: (bool) if the scaling should be done by processes 20 | :param comm: (MPI Communicators) if None, MPI.COMM_WORLD 21 | :param sess: (TensorFlow Session) if None, tf.get_default_session() 22 | """ 23 | self.var_list = var_list 24 | self.beta1 = beta1 25 | self.beta2 = beta2 26 | self.epsilon = epsilon 27 | self.scale_grad_by_procs = scale_grad_by_procs 28 | size = sum(tf_utils.numel(v) for v in var_list) 29 | # Exponential moving average of gradient values 30 | # "first moment estimate" m in the paper 31 | self.exp_avg = np.zeros(size, 'float32') 32 | # Exponential moving average of squared gradient values 33 | # "second raw moment estimate" v in the paper 34 | self.exp_avg_sq = np.zeros(size, 'float32') 35 | self.step = 0 36 | self.setfromflat = tf_utils.SetFromFlat(var_list, sess=sess) 37 | self.getflat = tf_utils.GetFlat(var_list, sess=sess) 38 | self.comm = MPI.COMM_WORLD if comm is None else comm 39 | 40 | def update(self, local_grad, learning_rate): 41 | """ 42 | update the values of the graph 43 | 44 | :param local_grad: (numpy float) the gradient 45 | :param learning_rate: (float) the learning_rate for the update 46 | """ 47 | if self.step % 100 == 0: 48 | self.check_synced() 49 | local_grad = local_grad.astype('float32') 50 | global_grad = np.zeros_like(local_grad) 51 | self.comm.Allreduce(local_grad, global_grad, op=MPI.SUM) 52 | if self.scale_grad_by_procs: 53 | global_grad /= self.comm.Get_size() 54 | 55 | self.step += 1 56 | # Learning rate with bias correction 57 | step_size = learning_rate * np.sqrt(1 - self.beta2 ** self.step) / (1 - self.beta1 ** self.step) 58 | # Decay the first and second moment running average coefficient 59 | self.exp_avg = self.beta1 * self.exp_avg + (1 - self.beta1) * global_grad 60 | self.exp_avg_sq = self.beta2 * self.exp_avg_sq + (1 - self.beta2) * (global_grad * global_grad) 61 | step = (- step_size) * self.exp_avg / (np.sqrt(self.exp_avg_sq) + self.epsilon) 62 | self.setfromflat(self.getflat() + step) 63 
| 64 | def sync(self): 65 | """ 66 | syncronize the MPI threads 67 | """ 68 | theta = self.getflat() 69 | self.comm.Bcast(theta, root=0) 70 | self.setfromflat(theta) 71 | 72 | def check_synced(self): 73 | """ 74 | confirm the MPI threads are synced 75 | """ 76 | if self.comm.Get_rank() == 0: # this is root 77 | theta = self.getflat() 78 | self.comm.Bcast(theta, root=0) 79 | else: 80 | thetalocal = self.getflat() 81 | thetaroot = np.empty_like(thetalocal) 82 | self.comm.Bcast(thetaroot, root=0) 83 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 84 | 85 | 86 | @tf_utils.in_session 87 | def test_mpi_adam(): 88 | """ 89 | tests the MpiAdam object's functionality 90 | """ 91 | np.random.seed(0) 92 | tf.set_random_seed(0) 93 | 94 | a_var = tf.Variable(np.random.randn(3).astype('float32')) 95 | b_var = tf.Variable(np.random.randn(2, 5).astype('float32')) 96 | loss = tf.reduce_sum(tf.square(a_var)) + tf.reduce_sum(tf.sin(b_var)) 97 | 98 | learning_rate = 1e-2 99 | update_op = tf.train.AdamOptimizer(learning_rate).minimize(loss) 100 | do_update = tf_utils.function([], loss, updates=[update_op]) 101 | 102 | tf.get_default_session().run(tf.global_variables_initializer()) 103 | for step in range(10): 104 | print(step, do_update()) 105 | 106 | tf.set_random_seed(0) 107 | tf.get_default_session().run(tf.global_variables_initializer()) 108 | 109 | var_list = [a_var, b_var] 110 | lossandgrad = tf_utils.function([], [loss, tf_utils.flatgrad(loss, var_list)], updates=[update_op]) 111 | adam = MpiAdam(var_list) 112 | 113 | for step in range(10): 114 | loss, grad = lossandgrad() 115 | adam.update(grad, learning_rate) 116 | print(step, loss) 117 | 118 | 119 | if __name__ == "__main__": 120 | # Run with mpirun -np 2 python 121 | test_mpi_adam() 122 | -------------------------------------------------------------------------------- /stable_baselines/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | 5 | 6 | def mpi_fork(rank, bind_to_core=False): 7 | """ 8 | Re-launches the current script with workers 9 | Returns "parent" for original parent, "child" for MPI children 10 | 11 | :param rank: (int) the rank 12 | :param bind_to_core: (bool) enables binding to core 13 | :return: (str) the correct type of thread name 14 | """ 15 | if rank <= 1: 16 | return "child" 17 | if os.getenv("IN_MPI") is None: 18 | env = os.environ.copy() 19 | env.update( 20 | MKL_NUM_THREADS="1", 21 | OMP_NUM_THREADS="1", 22 | IN_MPI="1" 23 | ) 24 | args = ["mpirun", "-np", str(rank)] 25 | if bind_to_core: 26 | args += ["-bind-to", "core"] 27 | args += [sys.executable] + sys.argv 28 | subprocess.check_call(args, env=env) 29 | return "parent" 30 | else: 31 | return "child" 32 | -------------------------------------------------------------------------------- /stable_baselines/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | 4 | from stable_baselines.common import zipsame 5 | 6 | 7 | def mpi_mean(arr, axis=0, comm=None, keepdims=False): 8 | """ 9 | calculates the mean of an array, using MPI 10 | 11 | :param arr: (np.ndarray) 12 | :param axis: (int or tuple or list) the axis to run the means over 13 | :param comm: (MPI Communicators) if None, MPI.COMM_WORLD 14 | :param keepdims: (bool) keep the other dimensions intact 15 | :return: (np.ndarray or Number) the result of the sum 16 | """ 17 | arr = np.asarray(arr) 18 | 
assert arr.ndim > 0 19 | if comm is None: 20 | comm = MPI.COMM_WORLD 21 | xsum = arr.sum(axis=axis, keepdims=keepdims) 22 | size = xsum.size 23 | localsum = np.zeros(size + 1, arr.dtype) 24 | localsum[:size] = xsum.ravel() 25 | localsum[size] = arr.shape[axis] 26 | globalsum = np.zeros_like(localsum) 27 | comm.Allreduce(localsum, globalsum, op=MPI.SUM) 28 | return globalsum[:size].reshape(xsum.shape) / globalsum[size], globalsum[size] 29 | 30 | 31 | def mpi_moments(arr, axis=0, comm=None, keepdims=False): 32 | """ 33 | calculates the mean and std of an array, using MPI 34 | 35 | :param arr: (np.ndarray) 36 | :param axis: (int or tuple or list) the axis to run the moments over 37 | :param comm: (MPI Communicators) if None, MPI.COMM_WORLD 38 | :param keepdims: (bool) keep the other dimensions intact 39 | :return: (np.ndarray or Number) the result of the moments 40 | """ 41 | arr = np.asarray(arr) 42 | assert arr.ndim > 0 43 | mean, count = mpi_mean(arr, axis=axis, comm=comm, keepdims=True) 44 | sqdiffs = np.square(arr - mean) 45 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) 46 | assert count1 == count 47 | std = np.sqrt(meansqdiff) 48 | if not keepdims: 49 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 50 | mean = mean.reshape(newshape) 51 | std = std.reshape(newshape) 52 | return mean, std, count 53 | 54 | 55 | def _helper_runningmeanstd(): 56 | comm = MPI.COMM_WORLD 57 | np.random.seed(0) 58 | for (triple, axis) in [ 59 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)), 0), 60 | ((np.random.randn(3, 2), np.random.randn(4, 2), np.random.randn(5, 2)), 0), 61 | ((np.random.randn(2, 3), np.random.randn(2, 4), np.random.randn(2, 4)), 1)]: 62 | 63 | arr = np.concatenate(triple, axis=axis) 64 | ms1 = [arr.mean(axis=axis), arr.std(axis=axis), arr.shape[axis]] 65 | 66 | ms2 = mpi_moments(triple[comm.Get_rank()], axis=axis) 67 | 68 | for (res_1, res_2) in zipsame(ms1, ms2): 69 | print(res_1, res_2) 70 | assert np.allclose(res_1, res_2) 71 | print("ok!") 72 | -------------------------------------------------------------------------------- /stable_baselines/common/mpi_running_mean_std.py: -------------------------------------------------------------------------------- 1 | #from mpi4py import MPI 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | import stable_baselines.common.tf_util as tf_util 6 | 7 | 8 | class RunningMeanStd(object): 9 | def __init__(self, epsilon=1e-2, shape=()): 10 | """ 11 | calulates the running mean and std of a data stream 12 | https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 13 | 14 | :param epsilon: (float) helps with arithmetic issues 15 | :param shape: (tuple) the shape of the data stream's output 16 | """ 17 | self._sum = tf.get_variable( 18 | dtype=tf.float64, 19 | shape=shape, 20 | initializer=tf.constant_initializer(0.0), 21 | name="runningsum", trainable=False) 22 | self._sumsq = tf.get_variable( 23 | dtype=tf.float64, 24 | shape=shape, 25 | initializer=tf.constant_initializer(epsilon), 26 | name="runningsumsq", trainable=False) 27 | self._count = tf.get_variable( 28 | dtype=tf.float64, 29 | shape=(), 30 | initializer=tf.constant_initializer(epsilon), 31 | name="count", trainable=False) 32 | self.shape = shape 33 | 34 | self.mean = tf.to_float(self._sum / self._count) 35 | self.std = tf.sqrt(tf.maximum(tf.to_float(self._sumsq / self._count) - tf.square(self.mean), 1e-2)) 36 | 37 | newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') 38 | newsumsq = 
tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') 39 | newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') 40 | self.incfiltparams = tf_util.function([newsum, newsumsq, newcount], [], 41 | updates=[tf.assign_add(self._sum, newsum), 42 | tf.assign_add(self._sumsq, newsumsq), 43 | tf.assign_add(self._count, newcount)]) 44 | 45 | def update(self, data): 46 | """ 47 | update the running mean and std 48 | 49 | :param data: (np.ndarray) the data 50 | """ 51 | data = data.astype('float64') 52 | data_size = int(np.prod(self.shape)) 53 | totalvec = np.zeros(data_size * 2 + 1, 'float64') 54 | addvec = np.concatenate([data.sum(axis=0).ravel(), np.square(data).sum(axis=0).ravel(), 55 | np.array([len(data)], dtype='float64')]) 56 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 57 | self.incfiltparams(totalvec[0: data_size].reshape(self.shape), 58 | totalvec[data_size: 2 * data_size].reshape(self.shape), totalvec[2 * data_size]) 59 | 60 | 61 | @tf_util.in_session 62 | def test_dist(): 63 | """ 64 | test the running mean std 65 | """ 66 | np.random.seed(0) 67 | p_1, p_2, p_3 = (np.random.randn(3, 1), np.random.randn(4, 1), np.random.randn(5, 1)) 68 | q_1, q_2, q_3 = (np.random.randn(6, 1), np.random.randn(7, 1), np.random.randn(8, 1)) 69 | 70 | comm = MPI.COMM_WORLD 71 | assert comm.Get_size() == 2 72 | if comm.Get_rank() == 0: 73 | x_1, x_2, x_3 = p_1, p_2, p_3 74 | elif comm.Get_rank() == 1: 75 | x_1, x_2, x_3 = q_1, q_2, q_3 76 | else: 77 | assert False 78 | 79 | rms = RunningMeanStd(epsilon=0.0, shape=(1,)) 80 | tf_util.initialize() 81 | 82 | rms.update(x_1) 83 | rms.update(x_2) 84 | rms.update(x_3) 85 | 86 | bigvec = np.concatenate([p_1, p_2, p_3, q_1, q_2, q_3]) 87 | 88 | def checkallclose(var_1, var_2): 89 | print(var_1, var_2) 90 | return np.allclose(var_1, var_2) 91 | 92 | assert checkallclose( 93 | bigvec.mean(axis=0), 94 | rms.mean.eval(), 95 | ) 96 | assert checkallclose( 97 | bigvec.std(axis=0), 98 | rms.std.eval(), 99 | ) 100 | 101 | 102 | if __name__ == "__main__": 103 | # Run with mpirun -np 2 python 104 | test_dist() 105 | -------------------------------------------------------------------------------- /stable_baselines/common/runners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | 5 | class AbstractEnvRunner(ABC): 6 | def __init__(self, *, env, model, n_steps): 7 | """ 8 | A runner to learn the policy of an environment for a model 9 | 10 | :param env: (Gym environment) The environment to learn from 11 | :param model: (Model) The model to learn 12 | :param n_steps: (int) The number of steps to run for each environment 13 | """ 14 | self.env = env 15 | self.model = model 16 | n_env = env.num_envs 17 | self.batch_ob_shape = (n_env*n_steps,) + env.observation_space.shape 18 | self.obs = np.zeros((n_env,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) 19 | self.obs[:] = env.reset() 20 | self.n_steps = n_steps 21 | self.states = model.initial_state 22 | self.dones = [False for _ in range(n_env)] 23 | 24 | @abstractmethod 25 | def run(self): 26 | """ 27 | Run a learning step of the model 28 | """ 29 | raise NotImplementedError 30 | -------------------------------------------------------------------------------- /stable_baselines/common/running_mean_std.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningMeanStd(object): 5 | def __init__(self, 
epsilon=1e-4, shape=()): 6 | """ 7 | calulates the running mean and std of a data stream 8 | https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 9 | 10 | :param epsilon: (float) helps with arithmetic issues 11 | :param shape: (tuple) the shape of the data stream's output 12 | """ 13 | self.mean = np.zeros(shape, 'float64') 14 | self.var = np.ones(shape, 'float64') 15 | self.count = epsilon 16 | 17 | def update(self, arr): 18 | batch_mean = np.mean(arr, axis=0) 19 | batch_var = np.var(arr, axis=0) 20 | batch_count = arr.shape[0] 21 | self.update_from_moments(batch_mean, batch_var, batch_count) 22 | 23 | def update_from_moments(self, batch_mean, batch_var, batch_count): 24 | delta = batch_mean - self.mean 25 | tot_count = self.count + batch_count 26 | 27 | new_mean = self.mean + delta * batch_count / tot_count 28 | m_a = self.var * self.count 29 | m_b = batch_var * batch_count 30 | m_2 = m_a + m_b + np.square(delta) * self.count * batch_count / (self.count + batch_count) 31 | new_var = m_2 / (self.count + batch_count) 32 | 33 | new_count = batch_count + self.count 34 | 35 | self.mean = new_mean 36 | self.var = new_var 37 | self.count = new_count 38 | -------------------------------------------------------------------------------- /stable_baselines/common/running_stat.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RunningStat(object): 5 | def __init__(self, shape): 6 | """ 7 | calulates the running mean and std of a data stream 8 | http://www.johndcook.com/blog/standard_deviation/ 9 | 10 | :param shape: (tuple) the shape of the data stream's output 11 | """ 12 | self._step = 0 13 | self._mean = np.zeros(shape) 14 | self._std = np.zeros(shape) 15 | 16 | def push(self, value): 17 | """ 18 | update the running mean and std 19 | 20 | :param value: (np.ndarray) the data 21 | """ 22 | value = np.asarray(value) 23 | assert value.shape == self._mean.shape 24 | self._step += 1 25 | if self._step == 1: 26 | self._mean[...] = value 27 | else: 28 | old_m = self._mean.copy() 29 | self._mean[...] = old_m + (value - old_m) / self._step 30 | self._std[...] 
= self._std + (value - old_m) * (value - self._mean) 31 | 32 | @property 33 | def n(self): 34 | """ 35 | the number of data points 36 | 37 | :return: (int) 38 | """ 39 | return self._step 40 | 41 | @property 42 | def mean(self): 43 | """ 44 | the average value 45 | 46 | :return: (float) 47 | """ 48 | return self._mean 49 | 50 | @property 51 | def var(self): 52 | """ 53 | the variation of the data points 54 | 55 | :return: (float) 56 | """ 57 | return self._std / (self._step - 1) if self._step > 1 else np.square(self._mean) 58 | 59 | @property 60 | def std(self): 61 | """ 62 | the standard deviation of the data points 63 | 64 | :return: (float) 65 | """ 66 | return np.sqrt(self.var) 67 | 68 | @property 69 | def shape(self): 70 | """ 71 | the shape of the data points 72 | 73 | :return: (tuple) 74 | """ 75 | return self._mean.shape 76 | -------------------------------------------------------------------------------- /stable_baselines/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | 4 | - learning rate for the optimizer 5 | - exploration epsilon for the epsilon greedy exploration strategy 6 | - beta parameter for beta parameter in prioritized replay 7 | 8 | Each schedule has a function `value(t)` which returns the current value 9 | of the parameter given the timestep t of the optimization procedure. 10 | """ 11 | 12 | 13 | class Schedule(object): 14 | def value(self, step): 15 | """ 16 | Value of the schedule for a given timestep 17 | 18 | :param step: (int) the timestep 19 | :return: (float) the output value for the given timestep 20 | """ 21 | raise NotImplementedError 22 | 23 | 24 | class ConstantSchedule(Schedule): 25 | """ 26 | Value remains constant over time. 27 | 28 | :param value: (float) Constant value of the schedule 29 | """ 30 | 31 | def __init__(self, value): 32 | self._value = value 33 | 34 | def value(self, step): 35 | return self._value 36 | 37 | 38 | def linear_interpolation(left, right, alpha): 39 | """ 40 | Linear interpolation between `left` and `right`. 41 | 42 | :param left: (float) left boundary 43 | :param right: (float) right boundary 44 | :param alpha: (float) coeff in [0, 1] 45 | :return: (float) 46 | """ 47 | 48 | return left + alpha * (right - left) 49 | 50 | 51 | class PiecewiseSchedule(Schedule): 52 | """ 53 | Piecewise schedule. 54 | 55 | :param endpoints: ([(int, int)]) 56 | list of pairs `(time, value)` meanining that schedule should output 57 | `value` when `t==time`. All the values for time must be sorted in 58 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 59 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 60 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 61 | time passed between `time_a` and `time_b` for time `t`. 62 | :param interpolation: (lambda (float, float, float): float) 63 | a function that takes value to the left and to the right of t according 64 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 65 | right endpoint that t has covered. See linear_interpolation for example. 66 | :param outside_value: (float) 67 | if the value is requested outside of all the intervals sepecified in 68 | `endpoints` this value is returned. If None then AssertionError is 69 | raised when outside value is requested. 
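        (Illustrative example, not part of the original docstring: with
        endpoints=[(0, 1.0), (100, 0.1)] and linear interpolation, value(50)
        returns 0.55, while value(200) falls outside every interval and returns
        outside_value, raising an AssertionError if outside_value is None.)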
70 | """ 71 | 72 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 73 | idxes = [e[0] for e in endpoints] 74 | assert idxes == sorted(idxes) 75 | self._interpolation = interpolation 76 | self._outside_value = outside_value 77 | self._endpoints = endpoints 78 | 79 | def value(self, step): 80 | for (left_t, left), (right_t, right) in zip(self._endpoints[:-1], self._endpoints[1:]): 81 | if left_t <= step < right_t: 82 | alpha = float(step - left_t) / (right_t - left_t) 83 | return self._interpolation(left, right, alpha) 84 | 85 | # t does not belong to any of the pieces, so doom. 86 | assert self._outside_value is not None 87 | return self._outside_value 88 | 89 | 90 | class LinearSchedule(Schedule): 91 | """ 92 | Linear interpolation between initial_p and final_p over 93 | schedule_timesteps. After this many timesteps pass final_p is 94 | returned. 95 | 96 | :param schedule_timesteps: (int) Number of timesteps for which to linearly anneal initial_p to final_p 97 | :param initial_p: (float) initial output value 98 | :param final_p: (float) final output value 99 | """ 100 | 101 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 102 | self.schedule_timesteps = schedule_timesteps 103 | self.final_p = final_p 104 | self.initial_p = initial_p 105 | 106 | def value(self, step): 107 | fraction = min(float(step) / self.schedule_timesteps, 1.0) 108 | return self.initial_p + fraction * (self.final_p - self.initial_p) 109 | -------------------------------------------------------------------------------- /stable_baselines/common/segment_tree.py: -------------------------------------------------------------------------------- 1 | import operator 2 | 3 | 4 | class SegmentTree(object): 5 | def __init__(self, capacity, operation, neutral_element): 6 | """ 7 | Build a Segment Tree data structure. 8 | 9 | https://en.wikipedia.org/wiki/Segment_tree 10 | 11 | Can be used as regular array, but with two 12 | important differences: 13 | 14 | a) setting item's value is slightly slower. 15 | It is O(lg capacity) instead of O(1). 16 | b) user has access to an efficient ( O(log segment size) ) 17 | `reduce` operation which reduces `operation` over 18 | a contiguous subsequence of items in the array. 19 | 20 | :param capacity: (int) Total size of the array - must be a power of two. 21 | :param operation: (lambda (Any, Any): Any) operation for combining elements (eg. sum, max) must form a 22 | mathematical group together with the set of possible values for array elements (i.e. be associative) 23 | :param neutral_element: (Any) neutral element for the operation above. eg. float('-inf') for max and 0 for sum. 24 | """ 25 | assert capacity > 0 and capacity & (capacity - 1) == 0, "capacity must be positive and a power of 2." 
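        # (Added note, not in the original file:) the tree is stored as a flat
        # list of length 2 * capacity: the root sits at index 1, the children of
        # node i are 2 * i and 2 * i + 1, and the leaves occupy indices
        # capacity .. 2 * capacity - 1, which is why __setitem__ below adds
        # capacity to the external index before bubbling the update up to the root.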
26 | self._capacity = capacity 27 | self._value = [neutral_element for _ in range(2 * capacity)] 28 | self._operation = operation 29 | 30 | def _reduce_helper(self, start, end, node, node_start, node_end): 31 | if start == node_start and end == node_end: 32 | return self._value[node] 33 | mid = (node_start + node_end) // 2 34 | if end <= mid: 35 | return self._reduce_helper(start, end, 2 * node, node_start, mid) 36 | else: 37 | if mid + 1 <= start: 38 | return self._reduce_helper(start, end, 2 * node + 1, mid + 1, node_end) 39 | else: 40 | return self._operation( 41 | self._reduce_helper(start, mid, 2 * node, node_start, mid), 42 | self._reduce_helper(mid + 1, end, 2 * node + 1, mid + 1, node_end) 43 | ) 44 | 45 | def reduce(self, start=0, end=None): 46 | """ 47 | Returns result of applying `self.operation` 48 | to a contiguous subsequence of the array. 49 | 50 | self.operation(arr[start], operation(arr[start+1], operation(... arr[end]))) 51 | 52 | :param start: (int) beginning of the subsequence 53 | :param end: (int) end of the subsequences 54 | :return: (Any) result of reducing self.operation over the specified range of array elements. 55 | """ 56 | if end is None: 57 | end = self._capacity 58 | if end < 0: 59 | end += self._capacity 60 | end -= 1 61 | return self._reduce_helper(start, end, 1, 0, self._capacity - 1) 62 | 63 | def __setitem__(self, idx, val): 64 | # index of the leaf 65 | idx += self._capacity 66 | self._value[idx] = val 67 | idx //= 2 68 | while idx >= 1: 69 | self._value[idx] = self._operation( 70 | self._value[2 * idx], 71 | self._value[2 * idx + 1] 72 | ) 73 | idx //= 2 74 | 75 | def __getitem__(self, idx): 76 | assert 0 <= idx < self._capacity 77 | return self._value[self._capacity + idx] 78 | 79 | 80 | class SumSegmentTree(SegmentTree): 81 | def __init__(self, capacity): 82 | super(SumSegmentTree, self).__init__( 83 | capacity=capacity, 84 | operation=operator.add, 85 | neutral_element=0.0 86 | ) 87 | 88 | def sum(self, start=0, end=None): 89 | """ 90 | Returns arr[start] + ... + arr[end] 91 | 92 | :param start: (int) start position of the reduction (must be >= 0) 93 | :param end: (int) end position of the reduction (must be < len(arr), can be None for len(arr) - 1) 94 | :return: (Any) reduction of SumSegmentTree 95 | """ 96 | return super(SumSegmentTree, self).reduce(start, end) 97 | 98 | def find_prefixsum_idx(self, prefixsum): 99 | """ 100 | Find the highest index `i` in the array such that 101 | sum(arr[0] + arr[1] + ... + arr[i - i]) <= prefixsum 102 | 103 | if array values are probabilities, this function 104 | allows to sample indexes according to the discrete 105 | probability efficiently. 
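        (Illustrative example, not part of the original docstring: for stored
        values [1, 2, 3, 0] with capacity 4, find_prefixsum_idx(2.5) returns 1,
        because arr[0] = 1 <= 2.5 while arr[0] + arr[1] = 3 > 2.5.)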
106 | 107 | :param prefixsum: (float) upperbound on the sum of array prefix 108 | :return: (int) highest index satisfying the prefixsum constraint 109 | """ 110 | assert 0 <= prefixsum <= self.sum() + 1e-5 111 | idx = 1 112 | while idx < self._capacity: # while non-leaf 113 | if self._value[2 * idx] > prefixsum: 114 | idx = 2 * idx 115 | else: 116 | prefixsum -= self._value[2 * idx] 117 | idx = 2 * idx + 1 118 | return idx - self._capacity 119 | 120 | 121 | class MinSegmentTree(SegmentTree): 122 | def __init__(self, capacity): 123 | super(MinSegmentTree, self).__init__( 124 | capacity=capacity, 125 | operation=min, 126 | neutral_element=float('inf') 127 | ) 128 | 129 | def min(self, start=0, end=None): 130 | """ 131 | Returns min(arr[start], ..., arr[end]) 132 | 133 | :param start: (int) start position of the reduction (must be >= 0) 134 | :param end: (int) end position of the reduction (must be < len(arr), can be None for len(arr) - 1) 135 | :return: (Any) reduction of MinSegmentTree 136 | """ 137 | return super(MinSegmentTree, self).reduce(start, end) 138 | -------------------------------------------------------------------------------- /stable_baselines/common/tile_images.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def tile_images(img_nhwc): 5 | """ 6 | Tile N images into one big PxQ image 7 | (P,Q) are chosen to be as close as possible, and if N 8 | is square, then P=Q. 9 | 10 | :param img_nhwc: (list) list or array of images, ndim=4 once turned into array. img nhwc 11 | n = batch index, h = height, w = width, c = channel 12 | :return: (numpy float) img_HWc, ndim=3 13 | """ 14 | img_nhwc = np.asarray(img_nhwc) 15 | n_images, height, width, n_channels = img_nhwc.shape 16 | # new_height was named H before 17 | new_height = int(np.ceil(np.sqrt(n_images))) 18 | # new_width was named W before 19 | new_width = int(np.ceil(float(n_images) / new_height)) 20 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0] * 0 for _ in range(n_images, new_height * new_width)]) 21 | # img_HWhwc 22 | out_image = img_nhwc.reshape(new_height, new_width, height, width, n_channels) 23 | # img_HhWwc 24 | out_image = out_image.transpose(0, 2, 1, 3, 4) 25 | # img_Hh_Ww_c 26 | out_image = out_image.reshape(new_height * height, new_width * width, n_channels) 27 | return out_image 28 | 29 | -------------------------------------------------------------------------------- /stable_baselines/common/vec_env/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F401 2 | from stable_baselines.common.vec_env.base_vec_env import AlreadySteppingError, NotSteppingError, VecEnv, VecEnvWrapper, \ 3 | CloudpickleWrapper 4 | from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv 5 | from stable_baselines.common.vec_env.subproc_vec_env import SubprocVecEnv 6 | from stable_baselines.common.vec_env.vec_frame_stack import VecFrameStack 7 | from stable_baselines.common.vec_env.vec_normalize import VecNormalize 8 | -------------------------------------------------------------------------------- /stable_baselines/common/vec_env/base_vec_env.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | import pickle 3 | 4 | import cloudpickle 5 | from stable_baselines import logger 6 | 7 | 8 | class AlreadySteppingError(Exception): 9 | """ 10 | Raised when an asynchronous step is running while 11 | step_async() is called again. 
12 | """ 13 | 14 | def __init__(self): 15 | msg = 'already running an async step' 16 | Exception.__init__(self, msg) 17 | 18 | 19 | class NotSteppingError(Exception): 20 | """ 21 | Raised when an asynchronous step is not running but 22 | step_wait() is called. 23 | """ 24 | 25 | def __init__(self): 26 | msg = 'not running an async step' 27 | Exception.__init__(self, msg) 28 | 29 | 30 | class VecEnv(ABC): 31 | """ 32 | An abstract asynchronous, vectorized environment. 33 | 34 | :param num_envs: (int) the number of environments 35 | :param observation_space: (Gym Space) the observation space 36 | :param action_space: (Gym Space) the action space 37 | """ 38 | 39 | def __init__(self, num_envs, observation_space, action_space): 40 | self.num_envs = num_envs 41 | self.observation_space = observation_space 42 | self.action_space = action_space 43 | 44 | @abstractmethod 45 | def reset(self): 46 | """ 47 | Reset all the environments and return an array of 48 | observations, or a tuple of observation arrays. 49 | 50 | If step_async is still doing work, that work will 51 | be cancelled and step_wait() should not be called 52 | until step_async() is invoked again. 53 | 54 | :return: ([int] or [float]) observation 55 | """ 56 | pass 57 | 58 | @abstractmethod 59 | def step_async(self, actions): 60 | """ 61 | Tell all the environments to start taking a step 62 | with the given actions. 63 | Call step_wait() to get the results of the step. 64 | 65 | You should not call this if a step_async run is 66 | already pending. 67 | """ 68 | pass 69 | 70 | @abstractmethod 71 | def step_wait(self): 72 | """ 73 | Wait for the step taken with step_async(). 74 | 75 | :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information 76 | """ 77 | pass 78 | 79 | @abstractmethod 80 | def close(self): 81 | """ 82 | Clean up the environment's resources. 
83 | """ 84 | pass 85 | 86 | def step(self, actions): 87 | """ 88 | Step the environments with the given action 89 | 90 | :param actions: ([int] or [float]) the action 91 | :return: ([int] or [float], [float], [bool], dict) observation, reward, done, information 92 | """ 93 | self.step_async(actions) 94 | return self.step_wait() 95 | 96 | def get_images(self): 97 | """ 98 | Return RGB images from each environment 99 | """ 100 | raise NotImplementedError 101 | 102 | def render(self, *args, **kwargs): 103 | """ 104 | Gym environment rendering 105 | 106 | :param mode: (str) the rendering type 107 | """ 108 | logger.warn('Render not defined for %s' % self) 109 | 110 | @property 111 | def unwrapped(self): 112 | if isinstance(self, VecEnvWrapper): 113 | return self.venv.unwrapped 114 | else: 115 | return self 116 | 117 | 118 | class VecEnvWrapper(VecEnv): 119 | """ 120 | Vectorized environment base class 121 | 122 | :param venv: (VecEnv) the vectorized environment to wrap 123 | :param observation_space: (Gym Space) the observation space (can be None to load from venv) 124 | :param action_space: (Gym Space) the action space (can be None to load from venv) 125 | """ 126 | 127 | def __init__(self, venv, observation_space=None, action_space=None): 128 | self.venv = venv 129 | VecEnv.__init__(self, num_envs=venv.num_envs, observation_space=observation_space or venv.observation_space, 130 | action_space=action_space or venv.action_space) 131 | 132 | def step_async(self, actions): 133 | self.venv.step_async(actions) 134 | 135 | @abstractmethod 136 | def reset(self): 137 | pass 138 | 139 | @abstractmethod 140 | def step_wait(self): 141 | pass 142 | 143 | def close(self): 144 | return self.venv.close() 145 | 146 | def render(self, *args, **kwargs): 147 | return self.venv.render(*args, **kwargs) 148 | 149 | def get_images(self): 150 | return self.venv.get_images() 151 | 152 | 153 | class CloudpickleWrapper(object): 154 | def __init__(self, var): 155 | """ 156 | Uses cloudpickle to serialize contents (otherwise multiprocessing tries to use pickle) 157 | 158 | :param var: (Any) the variable you wish to wrap for pickling with cloudpickle 159 | """ 160 | self.var = var 161 | 162 | def __getstate__(self): 163 | return cloudpickle.dumps(self.var) 164 | 165 | def __setstate__(self, obs): 166 | self.var = pickle.loads(obs) 167 | -------------------------------------------------------------------------------- /stable_baselines/common/vec_env/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import numpy as np 4 | from gym import spaces 5 | 6 | from . 
import VecEnv 7 | 8 | 9 | class DummyVecEnv(VecEnv): 10 | """ 11 | Creates a simple vectorized wrapper for multiple environments 12 | 13 | :param env_fns: ([Gym Environment]) the list of environments to vectorize 14 | """ 15 | 16 | def __init__(self, env_fns): 17 | self.envs = [fn() for fn in env_fns] 18 | env = self.envs[0] 19 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 20 | shapes, dtypes = {}, {} 21 | self.keys = [] 22 | obs_space = env.observation_space 23 | 24 | if isinstance(obs_space, spaces.Dict): 25 | assert isinstance(obs_space.spaces, OrderedDict) 26 | subspaces = obs_space.spaces 27 | else: 28 | subspaces = {None: obs_space} 29 | 30 | for key, box in subspaces.items(): 31 | shapes[key] = box.shape 32 | dtypes[key] = box.dtype 33 | self.keys.append(key) 34 | 35 | self.buf_obs = {k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys} 36 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 37 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 38 | self.buf_infos = [{} for _ in range(self.num_envs)] 39 | self.actions = None 40 | 41 | def step_async(self, actions): 42 | self.actions = actions 43 | 44 | def step_wait(self): 45 | for env_idx in range(self.num_envs): 46 | obs, self.buf_rews[env_idx], self.buf_dones[env_idx], self.buf_infos[env_idx] =\ 47 | self.envs[env_idx].step(self.actions[env_idx]) 48 | if self.buf_dones[env_idx]: 49 | obs = self.envs[env_idx].reset() 50 | self._save_obs(env_idx, obs) 51 | return (np.copy(self._obs_from_buf()), np.copy(self.buf_rews), np.copy(self.buf_dones), 52 | self.buf_infos.copy()) 53 | 54 | def reset(self): 55 | for env_idx in range(self.num_envs): 56 | obs = self.envs[env_idx].reset() 57 | self._save_obs(env_idx, obs) 58 | return np.copy(self._obs_from_buf()) 59 | 60 | def close(self): 61 | return 62 | 63 | def get_images(self): 64 | return [env.render(mode='rgb_array') for env in self.envs] 65 | 66 | def render(self, *args, **kwargs): 67 | if self.num_envs == 1: 68 | return self.envs[0].render(*args, **kwargs) 69 | else: 70 | return super().render(*args, **kwargs) 71 | 72 | def _save_obs(self, env_idx, obs): 73 | for key in self.keys: 74 | if key is None: 75 | self.buf_obs[key][env_idx] = obs 76 | else: 77 | self.buf_obs[key][env_idx] = obs[key] 78 | 79 | def _obs_from_buf(self): 80 | if self.keys == [None]: 81 | return self.buf_obs[None] 82 | else: 83 | return self.buf_obs 84 | -------------------------------------------------------------------------------- /stable_baselines/common/vec_env/subproc_vec_env.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Pipe 2 | 3 | import numpy as np 4 | 5 | from stable_baselines.common.vec_env import VecEnv, CloudpickleWrapper 6 | from stable_baselines.common.tile_images import tile_images 7 | 8 | 9 | def _worker(remote, parent_remote, env_fn_wrapper): 10 | parent_remote.close() 11 | env = env_fn_wrapper.var() 12 | while True: 13 | try: 14 | cmd, data = remote.recv() 15 | if cmd == 'step': 16 | observation, reward, done, info = env.step(data) 17 | if done: 18 | observation = env.reset() 19 | remote.send((observation, reward, done, info)) 20 | elif cmd == 'reset': 21 | observation = env.reset() 22 | remote.send(observation) 23 | elif cmd == 'render': 24 | remote.send(env.render(*data[0], **data[1])) 25 | elif cmd == 'close': 26 | remote.close() 27 | break 28 | elif cmd == 'get_spaces': 29 | remote.send((env.observation_space, env.action_space)) 
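                # (Added note:) the parent SubprocVecEnv talks to this worker
                # through (command, data) tuples sent over the pipe; any command
                # other than 'step', 'reset', 'render', 'close' or 'get_spaces'
                # falls through to the NotImplementedError below.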
30 | else: 31 | raise NotImplementedError 32 | except EOFError: 33 | break 34 | 35 | 36 | class SubprocVecEnv(VecEnv): 37 | """ 38 | Creates a multiprocess vectorized wrapper for multiple environments 39 | 40 | :param env_fns: ([Gym Environment]) Environments to run in subprocesses 41 | """ 42 | 43 | def __init__(self, env_fns): 44 | self.waiting = False 45 | self.closed = False 46 | n_envs = len(env_fns) 47 | self.remotes, self.work_remotes = zip(*[Pipe() for _ in range(n_envs)]) 48 | self.processes = [Process(target=_worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 49 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 50 | for process in self.processes: 51 | process.daemon = True # if the main process crashes, we should not cause things to hang 52 | process.start() 53 | for remote in self.work_remotes: 54 | remote.close() 55 | 56 | self.remotes[0].send(('get_spaces', None)) 57 | observation_space, action_space = self.remotes[0].recv() 58 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 59 | 60 | def step_async(self, actions): 61 | for remote, action in zip(self.remotes, actions): 62 | remote.send(('step', action)) 63 | self.waiting = True 64 | 65 | def step_wait(self): 66 | results = [remote.recv() for remote in self.remotes] 67 | self.waiting = False 68 | obs, rews, dones, infos = zip(*results) 69 | return np.stack(obs), np.stack(rews), np.stack(dones), infos 70 | 71 | def reset(self): 72 | for remote in self.remotes: 73 | remote.send(('reset', None)) 74 | return np.stack([remote.recv() for remote in self.remotes]) 75 | 76 | def close(self): 77 | if self.closed: 78 | return 79 | if self.waiting: 80 | for remote in self.remotes: 81 | remote.recv() 82 | for remote in self.remotes: 83 | remote.send(('close', None)) 84 | for process in self.processes: 85 | process.join() 86 | self.closed = True 87 | 88 | def render(self, mode='human', *args, **kwargs): 89 | for pipe in self.remotes: 90 | # gather images from subprocesses 91 | # `mode` will be taken into account later 92 | pipe.send(('render', (args, {'mode': 'rgb_array', **kwargs}))) 93 | imgs = [pipe.recv() for pipe in self.remotes] 94 | # Create a big image by tiling images from subprocesses 95 | bigimg = tile_images(imgs) 96 | if mode == 'human': 97 | import cv2 98 | cv2.imshow('vecenv', bigimg[:, :, ::-1]) 99 | cv2.waitKey(1) 100 | elif mode == 'rgb_array': 101 | return bigimg 102 | else: 103 | raise NotImplementedError 104 | 105 | def get_images(self): 106 | for pipe in self.remotes: 107 | pipe.send(('render', {"mode": 'rgb_array'})) 108 | imgs = [pipe.recv() for pipe in self.remotes] 109 | return imgs 110 | -------------------------------------------------------------------------------- /stable_baselines/common/vec_env/vec_frame_stack.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import spaces 3 | 4 | from stable_baselines.common.vec_env import VecEnvWrapper 5 | 6 | 7 | class VecFrameStack(VecEnvWrapper): 8 | """ 9 | Frame stacking wrapper for vectorized environment 10 | 11 | :param venv: (VecEnv) the vectorized environment to wrap 12 | :param n_stack: (int) Number of frames to stack 13 | """ 14 | 15 | def __init__(self, venv, n_stack): 16 | self.venv = venv 17 | self.n_stack = n_stack 18 | wrapped_obs_space = venv.observation_space 19 | low = np.repeat(wrapped_obs_space.low, self.n_stack, axis=-1) 20 | high = np.repeat(wrapped_obs_space.high, self.n_stack, axis=-1) 21 | self.stackedobs = 
np.zeros((venv.num_envs,) + low.shape, low.dtype) 22 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 23 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 24 | 25 | def step_wait(self): 26 | observations, rewards, dones, infos = self.venv.step_wait() 27 | self.stackedobs = np.roll(self.stackedobs, shift=-observations.shape[-1], axis=-1) 28 | for i, done in enumerate(dones): 29 | if done: 30 | self.stackedobs[i] = 0 31 | self.stackedobs[..., -observations.shape[-1]:] = observations 32 | return self.stackedobs, rewards, dones, infos 33 | 34 | def reset(self): 35 | """ 36 | Reset all environments 37 | """ 38 | obs = self.venv.reset() 39 | self.stackedobs[...] = 0 40 | self.stackedobs[..., -obs.shape[-1]:] = obs 41 | return self.stackedobs 42 | 43 | def close(self): 44 | self.venv.close() 45 | -------------------------------------------------------------------------------- /stable_baselines/common/vec_env/vec_normalize.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import numpy as np 4 | 5 | from stable_baselines.common.vec_env import VecEnvWrapper 6 | from stable_baselines.common.running_mean_std import RunningMeanStd 7 | 8 | 9 | class VecNormalize(VecEnvWrapper): 10 | """ 11 | A moving average, normalizing wrapper for vectorized environment. 12 | has support for saving/loading moving average, 13 | 14 | :param venv: (VecEnv) the vectorized environment to wrap 15 | :param training: (bool) Whether to update or not the moving average 16 | :param norm_obs: (bool) Whether to normalize observation or not (default: True) 17 | :param norm_reward: (bool) Whether to normalize rewards or not (default: False) 18 | :param clip_obs: (float) Max absolute value for observation 19 | :param clip_reward: (float) Max value absolute for discounted reward 20 | :param gamma: (float) discount factor 21 | :param epsilon: (float) To avoid division by zero 22 | """ 23 | 24 | def __init__(self, venv, training=True, norm_obs=True, norm_reward=True, 25 | clip_obs=10., clip_reward=10., gamma=0.99, epsilon=1e-8): 26 | VecEnvWrapper.__init__(self, venv) 27 | self.obs_rms = RunningMeanStd(shape=self.observation_space.shape) 28 | self.ret_rms = RunningMeanStd(shape=()) 29 | self.clip_obs = clip_obs 30 | self.clip_reward = clip_reward 31 | # Returns: discounted rewards 32 | self.ret = np.zeros(self.num_envs) 33 | self.gamma = gamma 34 | self.epsilon = epsilon 35 | self.training = training 36 | self.norm_obs = norm_obs 37 | self.norm_reward = norm_reward 38 | self.old_obs = np.array([]) 39 | 40 | def step_wait(self): 41 | """ 42 | Apply sequence of actions to sequence of environments 43 | actions -> (observations, rewards, news) 44 | 45 | where 'news' is a boolean vector indicating whether each element is new. 
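        (Added note, not in the original docstring: when norm_reward is enabled,
        rewards are scaled by the running standard deviation of the discounted
        return self.ret rather than of the raw rewards, and observations and
        rewards are clipped to [-clip_obs, clip_obs] and [-clip_reward, clip_reward].)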
46 | """ 47 | obs, rews, news, infos = self.venv.step_wait() 48 | self.ret = self.ret * self.gamma + rews 49 | self.old_obs = obs 50 | obs = self._normalize_observation(obs) 51 | if self.norm_reward: 52 | if self.training: 53 | self.ret_rms.update(self.ret) 54 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.clip_reward, self.clip_reward) 55 | self.ret[news] = 0 56 | return obs, rews, news, infos 57 | 58 | def _normalize_observation(self, obs): 59 | """ 60 | :param obs: (numpy tensor) 61 | """ 62 | if self.norm_obs: 63 | if self.training: 64 | self.obs_rms.update(obs) 65 | obs = np.clip((obs - self.obs_rms.mean) / np.sqrt(self.obs_rms.var + self.epsilon), -self.clip_obs, 66 | self.clip_obs) 67 | return obs 68 | else: 69 | return obs 70 | 71 | def get_original_obs(self): 72 | """ 73 | returns the unnormalized observation 74 | 75 | :return: (numpy float) 76 | """ 77 | return self.old_obs 78 | 79 | def reset(self): 80 | """ 81 | Reset all environments 82 | """ 83 | obs = self.venv.reset() 84 | if len(np.array(obs).shape) == 1: # for when num_cpu is 1 85 | self.old_obs = [obs] 86 | else: 87 | self.old_obs = obs 88 | self.ret = np.zeros(self.num_envs) 89 | return self._normalize_observation(obs) 90 | 91 | def save_running_average(self, path): 92 | """ 93 | :param path: (str) path to log dir 94 | """ 95 | for rms, name in zip([self.obs_rms, self.ret_rms], ['obs_rms', 'ret_rms']): 96 | with open("{}/{}.pkl".format(path, name), 'wb') as file_handler: 97 | pickle.dump(rms, file_handler) 98 | 99 | def load_running_average(self, path): 100 | """ 101 | :param path: (str) path to log dir 102 | """ 103 | for name in ['obs_rms', 'ret_rms']: 104 | with open("{}/{}.pkl".format(path, name), 'rb') as file_handler: 105 | setattr(self, name, pickle.load(file_handler)) 106 | -------------------------------------------------------------------------------- /stable_baselines/ddpg/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.ddpg.ddpg import DDPG 2 | from stable_baselines.ddpg.policies import MlpPolicy, CnnPolicy, LnMlpPolicy, LnCnnPolicy 3 | from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec, NormalActionNoise, OrnsteinUhlenbeckActionNoise 4 | -------------------------------------------------------------------------------- /stable_baselines/ddpg/main.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import time 3 | import os 4 | 5 | import gym 6 | import tensorflow as tf 7 | import numpy as np 8 | from mpi4py import MPI 9 | 10 | from stable_baselines import logger, bench 11 | from stable_baselines.common.misc_util import set_global_seeds, boolean_flag 12 | from stable_baselines.ddpg.policies import MlpPolicy, LnMlpPolicy 13 | from stable_baselines.ddpg import DDPG 14 | from stable_baselines.ddpg.memory import Memory 15 | from stable_baselines.ddpg.noise import AdaptiveParamNoiseSpec, OrnsteinUhlenbeckActionNoise, NormalActionNoise 16 | 17 | 18 | def run(env_id, seed, noise_type, layer_norm, evaluation, **kwargs): 19 | """ 20 | run the training of DDPG 21 | 22 | :param env_id: (str) the environment ID 23 | :param seed: (int) the initial random seed 24 | :param noise_type: (str) the wanted noises ('adaptive-param', 'normal' or 'ou'), can use multiple noise type by 25 | seperating them with commas 26 | :param layer_norm: (bool) use layer normalization 27 | :param evaluation: (bool) enable evaluation of DDPG training 28 | :param kwargs: (dict) extra 
keywords for the training.train function 29 | """ 30 | 31 | # Configure things. 32 | rank = MPI.COMM_WORLD.Get_rank() 33 | if rank != 0: 34 | logger.set_level(logger.DISABLED) 35 | 36 | # Create envs. 37 | env = gym.make(env_id) 38 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) 39 | 40 | if evaluation and rank == 0: 41 | eval_env = gym.make(env_id) 42 | eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval')) 43 | env = bench.Monitor(env, None) 44 | else: 45 | eval_env = None 46 | 47 | # Parse noise_type 48 | action_noise = None 49 | param_noise = None 50 | nb_actions = env.action_space.shape[-1] 51 | for current_noise_type in noise_type.split(','): 52 | current_noise_type = current_noise_type.strip() 53 | if current_noise_type == 'none': 54 | pass 55 | elif 'adaptive-param' in current_noise_type: 56 | _, stddev = current_noise_type.split('_') 57 | param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) 58 | elif 'normal' in current_noise_type: 59 | _, stddev = current_noise_type.split('_') 60 | action_noise = NormalActionNoise(mean=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) 61 | elif 'ou' in current_noise_type: 62 | _, stddev = current_noise_type.split('_') 63 | action_noise = OrnsteinUhlenbeckActionNoise(mean=np.zeros(nb_actions), 64 | sigma=float(stddev) * np.ones(nb_actions)) 65 | else: 66 | raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) 67 | 68 | # Seed everything to make things reproducible. 69 | seed = seed + 1000000 * rank 70 | logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) 71 | tf.reset_default_graph() 72 | set_global_seeds(seed) 73 | env.seed(seed) 74 | if eval_env is not None: 75 | eval_env.seed(seed) 76 | 77 | # Disable logging for rank != 0 to avoid noise. 
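    # (Added note:) the logger level was already lowered for non-root ranks at
    # the top of this function; the block below only records the wall-clock
    # start time on rank 0 so the total runtime can be reported after training.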
78 | start_time = 0 79 | if rank == 0: 80 | start_time = time.time() 81 | 82 | if layer_norm: 83 | policy = LnMlpPolicy 84 | else: 85 | policy = MlpPolicy 86 | 87 | num_timesteps = kwargs['num_timesteps'] 88 | del kwargs['num_timesteps'] 89 | 90 | model = DDPG(policy=policy, env=env, memory_policy=Memory, eval_env=eval_env, param_noise=param_noise, 91 | action_noise=action_noise, memory_limit=int(1e6), verbose=2, **kwargs) 92 | model.learn(total_timesteps=num_timesteps) 93 | env.close() 94 | if eval_env is not None: 95 | eval_env.close() 96 | if rank == 0: 97 | logger.info('total runtime: {}s'.format(time.time() - start_time)) 98 | 99 | 100 | def parse_args(): 101 | """ 102 | parse the arguments for DDPG training 103 | 104 | :return: (dict) the arguments 105 | """ 106 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 107 | 108 | parser.add_argument('--env-id', type=str, default='HalfCheetah-v1') 109 | boolean_flag(parser, 'render-eval', default=False) 110 | boolean_flag(parser, 'layer-norm', default=True) 111 | boolean_flag(parser, 'render', default=False) 112 | boolean_flag(parser, 'normalize-returns', default=False) 113 | boolean_flag(parser, 'normalize-observations', default=True) 114 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 115 | parser.add_argument('--critic-l2-reg', type=float, default=1e-2) 116 | parser.add_argument('--batch-size', type=int, default=64) # per MPI worker 117 | parser.add_argument('--actor-lr', type=float, default=1e-4) 118 | parser.add_argument('--critic-lr', type=float, default=1e-3) 119 | boolean_flag(parser, 'enable-popart', default=False) 120 | parser.add_argument('--gamma', type=float, default=0.99) 121 | parser.add_argument('--reward-scale', type=float, default=1.) 122 | parser.add_argument('--clip-norm', type=float, default=None) 123 | parser.add_argument('--nb-train-steps', type=int, default=50) # per epoch cycle and MPI worker 124 | parser.add_argument('--nb-eval-steps', type=int, default=100) # per epoch cycle and MPI worker 125 | parser.add_argument('--nb-rollout-steps', type=int, default=100) # per epoch cycle and MPI worker 126 | # choices are adaptive-param_xx, ou_xx, normal_xx, none 127 | parser.add_argument('--noise-type', type=str, default='adaptive-param_0.2') 128 | parser.add_argument('--num-timesteps', type=int, default=int(1e6)) 129 | boolean_flag(parser, 'evaluation', default=False) 130 | args = parser.parse_args() 131 | dict_args = vars(args) 132 | return dict_args 133 | 134 | 135 | if __name__ == '__main__': 136 | args = parse_args() 137 | if MPI.COMM_WORLD.Get_rank() == 0: 138 | logger.configure() 139 | # Run actual script. 
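    # (Illustrative, assumed invocation; not from the original file:)
    #   mpirun -np 2 python -m stable_baselines.ddpg.main --env-id HalfCheetah-v1 --num-timesteps 1000000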
140 | run(**args) 141 | -------------------------------------------------------------------------------- /stable_baselines/ddpg/memory.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class RingBuffer(object): 5 | def __init__(self, maxlen, shape, dtype='float32'): 6 | """ 7 | A buffer object, when full restarts at the initial position 8 | 9 | :param maxlen: (int) the max number of numpy objects to store 10 | :param shape: (tuple) the shape of the numpy objects you want to store 11 | :param dtype: (str) the name of the type of the numpy object you want to store 12 | """ 13 | self.maxlen = maxlen 14 | self.start = 0 15 | self.length = 0 16 | self.data = np.zeros((maxlen,) + shape).astype(dtype) 17 | 18 | def __len__(self): 19 | return self.length 20 | 21 | def __getitem__(self, idx): 22 | if idx < 0 or idx >= self.length: 23 | raise KeyError() 24 | return self.data[(self.start + idx) % self.maxlen] 25 | 26 | def get_batch(self, idxs): 27 | """ 28 | get the value at the indexes 29 | 30 | :param idxs: (int or numpy int) the indexes 31 | :return: (np.ndarray) the stored information in the buffer at the asked positions 32 | """ 33 | return self.data[(self.start + idxs) % self.maxlen] 34 | 35 | def append(self, var): 36 | """ 37 | Append an object to the buffer 38 | 39 | :param var: (np.ndarray) the object you wish to add 40 | """ 41 | if self.length < self.maxlen: 42 | # We have space, simply increase the length. 43 | self.length += 1 44 | elif self.length == self.maxlen: 45 | # No space, "remove" the first item. 46 | self.start = (self.start + 1) % self.maxlen 47 | else: 48 | # This should never happen. 49 | raise RuntimeError() 50 | self.data[(self.start + self.length - 1) % self.maxlen] = var 51 | 52 | 53 | def array_min2d(arr): 54 | """ 55 | cast to np.ndarray, and make sure it is of 2 dim 56 | 57 | :param arr: ([Any]) the array to clean 58 | :return: (np.ndarray) the cleaned array 59 | """ 60 | arr = np.array(arr) 61 | if arr.ndim >= 2: 62 | return arr 63 | return arr.reshape(-1, 1) 64 | 65 | 66 | class Memory(object): 67 | def __init__(self, limit, action_shape, observation_shape): 68 | """ 69 | The replay buffer object 70 | 71 | :param limit: (int) the max number of transitions to store 72 | :param action_shape: (tuple) the action shape 73 | :param observation_shape: (tuple) the observation shape 74 | """ 75 | self.limit = limit 76 | 77 | self.observations0 = RingBuffer(limit, shape=observation_shape) 78 | self.actions = RingBuffer(limit, shape=action_shape) 79 | self.rewards = RingBuffer(limit, shape=(1,)) 80 | self.terminals1 = RingBuffer(limit, shape=(1,)) 81 | self.observations1 = RingBuffer(limit, shape=observation_shape) 82 | 83 | def sample(self, batch_size): 84 | """ 85 | sample a random batch from the buffer 86 | 87 | :param batch_size: (int) the number of element to sample for the batch 88 | :return: (dict) the sampled batch 89 | """ 90 | # Draw such that we always have a proceeding element. 
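        # (Added note:) indices are drawn from [1, nb_entries - 2], so every
        # sampled transition has at least one stored entry both before and
        # after it in the ring buffer.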
91 | batch_idxs = np.random.randint(low=1, high=self.nb_entries - 1, size=batch_size) 92 | 93 | obs0_batch = self.observations0.get_batch(batch_idxs) 94 | obs1_batch = self.observations1.get_batch(batch_idxs) 95 | action_batch = self.actions.get_batch(batch_idxs) 96 | reward_batch = self.rewards.get_batch(batch_idxs) 97 | terminal1_batch = self.terminals1.get_batch(batch_idxs) 98 | 99 | result = { 100 | 'obs0': array_min2d(obs0_batch), 101 | 'obs1': array_min2d(obs1_batch), 102 | 'rewards': array_min2d(reward_batch), 103 | 'actions': array_min2d(action_batch), 104 | 'terminals1': array_min2d(terminal1_batch), 105 | } 106 | return result 107 | 108 | def append(self, obs0, action, reward, obs1, terminal1, training=True): 109 | """ 110 | Append a transition to the buffer 111 | 112 | :param obs0: ([float] or [int]) the last observation 113 | :param action: ([float]) the action 114 | :param reward: (float] the reward 115 | :param obs1: ([float] or [int]) the current observation 116 | :param terminal1: (bool) is the episode done 117 | :param training: (bool) is the RL model training or not 118 | """ 119 | if not training: 120 | return 121 | 122 | self.observations0.append(obs0) 123 | self.actions.append(action) 124 | self.rewards.append(reward) 125 | self.observations1.append(obs1) 126 | self.terminals1.append(terminal1) 127 | 128 | @property 129 | def nb_entries(self): 130 | return len(self.observations0) 131 | -------------------------------------------------------------------------------- /stable_baselines/ddpg/noise.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class AdaptiveParamNoiseSpec(object): 5 | """ 6 | Implements adaptive parameter noise 7 | 8 | :param initial_stddev: (float) the initial value for the standard deviation of the noise 9 | :param desired_action_stddev: (float) the desired value for the standard deviation of the noise 10 | :param adoption_coefficient: (float) the update coefficient for the standard deviation of the noise 11 | """ 12 | def __init__(self, initial_stddev=0.1, desired_action_stddev=0.1, adoption_coefficient=1.01): 13 | self.initial_stddev = initial_stddev 14 | self.desired_action_stddev = desired_action_stddev 15 | self.adoption_coefficient = adoption_coefficient 16 | 17 | self.current_stddev = initial_stddev 18 | 19 | def adapt(self, distance): 20 | """ 21 | update the standard deviation for the parameter noise 22 | 23 | :param distance: (float) the noise distance applied to the parameters 24 | """ 25 | if distance > self.desired_action_stddev: 26 | # Decrease stddev. 27 | self.current_stddev /= self.adoption_coefficient 28 | else: 29 | # Increase stddev. 
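            # (Added note:) the standard deviation is adapted multiplicatively in
            # both directions, so repeated calls to adapt() drive the measured
            # action-space distance toward desired_action_stddev.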
30 | self.current_stddev *= self.adoption_coefficient 31 | 32 | def get_stats(self): 33 | """ 34 | return the standard deviation for the parameter noise 35 | 36 | :return: (dict) the stats of the noise 37 | """ 38 | return {'param_noise_stddev': self.current_stddev} 39 | 40 | def __repr__(self): 41 | fmt = 'AdaptiveParamNoiseSpec(initial_stddev={}, desired_action_stddev={}, adoption_coefficient={})' 42 | return fmt.format(self.initial_stddev, self.desired_action_stddev, self.adoption_coefficient) 43 | 44 | 45 | class ActionNoise(object): 46 | """ 47 | The action noise base class 48 | """ 49 | def reset(self): 50 | """ 51 | reset the noise at the end of an episode 52 | """ 53 | pass 54 | 55 | 56 | class NormalActionNoise(ActionNoise): 57 | """ 58 | A Gaussian action noise 59 | 60 | :param mean: (float) the mean value of the noise 61 | :param sigma: (float) the scale of the noise (std here) 62 | """ 63 | def __init__(self, mean, sigma): 64 | self._mu = mean 65 | self._sigma = sigma 66 | 67 | def __call__(self): 68 | return np.random.normal(self._mu, self._sigma) 69 | 70 | def __repr__(self): 71 | return 'NormalActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma) 72 | 73 | 74 | class OrnsteinUhlenbeckActionNoise(ActionNoise): 75 | """ 76 | An Ornstein-Uhlenbeck action noise, designed to approximate Brownian motion with friction. 77 | 78 | Based on http://math.stackexchange.com/questions/1287634/implementing-ornstein-uhlenbeck-in-matlab 79 | 80 | :param mean: (float) the mean of the noise 81 | :param sigma: (float) the scale of the noise 82 | :param theta: (float) the rate of mean reversion 83 | :param dt: (float) the timestep for the noise 84 | :param initial_noise: ([float]) the initial value for the noise output, (if None: 0) 85 | """ 86 | 87 | def __init__(self, mean, sigma, theta=.15, dt=1e-2, initial_noise=None): 88 | self._theta = theta 89 | self._mu = mean 90 | self._sigma = sigma 91 | self._dt = dt 92 | self.initial_noise = initial_noise 93 | self.noise_prev = None 94 | self.reset() 95 | 96 | def __call__(self): 97 | noise = self.noise_prev + self._theta * (self._mu - self.noise_prev) * self._dt + \ 98 | self._sigma * np.sqrt(self._dt) * np.random.normal(size=self._mu.shape) 99 | self.noise_prev = noise 100 | return noise 101 | 102 | def reset(self): 103 | """ 104 | reset the Ornstein-Uhlenbeck noise to its initial position 105 | """ 106 | self.noise_prev = self.initial_noise if self.initial_noise is not None else np.zeros_like(self._mu) 107 | 108 | def __repr__(self): 109 | return 'OrnsteinUhlenbeckActionNoise(mu={}, sigma={})'.format(self._mu, self._sigma) 110 | -------------------------------------------------------------------------------- /stable_baselines/deepq/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.deepq.policies import MlpPolicy, CnnPolicy, LnMlpPolicy, LnCnnPolicy 2 | from stable_baselines.deepq.build_graph import build_act, build_train # noqa 3 | from stable_baselines.deepq.dqn import DQN 4 | from stable_baselines.deepq.replay_buffer import ReplayBuffer, PrioritizedReplayBuffer # noqa 5 | 6 | 7 | def wrap_atari_dqn(env): 8 | """ 9 | wrap the environment in Atari wrappers for DQN 10 | 11 | :param env: (Gym Environment) the environment 12 | :return: (Gym Environment) the wrapped environment 13 | """ 14 | from stable_baselines.common.atari_wrappers import wrap_deepmind 15 | return wrap_deepmind(env, frame_stack=True, scale=False) 16 |
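A minimal usage sketch of the exports above (DQN, CnnPolicy and wrap_atari_dqn). It mirrors the settings of deepq/experiments/run_atari.py further below; the environment name and the hyperparameter values are illustrative only, not a prescribed configuration.

from stable_baselines import bench, logger
from stable_baselines.common.atari_wrappers import make_atari
from stable_baselines.deepq import DQN, CnnPolicy, wrap_atari_dqn

logger.configure()
env = make_atari("PongNoFrameskip-v4")       # raw Atari env with no-op reset / frame-skip wrappers
env = bench.Monitor(env, logger.get_dir())   # record episode rewards and lengths
env = wrap_atari_dqn(env)                    # DeepMind preprocessing + frame stacking (see above)
model = DQN(policy=CnnPolicy, env=env, learning_rate=1e-4, buffer_size=10000,
            exploration_fraction=0.1, exploration_final_eps=0.01, train_freq=4,
            learning_starts=10000, target_network_update_freq=1000, gamma=0.99)
model.learn(total_timesteps=int(1e6))
model.save("pong_dqn.pkl")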
-------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/deepq/experiments/__init__.py -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/custom_cartpole.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import argparse 3 | 4 | import gym 5 | import numpy as np 6 | import tensorflow as tf 7 | 8 | import stable_baselines.common.tf_util as tf_utils 9 | from stable_baselines import logger, deepq 10 | from stable_baselines.deepq.replay_buffer import ReplayBuffer 11 | from stable_baselines.deepq.policies import FeedForwardPolicy 12 | from stable_baselines.common.schedules import LinearSchedule 13 | 14 | 15 | class CustomPolicy(FeedForwardPolicy): 16 | def __init__(self, *args, **kwargs): 17 | super(CustomPolicy, self).__init__(*args, **kwargs, 18 | layers=[64], 19 | feature_extraction="mlp") 20 | 21 | 22 | def main(args): 23 | """ 24 | Train a DQN agent on cartpole env 25 | :param args: (Parsed Arguments) the input arguments 26 | """ 27 | with tf_utils.make_session(8) as sess: 28 | # Create the environment 29 | env = gym.make("CartPole-v0") 30 | # Create all the functions necessary to train the model 31 | act, train, update_target, _ = deepq.build_train( 32 | q_func=CustomPolicy, 33 | ob_space=env.observation_space, 34 | ac_space=env.action_space, 35 | optimizer=tf.train.AdamOptimizer(learning_rate=5e-4), 36 | sess=sess 37 | ) 38 | # Create the replay buffer 39 | replay_buffer = ReplayBuffer(50000) 40 | # Create the schedule for exploration starting from 1 (every action is random) down to 41 | # 0.02 (98% of actions are selected according to values predicted by the model). 42 | exploration = LinearSchedule(schedule_timesteps=10000, initial_p=1.0, final_p=0.02) 43 | 44 | # Initialize the parameters and copy them to the target network. 45 | tf_utils.initialize() 46 | update_target() 47 | 48 | episode_rewards = [0.0] 49 | obs = env.reset() 50 | for step in itertools.count(): 51 | # Take action and update exploration to the newest value 52 | action = act(obs[None], update_eps=exploration.value(step))[0] 53 | new_obs, rew, done, _ = env.step(action) 54 | # Store transition in the replay buffer. 55 | replay_buffer.add(obs, action, rew, new_obs, float(done)) 56 | obs = new_obs 57 | 58 | episode_rewards[-1] += rew 59 | if done: 60 | obs = env.reset() 61 | episode_rewards.append(0) 62 | 63 | if len(episode_rewards[-101:-1]) == 0: 64 | mean_100ep_reward = -np.inf 65 | else: 66 | mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1) 67 | 68 | is_solved = step > 100 and mean_100ep_reward >= 200 69 | 70 | if args.no_render and step > args.max_timesteps: 71 | break 72 | 73 | if is_solved: 74 | if args.no_render: 75 | break 76 | # Show off the result 77 | env.render() 78 | else: 79 | # Minimize the error in Bellman's equation on a batch sampled from replay buffer. 80 | if step > 1000: 81 | obses_t, actions, rewards, obses_tp1, dones = replay_buffer.sample(32) 82 | train(obses_t, actions, rewards, obses_tp1, dones, np.ones_like(rewards)) 83 | # Update target network periodically. 
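# update_target() (built by deepq.build_train above) copies the online Q-network weights
# into the target network; here it is triggered every 1000 environment steps.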
84 | if step % 1000 == 0: 85 | update_target() 86 | 87 | if done and len(episode_rewards) % 10 == 0: 88 | logger.record_tabular("steps", step) 89 | logger.record_tabular("episodes", len(episode_rewards)) 90 | logger.record_tabular("mean episode reward", mean_100ep_reward) 91 | logger.record_tabular("% time spent exploring", int(100 * exploration.value(step))) 92 | logger.dump_tabular() 93 | 94 | 95 | if __name__ == '__main__': 96 | parser = argparse.ArgumentParser(description="Train DQN on cartpole using a custom mlp") 97 | parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") 98 | parser.add_argument('--max-timesteps', default=50000, type=int, 99 | help="Maximum number of timesteps when not rendering") 100 | args = parser.parse_args() 101 | main(args) 102 | -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/enjoy_cartpole.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import gym 4 | 5 | from stable_baselines.deepq import DQN 6 | 7 | 8 | def main(args): 9 | """ 10 | Run a trained model for the cartpole problem 11 | 12 | :param args: (ArgumentParser) the input arguments 13 | """ 14 | env = gym.make("CartPole-v0") 15 | model = DQN.load("cartpole_model.pkl", env) 16 | 17 | while True: 18 | obs, done = env.reset(), False 19 | episode_rew = 0 20 | while not done: 21 | if not args.no_render: 22 | env.render() 23 | action, _ = model.predict(obs) 24 | obs, rew, done, _ = env.step(action) 25 | episode_rew += rew 26 | print("Episode reward", episode_rew) 27 | # No render is only used for automatic testing 28 | if args.no_render: 29 | break 30 | 31 | 32 | if __name__ == '__main__': 33 | parser = argparse.ArgumentParser(description="Enjoy trained DQN on cartpole") 34 | parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") 35 | args = parser.parse_args() 36 | main(args) 37 | -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/enjoy_mountaincar.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import gym 4 | import numpy as np 5 | 6 | from stable_baselines.deepq import DQN 7 | 8 | 9 | def main(args): 10 | """ 11 | Run a trained model for the mountain car problem 12 | 13 | :param args: (ArgumentParser) the input arguments 14 | """ 15 | env = gym.make("MountainCar-v0") 16 | model = DQN.load("mountaincar_model.pkl", env) 17 | 18 | while True: 19 | obs, done = env.reset(), False 20 | episode_rew = 0 21 | while not done: 22 | if not args.no_render: 23 | env.render() 24 | # Epsilon-greedy 25 | if np.random.random() < 0.02: 26 | action = env.action_space.sample() 27 | else: 28 | action, _ = model.predict(obs, deterministic=True) 29 | obs, rew, done, _ = env.step(action) 30 | episode_rew += rew 31 | print("Episode reward", episode_rew) 32 | # No render is only used for automatic testing 33 | if args.no_render: 34 | break 35 | 36 | 37 | if __name__ == '__main__': 38 | parser = argparse.ArgumentParser(description="Enjoy trained DQN on MountainCar") 39 | parser.add_argument('--no-render', default=False, action="store_true", help="Disable rendering") 40 | args = parser.parse_args() 41 | main(args) 42 | -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/enjoy_pong.py: 
-------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from stable_baselines import deepq 4 | from stable_baselines.deepq import DQN 5 | 6 | 7 | def main(): 8 | """ 9 | Run a trained model for the pong problem 10 | """ 11 | env = gym.make("PongNoFrameskip-v4") 12 | env = deepq.wrap_atari_dqn(env) 13 | model = DQN.load("pong_model.pkl", env) 14 | 15 | while True: 16 | obs, done = env.reset(), False 17 | episode_rew = 0 18 | while not done: 19 | env.render() 20 | action, _ = model.predict(obs) 21 | obs, rew, done, _ = env.step(action) 22 | episode_rew += rew 23 | print("Episode reward", episode_rew) 24 | 25 | 26 | if __name__ == '__main__': 27 | main() 28 | -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/run_atari.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from functools import partial 3 | 4 | from stable_baselines import bench, logger 5 | from stable_baselines.common import set_global_seeds 6 | from stable_baselines.common.atari_wrappers import make_atari 7 | from stable_baselines.deepq import DQN, wrap_atari_dqn, CnnPolicy 8 | 9 | 10 | def main(): 11 | """ 12 | Run the atari test 13 | """ 14 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 15 | parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4') 16 | parser.add_argument('--seed', help='RNG seed', type=int, default=0) 17 | parser.add_argument('--prioritized', type=int, default=1) 18 | parser.add_argument('--dueling', type=int, default=1) 19 | parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6) 20 | parser.add_argument('--num-timesteps', type=int, default=int(10e6)) 21 | parser.add_argument('--checkpoint-freq', type=int, default=10000) 22 | parser.add_argument('--checkpoint-path', type=str, default=None) 23 | 24 | args = parser.parse_args() 25 | logger.configure() 26 | set_global_seeds(args.seed) 27 | env = make_atari(args.env) 28 | env = bench.Monitor(env, logger.get_dir()) 29 | env = wrap_atari_dqn(env) 30 | policy = partial(CnnPolicy, dueling=args.dueling == 1) 31 | 32 | model = DQN( 33 | env=env, 34 | policy=policy, 35 | learning_rate=1e-4, 36 | buffer_size=10000, 37 | exploration_fraction=0.1, 38 | exploration_final_eps=0.01, 39 | train_freq=4, 40 | learning_starts=10000, 41 | target_network_update_freq=1000, 42 | gamma=0.99, 43 | prioritized_replay=bool(args.prioritized), 44 | prioritized_replay_alpha=args.prioritized_replay_alpha, 45 | checkpoint_freq=args.checkpoint_freq, 46 | checkpoint_path=args.checkpoint_path, 47 | ) 48 | model.learn(total_timesteps=args.num_timesteps) 49 | 50 | env.close() 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/train_cartpole.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import gym 4 | import numpy as np 5 | 6 | from stable_baselines.deepq import DQN, MlpPolicy 7 | 8 | 9 | def callback(lcl, _glb): 10 | """ 11 | The callback function for logging and saving 12 | 13 | :param lcl: (dict) the local variables 14 | :param _glb: (dict) the global variables 15 | :return: (bool) is solved 16 | """ 17 | # stop training if reward exceeds 199 18 | if len(lcl['episode_rewards'][-101:-1]) == 0: 19 | mean_100ep_reward = -np.inf 20 | else: 21 | 
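# The slice [-101:-1] averages the last 100 completed episodes while ignoring
# the episode that is still in progress.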
mean_100ep_reward = round(float(np.mean(lcl['episode_rewards'][-101:-1])), 1) 22 | is_solved = lcl['step'] > 100 and mean_100ep_reward >= 199 23 | return is_solved 24 | 25 | 26 | def main(args): 27 | """ 28 | Train and save the DQN model, for the cartpole problem 29 | 30 | :param args: (ArgumentParser) the input arguments 31 | """ 32 | env = gym.make("CartPole-v0") 33 | model = DQN( 34 | env=env, 35 | policy=MlpPolicy, 36 | learning_rate=1e-3, 37 | buffer_size=50000, 38 | exploration_fraction=0.1, 39 | exploration_final_eps=0.02, 40 | ) 41 | model.learn(total_timesteps=args.max_timesteps, callback=callback) 42 | 43 | print("Saving model to cartpole_model.pkl") 44 | model.save("cartpole_model.pkl") 45 | 46 | 47 | if __name__ == '__main__': 48 | parser = argparse.ArgumentParser(description="Train DQN on cartpole") 49 | parser.add_argument('--max-timesteps', default=100000, type=int, help="Maximum number of timesteps") 50 | args = parser.parse_args() 51 | main(args) 52 | -------------------------------------------------------------------------------- /stable_baselines/deepq/experiments/train_mountaincar.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import gym 4 | 5 | from stable_baselines.deepq import DQN 6 | from stable_baselines.deepq.policies import FeedForwardPolicy 7 | 8 | 9 | class CustomPolicy(FeedForwardPolicy): 10 | def __init__(self, *args, **kwargs): 11 | super(CustomPolicy, self).__init__(*args, **kwargs, 12 | layers=[64], 13 | layer_norm=True, 14 | feature_extraction="mlp") 15 | 16 | 17 | def main(args): 18 | """ 19 | Train and save the DQN model, for the mountain car problem 20 | 21 | :param args: (ArgumentParser) the input arguments 22 | """ 23 | env = gym.make("MountainCar-v0") 24 | 25 | # using layer norm policy here is important for parameter space noise! 26 | model = DQN( 27 | policy=CustomPolicy, 28 | env=env, 29 | learning_rate=1e-3, 30 | buffer_size=50000, 31 | exploration_fraction=0.1, 32 | exploration_final_eps=0.1, 33 | param_noise=True 34 | ) 35 | model.learn(total_timesteps=args.max_timesteps) 36 | 37 | print("Saving model to mountaincar_model.pkl") 38 | model.save("mountaincar_model.pkl") 39 | 40 | 41 | if __name__ == '__main__': 42 | parser = argparse.ArgumentParser(description="Train DQN on cartpole") 43 | parser.add_argument('--max-timesteps', default=100000, type=int, help="Maximum number of timesteps") 44 | args = parser.parse_args() 45 | main(args) 46 | -------------------------------------------------------------------------------- /stable_baselines/deepq/utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from stable_baselines.common.input import observation_input 4 | 5 | # ================================================================ 6 | # Placeholders 7 | # ================================================================ 8 | 9 | 10 | class TfInput(object): 11 | def __init__(self, name="(unnamed)"): 12 | """ 13 | Generalized Tensorflow placeholder. The main differences are: 14 | - possibly uses multiple placeholders internally and returns multiple values 15 | - can apply light postprocessing to the value feed to placeholder. 16 | 17 | :param name: (str) the input name 18 | """ 19 | self.name = name 20 | 21 | def get(self): 22 | """ 23 | Return the tf variable(s) representing the possibly postprocessed value 24 | of placeholder(s). 
25 | 26 | :return: (TensorFlow Tensor) the placeholder 27 | """ 28 | raise NotImplementedError 29 | 30 | def make_feed_dict(self, data): 31 | """ 32 | Given data input it to the placeholder(s). 33 | 34 | :return: (dict) the given data input 35 | """ 36 | raise NotImplementedError 37 | 38 | 39 | class PlaceholderTfInput(TfInput): 40 | def __init__(self, placeholder): 41 | """ 42 | Wrapper for regular tensorflow placeholder. 43 | 44 | :param placeholder: (TensorFlow Tensor) 45 | """ 46 | super().__init__(placeholder.name) 47 | self._placeholder = placeholder 48 | 49 | def get(self): 50 | return self._placeholder 51 | 52 | def make_feed_dict(self, data): 53 | return {self._placeholder: data} 54 | 55 | 56 | class Uint8Input(PlaceholderTfInput): 57 | def __init__(self, shape, name=None): 58 | """ 59 | Takes input in uint8 format which is cast to float32 and divided by 255 60 | before passing it to the model. 61 | 62 | On GPU this ensures lower data transfer times. 63 | 64 | :param shape: ([int]) shape of the tensor. 65 | :param name: (str) name of the underlying placeholder 66 | """ 67 | 68 | super().__init__(tf.placeholder(tf.uint8, [None] + list(shape), name=name)) 69 | self._shape = shape 70 | self._output = tf.cast(super().get(), tf.float32) / 255.0 71 | 72 | def get(self): 73 | return self._output 74 | 75 | 76 | class ObservationInput(PlaceholderTfInput): 77 | def __init__(self, observation_space, name=None): 78 | """ 79 | Creates an input placeholder tailored to a specific observation space 80 | 81 | :param observation_space: (Gym Space) observation space of the environment. Should be one of the gym.spaces 82 | types 83 | :param name: (str) tensorflow name of the underlying placeholder 84 | """ 85 | is_image = len(observation_space.shape) == 3 86 | inpt, self.processed_inpt = observation_input(observation_space, name=name, scale=is_image) 87 | super().__init__(inpt) 88 | 89 | def get(self): 90 | return self.processed_inpt 91 | -------------------------------------------------------------------------------- /stable_baselines/gail/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.gail.model import GAIL 2 | -------------------------------------------------------------------------------- /stable_baselines/gail/dataset/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/dataset/__init__.py -------------------------------------------------------------------------------- /stable_baselines/gail/dataset/mujocodset.py: -------------------------------------------------------------------------------- 1 | """ 2 | Data structure of the input .npz: 3 | the data is save in python dictionary format with keys: 'acs', 'ep_rets', 'rews', 'obs' 4 | the values of each item is a list storing the expert trajectory sequentially 5 | a transition can be: (data['obs'][t], data['acs'][t], data['obs'][t+1]) and get reward data['rews'][t] 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | 11 | from stable_baselines import logger 12 | 13 | 14 | class Dset(object): 15 | def __init__(self, inputs, labels, randomize): 16 | """ 17 | Dataset object 18 | 19 | :param inputs: (np.ndarray) the input values 20 | :param labels: (np.ndarray) the target values 21 | :param randomize: (bool) if the dataset should be shuffled 22 | """ 23 | self.inputs = inputs 24 | self.labels = 
labels 25 | assert len(self.inputs) == len(self.labels) 26 | self.randomize = randomize 27 | self.num_pairs = len(inputs) 28 | self.init_pointer() 29 | 30 | def init_pointer(self): 31 | """ 32 | initialize the pointer and shuffle the dataset, if randomize the dataset 33 | """ 34 | self.pointer = 0 35 | if self.randomize: 36 | idx = np.arange(self.num_pairs) 37 | np.random.shuffle(idx) 38 | self.inputs = self.inputs[idx, :] 39 | self.labels = self.labels[idx, :] 40 | 41 | def get_next_batch(self, batch_size): 42 | """ 43 | get the batch from the dataset 44 | 45 | :param batch_size: (int) the size of the batch from the dataset 46 | :return: (np.ndarray, np.ndarray) inputs and labels 47 | """ 48 | # if batch_size is negative -> return all 49 | if batch_size < 0: 50 | return self.inputs, self.labels 51 | if self.pointer + batch_size >= self.num_pairs: 52 | self.init_pointer() 53 | end = self.pointer + batch_size 54 | inputs = self.inputs[self.pointer:end, :] 55 | labels = self.labels[self.pointer:end, :] 56 | self.pointer = end 57 | return inputs, labels 58 | 59 | 60 | class MujocoDset(object): 61 | def __init__(self, expert_path, train_fraction=0.7, traj_limitation=-1, randomize=True): 62 | """ 63 | Dataset for mujoco 64 | 65 | :param expert_path: (str) the path to trajectory data 66 | :param train_fraction: (float) the train val split (0 to 1) 67 | :param traj_limitation: (int) the dims to load (if -1, load all) 68 | :param randomize: (bool) if the dataset should be shuffled 69 | """ 70 | traj_data = np.load(expert_path) 71 | if traj_limitation < 0: 72 | traj_limitation = len(traj_data['obs']) 73 | obs = traj_data['obs'][:traj_limitation] 74 | acs = traj_data['acs'][:traj_limitation] 75 | 76 | # obs, acs: shape (N, L, ) + S where N = # episodes, L = episode length 77 | # and S is the environment observation/action space. 
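# For example, N = 10 episodes of L = 1024 steps with 11-dimensional observations give
# obs of shape (10, 1024, 11), which the reshape below flattens to (10240, 11).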
78 | # Flatten to (N * L, prod(S)) 79 | self.obs = np.reshape(obs, [-1, np.prod(obs.shape[2:])]) 80 | self.acs = np.reshape(acs, [-1, np.prod(acs.shape[2:])]) 81 | 82 | self.rets = traj_data['ep_rets'][:traj_limitation] 83 | self.avg_ret = sum(self.rets)/len(self.rets) 84 | self.std_ret = np.std(np.array(self.rets)) 85 | if len(self.acs) > 2: 86 | self.acs = np.squeeze(self.acs) 87 | assert len(self.obs) == len(self.acs) 88 | self.num_traj = min(traj_limitation, len(traj_data['obs'])) 89 | self.num_transition = len(self.obs) 90 | self.randomize = randomize 91 | self.dset = Dset(self.obs, self.acs, self.randomize) 92 | # for behavior cloning 93 | self.train_set = Dset(self.obs[:int(self.num_transition*train_fraction), :], 94 | self.acs[:int(self.num_transition*train_fraction), :], 95 | self.randomize) 96 | self.val_set = Dset(self.obs[int(self.num_transition*train_fraction):, :], 97 | self.acs[int(self.num_transition*train_fraction):, :], 98 | self.randomize) 99 | self.log_info() 100 | 101 | def log_info(self): 102 | """ 103 | log the information of the dataset 104 | """ 105 | logger.log("Total trajectorues: %d" % self.num_traj) 106 | logger.log("Total transitions: %d" % self.num_transition) 107 | logger.log("Average returns: %f" % self.avg_ret) 108 | logger.log("Std for returns: %f" % self.std_ret) 109 | 110 | def get_next_batch(self, batch_size, split=None): 111 | """ 112 | get the batch from the dataset 113 | 114 | :param batch_size: (int) the size of the batch from the dataset 115 | :param split: (str) the type of data split (can be None, 'train', 'val') 116 | :return: (np.ndarray, np.ndarray) inputs and labels 117 | """ 118 | if split is None: 119 | return self.dset.get_next_batch(batch_size) 120 | elif split == 'train': 121 | return self.train_set.get_next_batch(batch_size) 122 | elif split == 'val': 123 | return self.val_set.get_next_batch(batch_size) 124 | else: 125 | raise NotImplementedError 126 | 127 | def plot(self): 128 | """ 129 | show and save (to 'histogram_rets.png') a histogram plotting of the episode returns 130 | """ 131 | plt.hist(self.rets) 132 | plt.savefig("histogram_rets.png") 133 | plt.close() 134 | 135 | 136 | def test(expert_path, traj_limitation, plot): 137 | """ 138 | test mujoco dataset object 139 | 140 | :param expert_path: (str) the path to trajectory data 141 | :param traj_limitation: (int) the dims to load (if -1, load all) 142 | :param plot: (bool) enable plotting 143 | """ 144 | dset = MujocoDset(expert_path, traj_limitation=traj_limitation) 145 | if plot: 146 | dset.plot() 147 | 148 | 149 | if __name__ == '__main__': 150 | import argparse 151 | parser = argparse.ArgumentParser() 152 | parser.add_argument("--expert_path", type=str, default="../data/deterministic.trpo.Hopper.0.00.npz") 153 | parser.add_argument("--traj_limitation", type=int, default=None) 154 | parser.add_argument("--plot", type=bool, default=False) 155 | args = parser.parse_args() 156 | test(args.expert_path, args.traj_limitation, args.plot) 157 | -------------------------------------------------------------------------------- /stable_baselines/gail/mlp_policy.py: -------------------------------------------------------------------------------- 1 | """ 2 | from stable_baselines/ppo1/mlp_policy.py and add simple modification 3 | (1) add reuse argument 4 | (2) cache the `stochastic` placeholder 5 | """ 6 | import gym 7 | import tensorflow as tf 8 | 9 | import stable_baselines.common.tf_util as tf_util 10 | from stable_baselines.acktr.utils import dense 11 | from 
stable_baselines.common.mpi_running_mean_std import RunningMeanStd 12 | from stable_baselines.ppo1.mlp_policy import BasePolicy 13 | 14 | 15 | class MlpPolicy(BasePolicy): 16 | recurrent = False 17 | 18 | def __init__(self, name, *args, sess=None, reuse=False, placeholders=None, **kwargs): 19 | """ 20 | MLP policy for Gail 21 | 22 | :param name: (str) the variable scope name 23 | :param ob_space: (Gym Space) The observation space of the environment 24 | :param ac_space: (Gym Space) The action space of the environment 25 | :param hid_size: (int) the size of the hidden layers 26 | :param num_hid_layers: (int) the number of hidden layers 27 | :param sess: (TensorFlow session) The current TensorFlow session containing the variables. 28 | :param reuse: (bool) allow resue of the graph 29 | :param placeholders: (dict) To feed existing placeholders if needed 30 | :param gaussian_fixed_var: (bool) fix the gaussian variance 31 | """ 32 | super(MlpPolicy, self).__init__(placeholders=placeholders) 33 | self.sess = sess 34 | with tf.variable_scope(name): 35 | if reuse: 36 | tf.get_variable_scope().reuse_variables() 37 | self._init(*args, **kwargs) 38 | self.scope = tf.get_variable_scope().name 39 | 40 | def _init(self, ob_space, ac_space, hid_size, num_hid_layers, gaussian_fixed_var=True): 41 | 42 | obs, pdtype = self.get_obs_and_pdtype(ob_space, ac_space) 43 | 44 | with tf.variable_scope("obfilter"): 45 | self.ob_rms = RunningMeanStd(shape=ob_space.shape) 46 | 47 | obz = tf.clip_by_value((obs - self.ob_rms.mean) / self.ob_rms.std, -5.0, 5.0) 48 | last_out = obz 49 | for i in range(num_hid_layers): 50 | last_out = tf.nn.tanh(dense(last_out, hid_size, "vffc%i" % (i+1), 51 | weight_init=tf_util.normc_initializer(1.0))) 52 | self.vpred = dense(last_out, 1, "vffinal", weight_init=tf_util.normc_initializer(1.0))[:, 0] 53 | 54 | last_out = obz 55 | for i in range(num_hid_layers): 56 | last_out = tf.nn.tanh(dense(last_out, hid_size, "polfc%i" % (i+1), 57 | weight_init=tf_util.normc_initializer(1.0))) 58 | 59 | if gaussian_fixed_var and isinstance(ac_space, gym.spaces.Box): 60 | mean = dense(last_out, pdtype.param_shape()[0] // 2, "polfinal", tf_util.normc_initializer(0.01)) 61 | logstd = tf.get_variable(name="logstd", shape=[1, pdtype.param_shape()[0]//2], 62 | initializer=tf.zeros_initializer()) 63 | pdparam = tf.concat([mean, mean * 0.0 + logstd], axis=1) 64 | else: 65 | pdparam = dense(last_out, pdtype.param_shape()[0], "polfinal", tf_util.normc_initializer(0.01)) 66 | 67 | self.proba_distribution = pdtype.proba_distribution_from_flat(pdparam) 68 | self.state_in = [] 69 | self.state_out = [] 70 | 71 | # change for BC 72 | self.stochastic_ph = tf.placeholder(dtype=tf.bool, shape=(), name="stochastic") 73 | action = tf_util.switch(self.stochastic_ph, self.proba_distribution.sample(), self.proba_distribution.mode()) 74 | self.action = action 75 | self._act = tf_util.function([self.stochastic_ph, obs], [action, self.vpred]) 76 | -------------------------------------------------------------------------------- /stable_baselines/gail/model.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | from stable_baselines.common import ActorCriticRLModel 4 | from stable_baselines.common.policies import ActorCriticPolicy 5 | from stable_baselines.trpo_mpi import TRPO 6 | 7 | 8 | class GAIL(ActorCriticRLModel): 9 | """ 10 | Generative Adversarial Imitation Learning (GAIL) 11 | 12 | :param policy: (ActorCriticPolicy or str) The policy model to use (MlpPolicy, CnnPolicy, 
CnnLstmPolicy, ...) 13 | :param env: (Gym environment or str) The environment to learn from (if registered in Gym, can be str) 14 | :param gamma: (float) the discount value 15 | :param timesteps_per_batch: (int) the number of timesteps to run per batch (horizon) 16 | :param max_kl: (float) the Kullback-Leibler loss threshold 17 | :param cg_iters: (int) the number of iterations for the conjugate gradient calculation 18 | :param lam: (float) GAE factor 19 | :param entcoeff: (float) the weight for the entropy loss 20 | :param cg_damping: (float) the conjugate gradient damping factor 21 | :param vf_stepsize: (float) the value function stepsize 22 | :param vf_iters: (int) the number of iterations for learning the value function 23 | :param pretrained_weight: (str) the save location for the pretrained weights 24 | :param hidden_size: ([int]) the hidden dimension for the MLP 25 | :param expert_dataset: (Dset) the dataset manager 26 | :param save_per_iter: (int) the number of iterations before saving 27 | :param checkpoint_dir: (str) the location for saving checkpoints 28 | :param g_step: (int) number of steps to train the policy in each epoch 29 | :param d_step: (int) number of steps to train the discriminator in each epoch 30 | :param task_name: (str) the name of the task (can be None) 31 | :param d_stepsize: (float) the reward giver (discriminator) stepsize 32 | :param verbose: (int) the verbosity level: 0 none, 1 training information, 2 tensorflow debug 33 | :param _init_setup_model: (bool) Whether or not to build the network at the creation of the instance 34 | """ 35 | 36 | def __init__(self, policy, env, pretrained_weight=False, hidden_size_adversary=100, adversary_entcoeff=1e-3, 37 | expert_dataset=None, save_per_iter=1, checkpoint_dir="/tmp/gail/ckpt/", g_step=1, d_step=1, 38 | task_name="task_name", d_stepsize=3e-4, verbose=0, _init_setup_model=True, **kwargs): 39 | super().__init__(policy=policy, env=env, verbose=verbose, requires_vec_env=False, 40 | _init_setup_model=_init_setup_model) 41 | 42 | self.trpo = TRPO(policy, env, verbose=verbose, _init_setup_model=False, **kwargs) 43 | self.trpo.using_gail = True 44 | self.trpo.pretrained_weight = pretrained_weight 45 | self.trpo.expert_dataset = expert_dataset 46 | self.trpo.save_per_iter = save_per_iter 47 | self.trpo.checkpoint_dir = checkpoint_dir 48 | self.trpo.g_step = g_step 49 | self.trpo.d_step = d_step 50 | self.trpo.task_name = task_name 51 | self.trpo.d_stepsize = d_stepsize 52 | self.trpo.hidden_size_adversary = hidden_size_adversary 53 | self.trpo.adversary_entcoeff = adversary_entcoeff 54 | 55 | if _init_setup_model: 56 | self.setup_model() 57 | 58 | def set_env(self, env): 59 | super().set_env(env) 60 | self.trpo.set_env(env) 61 | 62 | def setup_model(self): 63 | assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the GAIL model must be an " \ 64 | "instance of common.policies.ActorCriticPolicy." 65 | assert isinstance(self.action_space, gym.spaces.Box), "Error: GAIL requires a continuous action space."
66 | 67 | self.trpo.setup_model() 68 | 69 | def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="GAIL"): 70 | self.trpo.learn(total_timesteps, callback, seed, log_interval, tb_log_name) 71 | return self 72 | 73 | def predict(self, observation, state=None, mask=None, deterministic=False): 74 | return self.trpo.predict(observation, state, mask, deterministic=deterministic) 75 | 76 | def action_probability(self, observation, state=None, mask=None): 77 | return self.trpo.action_probability(observation, state, mask) 78 | 79 | def save(self, save_path): 80 | self.trpo.save(save_path) 81 | 82 | @classmethod 83 | def load(cls, load_path, env=None, **kwargs): 84 | data, params = cls._load_from_file(load_path) 85 | 86 | model = cls(policy=data["policy"], env=None, _init_setup_model=False) 87 | model.trpo.__dict__.update(data) 88 | model.trpo.__dict__.update(kwargs) 89 | model.set_env(env) 90 | model.setup_model() 91 | 92 | restores = [] 93 | for param, loaded_p in zip(model.trpo.params, params): 94 | restores.append(param.assign(loaded_p)) 95 | model.trpo.sess.run(restores) 96 | 97 | return model 98 | -------------------------------------------------------------------------------- /stable_baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HalfCheetah-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HalfCheetah-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HalfCheetah-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HalfCheetah-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Hopper-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Hopper-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Hopper-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Hopper-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Hopper-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Hopper-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Hopper-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Hopper-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Humanoid-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Humanoid-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Humanoid-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Humanoid-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Humanoid-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Humanoid-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HumanoidStandup-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HumanoidStandup-normalized-stochastic-scores.png 
-------------------------------------------------------------------------------- /stable_baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HumanoidStandup-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/HumanoidStandup-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Walker2d-normalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Walker2d-normalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Walker2d-normalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Walker2d-normalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Walker2d-unnormalized-deterministic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/Walker2d-unnormalized-stochastic-scores.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/gail-result.md: -------------------------------------------------------------------------------- 1 | # Results of GAIL/BC on Mujoco 2 | 3 | Here's the extensive experimental results of applying GAIL/BC on Mujoco environments, including 4 | Hopper-v1, Walker2d-v1, HalfCheetah-v1, Humanoid-v1, HumanoidStandup-v1. Every imitator is evaluated with seed to be 0. 5 | 6 | ## Results 7 | 8 | ### Training through iterations 9 | 10 | - Hoppers-v1 11 | 12 | 13 | - HalfCheetah-v1 14 | 15 | 16 | - Walker2d-v1 17 | 18 | 19 | - Humanoid-v1 20 | 21 | 22 | - HumanoidStandup-v1 23 | 24 | 25 | For details (e.g., adversarial loss, discriminator accuracy, etc.) 
about GAIL training, please see [here](https://drive.google.com/drive/folders/1nnU8dqAV9i37-_5_vWIspyFUJFQLCsDD?usp=sharing) 26 | 27 | ### Deterministic Policy (Set std=0) 28 | | | Un-normalized | Normalized | 29 | |---|---|---| 30 | | Hopper-v1 | | | 31 | | HalfCheetah-v1 | | | 32 | | Walker2d-v1 | | | 33 | | Humanoid-v1 | | | 34 | | HumanoidStandup-v1 | | | 35 | 36 | ### Stochastic Policy 37 | | | Un-normalized | Normalized | 38 | |---|---|---| 39 | | Hopper-v1 | | | 40 | | HalfCheetah-v1 | | | 41 | | Walker2d-v1 | | | 42 | | Humanoid-v1 | | | 43 | | HumanoidStandup-v1 | | | 44 | 45 | ### Details about the GAIL imitator 46 | 47 | For all environments, the 48 | imitator is trained with 1, 5, 10, 50 trajectories, where each trajectory contains at most 49 | 1024 transitions, and seeds 0, 1, 2, 3, respectively. 50 | 51 | ### Details about the BC imitators 52 | 53 | All BC imitators are trained with seed 0. 54 | -------------------------------------------------------------------------------- /stable_baselines/gail/result/halfcheetah-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/halfcheetah-training.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/hopper-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/hopper-training.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/humanoid-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/humanoid-training.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/humanoidstandup-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/humanoidstandup-training.png -------------------------------------------------------------------------------- /stable_baselines/gail/result/walker2d-training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/gail/result/walker2d-training.png -------------------------------------------------------------------------------- /stable_baselines/gail/statistics.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code is largely based on https://github.com/carpedm20/deep-rl-tensorflow/blob/master/agents/statistic.py 3 | """ 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | import stable_baselines.common.tf_util as tf_util 9 | 10 | 11 | class Stats: 12 | 13 | def __init__(self, scalar_keys=None, histogram_keys=None): 14 | """ 15 | initialize the placeholders from the input keys, for summary logging 16 | 17 | :param scalar_keys: ([str]) the name of all the scalar inputs 18 | :param histogram_keys: ([str]) the name of all the histogram inputs 19 | """ 20 | if
scalar_keys is None: 21 | scalar_keys = [] 22 | if histogram_keys is None: 23 | histogram_keys = [] 24 | self.scalar_keys = scalar_keys 25 | self.histogram_keys = histogram_keys 26 | self.scalar_summaries = [] 27 | self.scalar_summaries_ph = [] 28 | self.histogram_summaries_ph = [] 29 | self.histogram_summaries = [] 30 | with tf.variable_scope('summary'): 31 | for key in scalar_keys: 32 | place_holder = tf.placeholder('float32', None, name=key + '.scalar.summary') 33 | string_summary = tf.summary.scalar(key + '.scalar.summary', place_holder) 34 | self.scalar_summaries_ph.append(place_holder) 35 | self.scalar_summaries.append(string_summary) 36 | for key in histogram_keys: 37 | place_holder = tf.placeholder('float32', None, name=key + '.histogram.summary') 38 | string_summary = tf.summary.scalar(key + '.histogram.summary', place_holder) 39 | self.histogram_summaries_ph.append(place_holder) 40 | self.histogram_summaries.append(string_summary) 41 | 42 | self.summaries = tf.summary.merge(self.scalar_summaries + self.histogram_summaries) 43 | 44 | def add_all_summary(self, writer, values, _iter): 45 | """ 46 | Note that the order of the incoming ```values``` should be the same as the that of the 47 | ```scalar_keys``` given in ```__init__``` 48 | 49 | :param writer: (TensorFlow FileWriter) the writer 50 | :param values: (TensorFlow Tensor or np.ndarray) the input for the summary run 51 | :param _iter: (Number) the global step value 52 | """ 53 | if np.sum(np.isnan(values) + 0) != 0: 54 | return 55 | sess = tf_util.get_session() 56 | keys = self.scalar_summaries_ph + self.histogram_summaries_ph 57 | feed_dict = {} 58 | for key, value in zip(keys, values): 59 | feed_dict.update({key: value}) 60 | summaries_str = sess.run(self.summaries, feed_dict) 61 | writer.add_summary(summaries_str, _iter) 62 | -------------------------------------------------------------------------------- /stable_baselines/her/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.her.her import HER 2 | -------------------------------------------------------------------------------- /stable_baselines/her/actor_critic.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from stable_baselines.her.util import mlp 4 | 5 | 6 | class ActorCritic: 7 | def __init__(self, inputs_tf, dim_obs, dim_goal, dim_action, 8 | max_u, o_stats, g_stats, hidden, layers, **kwargs): 9 | """The actor-critic network and related training code. 
10 | 11 | :param inputs_tf: ({str: TensorFlow Tensor}) all necessary inputs for the network: the 12 | observation (o), the goal (g), and the action (u) 13 | :param dim_obs: (int) the dimension of the observations 14 | :param dim_goal: (int) the dimension of the goals 15 | :param dim_action: (int) the dimension of the actions 16 | :param max_u: (float) the maximum magnitude of actions; action outputs will be scaled accordingly 17 | :param o_stats: (stable_baselines.her.Normalizer) the normalizer for observations 18 | :param g_stats: (stable_baselines.her.Normalizer) the normalizer for goals 19 | :param hidden: (int) the number of hidden units in each hidden layer 20 | :param layers: (int) the number of hidden layers 21 | """ 22 | self.inputs_tf = inputs_tf 23 | self.dim_obs = dim_obs 24 | self.dim_goal = dim_goal 25 | self.dim_action = dim_action 26 | self.max_u = max_u 27 | self.o_stats = o_stats 28 | self.g_stats = g_stats 29 | self.hidden = hidden 30 | self.layers = layers 31 | 32 | self.o_tf = inputs_tf['o'] 33 | self.g_tf = inputs_tf['g'] 34 | self.u_tf = inputs_tf['u'] 35 | 36 | # Prepare inputs for actor and critic. 37 | obs = self.o_stats.normalize(self.o_tf) 38 | goals = self.g_stats.normalize(self.g_tf) 39 | input_pi = tf.concat(axis=1, values=[obs, goals]) # for actor 40 | 41 | # Networks. 42 | with tf.variable_scope('pi'): 43 | self.pi_tf = self.max_u * tf.tanh(mlp( 44 | input_pi, [self.hidden] * self.layers + [self.dim_action])) 45 | with tf.variable_scope('Q'): 46 | # for policy training 47 | input_q = tf.concat(axis=1, values=[obs, goals, self.pi_tf / self.max_u]) 48 | self.q_pi_tf = mlp(input_q, [self.hidden] * self.layers + [1]) 49 | # for critic training 50 | input_q = tf.concat(axis=1, values=[obs, goals, self.u_tf / self.max_u]) 51 | self._input_q = input_q # exposed for tests 52 | self.q_tf = mlp(input_q, [self.hidden] * self.layers + [1], reuse=True) 53 | -------------------------------------------------------------------------------- /stable_baselines/her/experiment/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wonseokjung/ai_supermario/9c4757d6167b9ee4f2a09a3ff353920e80ea030f/stable_baselines/her/experiment/__init__.py -------------------------------------------------------------------------------- /stable_baselines/her/experiment/play.py: -------------------------------------------------------------------------------- 1 | import click 2 | import pickle 3 | 4 | import numpy as np 5 | 6 | from stable_baselines import logger 7 | from stable_baselines.common import set_global_seeds 8 | import stable_baselines.her.experiment.config as config 9 | from stable_baselines.her.rollout import RolloutWorker 10 | 11 | 12 | @click.command() 13 | @click.argument('policy_file', type=str) 14 | @click.option('--seed', type=int, default=0) 15 | @click.option('--n_test_rollouts', type=int, default=10) 16 | @click.option('--render', type=int, default=1) 17 | def main(policy_file, seed, n_test_rollouts, render): 18 | """ 19 | run HER from a saved policy 20 | 21 | :param policy_file: (str) pickle path to a saved policy 22 | :param seed: (int) initial seed 23 | :param n_test_rollouts: (int) the number of test rollouts 24 | :param render: (bool) if rendering should be done 25 | """ 26 | set_global_seeds(seed) 27 | 28 | # Load policy. 29 | with open(policy_file, 'rb') as file_handler: 30 | policy = pickle.load(file_handler) 31 | env_name = policy.info['env_name'] 32 | 33 | # Prepare params.
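# config.DEFAULT_PARAMS is used as the base configuration; any environment-specific
# overrides from config.DEFAULT_ENV_PARAMS are merged in before prepare_params() is called.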
34 | params = config.DEFAULT_PARAMS 35 | if env_name in config.DEFAULT_ENV_PARAMS: 36 | params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in 37 | params['env_name'] = env_name 38 | params = config.prepare_params(params) 39 | config.log_params(params, logger_input=logger) 40 | 41 | dims = config.configure_dims(params) 42 | 43 | eval_params = { 44 | 'exploit': True, 45 | 'use_target_net': params['test_with_polyak'], 46 | 'compute_q': True, 47 | 'rollout_batch_size': 1, 48 | 'render': bool(render), 49 | } 50 | 51 | for name in ['time_horizon', 'gamma', 'noise_eps', 'random_eps']: 52 | eval_params[name] = params[name] 53 | 54 | evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) 55 | evaluator.seed(seed) 56 | 57 | # Run evaluation. 58 | evaluator.clear_history() 59 | for _ in range(n_test_rollouts): 60 | evaluator.generate_rollouts() 61 | 62 | # record logs 63 | for key, val in evaluator.logs('test'): 64 | logger.record_tabular(key, np.mean(val)) 65 | logger.dump_tabular() 66 | 67 | 68 | if __name__ == '__main__': 69 | main() 70 | -------------------------------------------------------------------------------- /stable_baselines/her/experiment/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | import matplotlib.pyplot as plt 6 | import numpy as np 7 | import seaborn as sns 8 | import glob2 9 | 10 | # Initialize seaborn 11 | sns.set() 12 | 13 | def smooth_reward_curve(x, y): 14 | """ 15 | smooth the reward curve 16 | 17 | :param x: (numpy float) the x coord of the reward 18 | :param y: (numpy float) the y coord of the reward 19 | :return: (numpy float, numpy float) smoothed x, smoothed y 20 | """ 21 | halfwidth = int(np.ceil(len(x) / 60)) # Halfwidth of our smoothing convolution 22 | k = halfwidth 23 | xsmoo = x 24 | ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='same') / np.convolve(np.ones_like(y), np.ones(2 * k + 1), 25 | mode='same') 26 | return xsmoo, ysmoo 27 | 28 | 29 | def load_results(file): 30 | """ 31 | load the results from a file 32 | 33 | :param file: (str) the saved results 34 | :return: (dict) the result 35 | """ 36 | if not os.path.exists(file): 37 | return None 38 | with open(file, 'r') as file_handler: 39 | lines = [line for line in file_handler] 40 | if len(lines) < 2: 41 | return None 42 | keys = [name.strip() for name in lines[0].split(',')] 43 | data = np.genfromtxt(file, delimiter=',', skip_header=1, filling_values=0.) 44 | if data.ndim == 1: 45 | data = data.reshape(1, -1) 46 | assert data.ndim == 2 47 | assert data.shape[-1] == len(keys) 48 | result = {} 49 | for idx, key in enumerate(keys): 50 | result[key] = data[:, idx] 51 | return result 52 | 53 | 54 | def pad(xs, value=np.nan): 55 | """ 56 | 57 | 58 | :param xs: 59 | :param value: 60 | :return: 61 | """ 62 | maxlen = np.max([len(x) for x in xs]) 63 | 64 | padded_xs = [] 65 | for x in xs: 66 | if x.shape[0] >= maxlen: 67 | padded_xs.append(x) 68 | 69 | padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value 70 | x_padded = np.concatenate([x, padding], axis=0) 71 | assert x_padded.shape[1:] == x.shape[1:] 72 | assert x_padded.shape[0] == maxlen 73 | padded_xs.append(x_padded) 74 | return np.array(padded_xs) 75 | 76 | 77 | parser = argparse.ArgumentParser() 78 | parser.add_argument('dir', type=str) 79 | parser.add_argument('--smooth', type=int, default=1) 80 | args = parser.parse_args() 81 | 82 | # Load all data. 
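# Every run directory under args.dir is expected to contain a progress.csv (per-epoch
# statistics) and a params.json (run configuration); runs with an empty or missing
# progress.csv are skipped.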
83 | data = {} 84 | paths = [os.path.abspath(os.path.join(path, '..')) for path in glob2.glob(os.path.join(args.dir, '**', 'progress.csv'))] 85 | for curr_path in paths: 86 | if not os.path.isdir(curr_path): 87 | continue 88 | results = load_results(os.path.join(curr_path, 'progress.csv')) 89 | if not results: 90 | print('skipping {}'.format(curr_path)) 91 | continue 92 | print('loading {} ({})'.format(curr_path, len(results['epoch']))) 93 | with open(os.path.join(curr_path, 'params.json'), 'r') as f: 94 | params = json.load(f) 95 | 96 | success_rate = np.array(results['test/success_rate']) 97 | epoch = np.array(results['epoch']) + 1 98 | env_id = params['env_name'] 99 | replay_strategy = params['replay_strategy'] 100 | 101 | if replay_strategy == 'future': 102 | config = 'her' 103 | else: 104 | config = 'ddpg' 105 | if 'Dense' in env_id: 106 | config += '-dense' 107 | else: 108 | config += '-sparse' 109 | env_id = env_id.replace('Dense', '') 110 | 111 | # Process and smooth data. 112 | assert success_rate.shape == epoch.shape 113 | x = epoch 114 | y = success_rate 115 | if args.smooth: 116 | x, y = smooth_reward_curve(epoch, success_rate) 117 | assert x.shape == y.shape 118 | 119 | if env_id not in data: 120 | data[env_id] = {} 121 | if config not in data[env_id]: 122 | data[env_id][config] = [] 123 | data[env_id][config].append((x, y)) 124 | 125 | # Plot data. 126 | for env_id in sorted(data.keys()): 127 | print('exporting {}'.format(env_id)) 128 | plt.clf() 129 | 130 | for config in sorted(data[env_id].keys()): 131 | xs, ys = zip(*data[env_id][config]) 132 | xs, ys = pad(xs), pad(ys) 133 | assert xs.shape == ys.shape 134 | 135 | plt.plot(xs[0], np.nanmedian(ys, axis=0), label=config) 136 | plt.fill_between(xs[0], np.nanpercentile(ys, 25, axis=0), np.nanpercentile(ys, 75, axis=0), alpha=0.25) 137 | plt.title(env_id) 138 | plt.xlabel('Epoch') 139 | plt.ylabel('Median Success Rate') 140 | plt.legend() 141 | plt.savefig(os.path.join(args.dir, 'fig_{}.png'.format(env_id))) 142 | -------------------------------------------------------------------------------- /stable_baselines/her/her.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import gym 4 | 5 | from stable_baselines.common import BaseRLModel, SetVerbosity 6 | from stable_baselines.common.policies import LstmPolicy, ActorCriticPolicy 7 | 8 | 9 | def make_sample_her_transitions(replay_strategy, replay_k, reward_fun): 10 | """ 11 | Creates a sample function that can be used for HER experience replay. 12 | 13 | :param replay_strategy: (str) the HER replay strategy; if set to 'none', regular DDPG experience replay is used 14 | (can be 'future' or 'none'). 15 | :param replay_k: (int) the ratio between HER replays and regular replays (e.g. k = 4 -> 4 times 16 | as many HER replays as regular replays are used) 17 | :param reward_fun: (function (dict, dict): float) function to re-compute the reward with substituted goals 18 | """ 19 | if replay_strategy == 'future': 20 | future_p = 1 - (1. / (1 + replay_k)) 21 | else: # 'replay_strategy' == 'none' 22 | future_p = 0 23 | 24 | def _sample_her_transitions(episode_batch, batch_size_in_transitions): 25 | """episode_batch is {key: array(buffer_size x T x dim_key)} 26 | """ 27 | time_horizon = episode_batch['u'].shape[1] 28 | rollout_batch_size = episode_batch['u'].shape[0] 29 | batch_size = batch_size_in_transitions 30 | 31 | # Select which episodes and time steps to use. 
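# With the 'future' strategy and e.g. replay_k = 4, future_p = 1 - 1/(1 + 4) = 0.8, so
# roughly 80% of the transitions sampled below get their goal replaced by an achieved goal
# from a later step of the same episode, and their reward is recomputed accordingly.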
32 | episode_idxs = np.random.randint(0, rollout_batch_size, batch_size) 33 | t_samples = np.random.randint(time_horizon, size=batch_size) 34 | transitions = {key: episode_batch[key][episode_idxs, t_samples].copy() 35 | for key in episode_batch.keys()} 36 | 37 | # Select future time indexes proportional with probability future_p. These 38 | # will be used for HER replay by substituting in future goals. 39 | her_indexes = np.where(np.random.uniform(size=batch_size) < future_p) 40 | future_offset = np.random.uniform(size=batch_size) * (time_horizon - t_samples) 41 | future_offset = future_offset.astype(int) 42 | future_t = (t_samples + 1 + future_offset)[her_indexes] 43 | 44 | # Replace goal with achieved goal but only for the previously-selected 45 | # HER transitions (as defined by her_indexes). For the other transitions, 46 | # keep the original goal. 47 | future_ag = episode_batch['ag'][episode_idxs[her_indexes], future_t] 48 | transitions['g'][her_indexes] = future_ag 49 | 50 | # Reconstruct info dictionary for reward computation. 51 | info = {} 52 | for key, value in transitions.items(): 53 | if key.startswith('info_'): 54 | info[key.replace('info_', '')] = value 55 | 56 | # Re-compute reward since we may have substituted the goal. 57 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 58 | reward_params['info'] = info 59 | transitions['r'] = reward_fun(**reward_params) 60 | 61 | transitions = {k: transitions[k].reshape(batch_size, *transitions[k].shape[1:]) 62 | for k in transitions.keys()} 63 | 64 | assert transitions['u'].shape[0] == batch_size_in_transitions 65 | 66 | return transitions 67 | 68 | return _sample_her_transitions 69 | 70 | 71 | class HER(BaseRLModel): 72 | def __init__(self, policy, env, verbose=0, _init_setup_model=True): 73 | super().__init__(policy=policy, env=env, verbose=verbose, policy_base=ActorCriticPolicy, requires_vec_env=False) 74 | 75 | self.policy = policy 76 | 77 | self.sess = None 78 | self.graph = None 79 | 80 | if _init_setup_model: 81 | self.setup_model() 82 | 83 | def setup_model(self): 84 | with SetVerbosity(self.verbose): 85 | 86 | assert isinstance(self.action_space, gym.spaces.Box), \ 87 | "Error: HER cannot output a {} action space, only spaces.Box is supported.".format(self.action_space) 88 | assert not issubclass(self.policy, LstmPolicy), "Error: cannot use a recurrent policy for the HER model." 89 | assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the HER model must be an " \ 90 | "instance of common.policies.ActorCriticPolicy." 
91 | 92 | self.graph = tf.Graph() 93 | with self.graph.as_default(): 94 | pass 95 | 96 | def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="HER"): 97 | with SetVerbosity(self.verbose): 98 | self._setup_learn(seed) 99 | 100 | return self 101 | 102 | def predict(self, observation, state=None, mask=None, deterministic=False): 103 | pass 104 | 105 | def action_probability(self, observation, state=None, mask=None): 106 | pass 107 | 108 | def save(self, save_path): 109 | pass 110 | 111 | @classmethod 112 | def load(cls, load_path, env=None, **kwargs): 113 | pass 114 | -------------------------------------------------------------------------------- /stable_baselines/her/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import numpy as np 4 | 5 | 6 | class ReplayBuffer: 7 | def __init__(self, buffer_shapes, size_in_transitions, time_horizon, sample_transitions): 8 | """ 9 | Creates a replay buffer. 10 | 11 | :param buffer_shapes: ({str: int}) the shape for all buffers that are used in the replay buffer 12 | :param size_in_transitions: (int) the size of the buffer, measured in transitions 13 | :param time_horizon: (int) the time horizon for episodes 14 | :param sample_transitions: (function) a function that samples from the replay buffer 15 | """ 16 | self.buffer_shapes = buffer_shapes 17 | self.size = size_in_transitions // time_horizon 18 | self.time_horizon = time_horizon 19 | self.sample_transitions = sample_transitions 20 | 21 | # self.buffers is {key: array(size_in_episodes x T or T+1 x dim_key)} 22 | self.buffers = {key: np.empty([self.size, *shape]) 23 | for key, shape in buffer_shapes.items()} 24 | 25 | # memory management 26 | self.current_size = 0 27 | self.n_transitions_stored = 0 28 | 29 | self.lock = threading.Lock() 30 | 31 | @property 32 | def full(self): 33 | with self.lock: 34 | return self.current_size == self.size 35 | 36 | def sample(self, batch_size): 37 | """ 38 | sample random transitions 39 | 40 | :param batch_size: (int) How many transitions to sample. 
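Besides the stored keys, the returned dict also contains the next-step views 'o_2' and 'ag_2' and the recomputed reward 'r' (checked by the assertion below).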
41 | :return: (dict) {key: array(batch_size x shapes[key])} 42 | """ 43 | buffers = {} 44 | 45 | with self.lock: 46 | assert self.current_size > 0 47 | for key in self.buffers.keys(): 48 | buffers[key] = self.buffers[key][:self.current_size] 49 | 50 | buffers['o_2'] = buffers['o'][:, 1:, :] 51 | buffers['ag_2'] = buffers['ag'][:, 1:, :] 52 | 53 | transitions = self.sample_transitions(buffers, batch_size) 54 | 55 | for key in (['r', 'o_2', 'ag_2'] + list(self.buffers.keys())): 56 | assert key in transitions, "key %s missing from transitions" % key 57 | 58 | return transitions 59 | 60 | def store_episode(self, episode_batch): 61 | """ 62 | Store an episode in the replay buffer 63 | 64 | :param episode_batch: (np.ndarray) batch_size x (T or T+1) x dim_key 65 | """ 66 | batch_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] 67 | assert np.all(np.array(batch_sizes) == batch_sizes[0]) 68 | batch_size = batch_sizes[0] 69 | 70 | with self.lock: 71 | idxs = self._get_storage_idx(batch_size) 72 | 73 | # load inputs into buffers 74 | for key in self.buffers.keys(): 75 | self.buffers[key][idxs] = episode_batch[key] 76 | 77 | self.n_transitions_stored += batch_size * self.time_horizon 78 | 79 | def get_current_episode_size(self): 80 | """ 81 | get current episode size 82 | 83 | :return: (int) the current size of the episode 84 | """ 85 | with self.lock: 86 | return self.current_size 87 | 88 | def get_current_size(self): 89 | """ 90 | get current size of the buffer 91 | 92 | :return: (int) the current size of the buffer 93 | """ 94 | with self.lock: 95 | return self.current_size * self.time_horizon 96 | 97 | def get_transitions_stored(self): 98 | """ 99 | get the number of stored transitions 100 | 101 | :return: (int) the number of transitions stored 102 | """ 103 | with self.lock: 104 | return self.n_transitions_stored 105 | 106 | def clear_buffer(self): 107 | """ 108 | clear the buffer of all entries 109 | """ 110 | with self.lock: 111 | self.current_size = 0 112 | 113 | def _get_storage_idx(self, inc=None): 114 | inc = inc or 1 # size increment 115 | assert inc <= self.size, "Batch committed to replay is too large!" 116 | # go consecutively until you hit the end, and then go randomly. 117 | if self.current_size + inc <= self.size: 118 | idx = np.arange(self.current_size, self.current_size + inc) 119 | elif self.current_size < self.size: 120 | overflow = inc - (self.size - self.current_size) 121 | idx_a = np.arange(self.current_size, self.size) 122 | idx_b = np.random.randint(0, self.current_size, overflow) 123 | idx = np.concatenate([idx_a, idx_b]) 124 | else: 125 | idx = np.random.randint(0, self.size, inc) 126 | 127 | # update replay size 128 | self.current_size = min(self.size, self.current_size + inc) 129 | 130 | if inc == 1: 131 | idx = idx[0] 132 | return idx 133 | -------------------------------------------------------------------------------- /stable_baselines/her/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | import sys 4 | import importlib 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | from mpi4py import MPI 9 | 10 | from stable_baselines.common import tf_util 11 | 12 | 13 | def import_function(spec): 14 | """ 15 | Import a function identified by a string like "pkg.module:fn_name". 
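For example, import_function("numpy:mean") would return the numpy.mean function.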
16 | 17 | :param spec: (str) the function to import 18 | :return: (function) 19 | """ 20 | mod_name, fn_name = spec.split(':') 21 | module = importlib.import_module(mod_name) 22 | func = getattr(module, fn_name) 23 | return func 24 | 25 | 26 | def flatten_grads(var_list, grads): 27 | """ 28 | Flattens a variables and their gradients. 29 | 30 | :param var_list: ([TensorFlow Tensor]) the variables 31 | :param grads: ([TensorFlow Tensor]) the gradients 32 | :return: (TensorFlow Tensor) the flattend variable and gradient 33 | """ 34 | return tf.concat([tf.reshape(grad, [tf_util.numel(v)]) 35 | for (v, grad) in zip(var_list, grads)], 0) 36 | 37 | 38 | def mlp(_input, layers_sizes, reuse=None, flatten=False, name=""): 39 | """ 40 | Creates a simple fully-connected neural network 41 | 42 | :param _input: (TensorFlow Tensor) the input 43 | :param layers_sizes: ([int]) the hidden layers 44 | :param reuse: (bool) Enable reuse of the network 45 | :param flatten: (bool) flatten the network output 46 | :param name: (str) the name of the network 47 | :return: (TensorFlow Tensor) the network 48 | """ 49 | for i, size in enumerate(layers_sizes): 50 | activation = tf.nn.relu if i < len(layers_sizes) - 1 else None 51 | _input = tf.layers.dense(inputs=_input, 52 | units=size, 53 | kernel_initializer=tf.contrib.layers.xavier_initializer(), 54 | reuse=reuse, 55 | name=name + '_' + str(i)) 56 | if activation: 57 | _input = activation(_input) 58 | if flatten: 59 | assert layers_sizes[-1] == 1 60 | _input = tf.reshape(_input, [-1]) 61 | return _input 62 | 63 | 64 | def install_mpi_excepthook(): 65 | """ 66 | setup the MPI exception hooks 67 | """ 68 | old_hook = sys.excepthook 69 | 70 | def new_hook(a, b, c): 71 | old_hook(a, b, c) 72 | sys.stdout.flush() 73 | sys.stderr.flush() 74 | MPI.COMM_WORLD.Abort() 75 | 76 | sys.excepthook = new_hook 77 | 78 | 79 | def mpi_fork(rank, extra_mpi_args=None): 80 | """ 81 | Re-launches the current script with workers 82 | Returns "parent" for original parent, "child" for MPI children 83 | 84 | :param rank: (int) the thread rank 85 | :param extra_mpi_args: (dict) extra arguments for MPI 86 | :return: (str) the correct type of thread name 87 | """ 88 | if extra_mpi_args is None: 89 | extra_mpi_args = [] 90 | 91 | if rank <= 1: 92 | return "child" 93 | if os.getenv("IN_MPI") is None: 94 | env = os.environ.copy() 95 | env.update( 96 | MKL_NUM_THREADS="1", 97 | OMP_NUM_THREADS="1", 98 | IN_MPI="1" 99 | ) 100 | # "-bind-to core" is crucial for good performance 101 | args = ["mpirun", "-np", str(rank)] + \ 102 | extra_mpi_args + \ 103 | [sys.executable] 104 | 105 | args += sys.argv 106 | subprocess.check_call(args, env=env) 107 | return "parent" 108 | else: 109 | install_mpi_excepthook() 110 | return "child" 111 | 112 | 113 | def convert_episode_to_batch_major(episode): 114 | """ 115 | Converts an episode to have the batch dimension in the major (first) dimension. 116 | 117 | :param episode: (dict) the episode batch 118 | :return: (dict) the episode batch with he batch dimension in the major (first) dimension. 119 | """ 120 | episode_batch = {} 121 | for key in episode.keys(): 122 | val = np.array(episode[key]).copy() 123 | # make inputs batch-major instead of time-major 124 | episode_batch[key] = val.swapaxes(0, 1) 125 | 126 | return episode_batch 127 | 128 | 129 | def transitions_in_episode_batch(episode_batch): 130 | """ 131 | Number of transitions in a given episode batch. 
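(Computed as the number of episodes times the episode length, i.e. the first two dimensions of the action array episode_batch['u'].)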
132 | 133 | :param episode_batch: (dict) the episode batch 134 | :return: (int) the number of transitions in episode batch 135 | """ 136 | shape = episode_batch['u'].shape 137 | return shape[0] * shape[1] 138 | 139 | 140 | def reshape_for_broadcasting(source, target): 141 | """ 142 | Reshapes a tensor (source) to have the correct shape and dtype of the target before broadcasting it with MPI. 143 | 144 | :param source: (TensorFlow Tensor) the input tensor 145 | :param target: (TensorFlow Tensor) the target tensor 146 | :return: (TensorFlow Tensor) the rehshaped tensor 147 | """ 148 | dim = len(target.get_shape()) 149 | shape = ([1] * (dim - 1)) + [-1] 150 | return tf.reshape(tf.cast(source, target.dtype), shape) 151 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.ppo1.pposgd_simple import PPO1 2 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/experiments/train_cartpole.py: -------------------------------------------------------------------------------- 1 | """ 2 | Simple test to check that PPO1 is running with no errors (see issue #50) 3 | """ 4 | from stable_baselines import PPO1 5 | 6 | 7 | if __name__ == '__main__': 8 | model = PPO1('MlpPolicy', 'CartPole-v1', schedule='linear', verbose=0) 9 | model.learn(total_timesteps=10000) 10 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/mlp_policy.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from stable_baselines.common.input import observation_input 4 | from stable_baselines.common.distributions import make_proba_dist_type 5 | 6 | 7 | class BasePolicy(object): 8 | def __init__(self, placeholders=None): 9 | """ 10 | A base policy object for PPO1 11 | 12 | :param placeholders: (dict) To feed existing placeholders if needed 13 | """ 14 | super(BasePolicy, self).__init__() 15 | self.sess = None 16 | self.pdtype = None 17 | self._act = None 18 | self.scope = None 19 | self.obs_ph = None 20 | self.stochastic_ph = None 21 | self.processed_x = None 22 | 23 | if placeholders is not None: 24 | self.obs_ph = placeholders.get("obs", None) 25 | self.processed_x = placeholders.get("processed_obs", None) 26 | self.stochastic_ph = placeholders.get("stochastic", None) 27 | 28 | def get_obs_and_pdtype(self, ob_space, ac_space): 29 | """ 30 | Initialize probability distribution and get observation placeholder. 
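Returns the observation placeholder together with the probability distribution type (pdtype) built from the action space.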
31 | 32 | :param ob_space: (Gym Spaces) the observation space 33 | :param ac_space: (Gym Spaces) the action space 34 | """ 35 | self.pdtype = pdtype = make_proba_dist_type(ac_space) 36 | 37 | if self.obs_ph is None: 38 | self.obs_ph, self.processed_x = observation_input(ob_space) 39 | else: 40 | assert self.processed_x is not None 41 | 42 | return self.obs_ph, pdtype 43 | 44 | def act(self, stochastic, obs): 45 | """ 46 | Get the action from the policy, using the observation 47 | 48 | :param stochastic: (bool) whether or not to use a stochastic or deterministic policy 49 | :param obs: (TensorFlow Tensor or np.ndarray) the observation 50 | :return: (np.ndarray, np.ndarray) the action and value function 51 | """ 52 | ac1, vpred1 = self._act(stochastic, obs[None], sess=self.sess) 53 | return ac1[0], vpred1[0] 54 | 55 | def get_variables(self): 56 | """ 57 | Get all the policy's variables 58 | 59 | :return: ([TensorFlow Tensor]) the variables of the network 60 | """ 61 | return tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, self.scope) 62 | 63 | def get_trainable_variables(self): 64 | """ 65 | Get the policy's trainable variables 66 | 67 | :return: ([TensorFlow Tensor]) the trainable variables of the network 68 | """ 69 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, self.scope) 70 | 71 | @classmethod 72 | def get_initial_state(cls): 73 | """ 74 | Get the initial state 75 | 76 | :return: ([np.ndarray]) the initial state 77 | """ 78 | return [] 79 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/run_atari.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | from mpi4py import MPI 5 | 6 | from stable_baselines.common import set_global_seeds 7 | from stable_baselines import bench, logger 8 | from stable_baselines.common.atari_wrappers import make_atari, wrap_deepmind 9 | from stable_baselines.common.cmd_util import atari_arg_parser 10 | from stable_baselines.common.policies import CnnPolicy 11 | from stable_baselines.ppo1 import PPO1 12 | 13 | 14 | def train(env_id, num_timesteps, seed): 15 | """ 16 | Train PPO1 model for Atari environments, for testing purposes 17 | 18 | :param env_id: (str) Environment ID 19 | :param num_timesteps: (int) The total number of samples 20 | :param seed: (int) The initial seed for training 21 | """ 22 | rank = MPI.COMM_WORLD.Get_rank() 23 | 24 | if rank == 0: 25 | logger.configure() 26 | else: 27 | logger.configure(format_strs=[]) 28 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() 29 | set_global_seeds(workerseed) 30 | env = make_atari(env_id) 31 | 32 | env = bench.Monitor(env, logger.get_dir() and 33 | os.path.join(logger.get_dir(), str(rank))) 34 | env.seed(workerseed) 35 | 36 | env = wrap_deepmind(env) 37 | env.seed(workerseed) 38 | 39 | model = PPO1(CnnPolicy, env, timesteps_per_actorbatch=256, clip_param=0.2, entcoeff=0.01, optim_epochs=4, 40 | optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', verbose=2) 41 | model.learn(total_timesteps=num_timesteps) 42 | env.close() 43 | 44 | 45 | def main(): 46 | """ 47 | Runs the test 48 | """ 49 | args = atari_arg_parser().parse_args() 50 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 51 | 52 | 53 | if __name__ == '__main__': 54 | main() 55 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/run_humanoid.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | import gym 5 | 6 | from stable_baselines.ppo1 import PPO1 7 | from stable_baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 8 | from stable_baselines.common import tf_util 9 | from stable_baselines.common.policies import MlpPolicy 10 | from stable_baselines import logger 11 | 12 | 13 | def train(num_timesteps, seed, model_path=None): 14 | """ 15 | Train PPO1 model for the Humanoid environment, for testing purposes 16 | 17 | :param num_timesteps: (int) The total number of samples 18 | :param seed: (int) The initial seed for training 19 | :param model_path: (str) path to the model 20 | """ 21 | env_id = 'Humanoid-v2' 22 | 23 | env = make_mujoco_env(env_id, seed) 24 | 25 | # parameters below were the best found in a simple random search 26 | # these are good enough to make humanoid walk, but whether those are 27 | # an absolute best or not is not certain 28 | env = RewScale(env, 0.1) 29 | model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, 30 | optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear') 31 | model.learn(total_timesteps=num_timesteps) 32 | env.close() 33 | if model_path: 34 | tf_util.save_state(model_path) 35 | 36 | return model 37 | 38 | 39 | class RewScale(gym.RewardWrapper): 40 | def __init__(self, env, scale): 41 | gym.RewardWrapper.__init__(self, env) 42 | self.scale = scale 43 | 44 | def reward(self, _reward): 45 | return _reward * self.scale 46 | 47 | 48 | def main(): 49 | """ 50 | Runs the test 51 | """ 52 | logger.configure() 53 | parser = mujoco_arg_parser() 54 | parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy')) 55 | parser.set_defaults(num_timesteps=int(2e7)) 56 | 57 | args = parser.parse_args() 58 | 59 | if not args.play: 60 | # train the model 61 | train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path) 62 | else: 63 | # construct the model object, load pre-trained model and render 64 | model = train(num_timesteps=1, seed=args.seed) 65 | tf_util.load_state(args.model_path) 66 | env = make_mujoco_env('Humanoid-v2', seed=0) 67 | 68 | obs = env.reset() 69 | while True: 70 | action = model.policy.act(stochastic=False, obs=obs)[0] 71 | obs, _, done, _ = env.step(action) 72 | env.render() 73 | if done: 74 | obs = env.reset() 75 | 76 | 77 | if __name__ == '__main__': 78 | main() 79 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from stable_baselines.ppo1 import PPO1 4 | from stable_baselines.common.policies import MlpPolicy 5 | from stable_baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 6 | from stable_baselines import logger 7 | 8 | 9 | def train(env_id, num_timesteps, seed): 10 | """ 11 | Train PPO1 model for the Mujoco environment, for testing purposes 12 | 13 | :param env_id: (str) Environment ID 14 | :param num_timesteps: (int) The total number of samples 15 | :param seed: (int) The initial seed for training 16 | """ 17 | env = make_mujoco_env(env_id, seed) 18 | model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=10, 19 | optim_stepsize=3e-4, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear') 20 | 
model.learn(total_timesteps=num_timesteps) 21 | env.close() 22 | 23 | 24 | def main(): 25 | """ 26 | Runs the test 27 | """ 28 | args = mujoco_arg_parser().parse_args() 29 | logger.configure() 30 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 31 | 32 | 33 | if __name__ == '__main__': 34 | main() 35 | -------------------------------------------------------------------------------- /stable_baselines/ppo1/run_robotics.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | from mpi4py import MPI 4 | import mujoco_py 5 | 6 | from stable_baselines.common import set_global_seeds 7 | from stable_baselines.common.policies import MlpPolicy 8 | from stable_baselines.common.cmd_util import make_robotics_env, robotics_arg_parser 9 | from stable_baselines.ppo1 import PPO1 10 | 11 | 12 | def train(env_id, num_timesteps, seed): 13 | """ 14 | Train PPO1 model for Robotics environment, for testing purposes 15 | 16 | :param env_id: (str) Environment ID 17 | :param num_timesteps: (int) The total number of samples 18 | :param seed: (int) The initial seed for training 19 | """ 20 | 21 | rank = MPI.COMM_WORLD.Get_rank() 22 | with mujoco_py.ignore_mujoco_warnings(): 23 | workerseed = seed + 10000 * rank 24 | set_global_seeds(workerseed) 25 | env = make_robotics_env(env_id, workerseed, rank=rank) 26 | 27 | model = PPO1(MlpPolicy, env, timesteps_per_actorbatch=2048, clip_param=0.2, entcoeff=0.0, optim_epochs=5, 28 | optim_stepsize=3e-4, optim_batchsize=256, gamma=0.99, lam=0.95, schedule='linear') 29 | model.learn(total_timesteps=num_timesteps) 30 | env.close() 31 | 32 | 33 | def main(): 34 | """ 35 | Runs the test 36 | """ 37 | args = robotics_arg_parser().parse_args() 38 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 39 | 40 | 41 | if __name__ == '__main__': 42 | main() 43 | -------------------------------------------------------------------------------- /stable_baselines/ppo2/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.ppo2.ppo2 import PPO2 2 | -------------------------------------------------------------------------------- /stable_baselines/ppo2/run_atari.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from stable_baselines import logger 3 | from stable_baselines.common.cmd_util import make_atari_env, atari_arg_parser 4 | from stable_baselines.common.vec_env.vec_frame_stack import VecFrameStack 5 | from stable_baselines.ppo2 import PPO2 6 | from stable_baselines.common.policies import CnnPolicy, CnnLstmPolicy, CnnLnLstmPolicy, MlpPolicy 7 | 8 | 9 | def train(env_id, num_timesteps, seed, policy): 10 | """ 11 | Train PPO2 model for atari environment, for testing purposes 12 | 13 | :param env_id: (str) the environment id string 14 | :param num_timesteps: (int) the number of timesteps to run 15 | :param seed: (int) Used to seed the random generator. 16 | :param policy: (Object) The policy model to use (MLP, CNN, LSTM, ...) 
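(In this script the argument is actually the string 'cnn', 'lstm', 'lnlstm' or 'mlp', which is mapped to the matching policy class just below.)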
17 | """ 18 | 19 | env = VecFrameStack(make_atari_env(env_id, 8, seed), 4) 20 | policy = {'cnn': CnnPolicy, 'lstm': CnnLstmPolicy, 'lnlstm': CnnLnLstmPolicy, 'mlp': MlpPolicy}[policy] 21 | model = PPO2(policy=policy, env=env, n_steps=128, nminibatches=4, lam=0.95, gamma=0.99, noptepochs=4, ent_coef=.01, 22 | learning_rate=lambda f: f * 2.5e-4, cliprange=lambda f: f * 0.1, verbose=1) 23 | model.learn(total_timesteps=num_timesteps) 24 | 25 | 26 | def main(): 27 | """ 28 | Runs the test 29 | """ 30 | parser = atari_arg_parser() 31 | parser.add_argument('--policy', help='Policy architecture', choices=['cnn', 'lstm', 'lnlstm', 'mlp'], default='cnn') 32 | args = parser.parse_args() 33 | logger.configure() 34 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed, 35 | policy=args.policy) 36 | 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /stable_baselines/ppo2/run_mujoco.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import numpy as np 3 | import gym 4 | 5 | from stable_baselines.common.cmd_util import mujoco_arg_parser 6 | from stable_baselines import bench, logger 7 | from stable_baselines.common import set_global_seeds 8 | from stable_baselines.common.vec_env.vec_normalize import VecNormalize 9 | from stable_baselines.ppo2 import PPO2 10 | from stable_baselines.common.policies import MlpPolicy 11 | from stable_baselines.common.vec_env.dummy_vec_env import DummyVecEnv 12 | 13 | 14 | def train(env_id, num_timesteps, seed): 15 | """ 16 | Train PPO2 model for Mujoco environment, for testing purposes 17 | 18 | :param env_id: (str) the environment id string 19 | :param num_timesteps: (int) the number of timesteps to run 20 | :param seed: (int) Used to seed the random generator. 
21 | """ 22 | def make_env(): 23 | env_out = gym.make(env_id) 24 | env_out = bench.Monitor(env_out, logger.get_dir(), allow_early_resets=True) 25 | return env_out 26 | 27 | env = DummyVecEnv([make_env]) 28 | env = VecNormalize(env) 29 | 30 | set_global_seeds(seed) 31 | policy = MlpPolicy 32 | model = PPO2(policy=policy, env=env, n_steps=2048, nminibatches=32, lam=0.95, gamma=0.99, noptepochs=10, 33 | ent_coef=0.0, learning_rate=3e-4, cliprange=0.2) 34 | model.learn(total_timesteps=num_timesteps) 35 | 36 | return model, env 37 | 38 | 39 | def main(): 40 | """ 41 | Runs the test 42 | """ 43 | args = mujoco_arg_parser().parse_args() 44 | logger.configure() 45 | model, env = train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 46 | 47 | if args.play: 48 | logger.log("Running trained model") 49 | obs = np.zeros((env.num_envs,) + env.observation_space.shape) 50 | obs[:] = env.reset() 51 | while True: 52 | actions = model.step(obs)[0] 53 | obs[:] = env.step(actions)[0] 54 | env.render() 55 | 56 | 57 | if __name__ == '__main__': 58 | main() 59 | -------------------------------------------------------------------------------- /stable_baselines/results_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | import matplotlib.pyplot as plt 4 | 5 | from stable_baselines.bench.monitor import load_results 6 | 7 | # matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode 8 | plt.rcParams['svg.fonttype'] = 'none' 9 | 10 | X_TIMESTEPS = 'timesteps' 11 | X_EPISODES = 'episodes' 12 | X_WALLTIME = 'walltime_hrs' 13 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 14 | EPISODES_WINDOW = 100 15 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 16 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 17 | 'darkgreen', 'tan', 'salmon', 'gold', 'lightpurple', 'darkred', 'darkblue'] 18 | 19 | 20 | def rolling_window(array, window): 21 | """ 22 | apply a rolling window to a np.ndarray 23 | 24 | :param array: (np.ndarray) the input Array 25 | :param window: (int) length of the rolling window 26 | :return: (np.ndarray) rolling window on the input array 27 | """ 28 | shape = array.shape[:-1] + (array.shape[-1] - window + 1, window) 29 | strides = array.strides + (array.strides[-1],) 30 | return np.lib.stride_tricks.as_strided(array, shape=shape, strides=strides) 31 | 32 | 33 | def window_func(var_1, var_2, window, func): 34 | """ 35 | apply a function to the rolling window of 2 arrays 36 | 37 | :param var_1: (np.ndarray) variable 1 38 | :param var_2: (np.ndarray) variable 2 39 | :param window: (int) length of the rolling window 40 | :param func: (numpy function) function to apply on the rolling window on variable 2 (such as np.mean) 41 | :return: (np.ndarray, np.ndarray) the rolling output with applied function 42 | """ 43 | var_2_window = rolling_window(var_2, window) 44 | function_on_var2 = func(var_2_window, axis=-1) 45 | return var_1[window - 1:], function_on_var2 46 | 47 | 48 | def ts2xy(timesteps, xaxis): 49 | """ 50 | Decompose a timesteps variable to x ans ys 51 | 52 | :param timesteps: (Pandas DataFrame) the input data 53 | :param xaxis: (str) the axis for the x and y output 54 | (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') 55 | :return: (np.ndarray, np.ndarray) the x and y output 56 | """ 57 | if xaxis == X_TIMESTEPS: 58 | x_var = np.cumsum(timesteps.l.values) 59 | y_var = 
timesteps.r.values 60 | elif xaxis == X_EPISODES: 61 | x_var = np.arange(len(timesteps)) 62 | y_var = timesteps.r.values 63 | elif xaxis == X_WALLTIME: 64 | x_var = timesteps.t.values / 3600. 65 | y_var = timesteps.r.values 66 | else: 67 | raise NotImplementedError 68 | return x_var, y_var 69 | 70 | 71 | def plot_curves(xy_list, xaxis, title): 72 | """ 73 | plot the curves 74 | 75 | :param xy_list: ([(np.ndarray, np.ndarray)]) the x and y coordinates to plot 76 | :param xaxis: (str) the axis for the x and y output 77 | (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') 78 | :param title: (str) the title of the plot 79 | """ 80 | 81 | plt.figure(figsize=(8, 2)) 82 | maxx = max(xy[0][-1] for xy in xy_list) 83 | minx = 0 84 | for (i, (x, y)) in enumerate(xy_list): 85 | color = COLORS[i] 86 | plt.scatter(x, y, s=2) 87 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) # So returns average of last EPISODE_WINDOW episodes 88 | plt.plot(x, y_mean, color=color) 89 | plt.xlim(minx, maxx) 90 | plt.title(title) 91 | plt.xlabel(xaxis) 92 | plt.ylabel("Episode Rewards") 93 | plt.tight_layout() 94 | 95 | 96 | def plot_results(dirs, num_timesteps, xaxis, task_name): 97 | """ 98 | plot the results 99 | 100 | :param dirs: ([str]) the save location of the results to plot 101 | :param num_timesteps: (int) only plot the points below this value 102 | :param xaxis: (str) the axis for the x and y output 103 | (can be X_TIMESTEPS='timesteps', X_EPISODES='episodes' or X_WALLTIME='walltime_hrs') 104 | :param task_name: (str) the title of the task to plot 105 | """ 106 | 107 | tslist = [] 108 | for folder in dirs: 109 | timesteps = load_results(folder) 110 | timesteps = timesteps[timesteps.l.cumsum() <= num_timesteps] 111 | tslist.append(timesteps) 112 | xy_list = [ts2xy(timesteps_item, xaxis) for timesteps_item in tslist] 113 | plot_curves(xy_list, xaxis, task_name) 114 | 115 | 116 | def main(): 117 | """ 118 | Example usage in jupyter-notebook 119 | 120 | .. 
code-block:: python 121 | 122 | from stable_baselines import log_viewer 123 | %matplotlib inline 124 | log_viewer.plot_results(["./log"], 10e6, log_viewer.X_TIMESTEPS, "Breakout") 125 | 126 | Here ./log is a directory containing the monitor.csv files 127 | """ 128 | import argparse 129 | import os 130 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 131 | parser.add_argument('--dirs', help='List of log directories', nargs='*', default=['./log']) 132 | parser.add_argument('--num_timesteps', type=int, default=int(10e6)) 133 | parser.add_argument('--xaxis', help='Varible on X-axis', default=X_TIMESTEPS) 134 | parser.add_argument('--task_name', help='Title of plot', default='Breakout') 135 | args = parser.parse_args() 136 | args.dirs = [os.path.abspath(folder) for folder in args.dirs] 137 | plot_results(args.dirs, args.num_timesteps, args.xaxis, args.task_name) 138 | plt.show() 139 | 140 | 141 | if __name__ == '__main__': 142 | main() 143 | -------------------------------------------------------------------------------- /stable_baselines/trpo_mpi/__init__.py: -------------------------------------------------------------------------------- 1 | from stable_baselines.trpo_mpi.trpo_mpi import TRPO 2 | -------------------------------------------------------------------------------- /stable_baselines/trpo_mpi/run_atari.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | from mpi4py import MPI 5 | 6 | from stable_baselines.common import set_global_seeds 7 | from stable_baselines import bench, logger 8 | from stable_baselines.common.atari_wrappers import make_atari, wrap_deepmind 9 | from stable_baselines.common.cmd_util import atari_arg_parser 10 | from stable_baselines.common.policies import CnnPolicy 11 | # from stable_baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy 12 | from stable_baselines.trpo_mpi import TRPO 13 | 14 | 15 | def train(env_id, num_timesteps, seed): 16 | """ 17 | Train TRPO model for the atari environment, for testing purposes 18 | 19 | :param env_id: (str) Environment ID 20 | :param num_timesteps: (int) The total number of samples 21 | :param seed: (int) The initial seed for training 22 | """ 23 | rank = MPI.COMM_WORLD.Get_rank() 24 | 25 | if rank == 0: 26 | logger.configure() 27 | else: 28 | logger.configure(format_strs=[]) 29 | 30 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() 31 | set_global_seeds(workerseed) 32 | env = make_atari(env_id) 33 | 34 | # def policy_fn(name, ob_space, ac_space, sess=None, placeholders=None): # pylint: disable=W0613 35 | # return CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space, sess=sess, placeholders=placeholders) 36 | 37 | env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank))) 38 | env.seed(workerseed) 39 | 40 | env = wrap_deepmind(env) 41 | env.seed(workerseed) 42 | 43 | model = TRPO(CnnPolicy, env, timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3, entcoeff=0.0, 44 | gamma=0.98, lam=1, vf_iters=3, vf_stepsize=1e-4) 45 | model.learn(total_timesteps=int(num_timesteps * 1.1)) 46 | env.close() 47 | 48 | 49 | def main(): 50 | """ 51 | Runs the test 52 | """ 53 | args = atari_arg_parser().parse_args() 54 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 55 | 56 | 57 | if __name__ == "__main__": 58 | main() 59 | -------------------------------------------------------------------------------- /stable_baselines/trpo_mpi/run_mujoco.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # noinspection PyUnresolvedReferences 3 | from mpi4py import MPI 4 | 5 | from stable_baselines.common.cmd_util import make_mujoco_env, mujoco_arg_parser 6 | from stable_baselines.common.policies import MlpPolicy 7 | from stable_baselines import logger 8 | from stable_baselines.trpo_mpi import TRPO 9 | import stable_baselines.common.tf_util as tf_util 10 | 11 | 12 | def train(env_id, num_timesteps, seed): 13 | """ 14 | Train TRPO model for the mujoco environment, for testing purposes 15 | 16 | :param env_id: (str) Environment ID 17 | :param num_timesteps: (int) The total number of samples 18 | :param seed: (int) The initial seed for training 19 | """ 20 | with tf_util.single_threaded_session(): 21 | rank = MPI.COMM_WORLD.Get_rank() 22 | if rank == 0: 23 | logger.configure() 24 | else: 25 | logger.configure(format_strs=[]) 26 | logger.set_level(logger.DISABLED) 27 | workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() 28 | 29 | env = make_mujoco_env(env_id, workerseed) 30 | model = TRPO(MlpPolicy, env, timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1, entcoeff=0.0, 31 | gamma=0.99, lam=0.98, vf_iters=5, vf_stepsize=1e-3) 32 | model.learn(total_timesteps=num_timesteps) 33 | env.close() 34 | 35 | 36 | def main(): 37 | """ 38 | Runs the test 39 | """ 40 | args = mujoco_arg_parser().parse_args() 41 | train(args.env, num_timesteps=args.num_timesteps, seed=args.seed) 42 | 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /stable_baselines/trpo_mpi/utils.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import numpy as np 3 | 4 | from stable_baselines.common.vec_env import VecEnv 5 | 6 | 7 | def traj_segment_generator(policy, env, horizon, reward_giver=None, gail=False): 8 | """ 9 | Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) 10 | 11 | :param policy: (MLPPolicy) the policy 12 | :param env: (Gym Environment) the environment 13 | :param horizon: (int) the number of timesteps to run per batch 14 | :param reward_giver: (TransitionClassifier) the reward predicter from obsevation and action 15 | :param gail: (bool) Whether we are using this generator for standard trpo or with gail 16 | :return: (dict) generator that returns a dict with the following keys: 17 | 18 | - ob: (np.ndarray) observations 19 | - rew: (numpy float) rewards (if gail is used it is the predicted reward) 20 | - vpred: (numpy float) action logits 21 | - new: (numpy bool) dones (is end of episode) 22 | - ac: (np.ndarray) actions 23 | - prevac: (np.ndarray) previous actions 24 | - nextvpred: (numpy float) next action logits 25 | - ep_rets: (float) cumulated current episode reward 26 | - ep_lens: (int) the length of the current episode 27 | - ep_true_rets: (float) the real environment reward 28 | """ 29 | # Check when using GAIL 30 | assert not (gail and reward_giver is None), "You must pass a reward giver when using GAIL" 31 | 32 | # Initialize state variables 33 | step = 0 34 | action = env.action_space.sample() # not used, just so we have the datatype 35 | new = True 36 | observation = env.reset() 37 | 38 | cur_ep_ret = 0 # return in current episode 39 | cur_ep_len = 0 # len of current episode 40 | cur_ep_true_ret = 0 41 | ep_true_rets = [] 42 | ep_rets = [] # returns of completed episodes in this segment 43 | ep_lens = [] # Episode 
lengths 44 | 45 | # Initialize history arrays 46 | observations = np.array([observation for _ in range(horizon)]) 47 | true_rews = np.zeros(horizon, 'float32') 48 | rews = np.zeros(horizon, 'float32') 49 | vpreds = np.zeros(horizon, 'float32') 50 | dones = np.zeros(horizon, 'int32') 51 | actions = np.array([action for _ in range(horizon)]) 52 | prev_actions = actions.copy() 53 | states = policy.initial_state 54 | done = None 55 | 56 | while True: 57 | prevac = action 58 | action, vpred, states, _ = policy.step(observation.reshape(-1, *observation.shape), states, done) 59 | # Slight weirdness here because we need value function at time T 60 | # before returning segment [0, T-1] so we get the correct 61 | # terminal value 62 | if step > 0 and step % horizon == 0: 63 | # Fix to avoid "mean of empty slice" warning when there is only one episode 64 | if len(ep_rets) == 0: 65 | ep_rets = [cur_ep_ret] 66 | ep_lens = [cur_ep_len] 67 | ep_true_rets = [cur_ep_true_ret] 68 | total_timesteps = cur_ep_len 69 | else: 70 | total_timesteps = sum(ep_lens) + cur_ep_len 71 | 72 | yield {"ob": observations, "rew": rews, "dones": dones, "true_rew": true_rews, "vpred": vpreds, 73 | "ac": actions, "prevac": prev_actions, "nextvpred": vpred * (1 - new), "ep_rets": ep_rets, 74 | "ep_lens": ep_lens, "ep_true_rets": ep_true_rets, "total_timestep": total_timesteps} 75 | _, vpred, _, _ = policy.step(observation.reshape(-1, *observation.shape)) 76 | # Be careful!!! if you change the downstream algorithm to aggregate 77 | # several of these batches, then be sure to do a deepcopy 78 | ep_rets = [] 79 | ep_true_rets = [] 80 | ep_lens = [] 81 | i = step % horizon 82 | observations[i] = observation 83 | vpreds[i] = vpred[0] 84 | actions[i] = action[0] 85 | prev_actions[i] = prevac 86 | 87 | clipped_action = action 88 | # Clip the actions to avoid out of bound error 89 | if isinstance(env.action_space, gym.spaces.Box): 90 | clipped_action = np.clip(action, env.action_space.low, env.action_space.high) 91 | 92 | if gail: 93 | rew = reward_giver.get_reward(observation, clipped_action[0]) 94 | observation, true_rew, done, _info = env.step(clipped_action[0]) 95 | else: 96 | observation, rew, done, _info = env.step(clipped_action[0]) 97 | true_rew = rew 98 | rews[i] = rew 99 | true_rews[i] = true_rew 100 | dones[i] = done 101 | 102 | cur_ep_ret += rew 103 | cur_ep_true_ret += true_rew 104 | cur_ep_len += 1 105 | if done: 106 | ep_rets.append(cur_ep_ret) 107 | ep_true_rets.append(cur_ep_true_ret) 108 | ep_lens.append(cur_ep_len) 109 | cur_ep_ret = 0 110 | cur_ep_true_ret = 0 111 | cur_ep_len = 0 112 | if not isinstance(env, VecEnv): 113 | observation = env.reset() 114 | step += 1 115 | 116 | 117 | def add_vtarg_and_adv(seg, gamma, lam): 118 | """ 119 | Compute target value using TD(lambda) estimator, and advantage with GAE(lambda) 120 | 121 | :param seg: (dict) the current segment of the trajectory (see traj_segment_generator return for more information) 122 | :param gamma: (float) Discount factor 123 | :param lam: (float) GAE factor 124 | """ 125 | # last element is only used for last vtarg, but we already zeroed it if last new = 1 126 | new = np.append(seg["dones"], 0) 127 | vpred = np.append(seg["vpred"], seg["nextvpred"]) 128 | rew_len = len(seg["rew"]) 129 | seg["adv"] = gaelam = np.empty(rew_len, 'float32') 130 | rew = seg["rew"] 131 | lastgaelam = 0 132 | for step in reversed(range(rew_len)): 133 | nonterminal = 1 - new[step + 1] 134 | delta = rew[step] + gamma * vpred[step + 1] * nonterminal - vpred[step] 135 | gaelam[step] 
= lastgaelam = delta + gamma * lam * nonterminal * lastgaelam 136 | seg["tdlamret"] = seg["adv"] + seg["vpred"] 137 | 138 | 139 | def flatten_lists(listoflists): 140 | """ 141 | Flatten a python list of lists 142 | 143 | :param listoflists: (list(list)) the list of lists to flatten 144 | :return: (list) the flattened list 145 | """ 146 | return [el for list_ in listoflists for el in list_] 147 | --------------------------------------------------------------------------------
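# --- Editorial addendum (illustrative sketch, not part of the repository) ---
# A minimal example of the GAE(lambda) computation implemented by
# add_vtarg_and_adv above, assuming a toy segment with the layout produced by
# traj_segment_generator ("rew", "vpred", "dones", "nextvpred"):
#
#     delta_t = r_t + gamma * V_{t+1} * nonterminal_{t+1} - V_t
#     adv_t   = delta_t + gamma * lam * nonterminal_{t+1} * adv_{t+1}
#     tdlamret = adv + vpred
#
# where nonterminal_{t+1} = 1 - dones[t+1] (with a trailing 0 appended).
import numpy as np

from stable_baselines.trpo_mpi.utils import add_vtarg_and_adv

toy_seg = {
    "rew": np.array([1.0, 1.0, 1.0], dtype="float32"),    # rewards r_t
    "vpred": np.array([0.5, 0.6, 0.7], dtype="float32"),  # value estimates V_t
    "dones": np.array([0, 0, 1], dtype="int32"),          # episode boundaries
    "nextvpred": 0.0,  # value after the segment; zero here since the last step is terminal
}
add_vtarg_and_adv(toy_seg, gamma=0.99, lam=0.95)
print(toy_seg["adv"])       # GAE advantages, shape (3,)
print(toy_seg["tdlamret"])  # TD(lambda) value targets (adv + vpred)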