├── LICENSE ├── README.md ├── data ├── mher_all.png ├── mher_all_step.png └── mher_sac.png ├── mher ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-36.pyc │ ├── config.cpython-36.pyc │ ├── default_cfg.cpython-36.pyc │ ├── play.cpython-36.pyc │ ├── run.cpython-36.pyc │ └── train.cpython-36.pyc ├── algos │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── actor_critic.cpython-36.pyc │ │ ├── ddpg.cpython-36.pyc │ │ ├── normalizer.cpython-36.pyc │ │ ├── rollout.cpython-36.pyc │ │ └── util.cpython-36.pyc │ ├── actor_critic.py │ ├── algorithm.py │ ├── ddpg.py │ ├── dynamics.py │ ├── sac.py │ ├── sac_utils.py │ └── util.py ├── buffers │ ├── __init__.py │ ├── __pycache__ │ │ ├── replay_buffer.cpython-36.pyc │ │ └── samplers.cpython-36.pyc │ ├── prioritized_buffer.py │ └── replay_buffer.py ├── common │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── atari_wrappers.cpython-36.pyc │ │ ├── cmd_util.cpython-36.pyc │ │ ├── console_util.cpython-36.pyc │ │ ├── dataset.cpython-36.pyc │ │ ├── import_util.cpython-36.pyc │ │ ├── init_utils.cpython-36.pyc │ │ ├── logger.cpython-36.pyc │ │ ├── math_util.cpython-36.pyc │ │ ├── misc_util.cpython-36.pyc │ │ ├── monitor.cpython-36.pyc │ │ ├── mpi_adam.cpython-36.pyc │ │ ├── mpi_moments.cpython-36.pyc │ │ ├── retro_wrappers.cpython-36.pyc │ │ ├── tf_util.cpython-36.pyc │ │ ├── tile_images.cpython-36.pyc │ │ └── wrappers.cpython-36.pyc │ ├── atari_wrappers.py │ ├── cg.py │ ├── cmd_util.py │ ├── console_util.py │ ├── dataset.py │ ├── distributions.py │ ├── import_util.py │ ├── init_utils.py │ ├── input.py │ ├── logger.py │ ├── math_util.py │ ├── misc_util.py │ ├── models.py │ ├── monitor.py │ ├── mpi_adam.py │ ├── mpi_adam_optimizer.py │ ├── mpi_fork.py │ ├── mpi_moments.py │ ├── mpi_running_mean_std.py │ ├── mpi_util.py │ ├── normalizer.py │ ├── plot │ │ ├── plot.py │ │ └── results_plotter.py │ ├── plot_util.py │ ├── policies.py │ ├── retro_wrappers.py │ ├── runners.py │ ├── running_mean_std.py │ ├── schedules.py │ ├── segment_tree.py │ ├── test_mpi_util.py │ ├── tests │ │ ├── __init__.py │ │ ├── envs │ │ │ ├── __init__.py │ │ │ ├── fixed_sequence_env.py │ │ │ ├── identity_env.py │ │ │ ├── identity_env_test.py │ │ │ └── mnist_env.py │ │ ├── test_cartpole.py │ │ ├── test_doc_examples.py │ │ ├── test_env_after_learn.py │ │ ├── test_fetchreach.py │ │ ├── test_fixed_sequence.py │ │ ├── test_identity.py │ │ ├── test_mnist.py │ │ ├── test_plot_util.py │ │ ├── test_schedules.py │ │ ├── test_segment_tree.py │ │ ├── test_serialization.py │ │ ├── test_tf_util.py │ │ ├── test_with_mpi.py │ │ └── util.py │ ├── tf_util.py │ ├── tile_images.py │ ├── vec_env │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── dummy_vec_env.cpython-36.pyc │ │ │ ├── shmem_vec_env.cpython-36.pyc │ │ │ ├── subproc_vec_env.cpython-36.pyc │ │ │ ├── util.cpython-36.pyc │ │ │ ├── vec_env.cpython-36.pyc │ │ │ ├── vec_frame_stack.cpython-36.pyc │ │ │ ├── vec_monitor.cpython-36.pyc │ │ │ ├── vec_normalize.cpython-36.pyc │ │ │ ├── vec_remove_dict_obs.cpython-36.pyc │ │ │ └── vec_video_recorder.cpython-36.pyc │ │ ├── dummy_vec_env.py │ │ ├── shmem_vec_env.py │ │ ├── subproc_vec_env.py │ │ ├── test_vec_env.py │ │ ├── test_video_recorder.py │ │ ├── util.py │ │ ├── vec_env.py │ │ ├── vec_frame_stack.py │ │ ├── vec_monitor.py │ │ ├── vec_normalize.py │ │ ├── vec_remove_dict_obs.py │ │ └── vec_video_recorder.py │ └── wrappers.py ├── config.py ├── default_cfg.py ├── envs │ ├── __pycache__ │ │ ├── env_utils.cpython-36.pyc │ │ └── 
make_env_utils.cpython-36.pyc │ ├── env_utils.py │ ├── make_env_utils.py │ └── wrappers │ │ ├── __pycache__ │ │ └── wrapper_utils.cpython-36.pyc │ │ ├── multi_world_wrapper.py │ │ └── wrapper_utils.py ├── play.py ├── plot.py ├── rollouts │ ├── __init__.py │ └── rollout.py ├── run.py ├── samplers │ ├── __init__.py │ ├── her_sampler.py │ ├── nstep_sampler.py │ ├── prioritized_sampler.py │ └── sampler.py └── train.py └── setup.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Rui 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Modular-HER 2 | ![GitHub](https://img.shields.io/github/license/YangRui2015/Modular_HER) ![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/YangRui2015/Modular_HER) ![GitHub last commit](https://img.shields.io/github/last-commit/YangRui2015/Modular_HER) 3 | 4 | Modular-HER is a revision of OpenAI Baselines that supports many improvements to Hindsight Experience Replay (HER) as modules. We aim to provide a more **modular**, **readable** and **concise** package for Multi-goal Reinforcement Learning. 5 | 6 | Everyone is welcome to contribute suggestions or code! 7 | 8 | 9 | ## Functions 10 | - [x] DDPG (https://arxiv.org/abs/1509.02971); 11 | - [x] HER (future, episode, final, random) (https://arxiv.org/abs/1707.01495); 12 | - [x] Cut HER (incrementally increases the future sample length); 13 | - [x] SHER (https://arxiv.org/abs/2002.02089); 14 | - [x] Prioritized HER (same as PHER in https://arxiv.org/abs/1905.08786); 15 | - [ ] Energy-based Prioritized HER (https://www.researchgate.net/publication/341776498_Energy-Based_Hindsight_Experience_Prioritization); 16 | - [ ] Curriculum-guided Hindsight Experience Replay (http://papers.nips.cc/paper/9425-curriculum-guided-hindsight-experience-replay); 17 | - [x] nstep DDPG and nstep HER; 18 | - [ ] more to come... 19 | 20 | 21 | ## Prerequisites 22 | Requires Python 3 (>=3.5), TensorFlow (>=1.4, <=1.14), and the system packages CMake, OpenMPI and zlib.
These can be installed as follows: 23 | 24 | #### Ubuntu: 25 | ```bash 26 | sudo apt-get update && sudo apt-get install cmake libopenmpi-dev python3-dev zlib1g-dev 27 | ``` 28 | 29 | #### Mac OS X: 30 | With [Homebrew](https://brew.sh) installed, run the following: 31 | ```bash 32 | brew install cmake openmpi 33 | ``` 34 | 35 | ## Installation 36 | ```bash 37 | git clone https://github.com/YangRui2015/Modular_HER.git 38 | cd Modular_HER 39 | pip install -e . 40 | ``` 41 | 42 | 43 | ## Usage 44 | Training DDPG and saving logs and models. 45 | ```bash 46 | python -m mher.run --env=FetchReach-v1 --num_epoch 30 --num_env 1 --sampler random --play_episodes 5 --log_path=~/logs/fetchreach/ --save_path=~/logs/models/fetchreach_ddpg/ 47 | ``` 48 | 49 | Training HER + DDPG with different samplers ('her_future', 'her_random', 'her_last', 'her_episode' are supported). 50 | ```bash 51 | python -m mher.run --env=FetchReach-v1 --num_epoch 30 --num_env 1 --sampler her_future --play_episodes 5 --log_path=~/logs/fetchreach/ --save_path=~/logs/models/fetchreach_herfuture/ 52 | ``` 53 | 54 | Training SAC + HER. 55 | ```bash 56 | python -m mher.run --env=FetchReach-v1 --num_epoch 50 --algo sac --sac_alpha 0.05 --sampler her_episode 57 | ``` 58 | 59 | All supported sampler flags: 60 | | Group | Samplers | 61 | | ------ | ------ | 62 | | Random sampler | random | 63 | | HER | her_future, her_episode, her_last, her_random | 64 | | Nstep | nstep, nstep_her_future, nstep_her_episode, nstep_her_last, nstep_her_random | 65 | | Priority | priority, priority_her_future, priority_her_episode, priority_her_random, priority_her_last | 66 | 67 | 68 | ## Results 69 | 70 | We use the set of test parameters in DEFAULT_ENV_PARAMS for performance comparison in the FetchReach-v1 environment. 71 | 72 | 1. Performance of HER with different goal sampling methods (future, random, episode, last). 73 | 74 |
75 | 76 | 2. Performance of Nstep HER and Nstep DDPG. 77 | 78 |
79 | 80 | 3. Performance of SHER (not yet good enough in the FetchReach environment; we will test more environments and report the results). 81 | 82 |
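A comparison like the ones above can be reproduced by sweeping the `--sampler` flag over the training command from the Usage section; the loop below is only a sketch, and the per-sampler log directory layout is an assumption rather than the exact setup used for these figures.

```bash
# Hypothetical sweep over the HER goal-sampling strategies on FetchReach-v1.
# All flags are the ones documented in the Usage section above.
for sampler in her_future her_random her_last her_episode; do
    python -m mher.run --env=FetchReach-v1 --num_epoch 30 --num_env 1 \
        --sampler "$sampler" --play_episodes 5 --log_path=~/logs/fetchreach/"$sampler"/
done
```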
83 | 84 | 85 | ## Update 86 | 87 | * 9.27 V0.0: updated readme; 88 | * 10.3 V0.5: heavily revised the code framework, supported DDPG and HER (future, last, final, random); 89 | * 10.4 V0.6: updated the code framework, added the rollouts and samplers packages; 90 | * 10.6 added the nstep sampler and nstep HER sampler; 91 | * 10.7 fixed a bug in the nstep HER sampler; 92 | * 10.16 added prioritized experience replay and Cut HER; 93 | * 10.31 V1.0: added SHER support; 94 | -------------------------------------------------------------------------------- /data/mher_all.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/data/mher_all.png -------------------------------------------------------------------------------- /data/mher_all_step.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/data/mher_all_step.png -------------------------------------------------------------------------------- /data/mher_sac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/data/mher_sac.png -------------------------------------------------------------------------------- /mher/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/__init__.py -------------------------------------------------------------------------------- /mher/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /mher/__pycache__/config.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/__pycache__/config.cpython-36.pyc -------------------------------------------------------------------------------- /mher/__pycache__/default_cfg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/__pycache__/default_cfg.cpython-36.pyc -------------------------------------------------------------------------------- /mher/__pycache__/play.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/__pycache__/play.cpython-36.pyc -------------------------------------------------------------------------------- /mher/__pycache__/run.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/__pycache__/run.cpython-36.pyc -------------------------------------------------------------------------------- /mher/__pycache__/train.cpython-36.pyc: --------------------------------------------------------------------------------
https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/__pycache__/train.cpython-36.pyc -------------------------------------------------------------------------------- /mher/algos/__init__.py: -------------------------------------------------------------------------------- 1 | from mher.algos.ddpg import DDPG 2 | from mher.algos.sac import SAC -------------------------------------------------------------------------------- /mher/algos/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/algos/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /mher/algos/__pycache__/actor_critic.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/algos/__pycache__/actor_critic.cpython-36.pyc -------------------------------------------------------------------------------- /mher/algos/__pycache__/ddpg.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/algos/__pycache__/ddpg.cpython-36.pyc -------------------------------------------------------------------------------- /mher/algos/__pycache__/normalizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/algos/__pycache__/normalizer.cpython-36.pyc -------------------------------------------------------------------------------- /mher/algos/__pycache__/rollout.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/algos/__pycache__/rollout.cpython-36.pyc -------------------------------------------------------------------------------- /mher/algos/__pycache__/util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/algos/__pycache__/util.cpython-36.pyc -------------------------------------------------------------------------------- /mher/algos/actor_critic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mher.algos.sac_utils import apply_squashing_func, mlp_gaussian_policy 4 | from mher.algos.util import nn, store_args 5 | 6 | 7 | class ActorCritic: 8 | @store_args 9 | def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers, sess): 10 | """The actor-critic network and related training code. 
11 | Args: 12 | inputs_tf (dict of tensors): all necessary inputs for the network: the 13 | observation (o), the goal (g), and the action (u) 14 | dimo (int): the dimension of the observations 15 | dimg (int): the dimension of the goals 16 | dimu (int): the dimension of the actions 17 | max_u (float): the maximum magnitude of actions; action outputs will be scaled accordingly 18 | o_stats (mher.algos.Normalizer): normalizer for observations 19 | g_stats (mher.algos.Normalizer): normalizer for goals 20 | hidden (int): number of hidden units that should be used in hidden layers 21 | layers (int): number of hidden layers 22 | """ 23 | self.o_tf = inputs_tf['o'] 24 | self.g_tf = inputs_tf['g'] 25 | self.u_tf = inputs_tf['u'] 26 | 27 | # Prepare inputs for actor and critic. 28 | o = self.o_stats.normalize(self.o_tf) 29 | g = self.g_stats.normalize(self.g_tf) 30 | input_pi = tf.concat(axis=1, values=[o, g]) # for actor 31 | self._network(input_pi, o, g) 32 | 33 | 34 | def _network(self, input_pi, o, g): 35 | # Networks. 36 | with tf.variable_scope('pi'): 37 | self.pi_tf = self.max_u * tf.tanh(nn(input_pi, [self.hidden] * self.layers + [self.dimu])) 38 | 39 | with tf.variable_scope('Q'): 40 | # for policy training 41 | input_Q = tf.concat(axis=1, values=[o, g, self.pi_tf / self.max_u]) 42 | self.Q_pi_tf = nn(input_Q, [self.hidden] * self.layers + [1]) 43 | # for critic training 44 | input_Q = tf.concat(axis=1, values=[o, g, self.u_tf / self.max_u]) 45 | self._input_Q = input_Q # exposed for tests 46 | self.Q_tf = nn(input_Q, [self.hidden] * self.layers + [1], reuse=True) 47 | 48 | def get_Q(self, o, g, u): 49 | feed = { 50 | self.o_tf: o.reshape(-1, self.dimo), 51 | self.g_tf: g.reshape(-1, self.dimg), 52 | self.u_tf: u.reshape(-1, self.dimu) 53 | } 54 | return self.sess.run(self.Q_tf, feed_dict=feed) 55 | 56 | def get_Q_pi(self, o, g): 57 | feed = { 58 | self.o_tf: o.reshape(-1, self.dimo), 59 | self.g_tf:g.reshape(-1, self.dimg) 60 | } 61 | return self.sess.run(self.Q_pi_tf, feed_dict=feed) 62 | 63 | 64 | class SAC_ActorCritic(ActorCritic): 65 | @store_args 66 | def __init__(self, inputs_tf, dimo, dimg, dimu, max_u, o_stats, g_stats, hidden, layers, sess): 67 | super(SAC_ActorCritic, self).__init__(**self.__dict__) 68 | 69 | 70 | def _network(self, input_pi, o, g): 71 | with tf.variable_scope('pi'): 72 | self.mu_tf, self.pi_tf, self.logp_pi_tf, self.log_std = mlp_gaussian_policy(input_pi, self.dimu, 73 | hidden_sizes=[self.hidden] * self.layers, 74 | activation=tf.nn.relu, 75 | output_activation=None) 76 | self.mu_tf, self.pi_tf, self.logp_pi_tf = apply_squashing_func(self.mu_tf, self.pi_tf, self.logp_pi_tf) 77 | 78 | with tf.variable_scope('q1'): 79 | self.q1_pi_tf = nn(tf.concat(axis=1, values=[o, g, self.pi_tf]), 80 | layers_sizes=[self.hidden] * self.layers + [1]) 81 | self.q1_tf = nn(tf.concat(axis=1, values=[o, g, self.u_tf]), 82 | layers_sizes=[self.hidden] * self.layers + [1], reuse=True) 83 | with tf.variable_scope('q2'): 84 | self.q2_pi_tf = nn(tf.concat(axis=1, values=[o, g, self.pi_tf]), 85 | layers_sizes=[self.hidden] * self.layers + [1]) 86 | self.q2_tf = nn(tf.concat(axis=1, values=[o, g, self.u_tf]), 87 | layers_sizes=[self.hidden] * self.layers + [1], reuse=True) 88 | with tf.variable_scope('min'): 89 | self.min_q_pi_tf = tf.minimum(self.q1_pi_tf, self.q2_pi_tf) 90 | self.min_q_tf = tf.minimum(self.q1_tf, self.q2_tf) 91 | with tf.variable_scope('v'): 92 | self.v_tf = nn(input_pi,layers_sizes=[self.hidden] * self.layers + [1]) 93 | 94 | def get_Q(self, o, g, u): 95 | feed = { 96 
| self.o_tf: o.reshape(-1, self.dimo), 97 | self.g_tf: g.reshape(-1, self.dimg), 98 | self.u_tf: u.reshape(-1, self.dimu) 99 | } 100 | return self.sess.run(self.min_q_tf, feed_dict=feed) 101 | 102 | def get_Q_pi(self, o, g): 103 | feed = { 104 | self.o_tf: o.reshape(-1, self.dimo), 105 | self.g_tf: g.reshape(-1, self.dimg) 106 | } 107 | return self.sess.run(self.min_q_pi_tf, feed_dict=feed) 108 | 109 | def get_V(self, o, g): 110 | feed = { 111 | self.o_tf: o.reshape(-1, self.dimo), 112 | self.g_tf: g.reshape(-1, self.dimg) 113 | } 114 | return self.sess.run(self.v_tf, feed_dict=feed) 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | -------------------------------------------------------------------------------- /mher/algos/ddpg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mher.algos.actor_critic import ActorCritic 4 | from mher.algos.algorithm import Algorithm 5 | from mher.algos.util import flatten_grads, get_var, store_args 6 | from mher.common import logger, tf_util 7 | from mher.common.mpi_adam import MpiAdam 8 | 9 | 10 | class DDPG(Algorithm): 11 | @store_args 12 | def __init__(self, buffer, input_dims, hidden, layers, polyak, Q_lr, pi_lr, 13 | norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, subtract_goals, 14 | relative_goals, clip_pos_returns, clip_return, gamma, vloss_type='normal', 15 | priority=False, reuse=False, **kwargs): 16 | """ 17 | see algorithm 18 | """ 19 | super(DDPG, self).__init__(**self.__dict__) 20 | 21 | def _create_network(self, reuse=False): 22 | logger.info("Creating a DDPG agent with action space %d x %s..." % (self.dimu, self.max_u)) 23 | self.sess = tf_util.get_session() 24 | # normalizer for input 25 | self._create_normalizer(reuse) 26 | batch_tf = self._get_batch_tf() 27 | 28 | # networks 29 | self._create_target_main(ActorCritic, reuse, batch_tf) 30 | 31 | # loss functions 32 | target_Q_pi_tf = self.target.Q_pi_tf 33 | clip_range = (-self.clip_return, 0. 
if self.clip_pos_returns else np.inf) 34 | target_tf = self._clip_target(batch_tf, clip_range, target_Q_pi_tf) 35 | 36 | self.abs_td_error_tf = tf.abs(tf.stop_gradient(target_tf) - self.main.Q_tf) 37 | self.Q_loss = tf.square(self.abs_td_error_tf) 38 | if self.priority: 39 | self.Q_loss_tf = tf.reduce_mean(batch_tf['w'] * self.Q_loss) 40 | else: 41 | self.Q_loss_tf = tf.reduce_mean(self.Q_loss) 42 | self.pi_loss_tf = -tf.reduce_mean(self.main.Q_pi_tf) 43 | self.pi_loss_tf += self.action_l2 * tf.reduce_mean(tf.square(self.main.pi_tf / self.max_u)) 44 | 45 | # varibles 46 | self.main_Q_var = get_var(self.scope + '/main/Q') 47 | self.main_pi_var = get_var(self.scope + '/main/pi') 48 | self.target_Q_var = get_var(self.scope + '/target/Q') 49 | self.target_pi_var = get_var(self.scope + '/target/pi') 50 | 51 | Q_grads_tf = tf.gradients(self.Q_loss_tf, self.main_Q_var) 52 | pi_grads_tf = tf.gradients(self.pi_loss_tf, self.main_pi_var) 53 | assert len(self.main_Q_var) == len(Q_grads_tf) 54 | assert len(self.main_pi_var) == len(pi_grads_tf) 55 | self.Q_grads_vars_tf = zip(Q_grads_tf, self.main_Q_var) 56 | self.pi_grads_vars_tf = zip(pi_grads_tf, self.main_pi_var) 57 | self.Q_grad_tf = flatten_grads(grads=Q_grads_tf, var_list=self.main_Q_var) 58 | self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=self.main_pi_var) 59 | 60 | # optimizers 61 | self.Q_adam = MpiAdam(self.main_Q_var, scale_grad_by_procs=False) 62 | self.pi_adam = MpiAdam(self.main_pi_var, scale_grad_by_procs=False) 63 | self.main_vars = self.main_Q_var + self.main_pi_var 64 | self.target_vars = self.target_Q_var+ self.target_pi_var 65 | self.init_target_net_op = list(map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) 66 | self.update_target_net_op = list(map(lambda v: v[0].assign(self.polyak * v[0] + (1. - self.polyak) * v[1]), 67 | zip(self.target_vars, self.main_vars))) 68 | 69 | # initialize all variables 70 | self.global_vars = get_var(self.scope, key='global') 71 | tf.variables_initializer(self.global_vars).run() 72 | self._sync_optimizers() 73 | self._init_target_net() 74 | 75 | def _sync_optimizers(self): 76 | self.Q_adam.sync() 77 | self.pi_adam.sync() 78 | 79 | def _grads(self): # Avoid feed_dict here for performance! 
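# Editorial note (assumption): no feed_dict is used in the run() call below; the sampled
# batch is presumably staged into the graph ahead of time via the tensors returned by
# _get_batch_tf() in _create_network, which is what the "Avoid feed_dict" comment refers to.
# Also note the second fetch, self.main.Q_pi_tf (the policy's Q-values), is what gets
# returned in the actor_loss slot; pi_loss_tf itself is not evaluated here.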
80 | critic_loss, actor_loss, Q_grad, pi_grad, abs_td_error = self.sess.run([ 81 | self.Q_loss_tf, 82 | self.main.Q_pi_tf, 83 | self.Q_grad_tf, 84 | self.pi_grad_tf, 85 | self.abs_td_error_tf 86 | ]) 87 | return critic_loss, actor_loss, Q_grad, pi_grad, abs_td_error 88 | 89 | def _update(self, Q_grad, pi_grad): 90 | self.Q_adam.update(Q_grad, self.Q_lr) 91 | self.pi_adam.update(pi_grad, self.pi_lr) 92 | 93 | 94 | -------------------------------------------------------------------------------- /mher/algos/sac.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mher.algos.actor_critic import SAC_ActorCritic 4 | from mher.algos.algorithm import Algorithm 5 | from mher.algos.util import flatten_grads, get_var, store_args 6 | from mher.common import logger, tf_util 7 | from mher.common.mpi_adam import MpiAdam 8 | from mher.common import logger 9 | 10 | 11 | class SAC(Algorithm): 12 | @store_args 13 | def __init__(self, buffer, input_dims, hidden, layers, polyak, Q_lr, pi_lr, 14 | norm_eps, norm_clip, max_u, action_l2, clip_obs, scope, subtract_goals, 15 | relative_goals, clip_pos_returns, clip_return, gamma, vloss_type='normal', 16 | priority=False, sac_alpha=0.03, reuse=False, **kwargs): 17 | """Implementation of SAC that is used in combination with Hindsight Experience Replay (HER). 18 | Args: 19 | sac_alpha: entropy regularization coefficient (temperature) used in SAC 20 | """ 21 | super(SAC, self).__init__(**self.__dict__) 22 | 23 | def _name_variable(self, name, main=True): 24 | if main: 25 | return self.scope + '/main/' + name 26 | else: 27 | return self.scope + '/target/' + name 28 | 29 | def _create_network(self, reuse=False): 30 | logger.info("Creating a SAC agent with action space %d x %s..." % (self.dimu, self.max_u)) 31 | self.sess = tf_util.get_session() 32 | self._create_normalizer(reuse) 33 | batch_tf = self._get_batch_tf() 34 | 35 | # networks 36 | self._create_target_main(SAC_ActorCritic, reuse, batch_tf) 37 | 38 | # loss functions 39 | clip_range = (-self.clip_return, 0.
if self.clip_pos_returns else np.inf) 40 | target_tf = self._clip_target(batch_tf, clip_range, self.target.v_tf) 41 | q_backup_tf = tf.stop_gradient(target_tf) 42 | v_backup_tf = tf.stop_gradient(self.main.min_q_pi_tf - self.sac_alpha * self.main.logp_pi_tf) 43 | 44 | q1_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q1_tf) ** 2) 45 | q2_loss_tf = 0.5 * tf.reduce_mean((q_backup_tf - self.main.q2_tf) ** 2) 46 | v_loss_tf = 0.5 * tf.reduce_mean((v_backup_tf - self.main.v_tf) ** 2) 47 | self.abs_tf_error_tf = tf.reduce_mean(tf.abs(q_backup_tf - self.main.q1_tf) + tf.abs(q_backup_tf -self.main.q2_tf)) 48 | 49 | self.value_loss_tf = q1_loss_tf + q2_loss_tf + v_loss_tf 50 | self.pi_loss_tf = tf.reduce_mean(self.sac_alpha * self.main.logp_pi_tf - self.main.q1_pi_tf) 51 | 52 | # virables 53 | value_params = get_var(self._name_variable('q')) + get_var(self._name_variable('v')) 54 | pi_params = get_var(self._name_variable('pi')) 55 | # gradients 56 | V_grads_tf = tf.gradients(self.value_loss_tf, value_params) 57 | pi_grads_tf = tf.gradients(self.pi_loss_tf, pi_params) 58 | self.V_grad_tf = flatten_grads(grads=V_grads_tf, var_list=value_params) 59 | self.pi_grad_tf = flatten_grads(grads=pi_grads_tf, var_list=pi_params) 60 | 61 | # optimizers 62 | self.V_adam = MpiAdam(value_params, scale_grad_by_procs=False) 63 | self.pi_adam = MpiAdam(pi_params, scale_grad_by_procs=False) 64 | 65 | # polyak averaging 66 | self.main_vars = get_var(self._name_variable('pi')) + get_var(self._name_variable('q1')) + get_var(self._name_variable('q2')) + get_var(self._name_variable('v')) 67 | self.target_vars = get_var(self._name_variable('pi', main=False)) + get_var(self._name_variable('q1', main=False)) + get_var(self._name_variable('q2', main=False)) + get_var(self._name_variable('v', main=False)) 68 | 69 | self.init_target_net_op = list(map(lambda v: v[0].assign(v[1]), zip(self.target_vars, self.main_vars))) 70 | self.update_target_net_op = list(map(lambda v: v[0].assign(self.polyak * v[0] + (1. 
- self.polyak) * v[1]), \ 71 | zip(self.target_vars, self.main_vars))) 72 | 73 | # initialize all variables 74 | self.global_vars = get_var(self.scope, key='global') 75 | tf.variables_initializer(self.global_vars).run() 76 | self._sync_optimizers() 77 | self._init_target_net() 78 | 79 | 80 | def _sync_optimizers(self): 81 | self.V_adam.sync() 82 | self.pi_adam.sync() 83 | 84 | def _grads(self): 85 | critic_loss, actor_loss, V_grad, pi_grad, abs_td_error = self.sess.run([ 86 | self.value_loss_tf, 87 | self.pi_loss_tf, 88 | self.V_grad_tf, 89 | self.pi_grad_tf, 90 | self.abs_tf_error_tf 91 | ]) 92 | return critic_loss, actor_loss, V_grad, pi_grad, abs_td_error 93 | 94 | def _update(self, V_grad, pi_grad): 95 | self.V_adam.update(V_grad, self.Q_lr) 96 | self.pi_adam.update(pi_grad, self.pi_lr) 97 | 98 | # sac doesn't need noise 99 | def get_actions(self, o, ag, g, noise_eps=0., random_eps=0., use_target_net=False, compute_Q=False): 100 | o, g = self._preprocess_og(o=o, g=g, ag=ag) 101 | if not noise_eps and not random_eps: 102 | u = self.simple_get_action(o, g, use_target_net, deterministic=True) 103 | else: 104 | u = self.simple_get_action(o, g, use_target_net, deterministic=False) 105 | 106 | if compute_Q: 107 | Q_pi = self.get_Q_fun(o, g) 108 | 109 | u = np.clip(u, -self.max_u, self.max_u) 110 | if u.shape[0] == 1: 111 | u = u[0] 112 | 113 | if compute_Q: 114 | return [u, Q_pi] 115 | else: 116 | return u 117 | 118 | def simple_get_action(self, o, g, use_target_net=False, deterministic=False): 119 | o,g = self._preprocess_og(o=o,g=g) 120 | policy = self.target if use_target_net else self.main # in n-step self.target performs better 121 | act_tf = policy.mu_tf if deterministic else policy.pi_tf 122 | action, logp_pi, min_q_pi, q1_pi, q2_pi,log_std = self.sess.run( \ 123 | [act_tf, policy.logp_pi_tf, policy.min_q_pi_tf, policy.q1_pi_tf, policy.q2_pi_tf, policy.log_std], \ 124 | feed_dict={ 125 | policy.o_tf: o.reshape(-1, self.dimo), 126 | policy.g_tf: g.reshape(-1, self.dimg) 127 | }) 128 | return action 129 | -------------------------------------------------------------------------------- /mher/algos/sac_utils.py: -------------------------------------------------------------------------------- 1 | from mher.algos.util import nn 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | EPS = 1e-6 6 | LOG_STD_MAX = 2 7 | LOG_STD_MIN = -20 8 | 9 | def gaussian_likelihood(x, mu, log_std): 10 | pre_sum = -0.5 * (((x-mu)/(tf.exp(log_std)+EPS))**2 + 2*log_std + np.log(2*np.pi)) 11 | return tf.reduce_sum(pre_sum, axis=1) 12 | 13 | def clip_but_pass_gradient(x, l=-1., u=1.): 14 | clip_up = tf.cast(x > u, tf.float32) 15 | clip_low = tf.cast(x < l, tf.float32) 16 | return x + tf.stop_gradient((u - x)*clip_up + (l - x)*clip_low) 17 | 18 | def nn_gaussian_policy(x, a, dimu, layers_sizes,output_activation): 19 | act_dim = dimu 20 | net = nn(x, layers_sizes) 21 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 22 | 23 | log_std = tf.layers.dense(net, act_dim, activation=None) 24 | log_std = tf.clip_by_value(log_std, LOG_STD_MIN, LOG_STD_MAX) 25 | 26 | std = tf.exp(log_std) 27 | pi = mu + tf.random_normal(tf.shape(mu)) * std 28 | logp_pi = gaussian_likelihood(pi, mu, log_std) 29 | return mu, pi, logp_pi 30 | 31 | def apply_squashing_func(mu, pi, logp_pi): 32 | mu = tf.tanh(mu) 33 | pi = tf.tanh(pi) 34 | # To avoid evil machine precision error, strictly clip 1-pi**2 to [0,1] range. 
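# The subtraction below is the change-of-variables correction for the tanh squashing:
# with a = tanh(u), log p(a) = log p(u) - sum_i log(1 - tanh(u_i)^2). Since pi has
# already been squashed above, (1 - pi**2) is exactly that Jacobian term; clipping it
# to [0, 1] and adding 1e-6 keeps the argument of the log strictly positive.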
35 | logp_pi -= tf.reduce_sum(tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1) 36 | return mu, pi, logp_pi 37 | 38 | def mlp_gaussian_policy(x, act_dim, hidden_sizes, activation, output_activation): 39 | net = nn(x, hidden_sizes) 40 | mu = tf.layers.dense(net, act_dim, activation=output_activation) 41 | 42 | log_std = tf.layers.dense(net, act_dim, activation=tf.tanh) 43 | log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1) 44 | 45 | std = tf.exp(log_std) 46 | pi = mu + tf.random_normal(tf.shape(mu)) * std 47 | logp_pi = gaussian_likelihood(pi, mu, log_std) 48 | return mu, pi, logp_pi, log_std 49 | 50 | -------------------------------------------------------------------------------- /mher/algos/util.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import importlib 3 | import inspect 4 | import os 5 | import subprocess 6 | import sys 7 | 8 | import numpy as np 9 | import tensorflow as tf 10 | from mher.common import tf_util as U 11 | 12 | 13 | def dims_to_shapes(input_dims): 14 | return {key: tuple([val]) if val > 0 else tuple() for key, val in input_dims.items()} 15 | 16 | def store_args(method): 17 | """Stores provided method args as instance attributes. 18 | """ 19 | argspec = inspect.getfullargspec(method) 20 | defaults = {} 21 | if argspec.defaults is not None: 22 | defaults = dict( 23 | zip(argspec.args[-len(argspec.defaults):], argspec.defaults)) 24 | if argspec.kwonlydefaults is not None: 25 | defaults.update(argspec.kwonlydefaults) 26 | arg_names = argspec.args[1:] 27 | 28 | @functools.wraps(method) 29 | def wrapper(*positional_args, **keyword_args): 30 | self = positional_args[0] 31 | # Get default arg values 32 | args = defaults.copy() 33 | # Add provided arg values 34 | for name, value in zip(arg_names, positional_args[1:]): 35 | args[name] = value 36 | args.update(keyword_args) 37 | self.__dict__.update(args) 38 | return method(*positional_args, **keyword_args) 39 | 40 | return wrapper 41 | 42 | 43 | def import_function(spec): 44 | """Import a function identified by a string like "pkg.module:fn_name". 45 | """ 46 | mod_name, fn_name = spec.split(':') 47 | module = importlib.import_module(mod_name) 48 | fn = getattr(module, fn_name) 49 | return fn 50 | 51 | 52 | def flatten_grads(var_list, grads): 53 | """Flattens a variables and their gradients. 
54 | """ 55 | return tf.concat([tf.reshape(grad, [U.numel(v)]) 56 | for (v, grad) in zip(var_list, grads)], 0) 57 | 58 | 59 | def nn(input, layers_sizes, reuse=None, flatten=False, name="", trainable='True', init='xavier', init_range=0.01): 60 | """Creates a simple neural network 61 | """ 62 | if init == 'xavier': 63 | initializer = tf.contrib.layers.xavier_initializer() 64 | elif init == 'random': 65 | initializer = tf.random_uniform_initializer(minval=-init_range, maxval=init_range) 66 | else: 67 | raise NotImplementedError 68 | 69 | for i, size in enumerate(layers_sizes): 70 | activation = tf.nn.relu if i < len(layers_sizes) - 1 else None 71 | input = tf.layers.dense(inputs=input, 72 | units=size, 73 | kernel_initializer=initializer, 74 | reuse=reuse, 75 | name=name + '_' + str(i), 76 | trainable=trainable) 77 | if activation: 78 | input = activation(input) 79 | if flatten: 80 | assert layers_sizes[-1] == 1 81 | input = tf.reshape(input, [-1]) 82 | return input 83 | 84 | 85 | def install_mpi_excepthook(): 86 | import sys 87 | 88 | from mpi4py import MPI 89 | old_hook = sys.excepthook 90 | 91 | def new_hook(a, b, c): 92 | old_hook(a, b, c) 93 | sys.stdout.flush() 94 | sys.stderr.flush() 95 | MPI.COMM_WORLD.Abort() 96 | sys.excepthook = new_hook 97 | 98 | 99 | def mpi_fork(n, extra_mpi_args=[]): 100 | """Re-launches the current script with workers 101 | Returns "parent" for original parent, "child" for MPI children 102 | """ 103 | if n <= 1: 104 | return "child" 105 | if os.getenv("IN_MPI") is None: 106 | env = os.environ.copy() 107 | env.update( 108 | MKL_NUM_THREADS="1", 109 | OMP_NUM_THREADS="1", 110 | IN_MPI="1" 111 | ) 112 | # "-bind-to core" is crucial for good performance 113 | args = ["mpirun", "-np", str(n)] + \ 114 | extra_mpi_args + \ 115 | [sys.executable] 116 | 117 | args += sys.argv 118 | subprocess.check_call(args, env=env) 119 | return "parent" 120 | else: 121 | install_mpi_excepthook() 122 | return "child" 123 | 124 | 125 | def convert_episode_to_batch_major(episode): 126 | """Converts an episode to have the batch dimension in the major (first) 127 | dimension. 128 | """ 129 | episode_batch = {} 130 | for key in episode.keys(): 131 | val = np.array(episode[key]).copy() 132 | # make inputs batch-major instead of time-major 133 | episode_batch[key] = val.swapaxes(0, 1) 134 | 135 | return episode_batch 136 | 137 | 138 | def transitions_in_episode_batch(episode_batch): 139 | """Number of transitions in a given episode batch. 140 | """ 141 | shape = episode_batch['u'].shape 142 | return shape[0] * shape[1] 143 | 144 | 145 | def reshape_for_broadcasting(source, target): 146 | """Reshapes a tensor (source) to have the correct shape and dtype of the target 147 | before broadcasting it with MPI. 
148 | """ 149 | dim = len(target.get_shape()) 150 | shape = ([1] * (dim - 1)) + [-1] 151 | return tf.reshape(tf.cast(source, target.dtype), shape) 152 | 153 | def get_var(scope, key='trainable'): 154 | if key == 'trainable': 155 | tf_key = tf.GraphKeys.TRAINABLE_VARIABLES 156 | elif key == 'global': 157 | tf_key = tf.GraphKeys.GLOBAL_VARIABLES 158 | else: 159 | print('No such key {} for tensorflow'.format(key)) 160 | raise NotImplementedError 161 | res = tf.get_collection(tf_key, scope=scope) 162 | return res 163 | 164 | -------------------------------------------------------------------------------- /mher/buffers/__init__.py: -------------------------------------------------------------------------------- 1 | from mher.buffers.replay_buffer import ReplayBuffer 2 | from mher.buffers.prioritized_buffer import PrioritizedReplayBuffer -------------------------------------------------------------------------------- /mher/buffers/__pycache__/replay_buffer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/buffers/__pycache__/replay_buffer.cpython-36.pyc -------------------------------------------------------------------------------- /mher/buffers/__pycache__/samplers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/buffers/__pycache__/samplers.cpython-36.pyc -------------------------------------------------------------------------------- /mher/buffers/prioritized_buffer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import numpy as np 4 | from mher.buffers.replay_buffer import ReplayBuffer 5 | from mher.common.segment_tree import MinSegmentTree, SumSegmentTree 6 | 7 | 8 | class PrioritizedReplayBuffer(ReplayBuffer): 9 | def __init__(self, buffer_shapes, size_in_transitions, T, sampler): 10 | """Create Prioritized Replay buffer""" 11 | super(PrioritizedReplayBuffer, self).__init__(buffer_shapes, size_in_transitions, T, sampler) 12 | 13 | def store_episode(self, episode_batch): 14 | """episode_batch: array(batch_size x (T or T+1) x dim_key)""" 15 | episode_idxs = super().store_episode(episode_batch) 16 | # save priority 17 | if not hasattr(episode_idxs, '__len__'): 18 | episode_idxs = np.array([episode_idxs]) 19 | self.sampler.update_new_priorities(episode_idxs) 20 | 21 | def update_priorities(self, idxs, priorities): 22 | self.sampler.update_priorities(idxs, priorities) 23 | 24 | -------------------------------------------------------------------------------- /mher/buffers/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import threading 2 | 3 | import numpy as np 4 | 5 | 6 | class ReplayBuffer: 7 | def __init__(self, buffer_shapes, size_in_transitions, T, sampler): 8 | """Creates a replay buffer. 
9 | Args: 10 | buffer_shapes (dict of ints): the shape for all buffers that are used in buffer 11 | size_in_transitions (int): the size of the buffer, measured in transitions 12 | T (int): the time horizon for episodes 13 | sampler (class): sampler class used to sample from buffer 14 | """ 15 | self.buffer_shapes = buffer_shapes 16 | self.size = size_in_transitions // T # size in episodes 17 | self.T = T 18 | self.sampler = sampler 19 | # self.buffers is {key: array(size_in_episodes x T or T+1 x dim_key)} 20 | self.buffers = {key: np.empty([self.size, *shape]) for key, shape in buffer_shapes.items()} 21 | # memory management 22 | self.point = 0 23 | self.current_size = 0 24 | self.n_transitions_stored = 0 25 | self.lock = threading.Lock() 26 | 27 | @property 28 | def full(self): 29 | with self.lock: 30 | return self.current_size == self.size 31 | 32 | def sample(self): 33 | """Returns a dict {key: array(batch_size x shapes[key])} 34 | """ 35 | buffers = {} 36 | with self.lock: 37 | assert self.current_size > 0 38 | for key in self.buffers.keys(): 39 | buffers[key] = self.buffers[key][:self.current_size] 40 | # make o_2 and ag_2 41 | if 'o_2' not in buffers and 'ag_2' not in buffers: 42 | buffers['o_2'] = buffers['o'][:, 1:, :] 43 | buffers['ag_2'] = buffers['ag'][:, 1:, :] 44 | transitions = self.sampler.sample(buffers) 45 | return transitions 46 | 47 | def store_episode(self, episode_batch): 48 | """episode_batch: array(rollout_batch_size x (T or T+1) x dim_key)""" 49 | buffer_sizes = [len(episode_batch[key]) for key in episode_batch.keys()] 50 | assert np.all(np.array(buffer_sizes) == buffer_sizes[0]) 51 | buffer_size = buffer_sizes[0] 52 | with self.lock: 53 | idxs = self._get_storage_idx(buffer_size) #use ordered idx get lower performance 54 | # load inputs into buffers 55 | for key in episode_batch.keys(): 56 | if key in self.buffers: 57 | self.buffers[key][idxs] = episode_batch[key] 58 | self.n_transitions_stored += buffer_size * self.T 59 | return idxs 60 | 61 | def get_current_episode_size(self): 62 | with self.lock: 63 | return self.current_size 64 | 65 | def get_current_size(self): 66 | with self.lock: 67 | return self.current_size * self.T 68 | 69 | def get_transitions_stored(self): 70 | with self.lock: 71 | return self.n_transitions_stored 72 | 73 | def clear_buffer(self): 74 | with self.lock: 75 | self.current_size = 0 76 | 77 | # if full, insert randomly 78 | def _get_storage_idx(self, inc=None): 79 | inc = inc or 1 # size increment 80 | assert inc <= self.size, "Batch committed to replay is too large!" 81 | # go consecutively until you hit the end, and then go randomly. 82 | if self.current_size+inc <= self.size: 83 | idx = np.arange(self.current_size, self.current_size+inc) 84 | elif self.current_size < self.size: 85 | overflow = inc - (self.size - self.current_size) 86 | idx_a = np.arange(self.current_size, self.size) 87 | idx_b = np.random.randint(0, self.current_size, overflow) 88 | idx = np.concatenate([idx_a, idx_b]) 89 | else: 90 | idx = np.random.randint(0, self.size, inc) 91 | # update replay size 92 | self.current_size = min(self.size, self.current_size+inc) 93 | 94 | if inc == 1: 95 | idx = idx[0] 96 | return idx 97 | 98 | # if full, insert in order 99 | def _get_ordered_storage_idx(self, inc=None): 100 | inc = inc or 1 # size increment 101 | assert inc <= self.size, "Batch committed to replay is too large!" 
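# Ordered (FIFO) variant: write indices continue from self.point and wrap around to the
# start of the buffer once they run past the end, so the oldest episodes are overwritten
# first. This differs from _get_storage_idx above, which overwrites uniformly random
# episodes once the buffer is full.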
102 | 103 | if self.point+inc <= self.size - 1: 104 | idx = np.arange(self.point, self.point + inc) 105 | else: 106 | overflow = inc - (self.size - self.point) 107 | idx_a = np.arange(self.point, self.size) 108 | idx_b = np.arange(0, overflow) 109 | idx = np.concatenate([idx_a, idx_b]) 110 | 111 | self.point = (self.point + inc) % self.size 112 | 113 | # update replay size, don't add when it already surpass self.size 114 | if self.current_size < self.size: 115 | self.current_size = min(self.size, self.current_size+inc) 116 | 117 | if inc == 1: 118 | idx = idx[0] 119 | return idx 120 | -------------------------------------------------------------------------------- /mher/common/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa F403 2 | from mher.common.console_util import * 3 | from mher.common.dataset import Dataset 4 | from mher.common.math_util import * 5 | from mher.common.misc_util import * 6 | -------------------------------------------------------------------------------- /mher/common/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/atari_wrappers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/atari_wrappers.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/cmd_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/cmd_util.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/console_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/console_util.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/dataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/dataset.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/import_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/import_util.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/init_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/init_utils.cpython-36.pyc 
-------------------------------------------------------------------------------- /mher/common/__pycache__/logger.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/logger.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/math_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/math_util.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/misc_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/misc_util.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/monitor.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/monitor.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/mpi_adam.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/mpi_adam.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/mpi_moments.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/mpi_moments.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/retro_wrappers.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/retro_wrappers.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/tf_util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/tf_util.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/tile_images.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/tile_images.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/__pycache__/wrappers.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/__pycache__/wrappers.cpython-36.pyc -------------------------------------------------------------------------------- /mher/common/cg.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | def cg(f_Ax, b, cg_iters=10, callback=None, verbose=False, residual_tol=1e-10): 3 | """ 4 | Demmel p 312 5 | """ 6 | p = b.copy() 7 | r = b.copy() 8 | x = np.zeros_like(b) 9 | rdotr = r.dot(r) 10 | 11 | fmtstr = "%10i %10.3g %10.3g" 12 | titlestr = "%10s %10s %10s" 13 | if verbose: print(titlestr % ("iter", "residual norm", "soln norm")) 14 | 15 | for i in range(cg_iters): 16 | if callback is not None: 17 | callback(x) 18 | if verbose: print(fmtstr % (i, rdotr, np.linalg.norm(x))) 19 | z = f_Ax(p) 20 | v = rdotr / p.dot(z) 21 | x += v*p 22 | r -= v*z 23 | newrdotr = r.dot(r) 24 | mu = newrdotr/rdotr 25 | p = r + mu*p 26 | 27 | rdotr = newrdotr 28 | if rdotr < residual_tol: 29 | break 30 | 31 | if callback is not None: 32 | callback(x) 33 | if verbose: print(fmtstr % (i+1, rdotr, np.linalg.norm(x))) # pylint: disable=W0631 34 | return x 35 | -------------------------------------------------------------------------------- /mher/common/cmd_util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for command line 3 | """ 4 | import os 5 | import gym 6 | import argparse 7 | from mher.common import logger 8 | 9 | def common_arg_parser(): 10 | """ 11 | Create common used argparses for training 12 | """ 13 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 14 | parser.add_argument('--env', help='environment ID', type=str, default='FetchReach-v1') 15 | parser.add_argument('--seed', help='set seed', type=int, default=None) 16 | parser.add_argument('--alg', help='Algorithm', type=str, default='her') 17 | parser.add_argument('--random_init', help='Random init epochs before training',default=0, type=int) 18 | parser.add_argument('--num_epoch', type=int, default=100) 19 | parser.add_argument('--num_timesteps', type=float, default=1e6) 20 | parser.add_argument('--network', help='network type (mlp, cnn, lstm, cnn_lstm, conv_only)', default='mlp', type=str) 21 | parser.add_argument('--num_env', help='Number of environment being run in parallel. 
Default set to 1', default=1, type=int) 22 | parser.add_argument('--save_path', help='Path to save trained model to', default=None, type=str) 23 | parser.add_argument('--policy_save_interval', default=10, type=int) 24 | parser.add_argument('--load_path', help='Path to load trained model to', default=None, type=str) 25 | parser.add_argument('--log_path', help='Directory to save learning curve data.', default=None, type=str) 26 | parser.add_argument('--play_episodes', help='Number of episodes to play after training', default=1, type=int) 27 | parser.add_argument('--play_no_training', default=False, action='store_true') 28 | return parser 29 | 30 | def parse_unknown_args(args): 31 | """ 32 | Parse arguments not consumed by arg parser into a dictionary 33 | """ 34 | retval = {} 35 | preceded_by_key = False 36 | for arg in args: 37 | if arg.startswith('--'): 38 | if '=' in arg: 39 | key = arg.split('=')[0][2:] 40 | value = arg.split('=')[1] 41 | retval[key] = value 42 | else: 43 | key = arg[2:] 44 | preceded_by_key = True 45 | elif preceded_by_key: 46 | retval[key] = arg 47 | preceded_by_key = False 48 | 49 | return retval 50 | 51 | 52 | def parse_cmdline_kwargs(args): 53 | ''' 54 | convert a list of '='-spaced command-line arguments to a dictionary, evaluating python objects when possible 55 | ''' 56 | def parse(v): 57 | assert isinstance(v, str) 58 | try: 59 | return eval(v) 60 | except (NameError, SyntaxError): 61 | return v 62 | return {k: parse(v) for k,v in parse_unknown_args(args).items()} 63 | 64 | def preprocess_kwargs(args): 65 | arg_parser = common_arg_parser() 66 | args, unknown_args = arg_parser.parse_known_args(args) 67 | extra_args = parse_cmdline_kwargs(unknown_args) 68 | return args, extra_args -------------------------------------------------------------------------------- /mher/common/console_util.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from contextlib import contextmanager 3 | import numpy as np 4 | import time 5 | import shlex 6 | import subprocess 7 | 8 | # ================================================================ 9 | # Misc 10 | # ================================================================ 11 | 12 | def fmt_row(width, row, header=False): 13 | out = " | ".join(fmt_item(x, width) for x in row) 14 | if header: out = out + "\n" + "-"*len(out) 15 | return out 16 | 17 | def fmt_item(x, l): 18 | if isinstance(x, np.ndarray): 19 | assert x.ndim==0 20 | x = x.item() 21 | if isinstance(x, (float, np.float32, np.float64)): 22 | v = abs(x) 23 | if (v < 1e-4 or v > 1e+4) and v > 0: 24 | rep = "%7.2e" % x 25 | else: 26 | rep = "%7.5f" % x 27 | else: rep = str(x) 28 | return " "*(l - len(rep)) + rep 29 | 30 | color2num = dict( 31 | gray=30, 32 | red=31, 33 | green=32, 34 | yellow=33, 35 | blue=34, 36 | magenta=35, 37 | cyan=36, 38 | white=37, 39 | crimson=38 40 | ) 41 | 42 | def colorize(string, color='green', bold=False, highlight=False): 43 | attr = [] 44 | num = color2num[color] 45 | if highlight: num += 10 46 | attr.append(str(num)) 47 | if bold: attr.append('1') 48 | return '\x1b[%sm%s\x1b[0m' % (';'.join(attr), string) 49 | 50 | def print_cmd(cmd, dry=False): 51 | if isinstance(cmd, str): # for shell=True 52 | pass 53 | else: 54 | cmd = ' '.join(shlex.quote(arg) for arg in cmd) 55 | print(colorize(('CMD: ' if not dry else 'DRY: ') + cmd)) 56 | 57 | 58 | def get_git_commit(cwd=None): 59 | return subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'], 
cwd=cwd).decode('utf8') 60 | 61 | def get_git_commit_message(cwd=None): 62 | return subprocess.check_output(['git', 'show', '-s', '--format=%B', 'HEAD'], cwd=cwd).decode('utf8') 63 | 64 | def ccap(cmd, dry=False, env=None, **kwargs): 65 | print_cmd(cmd, dry) 66 | if not dry: 67 | subprocess.check_call(cmd, env=env, **kwargs) 68 | 69 | 70 | MESSAGE_DEPTH = 0 71 | 72 | @contextmanager 73 | def timed(msg): 74 | global MESSAGE_DEPTH #pylint: disable=W0603 75 | print(colorize('\t'*MESSAGE_DEPTH + '=: ' + msg, color='magenta')) 76 | tstart = time.time() 77 | MESSAGE_DEPTH += 1 78 | yield 79 | MESSAGE_DEPTH -= 1 80 | print(colorize('\t'*MESSAGE_DEPTH + "done in %.3f seconds"%(time.time() - tstart), color='magenta')) 81 | -------------------------------------------------------------------------------- /mher/common/dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class Dataset(object): 4 | def __init__(self, data_map, deterministic=False, shuffle=True): 5 | self.data_map = data_map 6 | self.deterministic = deterministic 7 | self.enable_shuffle = shuffle 8 | self.n = next(iter(data_map.values())).shape[0] 9 | self._next_id = 0 10 | self.shuffle() 11 | 12 | def shuffle(self): 13 | if self.deterministic: 14 | return 15 | perm = np.arange(self.n) 16 | np.random.shuffle(perm) 17 | 18 | for key in self.data_map: 19 | self.data_map[key] = self.data_map[key][perm] 20 | 21 | self._next_id = 0 22 | 23 | def next_batch(self, batch_size): 24 | if self._next_id >= self.n and self.enable_shuffle: 25 | self.shuffle() 26 | 27 | cur_id = self._next_id 28 | cur_batch_size = min(batch_size, self.n - self._next_id) 29 | self._next_id += cur_batch_size 30 | 31 | data_map = dict() 32 | for key in self.data_map: 33 | data_map[key] = self.data_map[key][cur_id:cur_id+cur_batch_size] 34 | return data_map 35 | 36 | def iterate_once(self, batch_size): 37 | if self.enable_shuffle: self.shuffle() 38 | 39 | while self._next_id <= self.n - batch_size: 40 | yield self.next_batch(batch_size) 41 | self._next_id = 0 42 | 43 | def subset(self, num_elements, deterministic=True): 44 | data_map = dict() 45 | for key in self.data_map: 46 | data_map[key] = self.data_map[key][:num_elements] 47 | return Dataset(data_map, deterministic) 48 | 49 | 50 | def iterbatches(arrays, *, num_batches=None, batch_size=None, shuffle=True, include_final_partial_batch=True): 51 | assert (num_batches is None) != (batch_size is None), 'Provide num_batches or batch_size, but not both' 52 | arrays = tuple(map(np.asarray, arrays)) 53 | n = arrays[0].shape[0] 54 | assert all(a.shape[0] == n for a in arrays[1:]) 55 | inds = np.arange(n) 56 | if shuffle: np.random.shuffle(inds) 57 | sections = np.arange(0, n, batch_size)[1:] if num_batches is None else num_batches 58 | for batch_inds in np.array_split(inds, sections): 59 | if include_final_partial_batch or len(batch_inds) == batch_size: 60 | yield tuple(a[batch_inds] for a in arrays) 61 | -------------------------------------------------------------------------------- /mher/common/import_util.py: -------------------------------------------------------------------------------- 1 | from importlib import import_module 2 | 3 | 4 | def get_alg_module(alg, submodule=None): 5 | submodule = submodule or alg 6 | try: 7 | # first try to import the alg module from mher 8 | alg_module = import_module('.'.join(['mher', alg, submodule])) 9 | except ImportError: 10 | # then from rl_algs 11 | alg_module = import_module('.'.join(['rl_' + 'algs', alg, 
submodule])) 12 | 13 | return alg_module -------------------------------------------------------------------------------- /mher/common/init_utils.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from collections import defaultdict 3 | 4 | def init_mpi_import(): 5 | ''' 6 | import mpi used for multi-process training 7 | ''' 8 | try: 9 | from mpi4py import MPI 10 | except ImportError: 11 | MPI = None 12 | return MPI 13 | 14 | 15 | def init_environment_import(): 16 | ''' 17 | import required environment code base 18 | ''' 19 | # try: 20 | # import pybullet_envs 21 | # except ImportError: 22 | # pybullet_envs = None 23 | 24 | # try: 25 | # import roboschool 26 | # except ImportError: 27 | # roboschool = None 28 | 29 | # support mulitworld 30 | # try: 31 | # import multiworld 32 | # multiworld.register_all_envs() 33 | # except ImportError: 34 | # multiworld = None 35 | 36 | _game_envs = defaultdict(set) 37 | for env in gym.envs.registry.all(): 38 | # TODO: solve this with regexes 39 | try: 40 | env_type = env.entry_point.split(':')[0].split('.')[-1] 41 | _game_envs[env_type].add(env.id) 42 | except: 43 | pass 44 | return _game_envs -------------------------------------------------------------------------------- /mher/common/input.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from gym.spaces import Discrete, Box, MultiDiscrete 4 | 5 | def observation_placeholder(ob_space, batch_size=None, name='Ob'): 6 | ''' 7 | Create placeholder to feed observations into of the size appropriate to the observation space 8 | 9 | Parameters: 10 | ---------- 11 | 12 | ob_space: gym.Space observation space 13 | 14 | batch_size: int size of the batch to be fed into input. Can be left None in most cases. 15 | 16 | name: str name of the placeholder 17 | 18 | Returns: 19 | ------- 20 | 21 | tensorflow placeholder tensor 22 | ''' 23 | 24 | assert isinstance(ob_space, Discrete) or isinstance(ob_space, Box) or isinstance(ob_space, MultiDiscrete), \ 25 | 'Can only deal with Discrete and Box observation spaces for now' 26 | 27 | dtype = ob_space.dtype 28 | if dtype == np.int8: 29 | dtype = np.uint8 30 | 31 | return tf.placeholder(shape=(batch_size,) + ob_space.shape, dtype=dtype, name=name) 32 | 33 | 34 | def observation_input(ob_space, batch_size=None, name='Ob'): 35 | ''' 36 | Create placeholder to feed observations into of the size appropriate to the observation space, and add input 37 | encoder of the appropriate type. 
38 | ''' 39 | 40 | placeholder = observation_placeholder(ob_space, batch_size, name) 41 | return placeholder, encode_observation(ob_space, placeholder) 42 | 43 | def encode_observation(ob_space, placeholder): 44 | ''' 45 | Encode input in the way that is appropriate to the observation space 46 | 47 | Parameters: 48 | ---------- 49 | 50 | ob_space: gym.Space observation space 51 | 52 | placeholder: tf.placeholder observation input placeholder 53 | ''' 54 | if isinstance(ob_space, Discrete): 55 | return tf.to_float(tf.one_hot(placeholder, ob_space.n)) 56 | elif isinstance(ob_space, Box): 57 | return tf.to_float(placeholder) 58 | elif isinstance(ob_space, MultiDiscrete): 59 | placeholder = tf.cast(placeholder, tf.int32) 60 | one_hots = [tf.to_float(tf.one_hot(placeholder[..., i], ob_space.nvec[i])) for i in range(placeholder.shape[-1])] 61 | return tf.concat(one_hots, axis=-1) 62 | else: 63 | raise NotImplementedError 64 | 65 | -------------------------------------------------------------------------------- /mher/common/math_util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.signal 3 | 4 | 5 | def discount(x, gamma): 6 | """ 7 | computes discounted sums along 0th dimension of x. 8 | 9 | inputs 10 | ------ 11 | x: ndarray 12 | gamma: float 13 | 14 | outputs 15 | ------- 16 | y: ndarray with same shape as x, satisfying 17 | 18 | y[t] = x[t] + gamma*x[t+1] + gamma^2*x[t+2] + ... + gamma^k x[t+k], 19 | where k = len(x) - t - 1 20 | 21 | """ 22 | assert x.ndim >= 1 23 | return scipy.signal.lfilter([1],[1,-gamma],x[::-1], axis=0)[::-1] 24 | 25 | def explained_variance(ypred,y): 26 | """ 27 | Computes fraction of variance that ypred explains about y. 28 | Returns 1 - Var[y-ypred] / Var[y] 29 | 30 | interpretation: 31 | ev=0 => might as well have predicted zero 32 | ev=1 => perfect prediction 33 | ev<0 => worse than just predicting zero 34 | 35 | """ 36 | assert y.ndim == 1 and ypred.ndim == 1 37 | vary = np.var(y) 38 | return np.nan if vary==0 else 1 - np.var(y-ypred)/vary 39 | 40 | def explained_variance_2d(ypred, y): 41 | assert y.ndim == 2 and ypred.ndim == 2 42 | vary = np.var(y, axis=0) 43 | out = 1 - np.var(y-ypred)/vary 44 | out[vary < 1e-10] = 0 45 | return out 46 | 47 | def ncc(ypred, y): 48 | return np.corrcoef(ypred, y)[1,0] 49 | 50 | def flatten_arrays(arrs): 51 | return np.concatenate([arr.flat for arr in arrs]) 52 | 53 | def unflatten_vector(vec, shapes): 54 | i=0 55 | arrs = [] 56 | for shape in shapes: 57 | size = np.prod(shape) 58 | arr = vec[i:i+size].reshape(shape) 59 | arrs.append(arr) 60 | i += size 61 | return arrs 62 | 63 | def discount_with_boundaries(X, New, gamma): 64 | """ 65 | X: 2d array of floats, time x features 66 | New: 2d array of bools, indicating when a new episode has started 67 | """ 68 | Y = np.zeros_like(X) 69 | T = X.shape[0] 70 | Y[T-1] = X[T-1] 71 | for t in range(T-2, -1, -1): 72 | Y[t] = X[t] + gamma * Y[t+1] * (1 - New[t+1]) 73 | return Y 74 | 75 | def test_discount_with_boundaries(): 76 | gamma=0.9 77 | x = np.array([1.0, 2.0, 3.0, 4.0], 'float32') 78 | starts = [1.0, 0.0, 0.0, 1.0] 79 | y = discount_with_boundaries(x, starts, gamma) 80 | assert np.allclose(y, [ 81 | 1 + gamma * 2 + gamma**2 * 3, 82 | 2 + gamma * 3, 83 | 3, 84 | 4 85 | ]) 86 | -------------------------------------------------------------------------------- /mher/common/mpi_adam.py: -------------------------------------------------------------------------------- 1 | import mher.common.tf_util as U 2 | import 
tensorflow as tf 3 | import numpy as np 4 | try: 5 | from mpi4py import MPI 6 | except ImportError: 7 | MPI = None 8 | 9 | 10 | class MpiAdam(object): 11 | def __init__(self, var_list, *, beta1=0.9, beta2=0.999, epsilon=1e-08, scale_grad_by_procs=True, comm=None): 12 | self.var_list = var_list 13 | self.beta1 = beta1 14 | self.beta2 = beta2 15 | self.epsilon = epsilon 16 | self.scale_grad_by_procs = scale_grad_by_procs 17 | size = sum(U.numel(v) for v in var_list) 18 | self.m = np.zeros(size, 'float32') 19 | self.v = np.zeros(size, 'float32') 20 | self.t = 0 21 | self.setfromflat = U.SetFromFlat(var_list) 22 | self.getflat = U.GetFlat(var_list) 23 | self.comm = MPI.COMM_WORLD if comm is None and MPI is not None else comm 24 | 25 | def update(self, localg, stepsize): 26 | if self.t % 100 == 0: 27 | self.check_synced() 28 | localg = localg.astype('float32') 29 | if self.comm is not None: 30 | globalg = np.zeros_like(localg) 31 | self.comm.Allreduce(localg, globalg, op=MPI.SUM) 32 | if self.scale_grad_by_procs: 33 | globalg /= self.comm.Get_size() 34 | else: 35 | globalg = np.copy(localg) 36 | 37 | self.t += 1 38 | a = stepsize * np.sqrt(1 - self.beta2**self.t)/(1 - self.beta1**self.t) 39 | self.m = self.beta1 * self.m + (1 - self.beta1) * globalg 40 | self.v = self.beta2 * self.v + (1 - self.beta2) * (globalg * globalg) 41 | step = (- a) * self.m / (np.sqrt(self.v) + self.epsilon) 42 | self.setfromflat(self.getflat() + step) 43 | 44 | def sync(self): 45 | if self.comm is None: 46 | return 47 | theta = self.getflat() 48 | self.comm.Bcast(theta, root=0) 49 | self.setfromflat(theta) 50 | 51 | def check_synced(self): 52 | if self.comm is None: 53 | return 54 | if self.comm.Get_rank() == 0: # this is root 55 | theta = self.getflat() 56 | self.comm.Bcast(theta, root=0) 57 | else: 58 | thetalocal = self.getflat() 59 | thetaroot = np.empty_like(thetalocal) 60 | self.comm.Bcast(thetaroot, root=0) 61 | assert (thetaroot == thetalocal).all(), (thetaroot, thetalocal) 62 | 63 | @U.in_session 64 | def test_MpiAdam(): 65 | np.random.seed(0) 66 | tf.set_random_seed(0) 67 | 68 | a = tf.Variable(np.random.randn(3).astype('float32')) 69 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 70 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 71 | 72 | stepsize = 1e-2 73 | update_op = tf.train.AdamOptimizer(stepsize).minimize(loss) 74 | do_update = U.function([], loss, updates=[update_op]) 75 | 76 | tf.get_default_session().run(tf.global_variables_initializer()) 77 | losslist_ref = [] 78 | for i in range(10): 79 | l = do_update() 80 | print(i, l) 81 | losslist_ref.append(l) 82 | 83 | 84 | 85 | tf.set_random_seed(0) 86 | tf.get_default_session().run(tf.global_variables_initializer()) 87 | 88 | var_list = [a,b] 89 | lossandgrad = U.function([], [loss, U.flatgrad(loss, var_list)]) 90 | adam = MpiAdam(var_list) 91 | 92 | losslist_test = [] 93 | for i in range(10): 94 | l,g = lossandgrad() 95 | adam.update(g, stepsize) 96 | print(i,l) 97 | losslist_test.append(l) 98 | 99 | np.testing.assert_allclose(np.array(losslist_ref), np.array(losslist_test), atol=1e-4) 100 | 101 | 102 | if __name__ == '__main__': 103 | test_MpiAdam() 104 | -------------------------------------------------------------------------------- /mher/common/mpi_adam_optimizer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | from mher.common import tf_util as U 4 | from mher.common.tests.test_with_mpi import with_mpi 5 | from mher import logger 6 
| try: 7 | from mpi4py import MPI 8 | except ImportError: 9 | MPI = None 10 | 11 | class MpiAdamOptimizer(tf.train.AdamOptimizer): 12 | """Adam optimizer that averages gradients across mpi processes.""" 13 | def __init__(self, comm, grad_clip=None, mpi_rank_weight=1, **kwargs): 14 | self.comm = comm 15 | self.grad_clip = grad_clip 16 | self.mpi_rank_weight = mpi_rank_weight 17 | tf.train.AdamOptimizer.__init__(self, **kwargs) 18 | def compute_gradients(self, loss, var_list, **kwargs): 19 | grads_and_vars = tf.train.AdamOptimizer.compute_gradients(self, loss, var_list, **kwargs) 20 | grads_and_vars = [(g, v) for g, v in grads_and_vars if g is not None] 21 | flat_grad = tf.concat([tf.reshape(g, (-1,)) for g, v in grads_and_vars], axis=0) * self.mpi_rank_weight 22 | shapes = [v.shape.as_list() for g, v in grads_and_vars] 23 | sizes = [int(np.prod(s)) for s in shapes] 24 | 25 | total_weight = np.zeros(1, np.float32) 26 | self.comm.Allreduce(np.array([self.mpi_rank_weight], dtype=np.float32), total_weight, op=MPI.SUM) 27 | total_weight = total_weight[0] 28 | 29 | buf = np.zeros(sum(sizes), np.float32) 30 | countholder = [0] # Counts how many times _collect_grads has been called 31 | stat = tf.reduce_sum(grads_and_vars[0][1]) # sum of first variable 32 | def _collect_grads(flat_grad, np_stat): 33 | if self.grad_clip is not None: 34 | gradnorm = np.linalg.norm(flat_grad) 35 | if gradnorm > 1: 36 | flat_grad /= gradnorm 37 | logger.logkv_mean('gradnorm', gradnorm) 38 | logger.logkv_mean('gradclipfrac', float(gradnorm > 1)) 39 | self.comm.Allreduce(flat_grad, buf, op=MPI.SUM) 40 | np.divide(buf, float(total_weight), out=buf) 41 | if countholder[0] % 100 == 0: 42 | check_synced(np_stat, self.comm) 43 | countholder[0] += 1 44 | return buf 45 | 46 | avg_flat_grad = tf.py_func(_collect_grads, [flat_grad, stat], tf.float32) 47 | avg_flat_grad.set_shape(flat_grad.shape) 48 | avg_grads = tf.split(avg_flat_grad, sizes, axis=0) 49 | avg_grads_and_vars = [(tf.reshape(g, v.shape), v) 50 | for g, (_, v) in zip(avg_grads, grads_and_vars)] 51 | return avg_grads_and_vars 52 | 53 | def check_synced(localval, comm=None): 54 | """ 55 | It's common to forget to initialize your variables to the same values, or 56 | (less commonly) if you update them in some other way than adam, to get them out of sync. 
57 | This function checks that variables on all MPI workers are the same, and raises 58 | an AssertionError otherwise 59 | 60 | Arguments: 61 | comm: MPI communicator 62 | localval: list of local variables (list of variables on current worker to be compared with the other workers) 63 | """ 64 | comm = comm or MPI.COMM_WORLD 65 | vals = comm.gather(localval) 66 | if comm.rank == 0: 67 | assert all(val==vals[0] for val in vals[1:]),\ 68 | 'MpiAdamOptimizer detected that different workers have different weights: {}'.format(vals) 69 | 70 | @with_mpi(timeout=5) 71 | def test_nonfreeze(): 72 | np.random.seed(0) 73 | tf.set_random_seed(0) 74 | 75 | a = tf.Variable(np.random.randn(3).astype('float32')) 76 | b = tf.Variable(np.random.randn(2,5).astype('float32')) 77 | loss = tf.reduce_sum(tf.square(a)) + tf.reduce_sum(tf.sin(b)) 78 | 79 | stepsize = 1e-2 80 | # for some reason the session config with inter_op_parallelism_threads was causing 81 | # nested sess.run calls to freeze 82 | config = tf.ConfigProto(inter_op_parallelism_threads=1) 83 | sess = U.get_session(config=config) 84 | update_op = MpiAdamOptimizer(comm=MPI.COMM_WORLD, learning_rate=stepsize).minimize(loss) 85 | sess.run(tf.global_variables_initializer()) 86 | losslist_ref = [] 87 | for i in range(100): 88 | l,_ = sess.run([loss, update_op]) 89 | print(i, l) 90 | losslist_ref.append(l) 91 | -------------------------------------------------------------------------------- /mher/common/mpi_fork.py: -------------------------------------------------------------------------------- 1 | import os, subprocess, sys 2 | 3 | def mpi_fork(n, bind_to_core=False): 4 | """Re-launches the current script with workers 5 | Returns "parent" for original parent, "child" for MPI children 6 | """ 7 | if n<=1: 8 | return "child" 9 | if os.getenv("IN_MPI") is None: 10 | env = os.environ.copy() 11 | env.update( 12 | MKL_NUM_THREADS="1", 13 | OMP_NUM_THREADS="1", 14 | IN_MPI="1" 15 | ) 16 | args = ["mpirun", "-np", str(n)] 17 | if bind_to_core: 18 | args += ["-bind-to", "core"] 19 | args += [sys.executable] + sys.argv 20 | subprocess.check_call(args, env=env) 21 | return "parent" 22 | else: 23 | return "child" 24 | -------------------------------------------------------------------------------- /mher/common/mpi_moments.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | import numpy as np 3 | from mher.common import zipsame 4 | 5 | 6 | def mpi_mean(x, axis=0, comm=None, keepdims=False): 7 | x = np.asarray(x) 8 | assert x.ndim > 0 9 | if comm is None: comm = MPI.COMM_WORLD 10 | xsum = x.sum(axis=axis, keepdims=keepdims) 11 | n = xsum.size 12 | localsum = np.zeros(n+1, x.dtype) 13 | localsum[:n] = xsum.ravel() 14 | localsum[n] = x.shape[axis] 15 | # globalsum = np.zeros_like(localsum) 16 | # comm.Allreduce(localsum, globalsum, op=MPI.SUM) 17 | globalsum = comm.allreduce(localsum, op=MPI.SUM) 18 | return globalsum[:n].reshape(xsum.shape) / globalsum[n], globalsum[n] 19 | 20 | def mpi_moments(x, axis=0, comm=None, keepdims=False): 21 | x = np.asarray(x) 22 | assert x.ndim > 0 23 | mean, count = mpi_mean(x, axis=axis, comm=comm, keepdims=True) 24 | sqdiffs = np.square(x - mean) 25 | meansqdiff, count1 = mpi_mean(sqdiffs, axis=axis, comm=comm, keepdims=True) 26 | assert count1 == count 27 | std = np.sqrt(meansqdiff) 28 | if not keepdims: 29 | newshape = mean.shape[:axis] + mean.shape[axis+1:] 30 | mean = mean.reshape(newshape) 31 | std = std.reshape(newshape) 32 | return mean, std, count 33 | 34 | 35 | 
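def _example_mpi_moments():
    # Illustrative usage sketch, not part of the original module: every MPI worker
    # passes its own local batch, and all ranks receive identical global statistics
    # (mean/std over the concatenation of all batches, plus the total sample count).
    # `local_x` is hypothetical data used only for the illustration.
    comm = MPI.COMM_WORLD
    local_x = np.random.randn(100, 8)
    mean, std, count = mpi_moments(local_x, axis=0, comm=comm)
    assert mean.shape == (8,) and std.shape == (8,)
    return mean, std, count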
def test_runningmeanstd(): 36 | import subprocess 37 | subprocess.check_call(['mpirun', '-np', '3', 38 | 'python','-c', 39 | 'from mher.common.mpi_moments import _helper_runningmeanstd; _helper_runningmeanstd()']) 40 | 41 | def _helper_runningmeanstd(): 42 | comm = MPI.COMM_WORLD 43 | np.random.seed(0) 44 | for (triple,axis) in [ 45 | ((np.random.randn(3), np.random.randn(4), np.random.randn(5)),0), 46 | ((np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)),0), 47 | ((np.random.randn(2,3), np.random.randn(2,4), np.random.randn(2,4)),1), 48 | ]: 49 | 50 | 51 | x = np.concatenate(triple, axis=axis) 52 | ms1 = [x.mean(axis=axis), x.std(axis=axis), x.shape[axis]] 53 | 54 | 55 | ms2 = mpi_moments(triple[comm.Get_rank()],axis=axis) 56 | 57 | for (a1,a2) in zipsame(ms1, ms2): 58 | print(a1, a2) 59 | assert np.allclose(a1, a2) 60 | print("ok!") 61 | 62 | -------------------------------------------------------------------------------- /mher/common/mpi_running_mean_std.py: -------------------------------------------------------------------------------- 1 | try: 2 | from mpi4py import MPI 3 | except ImportError: 4 | MPI = None 5 | 6 | import tensorflow as tf, mher.common.tf_util as U, numpy as np 7 | 8 | class RunningMeanStd(object): 9 | # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm 10 | def __init__(self, epsilon=1e-2, shape=()): 11 | 12 | self._sum = tf.get_variable( 13 | dtype=tf.float64, 14 | shape=shape, 15 | initializer=tf.constant_initializer(0.0), 16 | name="runningsum", trainable=False) 17 | self._sumsq = tf.get_variable( 18 | dtype=tf.float64, 19 | shape=shape, 20 | initializer=tf.constant_initializer(epsilon), 21 | name="runningsumsq", trainable=False) 22 | self._count = tf.get_variable( 23 | dtype=tf.float64, 24 | shape=(), 25 | initializer=tf.constant_initializer(epsilon), 26 | name="count", trainable=False) 27 | self.shape = shape 28 | 29 | self.mean = tf.to_float(self._sum / self._count) 30 | self.std = tf.sqrt( tf.maximum( tf.to_float(self._sumsq / self._count) - tf.square(self.mean) , 1e-2 )) 31 | 32 | newsum = tf.placeholder(shape=self.shape, dtype=tf.float64, name='sum') 33 | newsumsq = tf.placeholder(shape=self.shape, dtype=tf.float64, name='var') 34 | newcount = tf.placeholder(shape=[], dtype=tf.float64, name='count') 35 | self.incfiltparams = U.function([newsum, newsumsq, newcount], [], 36 | updates=[tf.assign_add(self._sum, newsum), 37 | tf.assign_add(self._sumsq, newsumsq), 38 | tf.assign_add(self._count, newcount)]) 39 | 40 | 41 | def update(self, x): 42 | x = x.astype('float64') 43 | n = int(np.prod(self.shape)) 44 | totalvec = np.zeros(n*2+1, 'float64') 45 | addvec = np.concatenate([x.sum(axis=0).ravel(), np.square(x).sum(axis=0).ravel(), np.array([len(x)],dtype='float64')]) 46 | if MPI is not None: 47 | MPI.COMM_WORLD.Allreduce(addvec, totalvec, op=MPI.SUM) 48 | self.incfiltparams(totalvec[0:n].reshape(self.shape), totalvec[n:2*n].reshape(self.shape), totalvec[2*n]) 49 | 50 | @U.in_session 51 | def test_runningmeanstd(): 52 | for (x1, x2, x3) in [ 53 | (np.random.randn(3), np.random.randn(4), np.random.randn(5)), 54 | (np.random.randn(3,2), np.random.randn(4,2), np.random.randn(5,2)), 55 | ]: 56 | 57 | rms = RunningMeanStd(epsilon=0.0, shape=x1.shape[1:]) 58 | U.initialize() 59 | 60 | x = np.concatenate([x1, x2, x3], axis=0) 61 | ms1 = [x.mean(axis=0), x.std(axis=0)] 62 | rms.update(x1) 63 | rms.update(x2) 64 | rms.update(x3) 65 | ms2 = [rms.mean.eval(), rms.std.eval()] 66 | 67 | assert np.allclose(ms1, ms2) 68 | 69 | 
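@U.in_session
def _example_running_mean_std():
    # Illustrative usage sketch, not part of the original module: maintain running
    # observation statistics and normalize a fresh batch with them. `.mean` and
    # `.std` are TF tensors, so they are read with .eval() inside the session
    # supplied by @U.in_session. Note that update() only folds data in through the
    # MPI Allreduce, so it is effectively a no-op when mpi4py is unavailable.
    rms = RunningMeanStd(epsilon=1e-2, shape=(4,))
    U.initialize()
    batch = np.random.randn(32, 4)  # hypothetical batch of 4-dimensional observations
    rms.update(batch)
    return (batch - rms.mean.eval()) / (rms.std.eval() + 1e-8)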
@U.in_session 70 | def test_dist(): 71 | np.random.seed(0) 72 | p1,p2,p3=(np.random.randn(3,1), np.random.randn(4,1), np.random.randn(5,1)) 73 | q1,q2,q3=(np.random.randn(6,1), np.random.randn(7,1), np.random.randn(8,1)) 74 | 75 | # p1,p2,p3=(np.random.randn(3), np.random.randn(4), np.random.randn(5)) 76 | # q1,q2,q3=(np.random.randn(6), np.random.randn(7), np.random.randn(8)) 77 | 78 | comm = MPI.COMM_WORLD 79 | assert comm.Get_size()==2 80 | if comm.Get_rank()==0: 81 | x1,x2,x3 = p1,p2,p3 82 | elif comm.Get_rank()==1: 83 | x1,x2,x3 = q1,q2,q3 84 | else: 85 | assert False 86 | 87 | rms = RunningMeanStd(epsilon=0.0, shape=(1,)) 88 | U.initialize() 89 | 90 | rms.update(x1) 91 | rms.update(x2) 92 | rms.update(x3) 93 | 94 | bigvec = np.concatenate([p1,p2,p3,q1,q2,q3]) 95 | 96 | def checkallclose(x,y): 97 | print(x,y) 98 | return np.allclose(x,y) 99 | 100 | assert checkallclose( 101 | bigvec.mean(axis=0), 102 | rms.mean.eval(), 103 | ) 104 | assert checkallclose( 105 | bigvec.std(axis=0), 106 | rms.std.eval(), 107 | ) 108 | 109 | 110 | if __name__ == "__main__": 111 | # Run with mpirun -np 2 python 112 | test_dist() 113 | -------------------------------------------------------------------------------- /mher/common/mpi_util.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import os, numpy as np 3 | import platform 4 | import shutil 5 | import subprocess 6 | import warnings 7 | import sys 8 | 9 | try: 10 | from mpi4py import MPI 11 | except ImportError: 12 | MPI = None 13 | 14 | 15 | def sync_from_root(sess, variables, comm=None): 16 | """ 17 | Send the root node's parameters to every worker. 18 | Arguments: 19 | sess: the TensorFlow session. 20 | variables: all parameter variables including optimizer's 21 | """ 22 | if comm is None: comm = MPI.COMM_WORLD 23 | import tensorflow as tf 24 | values = comm.bcast(sess.run(variables)) 25 | sess.run([tf.assign(var, val) 26 | for (var, val) in zip(variables, values)]) 27 | 28 | def gpu_count(): 29 | """ 30 | Count the GPUs on this machine. 31 | """ 32 | if shutil.which('nvidia-smi') is None: 33 | return 0 34 | output = subprocess.check_output(['nvidia-smi', '--query-gpu=gpu_name', '--format=csv']) 35 | return max(0, len(output.split(b'\n')) - 2) 36 | 37 | def setup_mpi_gpus(): 38 | """ 39 | Set CUDA_VISIBLE_DEVICES to MPI rank if not already set 40 | """ 41 | if 'CUDA_VISIBLE_DEVICES' not in os.environ: 42 | if sys.platform == 'darwin': # This Assumes if you're on OSX you're just 43 | ids = [] # doing a smoke test and don't want GPUs 44 | else: 45 | lrank, _lsize = get_local_rank_size(MPI.COMM_WORLD) 46 | ids = [lrank] 47 | os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(map(str, ids)) 48 | 49 | def get_local_rank_size(comm): 50 | """ 51 | Returns the rank of each process on its machine 52 | The processes on a given machine will be assigned ranks 53 | 0, 1, 2, ..., N-1, 54 | where N is the number of processes on this machine. 
55 | 56 | Useful if you want to assign one gpu per machine 57 | """ 58 | this_node = platform.node() 59 | ranks_nodes = comm.allgather((comm.Get_rank(), this_node)) 60 | node2rankssofar = defaultdict(int) 61 | local_rank = None 62 | for (rank, node) in ranks_nodes: 63 | if rank == comm.Get_rank(): 64 | local_rank = node2rankssofar[node] 65 | node2rankssofar[node] += 1 66 | assert local_rank is not None 67 | return local_rank, node2rankssofar[this_node] 68 | 69 | def share_file(comm, path): 70 | """ 71 | Copies the file from rank 0 to all other ranks 72 | Puts it in the same place on all machines 73 | """ 74 | localrank, _ = get_local_rank_size(comm) 75 | if comm.Get_rank() == 0: 76 | with open(path, 'rb') as fh: 77 | data = fh.read() 78 | comm.bcast(data) 79 | else: 80 | data = comm.bcast(None) 81 | if localrank == 0: 82 | os.makedirs(os.path.dirname(path), exist_ok=True) 83 | with open(path, 'wb') as fh: 84 | fh.write(data) 85 | comm.Barrier() 86 | 87 | def dict_gather(comm, d, op='mean', assert_all_have_data=True): 88 | """ 89 | Perform a reduction operation over dicts 90 | """ 91 | if comm is None: return d 92 | alldicts = comm.allgather(d) 93 | size = comm.size 94 | k2li = defaultdict(list) 95 | for d in alldicts: 96 | for (k,v) in d.items(): 97 | k2li[k].append(v) 98 | result = {} 99 | for (k,li) in k2li.items(): 100 | if assert_all_have_data: 101 | assert len(li)==size, "only %i out of %i MPI workers have sent '%s'" % (len(li), size, k) 102 | if op=='mean': 103 | result[k] = np.mean(li, axis=0) 104 | elif op=='sum': 105 | result[k] = np.sum(li, axis=0) 106 | else: 107 | assert 0, op 108 | return result 109 | 110 | def mpi_weighted_mean(comm, local_name2valcount): 111 | """ 112 | Perform a weighted average over dicts that are each on a different node 113 | Input: local_name2valcount: dict mapping key -> (value, count) 114 | Returns: key -> mean 115 | """ 116 | all_name2valcount = comm.gather(local_name2valcount) 117 | if comm.rank == 0: 118 | name2sum = defaultdict(float) 119 | name2count = defaultdict(float) 120 | for n2vc in all_name2valcount: 121 | for (name, (val, count)) in n2vc.items(): 122 | try: 123 | val = float(val) 124 | except ValueError: 125 | if comm.rank == 0: 126 | warnings.warn('WARNING: tried to compute mean on non-float {}={}'.format(name, val)) 127 | else: 128 | name2sum[name] += val * count 129 | name2count[name] += count 130 | return {name : name2sum[name] / name2count[name] for name in name2sum} 131 | else: 132 | return {} 133 | 134 | -------------------------------------------------------------------------------- /mher/common/plot/plot.py: -------------------------------------------------------------------------------- 1 | # DEPRECATED, use mher.common.plot_util instead 2 | 3 | import os 4 | import matplotlib.pyplot as plt 5 | import numpy as np 6 | import json 7 | import seaborn as sns; sns.set() 8 | import glob2 9 | import argparse 10 | 11 | 12 | def smooth_reward_curve(x, y): 13 | halfwidth = int(np.ceil(len(x) / 60)) # Halfwidth of our smoothing convolution 14 | k = halfwidth 15 | xsmoo = x 16 | ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='same') / np.convolve(np.ones_like(y), np.ones(2 * k + 1), 17 | mode='same') 18 | return xsmoo, ysmoo 19 | 20 | 21 | def load_results(file): 22 | if not os.path.exists(file): 23 | return None 24 | with open(file, 'r') as f: 25 | lines = [line for line in f] 26 | if len(lines) < 2: 27 | return None 28 | keys = [name.strip() for name in lines[0].split(',')] 29 | data = np.genfromtxt(file, delimiter=',', skip_header=1, 
filling_values=0.) 30 | if data.ndim == 1: 31 | data = data.reshape(1, -1) 32 | assert data.ndim == 2 33 | assert data.shape[-1] == len(keys) 34 | result = {} 35 | for idx, key in enumerate(keys): 36 | result[key] = data[:, idx] 37 | return result 38 | 39 | 40 | def pad(xs, value=np.nan): 41 | maxlen = np.max([len(x) for x in xs]) 42 | 43 | padded_xs = [] 44 | for x in xs: 45 | if x.shape[0] >= maxlen: 46 | padded_xs.append(x) 47 | 48 | padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value 49 | x_padded = np.concatenate([x, padding], axis=0) 50 | assert x_padded.shape[1:] == x.shape[1:] 51 | assert x_padded.shape[0] == maxlen 52 | padded_xs.append(x_padded) 53 | return np.array(padded_xs) 54 | 55 | 56 | parser = argparse.ArgumentParser() 57 | parser.add_argument('dir', type=str) 58 | parser.add_argument('--smooth', type=int, default=1) 59 | args = parser.parse_args() 60 | 61 | # Load all data. 62 | data = {} 63 | paths = [os.path.abspath(os.path.join(path, '..')) for path in glob2.glob(os.path.join(args.dir, '**', 'progress.csv'))] 64 | for curr_path in paths: 65 | if not os.path.isdir(curr_path): 66 | continue 67 | results = load_results(os.path.join(curr_path, 'progress.csv')) 68 | if not results: 69 | print('skipping {}'.format(curr_path)) 70 | continue 71 | print('loading {} ({})'.format(curr_path, len(results['epoch']))) 72 | with open(os.path.join(curr_path, 'params.json'), 'r') as f: 73 | params = json.load(f) 74 | 75 | success_rate = np.array(results['test/success_rate']) 76 | epoch = np.array(results['epoch']) + 1 77 | env_id = params['env_name'] 78 | replay_strategy = params['replay_strategy'] 79 | 80 | if replay_strategy == 'future': 81 | config = 'her' 82 | else: 83 | config = 'ddpg' 84 | if 'Dense' in env_id: 85 | config += '-dense' 86 | else: 87 | config += '-sparse' 88 | env_id = env_id.replace('Dense', '') 89 | 90 | # Process and smooth data. 91 | assert success_rate.shape == epoch.shape 92 | x = epoch 93 | y = success_rate 94 | if args.smooth: 95 | x, y = smooth_reward_curve(epoch, success_rate) 96 | assert x.shape == y.shape 97 | 98 | if env_id not in data: 99 | data[env_id] = {} 100 | if config not in data[env_id]: 101 | data[env_id][config] = [] 102 | data[env_id][config].append((x, y)) 103 | 104 | # Plot data. 
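# Each config is drawn as the across-run median success rate: runs are padded to a
# common length and the shaded band spans the 25th-75th percentiles.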
105 | for env_id in sorted(data.keys()): 106 | print('exporting {}'.format(env_id)) 107 | plt.clf() 108 | 109 | for config in sorted(data[env_id].keys()): 110 | xs, ys = zip(*data[env_id][config]) 111 | xs, ys = pad(xs), pad(ys) 112 | assert xs.shape == ys.shape 113 | 114 | plt.plot(xs[0], np.nanmedian(ys, axis=0), label=config) 115 | plt.fill_between(xs[0], np.nanpercentile(ys, 25, axis=0), np.nanpercentile(ys, 75, axis=0), alpha=0.25) 116 | plt.title(env_id) 117 | plt.xlabel('Epoch') 118 | plt.ylabel('Median Success Rate') 119 | plt.legend() 120 | plt.savefig(os.path.join(args.dir, 'fig_{}.png'.format(env_id))) 121 | -------------------------------------------------------------------------------- /mher/common/plot/results_plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib 3 | matplotlib.use('TkAgg') # Can change to 'Agg' for non-interactive mode 4 | 5 | import matplotlib.pyplot as plt 6 | plt.rcParams['svg.fonttype'] = 'none' 7 | 8 | from mher.common import plot_util 9 | 10 | X_TIMESTEPS = 'timesteps' 11 | X_EPISODES = 'episodes' 12 | X_WALLTIME = 'walltime_hrs' 13 | Y_REWARD = 'reward' 14 | Y_TIMESTEPS = 'timesteps' 15 | POSSIBLE_X_AXES = [X_TIMESTEPS, X_EPISODES, X_WALLTIME] 16 | EPISODES_WINDOW = 100 17 | COLORS = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'purple', 'pink', 18 | 'brown', 'orange', 'teal', 'coral', 'lightblue', 'lime', 'lavender', 'turquoise', 19 | 'darkgreen', 'tan', 'salmon', 'gold', 'darkred', 'darkblue'] 20 | 21 | def rolling_window(a, window): 22 | shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) 23 | strides = a.strides + (a.strides[-1],) 24 | return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) 25 | 26 | def window_func(x, y, window, func): 27 | yw = rolling_window(y, window) 28 | yw_func = func(yw, axis=-1) 29 | return x[window-1:], yw_func 30 | 31 | def ts2xy(ts, xaxis, yaxis): 32 | if xaxis == X_TIMESTEPS: 33 | x = np.cumsum(ts.l.values) 34 | elif xaxis == X_EPISODES: 35 | x = np.arange(len(ts)) 36 | elif xaxis == X_WALLTIME: 37 | x = ts.t.values / 3600. 
38 | else: 39 | raise NotImplementedError 40 | if yaxis == Y_REWARD: 41 | y = ts.r.values 42 | elif yaxis == Y_TIMESTEPS: 43 | y = ts.l.values 44 | else: 45 | raise NotImplementedError 46 | return x, y 47 | 48 | def plot_curves(xy_list, xaxis, yaxis, title): 49 | fig = plt.figure(figsize=(8,2)) 50 | maxx = max(xy[0][-1] for xy in xy_list) 51 | minx = 0 52 | for (i, (x, y)) in enumerate(xy_list): 53 | color = COLORS[i % len(COLORS)] 54 | plt.scatter(x, y, s=2) 55 | x, y_mean = window_func(x, y, EPISODES_WINDOW, np.mean) #So returns average of last EPISODE_WINDOW episodes 56 | plt.plot(x, y_mean, color=color) 57 | plt.xlim(minx, maxx) 58 | plt.title(title) 59 | plt.xlabel(xaxis) 60 | plt.ylabel(yaxis) 61 | plt.tight_layout() 62 | fig.canvas.mpl_connect('resize_event', lambda event: plt.tight_layout()) 63 | plt.grid(True) 64 | 65 | 66 | def split_by_task(taskpath): 67 | if type(taskpath) == dict: 68 | return taskpath['dirname'].split('/')[-1].split('-')[0] 69 | else: 70 | return taskpath.dirname.split('/')[-1].split('-')[0] 71 | 72 | def plot_results(dirs, num_timesteps=10e6, xaxis=X_TIMESTEPS, yaxis=Y_REWARD, title='', split_fn=split_by_task, average_group=False): 73 | results = plot_util.load_results(dirs) 74 | plot_util.plot_results(results, xy_fn=lambda r: ts2xy(r.monitor, xaxis, yaxis), split_fn=split_fn, resample=0 ,average_group=average_group ) 75 | 76 | # ['monitor'] resample=int(1e6) 77 | 78 | # Example usage in jupyter-notebook 79 | # from mher.results_plotter import plot_results 80 | # %matplotlib inline 81 | # plot_results("./log") 82 | # Here ./log is a directory containing the monitor.csv files 83 | 84 | def main(): 85 | import argparse 86 | import os 87 | parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) 88 | parser.add_argument('--dirs', help='List of log directories', nargs = '*', default=['./log']) 89 | parser.add_argument('--num_timesteps', type=int, default=int(10e6)) 90 | parser.add_argument('--xaxis', help = 'Varible on X-axis', default = X_TIMESTEPS) 91 | parser.add_argument('--yaxis', help = 'Varible on Y-axis', default = Y_REWARD) 92 | parser.add_argument('--task_name', help = 'Title of plot', default = 'Breakout') 93 | parser.add_argument('--average_group', help = 'group of point on the X-axis', type=bool, default = False) 94 | args = parser.parse_args() 95 | args.dirs = [os.path.abspath(dir) for dir in args.dirs] 96 | plot_results(args.dirs, args.num_timesteps, args.xaxis, args.yaxis, args.task_name, average_group=args.average_group) 97 | plt.show() 98 | 99 | if __name__ == '__main__': 100 | main() 101 | -------------------------------------------------------------------------------- /mher/common/runners.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import ABC, abstractmethod 3 | 4 | class AbstractEnvRunner(ABC): 5 | def __init__(self, *, env, model, nsteps): 6 | self.env = env 7 | self.model = model 8 | self.nenv = nenv = env.num_envs if hasattr(env, 'num_envs') else 1 9 | self.batch_ob_shape = (nenv*nsteps,) + env.observation_space.shape 10 | self.obs = np.zeros((nenv,) + env.observation_space.shape, dtype=env.observation_space.dtype.name) 11 | self.obs[:] = env.reset() 12 | self.nsteps = nsteps 13 | self.states = model.initial_state 14 | self.dones = [False for _ in range(nenv)] 15 | 16 | @abstractmethod 17 | def run(self): 18 | raise NotImplementedError 19 | 20 | -------------------------------------------------------------------------------- 
/mher/common/schedules.py: -------------------------------------------------------------------------------- 1 | """This file is used for specifying various schedules that evolve over 2 | time throughout the execution of the algorithm, such as: 3 | - learning rate for the optimizer 4 | - exploration epsilon for the epsilon greedy exploration strategy 5 | - beta parameter for beta parameter in prioritized replay 6 | 7 | Each schedule has a function `value(t)` which returns the current value 8 | of the parameter given the timestep t of the optimization procedure. 9 | """ 10 | 11 | 12 | class Schedule(object): 13 | def value(self, t): 14 | """Value of the schedule at time t""" 15 | raise NotImplementedError() 16 | 17 | 18 | class ConstantSchedule(object): 19 | def __init__(self, value): 20 | """Value remains constant over time. 21 | 22 | Parameters 23 | ---------- 24 | value: float 25 | Constant value of the schedule 26 | """ 27 | self._v = value 28 | 29 | def value(self, t): 30 | """See Schedule.value""" 31 | return self._v 32 | 33 | 34 | def linear_interpolation(l, r, alpha): 35 | return l + alpha * (r - l) 36 | 37 | 38 | class PiecewiseSchedule(object): 39 | def __init__(self, endpoints, interpolation=linear_interpolation, outside_value=None): 40 | """Piecewise schedule. 41 | 42 | endpoints: [(int, int)] 43 | list of pairs `(time, value)` meanining that schedule should output 44 | `value` when `t==time`. All the values for time must be sorted in 45 | an increasing order. When t is between two times, e.g. `(time_a, value_a)` 46 | and `(time_b, value_b)`, such that `time_a <= t < time_b` then value outputs 47 | `interpolation(value_a, value_b, alpha)` where alpha is a fraction of 48 | time passed between `time_a` and `time_b` for time `t`. 49 | interpolation: lambda float, float, float: float 50 | a function that takes value to the left and to the right of t according 51 | to the `endpoints`. Alpha is the fraction of distance from left endpoint to 52 | right endpoint that t has covered. See linear_interpolation for example. 53 | outside_value: float 54 | if the value is requested outside of all the intervals sepecified in 55 | `endpoints` this value is returned. If None then AssertionError is 56 | raised when outside value is requested. 57 | """ 58 | idxes = [e[0] for e in endpoints] 59 | assert idxes == sorted(idxes) 60 | self._interpolation = interpolation 61 | self._outside_value = outside_value 62 | self._endpoints = endpoints 63 | 64 | def value(self, t): 65 | """See Schedule.value""" 66 | for (l_t, l), (r_t, r) in zip(self._endpoints[:-1], self._endpoints[1:]): 67 | if l_t <= t and t < r_t: 68 | alpha = float(t - l_t) / (r_t - l_t) 69 | return self._interpolation(l, r, alpha) 70 | 71 | # t does not belong to any of the pieces, so doom. 72 | assert self._outside_value is not None 73 | return self._outside_value 74 | 75 | 76 | class LinearSchedule(object): 77 | def __init__(self, schedule_timesteps, final_p, initial_p=1.0): 78 | """Linear interpolation between initial_p and final_p over 79 | schedule_timesteps. After this many timesteps pass final_p is 80 | returned. 
81 | 82 | Parameters 83 | ---------- 84 | schedule_timesteps: int 85 | Number of timesteps for which to linearly anneal initial_p 86 | to final_p 87 | initial_p: float 88 | initial output value 89 | final_p: float 90 | final output value 91 | """ 92 | self.schedule_timesteps = schedule_timesteps 93 | self.final_p = final_p 94 | self.initial_p = initial_p 95 | 96 | def value(self, t): 97 | """See Schedule.value""" 98 | fraction = min(float(t) / self.schedule_timesteps, 1.0) 99 | return self.initial_p + fraction * (self.final_p - self.initial_p) 100 | -------------------------------------------------------------------------------- /mher/common/test_mpi_util.py: -------------------------------------------------------------------------------- 1 | from mher.common import mpi_util 2 | from mher import logger 3 | from mher.common.tests.test_with_mpi import with_mpi 4 | try: 5 | from mpi4py import MPI 6 | except ImportError: 7 | MPI = None 8 | 9 | @with_mpi() 10 | def test_mpi_weighted_mean(): 11 | comm = MPI.COMM_WORLD 12 | with logger.scoped_configure(comm=comm): 13 | if comm.rank == 0: 14 | name2valcount = {'a' : (10, 2), 'b' : (20,3)} 15 | elif comm.rank == 1: 16 | name2valcount = {'a' : (19, 1), 'c' : (42,3)} 17 | else: 18 | raise NotImplementedError 19 | d = mpi_util.mpi_weighted_mean(comm, name2valcount) 20 | correctval = {'a' : (10 * 2 + 19) / 3.0, 'b' : 20, 'c' : 42} 21 | if comm.rank == 0: 22 | assert d == correctval, '{} != {}'.format(d, correctval) 23 | 24 | for name, (val, count) in name2valcount.items(): 25 | for _ in range(count): 26 | logger.logkv_mean(name, val) 27 | d2 = logger.dumpkvs() 28 | if comm.rank == 0: 29 | assert d2 == correctval 30 | -------------------------------------------------------------------------------- /mher/common/tests/__init__.py: -------------------------------------------------------------------------------- 1 | import os, pytest 2 | mark_slow = pytest.mark.skipif(not os.getenv('RUNSLOW'), reason='slow') -------------------------------------------------------------------------------- /mher/common/tests/envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/common/tests/envs/__init__.py -------------------------------------------------------------------------------- /mher/common/tests/envs/fixed_sequence_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import Env 3 | from gym.spaces import Discrete 4 | 5 | 6 | class FixedSequenceEnv(Env): 7 | def __init__( 8 | self, 9 | n_actions=10, 10 | episode_len=100 11 | ): 12 | self.action_space = Discrete(n_actions) 13 | self.observation_space = Discrete(1) 14 | self.np_random = np.random.RandomState(0) 15 | self.episode_len = episode_len 16 | self.sequence = [self.np_random.randint(0, self.action_space.n) 17 | for _ in range(self.episode_len)] 18 | self.time = 0 19 | 20 | 21 | def reset(self): 22 | self.time = 0 23 | return 0 24 | 25 | def step(self, actions): 26 | rew = self._get_reward(actions) 27 | self._choose_next_state() 28 | done = False 29 | if self.episode_len and self.time >= self.episode_len: 30 | done = True 31 | 32 | return 0, rew, done, {} 33 | 34 | def seed(self, seed=None): 35 | self.np_random.seed(seed) 36 | 37 | def _choose_next_state(self): 38 | self.time += 1 39 | 40 | def _get_reward(self, actions): 41 | return 1 if actions == self.sequence[self.time] else 0 42 | 43 | 44 | 
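def _example_rollout():
    # Illustrative usage sketch, not part of the original module: the observation is
    # always 0, so an agent can only score by memorizing the hidden action sequence;
    # a uniformly random policy averages roughly episode_len / n_actions reward per
    # episode.
    env = FixedSequenceEnv(n_actions=10, episode_len=100)
    env.reset()
    total_reward, done = 0, False
    while not done:
        action = env.action_space.sample()  # stand-in for a learned policy
        _, rew, done, _ = env.step(action)
        total_reward += rew
    return total_reward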
-------------------------------------------------------------------------------- /mher/common/tests/envs/identity_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from abc import abstractmethod 3 | from gym import Env 4 | from gym.spaces import MultiDiscrete, Discrete, Box 5 | from collections import deque 6 | 7 | class IdentityEnv(Env): 8 | def __init__( 9 | self, 10 | episode_len=None, 11 | delay=0, 12 | zero_first_rewards=True 13 | ): 14 | 15 | self.observation_space = self.action_space 16 | self.episode_len = episode_len 17 | self.time = 0 18 | self.delay = delay 19 | self.zero_first_rewards = zero_first_rewards 20 | self.q = deque(maxlen=delay+1) 21 | 22 | def reset(self): 23 | self.q.clear() 24 | for _ in range(self.delay + 1): 25 | self.q.append(self.action_space.sample()) 26 | self.time = 0 27 | 28 | return self.q[-1] 29 | 30 | def step(self, actions): 31 | rew = self._get_reward(self.q.popleft(), actions) 32 | if self.zero_first_rewards and self.time < self.delay: 33 | rew = 0 34 | self.q.append(self.action_space.sample()) 35 | self.time += 1 36 | done = self.episode_len is not None and self.time >= self.episode_len 37 | return self.q[-1], rew, done, {} 38 | 39 | def seed(self, seed=None): 40 | self.action_space.seed(seed) 41 | 42 | @abstractmethod 43 | def _get_reward(self, state, actions): 44 | raise NotImplementedError 45 | 46 | 47 | class DiscreteIdentityEnv(IdentityEnv): 48 | def __init__( 49 | self, 50 | dim, 51 | episode_len=None, 52 | delay=0, 53 | zero_first_rewards=True 54 | ): 55 | 56 | self.action_space = Discrete(dim) 57 | super().__init__(episode_len=episode_len, delay=delay, zero_first_rewards=zero_first_rewards) 58 | 59 | def _get_reward(self, state, actions): 60 | return 1 if state == actions else 0 61 | 62 | class MultiDiscreteIdentityEnv(IdentityEnv): 63 | def __init__( 64 | self, 65 | dims, 66 | episode_len=None, 67 | delay=0, 68 | ): 69 | 70 | self.action_space = MultiDiscrete(dims) 71 | super().__init__(episode_len=episode_len, delay=delay) 72 | 73 | def _get_reward(self, state, actions): 74 | return 1 if all(state == actions) else 0 75 | 76 | 77 | class BoxIdentityEnv(IdentityEnv): 78 | def __init__( 79 | self, 80 | shape, 81 | episode_len=None, 82 | ): 83 | 84 | self.action_space = Box(low=-1.0, high=1.0, shape=shape, dtype=np.float32) 85 | super().__init__(episode_len=episode_len) 86 | 87 | def _get_reward(self, state, actions): 88 | diff = actions - state 89 | diff = diff[:] 90 | return -0.5 * np.dot(diff, diff) 91 | -------------------------------------------------------------------------------- /mher/common/tests/envs/identity_env_test.py: -------------------------------------------------------------------------------- 1 | from mher.common.tests.envs.identity_env import DiscreteIdentityEnv 2 | 3 | 4 | def test_discrete_nodelay(): 5 | nsteps = 100 6 | eplen = 50 7 | env = DiscreteIdentityEnv(10, episode_len=eplen) 8 | ob = env.reset() 9 | for t in range(nsteps): 10 | action = env.action_space.sample() 11 | next_ob, rew, done, info = env.step(action) 12 | assert rew == (1 if action == ob else 0) 13 | if (t + 1) % eplen == 0: 14 | assert done 15 | next_ob = env.reset() 16 | else: 17 | assert not done 18 | ob = next_ob 19 | 20 | def test_discrete_delay1(): 21 | eplen = 50 22 | env = DiscreteIdentityEnv(10, episode_len=eplen, delay=1) 23 | ob = env.reset() 24 | prev_ob = None 25 | for t in range(eplen): 26 | action = env.action_space.sample() 27 | next_ob, rew, done, info = env.step(action) 
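        # with delay=1 the env scores each action against the observation handed out
        # one step earlier, so the very first step is forced to zero reward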
28 | if t > 0: 29 | assert rew == (1 if action == prev_ob else 0) 30 | else: 31 | assert rew == 0 32 | prev_ob = ob 33 | ob = next_ob 34 | if t < eplen - 1: 35 | assert not done 36 | assert done 37 | -------------------------------------------------------------------------------- /mher/common/tests/envs/mnist_env.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import numpy as np 3 | import tempfile 4 | from gym import Env 5 | from gym.spaces import Discrete, Box 6 | 7 | 8 | 9 | class MnistEnv(Env): 10 | def __init__( 11 | self, 12 | episode_len=None, 13 | no_images=None 14 | ): 15 | import filelock 16 | from tensorflow.examples.tutorials.mnist import input_data 17 | # we could use temporary directory for this with a context manager and 18 | # TemporaryDirecotry, but then each test that uses mnist would re-download the data 19 | # this way the data is not cleaned up, but we only download it once per machine 20 | mnist_path = osp.join(tempfile.gettempdir(), 'MNIST_data') 21 | with filelock.FileLock(mnist_path + '.lock'): 22 | self.mnist = input_data.read_data_sets(mnist_path) 23 | 24 | self.np_random = np.random.RandomState() 25 | 26 | self.observation_space = Box(low=0.0, high=1.0, shape=(28,28,1)) 27 | self.action_space = Discrete(10) 28 | self.episode_len = episode_len 29 | self.time = 0 30 | self.no_images = no_images 31 | 32 | self.train_mode() 33 | self.reset() 34 | 35 | def reset(self): 36 | self._choose_next_state() 37 | self.time = 0 38 | 39 | return self.state[0] 40 | 41 | def step(self, actions): 42 | rew = self._get_reward(actions) 43 | self._choose_next_state() 44 | done = False 45 | if self.episode_len and self.time >= self.episode_len: 46 | rew = 0 47 | done = True 48 | 49 | return self.state[0], rew, done, {} 50 | 51 | def seed(self, seed=None): 52 | self.np_random.seed(seed) 53 | 54 | def train_mode(self): 55 | self.dataset = self.mnist.train 56 | 57 | def test_mode(self): 58 | self.dataset = self.mnist.test 59 | 60 | def _choose_next_state(self): 61 | max_index = (self.no_images if self.no_images is not None else self.dataset.num_examples) - 1 62 | index = self.np_random.randint(0, max_index) 63 | image = self.dataset.images[index].reshape(28,28,1)*255 64 | label = self.dataset.labels[index] 65 | self.state = (image, label) 66 | self.time += 1 67 | 68 | def _get_reward(self, actions): 69 | return 1 if self.state[1] == actions else 0 70 | 71 | 72 | -------------------------------------------------------------------------------- /mher/common/tests/test_cartpole.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | 4 | from mher.run import get_learn_function 5 | from mher.common.tests.util import reward_per_episode_test 6 | from mher.common.tests import mark_slow 7 | 8 | common_kwargs = dict( 9 | total_timesteps=30000, 10 | network='mlp', 11 | gamma=1.0, 12 | seed=0, 13 | ) 14 | 15 | learn_kwargs = { 16 | 'a2c' : dict(nsteps=32, value_network='copy', lr=0.05), 17 | 'acer': dict(value_network='copy'), 18 | 'acktr': dict(nsteps=32, value_network='copy', is_async=False), 19 | 'deepq': dict(total_timesteps=20000), 20 | 'ppo2': dict(value_network='copy'), 21 | 'trpo_mpi': {} 22 | } 23 | 24 | @mark_slow 25 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 26 | def test_cartpole(alg): 27 | ''' 28 | Test if the algorithm (with an mlp policy) 29 | can learn to balance the cartpole 30 | ''' 31 | 32 | kwargs = common_kwargs.copy() 33 | 
kwargs.update(learn_kwargs[alg]) 34 | 35 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 36 | def env_fn(): 37 | 38 | env = gym.make('CartPole-v0') 39 | env.seed(0) 40 | return env 41 | 42 | reward_per_episode_test(env_fn, learn_fn, 100) 43 | 44 | if __name__ == '__main__': 45 | test_cartpole('acer') 46 | -------------------------------------------------------------------------------- /mher/common/tests/test_doc_examples.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | try: 3 | import mujoco_py 4 | _mujoco_present = True 5 | except BaseException: 6 | mujoco_py = None 7 | _mujoco_present = False 8 | 9 | 10 | @pytest.mark.skipif( 11 | not _mujoco_present, 12 | reason='error loading mujoco - either mujoco / mujoco key not present, or LD_LIBRARY_PATH is not pointing to mujoco library' 13 | ) 14 | def test_lstm_example(): 15 | import tensorflow as tf 16 | from mher.common import policies, models, cmd_util 17 | from mher.common.vec_env.dummy_vec_env import DummyVecEnv 18 | 19 | # create vectorized environment 20 | venv = DummyVecEnv([lambda: cmd_util.make_mujoco_env('Reacher-v2', seed=0)]) 21 | 22 | with tf.Session() as sess: 23 | # build policy based on lstm network with 128 units 24 | policy = policies.build_policy(venv, models.lstm(128))(nbatch=1, nsteps=1) 25 | 26 | # initialize tensorflow variables 27 | sess.run(tf.global_variables_initializer()) 28 | 29 | # prepare environment variables 30 | ob = venv.reset() 31 | state = policy.initial_state 32 | done = [False] 33 | step_counter = 0 34 | 35 | # run a single episode until the end (i.e. until done) 36 | while True: 37 | action, _, state, _ = policy.step(ob, S=state, M=done) 38 | ob, reward, done, _ = venv.step(action) 39 | step_counter += 1 40 | if done: 41 | break 42 | 43 | 44 | assert step_counter > 5 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /mher/common/tests/test_env_after_learn.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | import tensorflow as tf 4 | 5 | from mher.common.vec_env.subproc_vec_env import SubprocVecEnv 6 | from mher.run import get_learn_function 7 | from mher.common.tf_util import make_session 8 | 9 | algos = ['a2c', 'acer', 'acktr', 'deepq', 'ppo2', 'trpo_mpi'] 10 | 11 | @pytest.mark.parametrize('algo', algos) 12 | def test_env_after_learn(algo): 13 | def make_env(): 14 | # acktr requires too much RAM, fails on travis 15 | env = gym.make('CartPole-v1' if algo == 'acktr' else 'PongNoFrameskip-v4') 16 | return env 17 | 18 | make_session(make_default=True, graph=tf.Graph()) 19 | env = SubprocVecEnv([make_env]) 20 | 21 | learn = get_learn_function(algo) 22 | 23 | # Commenting out the following line resolves the issue, though crash happens at env.reset(). 
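    # total_timesteps=0 only builds the model without training; the test then checks
    # that the environment is still usable (reset/close) after learn() returns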
24 | learn(network='mlp', env=env, total_timesteps=0, load_path=None, seed=None) 25 | 26 | env.reset() 27 | env.close() 28 | -------------------------------------------------------------------------------- /mher/common/tests/test_fetchreach.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import gym 3 | 4 | from mher.run import get_learn_function 5 | from mher.common.tests.util import reward_per_episode_test 6 | from mher.common.tests import mark_slow 7 | 8 | pytest.importorskip('mujoco_py') 9 | 10 | common_kwargs = dict( 11 | network='mlp', 12 | seed=0, 13 | ) 14 | 15 | learn_kwargs = { 16 | 'her': dict(total_timesteps=2000) 17 | } 18 | 19 | @mark_slow 20 | @pytest.mark.parametrize("alg", learn_kwargs.keys()) 21 | def test_fetchreach(alg): 22 | ''' 23 | Test if the algorithm (with an mlp policy) 24 | can learn the FetchReach task 25 | ''' 26 | 27 | kwargs = common_kwargs.copy() 28 | kwargs.update(learn_kwargs[alg]) 29 | 30 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 31 | def env_fn(): 32 | 33 | env = gym.make('FetchReach-v1') 34 | env.seed(0) 35 | return env 36 | 37 | reward_per_episode_test(env_fn, learn_fn, -15) 38 | 39 | if __name__ == '__main__': 40 | test_fetchreach('her') 41 | -------------------------------------------------------------------------------- /mher/common/tests/test_fixed_sequence.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from mher.common.tests.envs.fixed_sequence_env import FixedSequenceEnv 3 | 4 | from mher.common.tests.util import simple_test 5 | from mher.run import get_learn_function 6 | from mher.common.tests import mark_slow 7 | 8 | 9 | common_kwargs = dict( 10 | seed=0, 11 | total_timesteps=50000, 12 | ) 13 | 14 | learn_kwargs = { 15 | 'a2c': {}, 16 | 'ppo2': dict(nsteps=10, ent_coef=0.0, nminibatches=1), 17 | # TODO enable sequential models for trpo_mpi (proper handling of nbatch and nsteps) 18 | # github issue: https://github.com/openai/baselines/issues/188 19 | # 'trpo_mpi': lambda e, p: trpo_mpi.learn(policy_fn=p(env=e), env=e, max_timesteps=30000, timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.001) 20 | } 21 | 22 | 23 | alg_list = learn_kwargs.keys() 24 | rnn_list = ['lstm'] 25 | 26 | @mark_slow 27 | @pytest.mark.parametrize("alg", alg_list) 28 | @pytest.mark.parametrize("rnn", rnn_list) 29 | def test_fixed_sequence(alg, rnn): 30 | ''' 31 | Test if the algorithm (with a given policy) 32 | can learn an identity transformation (i.e. 
return observation as an action) 33 | ''' 34 | 35 | kwargs = learn_kwargs[alg] 36 | kwargs.update(common_kwargs) 37 | 38 | env_fn = lambda: FixedSequenceEnv(n_actions=10, episode_len=5) 39 | learn = lambda e: get_learn_function(alg)( 40 | env=e, 41 | network=rnn, 42 | **kwargs 43 | ) 44 | 45 | simple_test(env_fn, learn, 0.7) 46 | 47 | 48 | if __name__ == '__main__': 49 | test_fixed_sequence('ppo2', 'lstm') 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /mher/common/tests/test_identity.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from mher.common.tests.envs.identity_env import DiscreteIdentityEnv, BoxIdentityEnv, MultiDiscreteIdentityEnv 3 | from mher.run import get_learn_function 4 | from mher.common.tests.util import simple_test 5 | from mher.common.tests import mark_slow 6 | 7 | common_kwargs = dict( 8 | total_timesteps=30000, 9 | network='mlp', 10 | gamma=0.9, 11 | seed=0, 12 | ) 13 | 14 | learn_kwargs = { 15 | 'a2c' : {}, 16 | 'acktr': {}, 17 | 'deepq': {}, 18 | 'ddpg': dict(layer_norm=True), 19 | 'ppo2': dict(lr=1e-3, nsteps=64, ent_coef=0.0), 20 | 'trpo_mpi': dict(timesteps_per_batch=100, cg_iters=10, gamma=0.9, lam=1.0, max_kl=0.01) 21 | } 22 | 23 | 24 | algos_disc = ['a2c', 'acktr', 'deepq', 'ppo2', 'trpo_mpi'] 25 | algos_multidisc = ['a2c', 'acktr', 'ppo2', 'trpo_mpi'] 26 | algos_cont = ['a2c', 'acktr', 'ddpg', 'ppo2', 'trpo_mpi'] 27 | 28 | @mark_slow 29 | @pytest.mark.parametrize("alg", algos_disc) 30 | def test_discrete_identity(alg): 31 | ''' 32 | Test if the algorithm (with an mlp policy) 33 | can learn an identity transformation (i.e. return observation as an action) 34 | ''' 35 | 36 | kwargs = learn_kwargs[alg] 37 | kwargs.update(common_kwargs) 38 | 39 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 40 | env_fn = lambda: DiscreteIdentityEnv(10, episode_len=100) 41 | simple_test(env_fn, learn_fn, 0.9) 42 | 43 | @mark_slow 44 | @pytest.mark.parametrize("alg", algos_multidisc) 45 | def test_multidiscrete_identity(alg): 46 | ''' 47 | Test if the algorithm (with an mlp policy) 48 | can learn an identity transformation (i.e. return observation as an action) 49 | ''' 50 | 51 | kwargs = learn_kwargs[alg] 52 | kwargs.update(common_kwargs) 53 | 54 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 55 | env_fn = lambda: MultiDiscreteIdentityEnv((3,3), episode_len=100) 56 | simple_test(env_fn, learn_fn, 0.9) 57 | 58 | @mark_slow 59 | @pytest.mark.parametrize("alg", algos_cont) 60 | def test_continuous_identity(alg): 61 | ''' 62 | Test if the algorithm (with an mlp policy) 63 | can learn an identity transformation (i.e. 
return observation as an action) 64 | to a required precision 65 | ''' 66 | 67 | kwargs = learn_kwargs[alg] 68 | kwargs.update(common_kwargs) 69 | learn_fn = lambda e: get_learn_function(alg)(env=e, **kwargs) 70 | 71 | env_fn = lambda: BoxIdentityEnv((1,), episode_len=100) 72 | simple_test(env_fn, learn_fn, -0.1) 73 | 74 | if __name__ == '__main__': 75 | test_multidiscrete_identity('acktr') 76 | 77 | -------------------------------------------------------------------------------- /mher/common/tests/test_mnist.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | # from mher.acer import acer_simple as acer 4 | from mher.common.tests.envs.mnist_env import MnistEnv 5 | from mher.common.tests.util import simple_test 6 | from mher.run import get_learn_function 7 | from mher.common.tests import mark_slow 8 | 9 | # TODO investigate a2c and ppo2 failures - is it due to bad hyperparameters for this problem? 10 | # GitHub issue https://github.com/openai/baselines/issues/189 11 | common_kwargs = { 12 | 'seed': 0, 13 | 'network':'cnn', 14 | 'gamma':0.9, 15 | 'pad':'SAME' 16 | } 17 | 18 | learn_args = { 19 | 'a2c': dict(total_timesteps=50000), 20 | 'acer': dict(total_timesteps=20000), 21 | 'deepq': dict(total_timesteps=5000), 22 | 'acktr': dict(total_timesteps=30000), 23 | 'ppo2': dict(total_timesteps=50000, lr=1e-3, nsteps=128, ent_coef=0.0), 24 | 'trpo_mpi': dict(total_timesteps=80000, timesteps_per_batch=100, cg_iters=10, lam=1.0, max_kl=0.001) 25 | } 26 | 27 | 28 | #tests pass, but are too slow on travis. Same algorithms are covered 29 | # by other tests with less compute-hungry nn's and by benchmarks 30 | @pytest.mark.skip 31 | @mark_slow 32 | @pytest.mark.parametrize("alg", learn_args.keys()) 33 | def test_mnist(alg): 34 | ''' 35 | Test if the algorithm can learn to classify MNIST digits. 36 | Uses CNN policy. 
37 | ''' 38 | 39 | learn_kwargs = learn_args[alg] 40 | learn_kwargs.update(common_kwargs) 41 | 42 | learn = get_learn_function(alg) 43 | learn_fn = lambda e: learn(env=e, **learn_kwargs) 44 | env_fn = lambda: MnistEnv(episode_len=100) 45 | 46 | simple_test(env_fn, learn_fn, 0.6) 47 | 48 | if __name__ == '__main__': 49 | test_mnist('acer') 50 | -------------------------------------------------------------------------------- /mher/common/tests/test_plot_util.py: -------------------------------------------------------------------------------- 1 | # smoke tests of plot_util 2 | from mher.common import plot_util as pu 3 | from mher.common.tests.util import smoketest 4 | 5 | 6 | def test_plot_util(): 7 | nruns = 4 8 | logdirs = [smoketest('--alg=ppo2 --env=CartPole-v0 --num_timesteps=10000') for _ in range(nruns)] 9 | data = pu.load_results(logdirs) 10 | assert len(data) == 4 11 | 12 | _, axes = pu.plot_results(data[:1]); assert len(axes) == 1 13 | _, axes = pu.plot_results(data, tiling='vertical'); assert axes.shape==(4,1) 14 | _, axes = pu.plot_results(data, tiling='horizontal'); assert axes.shape==(1,4) 15 | _, axes = pu.plot_results(data, tiling='symmetric'); assert axes.shape==(2,2) 16 | _, axes = pu.plot_results(data, split_fn=lambda _: ''); assert len(axes) == 1 17 | 18 | -------------------------------------------------------------------------------- /mher/common/tests/test_schedules.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mher.common.schedules import ConstantSchedule, PiecewiseSchedule 4 | 5 | 6 | def test_piecewise_schedule(): 7 | ps = PiecewiseSchedule([(-5, 100), (5, 200), (10, 50), (100, 50), (200, -50)], outside_value=500) 8 | 9 | assert np.isclose(ps.value(-10), 500) 10 | assert np.isclose(ps.value(0), 150) 11 | assert np.isclose(ps.value(5), 200) 12 | assert np.isclose(ps.value(9), 80) 13 | assert np.isclose(ps.value(50), 50) 14 | assert np.isclose(ps.value(80), 50) 15 | assert np.isclose(ps.value(150), 0) 16 | assert np.isclose(ps.value(175), -25) 17 | assert np.isclose(ps.value(201), 500) 18 | assert np.isclose(ps.value(500), 500) 19 | 20 | assert np.isclose(ps.value(200 - 1e-10), -50) 21 | 22 | 23 | def test_constant_schedule(): 24 | cs = ConstantSchedule(5) 25 | for i in range(-100, 100): 26 | assert np.isclose(cs.value(i), 5) 27 | -------------------------------------------------------------------------------- /mher/common/tests/test_segment_tree.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from mher.common.segment_tree import SumSegmentTree, MinSegmentTree 4 | 5 | 6 | def test_tree_set(): 7 | tree = SumSegmentTree(4) 8 | 9 | tree[2] = 1.0 10 | tree[3] = 3.0 11 | 12 | assert np.isclose(tree.sum(), 4.0) 13 | assert np.isclose(tree.sum(0, 2), 0.0) 14 | assert np.isclose(tree.sum(0, 3), 1.0) 15 | assert np.isclose(tree.sum(2, 3), 1.0) 16 | assert np.isclose(tree.sum(2, -1), 1.0) 17 | assert np.isclose(tree.sum(2, 4), 4.0) 18 | 19 | 20 | def test_tree_set_overlap(): 21 | tree = SumSegmentTree(4) 22 | 23 | tree[2] = 1.0 24 | tree[2] = 3.0 25 | 26 | assert np.isclose(tree.sum(), 3.0) 27 | assert np.isclose(tree.sum(2, 3), 3.0) 28 | assert np.isclose(tree.sum(2, -1), 3.0) 29 | assert np.isclose(tree.sum(2, 4), 3.0) 30 | assert np.isclose(tree.sum(1, 2), 0.0) 31 | 32 | 33 | def test_prefixsum_idx(): 34 | tree = SumSegmentTree(4) 35 | 36 | tree[2] = 1.0 37 | tree[3] = 3.0 38 | 39 | assert tree.find_prefixsum_idx(0.0) == 2 40 | assert 
tree.find_prefixsum_idx(0.5) == 2 41 | assert tree.find_prefixsum_idx(0.99) == 2 42 | assert tree.find_prefixsum_idx(1.01) == 3 43 | assert tree.find_prefixsum_idx(3.00) == 3 44 | assert tree.find_prefixsum_idx(4.00) == 3 45 | 46 | 47 | def test_prefixsum_idx2(): 48 | tree = SumSegmentTree(4) 49 | 50 | tree[0] = 0.5 51 | tree[1] = 1.0 52 | tree[2] = 1.0 53 | tree[3] = 3.0 54 | 55 | assert tree.find_prefixsum_idx(0.00) == 0 56 | assert tree.find_prefixsum_idx(0.55) == 1 57 | assert tree.find_prefixsum_idx(0.99) == 1 58 | assert tree.find_prefixsum_idx(1.51) == 2 59 | assert tree.find_prefixsum_idx(3.00) == 3 60 | assert tree.find_prefixsum_idx(5.50) == 3 61 | 62 | 63 | def test_max_interval_tree(): 64 | tree = MinSegmentTree(4) 65 | 66 | tree[0] = 1.0 67 | tree[2] = 0.5 68 | tree[3] = 3.0 69 | 70 | assert np.isclose(tree.min(), 0.5) 71 | assert np.isclose(tree.min(0, 2), 1.0) 72 | assert np.isclose(tree.min(0, 3), 0.5) 73 | assert np.isclose(tree.min(0, -1), 0.5) 74 | assert np.isclose(tree.min(2, 4), 0.5) 75 | assert np.isclose(tree.min(3, 4), 3.0) 76 | 77 | tree[2] = 0.7 78 | 79 | assert np.isclose(tree.min(), 0.7) 80 | assert np.isclose(tree.min(0, 2), 1.0) 81 | assert np.isclose(tree.min(0, 3), 0.7) 82 | assert np.isclose(tree.min(0, -1), 0.7) 83 | assert np.isclose(tree.min(2, 4), 0.7) 84 | assert np.isclose(tree.min(3, 4), 3.0) 85 | 86 | tree[2] = 4.0 87 | 88 | assert np.isclose(tree.min(), 1.0) 89 | assert np.isclose(tree.min(0, 2), 1.0) 90 | assert np.isclose(tree.min(0, 3), 1.0) 91 | assert np.isclose(tree.min(0, -1), 1.0) 92 | assert np.isclose(tree.min(2, 4), 3.0) 93 | assert np.isclose(tree.min(2, 3), 4.0) 94 | assert np.isclose(tree.min(2, -1), 4.0) 95 | assert np.isclose(tree.min(3, 4), 3.0) 96 | 97 | 98 | if __name__ == '__main__': 99 | test_tree_set() 100 | test_tree_set_overlap() 101 | test_prefixsum_idx() 102 | test_prefixsum_idx2() 103 | test_max_interval_tree() 104 | -------------------------------------------------------------------------------- /mher/common/tests/test_serialization.py: -------------------------------------------------------------------------------- 1 | import os 2 | import gym 3 | import tempfile 4 | import pytest 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | from mher.common.tests.envs.mnist_env import MnistEnv 9 | from mher.common.vec_env.dummy_vec_env import DummyVecEnv 10 | from mher.run import get_learn_function 11 | from mher.common.tf_util import make_session, get_session 12 | 13 | from functools import partial 14 | 15 | 16 | learn_kwargs = { 17 | 'deepq': {}, 18 | 'a2c': {}, 19 | 'acktr': {}, 20 | 'acer': {}, 21 | 'ppo2': {'nminibatches': 1, 'nsteps': 10}, 22 | 'trpo_mpi': {}, 23 | } 24 | 25 | network_kwargs = { 26 | 'mlp': {}, 27 | 'cnn': {'pad': 'SAME'}, 28 | 'lstm': {}, 29 | 'cnn_lnlstm': {'pad': 'SAME'} 30 | } 31 | 32 | 33 | @pytest.mark.parametrize("learn_fn", learn_kwargs.keys()) 34 | @pytest.mark.parametrize("network_fn", network_kwargs.keys()) 35 | def test_serialization(learn_fn, network_fn): 36 | ''' 37 | Test if the trained model can be serialized 38 | ''' 39 | 40 | 41 | if network_fn.endswith('lstm') and learn_fn in ['acer', 'acktr', 'trpo_mpi', 'deepq']: 42 | # TODO make acktr work with recurrent policies 43 | # and test 44 | # github issue: https://github.com/openai/baselines/issues/660 45 | return 46 | 47 | def make_env(): 48 | env = MnistEnv(episode_len=100) 49 | env.seed(10) 50 | return env 51 | 52 | env = DummyVecEnv([make_env]) 53 | ob = env.reset().copy() 54 | learn = get_learn_function(learn_fn) 55 | 56 | 
kwargs = {} 57 | kwargs.update(network_kwargs[network_fn]) 58 | kwargs.update(learn_kwargs[learn_fn]) 59 | 60 | 61 | learn = partial(learn, env=env, network=network_fn, seed=0, **kwargs) 62 | 63 | with tempfile.TemporaryDirectory() as td: 64 | model_path = os.path.join(td, 'serialization_test_model') 65 | 66 | with tf.Graph().as_default(), make_session().as_default(): 67 | model = learn(total_timesteps=100) 68 | model.save(model_path) 69 | mean1, std1 = _get_action_stats(model, ob) 70 | variables_dict1 = _serialize_variables() 71 | 72 | with tf.Graph().as_default(), make_session().as_default(): 73 | model = learn(total_timesteps=0, load_path=model_path) 74 | mean2, std2 = _get_action_stats(model, ob) 75 | variables_dict2 = _serialize_variables() 76 | 77 | for k, v in variables_dict1.items(): 78 | np.testing.assert_allclose(v, variables_dict2[k], atol=0.01, 79 | err_msg='saved and loaded variable {} value mismatch'.format(k)) 80 | 81 | np.testing.assert_allclose(mean1, mean2, atol=0.5) 82 | np.testing.assert_allclose(std1, std2, atol=0.5) 83 | 84 | 85 | @pytest.mark.parametrize("learn_fn", learn_kwargs.keys()) 86 | @pytest.mark.parametrize("network_fn", ['mlp']) 87 | def test_coexistence(learn_fn, network_fn): 88 | ''' 89 | Test if more than one model can exist at a time 90 | ''' 91 | 92 | if learn_fn == 'deepq': 93 | # TODO enable multiple DQN models to be useable at the same time 94 | # github issue https://github.com/openai/baselines/issues/656 95 | return 96 | 97 | if network_fn.endswith('lstm') and learn_fn in ['acktr', 'trpo_mpi', 'deepq']: 98 | # TODO make acktr work with recurrent policies 99 | # and test 100 | # github issue: https://github.com/openai/baselines/issues/660 101 | return 102 | 103 | env = DummyVecEnv([lambda: gym.make('CartPole-v0')]) 104 | learn = get_learn_function(learn_fn) 105 | 106 | kwargs = {} 107 | kwargs.update(network_kwargs[network_fn]) 108 | kwargs.update(learn_kwargs[learn_fn]) 109 | 110 | learn = partial(learn, env=env, network=network_fn, total_timesteps=0, **kwargs) 111 | make_session(make_default=True, graph=tf.Graph()) 112 | model1 = learn(seed=1) 113 | make_session(make_default=True, graph=tf.Graph()) 114 | model2 = learn(seed=2) 115 | 116 | model1.step(env.observation_space.sample()) 117 | model2.step(env.observation_space.sample()) 118 | 119 | 120 | 121 | def _serialize_variables(): 122 | sess = get_session() 123 | variables = tf.trainable_variables() 124 | values = sess.run(variables) 125 | return {var.name: value for var, value in zip(variables, values)} 126 | 127 | 128 | def _get_action_stats(model, ob): 129 | ntrials = 1000 130 | if model.initial_state is None or model.initial_state == []: 131 | actions = np.array([model.step(ob)[0] for _ in range(ntrials)]) 132 | else: 133 | actions = np.array([model.step(ob, S=model.initial_state, M=[False])[0] for _ in range(ntrials)]) 134 | 135 | mean = np.mean(actions, axis=0) 136 | std = np.std(actions, axis=0) 137 | 138 | return mean, std 139 | 140 | -------------------------------------------------------------------------------- /mher/common/tests/test_tf_util.py: -------------------------------------------------------------------------------- 1 | # tests for tf_util 2 | import tensorflow as tf 3 | from mher.common.tf_util import ( 4 | function, 5 | initialize, 6 | single_threaded_session 7 | ) 8 | 9 | 10 | def test_function(): 11 | with tf.Graph().as_default(): 12 | x = tf.placeholder(tf.int32, (), name="x") 13 | y = tf.placeholder(tf.int32, (), name="y") 14 | z = 3 * x + 2 * y 15 | lin = function([x, 
y], z, givens={y: 0}) 16 | 17 | with single_threaded_session(): 18 | initialize() 19 | 20 | assert lin(2) == 6 21 | assert lin(x=3) == 9 22 | assert lin(2, 2) == 10 23 | assert lin(x=2, y=3) == 12 24 | 25 | 26 | def test_multikwargs(): 27 | with tf.Graph().as_default(): 28 | x = tf.placeholder(tf.int32, (), name="x") 29 | with tf.variable_scope("other"): 30 | x2 = tf.placeholder(tf.int32, (), name="x") 31 | z = 3 * x + 2 * x2 32 | 33 | lin = function([x, x2], z, givens={x2: 0}) 34 | with single_threaded_session(): 35 | initialize() 36 | assert lin(2) == 6 37 | assert lin(2, 2) == 10 38 | 39 | 40 | if __name__ == '__main__': 41 | test_function() 42 | test_multikwargs() 43 | -------------------------------------------------------------------------------- /mher/common/tests/test_with_mpi.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import subprocess 4 | import cloudpickle 5 | import base64 6 | import pytest 7 | from functools import wraps 8 | 9 | try: 10 | from mpi4py import MPI 11 | except ImportError: 12 | MPI = None 13 | 14 | def with_mpi(nproc=2, timeout=30, skip_if_no_mpi=True): 15 | def outer_thunk(fn): 16 | @wraps(fn) 17 | def thunk(*args, **kwargs): 18 | serialized_fn = base64.b64encode(cloudpickle.dumps(lambda: fn(*args, **kwargs))) 19 | subprocess.check_call([ 20 | 'mpiexec','-n', str(nproc), 21 | sys.executable, 22 | '-m', 'mher.common.tests.test_with_mpi', 23 | serialized_fn 24 | ], env=os.environ, timeout=timeout) 25 | 26 | if skip_if_no_mpi: 27 | return pytest.mark.skipif(MPI is None, reason="MPI not present")(thunk) 28 | else: 29 | return thunk 30 | 31 | return outer_thunk 32 | 33 | 34 | if __name__ == '__main__': 35 | if len(sys.argv) > 1: 36 | fn = cloudpickle.loads(base64.b64decode(sys.argv[1])) 37 | assert callable(fn) 38 | fn() 39 | -------------------------------------------------------------------------------- /mher/common/tests/util.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from mher.common.vec_env.dummy_vec_env import DummyVecEnv 4 | 5 | N_TRIALS = 10000 6 | N_EPISODES = 100 7 | 8 | _sess_config = tf.ConfigProto( 9 | allow_soft_placement=True, 10 | intra_op_parallelism_threads=1, 11 | inter_op_parallelism_threads=1 12 | ) 13 | 14 | def simple_test(env_fn, learn_fn, min_reward_fraction, n_trials=N_TRIALS): 15 | def seeded_env_fn(): 16 | env = env_fn() 17 | env.seed(0) 18 | return env 19 | 20 | np.random.seed(0) 21 | env = DummyVecEnv([seeded_env_fn]) 22 | with tf.Graph().as_default(), tf.Session(config=_sess_config).as_default(): 23 | tf.set_random_seed(0) 24 | model = learn_fn(env) 25 | sum_rew = 0 26 | done = True 27 | for i in range(n_trials): 28 | if done: 29 | obs = env.reset() 30 | state = model.initial_state 31 | if state is not None: 32 | a, v, state, _ = model.step(obs, S=state, M=[False]) 33 | else: 34 | a, v, _, _ = model.step(obs) 35 | obs, rew, done, _ = env.step(a) 36 | sum_rew += float(rew) 37 | print("Reward in {} trials is {}".format(n_trials, sum_rew)) 38 | assert sum_rew > min_reward_fraction * n_trials, \ 39 | 'sum of rewards {} is less than {} of the total number of trials {}'.format(sum_rew, min_reward_fraction, n_trials) 40 | 41 | def reward_per_episode_test(env_fn, learn_fn, min_avg_reward, n_trials=N_EPISODES): 42 | env = DummyVecEnv([env_fn]) 43 | with tf.Graph().as_default(), tf.Session(config=_sess_config).as_default(): 44 | model = learn_fn(env) 45 | N_TRIALS = 100 46 | 
observations, actions, rewards = rollout(env, model, n_trials) 47 | rewards = [sum(r) for r in rewards] 48 | avg_rew = sum(rewards) / n_trials 49 | print("Average reward in {} episodes is {}".format(n_trials, avg_rew)) 50 | assert avg_rew > min_avg_reward, \ 51 | 'average reward in {} episodes ({}) is less than {}'.format(n_trials, avg_rew, min_avg_reward) 52 | 53 | def rollout(env, model, n_trials): 54 | rewards = [] 55 | actions = [] 56 | observations = [] 57 | for i in range(n_trials): 58 | obs = env.reset() 59 | state = model.initial_state if hasattr(model, 'initial_state') else None 60 | episode_rew = [] 61 | episode_actions = [] 62 | episode_obs = [] 63 | while True: 64 | if state is not None: 65 | a, v, state, _ = model.step(obs, S=state, M=[False]) 66 | else: 67 | a, v, _, _ = model.step(obs) 68 | 69 | obs, rew, done, _ = env.step(a) 70 | episode_rew.append(rew) 71 | episode_actions.append(a) 72 | episode_obs.append(obs) 73 | if done: 74 | break 75 | rewards.append(episode_rew) 76 | actions.append(episode_actions) 77 | observations.append(episode_obs) 78 | return observations, actions, rewards 79 | 80 | 81 | def smoketest(argstr, **kwargs): 82 | import tempfile 83 | import subprocess 84 | import os 85 | argstr = 'python -m mher.run ' + argstr 86 | for key, value in kwargs.items(): 87 | argstr += ' --{}={}'.format(key, value) 88 | tempdir = tempfile.mkdtemp() 89 | env = os.environ.copy() 90 | env['OPENAI_LOGDIR'] = tempdir 91 | subprocess.run(argstr.split(' '), env=env) 92 | return tempdir 93 | -------------------------------------------------------------------------------- /mher/common/tile_images.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def tile_images(img_nhwc): 4 | """ 5 | Tile N images into one big PxQ image 6 | (P,Q) are chosen to be as close as possible, and if N 7 | is square, then P=Q.
8 | 9 | input: img_nhwc, list or array of images, ndim=4 once turned into array 10 | n = batch index, h = height, w = width, c = channel 11 | returns: 12 | bigim_HWc, ndarray with ndim=3 13 | """ 14 | img_nhwc = np.asarray(img_nhwc) 15 | N, h, w, c = img_nhwc.shape 16 | H = int(np.ceil(np.sqrt(N))) 17 | W = int(np.ceil(float(N)/H)) 18 | img_nhwc = np.array(list(img_nhwc) + [img_nhwc[0]*0 for _ in range(N, H*W)]) 19 | img_HWhwc = img_nhwc.reshape(H, W, h, w, c) 20 | img_HhWwc = img_HWhwc.transpose(0, 2, 1, 3, 4) 21 | img_Hh_Ww_c = img_HhWwc.reshape(H*h, W*w, c) 22 | return img_Hh_Ww_c 23 | 24 | -------------------------------------------------------------------------------- /mher/common/vec_env/__init__.py: -------------------------------------------------------------------------------- 1 | from .vec_env import AlreadySteppingError, NotSteppingError, VecEnv, VecEnvWrapper, VecEnvObservationWrapper, CloudpickleWrapper 2 | from .dummy_vec_env import DummyVecEnv 3 | from .shmem_vec_env import ShmemVecEnv 4 | from .subproc_vec_env import SubprocVecEnv 5 | from .vec_frame_stack import VecFrameStack 6 | from .vec_monitor import VecMonitor 7 | from .vec_normalize import VecNormalize 8 | from .vec_remove_dict_obs import VecExtractDictObs 9 | 10 | __all__ = ['AlreadySteppingError', 'NotSteppingError', 'VecEnv', 'VecEnvWrapper', 'VecEnvObservationWrapper', 'CloudpickleWrapper', 'DummyVecEnv', 'ShmemVecEnv', 'SubprocVecEnv', 'VecFrameStack', 'VecMonitor', 'VecNormalize', 'VecExtractDictObs'] 11 |
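The package above exposes the vectorized-environment API used throughout the repository. Below is a minimal, illustrative usage sketch (not part of the repository source); it assumes gym is installed and that the 'CartPole-v0' environment id is available, and it only shows how a DummyVecEnv batches observations, rewards, and done flags across environments.

import gym
import numpy as np
from mher.common.vec_env import DummyVecEnv

# Each callable builds one environment; DummyVecEnv steps them sequentially in-process.
venv = DummyVecEnv([lambda: gym.make('CartPole-v0') for _ in range(2)])
obs = venv.reset()  # batched reset, shape: (num_envs,) + observation_space.shape
actions = np.array([venv.action_space.sample() for _ in range(venv.num_envs)])
obs, rewards, dones, infos = venv.step(actions)  # one batched step across all environments
assert rewards.shape == (venv.num_envs,) and dones.shape == (venv.num_envs,)
venv.close()

SubprocVecEnv and ShmemVecEnv, defined below, expose the same interface but run the environments in worker processes, which is typically worth the IPC overhead only when a single env.step() call is expensive.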
-------------------------------------------------------------------------------- /mher/common/vec_env/dummy_vec_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .vec_env import VecEnv 3 | from .util import copy_obs_dict, dict_to_obs, obs_space_info 4 | 5 | class DummyVecEnv(VecEnv): 6 | """ 7 | VecEnv that runs multiple environments sequentially, that is, 8 | the step and reset commands are sent to one environment at a time.
9 | Useful when debugging and when num_env == 1 (in the latter case, 10 | avoids communication overhead) 11 | """ 12 | def __init__(self, env_fns): 13 | """ 14 | Arguments: 15 | 16 | env_fns: iterable of callables functions that build environments 17 | """ 18 | self.envs = [fn() for fn in env_fns] 19 | env = self.envs[0] 20 | VecEnv.__init__(self, len(env_fns), env.observation_space, env.action_space) 21 | obs_space = env.observation_space 22 | self.keys, shapes, dtypes = obs_space_info(obs_space) 23 | 24 | self.buf_obs = { k: np.zeros((self.num_envs,) + tuple(shapes[k]), dtype=dtypes[k]) for k in self.keys } 25 | self.buf_dones = np.zeros((self.num_envs,), dtype=np.bool) 26 | self.buf_rews = np.zeros((self.num_envs,), dtype=np.float32) 27 | self.buf_infos = [{} for _ in range(self.num_envs)] 28 | self.actions = None 29 | self.spec = self.envs[0].spec 30 | 31 | def step_async(self, actions): 32 | listify = True 33 | try: 34 | if len(actions) == self.num_envs: 35 | listify = False 36 | except TypeError: 37 | pass 38 | 39 | if not listify: 40 | self.actions = actions 41 | else: 42 | assert self.num_envs == 1, "actions {} is either not a list or has a wrong size - cannot match to {} environments".format(actions, self.num_envs) 43 | self.actions = [actions] 44 | 45 | def step_wait(self): 46 | for e in range(self.num_envs): 47 | action = self.actions[e] 48 | obs, self.buf_rews[e], self.buf_dones[e], self.buf_infos[e] = self.envs[e].step(action) 49 | # if self.buf_dones[e]: # here we don't need to reset because we reset in the main code file 50 | # obs = self.envs[e].reset() 51 | self._save_obs(e, obs) 52 | return (self._obs_from_buf(), np.copy(self.buf_rews), np.copy(self.buf_dones), 53 | self.buf_infos.copy()) 54 | 55 | def reset(self): 56 | for e in range(self.num_envs): 57 | obs = self.envs[e].reset() 58 | self._save_obs(e, obs) 59 | return self._obs_from_buf() 60 | 61 | def _save_obs(self, e, obs): 62 | for k in self.keys: 63 | if k is None: 64 | self.buf_obs[k][e] = obs 65 | else: 66 | self.buf_obs[k][e] = obs[k] 67 | 68 | def _obs_from_buf(self): 69 | return dict_to_obs(copy_obs_dict(self.buf_obs)) 70 | 71 | def get_images(self): 72 | return [env.render(mode='rgb_array') for env in self.envs] 73 | 74 | def render(self, mode='human'): 75 | if self.num_envs == 1: 76 | return self.envs[0].render(mode=mode) 77 | else: 78 | return super().render(mode=mode) 79 | -------------------------------------------------------------------------------- /mher/common/vec_env/shmem_vec_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | An interface for asynchronous vectorized environments. 3 | """ 4 | 5 | import multiprocessing as mp 6 | import numpy as np 7 | from .vec_env import VecEnv, CloudpickleWrapper, clear_mpi_env_vars 8 | import ctypes 9 | from mher.common import logger 10 | 11 | from .util import dict_to_obs, obs_space_info, obs_to_dict 12 | 13 | _NP_TO_CT = {np.float32: ctypes.c_float, 14 | np.int32: ctypes.c_int32, 15 | np.int8: ctypes.c_int8, 16 | np.uint8: ctypes.c_char, 17 | np.bool: ctypes.c_bool} 18 | 19 | 20 | class ShmemVecEnv(VecEnv): 21 | """ 22 | Optimized version of SubprocVecEnv that uses shared variables to communicate observations. 23 | """ 24 | 25 | def __init__(self, env_fns, spaces=None, context='spawn'): 26 | """ 27 | If you don't specify observation_space, we'll have to create a dummy 28 | environment to get it. 
29 | """ 30 | ctx = mp.get_context(context) 31 | if spaces: 32 | observation_space, action_space = spaces 33 | else: 34 | logger.log('Creating dummy env object to get spaces') 35 | with logger.scoped_configure(format_strs=[]): 36 | dummy = env_fns[0]() 37 | observation_space, action_space = dummy.observation_space, dummy.action_space 38 | dummy.close() 39 | del dummy 40 | VecEnv.__init__(self, len(env_fns), observation_space, action_space) 41 | self.obs_keys, self.obs_shapes, self.obs_dtypes = obs_space_info(observation_space) 42 | self.obs_bufs = [ 43 | {k: ctx.Array(_NP_TO_CT[self.obs_dtypes[k].type], int(np.prod(self.obs_shapes[k]))) for k in self.obs_keys} 44 | for _ in env_fns] 45 | self.parent_pipes = [] 46 | self.procs = [] 47 | with clear_mpi_env_vars(): 48 | for env_fn, obs_buf in zip(env_fns, self.obs_bufs): 49 | wrapped_fn = CloudpickleWrapper(env_fn) 50 | parent_pipe, child_pipe = ctx.Pipe() 51 | proc = ctx.Process(target=_subproc_worker, 52 | args=(child_pipe, parent_pipe, wrapped_fn, obs_buf, self.obs_shapes, self.obs_dtypes, self.obs_keys)) 53 | proc.daemon = True 54 | self.procs.append(proc) 55 | self.parent_pipes.append(parent_pipe) 56 | proc.start() 57 | child_pipe.close() 58 | self.waiting_step = False 59 | self.viewer = None 60 | 61 | def reset(self): 62 | if self.waiting_step: 63 | logger.warn('Called reset() while waiting for the step to complete') 64 | self.step_wait() 65 | for pipe in self.parent_pipes: 66 | pipe.send(('reset', None)) 67 | return self._decode_obses([pipe.recv() for pipe in self.parent_pipes]) 68 | 69 | def step_async(self, actions): 70 | assert len(actions) == len(self.parent_pipes) 71 | for pipe, act in zip(self.parent_pipes, actions): 72 | pipe.send(('step', act)) 73 | self.waiting_step = True 74 | 75 | def step_wait(self): 76 | outs = [pipe.recv() for pipe in self.parent_pipes] 77 | self.waiting_step = False 78 | obs, rews, dones, infos = zip(*outs) 79 | return self._decode_obses(obs), np.array(rews), np.array(dones), infos 80 | 81 | def close_extras(self): 82 | if self.waiting_step: 83 | self.step_wait() 84 | for pipe in self.parent_pipes: 85 | pipe.send(('close', None)) 86 | for pipe in self.parent_pipes: 87 | pipe.recv() 88 | pipe.close() 89 | for proc in self.procs: 90 | proc.join() 91 | 92 | def get_images(self, mode='human'): 93 | for pipe in self.parent_pipes: 94 | pipe.send(('render', None)) 95 | return [pipe.recv() for pipe in self.parent_pipes] 96 | 97 | def _decode_obses(self, obs): 98 | result = {} 99 | for k in self.obs_keys: 100 | 101 | bufs = [b[k] for b in self.obs_bufs] 102 | o = [np.frombuffer(b.get_obj(), dtype=self.obs_dtypes[k]).reshape(self.obs_shapes[k]) for b in bufs] 103 | result[k] = np.array(o) 104 | return dict_to_obs(result) 105 | 106 | 107 | def _subproc_worker(pipe, parent_pipe, env_fn_wrapper, obs_bufs, obs_shapes, obs_dtypes, keys): 108 | """ 109 | Control a single environment instance using IPC and 110 | shared memory. 
111 | """ 112 | def _write_obs(maybe_dict_obs): 113 | flatdict = obs_to_dict(maybe_dict_obs) 114 | for k in keys: 115 | dst = obs_bufs[k].get_obj() 116 | dst_np = np.frombuffer(dst, dtype=obs_dtypes[k]).reshape(obs_shapes[k]) # pylint: disable=W0212 117 | np.copyto(dst_np, flatdict[k]) 118 | 119 | env = env_fn_wrapper.x() 120 | parent_pipe.close() 121 | try: 122 | while True: 123 | cmd, data = pipe.recv() 124 | if cmd == 'reset': 125 | pipe.send(_write_obs(env.reset())) 126 | elif cmd == 'step': 127 | obs, reward, done, info = env.step(data) 128 | if done: 129 | obs = env.reset() 130 | pipe.send((_write_obs(obs), reward, done, info)) 131 | elif cmd == 'render': 132 | pipe.send(env.render(mode='rgb_array')) 133 | elif cmd == 'close': 134 | pipe.send(None) 135 | break 136 | else: 137 | raise RuntimeError('Got unrecognized cmd %s' % cmd) 138 | except KeyboardInterrupt: 139 | print('ShmemVecEnv worker: got KeyboardInterrupt') 140 | finally: 141 | env.close() 142 | -------------------------------------------------------------------------------- /mher/common/vec_env/subproc_vec_env.py: -------------------------------------------------------------------------------- 1 | import multiprocessing as mp 2 | 3 | import numpy as np 4 | from .vec_env import VecEnv, CloudpickleWrapper, clear_mpi_env_vars 5 | 6 | 7 | def worker(remote, parent_remote, env_fn_wrappers): 8 | def step_env(env, action): 9 | ob, reward, done, info = env.step(action) 10 | if done: 11 | ob = env.reset() 12 | return ob, reward, done, info 13 | 14 | parent_remote.close() 15 | envs = [env_fn_wrapper() for env_fn_wrapper in env_fn_wrappers.x] 16 | try: 17 | while True: 18 | cmd, data = remote.recv() 19 | if cmd == 'step': 20 | remote.send([step_env(env, action) for env, action in zip(envs, data)]) 21 | elif cmd == 'reset': 22 | remote.send([env.reset() for env in envs]) 23 | elif cmd == 'render': 24 | remote.send([env.render(mode='rgb_array') for env in envs]) 25 | elif cmd == 'close': 26 | remote.close() 27 | break 28 | elif cmd == 'get_spaces_spec': 29 | remote.send(CloudpickleWrapper((envs[0].observation_space, envs[0].action_space, envs[0].spec))) 30 | else: 31 | raise NotImplementedError 32 | except KeyboardInterrupt: 33 | print('SubprocVecEnv worker: got KeyboardInterrupt') 34 | finally: 35 | for env in envs: 36 | env.close() 37 | 38 | 39 | class SubprocVecEnv(VecEnv): 40 | """ 41 | VecEnv that runs multiple environments in parallel in subproceses and communicates with them via pipes. 42 | Recommended to use when num_envs > 1 and step() can be a bottleneck. 43 | """ 44 | def __init__(self, env_fns, spaces=None, context='spawn', in_series=1): 45 | """ 46 | Arguments: 47 | 48 | env_fns: iterable of callables - functions that create environments to run in subprocesses. Need to be cloud-pickleable 49 | in_series: number of environments to run in series in a single process 50 | (e.g. 
when len(env_fns) == 12 and in_series == 3, it will run 4 processes, each running 3 envs in series) 51 | """ 52 | self.waiting = False 53 | self.closed = False 54 | self.in_series = in_series 55 | nenvs = len(env_fns) 56 | assert nenvs % in_series == 0, "Number of envs must be divisible by number of envs to run in series" 57 | self.nremotes = nenvs // in_series 58 | env_fns = np.array_split(env_fns, self.nremotes) 59 | ctx = mp.get_context(context) 60 | self.remotes, self.work_remotes = zip(*[ctx.Pipe() for _ in range(self.nremotes)]) 61 | self.ps = [ctx.Process(target=worker, args=(work_remote, remote, CloudpickleWrapper(env_fn))) 62 | for (work_remote, remote, env_fn) in zip(self.work_remotes, self.remotes, env_fns)] 63 | for p in self.ps: 64 | p.daemon = True # if the main process crashes, we should not cause things to hang 65 | with clear_mpi_env_vars(): 66 | p.start() 67 | for remote in self.work_remotes: 68 | remote.close() 69 | 70 | self.remotes[0].send(('get_spaces_spec', None)) 71 | observation_space, action_space, self.spec = self.remotes[0].recv().x 72 | self.viewer = None 73 | VecEnv.__init__(self, nenvs, observation_space, action_space) 74 | 75 | def step_async(self, actions): 76 | self._assert_not_closed() 77 | actions = np.array_split(actions, self.nremotes) 78 | for remote, action in zip(self.remotes, actions): 79 | remote.send(('step', action)) 80 | self.waiting = True 81 | 82 | def step_wait(self): 83 | self._assert_not_closed() 84 | results = [remote.recv() for remote in self.remotes] 85 | results = _flatten_list(results) 86 | self.waiting = False 87 | obs, rews, dones, infos = zip(*results) 88 | return _flatten_obs(obs), np.stack(rews), np.stack(dones), infos 89 | 90 | def reset(self): 91 | self._assert_not_closed() 92 | for remote in self.remotes: 93 | remote.send(('reset', None)) 94 | obs = [remote.recv() for remote in self.remotes] 95 | obs = _flatten_list(obs) 96 | return _flatten_obs(obs) 97 | 98 | def close_extras(self): 99 | self.closed = True 100 | if self.waiting: 101 | for remote in self.remotes: 102 | remote.recv() 103 | for remote in self.remotes: 104 | remote.send(('close', None)) 105 | for p in self.ps: 106 | p.join() 107 | 108 | def get_images(self): 109 | self._assert_not_closed() 110 | for pipe in self.remotes: 111 | pipe.send(('render', None)) 112 | imgs = [pipe.recv() for pipe in self.remotes] 113 | imgs = _flatten_list(imgs) 114 | return imgs 115 | 116 | def _assert_not_closed(self): 117 | assert not self.closed, "Trying to operate on a SubprocVecEnv after calling close()" 118 | 119 | def __del__(self): 120 | if not self.closed: 121 | self.close() 122 | 123 | def _flatten_obs(obs): 124 | assert isinstance(obs, (list, tuple)) 125 | assert len(obs) > 0 126 | 127 | if isinstance(obs[0], dict): 128 | keys = obs[0].keys() 129 | return {k: np.stack([o[k] for o in obs]) for k in keys} 130 | else: 131 | return np.stack(obs) 132 | 133 | def _flatten_list(l): 134 | assert isinstance(l, (list, tuple)) 135 | assert len(l) > 0 136 | assert all([len(l_) > 0 for l_ in l]) 137 | 138 | return [l__ for l_ in l for l__ in l_] 139 | -------------------------------------------------------------------------------- /mher/common/vec_env/test_vec_env.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for asynchronous vectorized environments. 
3 | """ 4 | 5 | import gym 6 | import numpy as np 7 | import pytest 8 | from .dummy_vec_env import DummyVecEnv 9 | from .shmem_vec_env import ShmemVecEnv 10 | from .subproc_vec_env import SubprocVecEnv 11 | from mher.common.tests.test_with_mpi import with_mpi 12 | 13 | 14 | def assert_venvs_equal(venv1, venv2, num_steps): 15 | """ 16 | Compare two environments over num_steps steps and make sure 17 | that the observations produced by each are the same when given 18 | the same actions. 19 | """ 20 | assert venv1.num_envs == venv2.num_envs 21 | assert venv1.observation_space.shape == venv2.observation_space.shape 22 | assert venv1.observation_space.dtype == venv2.observation_space.dtype 23 | assert venv1.action_space.shape == venv2.action_space.shape 24 | assert venv1.action_space.dtype == venv2.action_space.dtype 25 | 26 | try: 27 | obs1, obs2 = venv1.reset(), venv2.reset() 28 | assert np.array(obs1).shape == np.array(obs2).shape 29 | assert np.array(obs1).shape == (venv1.num_envs,) + venv1.observation_space.shape 30 | assert np.allclose(obs1, obs2) 31 | venv1.action_space.seed(1337) 32 | for _ in range(num_steps): 33 | actions = np.array([venv1.action_space.sample() for _ in range(venv1.num_envs)]) 34 | for venv in [venv1, venv2]: 35 | venv.step_async(actions) 36 | outs1 = venv1.step_wait() 37 | outs2 = venv2.step_wait() 38 | for out1, out2 in zip(outs1[:3], outs2[:3]): 39 | assert np.array(out1).shape == np.array(out2).shape 40 | assert np.allclose(out1, out2) 41 | assert list(outs1[3]) == list(outs2[3]) 42 | finally: 43 | venv1.close() 44 | venv2.close() 45 | 46 | 47 | @pytest.mark.parametrize('klass', (ShmemVecEnv, SubprocVecEnv)) 48 | @pytest.mark.parametrize('dtype', ('uint8', 'float32')) 49 | def test_vec_env(klass, dtype): # pylint: disable=R0914 50 | """ 51 | Test that a vectorized environment is equivalent to 52 | DummyVecEnv, since DummyVecEnv is less likely to be 53 | error prone. 54 | """ 55 | num_envs = 3 56 | num_steps = 100 57 | shape = (3, 8) 58 | 59 | def make_fn(seed): 60 | """ 61 | Get an environment constructor with a seed. 62 | """ 63 | return lambda: SimpleEnv(seed, shape, dtype) 64 | fns = [make_fn(i) for i in range(num_envs)] 65 | env1 = DummyVecEnv(fns) 66 | env2 = klass(fns) 67 | assert_venvs_equal(env1, env2, num_steps=num_steps) 68 | 69 | 70 | @pytest.mark.parametrize('dtype', ('uint8', 'float32')) 71 | @pytest.mark.parametrize('num_envs_in_series', (3, 4, 6)) 72 | def test_sync_sampling(dtype, num_envs_in_series): 73 | """ 74 | Test that a SubprocVecEnv running with envs in series 75 | outputs the same as DummyVecEnv. 76 | """ 77 | num_envs = 12 78 | num_steps = 100 79 | shape = (3, 8) 80 | 81 | def make_fn(seed): 82 | """ 83 | Get an environment constructor with a seed. 84 | """ 85 | return lambda: SimpleEnv(seed, shape, dtype) 86 | fns = [make_fn(i) for i in range(num_envs)] 87 | env1 = DummyVecEnv(fns) 88 | env2 = SubprocVecEnv(fns, in_series=num_envs_in_series) 89 | assert_venvs_equal(env1, env2, num_steps=num_steps) 90 | 91 | 92 | @pytest.mark.parametrize('dtype', ('uint8', 'float32')) 93 | @pytest.mark.parametrize('num_envs_in_series', (3, 4, 6)) 94 | def test_sync_sampling_sanity(dtype, num_envs_in_series): 95 | """ 96 | Test that a SubprocVecEnv running with envs in series 97 | outputs the same as SubprocVecEnv without running in series. 98 | """ 99 | num_envs = 12 100 | num_steps = 100 101 | shape = (3, 8) 102 | 103 | def make_fn(seed): 104 | """ 105 | Get an environment constructor with a seed. 
106 | """ 107 | return lambda: SimpleEnv(seed, shape, dtype) 108 | fns = [make_fn(i) for i in range(num_envs)] 109 | env1 = SubprocVecEnv(fns) 110 | env2 = SubprocVecEnv(fns, in_series=num_envs_in_series) 111 | assert_venvs_equal(env1, env2, num_steps=num_steps) 112 | 113 | 114 | class SimpleEnv(gym.Env): 115 | """ 116 | An environment with a pre-determined observation space 117 | and RNG seed. 118 | """ 119 | 120 | def __init__(self, seed, shape, dtype): 121 | np.random.seed(seed) 122 | self._dtype = dtype 123 | self._start_obs = np.array(np.random.randint(0, 0x100, size=shape), 124 | dtype=dtype) 125 | self._max_steps = seed + 1 126 | self._cur_obs = None 127 | self._cur_step = 0 128 | # this is 0xFF instead of 0x100 because the Box space includes 129 | # the high end, while randint does not 130 | self.action_space = gym.spaces.Box(low=0, high=0xFF, shape=shape, dtype=dtype) 131 | self.observation_space = self.action_space 132 | 133 | def step(self, action): 134 | self._cur_obs += np.array(action, dtype=self._dtype) 135 | self._cur_step += 1 136 | done = self._cur_step >= self._max_steps 137 | reward = self._cur_step / self._max_steps 138 | return self._cur_obs, reward, done, {'foo': 'bar' + str(reward)} 139 | 140 | def reset(self): 141 | self._cur_obs = self._start_obs 142 | self._cur_step = 0 143 | return self._cur_obs 144 | 145 | def render(self, mode=None): 146 | raise NotImplementedError 147 | 148 | 149 | 150 | @with_mpi() 151 | def test_mpi_with_subprocvecenv(): 152 | shape = (2,3,4) 153 | nenv = 1 154 | venv = SubprocVecEnv([lambda: SimpleEnv(0, shape, 'float32')] * nenv) 155 | ob = venv.reset() 156 | venv.close() 157 | assert ob.shape == (nenv,) + shape 158 | 159 | -------------------------------------------------------------------------------- /mher/common/vec_env/test_video_recorder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests for asynchronous vectorized environments. 
3 | """ 4 | 5 | import gym 6 | import pytest 7 | import os 8 | import glob 9 | import tempfile 10 | 11 | from .dummy_vec_env import DummyVecEnv 12 | from .shmem_vec_env import ShmemVecEnv 13 | from .subproc_vec_env import SubprocVecEnv 14 | from .vec_video_recorder import VecVideoRecorder 15 | 16 | @pytest.mark.parametrize('klass', (DummyVecEnv, ShmemVecEnv, SubprocVecEnv)) 17 | @pytest.mark.parametrize('num_envs', (1, 4)) 18 | @pytest.mark.parametrize('video_length', (10, 100)) 19 | @pytest.mark.parametrize('video_interval', (1, 50)) 20 | def test_video_recorder(klass, num_envs, video_length, video_interval): 21 | """ 22 | Wrap an existing VecEnv with VevVideoRecorder, 23 | Make (video_interval + video_length + 1) steps, 24 | then check that the file is present 25 | """ 26 | 27 | def make_fn(): 28 | env = gym.make('PongNoFrameskip-v4') 29 | return env 30 | fns = [make_fn for _ in range(num_envs)] 31 | env = klass(fns) 32 | 33 | with tempfile.TemporaryDirectory() as video_path: 34 | env = VecVideoRecorder(env, video_path, record_video_trigger=lambda x: x % video_interval == 0, video_length=video_length) 35 | 36 | env.reset() 37 | for _ in range(video_interval + video_length + 1): 38 | env.step([0] * num_envs) 39 | env.close() 40 | 41 | 42 | recorded_video = glob.glob(os.path.join(video_path, "*.mp4")) 43 | 44 | # first and second step 45 | assert len(recorded_video) == 2 46 | # Files are not empty 47 | assert all(os.stat(p).st_size != 0 for p in recorded_video) 48 | 49 | 50 | -------------------------------------------------------------------------------- /mher/common/vec_env/util.py: -------------------------------------------------------------------------------- 1 | """ 2 | Helpers for dealing with vectorized environments. 3 | """ 4 | 5 | from collections import OrderedDict 6 | 7 | import gym 8 | import numpy as np 9 | 10 | 11 | def copy_obs_dict(obs): 12 | """ 13 | Deep-copy an observation dict. 14 | """ 15 | return {k: np.copy(v) for k, v in obs.items()} 16 | 17 | 18 | def dict_to_obs(obs_dict): 19 | """ 20 | Convert an observation dict into a raw array if the 21 | original observation space was not a Dict space. 22 | """ 23 | if set(obs_dict.keys()) == {None}: 24 | return obs_dict[None] 25 | return obs_dict 26 | 27 | 28 | def obs_space_info(obs_space): 29 | """ 30 | Get dict-structured information about a gym.Space. 31 | 32 | Returns: 33 | A tuple (keys, shapes, dtypes): 34 | keys: a list of dict keys. 35 | shapes: a dict mapping keys to shapes. 36 | dtypes: a dict mapping keys to dtypes. 37 | """ 38 | if isinstance(obs_space, gym.spaces.Dict): 39 | assert isinstance(obs_space.spaces, OrderedDict) 40 | subspaces = obs_space.spaces 41 | elif isinstance(obs_space, gym.spaces.Tuple): 42 | assert isinstance(obs_space.spaces, tuple) 43 | subspaces = {i: obs_space.spaces[i] for i in range(len(obs_space.spaces))} 44 | else: 45 | subspaces = {None: obs_space} 46 | keys = [] 47 | shapes = {} 48 | dtypes = {} 49 | for key, box in subspaces.items(): 50 | keys.append(key) 51 | shapes[key] = box.shape 52 | dtypes[key] = box.dtype 53 | return keys, shapes, dtypes 54 | 55 | 56 | def obs_to_dict(obs): 57 | """ 58 | Convert an observation into a dict. 
59 | """ 60 | if isinstance(obs, dict): 61 | return obs 62 | return {None: obs} 63 | -------------------------------------------------------------------------------- /mher/common/vec_env/vec_frame_stack.py: -------------------------------------------------------------------------------- 1 | from .vec_env import VecEnvWrapper 2 | import numpy as np 3 | from gym import spaces 4 | 5 | 6 | class VecFrameStack(VecEnvWrapper): 7 | def __init__(self, venv, nstack): 8 | self.venv = venv 9 | self.nstack = nstack 10 | wos = venv.observation_space # wrapped ob space 11 | low = np.repeat(wos.low, self.nstack, axis=-1) 12 | high = np.repeat(wos.high, self.nstack, axis=-1) 13 | self.stackedobs = np.zeros((venv.num_envs,) + low.shape, low.dtype) 14 | observation_space = spaces.Box(low=low, high=high, dtype=venv.observation_space.dtype) 15 | VecEnvWrapper.__init__(self, venv, observation_space=observation_space) 16 | 17 | def step_wait(self): 18 | obs, rews, news, infos = self.venv.step_wait() 19 | self.stackedobs = np.roll(self.stackedobs, shift=-1, axis=-1) 20 | for (i, new) in enumerate(news): 21 | if new: 22 | self.stackedobs[i] = 0 23 | self.stackedobs[..., -obs.shape[-1]:] = obs 24 | return self.stackedobs, rews, news, infos 25 | 26 | def reset(self): 27 | obs = self.venv.reset() 28 | self.stackedobs[...] = 0 29 | self.stackedobs[..., -obs.shape[-1]:] = obs 30 | return self.stackedobs 31 | -------------------------------------------------------------------------------- /mher/common/vec_env/vec_monitor.py: -------------------------------------------------------------------------------- 1 | from . import VecEnvWrapper 2 | from mher.common.monitor import ResultsWriter 3 | import numpy as np 4 | import time 5 | from collections import deque 6 | 7 | class VecMonitor(VecEnvWrapper): 8 | def __init__(self, venv, filename=None, keep_buf=0, info_keywords=()): 9 | VecEnvWrapper.__init__(self, venv) 10 | self.eprets = None 11 | self.eplens = None 12 | self.epcount = 0 13 | self.tstart = time.time() 14 | if filename: 15 | self.results_writer = ResultsWriter(filename, header={'t_start': self.tstart}, 16 | extra_keys=info_keywords) 17 | else: 18 | self.results_writer = None 19 | self.info_keywords = info_keywords 20 | self.keep_buf = keep_buf 21 | if self.keep_buf: 22 | self.epret_buf = deque([], maxlen=keep_buf) 23 | self.eplen_buf = deque([], maxlen=keep_buf) 24 | 25 | def reset(self): 26 | obs = self.venv.reset() 27 | self.eprets = np.zeros(self.num_envs, 'f') 28 | self.eplens = np.zeros(self.num_envs, 'i') 29 | return obs 30 | 31 | def step_wait(self): 32 | obs, rews, dones, infos = self.venv.step_wait() 33 | self.eprets += rews 34 | self.eplens += 1 35 | 36 | newinfos = list(infos[:]) 37 | for i in range(len(dones)): 38 | if dones[i]: 39 | info = infos[i].copy() 40 | ret = self.eprets[i] 41 | eplen = self.eplens[i] 42 | epinfo = {'r': ret, 'l': eplen, 't': round(time.time() - self.tstart, 6)} 43 | for k in self.info_keywords: 44 | epinfo[k] = info[k] 45 | info['episode'] = epinfo 46 | if self.keep_buf: 47 | self.epret_buf.append(ret) 48 | self.eplen_buf.append(eplen) 49 | self.epcount += 1 50 | self.eprets[i] = 0 51 | self.eplens[i] = 0 52 | if self.results_writer: 53 | self.results_writer.write_row(epinfo) 54 | newinfos[i] = info 55 | return obs, rews, dones, newinfos 56 | -------------------------------------------------------------------------------- /mher/common/vec_env/vec_normalize.py: -------------------------------------------------------------------------------- 1 | from . 
import VecEnvWrapper 2 | import numpy as np 3 | 4 | class VecNormalize(VecEnvWrapper): 5 | """ 6 | A vectorized wrapper that normalizes the observations 7 | and returns from an environment. 8 | """ 9 | 10 | def __init__(self, venv, ob=True, ret=True, clipob=10., cliprew=10., gamma=0.99, epsilon=1e-8, use_tf=False): 11 | VecEnvWrapper.__init__(self, venv) 12 | if use_tf: 13 | from mher.common.running_mean_std import TfRunningMeanStd 14 | self.ob_rms = TfRunningMeanStd(shape=self.observation_space.shape, scope='ob_rms') if ob else None 15 | self.ret_rms = TfRunningMeanStd(shape=(), scope='ret_rms') if ret else None 16 | else: 17 | from mher.common.running_mean_std import RunningMeanStd 18 | self.ob_rms = RunningMeanStd(shape=self.observation_space.shape) if ob else None 19 | self.ret_rms = RunningMeanStd(shape=()) if ret else None 20 | self.clipob = clipob 21 | self.cliprew = cliprew 22 | self.ret = np.zeros(self.num_envs) 23 | self.gamma = gamma 24 | self.epsilon = epsilon 25 | 26 | def step_wait(self): 27 | obs, rews, news, infos = self.venv.step_wait() 28 | self.ret = self.ret * self.gamma + rews 29 | obs = self._obfilt(obs) 30 | if self.ret_rms: 31 | self.ret_rms.update(self.ret) 32 | rews = np.clip(rews / np.sqrt(self.ret_rms.var + self.epsilon), -self.cliprew, self.cliprew) 33 | self.ret[news] = 0. 34 | return obs, rews, news, infos 35 | 36 | def _obfilt(self, obs): 37 | if self.ob_rms: 38 | self.ob_rms.update(obs) 39 | obs = np.clip((obs - self.ob_rms.mean) / np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, self.clipob) 40 | return obs 41 | else: 42 | return obs 43 | 44 | def reset(self): 45 | self.ret = np.zeros(self.num_envs) 46 | obs = self.venv.reset() 47 | return self._obfilt(obs) 48 | -------------------------------------------------------------------------------- /mher/common/vec_env/vec_remove_dict_obs.py: -------------------------------------------------------------------------------- 1 | from .vec_env import VecEnvObservationWrapper 2 | 3 | class VecExtractDictObs(VecEnvObservationWrapper): 4 | def __init__(self, venv, key): 5 | self.key = key 6 | super().__init__(venv=venv, 7 | observation_space=venv.observation_space.spaces[self.key]) 8 | 9 | def process(self, obs): 10 | return obs[self.key] 11 | -------------------------------------------------------------------------------- /mher/common/vec_env/vec_video_recorder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from mher import logger 3 | from mher.common.vec_env import VecEnvWrapper 4 | from gym.wrappers.monitoring import video_recorder 5 | 6 | 7 | class VecVideoRecorder(VecEnvWrapper): 8 | """ 9 | Wrap VecEnv to record rendered image as mp4 video. 10 | """ 11 | 12 | def __init__(self, venv, directory, record_video_trigger, video_length=200): 13 | """ 14 | # Arguments 15 | venv: VecEnv to wrap 16 | directory: Where to save videos 17 | record_video_trigger: 18 | Function that defines when to start recording. 19 | The function takes the current number of step, 20 | and returns whether we should start recording or not. 
21 | video_length: Length of recorded video 22 | """ 23 | 24 | VecEnvWrapper.__init__(self, venv) 25 | self.record_video_trigger = record_video_trigger 26 | self.video_recorder = None 27 | 28 | self.directory = os.path.abspath(directory) 29 | if not os.path.exists(self.directory): os.mkdir(self.directory) 30 | 31 | self.file_prefix = "vecenv" 32 | self.file_infix = '{}'.format(os.getpid()) 33 | self.step_id = 0 34 | self.video_length = video_length 35 | 36 | self.recording = False 37 | self.recorded_frames = 0 38 | 39 | def reset(self): 40 | obs = self.venv.reset() 41 | 42 | self.start_video_recorder() 43 | 44 | return obs 45 | 46 | def start_video_recorder(self): 47 | self.close_video_recorder() 48 | 49 | base_path = os.path.join(self.directory, '{}.video.{}.video{:06}'.format(self.file_prefix, self.file_infix, self.step_id)) 50 | self.video_recorder = video_recorder.VideoRecorder( 51 | env=self.venv, 52 | base_path=base_path, 53 | metadata={'step_id': self.step_id} 54 | ) 55 | 56 | self.video_recorder.capture_frame() 57 | self.recorded_frames = 1 58 | self.recording = True 59 | 60 | def _video_enabled(self): 61 | return self.record_video_trigger(self.step_id) 62 | 63 | def step_wait(self): 64 | obs, rews, dones, infos = self.venv.step_wait() 65 | 66 | self.step_id += 1 67 | if self.recording: 68 | self.video_recorder.capture_frame() 69 | self.recorded_frames += 1 70 | if self.recorded_frames > self.video_length: 71 | logger.info("Saving video to ", self.video_recorder.path) 72 | self.close_video_recorder() 73 | elif self._video_enabled(): 74 | self.start_video_recorder() 75 | 76 | return obs, rews, dones, infos 77 | 78 | def close_video_recorder(self): 79 | if self.recording: 80 | self.video_recorder.close() 81 | self.recording = False 82 | self.recorded_frames = 0 83 | 84 | def close(self): 85 | VecEnvWrapper.close(self) 86 | self.close_video_recorder() 87 | 88 | def __del__(self): 89 | self.close() 90 | -------------------------------------------------------------------------------- /mher/common/wrappers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | 3 | class TimeLimit(gym.Wrapper): 4 | def __init__(self, env, max_episode_steps=None): 5 | super(TimeLimit, self).__init__(env) 6 | self._max_episode_steps = max_episode_steps 7 | self._elapsed_steps = 0 8 | 9 | def step(self, ac): 10 | observation, reward, done, info = self.env.step(ac) 11 | self._elapsed_steps += 1 12 | if self._elapsed_steps >= self._max_episode_steps: 13 | done = True 14 | info['TimeLimit.truncated'] = True 15 | return observation, reward, done, info 16 | 17 | def reset(self, **kwargs): 18 | self._elapsed_steps = 0 19 | return self.env.reset(**kwargs) 20 | 21 | class ClipActionsWrapper(gym.Wrapper): 22 | def step(self, action): 23 | import numpy as np 24 | action = np.nan_to_num(action) 25 | action = np.clip(action, self.action_space.low, self.action_space.high) 26 | return self.env.step(action) 27 | 28 | def reset(self, **kwargs): 29 | return self.env.reset(**kwargs) 30 | -------------------------------------------------------------------------------- /mher/default_cfg.py: -------------------------------------------------------------------------------- 1 | DEFAULT_ENV_PARAMS = { 2 | 'SawyerPush-v0':{ 3 | 'n_cycles':10, 4 | 'n_batches':5, 5 | 'n_test_rollouts':50, 6 | 'batch_size':64, 7 | 'rollout_batch_size':1 8 | }, 9 | 'SawyerReachXYZEnv-v1':{ 10 | 'n_cycles':5, 11 | 'n_batches':2, 12 | 'n_test_rollouts':50, 13 | 'batch_size':64 14 | }, 15 | 'FetchReach-v1': { 
16 | 'n_cycles': 10, 17 | 'n_test_rollouts': 20, 18 | 'n_batches': 2, 19 | 'batch_size': 64, 20 | }, 21 | # 'FetchPush-v1': { 22 | # 'n_cycles': 10, 23 | # 'n_test_rollouts': 20, 24 | # 'n_batches': 10, 25 | # 'batch_size': 256, 26 | # }, 27 | } 28 | 29 | 30 | DEFAULT_PARAMS = { 31 | # algorithm 32 | 'algo':'ddpg', 33 | # env 34 | 'max_u': 1., # max absolute value of actions on different coordinates 35 | # ddpg 36 | 'layers': 3, # number of layers in the critic/actor networks 37 | 'hidden': 256, # number of neurons in each hidden layers 38 | 'Q_lr': 0.001, # critic learning rate 39 | 'pi_lr': 0.001, # actor learning rate 40 | 'polyak': 0.95, # polyak averaging coefficient 41 | 'action_l2': 1.0, # quadratic penalty on actions (before rescaling by max_u) 42 | 'clip_obs': 200., 43 | 'relative_goals': False, 44 | 'clip_pos_returns': True, 45 | 'clip_return': True, 46 | 47 | # sac 48 | 'sac_alpha':0.03, 49 | 50 | # buffer 51 | 'buffer_size': int(1E6), # for experience replay 52 | 'sampler': 'random', 53 | 54 | # training 55 | 'n_cycles': 50, # per epoch 56 | 'rollout_batch_size': 2, # per mpi thread 57 | 'n_batches': 40, # training batches per cycle 58 | 'batch_size': 1024, #258 per mpi thread, measured in transitions and reduced to even multiple of chunk_length. 59 | 'n_test_rollouts': 10, # number of test rollouts per epoch, each consists of rollout_batch_size rollouts 60 | 'test_with_polyak': False, # run test episodes with the target network 61 | # playing 62 | 'play_episodes':1, # number of running test episodes 63 | # saving 64 | 'policy_save_interval': 10, 65 | # exploration 66 | 'random_eps': 0.3, # percentage of time a random action is taken 67 | 'noise_eps': 0.2, # std of gaussian noise added to not-completely-random actions as a percentage of max_u 68 | # HER 69 | 'replay_strategy': 'future', # supported modes: future, none 70 | 'relabel_p': 0.8, # relabeling probability 71 | # normalization 72 | 'norm_eps': 1e-4, # epsilon used for observation normalization 73 | 'norm_clip': 5, # normalized observations are cropped to this values 74 | 75 | # random init episode 76 | 'random_init':100, # for dynamic n-step, this should be bigger 77 | 78 | # prioritized experience replay 79 | 'alpah': 0.6, 80 | 'beta': 0.4, 81 | 'eps': 1e-5, 82 | 83 | # n step hindsight experience 84 | 'nstep':3, 85 | 'use_nstep':False, 86 | 87 | # lambda n-step 88 | 'use_lambda_nstep':False, 89 | 'lamb':0.7, 90 | 91 | # dynamic n-step 92 | 'use_dynamic_nstep':False, 93 | 'alpha':0.5, 94 | 'dynamic_batchsize':512, # warm up the dynamic model 95 | 'dynamic_init':500, 96 | 97 | # if do not use her 98 | 'no_her':False # no her, will be used for DDPG and n-step DDPG 99 | } -------------------------------------------------------------------------------- /mher/envs/__pycache__/env_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/envs/__pycache__/env_utils.cpython-36.pyc -------------------------------------------------------------------------------- /mher/envs/__pycache__/make_env_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/envs/__pycache__/make_env_utils.cpython-36.pyc -------------------------------------------------------------------------------- /mher/envs/env_utils.py: 
-------------------------------------------------------------------------------- 1 | ''' 2 | Util tools for environments 3 | ''' 4 | import re 5 | 6 | import gym 7 | 8 | 9 | def simple_goal_subtract(a, b): 10 | assert a.shape == b.shape 11 | return a - b 12 | 13 | def g_to_ag(o, env_id): 14 | if env_id == 'FetchReach': 15 | ag = o[:,0:3] 16 | elif env_id in ['FetchPush','FetchSlide', 'FetchPickAndPlace']: 17 | ag = o[:,3:6] 18 | else: 19 | raise NotImplementedError 20 | return ag 21 | 22 | CACHED_ENVS = {} 23 | def cached_make_env(make_env): 24 | """ 25 | Only creates a new environment from the provided function if one has not yet already been 26 | created. This is useful here because we need to infer certain properties of the env, e.g. 27 | its observation and action spaces, without any intend of actually using it. 28 | """ 29 | if make_env not in CACHED_ENVS: 30 | env = make_env() 31 | CACHED_ENVS[make_env] = env 32 | return CACHED_ENVS[make_env] 33 | 34 | def get_rewardfun(params, tmp_env): 35 | tmp_env.reset() 36 | def reward_fun(ag_2, g, info): # vectorized 37 | return tmp_env.compute_reward(achieved_goal=ag_2, desired_goal=g, info=info) 38 | return reward_fun 39 | 40 | def get_env_type(args, _game_envs): 41 | env_id = args.env 42 | # Re-parse the gym registry, since we could have new envs since last time. 43 | for env in gym.envs.registry.all(): 44 | try: 45 | env_type = env.entry_point.split(':')[0].split('.')[-1] 46 | _game_envs[env_type].add(env.id) # This is a set so add is idempotent 47 | except: 48 | pass 49 | 50 | if env_id in _game_envs.keys(): 51 | env_type = env_id 52 | env_id = [g for g in _game_envs[env_type]][0] 53 | else: 54 | env_type = None 55 | for g, e in _game_envs.items(): 56 | if env_id in e: 57 | env_type = g 58 | break 59 | if ':' in env_id: 60 | env_type = re.sub(r':.*', '', env_id) 61 | assert env_type is not None, 'env_id {} is not recognized in env types {}'.format(env_id, _game_envs.keys()) 62 | 63 | return env_type, env_id 64 | 65 | def obs_to_goal_fun(env): 66 | # only support Fetchenv and Handenv now 67 | from gym.envs.robotics import FetchEnv, hand_env 68 | from multiworld.envs.mujoco.sawyer_xyz import (sawyer_push_nips, 69 | sawyer_reach) 70 | from multiworld.envs.pygame import point2d 71 | 72 | if isinstance(env.env, FetchEnv): 73 | obs_dim = env.observation_space['observation'].shape[0] 74 | goal_dim = env.observation_space['desired_goal'].shape[0] 75 | temp_dim = env.sim.data.get_site_xpos('robot0:grip').shape[0] 76 | def obs_to_goal(observation): 77 | observation = observation.reshape(-1, obs_dim) 78 | if env.has_object: 79 | goal = observation[:, temp_dim:temp_dim + goal_dim] 80 | else: 81 | goal = observation[:, :goal_dim] 82 | return goal.copy() 83 | elif isinstance(env.env, hand_env.HandEnv): 84 | goal_dim = env.observation_space['desired_goal'].shape[0] 85 | def obs_to_goal(observation): 86 | goal = observation[:, -goal_dim:] 87 | return goal.copy() 88 | elif isinstance(env.env.env, point2d.Point2DEnv): 89 | def obs_to_goal(observation): 90 | return observation.copy() 91 | elif isinstance(env.env.env, sawyer_push_nips.SawyerPushAndReachXYEnv): 92 | assert env.env.env.observation_space['observation'].shape == env.env.env.observation_space['achieved_goal'].shape, \ 93 | "This environment's observation space doesn't equal goal space" 94 | def obs_to_goal(observation): 95 | return observation 96 | elif isinstance(env.env.env, sawyer_reach.SawyerReachXYZEnv): 97 | def obs_to_goal(observation): 98 | return observation 99 | else: 100 | import pdb; 
pdb.set_trace() 101 | raise NotImplementedError('Do not support such type {}'.format(env)) 102 | 103 | return obs_to_goal 104 | -------------------------------------------------------------------------------- /mher/envs/make_env_utils.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import os 3 | import sys 4 | 5 | import gym 6 | import tensorflow as tf 7 | from gym.wrappers import FilterObservation, FlattenObservation 8 | from mher.common import logger, retro_wrappers, set_global_seeds 9 | from mher.common.init_utils import init_mpi_import 10 | from mher.common.monitor import Monitor 11 | from mher.common.tf_util import get_session 12 | from mher.common.vec_env import VecEnv, VecFrameStack, VecNormalize 13 | from mher.common.vec_env.dummy_vec_env import DummyVecEnv 14 | from mher.common.vec_env.subproc_vec_env import SubprocVecEnv 15 | from mher.common.wrappers import ClipActionsWrapper 16 | from mher.envs.env_utils import get_env_type 17 | 18 | MPI = init_mpi_import() 19 | 20 | def build_env(args, _game_envs): 21 | ncpu = multiprocessing.cpu_count() 22 | if sys.platform == 'darwin': ncpu //= 2 23 | alg = args.alg 24 | seed = args.seed 25 | 26 | env_type, env_id = get_env_type(args, _game_envs) 27 | config = tf.ConfigProto(allow_soft_placement=True, 28 | intra_op_parallelism_threads=1, 29 | inter_op_parallelism_threads=1) 30 | config.gpu_options.allow_growth = True 31 | get_session(config=config) 32 | 33 | reward_scale = args.reward_scale if hasattr(args, 'reward_scale') else 1 34 | flatten_dict_observations = alg not in {'her'} 35 | env = make_vec_env(env_id, env_type, args.num_env or 1, seed, 36 | reward_scale=reward_scale, 37 | flatten_dict_observations=flatten_dict_observations) 38 | 39 | if env_type == 'mujoco': 40 | env = VecNormalize(env, use_tf=True) 41 | # build one simple env without vector wrapper 42 | tmp_env = make_env(env_id, env_type, seed=seed, 43 | reward_scale=reward_scale, 44 | flatten_dict_observations=flatten_dict_observations, 45 | logger_dir=logger.get_dir()) 46 | 47 | return env, tmp_env 48 | 49 | def make_vec_env(env_id, env_type, num_env, seed, 50 | wrapper_kwargs=None, 51 | env_kwargs=None, 52 | start_index=0, 53 | reward_scale=1.0, 54 | flatten_dict_observations=True, 55 | gamestate=None, 56 | initializer=None, 57 | force_dummy=False): 58 | """ 59 | Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo. 
60 | """ 61 | wrapper_kwargs = wrapper_kwargs or {} 62 | env_kwargs = env_kwargs or {} 63 | mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 64 | seed = seed + 10000 * mpi_rank if seed is not None else None 65 | logger_dir = logger.get_dir() 66 | def make_thunk(rank, initializer=None): 67 | return lambda: make_env( 68 | env_id=env_id, 69 | env_type=env_type, 70 | mpi_rank=mpi_rank, 71 | subrank=rank, 72 | seed=seed, 73 | reward_scale=reward_scale, 74 | gamestate=gamestate, 75 | flatten_dict_observations=flatten_dict_observations, 76 | wrapper_kwargs=wrapper_kwargs, 77 | env_kwargs=env_kwargs, 78 | logger_dir=logger_dir, 79 | initializer=initializer 80 | ) 81 | set_global_seeds(seed) 82 | if not force_dummy and num_env > 1: 83 | return SubprocVecEnv([make_thunk(i + start_index, initializer=initializer) for i in range(num_env)]) 84 | else: 85 | return DummyVecEnv([make_thunk(i + start_index, initializer=None) for i in range(num_env)]) 86 | 87 | 88 | def make_env(env_id, env_type, mpi_rank=0, subrank=0, seed=None, reward_scale=1.0, gamestate=None, 89 | flatten_dict_observations=True, wrapper_kwargs=None, env_kwargs=None, logger_dir=None, initializer=None): 90 | if initializer is not None: 91 | initializer(mpi_rank=mpi_rank, subrank=subrank) 92 | 93 | wrapper_kwargs = wrapper_kwargs or {} 94 | env_kwargs = env_kwargs or {} 95 | if ':' in env_id: 96 | import importlib 97 | import re 98 | module_name = re.sub(':.*','',env_id) 99 | env_id = re.sub('.*:', '', env_id) 100 | importlib.import_module(module_name) 101 | 102 | env = gym.make(env_id, **env_kwargs) 103 | # if env_id.startswith('Sawyer'): 104 | # from mher.algos.multi_world_wrapper import SawyerGoalWrapper 105 | # env = SawyerGoalWrapper(env) 106 | # if (env_id.startswith('Sawyer') or env_id.startswith('Point2D')) and not hasattr(env, '_max_episode_steps'): 107 | # env = gym.wrappers.TimeLimit(env, max_episode_steps=100) 108 | 109 | if flatten_dict_observations and isinstance(env.observation_space, gym.spaces.Dict): 110 | env = FlattenObservation(env) 111 | 112 | env.seed(seed + subrank if seed is not None else None) 113 | env = Monitor(env, 114 | logger_dir and os.path.join(logger_dir, str(mpi_rank) + '.' + str(subrank)), 115 | allow_early_resets=True) 116 | 117 | if isinstance(env.action_space, gym.spaces.Box): 118 | env = ClipActionsWrapper(env) 119 | 120 | if reward_scale != 1: 121 | env = retro_wrappers.RewardScaler(env, reward_scale) 122 | return env 123 | 124 | def make_mujoco_env(env_id, seed, reward_scale=1.0): 125 | """ 126 | Create a wrapped, monitored gym.Env for MuJoCo. 127 | """ 128 | rank = MPI.COMM_WORLD.Get_rank() 129 | myseed = seed + 1000 * rank if seed is not None else None 130 | set_global_seeds(myseed) 131 | env = gym.make(env_id) 132 | logger_path = None if logger.get_dir() is None else os.path.join(logger.get_dir(), str(rank)) 133 | env = Monitor(env, logger_path, allow_early_resets=True) 134 | env.seed(seed) 135 | if reward_scale != 1.0: 136 | from mher.common.retro_wrappers import RewardScaler 137 | env = RewardScaler(env, reward_scale) 138 | return env 139 | 140 | def make_robotics_env(env_id, seed, rank=0): 141 | """ 142 | Create a wrapped, monitored gym.Env for MuJoCo. 
143 | """ 144 | set_global_seeds(seed) 145 | env = gym.make(env_id) 146 | env = FlattenObservation(FilterObservation(env, ['observation', 'desired_goal'])) 147 | env = Monitor( 148 | env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)), 149 | info_keywords=('is_success',)) 150 | env.seed(seed) 151 | return env 152 | -------------------------------------------------------------------------------- /mher/envs/wrappers/__pycache__/wrapper_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/envs/wrappers/__pycache__/wrapper_utils.cpython-36.pyc -------------------------------------------------------------------------------- /mher/envs/wrappers/multi_world_wrapper.py: -------------------------------------------------------------------------------- 1 | import copy 2 | 3 | import gym 4 | import multiworld 5 | import numpy as np 6 | from gym.core import Wrapper 7 | 8 | 9 | # for point env 10 | class PointGoalWrapper(Wrapper): 11 | def __init__(self, env): 12 | Wrapper.__init__(self, env=env) 13 | self.env = env 14 | self.action_space = env.action_space 15 | self.observation_space = env.observation_space 16 | 17 | def reset(self): 18 | return self.env.reset() 19 | 20 | def step(self, action): 21 | 22 | obs_dict, reward, done, info = self.env.step(action) 23 | obs = { 24 | 'observation':obs_dict['observation'], 25 | 'desired_goal':obs_dict['desired_goal'], 26 | 'achieved_goal':obs_dict['achieved_goal'] 27 | } 28 | return obs, reward, done, info 29 | 30 | def render(self, mode='human'): 31 | return self.env.render() 32 | 33 | def compute_reward(self, achieved_goal, desired_goal, info=None): 34 | obs = { 35 | 'state_achieved_goal': achieved_goal, 36 | 'state_desired_goal':desired_goal 37 | } 38 | action = np.array([]) 39 | return self.env.compute_reward(action, obs) 40 | 41 | def sample_goal(self): 42 | goal_dict = self.env.sample_goal() 43 | return goal_dict['desired_goal'] 44 | 45 | # for sawyer env 46 | class SawyerGoalWrapper(Wrapper): 47 | reward_type_dict = { 48 | 'dense':'hand_distance', 49 | 'sparse':'hand_success' 50 | } 51 | observation_keys = ['observation', 'desired_goal', 'achieved_goal'] 52 | 53 | def __init__(self, env, reward_type='sparse'): 54 | Wrapper.__init__(self, env=env) 55 | self.env = env 56 | self.action_space = env.action_space 57 | # observation 58 | for key in list(env.observation_space.spaces.keys()): 59 | if key not in self.observation_keys: 60 | del env.observation_space.spaces[key] 61 | 62 | self.observation_space = env.observation_space 63 | self.reward_type = reward_type 64 | self.env.reward_type = self.reward_type_dict[self.reward_type] 65 | # self.env.indicator_threshold = 0.03 66 | 67 | def reset(self): 68 | return self.env.reset() 69 | 70 | def step(self, action): 71 | obs_dict, reward, done, info = self.env.step(action) 72 | obs = { 73 | 'observation':obs_dict['observation'], 74 | 'desired_goal':obs_dict['desired_goal'], 75 | 'achieved_goal':obs_dict['achieved_goal'] 76 | } 77 | if 'hand_success' in info.keys(): 78 | info['is_success'] = info['hand_success'] 79 | if 'success' in info.keys(): 80 | info['is_success'] = info['success'] 81 | 82 | return obs, reward, done, info 83 | 84 | def render(self, mode='human'): 85 | return self.env.render() 86 | 87 | def compute_reward(self, achieved_goal, desired_goal, info): 88 | obs = { 89 |
'state_achieved_goal': achieved_goal, 90 | 'state_desired_goal':desired_goal 91 | } 92 | action = np.array([]) 93 | return self.env.compute_rewards(action, obs) 94 | 95 | def sample_goal(self): 96 | goal_dict = self.env.sample_goal() 97 | return goal_dict['desired_goal'] 98 | -------------------------------------------------------------------------------- /mher/envs/wrappers/wrapper_utils.py: -------------------------------------------------------------------------------- 1 | 2 | def recurse_attribute(obj, attr, max_depth=3): 3 | '''find env's attribution''' 4 | tmp_obj = obj 5 | depth = 0 6 | while depth < max_depth and not hasattr(tmp_obj, attr): 7 | tmp_obj = tmp_obj.env 8 | depth += 1 9 | if hasattr(tmp_obj, attr): 10 | return getattr(tmp_obj, attr) 11 | else: 12 | return None 13 | 14 | -------------------------------------------------------------------------------- /mher/play.py: -------------------------------------------------------------------------------- 1 | # DEPRECATED, use --play flag to mher.run instead 2 | import pickle 3 | 4 | import click 5 | import numpy as np 6 | 7 | import mher.config as config 8 | from mher.rollouts.rollout import RolloutWorker 9 | from mher.common import logger, set_global_seeds 10 | from mher.common.vec_env import VecEnv 11 | 12 | 13 | @click.command() 14 | @click.argument('policy_file', type=str) 15 | @click.option('--seed', type=int, default=0) 16 | @click.option('--n_test_rollouts', type=int, default=10) 17 | @click.option('--render', type=int, default=1) 18 | 19 | def main(policy_file, seed, n_test_rollouts, render): 20 | set_global_seeds(seed) 21 | 22 | # Load policy. 23 | with open(policy_file, 'rb') as f: 24 | policy = pickle.load(f) 25 | env_name = policy.info['env_name'] 26 | 27 | # Prepare params. 28 | params = config.DEFAULT_PARAMS 29 | if env_name in config.DEFAULT_ENV_PARAMS: 30 | params.update(config.DEFAULT_ENV_PARAMS[env_name]) # merge env-specific parameters in 31 | params['env_name'] = env_name 32 | params = config.prepare_params(params) 33 | config.log_params(params, logger=logger) 34 | dims = config.configure_dims(params) 35 | 36 | eval_params = { 37 | 'exploit': True, 38 | 'use_target_net': params['test_with_polyak'], 39 | 'compute_Q': True, 40 | 'rollout_batch_size': 1, 41 | 'render': bool(render), 42 | } 43 | for name in ['T', 'gamma', 'noise_eps', 'random_eps']: 44 | eval_params[name] = params[name] 45 | 46 | evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params) 47 | evaluator.seed(seed) 48 | 49 | # Run evaluation. 
50 | evaluator.clear_history() 51 | for _ in range(n_test_rollouts): 52 | evaluator.generate_rollouts() 53 | 54 | # record logs 55 | for key, val in evaluator.logs('test'): 56 | logger.record_tabular(key, np.mean(val)) 57 | logger.dump_tabular() 58 | 59 | 60 | # playing with a model and an environment 61 | def play(model, env, episodes=1): 62 | logger.log("Running trained model") 63 | obs = env.reset() 64 | state = model.initial_state if hasattr(model, 'initial_state') else None 65 | dones = np.zeros((1,)) 66 | 67 | episode_rew = np.zeros((episodes, env.num_envs)) if isinstance(env, VecEnv) else np.zeros((episodes, 1)) 68 | ep_num = 0 69 | while ep_num < episodes: 70 | actions, _, _, _ = model.step(obs) 71 | 72 | obs, rew, done, _ = env.step(actions) 73 | episode_rew[ep_num] += rew 74 | env.render() 75 | done_any = done.any() if isinstance(done, np.ndarray) else done 76 | if done_any: 77 | logger.log('episode_rew={}'.format(episode_rew[ep_num])) 78 | ep_num += 1 79 | obs = env.reset() 80 | average_reward = np.mean(episode_rew) 81 | logger.log('Total average test reward:{}'.format(average_reward)) 82 | return average_reward 83 | 84 | 85 | if __name__ == '__main__': 86 | main() 87 | -------------------------------------------------------------------------------- /mher/plot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | import json 5 | import math 6 | from numpy.core.fromnumeric import size 7 | from numpy.lib.function_base import i0 8 | from numpy.lib.npyio import save 9 | from numpy.ma.core import right_shift 10 | import seaborn as sns; sns.set() 11 | import glob2 12 | import argparse 13 | plt.rcParams['pdf.fonttype'] = 42 14 | plt.rcParams['ps.fonttype'] = 42 15 | 16 | 17 | smooth = True 18 | 19 | def smooth_reward_curve(x, y): 20 | halfwidth = 2 21 | k = halfwidth 22 | xsmoo = x 23 | ysmoo = np.convolve(y, np.ones(2 * k + 1), mode='same') / np.convolve(np.ones_like(y), np.ones(2 * k + 1), 24 | mode='same') 25 | return xsmoo, ysmoo 26 | 27 | 28 | def load_results(file): 29 | if not os.path.exists(file): 30 | return None 31 | with open(file, 'r') as f: 32 | lines = [line for line in f] 33 | if len(lines) < 2: 34 | return None 35 | keys = [name.strip() for name in lines[0].split(',')] 36 | try: 37 | data = np.genfromtxt(file, delimiter=',', skip_header=1, filling_values=0.) 38 | except: 39 | import pdb; pdb.set_trace() 40 | if data.ndim == 1: 41 | data = data.reshape(1, -1) 42 | assert data.ndim == 2 43 | assert data.shape[-1] == len(keys) 44 | result = {} 45 | for idx, key in enumerate(keys): 46 | result[key] = data[:, idx] 47 | return result 48 | 49 | 50 | def pad(xs, value=np.nan): 51 | maxlen = np.max([len(x) for x in xs]) 52 | 53 | padded_xs = [] 54 | for x in xs: 55 | if x.shape[0] >= maxlen: 56 | padded_xs.append(x); continue 57 | 58 | padding = np.ones((maxlen - x.shape[0],) + x.shape[1:]) * value 59 | x_padded = np.concatenate([x, padding], axis=0) 60 | assert x_padded.shape[1:] == x.shape[1:] 61 | assert x_padded.shape[0] == maxlen 62 | padded_xs.append(x_padded) 63 | return np.array(padded_xs) 64 | 65 | 66 | # Load all data.
67 | def load_data(dir, key='test/success_rate', filename='progress.csv'): 68 | data = [] 69 | # find all */progress.csv under dir 70 | paths = [os.path.abspath(os.path.join(path, '..')) for path in glob2.glob(os.path.join(dir, '**', filename))] 71 | for curr_path in paths: 72 | if not os.path.isdir(curr_path): 73 | continue 74 | results = load_results(os.path.join(curr_path, filename)) 75 | if not results: 76 | print('skipping {}'.format(curr_path)) 77 | continue 78 | print('loading {} ({})'.format(curr_path, len(results['epoch']))) 79 | 80 | success_rate = np.array(results[key])[:50] 81 | epoch = np.array(results['epoch'])[:50] + 1 82 | 83 | # Process and smooth data. 84 | assert success_rate.shape == epoch.shape 85 | x = epoch 86 | y = success_rate 87 | if smooth: 88 | x, y = smooth_reward_curve(epoch, success_rate) 89 | assert x.shape == y.shape 90 | data.append((x, y)) 91 | return data 92 | 93 | def load_datas(dirs, key='test/success_rate', filename='progress.csv'): 94 | datas = [] 95 | for dir in dirs: 96 | data = load_data(dir, key, filename) 97 | datas.append(data) 98 | return datas 99 | 100 | # Plot datas 101 | def plot_datas(datas, labels, info, fontsize=15, i=0, j=0): 102 | title, xlabel, ylabel = info 103 | for data, label in zip(datas, labels): 104 | try: 105 | xs, ys = zip(*data) 106 | except: 107 | import pdb; pdb.set_trace() 108 | xs, ys = pad(xs), pad(ys) 109 | assert xs.shape == ys.shape 110 | 111 | plt.plot(xs[0], np.nanmedian(ys, axis=0), label=label) 112 | plt.fill_between(xs[0], np.nanpercentile(ys, 25, axis=0), np.nanpercentile(ys, 75, axis=0), alpha=0.25) 113 | plt.title(title, fontsize=fontsize) 114 | plt.xlabel(xlabel, fontsize=fontsize) 115 | plt.ylabel(ylabel, fontsize=fontsize) 116 | plt.legend(fontsize=fontsize-3, loc=4, bbox_to_anchor=(0.5, 0.06, 0.5, 0.5)) 117 | plt.xticks(fontsize=fontsize-3) 118 | plt.yticks(fontsize=fontsize-4) 119 | 120 | def plot_main(dirs, labels, info, key='test/success_rate', filename='progress.csv', save_dir='./test.png'): 121 | plt.figure(dpi=300, figsize=(5,4)) 122 | datas = load_datas(dirs, key, filename) 123 | 124 | plot_datas(datas, labels, info) 125 | plt.subplots_adjust(left=0.14, right=0.98, bottom=0.15, top=0.92, hspace=0.3, wspace=0.15) 126 | plt.savefig(save_dir) 127 | 128 | 129 | if __name__ == '__main__': 130 | data_dirs = ['', ''] 131 | save_dir = '' 132 | legend = ['HER', 'CHER'] 133 | infos = ['title', 'Epoch', 'Median success rate'] 134 | plot_main(data_dirs, legend, infos, key='test/mean_Q', save_dir=save_dir) -------------------------------------------------------------------------------- /mher/rollouts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YangRui2015/Modular_HER/77acca83d6849d140ab893ec1b472b71e1da08d4/mher/rollouts/__init__.py -------------------------------------------------------------------------------- /mher/run.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import multiprocessing 3 | import os 4 | import os.path as osp 5 | import re 6 | import sys 7 | 8 | import gym 9 | import numpy as np 10 | import tensorflow as tf 11 | 12 | from mher import config 13 | from mher.rollouts.rollout import RolloutWorker 14 | from mher.common import logger, set_global_seeds, tf_util 15 | from mher.common.cmd_util import preprocess_kwargs 16 | from mher.common.import_util import get_alg_module 17 | from mher.common.init_utils import init_environment_import, init_mpi_import 18 | 
from mher.common.logger import configure_logger 19 | from mher.envs.make_env_utils import build_env 20 | from mher.play import play 21 | from mher.train import train 22 | 23 | MPI = init_mpi_import() 24 | _game_envs = init_environment_import() 25 | 26 | def prepare(args): 27 | ## make save dir 28 | if args.save_path: 29 | os.makedirs(os.path.expanduser(args.save_path), exist_ok=True) 30 | # configure logger, disable logging in child MPI processes (with rank > 0) 31 | if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: 32 | configure_logger(args.log_path) 33 | else: 34 | configure_logger(args.log_path, format_strs=[]) 35 | # Seed everything. 36 | rank = MPI.COMM_WORLD.Get_rank() 37 | rank_seed = args.seed + 1000000 * rank if args.seed is not None else None 38 | set_global_seeds(rank_seed) 39 | return rank 40 | 41 | def main(args): 42 | # process argprase and parameters 43 | args, extra_args = preprocess_kwargs(args) 44 | rank = prepare(args) 45 | env, tmp_env = build_env(args, _game_envs) 46 | params = config.process_params(env, tmp_env, rank, args, extra_args) 47 | dims = config.configure_dims(tmp_env, params) 48 | 49 | # define objects 50 | sampler = config.configure_sampler(dims, params) 51 | buffer = config.configure_buffer(dims, params, sampler) 52 | policy = config.configure_algorithm(dims=dims, params=params, buffer=buffer) 53 | rollout_params, eval_params = config.configure_rollout(params) 54 | 55 | if args.load_path is not None: 56 | tf_util.load_variables(args.load_path) 57 | 58 | rollout_worker = RolloutWorker(env, policy, dims, logger, monitor=True, **rollout_params) 59 | evaluator = RolloutWorker(env, policy, dims, logger, **eval_params) 60 | 61 | n_epochs = config.configure_epoch(args.num_epoch, params) 62 | policy = train( 63 | policy=policy, 64 | rollout_worker=rollout_worker, 65 | save_path=args.save_path, 66 | evaluator=evaluator, 67 | n_epochs=n_epochs, 68 | n_test_rollouts=params['n_test_rollouts'], 69 | n_cycles=params['n_cycles'], 70 | n_batches=params['n_batches'], 71 | policy_save_interval=params['policy_save_interval'], 72 | random_init=params['random_init'] 73 | ) 74 | 75 | if args.play_episodes or args.play_no_training: 76 | play(policy, env, episodes=args.play_episodes) 77 | env.close() 78 | 79 | 80 | if __name__ == '__main__': 81 | main(sys.argv) 82 | -------------------------------------------------------------------------------- /mher/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from mher.samplers.sampler import RandomSampler 2 | from mher.samplers.her_sampler import HER_Sampler 3 | from mher.samplers.nstep_sampler import Nstep_Sampler, Nstep_HER_Sampler 4 | from mher.samplers.prioritized_sampler import PrioritizedSampler, PrioritizedHERSampler 5 | 6 | 7 | -------------------------------------------------------------------------------- /mher/samplers/her_sampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mher.samplers.sampler import RelabelSampler 3 | 4 | 5 | class HER_Sampler(RelabelSampler): 6 | valid_strategy = ['future', 'last', 'random', 'episode', 'cut'] 7 | def __init__(self, T, reward_fun, batch_size, relabel_p, strategy, *args): 8 | super(HER_Sampler, self).__init__(T, reward_fun, batch_size, relabel_p) 9 | self.strategy = strategy 10 | self.cur_L = 1 11 | self.inc_L = T / 500 12 | 13 | def _get_relabel_ag(self, episode_batch, episode_idxs, t_samples, num_episodes): 14 | relabel_indexes = self._relabel_idxs() 15 | if 
self.strategy == 'future' or self.strategy not in self.valid_strategy: 16 | future_offset = (np.random.uniform(size=self.batch_size) * (self.T - t_samples)).astype(int) 17 | future_t = (t_samples + 1 + future_offset)[relabel_indexes] 18 | future_ag = episode_batch['ag'][episode_idxs[relabel_indexes], future_t] 19 | elif self.strategy == 'last': 20 | future_ag = episode_batch['ag'][episode_idxs[relabel_indexes], -1] 21 | elif self.strategy == 'episode': 22 | random_t_samples = np.random.randint(self.T, size=self.batch_size)[relabel_indexes] 23 | future_ag = episode_batch['ag'][episode_idxs[relabel_indexes], random_t_samples] 24 | elif self.strategy == 'cut': 25 | print(int(self.cur_L)) 26 | future_offset = (np.random.uniform(size=self.batch_size) * np.minimum(int(self.cur_L), (self.T - t_samples))).astype(int) 27 | future_t = (t_samples + 1 + future_offset)[relabel_indexes] 28 | future_ag = episode_batch['ag'][episode_idxs[relabel_indexes], future_t] 29 | self.cur_L += self.inc_L 30 | else: # self.strategy == 'random' 31 | random_episode_idxs = np.random.randint(0, num_episodes, self.batch_size)[relabel_indexes] 32 | random_t_samples = np.random.randint(self.T, size=self.batch_size)[relabel_indexes] 33 | future_ag = episode_batch['ag'][random_episode_idxs, random_t_samples] 34 | return future_ag, relabel_indexes 35 | 36 | def sample(self, episode_batch): 37 | transitions, info = self._sample_transitions(episode_batch) 38 | relabel_ag, relabel_indexes = self._get_relabel_ag(episode_batch, info['episode_idxs'], info['t_samples'], info['num_episodes']) 39 | transitions = self.relabel_transition(transitions, relabel_indexes, relabel_ag) 40 | transitions = self.reshape_transitions(transitions) 41 | return transitions 42 | 43 | class ClipHER_Sampler(HER_Sampler): 44 | def __init__(self, T, reward_fun, batch_size, relabel_p, num_epoch=200, *args): 45 | super(ClipHER_Sampler, self).__init__(T, reward_fun, batch_size, relabel_p, 'future', *args) 46 | self.cur_L = 1 47 | self.inc_L = T / num_epoch 48 | 49 | def _get_relabel_ag(self, episode_batch, episode_idxs, t_samples, num_episodes): 50 | relabel_indexes = self._relabel_idxs() 51 | future_offset = (np.random.uniform(size=self.batch_size) * np.minimum(int(self.cur_L), (self.T - t_samples))).astype(int) 52 | future_t = (t_samples + 1 + future_offset)[relabel_indexes] 53 | future_ag = episode_batch['ag'][episode_idxs[relabel_indexes], future_t] 54 | return future_ag, relabel_indexes 55 | 56 | def sample(self, episode_batch): 57 | transitions, info = self._sample_transitions(episode_batch) 58 | relabel_ag, relabel_indexes = self._get_relabel_ag(episode_batch, info['episode_idxs'], info['t_samples'], info['num_episodes']) 59 | transitions = self.relabel_transition(transitions, relabel_indexes, relabel_ag) 60 | transitions = self.reshape_transitions(transitions) 61 | self.cur_L += self.inc_L 62 | return transitions 63 | 64 | -------------------------------------------------------------------------------- /mher/samplers/nstep_sampler.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | 4 | from mher.samplers.sampler import RelabelSampler 5 | from mher.samplers.her_sampler import HER_Sampler 6 | 7 | 8 | class Nstep_Sampler(RelabelSampler): 9 | def __init__(self, T, reward_fun, batch_size, replay_p, nstep, gamma, *args): 10 | super(Nstep_Sampler, self).__init__(T, reward_fun, batch_size, replay_p, *args) 11 | self.nstep = nstep 12 | self.gamma = gamma 13 | 14 | def 
_sample_nstep_transitions(self, episode_batch): 15 | transitions, info = self._sample_transitions(episode_batch) 16 | episode_idxs, t_samples = info['episode_idxs'], info['t_samples'] 17 | transitions['r'] = self.recompute_reward(transitions) 18 | transition_lis = [transitions] 19 | nstep_masks = [np.ones(self.batch_size)] 20 | for i in range(1, self.nstep): 21 | t_samples_i = t_samples + i 22 | out_range_idxs = np.where(t_samples_i > self.T-1) 23 | t_samples_i[out_range_idxs] = self.T - 1 24 | transitions = self._get_transitions(episode_batch, episode_idxs, t_samples_i) 25 | transition_lis.append(transitions) 26 | mask = np.ones(self.batch_size) * pow(self.gamma, i) 27 | mask[out_range_idxs] = 0 28 | nstep_masks.append(mask) 29 | return transition_lis, nstep_masks, info 30 | 31 | def _recompute_nstep_reward(self, transition_lis): 32 | for i in range(len(transition_lis)): 33 | transition_lis[i]['r'] = self.recompute_reward(transition_lis[i]) 34 | return transition_lis 35 | 36 | # process to get final transitions 37 | def _get_out_transitions(self, transition_lis, nstep_masks): 38 | out_transitions = copy.deepcopy(transition_lis[0]) 39 | final_gamma = np.ones(self.batch_size) * pow(self.gamma, self.nstep) # gamma 40 | for i in range(1, self.nstep): 41 | out_transitions['r'] += nstep_masks[i] * transition_lis[i]['r'] 42 | final_gamma[np.where((nstep_masks[i] == 0) & (final_gamma == pow(self.gamma, self.nstep)))] = pow(self.gamma, i) 43 | out_transitions['o_2'] = transition_lis[-1]['o_2'].copy() 44 | out_transitions['gamma'] = final_gamma.copy() 45 | return out_transitions 46 | 47 | def sample(self, episode_batch): 48 | transition_lis, nstep_masks, _ = self._sample_nstep_transitions(episode_batch) 49 | transition_lis = self._recompute_nstep_reward(transition_lis) 50 | out_transitions = self._get_out_transitions(transition_lis, nstep_masks) 51 | self.reshape_transitions(out_transitions) 52 | return out_transitions 53 | 54 | 55 | class Nstep_HER_Sampler(Nstep_Sampler, HER_Sampler): 56 | def __init__(self, T, reward_fun, batch_size, relabel_p, nstep, gamma, strategy): 57 | super().__init__(T, reward_fun, batch_size, relabel_p, nstep, gamma, strategy) 58 | 59 | def relabel_nstep_transitions(self, episode_batch, transition_lis, info): 60 | relabel_ag, relabel_indexes = self._get_relabel_ag(episode_batch, info['episode_idxs'], info['t_samples'], info['num_episodes']) 61 | for i in range(len(transition_lis)): 62 | transitions = transition_lis[i] 63 | transitions = self.relabel_transition(transitions, relabel_indexes, relabel_ag) 64 | transition_lis[i] = transitions 65 | return transition_lis 66 | 67 | def sample(self, episode_batch): 68 | transition_lis, nstep_masks, info = self._sample_nstep_transitions(episode_batch) 69 | transition_lis = self.relabel_nstep_transitions(episode_batch, transition_lis, info) 70 | out_transitions = self._get_out_transitions(transition_lis, nstep_masks) 71 | out_transitions = self.reshape_transitions(out_transitions) 72 | return out_transitions 73 | -------------------------------------------------------------------------------- /mher/samplers/prioritized_sampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from mher.samplers.sampler import Sampler 3 | from mher.samplers.her_sampler import HER_Sampler 4 | from mher.common.segment_tree import SumSegmentTree, MinSegmentTree 5 | 6 | 7 | class PrioritizedSampler(Sampler): 8 | def __init__(self, T, reward_fun, batch_size, size_in_transitions, alpha, beta, 
eps, *args): 9 | '''beta: float To what degree to use importance weights 10 | (0 - no corrections, 1 - full correction)''' 11 | super(PrioritizedSampler, self).__init__(T, reward_fun, batch_size, *args) 12 | assert alpha >= 0 and beta >= 0 13 | self.alpha = alpha 14 | self.beta = beta 15 | self.eps = eps 16 | 17 | capacity = 1 18 | while capacity < size_in_transitions: 19 | capacity *= 2 20 | self.sum_tree = SumSegmentTree(capacity) 21 | self.min_tree = MinSegmentTree(capacity) 22 | self.capacity = size_in_transitions 23 | self._max_priority = 1.0 24 | self.n_transitions_stored = 0 25 | 26 | def update_new_priorities(self, episode_idxs): 27 | N = len(episode_idxs) * self.T 28 | priority_array = np.zeros(N) + self._max_priority 29 | episode_idxs_repeat = (episode_idxs * self.T).repeat(self.T) + np.arange(self.T) 30 | self.update_priorities(episode_idxs_repeat, priority_array) 31 | self.n_transitions_stored += len(episode_idxs) * self.T 32 | self.n_transitions_stored = min(self.n_transitions_stored, self.capacity) 33 | 34 | def update_priorities(self, idxes, priorities): 35 | """Update priorities of sampled transitions""" 36 | assert len(idxes) == len(priorities) and np.all(priorities >= 0) 37 | priorities += self.eps # avoid zero 38 | new_priority = np.power(priorities.flatten(), self.alpha) 39 | self.sum_tree.set_items(idxes, new_priority) 40 | self.min_tree.set_items(idxes, new_priority) 41 | self._max_priority = max(np.max(priorities), self._max_priority) 42 | 43 | def _sample_idxes(self): 44 | culm_sums = np.random.random(size=self.batch_size) * self.sum_tree.sum() 45 | idxes = np.zeros(self.batch_size) 46 | for i in range(self.batch_size): 47 | idxes[i] = self.sum_tree.find_prefixsum_idx(culm_sums[i]) 48 | episode_idxs = idxes // self.T 49 | t_samples = idxes % self.T 50 | return episode_idxs.astype(np.int), t_samples.astype(np.int), idxes.astype(np.int) 51 | 52 | def priority_sample(self, episode_batch): 53 | episode_idxs, t_samples, idxes = self._sample_idxes() 54 | p_min = self.min_tree.min() / self.sum_tree.sum() 55 | transitions = self._get_transitions(episode_batch, episode_idxs, t_samples) 56 | p_samples = self.sum_tree.get_items(idxes) / self.sum_tree.sum() 57 | weights = np.power(p_samples / p_min, - self.beta) 58 | transitions['w'] = weights 59 | info = { 60 | 'episode_idxs': episode_idxs, 61 | 't_samples': t_samples, 62 | 'idxes': idxes, 63 | 'num_episodes': episode_batch['u'].shape[0] 64 | } 65 | return transitions, info 66 | 67 | def sample(self, episode_batch): 68 | transitions, info = self.priority_sample(episode_batch) 69 | transitions['r'] = self.recompute_reward(transitions) 70 | transitions = self.reshape_transitions(transitions) 71 | return (transitions, info['idxes']) 72 | 73 | 74 | class PrioritizedHERSampler(PrioritizedSampler, HER_Sampler): 75 | '''not good with relabeling after prioritized sampling''' 76 | def __init__(self, T, reward_fun, batch_size, size_in_transitions, alpha, beta, eps, relabel_p, strategy): 77 | super().__init__(T, reward_fun, batch_size, size_in_transitions, alpha, beta, eps, relabel_p, strategy) 78 | 79 | def sample(self, episode_batch): 80 | transitions, info = self.priority_sample(episode_batch) 81 | relabel_ag, relabel_indexes = self._get_relabel_ag(episode_batch, info['episode_idxs'], info['t_samples'], info['num_episodes']) 82 | transitions = self.relabel_transition(transitions, relabel_indexes, relabel_ag) 83 | transitions = self.reshape_transitions(transitions) 84 | return (transitions, info['idxes']) 85 | 86 | 87 | 88 | 89 | 90 | 
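The PrioritizedSampler above is a proportional prioritized-experience-replay sampler over the buffer's (episode, timestep) layout: newly stored transitions receive the current maximum priority, samples are drawn with probability proportional to priority^alpha through the sum tree, and the returned 'w' entry carries the importance weights (p / p_min)^(-beta) that the learning update is presumably expected to apply. The following minimal sketch is not part of the repository: it exercises that loop with a toy sparse reward function, synthetic episodes and made-up TD errors (toy_reward_fun, the dimensions and td_errors are all illustrative assumptions), and it initialises priorities one episode at a time.

import numpy as np
from mher.samplers.prioritized_sampler import PrioritizedSampler

T, num_episodes, batch_size = 10, 4, 16
dim_o, dim_g, dim_u = 6, 3, 2

def toy_reward_fun(ag_2, g, info):
    # sparse goal-reaching reward in the style of the Fetch tasks (assumption)
    return -(np.linalg.norm(ag_2 - g, axis=-1) > 0.05).astype(np.float32)

sampler = PrioritizedSampler(T, toy_reward_fun, batch_size,
                             size_in_transitions=num_episodes * T,
                             alpha=0.6, beta=0.4, eps=1e-5)

# synthetic episodes laid out like the replay buffer: 'o' and 'ag' carry T+1 steps
episode_batch = {
    'o':  np.random.randn(num_episodes, T + 1, dim_o),
    'ag': np.random.randn(num_episodes, T + 1, dim_g),
    'g':  np.random.randn(num_episodes, T, dim_g),
    'u':  np.random.randn(num_episodes, T, dim_u),
}
episode_batch['o_2'] = episode_batch['o'][:, 1:, :]
episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :]

# newly stored transitions start at the current maximum priority
for idx in range(num_episodes):
    sampler.update_new_priorities(np.array([idx]))

# proportional sampling; transitions['w'] holds the importance weights
transitions, idxes = sampler.sample(episode_batch)
print(transitions['r'].shape, transitions['w'].min(), transitions['w'].max())

# after a training step, feed |TD error| back as the new priorities
td_errors = np.random.rand(batch_size)
sampler.update_priorities(idxes, np.abs(td_errors))

In the full pipeline these calls would be issued by the replay buffer and the training loop rather than by hand; the sketch only makes the store / sample / reprioritise cycle visible in isolation.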
-------------------------------------------------------------------------------- /mher/samplers/sampler.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Sampler: 5 | def __init__(self, T, reward_fun, batch_size): 6 | self.T = T 7 | self.reward_fun = reward_fun 8 | self.batch_size = batch_size 9 | 10 | def _get_transitions(self, episode_batch, episode_idxs, t_samples): 11 | return {key: episode_batch[key][episode_idxs, t_samples].copy() 12 | for key in episode_batch.keys()} 13 | 14 | def _sample_transitions(self, episode_batch): 15 | num_episodes = episode_batch['u'].shape[0] 16 | episode_idxs = np.random.randint(0, num_episodes, self.batch_size) 17 | t_samples = np.random.randint(self.T, size=self.batch_size) 18 | transitions = self._get_transitions(episode_batch, episode_idxs, t_samples) 19 | info = { 20 | 'num_episodes': num_episodes, 21 | 'episode_idxs':episode_idxs, 22 | 't_samples':t_samples 23 | } 24 | return transitions, info 25 | 26 | def recompute_reward(self, transitions): 27 | # Reconstruct info dictionary for reward computation. 28 | info = {} 29 | for key, value in transitions.items(): 30 | if key.startswith('info_'): 31 | info[key.replace('info_', '')] = value 32 | # Re-compute reward since we may have substituted the goal. 33 | reward_params = {k: transitions[k] for k in ['ag_2', 'g']} 34 | reward_params['info'] = info 35 | return self.reward_fun(**reward_params) 36 | 37 | def reshape_transitions(self, transitions): 38 | transitions = {k: transitions[k].reshape(self.batch_size, *transitions[k].shape[1:]) 39 | for k in transitions.keys()} 40 | assert(transitions['u'].shape[0] == self.batch_size) 41 | return transitions 42 | 43 | def sample(self, episode_batch): 44 | pass 45 | 46 | class RandomSampler(Sampler): 47 | def sample(self, episode_batch): 48 | transitions, _ = self._sample_transitions(episode_batch) 49 | transitions['r'] = self.recompute_reward(transitions) 50 | transitions = self.reshape_transitions(transitions) 51 | return transitions 52 | 53 | class RelabelSampler(Sampler): 54 | def __init__(self, T, reward_fun, batch_size, relabel_p): 55 | '''relabel_p defines the probability for relabeling''' 56 | super(RelabelSampler, self).__init__(T, reward_fun, batch_size) 57 | self.relabel_p = relabel_p 58 | 59 | def _relabel_idxs(self): 60 | return (np.random.uniform(size=self.batch_size) < self.relabel_p) 61 | 62 | def relabel_transition(self, transitions, relabel_indexes, relabel_ag): 63 | assert relabel_indexes.sum() == len(relabel_ag) 64 | transitions['g'][relabel_indexes] = relabel_ag 65 | transitions['r'] = self.recompute_reward(transitions) 66 | return transitions 67 | -------------------------------------------------------------------------------- /mher/train.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | 5 | import click 6 | import numpy as np 7 | from mpi4py import MPI 8 | 9 | import mher.config as config 10 | from mher.common import logger 11 | from mher.common.mpi_moments import mpi_moments 12 | from mher.rollouts.rollout import RolloutWorker 13 | 14 | 15 | def mpi_average(value): 16 | if not isinstance(value, list): 17 | value = [value] 18 | if not any(value): 19 | value = [0.] 
20 | return mpi_moments(np.array(value))[0] 21 | 22 | 23 | def train(*, policy, rollout_worker, evaluator, n_epochs, n_test_rollouts, n_cycles, 24 | n_batches, policy_save_interval, save_path, random_init, **kwargs): 25 | rank = MPI.COMM_WORLD.Get_rank() 26 | if save_path: 27 | latest_policy_path = os.path.join(save_path, 'policy_latest.pkl') 28 | best_policy_path = os.path.join(save_path, 'policy_best.pkl') 29 | periodic_policy_path = os.path.join(save_path, 'policy_{}.pkl') 30 | 31 | # random_init buffer and o/g/u stat 32 | if random_init: 33 | logger.info('Random initializing ...') 34 | rollout_worker.clear_history() 35 | for epi in range(int(random_init) // rollout_worker.rollout_batch_size): 36 | episode = rollout_worker.generate_rollouts(random_ac=True) 37 | policy.store_episode(episode) 38 | if policy.use_dynamic_nstep and policy.n_step > 1: 39 | policy.update_dynamic_model(init=True) 40 | 41 | best_success_rate = -1 42 | logger.info('Start training...') 43 | # num_timesteps = n_epochs * n_cycles * rollout_length * number of rollout workers 44 | for epoch in range(n_epochs): 45 | time_start = time.time() 46 | # train 47 | rollout_worker.clear_history() 48 | for i in range(n_cycles): 49 | policy.dynamic_batch = False 50 | episode = rollout_worker.generate_rollouts() 51 | policy.store_episode(episode) 52 | for j in range(n_batches): 53 | policy.train() 54 | policy.update_target_net() 55 | 56 | # test 57 | evaluator.clear_history() 58 | for _ in range(n_test_rollouts): 59 | evaluator.generate_rollouts() 60 | 61 | # record logs 62 | time_end = time.time() 63 | logger.record_tabular('epoch', epoch) 64 | logger.record_tabular('epoch time(min)', (time_end - time_start)/60) 65 | for key, val in evaluator.logs('test'): 66 | logger.record_tabular(key, mpi_average(val)) 67 | for key, val in rollout_worker.logs('train'): 68 | logger.record_tabular(key, mpi_average(val)) 69 | for key, val in policy.logs_stats(): 70 | logger.record_tabular(key, mpi_average(val)) 71 | 72 | if rank == 0: 73 | logger.dump_tabular() 74 | 75 | # save the policy if it's better than the previous ones 76 | success_rate = mpi_average(evaluator.current_success_rate()) 77 | if rank == 0 and success_rate > best_success_rate and save_path: 78 | best_success_rate = success_rate 79 | logger.info('New best success rate: {}. 
Saving policy to {} ...'.format(best_success_rate, best_policy_path)) 80 | policy.save(best_policy_path) 81 | policy.save(latest_policy_path) 82 | if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_path: 83 | policy_path = periodic_policy_path.format(epoch) 84 | logger.info('Saving periodic policy to {} ...'.format(policy_path)) 85 | policy.save(policy_path) 86 | 87 | # make sure that different threads have different seeds 88 | local_uniform = np.random.uniform(size=(1,)) 89 | root_uniform = local_uniform.copy() 90 | MPI.COMM_WORLD.Bcast(root_uniform, root=0) 91 | if rank != 0: 92 | assert local_uniform[0] != root_uniform[0] 93 | 94 | if rank == 0 and save_path: 95 | policy_path = periodic_policy_path.format(epoch) 96 | logger.info('Saving final policy to {} ...'.format(policy_path)) 97 | policy.save(policy_path) 98 | 99 | return policy 100 | 101 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import re 2 | from setuptools import setup, find_packages 3 | import sys 4 | 5 | if sys.version_info.major != 3: 6 | print('This Python is only compatible with Python 3, but you are running ' 7 | 'Python {}. The installation will likely fail.'.format(sys.version_info.major)) 8 | 9 | 10 | extras = { 11 | 'test': [ 12 | 'filelock', 13 | 'pytest', 14 | 'pytest-forked', 15 | 'atari-py', 16 | 'matplotlib', 17 | 'pandas' 18 | ], 19 | 'mpi': [ 20 | 'mpi4py' 21 | ] 22 | } 23 | 24 | all_deps = [] 25 | for group_name in extras: 26 | all_deps += extras[group_name] 27 | 28 | extras['all'] = all_deps 29 | 30 | setup(name='mher', 31 | # packages=[package for package in find_packages() 32 | # if package.startswith('baselines')], 33 | 34 | packages = find_packages(), 35 | install_requires=[ 36 | 'gym>=0.15.4, <0.16.0', 37 | 'scipy', 38 | 'tqdm', 39 | 'joblib', 40 | 'cloudpickle', 41 | 'click', 42 | 'opencv-python' 43 | ], 44 | extras_require=extras, 45 | description='Modular HER: based on OpenAI baselines', 46 | author='RuiYang', 47 | url='https://github.com/YangRui2015/Modular_HER', 48 | author_email='yangrui19@mails.tsinghua.edu.cn', 49 | version='1.0') 50 | 51 | 52 | # ensure there is some tensorflow build with version above 1.4 53 | import pkg_resources 54 | tf_pkg = None 55 | for tf_pkg_name in ['tensorflow', 'tensorflow-gpu', 'tf-nightly', 'tf-nightly-gpu']: 56 | try: 57 | tf_pkg = pkg_resources.get_distribution(tf_pkg_name) 58 | except pkg_resources.DistributionNotFound: 59 | pass 60 | assert tf_pkg is not None, 'TensorFlow needed, of version above 1.4' 61 | from distutils.version import LooseVersion 62 | assert LooseVersion(re.sub(r'-?rc\d+$', '', tf_pkg.version)) >= LooseVersion('1.4.0') 63 | --------------------------------------------------------------------------------
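As a closing illustration of the sampler interface, here is a hypothetical stand-alone use of the 'future' relabelling strategy from mher/samplers/her_sampler.py. Everything below is a sketch built on assumptions: the reward function, array dimensions and data are synthetic, and relabel_p=0.8 simply mirrors the default in mher/default_cfg.py.

import numpy as np
from mher.samplers.her_sampler import HER_Sampler

T, num_episodes, batch_size = 10, 4, 16
dim_o, dim_g, dim_u = 6, 3, 2

def toy_reward_fun(ag_2, g, info):
    # sparse reward: 0 when the achieved goal matches the (possibly relabelled) goal
    return -(np.linalg.norm(ag_2 - g, axis=-1) > 0.05).astype(np.float32)

# episodes laid out as in the replay buffer: 'o' and 'ag' carry T+1 steps,
# so a "future" achieved goal always exists for relabelling
episode_batch = {
    'o':  np.random.randn(num_episodes, T + 1, dim_o),
    'ag': np.random.randn(num_episodes, T + 1, dim_g),
    'g':  np.random.randn(num_episodes, T, dim_g),
    'u':  np.random.randn(num_episodes, T, dim_u),
}
episode_batch['o_2'] = episode_batch['o'][:, 1:, :]
episode_batch['ag_2'] = episode_batch['ag'][:, 1:, :]

# relabel roughly 80% of the sampled transitions with a future achieved goal
sampler = HER_Sampler(T, toy_reward_fun, batch_size, 0.8, 'future')
transitions = sampler.sample(episode_batch)
print(transitions['g'].shape, transitions['r'].mean())

Each sampled transition keeps its original goal with probability 0.2 and otherwise has it replaced by an achieved goal from a later step of the same episode, after which the reward is recomputed against the new goal.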