├── LICENSE ├── README.md ├── algos ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── base.cpython-35.pyc │ ├── diayn.cpython-35.pyc │ └── sac.cpython-35.pyc ├── base.py └── sac.py ├── core ├── __pycache__ │ ├── __init__.cpython-35.pyc │ └── serializable.cpython-35.pyc └── serializable.py ├── distributions ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── gmm.cpython-35.pyc │ ├── normal.cpython-35.pyc │ └── real_nvp_bijector.cpython-35.pyc ├── gmm.py ├── normal.py └── real_nvp_bijector.py ├── environments ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── delayed_env.cpython-35.pyc │ ├── gym_env.cpython-35.pyc │ ├── multigoal.cpython-35.pyc │ └── pusher.cpython-35.pyc ├── delayed_env.py ├── gym_env.py ├── multigoal.py └── pusher.py ├── envs ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── cheetah_hurdle_env.cpython-35.pyc │ ├── cross_maze_ant_env.cpython-35.pyc │ ├── gym_env.cpython-35.pyc │ ├── helpers.cpython-35.pyc │ ├── hierarchy_proxy_env.cpython-35.pyc │ ├── multi_direction_env.cpython-35.pyc │ ├── multigoal.cpython-35.pyc │ ├── pusher.cpython-35.pyc │ ├── random_goal_ant_env.cpython-35.pyc │ └── simple_maze_ant_env.cpython-35.pyc ├── cheetah_hurdle_env.py ├── cross_maze_ant_env.py ├── delayed_env.py ├── gym_env.py ├── helpers.py ├── hierarchy_proxy_env.py ├── meta_env.py ├── multi_direction_env.py ├── multigoal.py ├── pusher.py ├── random_goal_ant_env.py └── simple_maze_ant_env.py ├── misc ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── instrument.cpython-35.pyc │ ├── mlp.cpython-35.pyc │ ├── plotter.cpython-35.pyc │ ├── sampler.cpython-35.pyc │ ├── tf_utils.cpython-35.pyc │ └── utils.cpython-35.pyc ├── instrument.py ├── mlp.py ├── plotter.py ├── remote_sampler.py ├── replay_pool.py ├── sampler.py ├── tf_utils.py └── utils.py ├── mujoco_am_sac.py ├── mujoco_models ├── cross_maze_ant.xml ├── half_cheetah_hurdle.xml ├── pusher_2d.xml └── simple_maze_ant.xml ├── policies ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── base.cpython-35.pyc │ ├── gaussian_policy.cpython-35.pyc │ ├── gmm.cpython-35.pyc │ ├── hierarchical_policy.cpython-35.pyc │ ├── latent_space_policy.cpython-35.pyc │ ├── nn_policy.cpython-35.pyc │ ├── nn_policy2.cpython-35.pyc │ ├── pointer_policy.cpython-35.pyc │ └── uniform_policy.cpython-35.pyc ├── base.py ├── gaussian_policy.py ├── nn_policy.py ├── nn_policy2.py ├── pointer_policy.py └── uniform_policy.py ├── preprocessors ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ └── mlp_preprocessor.cpython-35.pyc └── mlp_preprocessor.py ├── primitive-policies ├── ant │ ├── bwrd │ │ └── bwrd.pkl │ ├── dwrd │ │ └── dwrd.pkl │ ├── fwrd │ │ └── fwrd.pkl │ └── uwrd │ │ └── uwrd.pkl ├── hc │ ├── fwd │ │ └── fwd.pkl │ └── jp-longz │ │ └── jump.pkl └── pusher │ ├── bottom │ └── bottom.pkl │ └── left │ └── left.pkl ├── replay_buffers ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── replay_buffer.cpython-35.pyc │ └── simple_replay_buffer.cpython-35.pyc ├── replay_buffer.py └── simple_replay_buffer.py ├── sandbox ├── __pycache__ │ └── __init__.cpython-35.pyc └── rocky │ ├── __pycache__ │ └── __init__.cpython-35.pyc │ └── tf │ ├── __pycache__ │ └── __init__.cpython-35.pyc │ ├── algos │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── batch_polopt.cpython-35.pyc │ │ ├── npo.cpython-35.pyc │ │ └── trpo.cpython-35.pyc │ ├── batch_polopt.py │ ├── npg.py │ ├── npo.py │ ├── trpo.py │ └── vpg.py │ ├── core │ ├── __init__.py │ ├── __pycache__ 
│ │ ├── __init__.cpython-35.pyc │ │ ├── layers.cpython-35.pyc │ │ ├── layers_powered.cpython-35.pyc │ │ ├── network.cpython-35.pyc │ │ └── parameterized.cpython-35.pyc │ ├── layers.py │ ├── layers_powered.py │ ├── network.py │ └── parameterized.py │ ├── distributions │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── base.cpython-35.pyc │ │ └── diagonal_gaussian.cpython-35.pyc │ ├── base.py │ ├── bernoulli.py │ ├── categorical.py │ ├── diagonal_gaussian.py │ ├── recurrent_categorical.py │ └── recurrent_diagonal_gaussian.py │ ├── envs │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── base.cpython-35.pyc │ │ ├── parallel_vec_env_executor.cpython-35.pyc │ │ └── vec_env_executor.cpython-35.pyc │ ├── base.py │ ├── parallel_vec_env_executor.py │ └── vec_env_executor.py │ ├── launchers │ ├── __init__.py │ ├── trpo_cartpole.py │ ├── trpo_cartpole_recurrent.py │ └── vpg_cartpole.py │ ├── misc │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ └── tensor_utils.cpython-35.pyc │ └── tensor_utils.py │ ├── optimizers │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── conjugate_gradient_optimizer.cpython-35.pyc │ │ └── penalty_lbfgs_optimizer.cpython-35.pyc │ ├── conjugate_gradient_optimizer.py │ ├── first_order_optimizer.py │ ├── lbfgs_optimizer.py │ └── penalty_lbfgs_optimizer.py │ ├── policies │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── base.cpython-35.pyc │ │ ├── gaussian_mlp_inverse_policy.cpython-35.pyc │ │ └── gaussian_mlp_policy.cpython-35.pyc │ ├── base.py │ ├── categorical_conv_policy.py │ ├── categorical_gru_policy.py │ ├── categorical_lstm_policy.py │ ├── categorical_mlp_policy.py │ ├── deterministic_mlp_policy.py │ ├── gaussian_gru_policy.py │ ├── gaussian_lstm_policy.py │ ├── gaussian_mlp_inverse_policy.py │ ├── gaussian_mlp_policy.py │ └── uniform_control_policy.py │ ├── q_functions │ ├── base.py │ └── continuous_mlp_q_function.py │ ├── regressors │ ├── __init__.py │ ├── bernoulli_mlp_regressor.py │ ├── categorical_mlp_regressor.py │ ├── deterministic_mlp_regressor.py │ └── gaussian_mlp_regressor.py │ ├── samplers │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── base.cpython-35.pyc │ │ ├── batch_sampler.cpython-35.pyc │ │ └── vectorized_sampler.cpython-35.pyc │ ├── batch_sampler.py │ └── vectorized_sampler.py │ └── spaces │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── box.cpython-35.pyc │ ├── discrete.cpython-35.pyc │ └── product.cpython-35.pyc │ ├── box.py │ ├── discrete.py │ └── product.py ├── sim_cpolicy.py ├── sim_policy.py └── value_functions ├── __init__.py ├── __pycache__ ├── __init__.cpython-35.pyc └── value_function.cpython-35.pyc └── value_function.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Ahmed Qureshi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # COMPOSING TASK-AGNOSTIC POLICIES WITH DEEP REINFORCEMENT LEARNING 2 | 3 | 4 | * Requirements: 5 | 1. Rllab 6 | 2. Tensorflow 7 | 3. mujoco 8 | 9 | 10 | ## To train composite model from scratch, run: 11 | 12 | 1. To simulate "ant-cross-maze", run: 13 | 14 | ```python mujoco_am_sac.py --log_dir="/path-to-crl-code-folder/composition_sac_code/ant-maze" --domain="ant-cross-maze"``` 15 | 16 | 2. To simulate "ant-random-goal", run: 17 | 18 | ```python mujoco_am_sac.py --log_dir="/path-to-crl-code-folder/composition_sac_code/ant-rgoal" --domain="ant-random-goal"``` 19 | 20 | 3. To simulate "cheetah-hurdle", run: 21 | 22 | ```python mujoco_am_sac.py --log_dir="/path-to-crl-code-folder/composition_sac_code/cheetah-hurdle" --domain="cheetah-hurdle"``` 23 | 24 | 4. To simulate "pusher", run: 25 | 26 | ```python mujoco_am_sac.py --log_dir="/path-to-crl-code-folder/composition_sac_code/pusher" --domain="pusher"``` 27 | 28 | 29 | 30 | 31 | 32 | ## References 33 | ``` 34 | @inproceedings{ 35 | qureshi2020composing, 36 | title={Composing Task-Agnostic Policies with Deep Reinforcement Learning}, 37 | author={Ahmed H. Qureshi and Jacob J. Johnson and Yuzhe Qin and Taylor Henderson and Byron Boots and Michael C. 
Yip}, 38 | booktitle={International Conference on Learning Representations}, 39 | year={2020}, 40 | url={https://openreview.net/forum?id=H1ezFREtwH} 41 | } 42 | ``` 43 | -------------------------------------------------------------------------------- /algos/__init__.py: -------------------------------------------------------------------------------- 1 | from .sac import SAC 2 | -------------------------------------------------------------------------------- /algos/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/algos/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /algos/__pycache__/base.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/algos/__pycache__/base.cpython-35.pyc -------------------------------------------------------------------------------- /algos/__pycache__/diayn.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/algos/__pycache__/diayn.cpython-35.pyc -------------------------------------------------------------------------------- /algos/__pycache__/sac.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/algos/__pycache__/sac.cpython-35.pyc -------------------------------------------------------------------------------- /core/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/core/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /core/__pycache__/serializable.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/core/__pycache__/serializable.cpython-35.pyc -------------------------------------------------------------------------------- /core/serializable.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | 3 | 4 | def deep_clone(obj): 5 | assert isinstance(obj, Serializable) 6 | 7 | def maybe_deep_clone(o): 8 | if isinstance(o, Serializable): 9 | return deep_clone(o) 10 | else: 11 | return o 12 | 13 | d = obj.__getstate__() 14 | for key, val in d.items(): 15 | d[key] = maybe_deep_clone(val) 16 | 17 | d['__args'] = list(d['__args']) # Make args mutable. 
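# Positional constructor args are cloned element-wise below, so nested
# Serializable objects are deep-copied rather than shared with the original.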
18 | for i, val in enumerate(d['__args']): 19 | d['__args'][i] = maybe_deep_clone(val) 20 | 21 | for key, val in d['__kwargs'].items(): 22 | d['__kwargs'][key] = maybe_deep_clone(val) 23 | 24 | out = type(obj).__new__(type(obj)) 25 | # noinspection PyArgumentList 26 | out.__setstate__(d) 27 | 28 | return out 29 | -------------------------------------------------------------------------------- /distributions/__init__.py: -------------------------------------------------------------------------------- 1 | from .normal import Normal 2 | from .gmm import GMM 3 | from .real_nvp_bijector import RealNVPBijector 4 | -------------------------------------------------------------------------------- /distributions/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/distributions/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /distributions/__pycache__/gmm.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/distributions/__pycache__/gmm.cpython-35.pyc -------------------------------------------------------------------------------- /distributions/__pycache__/normal.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/distributions/__pycache__/normal.cpython-35.pyc -------------------------------------------------------------------------------- /distributions/__pycache__/real_nvp_bijector.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/distributions/__pycache__/real_nvp_bijector.cpython-35.pyc -------------------------------------------------------------------------------- /distributions/gmm.py: -------------------------------------------------------------------------------- 1 | """ Gaussian mixture model. """ 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | from sac.misc.mlp import mlp 7 | 8 | LOG_SIG_CAP_MAX = 2 9 | LOG_SIG_CAP_MIN = -20 10 | 11 | 12 | class GMM(object): 13 | def __init__( 14 | self, 15 | K, 16 | Dx, 17 | hidden_layers_sizes=(100, 100), 18 | reg=0.001, 19 | reparameterize=True, 20 | cond_t_lst=(), 21 | ): 22 | self._cond_t_lst = cond_t_lst 23 | self._reg = reg 24 | self._layer_sizes = list(hidden_layers_sizes) + [K * (2 * Dx + 1)] 25 | self._reparameterize = reparameterize 26 | 27 | self._Dx = Dx 28 | self._K = K 29 | 30 | self._create_placeholders() 31 | self._create_graph() 32 | 33 | def _create_placeholders(self): 34 | self._N_pl = tf.placeholder( 35 | tf.int32, 36 | shape=(), 37 | name='N', 38 | ) 39 | 40 | @staticmethod 41 | def _create_log_gaussian(mu_t, log_sig_t, t): 42 | normalized_dist_t = (t - mu_t) * tf.exp(-log_sig_t) # ... x D 43 | quadratic = - 0.5 * tf.reduce_sum(normalized_dist_t ** 2, axis=-1) 44 | # ... x (None) 45 | 46 | log_z = tf.reduce_sum(log_sig_t, axis=-1) # ... 
x (None) 47 | D_t = tf.cast(tf.shape(mu_t)[-1], tf.float32) 48 | log_z += 0.5 * D_t * np.log(2 * np.pi) 49 | 50 | log_p = quadratic - log_z 51 | 52 | return log_p # ... x (None) 53 | 54 | def _create_p_xz_params(self): 55 | K = self._K 56 | Dx = self._Dx 57 | 58 | if len(self._cond_t_lst) == 0: 59 | w_and_mu_and_logsig_t = tf.get_variable( 60 | 'params', self._layer_sizes[-1], 61 | initializer=tf.random_normal_initializer(0, 0.1) 62 | ) 63 | 64 | else: 65 | w_and_mu_and_logsig_t = mlp( 66 | inputs=self._cond_t_lst, 67 | layer_sizes=self._layer_sizes, 68 | output_nonlinearity=None, 69 | ) # ... x K*Dx*2+K 70 | 71 | w_and_mu_and_logsig_t = tf.reshape( 72 | w_and_mu_and_logsig_t, shape=(-1, K, 2*Dx+1)) 73 | 74 | log_w_t = w_and_mu_and_logsig_t[..., 0] 75 | mu_t = w_and_mu_and_logsig_t[..., 1:1+Dx] 76 | log_sig_t = w_and_mu_and_logsig_t[..., 1+Dx:] 77 | 78 | log_sig_t = tf.clip_by_value(log_sig_t, LOG_SIG_CAP_MIN, LOG_SIG_CAP_MAX) 79 | 80 | return log_w_t, mu_t, log_sig_t 81 | 82 | def _create_graph(self): 83 | Dx = self._Dx 84 | 85 | if len(self._cond_t_lst) > 0: 86 | N_t = tf.shape(self._cond_t_lst[0])[0] 87 | else: 88 | N_t = self._N_pl 89 | 90 | K = self._K 91 | 92 | # Create p(x|z). 93 | with tf.variable_scope('p'): 94 | log_ws_t, xz_mus_t, xz_log_sigs_t = self._create_p_xz_params() 95 | # (N x K), (N x K x Dx), (N x K x Dx) 96 | xz_sigs_t = tf.exp(xz_log_sigs_t) 97 | 98 | # Sample the latent code. 99 | z_t = tf.multinomial(logits=log_ws_t, num_samples=1) # N x 1 100 | 101 | # Choose mixture component corresponding to the latent. 102 | mask_t = tf.one_hot( 103 | z_t[:, 0], depth=K, dtype=tf.bool, 104 | on_value=True, off_value=False 105 | ) 106 | xz_mu_t = tf.boolean_mask(xz_mus_t, mask_t) # N x Dx 107 | xz_sig_t = tf.boolean_mask(xz_sigs_t, mask_t) # N x Dx 108 | 109 | # Sample x. 
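# Reparameterized draw from the selected component: x = mu_k + sigma_k * eps,
# eps ~ N(0, I); tf.stop_gradient below blocks gradients through the sample
# when reparameterization is disabled.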
110 | x_t = xz_mu_t + xz_sig_t * tf.random_normal((N_t, Dx)) # N x Dx 111 | if not self._reparameterize: 112 | x_t = tf.stop_gradient(x_t) 113 | 114 | # log p(x|z) 115 | log_p_xz_t = self._create_log_gaussian( 116 | xz_mus_t, xz_log_sigs_t, x_t[:, None, :] 117 | ) # N x K 118 | 119 | # log p(x) 120 | log_p_x_t = tf.reduce_logsumexp(log_p_xz_t + log_ws_t, axis=1) 121 | log_p_x_t -= tf.reduce_logsumexp(log_ws_t, axis=1) # N 122 | 123 | reg_loss_t = 0 124 | reg_loss_t += self._reg * 0.5 * tf.reduce_mean(xz_log_sigs_t ** 2) 125 | reg_loss_t += self._reg * 0.5 * tf.reduce_mean(xz_mus_t ** 2) 126 | 127 | self._log_p_x_t = log_p_x_t 128 | self._reg_loss_t = reg_loss_t 129 | self._x_t = x_t 130 | 131 | self._log_ws_t = log_ws_t 132 | self._mus_t = xz_mus_t 133 | self._log_sigs_t = xz_log_sigs_t 134 | 135 | @property 136 | def log_p_t(self): 137 | return self._log_p_x_t 138 | 139 | @property 140 | def reg_loss_t(self): 141 | return self._reg_loss_t 142 | 143 | @property 144 | def x_t(self): 145 | return self._x_t 146 | 147 | @property 148 | def mus_t(self): 149 | return self._mus_t 150 | 151 | @property 152 | def log_sigs_t(self): 153 | return self._log_sigs_t 154 | 155 | @property 156 | def log_ws_t(self): 157 | return self._log_ws_t 158 | 159 | @property 160 | def N_t(self): 161 | return self._N_pl 162 | -------------------------------------------------------------------------------- /distributions/normal.py: -------------------------------------------------------------------------------- 1 | """ Multivariate normal distribution with mean and std deviation outputted by a neural net """ 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | from sac.misc.mlp import mlp 7 | 8 | LOG_SIG_CAP_MAX = 2 9 | LOG_SIG_CAP_MIN = -20 10 | 11 | 12 | class Normal(object): 13 | def __init__( 14 | self, 15 | Dx, 16 | hidden_layers_sizes=(100, 100), 17 | reg=0.001, 18 | reparameterize=True, 19 | cond_t_lst=(), 20 | ): 21 | self._cond_t_lst = cond_t_lst 22 | self._reg = reg 23 | self._layer_sizes = list(hidden_layers_sizes) + [2 * Dx] 24 | print(self._layer_sizes) 25 | self._reparameterize = reparameterize 26 | 27 | self._Dx = Dx 28 | 29 | self._create_placeholders() 30 | self._create_graph() 31 | 32 | def _create_placeholders(self): 33 | self._N_pl = tf.placeholder( 34 | tf.int32, 35 | shape=(), 36 | name='N', 37 | ) 38 | 39 | def _create_graph(self): 40 | Dx = self._Dx 41 | 42 | if len(self._cond_t_lst) == 0: 43 | mu_and_logsig_t = tf.get_variable( 44 | 'params', self._layer_sizes[-1], 45 | initializer=tf.random_normal_initializer(0, 0.1) 46 | ) 47 | else: 48 | mu_and_logsig_t = mlp( 49 | inputs=self._cond_t_lst, 50 | layer_sizes=self._layer_sizes, 51 | output_nonlinearity=None, 52 | ) # ... 
x K*Dx*2+K 53 | 54 | self._mu_t = mu_and_logsig_t[..., :Dx] 55 | self._log_sig_t = tf.clip_by_value(mu_and_logsig_t[..., Dx:], LOG_SIG_CAP_MIN, LOG_SIG_CAP_MAX) 56 | 57 | # Tensorflow's multivariate normal distribution supports reparameterization 58 | ds = tf.contrib.distributions 59 | dist = ds.MultivariateNormalDiag(loc=self._mu_t, scale_diag=tf.exp(self._log_sig_t)) 60 | x_t = dist.sample() 61 | if not self._reparameterize: 62 | x_t = tf.stop_gradient(x_t) 63 | log_pi_t = dist.log_prob(x_t) 64 | 65 | self._dist = dist 66 | self._x_t = x_t 67 | self._log_pi_t = log_pi_t 68 | 69 | reg_loss_t = self._reg * 0.5 * tf.reduce_mean(self._log_sig_t ** 2) 70 | reg_loss_t += self._reg * 0.5 * tf.reduce_mean(self._mu_t ** 2) 71 | self._reg_loss_t = reg_loss_t 72 | 73 | 74 | 75 | @property 76 | def log_p_t(self): 77 | return self._log_pi_t 78 | 79 | @property 80 | def reg_loss_t(self): 81 | return self._reg_loss_t 82 | 83 | @property 84 | def x_t(self): 85 | return self._x_t 86 | 87 | @property 88 | def mu_t(self): 89 | return self._mu_t 90 | 91 | @property 92 | def log_sig_t(self): 93 | return self._log_sig_t 94 | -------------------------------------------------------------------------------- /environments/__init__.py: -------------------------------------------------------------------------------- 1 | from .multigoal import MultiGoalEnv 2 | from .gym_env import GymEnv 3 | from .delayed_env import DelayedEnv -------------------------------------------------------------------------------- /environments/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /environments/__pycache__/delayed_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/delayed_env.cpython-35.pyc -------------------------------------------------------------------------------- /environments/__pycache__/gym_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/gym_env.cpython-35.pyc -------------------------------------------------------------------------------- /environments/__pycache__/multigoal.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/multigoal.cpython-35.pyc -------------------------------------------------------------------------------- /environments/__pycache__/pusher.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/pusher.cpython-35.pyc -------------------------------------------------------------------------------- /environments/delayed_env.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 
from rllab.envs.proxy_env import ProxyEnv 4 | from rllab.core.serializable import Serializable 5 | 6 | 7 | class DelayedEnv(ProxyEnv, Serializable): 8 | def __init__(self, env, delay=0.01): 9 | Serializable.quick_init(self, locals()) 10 | ProxyEnv.__init__(self, env) 11 | 12 | self._delay = delay 13 | 14 | def step(self, action): 15 | time.sleep(self._delay) 16 | return self._wrapped_env.step(action) 17 | -------------------------------------------------------------------------------- /environments/gym_env.py: -------------------------------------------------------------------------------- 1 | """ Rllab implementation with a HACK. See comment in `GymEnv.__init__`. """ 2 | import gym 3 | import gym.wrappers 4 | import gym.envs 5 | import gym.spaces 6 | import traceback 7 | import logging 8 | 9 | try: 10 | from gym import logger as monitor_logger 11 | 12 | monitor_logger.setLevel(logging.WARNING) 13 | except Exception as e: 14 | traceback.print_exc() 15 | 16 | import os 17 | import os.path as osp 18 | from rllab.envs.base import Env, Step 19 | from rllab.core.serializable import Serializable 20 | from rllab.spaces.box import Box 21 | from rllab.spaces.discrete import Discrete 22 | from rllab.spaces.product import Product 23 | from rllab.misc import logger 24 | 25 | 26 | def convert_gym_space(space): 27 | if isinstance(space, gym.spaces.Box): 28 | return Box(low=space.low, high=space.high) 29 | elif isinstance(space, gym.spaces.Discrete): 30 | return Discrete(n=space.n) 31 | elif isinstance(space, gym.spaces.Tuple): 32 | return Product([convert_gym_space(x) for x in space.spaces]) 33 | else: 34 | raise NotImplementedError 35 | 36 | 37 | class CappedCubicVideoSchedule(object): 38 | # Copied from gym, since this method is frequently moved around 39 | def __call__(self, count): 40 | if count < 1000: 41 | return int(round(count ** (1. / 3))) ** 3 == count 42 | else: 43 | return count % 1000 == 0 44 | 45 | 46 | class FixedIntervalVideoSchedule(object): 47 | def __init__(self, interval): 48 | self.interval = interval 49 | 50 | def __call__(self, count): 51 | return count % self.interval == 0 52 | 53 | 54 | class NoVideoSchedule(object): 55 | def __call__(self, count): 56 | return False 57 | 58 | 59 | class GymEnv(Env, Serializable): 60 | def __init__(self, env_name, record_video=False, video_schedule=None, log_dir=None, record_log=False, 61 | force_reset=True): 62 | if log_dir is None: 63 | if logger.get_snapshot_dir() is None: 64 | logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.") 65 | else: 66 | log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log") 67 | Serializable.quick_init(self, locals()) 68 | 69 | env = gym.envs.make(env_name) 70 | 71 | # HACK: Gets rid of the TimeLimit wrapper that sets 'done = True' when 72 | # the time limit specified for each environment has been passed and 73 | # therefore the environment is not Markovian (terminal condition depends 74 | # on time rather than state). 
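# gym.envs.make() returns the environment wrapped in TimeLimit; accessing
# .env strips that outermost wrapper.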
75 | env = env.env 76 | 77 | self.env = env 78 | self.env_id = env.spec.id 79 | 80 | assert not (not record_log and record_video) 81 | 82 | if log_dir is None or record_log is False: 83 | self.monitoring = False 84 | else: 85 | if not record_video: 86 | video_schedule = NoVideoSchedule() 87 | else: 88 | if video_schedule is None: 89 | video_schedule = CappedCubicVideoSchedule() 90 | self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True) 91 | self.monitoring = True 92 | 93 | self._observation_space = convert_gym_space(env.observation_space) 94 | logger.log("observation space: {}".format(self._observation_space)) 95 | self._action_space = convert_gym_space(env.action_space) 96 | logger.log("action space: {}".format(self._action_space)) 97 | self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps'] 98 | self._log_dir = log_dir 99 | self._force_reset = force_reset 100 | 101 | @property 102 | def observation_space(self): 103 | return self._observation_space 104 | 105 | @property 106 | def action_space(self): 107 | return self._action_space 108 | 109 | @property 110 | def horizon(self): 111 | return self._horizon 112 | 113 | def reset(self): 114 | if self._force_reset and self.monitoring: 115 | from gym.wrappers.monitoring import Monitor 116 | assert isinstance(self.env, Monitor) 117 | recorder = self.env.stats_recorder 118 | if recorder is not None: 119 | recorder.done = True 120 | return self.env.reset() 121 | 122 | def step(self, action): 123 | next_obs, reward, done, info = self.env.step(action) 124 | return Step(next_obs, reward, done, **info) 125 | 126 | def render(self, mode='human', close=False): 127 | return self.env._render(mode, close) 128 | # self.env.render() 129 | 130 | def terminate(self): 131 | if self.monitoring: 132 | self.env._close() 133 | if self._log_dir is not None: 134 | print(""" 135 | *************************** 136 | 137 | Training finished! 
You can upload results to OpenAI Gym by running the following command: 138 | 139 | python scripts/submit_gym.py %s 140 | 141 | *************************** 142 | """ % self._log_dir) 143 | 144 | -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- 1 | from .gym_env import GymEnv 2 | from .cheetah_hurdle_env import HalfCheetahHurdleEnv 3 | from .multi_direction_env import ( 4 | MultiDirectionSwimmerEnv, 5 | MultiDirectionAntEnv, 6 | MultiDirectionHumanoidEnv) 7 | 8 | from .random_goal_ant_env import RandomGoalAntEnv 9 | from .pusher import PusherEnv 10 | from .cross_maze_ant_env import CrossMazeAntEnv 11 | from .simple_maze_ant_env import SimpleMazeAntEnv 12 | from .hierarchy_proxy_env import HierarchyProxyEnv 13 | from .multigoal import MultiGoalEnv 14 | -------------------------------------------------------------------------------- /envs/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/cheetah_hurdle_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/cheetah_hurdle_env.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/cross_maze_ant_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/cross_maze_ant_env.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/gym_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/gym_env.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/helpers.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/helpers.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/hierarchy_proxy_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/hierarchy_proxy_env.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/multi_direction_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/multi_direction_env.cpython-35.pyc 
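The envs package exported above (envs/__init__.py) provides the composition environments used by mujoco_am_sac.py. A minimal usage sketch, assuming rllab, mujoco-py, and this repository are importable and the MuJoCo XML models resolve via PROJECT_PATH:

```
from envs.cross_maze_ant_env import CrossMazeAntEnv

# The goal is sampled from [[6, -6], [6, 6], [12, 0]] unless
# fixed_goal_position is passed to the constructor.
env = CrossMazeAntEnv()
observation = env.reset()

# rllab-style Step namedtuple: (observation, reward, done, info).
next_observation, reward, done, info = env.step(env.action_space.sample())
```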
-------------------------------------------------------------------------------- /envs/__pycache__/multigoal.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/multigoal.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/pusher.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/pusher.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/random_goal_ant_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/random_goal_ant_env.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/simple_maze_ant_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/simple_maze_ant_env.cpython-35.pyc -------------------------------------------------------------------------------- /envs/cheetah_hurdle_env.py: -------------------------------------------------------------------------------- 1 | """Implements a ant which is sparsely rewarded for reaching a goal""" 2 | #from gym.envs.mujoco.half_cheetah import HalfCheetahEnv 3 | #from gym.envs.mujoco.mujoco_env import MujocoEnv 4 | 5 | 6 | from rllab.core.serializable import Serializable 7 | from sac.misc.utils import PROJECT_PATH 8 | from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv 9 | from rllab.envs.mujoco.mujoco_env import MujocoEnv 10 | from rllab.envs.base import Step 11 | from gym import utils 12 | import os 13 | import numpy as np 14 | 15 | MODELS_PATH = os.path.abspath(os.path.join(PROJECT_PATH, 'sac/mujoco_models')) 16 | 17 | class HalfCheetahHurdleEnv(HalfCheetahEnv): 18 | def __init__(self): 19 | self.exteroceptive_observation =[12.0,0,0.5] 20 | self.hurdles_xpos=[-15.,-13.,-9.,-5.,-1.,3.,7.,11.,15.]#,19.,23.,27.] 
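# x-coordinates of the hurdles along the track; get_current_obs(), the
# collision check, and the hurdle reward below use them to find the next
# hurdle ahead of the cheetah's feet.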
21 | path = os.path.join(MODELS_PATH, 'half_cheetah_hurdle.xml') 22 | MujocoEnv.__init__(self,file_path=path) 23 | #MujocoEnv.__init__(self) 24 | Serializable.quick_init(self, locals()) 25 | 26 | def get_current_obs(self): 27 | proprioceptive_observation = super().get_current_obs() 28 | x_pos1 =self.get_body_com('ffoot')[0]#self.model.data.qpos.flat[:1] 29 | x_pos2 =self.get_body_com('bfoot')[0]#self.model.data.qpos.flat[:1] 30 | matches = [x for x in self.hurdles_xpos if x >= x_pos2] 31 | next_hurdle_x_pos = [matches[0]] 32 | ff_dist_frm_next_hurdle=[np.linalg.norm(matches[0] - x_pos1)] 33 | bf_dist_frm_next_hurdle=[np.linalg.norm(matches[0] - x_pos2)] 34 | observation =np.concatenate([proprioceptive_observation,next_hurdle_x_pos,bf_dist_frm_next_hurdle]).reshape(-1) 35 | return observation 36 | 37 | def isincollision(self): 38 | hurdle_size=[0.05,1.0,0.03] 39 | x_pos =self.get_body_com('ffoot')[0]#self.model.data.qpos.flat[:1] 40 | matches = [x for x in self.hurdles_xpos if x >= x_pos] 41 | if len(matches)==0: 42 | return False 43 | hurdle_pos =[matches[0],0.0,0.20] 44 | #names=['fthigh','bthigh'] 45 | #names=['torso','bthigh','bshin','bfoot'] 46 | names=['ffoot'] 47 | xyz_pos=[] 48 | for i in range(0,len(names)): 49 | xyz_pos.append(self.get_body_com(names[i])) 50 | for i in range(0,len(names)): 51 | #xyz_position = self.get_body_com(names[i]) 52 | cf=True 53 | for j in range(0,1): 54 | if abs(hurdle_pos[j]-xyz_pos[i][j])>1.5*hurdle_size[j]: 55 | cf=False 56 | break 57 | if cf: 58 | return True 59 | return False 60 | 61 | def get_hurdle_reward(self): 62 | hurdle_size=[0.05,1.0,0.03] 63 | x_pos =self.get_body_com('bfoot')[0]#self.model.data.qpos.flat[:1] 64 | matches = [x for x in self.hurdles_xpos if x >= x_pos] 65 | hurdle_reward =-1.0*len(matches) 66 | 67 | return hurdle_reward 68 | 69 | def step(self, action): 70 | xyz_pos_before = self.get_body_com('bshin') 71 | self.forward_dynamics(action) 72 | xyz_pos_after = self.get_body_com('bshin') 73 | xyz_position = self.get_body_com('torso') 74 | jump_reward = np.abs(self.get_body_comvel("torso")[2]) 75 | run_reward = self.get_body_comvel("torso")[0] 76 | next_obs= self.get_current_obs() 77 | if self.isincollision():# or (xyz_pos_after[0]-xyz_pos_before[0])<-0.01:#dist_from_hurdle < 1 and dist_from_hurdle > 0.3 and z_after<0.05:(xyz_pos_after[0]-xyz_pos_before[0])<-0.01: # 78 | collision_penality=-2.0 79 | #print("collision") 80 | else: 81 | collision_penality=0.0 82 | #print("not collisions") 83 | hurdle_reward = self.get_hurdle_reward() 84 | #print(hurdle_reward) 85 | done = False 86 | goal_reward=0 87 | goal_distance =np.linalg.norm(xyz_position - self.exteroceptive_observation) 88 | if (goal_distance)<1.0: 89 | done=True 90 | goal_reward=1000 91 | else: 92 | done=False 93 | 94 | reward=-1e-1*goal_distance+hurdle_reward+goal_reward+run_reward+3e-1*jump_reward+collision_penality#1e-1*goal_distance+run_reward+jump_reward+collision_penality 95 | info = {'goal_distance': goal_distance} 96 | return Step(next_obs, reward, done, **info) 97 | -------------------------------------------------------------------------------- /envs/cross_maze_ant_env.py: -------------------------------------------------------------------------------- 1 | """Implements an ant whose goal is to reach a target in a maze""" 2 | 3 | import os 4 | 5 | import numpy as np 6 | 7 | from rllab.core.serializable import Serializable 8 | from sac.misc.utils import PROJECT_PATH 9 | from .helpers import random_point_in_circle, get_random_goal_logs 10 | from .random_goal_ant_env 
import RandomGoalAntEnv 11 | 12 | MODELS_PATH = os.path.abspath( 13 | os.path.join(PROJECT_PATH, 'sac/mujoco_models')) 14 | 15 | class CrossMazeAntEnv(RandomGoalAntEnv, Serializable): 16 | """Implements an ant whose goal is to reach a target in a maze""" 17 | 18 | FILE_PATH = os.path.join(MODELS_PATH, 'cross_maze_ant.xml') 19 | 20 | def __init__(self, 21 | reward_type='dense', 22 | terminate_at_goal=True, 23 | goal_reward_weight=3e-1, 24 | goal_radius=1, 25 | goal_distance=1, 26 | goal_angle_range=(0, 2*np.pi), 27 | velocity_reward_weight=0, 28 | ctrl_cost_coeff=1e-2, 29 | contact_cost_coeff=1e-3, 30 | survive_reward=5e-2, 31 | fixed_goal_position=None, 32 | *args, 33 | **kwargs): 34 | file_path = self.__class__.FILE_PATH 35 | kwargs.pop('file_path', None) 36 | self.fixed_goal_position = fixed_goal_position 37 | 38 | super(CrossMazeAntEnv, self).__init__( 39 | file_path=file_path, 40 | reward_type=reward_type, 41 | terminate_at_goal=terminate_at_goal, 42 | goal_reward_weight=goal_reward_weight, 43 | goal_radius=goal_radius, 44 | goal_distance=goal_distance, 45 | goal_angle_range=goal_angle_range, 46 | velocity_reward_weight=velocity_reward_weight, 47 | ctrl_cost_coeff=ctrl_cost_coeff, 48 | contact_cost_coeff=contact_cost_coeff, 49 | survive_reward=survive_reward, 50 | *args, 51 | **kwargs) 52 | self._serializable_initialized = False 53 | 54 | def reset(self, goal_position=None, *args, **kwargs): 55 | possible_goal_positions = [[6, -6], [6, 6], [12, 0]] 56 | 57 | if goal_position is None: 58 | if self.fixed_goal_position is not None: 59 | goal_position = self.fixed_goal_position 60 | else: 61 | goal_position = possible_goal_positions[ 62 | np.random.choice(len(possible_goal_positions))] 63 | 64 | observation = super(CrossMazeAntEnv, self).reset( 65 | goal_position=np.array(goal_position), *args, **kwargs) 66 | 67 | return observation 68 | 69 | def get_current_obs(self): 70 | observation = super().get_current_obs() 71 | 72 | if self.fixed_goal_position is not None: 73 | return observation[:-2] 74 | 75 | return observation 76 | 77 | def render(self, *args, **kwargs): 78 | result = super(CrossMazeAntEnv, self).render(*args, **kwargs) 79 | self.viewer.cam.elevation = -55 80 | self.viewer.cam.lookat[0] = 7 81 | self.viewer.cam.lookat[2] = 0 82 | self.viewer.cam.distance = self.model.stat.extent * 0.9 83 | self.viewer.cam.azimuth = 0 84 | self.viewer.cam.trackbodyid = 0 85 | 86 | return result 87 | -------------------------------------------------------------------------------- /envs/delayed_env.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from rllab.envs.proxy_env import ProxyEnv 4 | from rllab.core.serializable import Serializable 5 | 6 | 7 | class DelayedEnv(ProxyEnv, Serializable): 8 | def __init__(self, env, delay=0.01): 9 | Serializable.quick_init(self, locals()) 10 | ProxyEnv.__init__(self, env) 11 | 12 | self._delay = delay 13 | 14 | def step(self, action): 15 | time.sleep(self._delay) 16 | return self._wrapped_env.step(action) 17 | -------------------------------------------------------------------------------- /envs/gym_env.py: -------------------------------------------------------------------------------- 1 | """ Rllab implementation with a HACK. See comment in GymEnv.__init__(). 
""" 2 | import gym 3 | import gym.wrappers 4 | import gym.envs 5 | import gym.spaces 6 | import traceback 7 | import logging 8 | 9 | try: 10 | from gym import logger as monitor_logger 11 | 12 | monitor_logger.setLevel(logging.WARNING) 13 | except Exception as e: 14 | traceback.print_exc() 15 | 16 | import os 17 | import os.path as osp 18 | from rllab.envs.base import Env, Step 19 | from rllab.core.serializable import Serializable 20 | from rllab.spaces.box import Box 21 | from rllab.spaces.discrete import Discrete 22 | from rllab.spaces.product import Product 23 | from rllab.misc import logger 24 | 25 | 26 | def convert_gym_space(space): 27 | if isinstance(space, gym.spaces.Box): 28 | return Box(low=space.low, high=space.high) 29 | elif isinstance(space, gym.spaces.Discrete): 30 | return Discrete(n=space.n) 31 | elif isinstance(space, gym.spaces.Tuple): 32 | return Product([convert_gym_space(x) for x in space.spaces]) 33 | else: 34 | raise NotImplementedError 35 | 36 | 37 | class CappedCubicVideoSchedule(object): 38 | # Copied from gym, since this method is frequently moved around 39 | def __call__(self, count): 40 | if count < 1000: 41 | return int(round(count ** (1. / 3))) ** 3 == count 42 | else: 43 | return count % 1000 == 0 44 | 45 | 46 | class FixedIntervalVideoSchedule(object): 47 | def __init__(self, interval): 48 | self.interval = interval 49 | 50 | def __call__(self, count): 51 | return count % self.interval == 0 52 | 53 | 54 | class NoVideoSchedule(object): 55 | def __call__(self, count): 56 | return False 57 | 58 | 59 | class GymEnv(Env, Serializable): 60 | def __init__(self, env_name, record_video=False, video_schedule=None, log_dir=None, record_log=False, 61 | force_reset=True): 62 | if log_dir is None: 63 | if logger.get_snapshot_dir() is None: 64 | logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.") 65 | else: 66 | log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log") 67 | Serializable.quick_init(self, locals()) 68 | 69 | env = gym.envs.make(env_name) 70 | 71 | # HACK: Gets rid of the TimeLimit wrapper that sets 'done = True' when 72 | # the time limit specified for each environment has been passed and 73 | # therefore the environment is not Markovian (terminal condition depends 74 | # on time rather than state). 
75 | env = env.env 76 | 77 | self.env = env 78 | self.env_id = env.spec.id 79 | 80 | assert not (not record_log and record_video) 81 | 82 | if log_dir is None or record_log is False: 83 | self.monitoring = False 84 | else: 85 | if not record_video: 86 | video_schedule = NoVideoSchedule() 87 | else: 88 | if video_schedule is None: 89 | video_schedule = CappedCubicVideoSchedule() 90 | self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True) 91 | self.monitoring = True 92 | 93 | self._observation_space = convert_gym_space(env.observation_space) 94 | logger.log("observation space: {}".format(self._observation_space)) 95 | self._action_space = convert_gym_space(env.action_space) 96 | logger.log("action space: {}".format(self._action_space)) 97 | self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps'] 98 | self._log_dir = log_dir 99 | self._force_reset = force_reset 100 | 101 | @property 102 | def observation_space(self): 103 | return self._observation_space 104 | 105 | @property 106 | def action_space(self): 107 | return self._action_space 108 | 109 | @property 110 | def horizon(self): 111 | return self._horizon 112 | 113 | def reset(self): 114 | if self._force_reset and self.monitoring: 115 | from gym.wrappers.monitoring import Monitor 116 | assert isinstance(self.env, Monitor) 117 | recorder = self.env.stats_recorder 118 | if recorder is not None: 119 | recorder.done = True 120 | return self.env.reset() 121 | 122 | def step(self, action): 123 | next_obs, reward, done, info = self.env.step(action) 124 | return Step(next_obs, reward, done, **info) 125 | 126 | def render(self, mode='human', close=False): 127 | return self.env._render(mode, close) 128 | # self.env.render() 129 | 130 | def terminate(self): 131 | if self.monitoring: 132 | self.env._close() 133 | if self._log_dir is not None: 134 | print(""" 135 | *************************** 136 | 137 | Training finished! 
You can upload results to OpenAI Gym by running the following command: 138 | 139 | python scripts/submit_gym.py %s 140 | 141 | *************************** 142 | """ % self._log_dir) 143 | -------------------------------------------------------------------------------- /envs/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def random_point_in_circle(angle_range=(0, 2*np.pi), radius=(0, 25)): 5 | angle = np.random.uniform(*angle_range) 6 | radius = radius if np.isscalar(radius) else np.random.uniform(*radius) 7 | x, y = np.cos(angle) * radius, np.sin(angle) * radius 8 | point = np.array([x, y]) 9 | return point 10 | 11 | def get_random_goal_logs(paths, goal_radius, fixed_goal_position=False): 12 | if fixed_goal_position: 13 | position_slice = slice(-3, -1) 14 | else: 15 | position_slice = slice(-5, -3) 16 | 17 | logs = [] 18 | if len(paths) > 0: 19 | progs = [ 20 | np.linalg.norm(path["observations"][-1][position_slice] 21 | - path["observations"][0][position_slice]) 22 | for path in paths 23 | ] 24 | 25 | time_in_goals = [ 26 | np.sum(np.linalg.norm( 27 | ( 28 | path['observations'][:, position_slice] 29 | - [path_goal['goal_position'] for path_goal in path['env_infos']] 30 | ) 31 | , axis=1 32 | ) < goal_radius) 33 | for path in paths 34 | ] 35 | 36 | logs += [ 37 | ('AverageProgress', np.mean(progs)), 38 | ('MaxProgress', np.max(progs)), 39 | ('MinProgress', np.min(progs)), 40 | ('StdProgress', np.std(progs)), 41 | 42 | ('AverageTimeInGoal', np.mean(time_in_goals)), 43 | ('MaxTimeInGoal', np.max(time_in_goals)), 44 | ('MinTimeInGoal', np.min(time_in_goals)), 45 | ('StdTimeInGoal', np.std(time_in_goals)), 46 | ] 47 | 48 | goal_positions, final_positions = zip(*[ 49 | ([path_goal['goal_position'] for path_goal in p['env_infos']][-1], 50 | p['observations'][-1][position_slice]) 51 | for p in paths 52 | ]) 53 | 54 | begin_goal_distances = [ 55 | np.linalg.norm(goal_position) for goal_position in goal_positions] 56 | final_goal_distances = [ 57 | np.linalg.norm(goal_position - final_position) 58 | for goal_position, final_position in zip(goal_positions, final_positions) 59 | ] 60 | progress_towards_goals = [ 61 | begin_goal_distance - final_goal_distance 62 | for (begin_goal_distance, final_goal_distance) 63 | in zip(begin_goal_distances, final_goal_distances) 64 | ] 65 | 66 | 67 | for series, name in zip((begin_goal_distances, 68 | final_goal_distances, 69 | progress_towards_goals), 70 | ('BeginGoalDistance', 71 | 'FinalGoalDistance', 72 | 'ProgressTowardsGoal')): 73 | for fn_name in ('mean', 'std', 'min', 'max'): 74 | fn = getattr(np, fn_name) 75 | logs.append((fn_name.capitalize() + name, fn(series))) 76 | 77 | return logs 78 | 79 | def get_multi_direction_logs(paths): 80 | progs = [ 81 | np.linalg.norm(path["observations"][-1][-3:-1] 82 | - path["observations"][0][-3:-1]) 83 | for path in paths 84 | ] 85 | logs = ( 86 | ('AverageProgress', np.mean(progs)), 87 | ('MaxProgress', np.max(progs)), 88 | ('MinProgress', np.min(progs)), 89 | ('StdProgress', np.std(progs)), 90 | ) 91 | 92 | return logs 93 | -------------------------------------------------------------------------------- /envs/hierarchy_proxy_env.py: -------------------------------------------------------------------------------- 1 | """Implements an environment proxy to test hierarchy policies""" 2 | 3 | from rllab.envs.proxy_env import ProxyEnv 4 | from rllab.core.serializable import Serializable 5 | 6 | class HierarchyProxyEnv(ProxyEnv): 7 | def 
__init__(self, low_level_policy, *args, **kwargs): 8 | Serializable.quick_init(self, locals()) 9 | self._low_level_policy = low_level_policy 10 | super().__init__(*args, **kwargs) 11 | 12 | def step(self, high_level_action): 13 | current_observation = ( 14 | # Our env might be double wrapped, e.g. around NormalizedEnv 15 | self._wrapped_env._wrapped_env.get_current_obs() 16 | if isinstance(self._wrapped_env, ProxyEnv) 17 | else self._wrapped_env.get_current_obs()) 18 | 19 | with self._low_level_policy.deterministic(h=high_level_action[None]): 20 | action, _ = self._low_level_policy.get_action( 21 | observation=current_observation[:self._low_level_policy._Ds]) 22 | 23 | return super().step(action) 24 | -------------------------------------------------------------------------------- /envs/meta_env.py: -------------------------------------------------------------------------------- 1 | from rllab import spaces 2 | from rllab.core.serializable import Serializable 3 | from rllab.envs.env_spec import EnvSpec 4 | 5 | from sac.misc.utils import concat_obs_z 6 | 7 | import numpy as np 8 | 9 | class MetaEnv(Serializable): 10 | def __init__(self, env, base_policy, num_skills, steps_per_option=100): 11 | Serializable.quick_init(self, locals()) 12 | self._base_policy = base_policy 13 | self._env = env 14 | self._steps_per_option = steps_per_option 15 | self._num_skills = num_skills 16 | self.observation_space = self._env.observation_space 17 | self.action_space = spaces.Discrete(num_skills) 18 | self.spec = EnvSpec(self.observation_space, self.action_space) 19 | self._obs = self.reset() 20 | 21 | def step(self, meta_action): 22 | total_reward = 0 23 | for _ in range(self._steps_per_option): 24 | aug_obs = concat_obs_z(self._obs, meta_action, self._num_skills) 25 | (action, _) = self._base_policy.get_action(aug_obs) 26 | (self._obs, r, done, _) = self._env.step(action) 27 | total_reward += r 28 | if done: break 29 | # Normalize the total reward by number of steps 30 | return (self._obs, total_reward / float(self._steps_per_option), done, {}) 31 | 32 | def reset(self): 33 | return self._env.reset() 34 | 35 | def log_diagnostics(self, paths): 36 | self._env.log_diagnostics(paths) 37 | 38 | def terminate(self): 39 | self._env.terminate() 40 | 41 | 42 | class FixedOptionEnv(Serializable): 43 | def __init__(self, env, num_skills, z): 44 | Serializable.quick_init(self, locals()) 45 | self._env = env 46 | self._num_skills = num_skills 47 | self._z = z 48 | obs_space = self._env.observation_space 49 | low = np.hstack([obs_space.low, np.full(num_skills, 0)]) 50 | high = np.hstack([obs_space.high, np.full(num_skills, 1)]) 51 | self.observation_space = spaces.Box(low=low, high=high) 52 | self.action_space = self._env.action_space 53 | self.spec = EnvSpec(self.observation_space, self.action_space) 54 | 55 | def step(self, action): 56 | (obs, r, done, info) = self._env.step(action) 57 | aug_obs = concat_obs_z(obs, self._z, self._num_skills) 58 | return (aug_obs, r, done, info) 59 | 60 | def reset(self): 61 | obs = self._env.reset() 62 | aug_obs = concat_obs_z(obs, self._z, self._num_skills) 63 | return aug_obs 64 | 65 | def log_diagnostics(self, paths): 66 | self._env.log_diagnostics(paths) 67 | 68 | def terminate(self): 69 | self._env.terminate() 70 | -------------------------------------------------------------------------------- /envs/multi_direction_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import 
Serializable 4 | from rllab.envs.mujoco.swimmer_env import SwimmerEnv 5 | from rllab.envs.mujoco.ant_env import AntEnv 6 | from rllab.envs.mujoco.humanoid_env import HumanoidEnv 7 | from rllab.envs.base import Step 8 | from rllab.misc import logger 9 | 10 | from .helpers import get_multi_direction_logs 11 | 12 | class MultiDirectionBaseEnv(Serializable): 13 | def __init__(self, 14 | velocity_reward_weight=1.0, 15 | survive_reward=0, 16 | ctrl_cost_coeff=0, 17 | contact_cost_coeff=0, 18 | velocity_deviation_cost_coeff=0, 19 | *args, **kwargs): 20 | self._velocity_reward_weight = velocity_reward_weight 21 | self._survive_reward = survive_reward 22 | 23 | self._ctrl_cost_coeff = ctrl_cost_coeff 24 | self._contact_cost_coeff = contact_cost_coeff 25 | self._velocity_deviation_cost_coeff = velocity_deviation_cost_coeff 26 | Serializable.quick_init(self, locals()) 27 | 28 | @property 29 | def velocity_reward(self): 30 | xy_velocities = self.get_body_comvel("torso")[:2] 31 | #xy_velocities = self.get_body_comvel("torso")[0] 32 | # rewards for speed on xy-plane (no matter which direction) 33 | xy_velocity = np.linalg.norm(xy_velocities) 34 | 35 | velocity_reward = self._velocity_reward_weight * xy_velocity 36 | return velocity_reward 37 | 38 | @property 39 | def survive_reward(self): 40 | return self._survive_reward 41 | 42 | def control_cost(self, action): 43 | lb, ub = self.action_bounds 44 | scaling = (ub - lb) / 2.0 45 | 46 | return 0.5 * self._ctrl_cost_coeff * np.sum( 47 | np.square(action / scaling)) 48 | 49 | @property 50 | def contact_cost(self): 51 | return 0.5 * self._contact_cost_coeff * np.sum( 52 | np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), 53 | 54 | @property 55 | def is_healthy(self): 56 | return True 57 | 58 | @property 59 | def velocity_deviation_cost(self): 60 | velocity_deviation_cost = ( 61 | 0.5 * 62 | self._velocity_deviation_cost_coeff 63 | * np.sum(np.square(self.get_body_comvel("torso")[2:]))) 64 | return velocity_deviation_cost 65 | 66 | @property 67 | def done(self): 68 | done = not self.is_healthy 69 | return done 70 | 71 | 72 | def step(self, action): 73 | self.forward_dynamics(action) 74 | 75 | reward = ( 76 | self.velocity_reward 77 | + self.survive_reward 78 | - self.control_cost(action) 79 | - self.contact_cost 80 | - self.velocity_deviation_cost) 81 | 82 | next_observation = self.get_current_obs() 83 | #return Step(next_observation, float(reward), self.done) 84 | return Step(next_observation, float(reward), False) 85 | 86 | def log_diagnostics(self, paths, *args, **kwargs): 87 | logs = get_multi_direction_logs(paths) 88 | for row in logs: 89 | logger.record_tabular(*row) 90 | 91 | 92 | class MultiDirectionSwimmerEnv(MultiDirectionBaseEnv, SwimmerEnv): 93 | def __init__(self, 94 | ctrl_cost_coeff=1e-2, 95 | *args, **kwargs): 96 | MultiDirectionBaseEnv.__init__( 97 | self, ctrl_cost_coeff=ctrl_cost_coeff, *args, **kwargs) 98 | SwimmerEnv.__init__( 99 | self, ctrl_cost_coeff=ctrl_cost_coeff, *args, **kwargs) 100 | 101 | @property 102 | def velocity_reward(self): 103 | xy_velocities = self.get_body_comvel("torso")[:2] 104 | 105 | # rewards for speed on positive x direction 106 | xy_velocity = np.linalg.norm(xy_velocities) 107 | if xy_velocities[0] < 0: 108 | xy_velocity *= -1.0 109 | 110 | velocity_reward = self._velocity_reward_weight * xy_velocity 111 | return velocity_reward 112 | 113 | class MultiDirectionAntEnv(MultiDirectionBaseEnv, AntEnv): 114 | def __init__(self, 115 | ctrl_cost_coeff=1e-2, 116 | contact_cost_coeff=1e-3, 117 | survive_reward=5e-2, 
118 | *args, **kwargs): 119 | MultiDirectionBaseEnv.__init__( 120 | self, 121 | ctrl_cost_coeff=ctrl_cost_coeff, 122 | contact_cost_coeff=contact_cost_coeff, 123 | survive_reward=survive_reward, 124 | *args, **kwargs) 125 | AntEnv.__init__(self, *args, **kwargs) 126 | 127 | @property 128 | def is_healthy(self): 129 | return (np.isfinite(self._state).all() 130 | and 0.2 <= self._state[2] <= 1.0) 131 | 132 | class MultiDirectionHumanoidEnv(MultiDirectionBaseEnv, HumanoidEnv): 133 | def __init__(self, 134 | survive_reward=2e-1, 135 | ctrl_cost_coeff=1e-3, 136 | contact_cost_coeff=1e-5, 137 | velocity_deviation_cost_coeff=1e-2, 138 | *args, **kwargs): 139 | MultiDirectionBaseEnv.__init__( 140 | self, 141 | survive_reward=survive_reward, 142 | ctrl_cost_coeff=ctrl_cost_coeff, 143 | contact_cost_coeff=contact_cost_coeff, 144 | velocity_deviation_cost_coeff=velocity_deviation_cost_coeff, 145 | *args, **kwargs) 146 | HumanoidEnv.__init__( 147 | self, 148 | # survive_reward=survive_reward, 149 | alive_bonus=survive_reward, # TODO: remove this 150 | ctrl_cost_coeff=ctrl_cost_coeff, 151 | # contact_cost_coeff=contact_cost_coeff, 152 | impact_cost_coeff=contact_cost_coeff, # TODO: remove this 153 | vel_deviation_cost_coeff=velocity_deviation_cost_coeff, # TODO: remove this 154 | *args, **kwargs) 155 | 156 | @property 157 | def is_healthy(self): 158 | return 0.8 < self.model.data.qpos[2] < 2.0 159 | -------------------------------------------------------------------------------- /envs/simple_maze_ant_env.py: -------------------------------------------------------------------------------- 1 | """Implements an ant whose goal is to reach a target in a maze""" 2 | 3 | import os 4 | 5 | import numpy as np 6 | 7 | from rllab.core.serializable import Serializable 8 | from sac.misc.utils import PROJECT_PATH 9 | from .helpers import random_point_in_circle, get_random_goal_logs 10 | from .random_goal_ant_env import RandomGoalAntEnv 11 | 12 | MODELS_PATH = os.path.abspath( 13 | os.path.join(PROJECT_PATH, 'sac/mujoco_models')) 14 | 15 | class SimpleMazeAntEnv(RandomGoalAntEnv, Serializable): 16 | """Implements an ant whose goal is to reach a target in a maze""" 17 | 18 | FILE_PATH = os.path.join(MODELS_PATH, 'simple_maze_ant.xml') 19 | 20 | def __init__(self, 21 | reward_type='dense', 22 | terminate_at_goal=True, 23 | goal_reward_weight=3e-1, 24 | goal_radius=1.0, 25 | goal_distance=1.0, 26 | goal_angle_range=(0, 2*np.pi), 27 | velocity_reward_weight=0, 28 | ctrl_cost_coeff=1e-2, 29 | contact_cost_coeff=1e-3, 30 | survive_reward=5e-2, 31 | *args, 32 | **kwargs): 33 | file_path = self.__class__.FILE_PATH 34 | kwargs.pop('file_path', None) 35 | super(SimpleMazeAntEnv, self).__init__( 36 | file_path=file_path, 37 | reward_type=reward_type, 38 | terminate_at_goal=terminate_at_goal, 39 | goal_reward_weight=goal_reward_weight, 40 | goal_radius=goal_radius, 41 | goal_distance=goal_distance, 42 | goal_angle_range=goal_angle_range, 43 | velocity_reward_weight=velocity_reward_weight, 44 | ctrl_cost_coeff=ctrl_cost_coeff, 45 | contact_cost_coeff=contact_cost_coeff, 46 | survive_reward=survive_reward, 47 | *args, 48 | **kwargs) 49 | 50 | def reset(self, *args, **kwargs): 51 | observation = super(SimpleMazeAntEnv, self).reset( 52 | goal_position=np.array([20, -13]), *args, **kwargs) 53 | 54 | return observation 55 | -------------------------------------------------------------------------------- /misc/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /misc/__pycache__/instrument.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/instrument.cpython-35.pyc -------------------------------------------------------------------------------- /misc/__pycache__/mlp.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/mlp.cpython-35.pyc -------------------------------------------------------------------------------- /misc/__pycache__/plotter.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/plotter.cpython-35.pyc -------------------------------------------------------------------------------- /misc/__pycache__/sampler.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/sampler.cpython-35.pyc -------------------------------------------------------------------------------- /misc/__pycache__/tf_utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/tf_utils.cpython-35.pyc -------------------------------------------------------------------------------- /misc/__pycache__/utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/utils.cpython-35.pyc -------------------------------------------------------------------------------- /misc/instrument.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | 4 | from rllab.misc.instrument import run_experiment_lite 5 | from sac.misc.utils import timestamp 6 | 7 | from sac.misc.utils import PROJECT_PATH 8 | 9 | DEFAULT_LOG_DIR = PROJECT_PATH + "/data" 10 | 11 | def _create_symlink(folder): 12 | # Create a symbolic link that points to the sac folder and include it 13 | # in the tarball. 14 | 15 | # Unique filename for the symlink. 
16 | include_path = os.path.join('/tmp/', str(uuid.uuid4())) 17 | os.makedirs(include_path) 18 | 19 | os.symlink(os.path.join(PROJECT_PATH, folder), 20 | os.path.join(include_path, folder)) 21 | 22 | return include_path 23 | 24 | 25 | def run_sac_experiment(main, mode, include_folders=None, log_dir=None, 26 | exp_prefix="experiment", exp_name=None, **kwargs): 27 | if exp_name is None: 28 | exp_name = timestamp() 29 | 30 | if log_dir is None: 31 | log_dir = os.path.join( 32 | DEFAULT_LOG_DIR, 33 | "local", 34 | exp_prefix.replace("_", "-"), 35 | exp_name) 36 | 37 | if include_folders is None: 38 | include_folders = list() 39 | 40 | if mode == 'ec2': 41 | include_folders.append('sac') 42 | all_symlinks = list() 43 | 44 | for folder in include_folders: 45 | all_symlinks.append(_create_symlink(folder)) 46 | 47 | kwargs.update(added_project_directories=all_symlinks) 48 | 49 | run_experiment_lite( 50 | stub_method_call=main, 51 | mode=mode, 52 | exp_prefix=exp_prefix, 53 | exp_name=exp_name, 54 | log_dir=log_dir, 55 | **kwargs, 56 | ) 57 | -------------------------------------------------------------------------------- /misc/plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | class QFPolicyPlotter: 6 | def __init__(self, qf, policy, obs_lst, default_action, n_samples): 7 | self._qf = qf 8 | self._policy = policy 9 | self._obs_lst = obs_lst 10 | self._default_action = default_action 11 | self._n_samples = n_samples 12 | 13 | self._var_inds = np.where(np.isnan(default_action))[0] 14 | assert len(self._var_inds) == 2 15 | 16 | n_plots = len(obs_lst) 17 | 18 | x_size = 5 * n_plots 19 | y_size = 5 20 | 21 | fig = plt.figure(figsize=(x_size, y_size)) 22 | self._ax_lst = [] 23 | for i in range(n_plots): 24 | ax = fig.add_subplot(100 + n_plots * 10 + i + 1) 25 | ax.set_xlim((-1, 1)) 26 | ax.set_ylim((-1, 1)) 27 | ax.grid(True) 28 | self._ax_lst.append(ax) 29 | 30 | self._line_objects = list() 31 | 32 | def draw(self): 33 | # noinspection PyArgumentList 34 | [h.remove() for h in self._line_objects] 35 | self._line_objects = list() 36 | 37 | self._plot_level_curves() 38 | self._plot_action_samples() 39 | 40 | plt.draw() 41 | plt.pause(0.001) 42 | 43 | def _plot_level_curves(self): 44 | # Create mesh grid. 45 | xs = np.linspace(-1, 1, 50) 46 | ys = np.linspace(-1, 1, 50) 47 | xgrid, ygrid = np.meshgrid(xs, ys) 48 | N = len(xs)*len(ys) 49 | 50 | # Copy default values along the first axis and replace nans with 51 | # the mesh grid points. 
52 | actions = np.tile(self._default_action, (N, 1)) 53 | actions[:, self._var_inds[0]] = xgrid.ravel() 54 | actions[:, self._var_inds[1]] = ygrid.ravel() 55 | 56 | for ax, obs in zip(self._ax_lst, self._obs_lst): 57 | qs = self._qf.eval(obs[None], actions) 58 | qs = qs.reshape(xgrid.shape) 59 | 60 | cs = ax.contour(xgrid, ygrid, qs, 20) 61 | self._line_objects += cs.collections 62 | self._line_objects += ax.clabel( 63 | cs, inline=1, fontsize=10, fmt='%.2f') 64 | 65 | def _plot_action_samples(self): 66 | for ax, obs in zip(self._ax_lst, self._obs_lst): 67 | actions = self._policy.get_actions( 68 | np.ones((self._n_samples, 1)) * obs[None, :]) 69 | 70 | x, y = actions[:, 0], actions[:, 1] 71 | self._line_objects += ax.plot(x, y, 'b*') 72 | -------------------------------------------------------------------------------- /misc/remote_sampler.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import ray # TODO: Add ray to dependencies. 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | from rllab.misc.overrides import overrides 7 | from rllab.misc import logger 8 | 9 | from . import tf_utils 10 | from .sampler import Sampler, rollout 11 | 12 | # TODO: Make the remote sampler correctly use the initial exploration policy, as of now, using this will fail 13 | 14 | class RemoteSampler(Sampler): 15 | def __init__(self, **kwargs): 16 | super(RemoteSampler, self).__init__(**kwargs) 17 | 18 | self._remote_environment = None 19 | self._remote_path = None 20 | self._n_episodes = 0 21 | self._total_samples = 0 22 | self._last_path_return = 0 23 | self._max_path_return = -np.inf 24 | 25 | @overrides 26 | def initialize(self, env, policy, pool): 27 | super(RemoteSampler, self).initialize(env, policy, pool) 28 | 29 | ray.init() 30 | 31 | env_pkl = pickle.dumps(env) 32 | policy_pkl = pickle.dumps(policy) 33 | 34 | self._remote_environment = _RemoteEnv.remote(env_pkl, policy_pkl) 35 | 36 | def sample(self): 37 | if self._remote_path is None: 38 | policy_params = self.policy.get_param_values() 39 | self._remote_path = self._remote_environment.rollout.remote( 40 | policy_params, self._max_path_length) 41 | 42 | path_ready, _ = ray.wait([self._remote_path], timeout=0) 43 | 44 | if len(path_ready) or not self.batch_ready(): 45 | path = ray.get(self._remote_path) 46 | self.pool.add_path(path) 47 | self._remote_path = None 48 | self._total_samples += len(path['observations']) 49 | self._last_path_return = np.sum(path['rewards']) 50 | self._max_path_return = max(self._max_path_return, 51 | self._last_path_return) 52 | self._n_episodes += 1 53 | 54 | def log_diagnostics(self): 55 | logger.record_tabular('max-path-return', self._max_path_return) 56 | logger.record_tabular('last-path-return', self._last_path_return) 57 | logger.record_tabular('pool-size', self.pool.size) 58 | logger.record_tabular('episodes', self._n_episodes) 59 | logger.record_tabular('total-samples', self._total_samples) 60 | 61 | 62 | @ray.remote 63 | class _RemoteEnv(object): 64 | def __init__(self, env_pkl, policy_pkl): 65 | self._sess = tf_utils.create_session() 66 | self._sess.run(tf.global_variables_initializer()) 67 | 68 | self._env = pickle.loads(env_pkl) 69 | self._policy = pickle.loads(policy_pkl) 70 | 71 | if hasattr(self._env, 'initialize'): 72 | self._env.initialize() 73 | 74 | def rollout(self, policy_params, path_length): 75 | self._policy.set_param_values(policy_params) 76 | path = rollout(self._env, self._policy, path_length) 77 | 78 | return path 79 | 
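The RemoteSampler above submits a rollout to a Ray actor and then polls for it without blocking, so training can continue while the episode is collected. Below is a self-contained toy sketch of that polling pattern; the slow_rollout task and its path dict are illustrative stand-ins, not part of this repository.

import time
import ray

ray.init()

@ray.remote
def slow_rollout(path_length):
    # stand-in for the env.step() loop that _RemoteEnv.rollout runs
    time.sleep(1.0)
    return {'rewards': [1.0] * path_length}

pending = slow_rollout.remote(10)  # analogous to self._remote_environment.rollout.remote(...)
while True:
    ready, _ = ray.wait([pending], timeout=0)  # non-blocking check, as in RemoteSampler.sample()
    if ready:
        path = ray.get(pending)
        print('path return:', sum(path['rewards']))
        break
    time.sleep(0.1)  # a real trainer would run gradient updates here instead of sleeping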
-------------------------------------------------------------------------------- /misc/tf_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from rllab import config 3 | 4 | 5 | def get_default_session(): 6 | return tf.get_default_session() or create_session() 7 | 8 | 9 | def create_session(**kwargs): 10 | """ Create new tensorflow session with given configuration. """ 11 | if "config" not in kwargs: 12 | kwargs["config"] = get_configuration() 13 | return tf.InteractiveSession(**kwargs) 14 | 15 | 16 | def get_configuration(): 17 | """ Returns personal tensorflow configuration. """ 18 | if config.USE_GPU: 19 | raise NotImplementedError 20 | 21 | config_args = dict() 22 | return tf.ConfigProto(**config_args) 23 | -------------------------------------------------------------------------------- /misc/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import datetime 3 | import dateutil.tz 4 | import os 5 | import numpy as np 6 | 7 | PROJECT_PATH = os.path.dirname( 8 | os.path.realpath(os.path.join(__file__, '..', '..'))) 9 | 10 | def timestamp(): 11 | now = datetime.datetime.now(dateutil.tz.tzlocal()) 12 | return now.strftime('%Y-%m-%d-%H-%M-%S-%f-%Z') 13 | 14 | def deep_update(d, u): 15 | for k, v in u.items(): 16 | d[k] = ( 17 | deep_update(d.get(k, {}), v) 18 | if isinstance(v, collections.Mapping) 19 | else v) 20 | 21 | return d 22 | 23 | def get_git_rev(): 24 | try: 25 | import git 26 | repo = git.Repo(os.getcwd()) 27 | git_rev = repo.active_branch.commit.name_rev 28 | except: 29 | git_rev = None 30 | 31 | return git_rev 32 | 33 | def flatten(unflattened, parent_key='', separator='.'): 34 | items = [] 35 | for k, v in unflattened.items(): 36 | if separator in k: 37 | raise ValueError( 38 | "Found separator ({}) from key ({})".format(separator, k)) 39 | new_key = parent_key + separator + k if parent_key else k 40 | if isinstance(v, collections.MutableMapping) and v: 41 | items.extend(flatten(v, new_key, separator=separator).items()) 42 | else: 43 | items.append((new_key, v)) 44 | 45 | return dict(items) 46 | 47 | def unflatten(flattened, separator='.'): 48 | result = {} 49 | for key, value in flattened.items(): 50 | parts = key.split(separator) 51 | d = result 52 | for part in parts[:-1]: 53 | if part not in d: 54 | d[part] = {} 55 | d = d[part] 56 | d[parts[-1]] = value 57 | 58 | return result 59 | 60 | def concat_obs_z(obs, z, num_skills): 61 | """Concatenates the observation to a one-hot encoding of Z.""" 62 | assert np.isscalar(z) 63 | z_one_hot = np.zeros(num_skills) 64 | z_one_hot[z] = 1 65 | return np.hstack([obs, z_one_hot]) 66 | 67 | def split_aug_obs(aug_obs, num_skills): 68 | """Splits an augmented observation into the observation and Z.""" 69 | (obs, z_one_hot) = (aug_obs[:-num_skills], aug_obs[-num_skills:]) 70 | z = np.where(z_one_hot == 1)[0][0] 71 | return (obs, z) 72 | 73 | def _make_dir(filename): 74 | folder = os.path.dirname(filename) 75 | if not os.path.exists(folder): 76 | os.makedirs(folder) 77 | 78 | def _save_video(paths, filename): 79 | import cv2 80 | assert all(['ims' in path for path in paths]) 81 | ims = [im for path in paths for im in path['ims']] 82 | _make_dir(filename) 83 | 84 | # Define the codec and create VideoWriter object 85 | fourcc = cv2.VideoWriter_fourcc(*'MJPG') 86 | fps = 30.0 87 | (height, width, _) = ims[0].shape 88 | writer = cv2.VideoWriter(filename, fourcc, fps, (width, height)) 89 | for im in ims: 90 | 
writer.write(im) 91 | writer.release() 92 | 93 | def _softmax(x): 94 | max_x = np.max(x) 95 | exp_x = np.exp(x - max_x) 96 | return exp_x / np.sum(exp_x) 97 | -------------------------------------------------------------------------------- /mujoco_models/pusher_2d.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 72 | -------------------------------------------------------------------------------- /mujoco_models/simple_maze_ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 92 | -------------------------------------------------------------------------------- /policies/__init__.py: -------------------------------------------------------------------------------- 1 | from .nn_policy import NNPolicy 2 | from .nn_policy2 import NNPolicy2 3 | from .uniform_policy import UniformPolicy 4 | from .gaussian_policy import GaussianPolicy 5 | from .pointer_policy import GaussianPtrPolicy 6 | -------------------------------------------------------------------------------- /policies/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/base.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/base.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/gaussian_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/gaussian_policy.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/gmm.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/gmm.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/hierarchical_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/hierarchical_policy.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/latent_space_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/latent_space_policy.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/nn_policy.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/nn_policy.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/nn_policy2.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/nn_policy2.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/pointer_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/pointer_policy.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/uniform_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/uniform_policy.cpython-35.pyc -------------------------------------------------------------------------------- /policies/base.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from sandbox.rocky.tf.core.parameterized import Parameterized 5 | 6 | 7 | class Policy2(Parameterized): 8 | def __init__(self, env_spec): 9 | Parameterized.__init__(self) 10 | self._env_spec = env_spec 11 | 12 | # Should be implemented by all policies 13 | 14 | def get_action(self, observation, sub_level_actions): 15 | raise NotImplementedError 16 | 17 | def get_actions(self, observations,sub_level_actions): 18 | raise NotImplementedError 19 | 20 | def reset(self, dones=None): 21 | pass 22 | 23 | @property 24 | def vectorized(self): 25 | """ 26 | Indicates whether the policy is vectorized. If True, it should implement get_actions(), and support resetting 27 | with multiple simultaneous states. 28 | """ 29 | return False 30 | 31 | @property 32 | def observation_space(self): 33 | return self._env_spec.observation_space 34 | 35 | @property 36 | def action_space(self): 37 | return self._env_spec.action_space 38 | 39 | @property 40 | def env_spec(self): 41 | return self._env_spec 42 | 43 | @property 44 | def recurrent(self): 45 | """ 46 | Indicates whether the policy is recurrent. 47 | :return: 48 | """ 49 | return False 50 | 51 | def log_diagnostics(self, paths): 52 | """ 53 | Log extra information per iteration based on the collected paths 54 | """ 55 | pass 56 | 57 | @property 58 | def state_info_keys(self): 59 | """ 60 | Return keys for the information related to the policy's state when taking an action. 61 | :return: 62 | """ 63 | return [k for k, _ in self.state_info_specs] 64 | 65 | @property 66 | def state_info_specs(self): 67 | """ 68 | Return keys and shapes for the information related to the policy's state when taking an action. 
69 | :return: 70 | """ 71 | return list() 72 | 73 | def terminate(self): 74 | """ 75 | Clean up operation 76 | """ 77 | pass 78 | 79 | 80 | class StochasticPolicy(Policy2): 81 | @property 82 | def distribution(self): 83 | """ 84 | :rtype Distribution 85 | """ 86 | raise NotImplementedError 87 | 88 | def dist_info_sym(self, obs_var, state_info_vars): 89 | """ 90 | Return the symbolic distribution information about the actions. 91 | :param obs_var: symbolic variable for observations 92 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at 93 | the time it received the observation 94 | :return: 95 | """ 96 | raise NotImplementedError 97 | 98 | def dist_info(self, obs, state_infos): 99 | """ 100 | Return the distribution information about the actions. 101 | :param obs_var: observation values 102 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at 103 | the time it received the observation 104 | :return: 105 | """ 106 | raise NotImplementedError 107 | -------------------------------------------------------------------------------- /policies/nn_policy.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rllab.core.serializable import Serializable 4 | 5 | from rllab.misc.overrides import overrides 6 | from sandbox.rocky.tf.policies.base import Policy 7 | 8 | 9 | class NNPolicy(Policy, Serializable): 10 | def __init__(self, env_spec, observation_ph, actions, 11 | scope_name=None): 12 | Serializable.quick_init(self, locals()) 13 | 14 | self._observations_ph = observation_ph 15 | self._actions = actions 16 | self._scope_name = ( 17 | tf.get_variable_scope().name if not scope_name else scope_name 18 | ) 19 | super(NNPolicy, self).__init__(env_spec) 20 | 21 | @overrides 22 | def get_action(self, observation): 23 | """Sample single action based on the observations.""" 24 | return self.get_actions(observation[None])[0], {} 25 | 26 | @overrides 27 | def get_actions(self, observations): 28 | """Sample actions based on the observations.""" 29 | feed_dict = {self._observations_ph: observations} 30 | actions = tf.get_default_session().run(self._actions, feed_dict) 31 | return actions 32 | 33 | @overrides 34 | def log_diagnostics(self, paths): 35 | pass 36 | 37 | @overrides 38 | def get_params_internal(self, **tags): 39 | if tags: 40 | raise NotImplementedError 41 | scope = self._scope_name 42 | # Add "/" to 'scope' unless it's empty (otherwise get_collection will 43 | # return all parameters that start with 'scope'. 
44 | scope = scope if scope == '' else scope + '/' 45 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 46 | -------------------------------------------------------------------------------- /policies/nn_policy2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rllab.core.serializable import Serializable 4 | 5 | from rllab.misc.overrides import overrides 6 | from sac.policies.base import Policy2 7 | 8 | 9 | class NNPolicy2(Policy2, Serializable): 10 | def __init__(self, env_spec, observation_ph, actions, 11 | scope_name=None): 12 | Serializable.quick_init(self, locals()) 13 | 14 | self._observations_ph = observation_ph 15 | self._actions = actions 16 | self._scope_name = ( 17 | tf.get_variable_scope().name if not scope_name else scope_name 18 | ) 19 | super(NNPolicy2, self).__init__(env_spec) 20 | 21 | @overrides 22 | def get_action(self, observation,sub_level_actions): 23 | """Sample single action based on the observations.""" 24 | return self.get_actions(observation[None],sub_level_actions)[0], {} 25 | 26 | @overrides 27 | def get_actions(self, observations,sub_level_actions): 28 | """Sample actions based on the observations.""" 29 | feed_dict = {self._observations_ph: observations,self.sub_level_actions:sub_level_actions} 30 | actions = tf.get_default_session().run(self._actions, feed_dict) 31 | return actions 32 | 33 | @overrides 34 | def log_diagnostics(self, paths): 35 | pass 36 | 37 | @overrides 38 | def get_params_internal(self, **tags): 39 | if tags: 40 | raise NotImplementedError 41 | scope = self._scope_name 42 | # Add "/" to 'scope' unless it's empty (otherwise get_collection will 43 | # return all parameters that start with 'scope'. 44 | scope = scope if scope == '' else scope + '/' 45 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 46 | 47 | 48 | -------------------------------------------------------------------------------- /policies/uniform_policy.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | 3 | from rllab.misc.overrides import overrides 4 | from sac.policies.base import Policy2 5 | 6 | import numpy as np 7 | 8 | 9 | class UniformPolicy(Policy2, Serializable): 10 | """ 11 | Fixed policy that randomly samples actions uniformly at random. 12 | 13 | Used for an initial exploration period instead of an undertrained policy. 
14 | """ 15 | def __init__(self, env_spec): 16 | Serializable.quick_init(self, locals()) 17 | self._Da = env_spec.action_space.flat_dim 18 | 19 | super(UniformPolicy, self).__init__(env_spec) 20 | 21 | # Assumes action spaces are normalized to be the interval [-1, 1] 22 | @overrides 23 | def get_action(self, observation,sub_level_actions): 24 | return np.random.uniform(-1., 1., self._Da), None 25 | '''@overrides 26 | def get_action(self, observation,sub_level_actions): 27 | probs=np.random.uniform(0.0, 1., 4) 28 | probs=np.argmax(probs)#probs/sum(probs) 29 | #probs=np.array([1,0,0,0],dtype=np.float32) 30 | #probs=np.random.shuffle(probs) 31 | #actions_mean=np.sum(np.multiply(sub_level_actions[0],np.expand_dims(probs,2)),1) 32 | return sub_level_actions[0][0][probs], None 33 | #return np.random.uniform(-1., 1., self._Da), None''' 34 | 35 | @overrides 36 | def get_actions(self, observations,sub_level_actions): 37 | pass 38 | 39 | @overrides 40 | def log_diagnostics(self, paths): 41 | pass 42 | 43 | @overrides 44 | def get_params_internal(self, **tags): 45 | pass 46 | 47 | -------------------------------------------------------------------------------- /preprocessors/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlp_preprocessor import MLPPreprocessor 2 | -------------------------------------------------------------------------------- /preprocessors/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/preprocessors/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /preprocessors/__pycache__/mlp_preprocessor.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/preprocessors/__pycache__/mlp_preprocessor.cpython-35.pyc -------------------------------------------------------------------------------- /preprocessors/mlp_preprocessor.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rllab.core.serializable import Serializable 4 | 5 | from sandbox.rocky.tf.core.parameterized import Parameterized 6 | 7 | from sac.misc.mlp import MLPFunction 8 | from sac.misc import tf_utils 9 | 10 | class MLPPreprocessor(MLPFunction): 11 | def __init__(self, env_spec, layer_sizes=(128, 16), 12 | output_nonlinearity=None, name='observations_preprocessor'): 13 | 14 | Parameterized.__init__(self) 15 | Serializable.quick_init(self, locals()) 16 | 17 | self._name = name 18 | 19 | self._Do = env_spec.observation_space.flat_dim 20 | 21 | obs_ph = tf.placeholder( 22 | tf.float32, 23 | shape=(None, self._Do), 24 | name='observations', 25 | ) 26 | 27 | self._input_pls = (obs_ph, ) 28 | self._layer_sizes = layer_sizes 29 | self._output_nonlinearity = output_nonlinearity 30 | 31 | self._output_t = self.get_output_for(obs_ph, reuse=tf.AUTO_REUSE) 32 | -------------------------------------------------------------------------------- /primitive-policies/ant/bwrd/bwrd.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/ant/bwrd/bwrd.pkl -------------------------------------------------------------------------------- /primitive-policies/ant/dwrd/dwrd.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/ant/dwrd/dwrd.pkl -------------------------------------------------------------------------------- /primitive-policies/ant/fwrd/fwrd.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/ant/fwrd/fwrd.pkl -------------------------------------------------------------------------------- /primitive-policies/ant/uwrd/uwrd.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/ant/uwrd/uwrd.pkl -------------------------------------------------------------------------------- /primitive-policies/hc/fwd/fwd.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/hc/fwd/fwd.pkl -------------------------------------------------------------------------------- /primitive-policies/hc/jp-longz/jump.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/hc/jp-longz/jump.pkl -------------------------------------------------------------------------------- /primitive-policies/pusher/bottom/bottom.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/pusher/bottom/bottom.pkl -------------------------------------------------------------------------------- /primitive-policies/pusher/left/left.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/pusher/left/left.pkl -------------------------------------------------------------------------------- /replay_buffers/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_replay_buffer import SimpleReplayBuffer -------------------------------------------------------------------------------- /replay_buffers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/replay_buffers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /replay_buffers/__pycache__/replay_buffer.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/replay_buffers/__pycache__/replay_buffer.cpython-35.pyc -------------------------------------------------------------------------------- /replay_buffers/__pycache__/simple_replay_buffer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/replay_buffers/__pycache__/simple_replay_buffer.cpython-35.pyc -------------------------------------------------------------------------------- /replay_buffers/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class ReplayBuffer(object, metaclass=abc.ABCMeta): 5 | """ 6 | A class used to save and replay data. 7 | """ 8 | 9 | @abc.abstractmethod 10 | def add_sample(self, observation, action, reward, next_observation, 11 | terminal, **kwargs): 12 | """ 13 | Add a transition tuple. 14 | """ 15 | pass 16 | 17 | @abc.abstractmethod 18 | def terminate_episode(self): 19 | """ 20 | Let the replay buffer know that the episode has terminated in case some 21 | special book-keeping has to happen. 22 | :return: 23 | """ 24 | pass 25 | 26 | @property 27 | @abc.abstractmethod 28 | def size(self, **kwargs): 29 | """ 30 | :return: # of unique items that can be sampled. 31 | """ 32 | pass 33 | 34 | def add_path(self, path): 35 | """ 36 | Add a path to the replay buffer. 37 | 38 | This default implementation naively goes through every step, but you 39 | may want to optimize this. 40 | 41 | NOTE: You should NOT call "terminate_episode" after calling add_path. 42 | It's assumed that this function handles the episode termination. 43 | 44 | :param path: Dict like one outputted by railrl.samplers.util.rollout 45 | """ 46 | for i, ( 47 | obs, 48 | sub_level_actions, 49 | action, 50 | reward, 51 | next_obs, 52 | terminal, 53 | agent_info, 54 | env_info 55 | ) in enumerate(zip( 56 | path["observations"], 57 | path["sub_level_actions"], 58 | path["actions"], 59 | path["rewards"], 60 | path["next_observations"], 61 | path["terminals"], 62 | path["agent_infos"], 63 | path["env_infos"], 64 | )): 65 | self.add_sample( 66 | obs, 67 | sub_level_actions, 68 | action, 69 | reward, 70 | terminal, 71 | next_obs, 72 | agent_info=agent_info, 73 | env_info=env_info, 74 | ) 75 | self.terminate_episode() 76 | 77 | @abc.abstractmethod 78 | def random_batch(self, batch_size): 79 | """ 80 | Return a batch of size `batch_size`. 
81 | :param batch_size: 82 | :return: 83 | """ 84 | pass 85 | -------------------------------------------------------------------------------- /replay_buffers/simple_replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | 5 | from .replay_buffer import ReplayBuffer 6 | 7 | 8 | class SimpleReplayBuffer(ReplayBuffer, Serializable): 9 | def __init__(self, env_spec, max_replay_buffer_size, seq_len): 10 | super(SimpleReplayBuffer, self).__init__() 11 | Serializable.quick_init(self, locals()) 12 | 13 | max_replay_buffer_size = int(max_replay_buffer_size) 14 | 15 | self._env_spec = env_spec 16 | self._observation_dim = env_spec.observation_space.flat_dim 17 | self._action_dim = env_spec.action_space.flat_dim 18 | self._max_buffer_size = max_replay_buffer_size 19 | self._observations = np.zeros((max_replay_buffer_size, 20 | self._observation_dim)) 21 | # It's a bit memory inefficient to save the observations twice, 22 | # but it makes the code *much* easier since you no longer have to 23 | # worry about termination conditions. 24 | self._next_obs = np.zeros((max_replay_buffer_size, 25 | self._observation_dim)) 26 | self._sub_level_actions = np.zeros((max_replay_buffer_size,seq_len, self._action_dim)) 27 | self._sub_level_probs = np.zeros((max_replay_buffer_size,seq_len, 1)) 28 | self._actions = np.zeros((max_replay_buffer_size, self._action_dim)) 29 | self._rewards = np.zeros(max_replay_buffer_size) 30 | # self._terminals[i] = a terminal was received at time i 31 | self._terminals = np.zeros(max_replay_buffer_size, dtype='uint8') 32 | self._top = 0 33 | self._size = 0 34 | 35 | def add_sample(self, observation,sub_level_actions,sub_level_probs, action, reward, terminal, 36 | next_observation, **kwargs): 37 | self._observations[self._top] = observation 38 | self._sub_level_actions[self._top] = sub_level_actions 39 | self._sub_level_probs[self._top] = sub_level_probs 40 | self._actions[self._top] = action 41 | self._rewards[self._top] = reward 42 | self._terminals[self._top] = terminal 43 | self._next_obs[self._top] = next_observation 44 | 45 | self._advance() 46 | 47 | def terminate_episode(self): 48 | pass 49 | 50 | def _advance(self): 51 | self._top = (self._top + 1) % self._max_buffer_size 52 | if self._size < self._max_buffer_size: 53 | self._size += 1 54 | 55 | def random_batch(self, batch_size): 56 | indices = np.random.randint(0, self._size, batch_size) 57 | return dict( 58 | observations=self._observations[indices], 59 | sub_level_actions=self._sub_level_actions[indices], 60 | sub_level_probs=self._sub_level_probs[indices], 61 | actions=self._actions[indices], 62 | rewards=self._rewards[indices], 63 | terminals=self._terminals[indices], 64 | next_observations=self._next_obs[indices], 65 | ) 66 | 67 | @property 68 | def size(self): 69 | return self._size 70 | 71 | def __getstate__(self): 72 | d = super(SimpleReplayBuffer, self).__getstate__() 73 | d.update(dict( 74 | o=self._observations.tobytes(), 75 | sa=self._sub_level_actions.tobytes(), 76 | sp=self._sub_level_probs.tobytes(), 77 | a=self._actions.tobytes(), 78 | r=self._rewards.tobytes(), 79 | t=self._terminals.tobytes(), 80 | no=self._next_obs.tobytes(), 81 | top=self._top, 82 | size=self._size, 83 | )) 84 | return d 85 | 86 | def __setstate__(self, d): 87 | super(SimpleReplayBuffer, self).__setstate__(d) 88 | self._observations = np.fromstring(d['o']).reshape( 89 | self._max_buffer_size, -1 90 | ) 91 | 
self._next_obs = np.fromstring(d['no']).reshape( 92 | self._max_buffer_size, -1 93 | ) 94 | self._sub_level_actions = np.fromstring(d['sa']).reshape(self._max_buffer_size,seq_len, -1) 95 | self._sub_level_probs = np.fromstring(d['sp']).reshape(self._max_buffer_size,seq_len, -1) 96 | self._actions = np.fromstring(d['a']).reshape(self._max_buffer_size, -1) 97 | self._rewards = np.fromstring(d['r']).reshape(self._max_buffer_size) 98 | self._terminals = np.fromstring(d['t'], dtype=np.uint8) 99 | self._top = d['top'] 100 | self._size = d['size'] 101 | -------------------------------------------------------------------------------- /sandbox/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/algos/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/__pycache__/batch_polopt.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/algos/__pycache__/batch_polopt.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/__pycache__/npo.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/algos/__pycache__/npo.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/__pycache__/trpo.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/algos/__pycache__/trpo.cpython-35.pyc 
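A minimal usage sketch of the SimpleReplayBuffer defined above. It assumes rllab is installed and that this repository is importable as the sac package (as the other modules' imports suggest); the SimpleNamespace spec is a hypothetical stand-in for an rllab EnvSpec, exposing only the two fields the buffer reads.

import numpy as np
from types import SimpleNamespace
from sac.replay_buffers import SimpleReplayBuffer  # import path assumed from replay_buffers/__init__.py

spec = SimpleNamespace(
    observation_space=SimpleNamespace(flat_dim=4),
    action_space=SimpleNamespace(flat_dim=2))

pool = SimpleReplayBuffer(spec, max_replay_buffer_size=1000, seq_len=3)

for _ in range(100):
    pool.add_sample(
        observation=np.random.randn(4),
        sub_level_actions=np.random.uniform(-1, 1, (3, 2)),   # one action per sub-level primitive
        sub_level_probs=np.random.dirichlet(np.ones(3))[:, None],
        action=np.random.uniform(-1, 1, 2),
        reward=0.0,
        terminal=False,
        next_observation=np.random.randn(4))

batch = pool.random_batch(32)
print(batch['observations'].shape)       # (32, 4)
print(batch['sub_level_actions'].shape)  # (32, 3, 2)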
-------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/npg.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/npo.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.misc import ext 5 | from rllab.misc.overrides import overrides 6 | import rllab.misc.logger as logger 7 | from sandbox.rocky.tf.optimizers.penalty_lbfgs_optimizer import PenaltyLbfgsOptimizer 8 | from sandbox.rocky.tf.algos.batch_polopt import BatchPolopt 9 | from sandbox.rocky.tf.misc import tensor_utils 10 | import tensorflow as tf 11 | 12 | 13 | class NPO(BatchPolopt): 14 | """ 15 | Natural Policy Optimization. 16 | """ 17 | 18 | def __init__( 19 | self, 20 | optimizer=None, 21 | optimizer_args=None, 22 | step_size=0.01, 23 | **kwargs): 24 | if optimizer is None: 25 | if optimizer_args is None: 26 | optimizer_args = dict() 27 | optimizer = PenaltyLbfgsOptimizer(**optimizer_args) 28 | self.optimizer = optimizer 29 | self.step_size = step_size 30 | super(NPO, self).__init__(**kwargs) 31 | 32 | @overrides 33 | def init_opt(self): 34 | is_recurrent = int(self.policy.recurrent) 35 | obs_var = self.env.observation_space.new_tensor_variable( 36 | 'obs', 37 | extra_dims=1 + is_recurrent, 38 | ) 39 | action_var = self.env.action_space.new_tensor_variable( 40 | 'action', 41 | extra_dims=1 + is_recurrent, 42 | ) 43 | advantage_var = tensor_utils.new_tensor( 44 | 'advantage', 45 | ndim=1 + is_recurrent, 46 | dtype=tf.float32, 47 | ) 48 | dist = self.policy.distribution 49 | 50 | old_dist_info_vars = { 51 | k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k) 52 | for k, shape in dist.dist_info_specs 53 | } 54 | old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys] 55 | 56 | state_info_vars = { 57 | k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k) 58 | for k, shape in self.policy.state_info_specs 59 | } 60 | state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys] 61 | 62 | if is_recurrent: 63 | valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid") 64 | else: 65 | valid_var = None 66 | 67 | dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) 68 | kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) 69 | lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars) 70 | if is_recurrent: 71 | mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var) 72 | surr_loss = - tf.reduce_sum(lr * advantage_var * valid_var) / tf.reduce_sum(valid_var) 73 | else: 74 | mean_kl = tf.reduce_mean(kl) 75 | surr_loss = - tf.reduce_mean(lr * advantage_var) 76 | 77 | input_list = [ 78 | obs_var, 79 | action_var, 80 | advantage_var, 81 | ] + state_info_vars_list + old_dist_info_vars_list 82 | if is_recurrent: 83 | input_list.append(valid_var) 84 | 85 | self.optimizer.update_opt( 86 | loss=surr_loss, 87 | target=self.policy, 88 | leq_constraint=(mean_kl, self.step_size), 89 | inputs=input_list, 90 | constraint_name="mean_kl" 91 | ) 92 | return dict() 93 | 94 | @overrides 95 | def optimize_policy(self, itr, samples_data): 96 | all_input_values = tuple(ext.extract( 97 | samples_data, 98 | "observations", "actions", "advantages" 99 | )) 100 | agent_infos = samples_data["agent_infos"] 101 | state_info_list = 
[agent_infos[k] for k in self.policy.state_info_keys] 102 | dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] 103 | all_input_values += tuple(state_info_list) + tuple(dist_info_list) 104 | if self.policy.recurrent: 105 | all_input_values += (samples_data["valids"],) 106 | logger.log("Computing loss before") 107 | loss_before = self.optimizer.loss(all_input_values) 108 | logger.log("Computing KL before") 109 | mean_kl_before = self.optimizer.constraint_val(all_input_values) 110 | logger.log("Optimizing") 111 | self.optimizer.optimize(all_input_values) 112 | logger.log("Computing KL after") 113 | mean_kl = self.optimizer.constraint_val(all_input_values) 114 | logger.log("Computing loss after") 115 | loss_after = self.optimizer.loss(all_input_values) 116 | logger.record_tabular('LossBefore', loss_before) 117 | logger.record_tabular('LossAfter', loss_after) 118 | logger.record_tabular('MeanKLBefore', mean_kl_before) 119 | logger.record_tabular('MeanKL', mean_kl) 120 | logger.record_tabular('dLoss', loss_before - loss_after) 121 | return dict() 122 | 123 | @overrides 124 | def get_itr_snapshot(self, itr, samples_data): 125 | return dict( 126 | itr=itr, 127 | policy=self.policy, 128 | baseline=self.baseline, 129 | env=self.env, 130 | ) 131 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/trpo.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from sandbox.rocky.tf.algos.npo import NPO 4 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 5 | 6 | 7 | class TRPO(NPO): 8 | """ 9 | Trust Region Policy Optimization 10 | """ 11 | 12 | def __init__( 13 | self, 14 | optimizer=None, 15 | optimizer_args=None, 16 | **kwargs): 17 | if optimizer is None: 18 | if optimizer_args is None: 19 | optimizer_args = dict() 20 | optimizer = ConjugateGradientOptimizer(**optimizer_args) 21 | super(TRPO, self).__init__(optimizer=optimizer, **kwargs) 22 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/vpg.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from rllab.misc import logger 4 | from rllab.misc import ext 5 | from rllab.misc.overrides import overrides 6 | from sandbox.rocky.tf.algos.batch_polopt import BatchPolopt 7 | from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer 8 | from sandbox.rocky.tf.misc import tensor_utils 9 | from rllab.core.serializable import Serializable 10 | import tensorflow as tf 11 | 12 | 13 | class VPG(BatchPolopt, Serializable): 14 | """ 15 | Vanilla Policy Gradient. 
16 | """ 17 | 18 | def __init__( 19 | self, 20 | env, 21 | policy, 22 | baseline, 23 | optimizer=None, 24 | optimizer_args=None, 25 | **kwargs): 26 | Serializable.quick_init(self, locals()) 27 | if optimizer is None: 28 | default_args = dict( 29 | batch_size=None, 30 | max_epochs=1, 31 | ) 32 | if optimizer_args is None: 33 | optimizer_args = default_args 34 | else: 35 | optimizer_args = dict(default_args, **optimizer_args) 36 | optimizer = FirstOrderOptimizer(**optimizer_args) 37 | self.optimizer = optimizer 38 | self.opt_info = None 39 | super(VPG, self).__init__(env=env, policy=policy, baseline=baseline, **kwargs) 40 | 41 | @overrides 42 | def init_opt(self): 43 | is_recurrent = int(self.policy.recurrent) 44 | 45 | obs_var = self.env.observation_space.new_tensor_variable( 46 | 'obs', 47 | extra_dims=1 + is_recurrent, 48 | ) 49 | action_var = self.env.action_space.new_tensor_variable( 50 | 'action', 51 | extra_dims=1 + is_recurrent, 52 | ) 53 | advantage_var = tensor_utils.new_tensor( 54 | name='advantage', 55 | ndim=1 + is_recurrent, 56 | dtype=tf.float32, 57 | ) 58 | dist = self.policy.distribution 59 | 60 | old_dist_info_vars = { 61 | k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k) 62 | for k, shape in dist.dist_info_specs 63 | } 64 | old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys] 65 | 66 | state_info_vars = { 67 | k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k) 68 | for k, shape in self.policy.state_info_specs 69 | } 70 | state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys] 71 | 72 | if is_recurrent: 73 | valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid") 74 | else: 75 | valid_var = None 76 | 77 | dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) 78 | logli = dist.log_likelihood_sym(action_var, dist_info_vars) 79 | kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) 80 | 81 | # formulate as a minimization problem 82 | # The gradient of the surrogate objective is the policy gradient 83 | if is_recurrent: 84 | surr_obj = - tf.reduce_sum(logli * advantage_var * valid_var) / tf.reduce_sum(valid_var) 85 | mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var) 86 | max_kl = tf.reduce_max(kl * valid_var) 87 | else: 88 | surr_obj = - tf.reduce_mean(logli * advantage_var) 89 | mean_kl = tf.reduce_mean(kl) 90 | max_kl = tf.reduce_max(kl) 91 | 92 | input_list = [obs_var, action_var, advantage_var] + state_info_vars_list 93 | if is_recurrent: 94 | input_list.append(valid_var) 95 | 96 | self.optimizer.update_opt(loss=surr_obj, target=self.policy, inputs=input_list) 97 | 98 | f_kl = tensor_utils.compile_function( 99 | inputs=input_list + old_dist_info_vars_list, 100 | outputs=[mean_kl, max_kl], 101 | ) 102 | self.opt_info = dict( 103 | f_kl=f_kl, 104 | ) 105 | 106 | @overrides 107 | def optimize_policy(self, itr, samples_data): 108 | logger.log("optimizing policy") 109 | inputs = ext.extract( 110 | samples_data, 111 | "observations", "actions", "advantages" 112 | ) 113 | agent_infos = samples_data["agent_infos"] 114 | state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] 115 | inputs += tuple(state_info_list) 116 | if self.policy.recurrent: 117 | inputs += (samples_data["valids"],) 118 | dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] 119 | loss_before = self.optimizer.loss(inputs) 120 | self.optimizer.optimize(inputs) 121 | loss_after = 
self.optimizer.loss(inputs) 122 | logger.record_tabular("LossBefore", loss_before) 123 | logger.record_tabular("LossAfter", loss_after) 124 | 125 | mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list)) 126 | logger.record_tabular('MeanKL', mean_kl) 127 | logger.record_tabular('MaxKL', max_kl) 128 | 129 | @overrides 130 | def get_itr_snapshot(self, itr, samples_data): 131 | return dict( 132 | itr=itr, 133 | policy=self.policy, 134 | baseline=self.baseline, 135 | env=self.env, 136 | ) 137 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/core/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__pycache__/layers.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/core/__pycache__/layers.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__pycache__/layers_powered.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/core/__pycache__/layers_powered.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__pycache__/network.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/core/__pycache__/network.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__pycache__/parameterized.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/core/__pycache__/parameterized.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/layers_powered.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.core.parameterized import Parameterized 2 | import sandbox.rocky.tf.core.layers as L 3 | import itertools 4 | 5 | 6 | class LayersPowered(Parameterized): 7 | 8 | def __init__(self, output_layers, input_layers=None): 9 | self._output_layers = output_layers 10 | self._input_layers = input_layers 11 | Parameterized.__init__(self) 12 | 13 | def get_params_internal(self, **tags): 14 | layers = L.get_all_layers(self._output_layers, treat_as_input=self._input_layers) 15 | params = itertools.chain.from_iterable(l.get_params(**tags) for l in layers) 16 | return L.unique(params) 17 | 18 | 
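VPG.init_opt above builds the surrogate objective -mean(log pi(a|s) * advantage) and notes that its gradient is the policy gradient. The toy numpy check below illustrates that identity for a state-independent softmax policy; all names and numbers are illustrative and not part of the repository.

import numpy as np

theta = np.array([0.2, -0.1, 0.4])            # logits of a toy 3-action softmax policy
actions = np.array([0, 2, 1, 2])              # sampled actions
advantages = np.array([1.0, -0.5, 2.0, 0.3])  # their advantage estimates

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def surrogate(th):
    p = softmax(th)
    return -np.mean(np.log(p[actions]) * advantages)

# analytic gradient of the surrogate: d log pi(a) / d theta_k = 1{k == a} - pi_k
pi = softmax(theta)
grad = np.zeros_like(theta)
for a, adv in zip(actions, advantages):
    dlogpi = -pi.copy()
    dlogpi[a] += 1.0
    grad += -adv * dlogpi
grad /= len(actions)

# central finite differences on the surrogate recover the same gradient
eps = 1e-6
fd = np.array([(surrogate(theta + eps * np.eye(3)[k])
                - surrogate(theta - eps * np.eye(3)[k])) / (2 * eps) for k in range(3)])
print(np.allclose(grad, fd))  # True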
-------------------------------------------------------------------------------- /sandbox/rocky/tf/core/parameterized.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.misc.tensor_utils import flatten_tensors, unflatten_tensors 5 | import tensorflow as tf 6 | 7 | 8 | load_params = True 9 | 10 | @contextmanager 11 | def suppress_params_loading(): 12 | global load_params 13 | load_params = False 14 | yield 15 | load_params = True 16 | 17 | 18 | class Parameterized(object): 19 | def __init__(self): 20 | self._cached_params = {} 21 | self._cached_param_dtypes = {} 22 | self._cached_param_shapes = {} 23 | self._cached_assign_ops = {} 24 | self._cached_assign_placeholders = {} 25 | 26 | def get_params_internal(self, **tags): 27 | """ 28 | Internal method to be implemented which does not perform caching 29 | """ 30 | raise NotImplementedError 31 | 32 | def get_params(self, **tags): 33 | """ 34 | Get the list of parameters, filtered by the provided tags. 35 | Some common tags include 'regularizable' and 'trainable' 36 | """ 37 | tag_tuple = tuple(sorted(list(tags.items()), key=lambda x: x[0])) 38 | if tag_tuple not in self._cached_params: 39 | self._cached_params[tag_tuple] = self.get_params_internal(**tags) 40 | return self._cached_params[tag_tuple] 41 | 42 | def get_param_dtypes(self, **tags): 43 | tag_tuple = tuple(sorted(list(tags.items()), key=lambda x: x[0])) 44 | if tag_tuple not in self._cached_param_dtypes: 45 | params = self.get_params(**tags) 46 | param_values = tf.get_default_session().run(params) 47 | self._cached_param_dtypes[tag_tuple] = [val.dtype for val in param_values] 48 | return self._cached_param_dtypes[tag_tuple] 49 | 50 | def get_param_shapes(self, **tags): 51 | tag_tuple = tuple(sorted(list(tags.items()), key=lambda x: x[0])) 52 | if tag_tuple not in self._cached_param_shapes: 53 | params = self.get_params(**tags) 54 | param_values = tf.get_default_session().run(params) 55 | self._cached_param_shapes[tag_tuple] = [val.shape for val in param_values] 56 | return self._cached_param_shapes[tag_tuple] 57 | 58 | def get_param_values(self, **tags): 59 | params = self.get_params(**tags) 60 | param_values = tf.get_default_session().run(params) 61 | return flatten_tensors(param_values) 62 | 63 | def set_param_values(self, flattened_params, **tags): 64 | debug = tags.pop("debug", False) 65 | param_values = unflatten_tensors( 66 | flattened_params, self.get_param_shapes(**tags)) 67 | ops = [] 68 | feed_dict = dict() 69 | for param, dtype, value in zip( 70 | self.get_params(**tags), 71 | self.get_param_dtypes(**tags), 72 | param_values): 73 | if param not in self._cached_assign_ops: 74 | assign_placeholder = tf.placeholder(dtype=param.dtype.base_dtype) 75 | assign_op = tf.assign(param, assign_placeholder) 76 | self._cached_assign_ops[param] = assign_op 77 | self._cached_assign_placeholders[param] = assign_placeholder 78 | ops.append(self._cached_assign_ops[param]) 79 | feed_dict[self._cached_assign_placeholders[param]] = value.astype(dtype) 80 | if debug: 81 | print("setting value of %s" % param.name) 82 | tf.get_default_session().run(ops, feed_dict=feed_dict) 83 | 84 | def flat_to_params(self, flattened_params, **tags): 85 | return unflatten_tensors(flattened_params, self.get_param_shapes(**tags)) 86 | 87 | def __getstate__(self): 88 | d = Serializable.__getstate__(self) 89 | global load_params 90 | if load_params: 91 | d["params"] = 
self.get_param_values() 92 | return d 93 | 94 | def __setstate__(self, d): 95 | Serializable.__setstate__(self, d) 96 | global load_params 97 | if load_params: 98 | tf.get_default_session().run(tf.variables_initializer(self.get_params())) 99 | self.set_param_values(d["params"]) 100 | 101 | 102 | class JointParameterized(Parameterized): 103 | def __init__(self, components): 104 | super(JointParameterized, self).__init__() 105 | self.components = components 106 | 107 | def get_params_internal(self, **tags): 108 | params = [param for comp in self.components for param in comp.get_params_internal(**tags)] 109 | # only return unique parameters 110 | return sorted(set(params), key=hash) 111 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/distributions/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/__pycache__/base.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/distributions/__pycache__/base.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/__pycache__/diagonal_gaussian.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/distributions/__pycache__/diagonal_gaussian.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/base.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | class Distribution(object): 6 | @property 7 | def dim(self): 8 | raise NotImplementedError 9 | 10 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 11 | """ 12 | Compute the symbolic KL divergence of two distributions 13 | """ 14 | raise NotImplementedError 15 | 16 | def kl(self, old_dist_info, new_dist_info): 17 | """ 18 | Compute the KL divergence of two distributions 19 | """ 20 | raise NotImplementedError 21 | 22 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 23 | raise NotImplementedError 24 | 25 | def entropy(self, dist_info): 26 | raise NotImplementedError 27 | 28 | def log_likelihood_sym(self, x_var, dist_info_vars): 29 | raise NotImplementedError 30 | 31 | def log_likelihood(self, xs, dist_info): 32 | raise NotImplementedError 33 | 34 | @property 35 | def dist_info_specs(self): 36 | raise NotImplementedError 37 | 38 | @property 39 | def dist_info_keys(self): 40 | return [k for k, _ in self.dist_info_specs] 41 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/bernoulli.py: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | from .base import Distribution 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | TINY = 1e-8 8 | 9 | 10 | class Bernoulli(Distribution): 11 | def __init__(self, dim): 12 | self._dim = dim 13 | 14 | @property 15 | def dim(self): 16 | return self._dim 17 | 18 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 19 | old_p = old_dist_info_vars["p"] 20 | new_p = new_dist_info_vars["p"] 21 | kl = old_p * (tf.log(old_p + TINY) - tf.log(new_p + TINY)) + \ 22 | (1 - old_p) * (tf.log(1 - old_p + TINY) - tf.log(1 - new_p + TINY)) 23 | ndims = kl.get_shape().ndims 24 | return tf.reduce_sum(kl, axis=ndims - 1) 25 | 26 | def kl(self, old_dist_info, new_dist_info): 27 | old_p = old_dist_info["p"] 28 | new_p = new_dist_info["p"] 29 | kl = old_p * (np.log(old_p + TINY) - np.log(new_p + TINY)) + \ 30 | (1 - old_p) * (np.log(1 - old_p + TINY) - np.log(1 - new_p + TINY)) 31 | return np.sum(kl, axis=-1) 32 | 33 | def sample(self, dist_info): 34 | p = np.asarray(dist_info["p"]) 35 | return np.cast['int'](np.random.uniform(low=0., high=1., size=p.shape) < p) 36 | 37 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 38 | old_p = old_dist_info_vars["p"] 39 | new_p = new_dist_info_vars["p"] 40 | ndims = old_p.get_shape().ndims 41 | return tf.reduce_prod(x_var * new_p / (old_p + TINY) + (1 - x_var) * (1 - new_p) / (1 - old_p + TINY), 42 | axis=ndims - 1) 43 | 44 | def log_likelihood_sym(self, x_var, dist_info_vars): 45 | p = dist_info_vars["p"] 46 | ndims = p.get_shape().ndims 47 | return tf.reduce_sum(x_var * tf.log(p + TINY) + (1 - x_var) * tf.log(1 - p + TINY), axis=ndims - 1) 48 | 49 | def log_likelihood(self, xs, dist_info): 50 | p = dist_info["p"] 51 | return np.sum(xs * np.log(p + TINY) + (1 - xs) * np.log(1 - p + TINY), axis=-1) 52 | 53 | def entropy(self, dist_info): 54 | p = dist_info["p"] 55 | return np.sum(- p * np.log(p + TINY) - (1 - p) * np.log(1 - p + TINY), axis=-1) 56 | 57 | @property 58 | def dist_info_keys(self): 59 | return ["p"] 60 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/categorical.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .base import Distribution 3 | import tensorflow as tf 4 | from sandbox.rocky.tf.misc import tensor_utils 5 | 6 | TINY = 1e-8 7 | 8 | 9 | def from_onehot(x_var): 10 | ret = np.zeros((len(x_var),), 'int32') 11 | nonzero_n, nonzero_a = np.nonzero(x_var) 12 | ret[nonzero_n] = nonzero_a 13 | return ret 14 | 15 | 16 | class Categorical(Distribution): 17 | def __init__(self, dim): 18 | self._dim = dim 19 | weights_var = tf.placeholder( 20 | dtype=tf.float32, 21 | shape=(None, dim), 22 | name="weights" 23 | ) 24 | self._f_sample = tensor_utils.compile_function( 25 | inputs=[weights_var], 26 | outputs=tf.multinomial(tf.log(weights_var + 1e-8), num_samples=1)[:, 0], 27 | ) 28 | 29 | @property 30 | def dim(self): 31 | return self._dim 32 | 33 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 34 | """ 35 | Compute the symbolic KL divergence of two categorical distributions 36 | """ 37 | old_prob_var = old_dist_info_vars["prob"] 38 | new_prob_var = new_dist_info_vars["prob"] 39 | ndims = old_prob_var.get_shape().ndims 40 | # Assume layout is N * A 41 | return tf.reduce_sum( 42 | old_prob_var * (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)), 43 | axis=ndims - 1 44 | ) 45 | 46 | 
def kl(self, old_dist_info, new_dist_info): 47 | """ 48 | Compute the KL divergence of two categorical distributions 49 | """ 50 | old_prob = old_dist_info["prob"] 51 | new_prob = new_dist_info["prob"] 52 | return np.sum( 53 | old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)), 54 | axis=-1 55 | ) 56 | 57 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 58 | old_prob_var = old_dist_info_vars["prob"] 59 | new_prob_var = new_dist_info_vars["prob"] 60 | ndims = old_prob_var.get_shape().ndims 61 | x_var = tf.cast(x_var, tf.float32) 62 | # Assume layout is N * A 63 | return (tf.reduce_sum(new_prob_var * x_var, ndims - 1) + TINY) / \ 64 | (tf.reduce_sum(old_prob_var * x_var, ndims - 1) + TINY) 65 | 66 | def entropy_sym(self, dist_info_vars): 67 | probs = dist_info_vars["prob"] 68 | return -tf.reduce_sum(probs * tf.log(probs + TINY), axis=1) 69 | 70 | def cross_entropy_sym(self, old_dist_info_vars, new_dist_info_vars): 71 | old_prob_var = old_dist_info_vars["prob"] 72 | new_prob_var = new_dist_info_vars["prob"] 73 | ndims = old_prob_var.get_shape().ndims 74 | # Assume layout is N * A 75 | return tf.reduce_sum( 76 | old_prob_var * (- tf.log(new_prob_var + TINY)), 77 | axis=ndims - 1 78 | ) 79 | 80 | def entropy(self, info): 81 | probs = info["prob"] 82 | return -np.sum(probs * np.log(probs + TINY), axis=1) 83 | 84 | def log_likelihood_sym(self, x_var, dist_info_vars): 85 | probs = dist_info_vars["prob"] 86 | ndims = probs.get_shape().ndims 87 | return tf.log(tf.reduce_sum(probs * tf.cast(x_var, tf.float32), ndims - 1) + TINY) 88 | 89 | def log_likelihood(self, xs, dist_info): 90 | probs = dist_info["prob"] 91 | # Assume layout is N * A 92 | return np.log(np.sum(probs * xs, axis=-1) + TINY) 93 | 94 | @property 95 | def dist_info_specs(self): 96 | return [("prob", (self.dim,))] 97 | 98 | def sample(self, dist_info): 99 | return self._f_sample(dist_info["prob"]) 100 | 101 | def sample_sym(self, dist_info): 102 | probs = dist_info["prob"] 103 | samples = tf.multinomial(tf.log(probs + 1e-8), num_samples=1)[:, 0] 104 | return tf.nn.embedding_lookup(np.eye(self.dim, dtype=np.float32), samples) 105 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | from sandbox.rocky.tf.distributions.base import Distribution 7 | 8 | 9 | class DiagonalGaussian(Distribution): 10 | def __init__(self, dim): 11 | self._dim = dim 12 | 13 | @property 14 | def dim(self): 15 | return self._dim 16 | 17 | def kl(self, old_dist_info, new_dist_info): 18 | old_means = old_dist_info["mean"] 19 | old_log_stds = old_dist_info["log_std"] 20 | new_means = new_dist_info["mean"] 21 | new_log_stds = new_dist_info["log_std"] 22 | """ 23 | Compute the KL divergence of two multivariate Gaussian distribution with 24 | diagonal covariance matrices 25 | """ 26 | old_std = np.exp(old_log_stds) 27 | new_std = np.exp(new_log_stds) 28 | # means: (N*A) 29 | # std: (N*A) 30 | # formula: 31 | # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) + 32 | # ln(\sigma_2/\sigma_1) 33 | numerator = np.square(old_means - new_means) + \ 34 | np.square(old_std) - np.square(new_std) 35 | denominator = 2 * np.square(new_std) + 1e-8 36 | return np.sum( 37 | numerator / denominator + new_log_stds - old_log_stds, axis=-1) 38 | # more lossy version 39 | # return TT.sum( 40 | # numerator 
/ denominator + TT.log(new_std) - TT.log(old_std ), axis=-1) 41 | 42 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 43 | old_means = old_dist_info_vars["mean"] 44 | old_log_stds = old_dist_info_vars["log_std"] 45 | new_means = new_dist_info_vars["mean"] 46 | new_log_stds = new_dist_info_vars["log_std"] 47 | """ 48 | Compute the KL divergence of two multivariate Gaussian distribution with 49 | diagonal covariance matrices 50 | """ 51 | old_std = tf.exp(old_log_stds) 52 | new_std = tf.exp(new_log_stds) 53 | # means: (N*A) 54 | # std: (N*A) 55 | # formula: 56 | # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) + 57 | # ln(\sigma_2/\sigma_1) 58 | numerator = tf.square(old_means - new_means) + \ 59 | tf.square(old_std) - tf.square(new_std) 60 | denominator = 2 * tf.square(new_std) + 1e-8 61 | return tf.reduce_sum( 62 | numerator / denominator + new_log_stds - old_log_stds, axis=-1) 63 | 64 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 65 | logli_new = self.log_likelihood_sym(x_var, new_dist_info_vars) 66 | logli_old = self.log_likelihood_sym(x_var, old_dist_info_vars) 67 | return tf.exp(logli_new - logli_old) 68 | 69 | def log_likelihood_sym(self, x_var, dist_info_vars): 70 | means = dist_info_vars["mean"] 71 | log_stds = dist_info_vars["log_std"] 72 | zs = (x_var - means) / tf.exp(log_stds) 73 | return - tf.reduce_sum(log_stds, axis=-1) - \ 74 | 0.5 * tf.reduce_sum(tf.square(zs), axis=-1) - \ 75 | 0.5 * self.dim * np.log(2 * np.pi) 76 | 77 | def sample(self, dist_info): 78 | means = dist_info["mean"] 79 | log_stds = dist_info["log_std"] 80 | rnd = np.random.normal(size=means.shape) 81 | return rnd * np.exp(log_stds) + means 82 | 83 | def log_likelihood(self, xs, dist_info): 84 | means = dist_info["mean"] 85 | log_stds = dist_info["log_std"] 86 | zs = (xs - means) / np.exp(log_stds) 87 | return - np.sum(log_stds, axis=-1) - \ 88 | 0.5 * np.sum(np.square(zs), axis=-1) - \ 89 | 0.5 * self.dim * np.log(2 * np.pi) 90 | 91 | def entropy(self, dist_info): 92 | log_stds = dist_info["log_std"] 93 | return np.sum(log_stds + np.log(np.sqrt(2 * np.pi * np.e)), axis=-1) 94 | 95 | @property 96 | def dist_info_specs(self): 97 | return [("mean", (self.dim,)), ("log_std", (self.dim,))] 98 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/recurrent_categorical.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from sandbox.rocky.tf.distributions.categorical import Categorical 4 | from sandbox.rocky.tf.distributions.base import Distribution 5 | 6 | TINY = 1e-8 7 | 8 | 9 | class RecurrentCategorical(Distribution): 10 | def __init__(self, dim): 11 | self._cat = Categorical(dim) 12 | self._dim = dim 13 | 14 | @property 15 | def dim(self): 16 | return self._dim 17 | 18 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 19 | """ 20 | Compute the symbolic KL divergence of two categorical distributions 21 | """ 22 | old_prob_var = old_dist_info_vars["prob"] 23 | new_prob_var = new_dist_info_vars["prob"] 24 | # Assume layout is N * T * A 25 | return tf.reduce_sum( 26 | old_prob_var * (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)), 27 | axis=2 28 | ) 29 | 30 | def kl(self, old_dist_info, new_dist_info): 31 | """ 32 | Compute the KL divergence of two categorical distributions 33 | """ 34 | old_prob = old_dist_info["prob"] 35 | new_prob = new_dist_info["prob"] 36 | return np.sum( 37 | 
old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)), 38 | axis=2 39 | ) 40 | 41 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 42 | old_prob_var = old_dist_info_vars["prob"] 43 | new_prob_var = new_dist_info_vars["prob"] 44 | # Assume layout is N * T * A 45 | a_dim = tf.shape(x_var)[2] 46 | flat_ratios = self._cat.likelihood_ratio_sym( 47 | tf.reshape(x_var, tf.stack([-1, a_dim])), 48 | dict(prob=tf.reshape(old_prob_var, tf.stack([-1, a_dim]))), 49 | dict(prob=tf.reshape(new_prob_var, tf.stack([-1, a_dim]))) 50 | ) 51 | return tf.reshape(flat_ratios, tf.shape(old_prob_var)[:2]) 52 | 53 | def entropy(self, dist_info): 54 | probs = dist_info["prob"] 55 | return -np.sum(probs * np.log(probs + TINY), axis=2) 56 | 57 | def entropy_sym(self, dist_info_vars): 58 | probs = dist_info_vars["prob"] 59 | return -tf.reduce_sum(probs * tf.log(probs + TINY), 2) 60 | 61 | def log_likelihood_sym(self, xs, dist_info_vars): 62 | probs = dist_info_vars["prob"] 63 | # Assume layout is N * T * A 64 | a_dim = tf.shape(probs)[2] 65 | # a_dim = TT.printing.Print("lala")(a_dim) 66 | flat_logli = self._cat.log_likelihood_sym( 67 | tf.reshape(xs, tf.stack([-1, a_dim])), 68 | dict(prob=tf.reshape(probs, tf.stack((-1, a_dim)))) 69 | ) 70 | return tf.reshape(flat_logli, tf.shape(probs)[:2]) 71 | 72 | def log_likelihood(self, xs, dist_info): 73 | probs = dist_info["prob"] 74 | # Assume layout is N * T * A 75 | a_dim = tf.shape(probs)[2] 76 | flat_logli = self._cat.log_likelihood_sym( 77 | xs.reshape((-1, a_dim)), 78 | dict(prob=probs.reshape((-1, a_dim))) 79 | ) 80 | return flat_logli.reshape(probs.shape[:2]) 81 | 82 | @property 83 | def dist_info_specs(self): 84 | return [("prob", (self.dim,))] 85 | 86 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/recurrent_diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from sandbox.rocky.tf.distributions.diagonal_gaussian import DiagonalGaussian 5 | 6 | RecurrentDiagonalGaussian = DiagonalGaussian 7 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/envs/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/__pycache__/base.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/envs/__pycache__/base.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/__pycache__/parallel_vec_env_executor.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/envs/__pycache__/parallel_vec_env_executor.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/__pycache__/vec_env_executor.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/envs/__pycache__/vec_env_executor.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/base.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.proxy_env import ProxyEnv 2 | from rllab.envs.base import EnvSpec 3 | from rllab.spaces.box import Box as TheanoBox 4 | from rllab.spaces.discrete import Discrete as TheanoDiscrete 5 | from rllab.spaces.product import Product as TheanoProduct 6 | from sandbox.rocky.tf.spaces.discrete import Discrete 7 | from sandbox.rocky.tf.spaces.box import Box 8 | from sandbox.rocky.tf.spaces.product import Product 9 | from cached_property import cached_property 10 | 11 | 12 | def to_tf_space(space): 13 | if isinstance(space, TheanoBox): 14 | return Box(low=space.low, high=space.high) 15 | elif isinstance(space, TheanoDiscrete): 16 | return Discrete(space.n) 17 | elif isinstance(space, TheanoProduct): 18 | return Product(list(map(to_tf_space, space.components))) 19 | else: 20 | raise NotImplementedError 21 | 22 | 23 | class WrappedCls(object): 24 | def __init__(self, cls, env_cls, extra_kwargs): 25 | self.cls = cls 26 | self.env_cls = env_cls 27 | self.extra_kwargs = extra_kwargs 28 | 29 | def __call__(self, *args, **kwargs): 30 | return self.cls(self.env_cls(*args, **dict(self.extra_kwargs, **kwargs))) 31 | 32 | 33 | class TfEnv(ProxyEnv): 34 | @cached_property 35 | def observation_space(self): 36 | return to_tf_space(self.wrapped_env.observation_space) 37 | 38 | @cached_property 39 | def action_space(self): 40 | return to_tf_space(self.wrapped_env.action_space) 41 | 42 | @cached_property 43 | def spec(self): 44 | return EnvSpec( 45 | observation_space=self.observation_space, 46 | action_space=self.action_space, 47 | ) 48 | 49 | @property 50 | def vectorized(self): 51 | return getattr(self.wrapped_env, "vectorized", False) 52 | 53 | def vec_env_executor(self, n_envs, max_path_length): 54 | return VecTfEnv(self.wrapped_env.vec_env_executor(n_envs=n_envs, max_path_length=max_path_length)) 55 | 56 | @classmethod 57 | def wrap(cls, env_cls, **extra_kwargs): 58 | # Use a class wrapper rather than a lambda method for smoother serialization 59 | return WrappedCls(cls, env_cls, extra_kwargs) 60 | 61 | 62 | class VecTfEnv(object): 63 | 64 | def __init__(self, vec_env): 65 | self.vec_env = vec_env 66 | 67 | def reset(self): 68 | return self.vec_env.reset() 69 | 70 | @property 71 | def num_envs(self): 72 | return self.vec_env.num_envs 73 | 74 | def step(self, action_n): 75 | return self.vec_env.step(action_n) 76 | 77 | def terminate(self): 78 | self.vec_env.terminate() 79 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/vec_env_executor.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import pickle as pickle 5 | from sandbox.rocky.tf.misc import tensor_utils 6 | 7 | 8 | class 
VecEnvExecutor(object): 9 | def __init__(self, envs, max_path_length): 10 | self.envs = envs 11 | self._action_space = envs[0].action_space 12 | self._observation_space = envs[0].observation_space 13 | self.ts = np.zeros(len(self.envs), dtype='int') 14 | self.max_path_length = max_path_length 15 | 16 | def step(self, action_n): 17 | all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)] 18 | obs, rewards, dones, env_infos = list(map(list, list(zip(*all_results)))) 19 | dones = np.asarray(dones) 20 | rewards = np.asarray(rewards) 21 | self.ts += 1 22 | if self.max_path_length is not None: 23 | dones[self.ts >= self.max_path_length] = True 24 | for (i, done) in enumerate(dones): 25 | if done: 26 | obs[i] = self.envs[i].reset() 27 | self.ts[i] = 0 28 | return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos) 29 | 30 | def reset(self): 31 | results = [env.reset() for env in self.envs] 32 | self.ts[:] = 0 33 | return results 34 | 35 | @property 36 | def num_envs(self): 37 | return len(self.envs) 38 | 39 | @property 40 | def action_space(self): 41 | return self._action_space 42 | 43 | @property 44 | def observation_space(self): 45 | return self._observation_space 46 | 47 | def terminate(self): 48 | pass 49 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/trpo_cartpole.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.algos.trpo import TRPO 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 6 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp 7 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 8 | from sandbox.rocky.tf.envs.base import TfEnv 9 | from rllab.misc.instrument import stub, run_experiment_lite 10 | 11 | env = TfEnv(normalize(CartpoleEnv())) 12 | 13 | policy = GaussianMLPPolicy( 14 | name="policy", 15 | env_spec=env.spec, 16 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
17 | hidden_sizes=(32, 32) 18 | ) 19 | 20 | baseline = LinearFeatureBaseline(env_spec=env.spec) 21 | 22 | algo = TRPO( 23 | env=env, 24 | policy=policy, 25 | baseline=baseline, 26 | batch_size=4000, 27 | max_path_length=100, 28 | n_itr=40, 29 | discount=0.99, 30 | step_size=0.01, 31 | # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) 32 | 33 | ) 34 | algo.train() 35 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/trpo_cartpole_recurrent.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.algos.trpo import TRPO 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from sandbox.rocky.tf.policies.gaussian_gru_policy import GaussianGRUPolicy 6 | from sandbox.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy 7 | from sandbox.rocky.tf.envs.base import TfEnv 8 | import sandbox.rocky.tf.core.layers as L 9 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp 10 | from rllab.misc.instrument import stub, run_experiment_lite 11 | 12 | env = TfEnv(normalize(CartpoleEnv())) 13 | 14 | policy = GaussianLSTMPolicy( 15 | name="policy", 16 | env_spec=env.spec, 17 | lstm_layer_cls=L.TfBasicLSTMLayer, 18 | # gru_layer_cls=L.GRULayer, 19 | ) 20 | 21 | baseline = LinearFeatureBaseline(env_spec=env.spec) 22 | 23 | algo = TRPO( 24 | env=env, 25 | policy=policy, 26 | baseline=baseline, 27 | batch_size=4000, 28 | max_path_length=100, 29 | n_itr=10, 30 | discount=0.99, 31 | step_size=0.01, 32 | optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) 33 | ) 34 | algo.train() 35 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/vpg_cartpole.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.algos.vpg import VPG 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 6 | from sandbox.rocky.tf.envs.base import TfEnv 7 | from rllab.misc.instrument import stub, run_experiment_lite 8 | 9 | env = TfEnv(normalize(CartpoleEnv())) 10 | 11 | policy = GaussianMLPPolicy( 12 | name="policy", 13 | env_spec=env.spec, 14 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
15 | hidden_sizes=(32, 32) 16 | ) 17 | 18 | baseline = LinearFeatureBaseline(env_spec=env.spec) 19 | 20 | algo = VPG( 21 | env=env, 22 | policy=policy, 23 | baseline=baseline, 24 | batch_size=10000, 25 | max_path_length=100, 26 | n_itr=40, 27 | discount=0.99, 28 | optimizer_args=dict( 29 | tf_optimizer_args=dict( 30 | learning_rate=0.01, 31 | ) 32 | ) 33 | ) 34 | algo.train() 35 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/misc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/misc/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/misc/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/misc/__pycache__/tensor_utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/misc/__pycache__/tensor_utils.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/misc/tensor_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | def compile_function(inputs, outputs, log_name=None): 6 | def run(*input_vals): 7 | sess = tf.get_default_session() 8 | return sess.run(outputs, feed_dict=dict(list(zip(inputs, input_vals)))) 9 | 10 | return run 11 | 12 | 13 | def flatten_tensor_variables(ts): 14 | return tf.concat(axis=0, values=[tf.reshape(x, [-1]) for x in ts]) 15 | 16 | 17 | def unflatten_tensor_variables(flatarr, shapes, symb_arrs): 18 | arrs = [] 19 | n = 0 20 | for (shape, symb_arr) in zip(shapes, symb_arrs): 21 | size = np.prod(list(shape)) 22 | arr = tf.reshape(flatarr[n:n + size], shape) 23 | arrs.append(arr) 24 | n += size 25 | return arrs 26 | 27 | 28 | def new_tensor(name, ndim, dtype): 29 | return tf.placeholder(dtype=dtype, shape=[None] * ndim, name=name) 30 | 31 | 32 | def new_tensor_like(name, arr_like): 33 | return new_tensor(name, arr_like.get_shape().ndims, arr_like.dtype.base_dtype) 34 | 35 | 36 | def concat_tensor_list(tensor_list): 37 | return np.concatenate(tensor_list, axis=0) 38 | 39 | 40 | def concat_tensor_dict_list(tensor_dict_list): 41 | keys = list(tensor_dict_list[0].keys()) 42 | ret = dict() 43 | for k in keys: 44 | example = tensor_dict_list[0][k] 45 | if isinstance(example, dict): 46 | v = concat_tensor_dict_list([x[k] for x in tensor_dict_list]) 47 | else: 48 | v = concat_tensor_list([x[k] for x in tensor_dict_list]) 49 | ret[k] = v 50 | return ret 51 | 52 | 53 | def stack_tensor_list(tensor_list): 54 | return np.array(tensor_list) 55 | # tensor_shape = np.array(tensor_list[0]).shape 56 | # if tensor_shape is tuple(): 57 | # return np.array(tensor_list) 58 | # return np.vstack(tensor_list) 59 | 60 | 61 | def stack_tensor_dict_list(tensor_dict_list): 62 | """ 63 | Stack a list of dictionaries of {tensors or dictionary of tensors}. 64 | :param tensor_dict_list: a list of dictionaries of {tensors or dictionary of tensors}. 
65 | :return: a dictionary of {stacked tensors or dictionary of stacked tensors} 66 | """ 67 | keys = list(tensor_dict_list[0].keys()) 68 | ret = dict() 69 | for k in keys: 70 | example = tensor_dict_list[0][k] 71 | if isinstance(example, dict): 72 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list]) 73 | else: 74 | v = stack_tensor_list([x[k] for x in tensor_dict_list]) 75 | ret[k] = v 76 | return ret 77 | 78 | 79 | def split_tensor_dict_list(tensor_dict): 80 | keys = list(tensor_dict.keys()) 81 | ret = None 82 | for k in keys: 83 | vals = tensor_dict[k] 84 | if isinstance(vals, dict): 85 | vals = split_tensor_dict_list(vals) 86 | if ret is None: 87 | ret = [{k: v} for v in vals] 88 | else: 89 | for v, cur_dict in zip(vals, ret): 90 | cur_dict[k] = v 91 | return ret 92 | 93 | 94 | def to_onehot_sym(inds, dim): 95 | return tf.one_hot(inds, depth=dim, on_value=1, off_value=0) 96 | 97 | 98 | def pad_tensor(x, max_len): 99 | return np.concatenate([ 100 | x, 101 | np.tile(np.zeros_like(x[0]), (max_len - len(x),) + (1,) * np.ndim(x[0])) 102 | ]) 103 | 104 | 105 | def pad_tensor_n(xs, max_len): 106 | ret = np.zeros((len(xs), max_len) + xs[0].shape[1:], dtype=xs[0].dtype) 107 | for idx, x in enumerate(xs): 108 | ret[idx][:len(x)] = x 109 | return ret 110 | 111 | 112 | def pad_tensor_dict(tensor_dict, max_len): 113 | keys = list(tensor_dict.keys()) 114 | ret = dict() 115 | for k in keys: 116 | if isinstance(tensor_dict[k], dict): 117 | ret[k] = pad_tensor_dict(tensor_dict[k], max_len) 118 | else: 119 | ret[k] = pad_tensor(tensor_dict[k], max_len) 120 | return ret 121 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/optimizers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/__pycache__/conjugate_gradient_optimizer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/optimizers/__pycache__/conjugate_gradient_optimizer.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/__pycache__/penalty_lbfgs_optimizer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/optimizers/__pycache__/penalty_lbfgs_optimizer.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/first_order_optimizer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.misc import ext 5 | from rllab.misc import logger 6 | from rllab.core.serializable import Serializable 7 | from sandbox.rocky.tf.misc import tensor_utils 8 | # from rllab.algo.first_order_method 
import parse_update_method 9 | from rllab.optimizers.minibatch_dataset import BatchDataset 10 | from collections import OrderedDict 11 | import tensorflow as tf 12 | import time 13 | from functools import partial 14 | import pyprind 15 | 16 | 17 | class FirstOrderOptimizer(Serializable): 18 | """ 19 | Performs (stochastic) gradient descent, possibly using fancier methods like adam etc. 20 | """ 21 | 22 | def __init__( 23 | self, 24 | tf_optimizer_cls=None, 25 | tf_optimizer_args=None, 26 | # learning_rate=1e-3, 27 | max_epochs=1000, 28 | tolerance=1e-6, 29 | batch_size=32, 30 | callback=None, 31 | verbose=False, 32 | **kwargs): 33 | """ 34 | 35 | :param max_epochs: 36 | :param tolerance: 37 | :param update_method: 38 | :param batch_size: None or an integer. If None the whole dataset will be used. 39 | :param callback: 40 | :param kwargs: 41 | :return: 42 | """ 43 | Serializable.quick_init(self, locals()) 44 | self._opt_fun = None 45 | self._target = None 46 | self._callback = callback 47 | if tf_optimizer_cls is None: 48 | tf_optimizer_cls = tf.train.AdamOptimizer 49 | if tf_optimizer_args is None: 50 | tf_optimizer_args = dict(learning_rate=1e-3) 51 | self._tf_optimizer = tf_optimizer_cls(**tf_optimizer_args) 52 | self._max_epochs = max_epochs 53 | self._tolerance = tolerance 54 | self._batch_size = batch_size 55 | self._verbose = verbose 56 | self._input_vars = None 57 | self._train_op = None 58 | 59 | def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs): 60 | """ 61 | :param loss: Symbolic expression for the loss function. 62 | :param target: A parameterized object to optimize over. It should implement methods of the 63 | :class:`rllab.core.paramerized.Parameterized` class. 64 | :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon. 65 | :param inputs: A list of symbolic variables as inputs 66 | :return: No return value. 
67 | """ 68 | 69 | self._target = target 70 | 71 | self._train_op = self._tf_optimizer.minimize(loss, var_list=target.get_params(trainable=True)) 72 | 73 | # updates = OrderedDict([(k, v.astype(k.dtype)) for k, v in updates.iteritems()]) 74 | 75 | if extra_inputs is None: 76 | extra_inputs = list() 77 | self._input_vars = inputs + extra_inputs 78 | self._opt_fun = ext.lazydict( 79 | f_loss=lambda: tensor_utils.compile_function(inputs + extra_inputs, loss), 80 | ) 81 | 82 | def loss(self, inputs, extra_inputs=None): 83 | if extra_inputs is None: 84 | extra_inputs = tuple() 85 | return self._opt_fun["f_loss"](*(tuple(inputs) + extra_inputs)) 86 | 87 | def optimize(self, inputs, extra_inputs=None, callback=None): 88 | 89 | if len(inputs) == 0: 90 | # Assumes that we should always sample mini-batches 91 | raise NotImplementedError 92 | 93 | f_loss = self._opt_fun["f_loss"] 94 | 95 | if extra_inputs is None: 96 | extra_inputs = tuple() 97 | 98 | last_loss = f_loss(*(tuple(inputs) + extra_inputs)) 99 | 100 | start_time = time.time() 101 | 102 | dataset = BatchDataset(inputs, self._batch_size, extra_inputs=extra_inputs) 103 | 104 | sess = tf.get_default_session() 105 | 106 | for epoch in range(self._max_epochs): 107 | if self._verbose: 108 | logger.log("Epoch %d" % (epoch)) 109 | progbar = pyprind.ProgBar(len(inputs[0])) 110 | 111 | for batch in dataset.iterate(update=True): 112 | sess.run(self._train_op, dict(list(zip(self._input_vars, batch)))) 113 | if self._verbose: 114 | progbar.update(len(batch[0])) 115 | 116 | if self._verbose: 117 | if progbar.active: 118 | progbar.stop() 119 | 120 | new_loss = f_loss(*(tuple(inputs) + extra_inputs)) 121 | 122 | if self._verbose: 123 | logger.log("Epoch: %d | Loss: %f" % (epoch, new_loss)) 124 | if self._callback or callback: 125 | elapsed = time.time() - start_time 126 | callback_args = dict( 127 | loss=new_loss, 128 | params=self._target.get_param_values(trainable=True) if self._target else None, 129 | itr=epoch, 130 | elapsed=elapsed, 131 | ) 132 | if self._callback: 133 | self._callback(callback_args) 134 | if callback: 135 | callback(**callback_args) 136 | 137 | if abs(last_loss - new_loss) < self._tolerance: 138 | break 139 | last_loss = new_loss 140 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/lbfgs_optimizer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from rllab.misc import ext 4 | from sandbox.rocky.tf.misc import tensor_utils 5 | from rllab.core.serializable import Serializable 6 | import tensorflow as tf 7 | import scipy.optimize 8 | import time 9 | 10 | 11 | class LbfgsOptimizer(Serializable): 12 | """ 13 | Performs unconstrained optimization via L-BFGS. 14 | """ 15 | 16 | def __init__(self, name, max_opt_itr=20, callback=None): 17 | Serializable.quick_init(self, locals()) 18 | self._name = name 19 | self._max_opt_itr = max_opt_itr 20 | self._opt_fun = None 21 | self._target = None 22 | self._callback = callback 23 | 24 | def update_opt(self, loss, target, inputs, extra_inputs=None, *args, **kwargs): 25 | """ 26 | :param loss: Symbolic expression for the loss function. 27 | :param target: A parameterized object to optimize over. It should implement methods of the 28 | :class:`rllab.core.paramerized.Parameterized` class. 29 | :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon. 30 | :param inputs: A list of symbolic variables as inputs 31 | :return: No return value. 
32 | """ 33 | 34 | self._target = target 35 | 36 | def get_opt_output(): 37 | flat_grad = tensor_utils.flatten_tensor_variables(tf.gradients(loss, target.get_params(trainable=True))) 38 | return [tf.cast(loss, tf.float64), tf.cast(flat_grad, tf.float64)] 39 | 40 | if extra_inputs is None: 41 | extra_inputs = list() 42 | 43 | self._opt_fun = ext.lazydict( 44 | f_loss=lambda: tensor_utils.compile_function(inputs + extra_inputs, loss), 45 | f_opt=lambda: tensor_utils.compile_function( 46 | inputs=inputs + extra_inputs, 47 | outputs=get_opt_output(), 48 | ) 49 | ) 50 | 51 | def loss(self, inputs, extra_inputs=None): 52 | if extra_inputs is None: 53 | extra_inputs = list() 54 | return self._opt_fun["f_loss"](*(list(inputs) + list(extra_inputs))) 55 | 56 | def optimize(self, inputs, extra_inputs=None): 57 | f_opt = self._opt_fun["f_opt"] 58 | 59 | if extra_inputs is None: 60 | extra_inputs = list() 61 | 62 | def f_opt_wrapper(flat_params): 63 | self._target.set_param_values(flat_params, trainable=True) 64 | ret = f_opt(*inputs) 65 | return ret 66 | 67 | itr = [0] 68 | start_time = time.time() 69 | 70 | if self._callback: 71 | def opt_callback(params): 72 | loss = self._opt_fun["f_loss"](*(inputs + extra_inputs)) 73 | elapsed = time.time() - start_time 74 | self._callback(dict( 75 | loss=loss, 76 | params=params, 77 | itr=itr[0], 78 | elapsed=elapsed, 79 | )) 80 | itr[0] += 1 81 | else: 82 | opt_callback = None 83 | 84 | scipy.optimize.fmin_l_bfgs_b( 85 | func=f_opt_wrapper, x0=self._target.get_param_values(trainable=True), 86 | maxiter=self._max_opt_itr, callback=opt_callback, 87 | ) 88 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/policies/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/__pycache__/base.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/policies/__pycache__/base.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/__pycache__/gaussian_mlp_inverse_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/policies/__pycache__/gaussian_mlp_inverse_policy.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/__pycache__/gaussian_mlp_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/policies/__pycache__/gaussian_mlp_policy.cpython-35.pyc 
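Like FirstOrderOptimizer above, LbfgsOptimizer follows the same three-call contract: update_opt(loss, target, inputs) compiles the loss (and, here, its flattened gradient) against a Parameterized target, after which loss(inputs) and optimize(inputs) operate on concrete numpy arrays. The sketch below is a hypothetical usage example rather than repository code; it assumes the MLP network class and TF1/rllab stack used elsewhere in this dump, the TinyRegressor name is invented, and the mean-squared-error loss is supplied by the example itself.

import numpy as np
import tensorflow as tf
import sandbox.rocky.tf.core.layers as L
from sandbox.rocky.tf.core.layers_powered import LayersPowered
from sandbox.rocky.tf.core.network import MLP
from sandbox.rocky.tf.optimizers.lbfgs_optimizer import LbfgsOptimizer


class TinyRegressor(LayersPowered):
    # Hypothetical regression target exposing its input/output tensors
    # so a symbolic loss can be built against it.
    def __init__(self):
        net = MLP(name="tiny_regressor", input_shape=(1,), output_dim=1,
                  hidden_sizes=(16,), hidden_nonlinearity=tf.nn.tanh,
                  output_nonlinearity=None)
        self.x_var = net.input_layer.input_var
        self.y_var = tf.placeholder(tf.float32, shape=(None, 1), name="y")
        self.y_hat = L.get_output(net.output_layer)
        LayersPowered.__init__(self, [net.output_layer])


with tf.Session() as sess:
    model = TinyRegressor()
    loss = tf.reduce_mean(tf.square(model.y_hat - model.y_var))

    optimizer = LbfgsOptimizer(name="lbfgs", max_opt_itr=20)
    # Compile the loss and its flat gradient once, against the target's
    # trainable parameters and the symbolic inputs.
    optimizer.update_opt(loss=loss, target=model,
                         inputs=[model.x_var, model.y_var])

    sess.run(tf.global_variables_initializer())
    xs = np.linspace(-1.0, 1.0, 64).reshape(-1, 1).astype(np.float32)
    ys = np.sin(3 * xs)
    print("loss before:", optimizer.loss([xs, ys]))
    optimizer.optimize([xs, ys])
    print("loss after:", optimizer.loss([xs, ys]))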
-------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/base.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from sandbox.rocky.tf.core.parameterized import Parameterized 5 | 6 | 7 | class Policy(Parameterized): 8 | def __init__(self, env_spec): 9 | Parameterized.__init__(self) 10 | self._env_spec = env_spec 11 | 12 | # Should be implemented by all policies 13 | 14 | def get_action(self, observation): 15 | raise NotImplementedError 16 | 17 | def get_actions(self, observations): 18 | raise NotImplementedError 19 | 20 | def reset(self, dones=None): 21 | pass 22 | 23 | @property 24 | def vectorized(self): 25 | """ 26 | Indicates whether the policy is vectorized. If True, it should implement get_actions(), and support resetting 27 | with multiple simultaneous states. 28 | """ 29 | return False 30 | 31 | @property 32 | def observation_space(self): 33 | return self._env_spec.observation_space 34 | 35 | @property 36 | def action_space(self): 37 | return self._env_spec.action_space 38 | 39 | @property 40 | def env_spec(self): 41 | return self._env_spec 42 | 43 | @property 44 | def recurrent(self): 45 | """ 46 | Indicates whether the policy is recurrent. 47 | :return: 48 | """ 49 | return False 50 | 51 | def log_diagnostics(self, paths): 52 | """ 53 | Log extra information per iteration based on the collected paths 54 | """ 55 | pass 56 | 57 | @property 58 | def state_info_keys(self): 59 | """ 60 | Return keys for the information related to the policy's state when taking an action. 61 | :return: 62 | """ 63 | return [k for k, _ in self.state_info_specs] 64 | 65 | @property 66 | def state_info_specs(self): 67 | """ 68 | Return keys and shapes for the information related to the policy's state when taking an action. 69 | :return: 70 | """ 71 | return list() 72 | 73 | def terminate(self): 74 | """ 75 | Clean up operation 76 | """ 77 | pass 78 | 79 | 80 | class StochasticPolicy(Policy): 81 | @property 82 | def distribution(self): 83 | """ 84 | :rtype Distribution 85 | """ 86 | raise NotImplementedError 87 | 88 | def dist_info_sym(self, obs_var, state_info_vars): 89 | """ 90 | Return the symbolic distribution information about the actions. 91 | :param obs_var: symbolic variable for observations 92 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at 93 | the time it received the observation 94 | :return: 95 | """ 96 | raise NotImplementedError 97 | 98 | def dist_info(self, obs, state_infos): 99 | """ 100 | Return the distribution information about the actions. 
101 | :param obs_var: observation values 102 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at 103 | the time it received the observation 104 | :return: 105 | """ 106 | raise NotImplementedError 107 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/categorical_conv_policy.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 2 | import sandbox.rocky.tf.core.layers as L 3 | from sandbox.rocky.tf.core.network import ConvNetwork 4 | from rllab.core.serializable import Serializable 5 | from sandbox.rocky.tf.distributions.categorical import Categorical 6 | from sandbox.rocky.tf.policies.base import StochasticPolicy 7 | from rllab.misc import ext 8 | from sandbox.rocky.tf.misc import tensor_utils 9 | from rllab.misc.overrides import overrides 10 | from sandbox.rocky.tf.spaces.discrete import Discrete 11 | import tensorflow as tf 12 | 13 | 14 | class CategoricalConvPolicy(StochasticPolicy, LayersPowered, Serializable): 15 | def __init__( 16 | self, 17 | name, 18 | env_spec, 19 | conv_filters, conv_filter_sizes, conv_strides, conv_pads, 20 | hidden_sizes=[], 21 | hidden_nonlinearity=tf.nn.relu, 22 | output_nonlinearity=tf.nn.softmax, 23 | prob_network=None, 24 | ): 25 | """ 26 | :param env_spec: A spec for the mdp. 27 | :param hidden_sizes: list of sizes for the fully connected hidden layers 28 | :param hidden_nonlinearity: nonlinearity used for each hidden layer 29 | :param prob_network: manually specified network for this policy, other network params 30 | are ignored 31 | :return: 32 | """ 33 | Serializable.quick_init(self, locals()) 34 | 35 | assert isinstance(env_spec.action_space, Discrete) 36 | 37 | self._env_spec = env_spec 38 | # import pdb; pdb.set_trace() 39 | if prob_network is None: 40 | prob_network = ConvNetwork( 41 | input_shape=env_spec.observation_space.shape, 42 | output_dim=env_spec.action_space.n, 43 | conv_filters=conv_filters, 44 | conv_filter_sizes=conv_filter_sizes, 45 | conv_strides=conv_strides, 46 | conv_pads=conv_pads, 47 | hidden_sizes=hidden_sizes, 48 | hidden_nonlinearity=hidden_nonlinearity, 49 | output_nonlinearity=output_nonlinearity, 50 | name="prob_network", 51 | ) 52 | 53 | self._l_prob = prob_network.output_layer 54 | self._l_obs = prob_network.input_layer 55 | self._f_prob = tensor_utils.compile_function( 56 | [prob_network.input_layer.input_var], 57 | L.get_output(prob_network.output_layer) 58 | ) 59 | 60 | self._dist = Categorical(env_spec.action_space.n) 61 | 62 | super(CategoricalConvPolicy, self).__init__(env_spec) 63 | LayersPowered.__init__(self, [prob_network.output_layer]) 64 | 65 | @property 66 | def vectorized(self): 67 | return True 68 | 69 | @overrides 70 | def dist_info_sym(self, obs_var, state_info_vars=None): 71 | return dict(prob=L.get_output(self._l_prob, {self._l_obs: tf.cast(obs_var, tf.float32)})) 72 | 73 | @overrides 74 | def dist_info(self, obs, state_infos=None): 75 | return dict(prob=self._f_prob(obs)) 76 | 77 | # The return value is a pair. The first item is a matrix (N, A), where each 78 | # entry corresponds to the action value taken. 
The second item is a vector 79 | # of length N, where each entry is the density value for that action, under 80 | # the current policy 81 | @overrides 82 | def get_action(self, observation): 83 | flat_obs = self.observation_space.flatten(observation) 84 | prob = self._f_prob([flat_obs])[0] 85 | action = self.action_space.weighted_sample(prob) 86 | return action, dict(prob=prob) 87 | 88 | def get_actions(self, observations): 89 | flat_obs = self.observation_space.flatten_n(observations) 90 | probs = self._f_prob(flat_obs) 91 | actions = list(map(self.action_space.weighted_sample, probs)) 92 | return actions, dict(prob=probs) 93 | 94 | @property 95 | def distribution(self): 96 | return self._dist 97 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/categorical_mlp_policy.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 2 | import sandbox.rocky.tf.core.layers as L 3 | from sandbox.rocky.tf.core.network import MLP 4 | from rllab.core.serializable import Serializable 5 | from sandbox.rocky.tf.distributions.categorical import Categorical 6 | from sandbox.rocky.tf.policies.base import StochasticPolicy 7 | from rllab.misc import ext 8 | from sandbox.rocky.tf.misc import tensor_utils 9 | from rllab.misc.overrides import overrides 10 | from sandbox.rocky.tf.spaces.discrete import Discrete 11 | import tensorflow as tf 12 | 13 | 14 | class CategoricalMLPPolicy(StochasticPolicy, LayersPowered, Serializable): 15 | def __init__( 16 | self, 17 | name, 18 | env_spec, 19 | hidden_sizes=(32, 32), 20 | hidden_nonlinearity=tf.nn.tanh, 21 | prob_network=None, 22 | ): 23 | """ 24 | :param env_spec: A spec for the mdp. 25 | :param hidden_sizes: list of sizes for the fully connected hidden layers 26 | :param hidden_nonlinearity: nonlinearity used for each hidden layer 27 | :param prob_network: manually specified network for this policy, other network params 28 | are ignored 29 | :return: 30 | """ 31 | Serializable.quick_init(self, locals()) 32 | 33 | assert isinstance(env_spec.action_space, Discrete) 34 | 35 | with tf.variable_scope(name): 36 | if prob_network is None: 37 | prob_network = MLP( 38 | input_shape=(env_spec.observation_space.flat_dim,), 39 | output_dim=env_spec.action_space.n, 40 | hidden_sizes=hidden_sizes, 41 | hidden_nonlinearity=hidden_nonlinearity, 42 | output_nonlinearity=tf.nn.softmax, 43 | name="prob_network", 44 | ) 45 | 46 | self._l_prob = prob_network.output_layer 47 | self._l_obs = prob_network.input_layer 48 | self._f_prob = tensor_utils.compile_function( 49 | [prob_network.input_layer.input_var], 50 | L.get_output(prob_network.output_layer) 51 | ) 52 | 53 | self._dist = Categorical(env_spec.action_space.n) 54 | 55 | super(CategoricalMLPPolicy, self).__init__(env_spec) 56 | LayersPowered.__init__(self, [prob_network.output_layer]) 57 | 58 | @property 59 | def vectorized(self): 60 | return True 61 | 62 | @overrides 63 | def dist_info_sym(self, obs_var, state_info_vars=None): 64 | return dict(prob=L.get_output(self._l_prob, {self._l_obs: tf.cast(obs_var, tf.float32)})) 65 | 66 | @overrides 67 | def dist_info(self, obs, state_infos=None): 68 | return dict(prob=self._f_prob(obs)) 69 | 70 | # The return value is a pair. The first item is a matrix (N, A), where each 71 | # entry corresponds to the action value taken. 
The second item is a vector 72 | # of length N, where each entry is the density value for that action, under 73 | # the current policy 74 | @overrides 75 | def get_action(self, observation): 76 | flat_obs = self.observation_space.flatten(observation) 77 | prob = self._f_prob([flat_obs])[0] 78 | action = self.action_space.weighted_sample(prob) 79 | return action, dict(prob=prob) 80 | 81 | def get_actions(self, observations): 82 | flat_obs = self.observation_space.flatten_n(observations) 83 | probs = self._f_prob(flat_obs) 84 | actions = list(map(self.action_space.weighted_sample, probs)) 85 | return actions, dict(prob=probs) 86 | 87 | @property 88 | def distribution(self): 89 | return self._dist 90 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/deterministic_mlp_policy.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | from rllab.misc import ext 3 | from rllab.misc.overrides import overrides 4 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 5 | from sandbox.rocky.tf.core.network import MLP 6 | from sandbox.rocky.tf.distributions.categorical import Categorical 7 | from sandbox.rocky.tf.policies.base import Policy 8 | from sandbox.rocky.tf.misc import tensor_utils 9 | 10 | import sandbox.rocky.tf.core.layers as L 11 | from sandbox.rocky.tf.core.layers import batch_norm 12 | 13 | from sandbox.rocky.tf.spaces.discrete import Discrete 14 | import tensorflow as tf 15 | 16 | 17 | class DeterministicMLPPolicy(Policy, LayersPowered, Serializable): 18 | def __init__( 19 | self, 20 | name, 21 | env_spec, 22 | hidden_sizes=(32, 32), 23 | hidden_nonlinearity=tf.nn.relu, 24 | output_nonlinearity=tf.nn.tanh, 25 | prob_network=None, 26 | bn=False): 27 | Serializable.quick_init(self, locals()) 28 | 29 | with tf.variable_scope(name): 30 | if prob_network is None: 31 | prob_network = MLP( 32 | input_shape=(env_spec.observation_space.flat_dim,), 33 | output_dim=env_spec.action_space.flat_dim, 34 | hidden_sizes=hidden_sizes, 35 | hidden_nonlinearity=hidden_nonlinearity, 36 | output_nonlinearity=output_nonlinearity, 37 | # batch_normalization=True, 38 | name="prob_network", 39 | ) 40 | 41 | self._l_prob = prob_network.output_layer 42 | self._l_obs = prob_network.input_layer 43 | self._f_prob = tensor_utils.compile_function( 44 | [prob_network.input_layer.input_var], 45 | L.get_output(prob_network.output_layer, deterministic=True) 46 | ) 47 | 48 | self.prob_network = prob_network 49 | 50 | # Note the deterministic=True argument. It makes sure that when getting 51 | # actions from single observations, we do not update params in the 52 | # batch normalization layers. 
53 | # TODO: this doesn't currently work properly in the tf version so we leave out batch_norm 54 | super(DeterministicMLPPolicy, self).__init__(env_spec) 55 | LayersPowered.__init__(self, [prob_network.output_layer]) 56 | 57 | @property 58 | def vectorized(self): 59 | return True 60 | 61 | @overrides 62 | def get_action(self, observation): 63 | flat_obs = self.observation_space.flatten(observation) 64 | action = self._f_prob([flat_obs])[0] 65 | return action, dict() 66 | 67 | @overrides 68 | def get_actions(self, observations): 69 | flat_obs = self.observation_space.flatten_n(observations) 70 | actions = self._f_prob(flat_obs) 71 | return actions, dict() 72 | 73 | def get_action_sym(self, obs_var): 74 | return L.get_output(self.prob_network.output_layer, obs_var) 75 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/uniform_control_policy.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.policies.base import Policy 2 | from rllab.core.serializable import Serializable 3 | 4 | 5 | class UniformControlPolicy(Policy, Serializable): 6 | def __init__( 7 | self, 8 | env_spec, 9 | ): 10 | Serializable.quick_init(self, locals()) 11 | super(UniformControlPolicy, self).__init__(env_spec=env_spec) 12 | 13 | @property 14 | def vectorized(self): 15 | return True 16 | 17 | def get_action(self, observation): 18 | return self.action_space.sample(), dict() 19 | 20 | def get_actions(self, observations): 21 | return self.action_space.sample_n(len(observations)), dict() 22 | 23 | def get_params_internal(self, **tags): 24 | return [] 25 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/q_functions/base.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.core.parameterized import Parameterized 2 | 3 | class QFunction(Parameterized): 4 | pass 5 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/q_functions/continuous_mlp_q_function.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.q_functions.base import QFunction 2 | from rllab.core.serializable import Serializable 3 | from rllab.misc import ext 4 | 5 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 6 | from sandbox.rocky.tf.core.network import MLP 7 | from sandbox.rocky.tf.core.layers import batch_norm 8 | from sandbox.rocky.tf.distributions.categorical import Categorical 9 | from sandbox.rocky.tf.policies.base import StochasticPolicy 10 | from sandbox.rocky.tf.misc import tensor_utils 11 | 12 | import tensorflow as tf 13 | import sandbox.rocky.tf.core.layers as L 14 | 15 | 16 | class ContinuousMLPQFunction(QFunction, LayersPowered, Serializable): 17 | def __init__( 18 | self, 19 | env_spec, 20 | hidden_sizes=(32, 32), 21 | hidden_nonlinearity=tf.nn.relu, 22 | action_merge_layer=-2, 23 | output_nonlinearity=None, 24 | bn=False): 25 | Serializable.quick_init(self, locals()) 26 | 27 | l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim), name="obs") 28 | l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim), name="actions") 29 | 30 | n_layers = len(hidden_sizes) + 1 31 | 32 | if n_layers > 1: 33 | action_merge_layer = \ 34 | (action_merge_layer % n_layers + n_layers) % n_layers 35 | else: 36 | action_merge_layer = 1 37 | 38 | l_hidden = l_obs 39 | 40 | for idx, size in 
enumerate(hidden_sizes): 41 | if bn: 42 | l_hidden = batch_norm(l_hidden) 43 | 44 | if idx == action_merge_layer: 45 | l_hidden = L.ConcatLayer([l_hidden, l_action]) 46 | 47 | l_hidden = L.DenseLayer( 48 | l_hidden, 49 | num_units=size, 50 | nonlinearity=hidden_nonlinearity, 51 | name="h%d" % (idx + 1) 52 | ) 53 | 54 | if action_merge_layer == n_layers: 55 | l_hidden = L.ConcatLayer([l_hidden, l_action]) 56 | 57 | l_output = L.DenseLayer( 58 | l_hidden, 59 | num_units=1, 60 | nonlinearity=output_nonlinearity, 61 | name="output" 62 | ) 63 | 64 | output_var = L.get_output(l_output, deterministic=True) 65 | 66 | self._f_qval = tensor_utils.compile_function([l_obs.input_var, l_action.input_var], output_var) 67 | self._output_layer = l_output 68 | self._obs_layer = l_obs 69 | self._action_layer = l_action 70 | self._output_nonlinearity = output_nonlinearity 71 | 72 | LayersPowered.__init__(self, [l_output]) 73 | 74 | def get_qval(self, observations, actions): 75 | return self._f_qval(observations, actions) 76 | 77 | def get_qval_sym(self, obs_var, action_var, **kwargs): 78 | qvals = L.get_output( 79 | self._output_layer, 80 | {self._obs_layer: obs_var, self._action_layer: action_var}, 81 | **kwargs 82 | ) 83 | return tf.reshape(qvals, (-1,)) 84 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/regressors/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/regressors/deterministic_mlp_regressor.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | import numpy as np 8 | 9 | import tensorflow as tf 10 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 11 | from sandbox.rocky.tf.core.network import MLP 12 | from sandbox.rocky.tf.misc import tensor_utils 13 | from sandbox.rocky.tf.distributions.categorical import Categorical 14 | from sandbox.rocky.tf.optimizers.penalty_lbfgs_optimizer import PenaltyLbfgsOptimizer 15 | from sandbox.rocky.tf.optimizers.lbfgs_optimizer import LbfgsOptimizer 16 | import sandbox.rocky.tf.core.layers as L 17 | from rllab.core.serializable import Serializable 18 | from rllab.misc import ext 19 | from rllab.misc import logger 20 | 21 | NONE = list() 22 | 23 | 24 | class DeterministicMLPRegressor(LayersPowered, Serializable): 25 | """ 26 | A class for performing nonlinear regression. 27 | """ 28 | 29 | def __init__( 30 | self, 31 | name, 32 | input_shape, 33 | output_dim, 34 | network=None, 35 | hidden_sizes=(32, 32), 36 | hidden_nonlinearity=tf.nn.tanh, 37 | output_nonlinearity=None, 38 | optimizer=None, 39 | normalize_inputs=True, 40 | ): 41 | """ 42 | :param input_shape: Shape of the input data. 43 | :param output_dim: Dimension of output. 44 | :param hidden_sizes: Number of hidden units of each layer of the mean network. 45 | :param hidden_nonlinearity: Non-linearity used for each layer of the mean network. 46 | :param optimizer: Optimizer for minimizing the negative log-likelihood. 
47 | """ 48 | Serializable.quick_init(self, locals()) 49 | 50 | with tf.variable_scope(name): 51 | 52 | if optimizer is None: 53 | optimizer = LbfgsOptimizer(name="optimizer") 54 | 55 | self.output_dim = output_dim 56 | self.optimizer = optimizer 57 | 58 | if network is None: 59 | network = MLP( 60 | input_shape=input_shape, 61 | output_dim=output_dim, 62 | hidden_sizes=hidden_sizes, 63 | hidden_nonlinearity=hidden_nonlinearity, 64 | output_nonlinearity=output_nonlinearity, 65 | name="network" 66 | ) 67 | 68 | l_out = network.output_layer 69 | 70 | LayersPowered.__init__(self, [l_out]) 71 | 72 | xs_var = network.input_layer.input_var 73 | ys_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="ys") 74 | 75 | x_mean_var = tf.get_variable( 76 | name="x_mean", 77 | shape=(1,) + input_shape, 78 | initializer=tf.constant_initializer(0., dtype=tf.float32) 79 | ) 80 | x_std_var = tf.get_variable( 81 | name="x_std", 82 | shape=(1,) + input_shape, 83 | initializer=tf.constant_initializer(1., dtype=tf.float32) 84 | ) 85 | 86 | normalized_xs_var = (xs_var - x_mean_var) / x_std_var 87 | 88 | fit_ys_var = L.get_output(l_out, {network.input_layer: normalized_xs_var}) 89 | 90 | loss = - tf.reduce_mean(tf.square(fit_ys_var - ys_var)) 91 | 92 | self.f_predict = tensor_utils.compile_function([xs_var], fit_ys_var) 93 | 94 | optimizer_args = dict( 95 | loss=loss, 96 | target=self, 97 | network_outputs=[fit_ys_var], 98 | ) 99 | 100 | optimizer_args["inputs"] = [xs_var, ys_var] 101 | 102 | self.optimizer.update_opt(**optimizer_args) 103 | 104 | self.name = name 105 | self.l_out = l_out 106 | 107 | self.normalize_inputs = normalize_inputs 108 | self.x_mean_var = x_mean_var 109 | self.x_std_var = x_std_var 110 | 111 | def predict_sym(self, xs): 112 | return L.get_output(self.l_out, xs) 113 | 114 | # def fit(self, xs, ys): 115 | # if self._normalize_inputs: 116 | # # recompute normalizing constants for inputs 117 | # new_mean = np.mean(xs, axis=0, keepdims=True) 118 | # new_std = np.std(xs, axis=0, keepdims=True) + 1e-8 119 | # tf.get_default_session().run(tf.group( 120 | # tf.assign(self._x_mean_var, new_mean), 121 | # tf.assign(self._x_std_var, new_std), 122 | # )) 123 | # inputs = [xs, ys] 124 | # loss_before = self._optimizer.loss(inputs) 125 | # if self._name: 126 | # prefix = self._name + "_" 127 | # else: 128 | # prefix = "" 129 | # logger.record_tabular(prefix + 'LossBefore', loss_before) 130 | # self._optimizer.optimize(inputs) 131 | # loss_after = self._optimizer.loss(inputs) 132 | # logger.record_tabular(prefix + 'LossAfter', loss_after) 133 | # logger.record_tabular(prefix + 'dLoss', loss_before - loss_after) 134 | 135 | def predict(self, xs): 136 | return self.f_predict(np.asarray(xs)) 137 | 138 | def get_param_values(self, **tags): 139 | return LayersPowered.get_param_values(self, **tags) 140 | 141 | def set_param_values(self, flattened_params, **tags): 142 | return LayersPowered.set_param_values(self, flattened_params, **tags) 143 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/samplers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/__pycache__/base.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/samplers/__pycache__/base.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/__pycache__/batch_sampler.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/samplers/__pycache__/batch_sampler.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/__pycache__/vectorized_sampler.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/samplers/__pycache__/vectorized_sampler.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/batch_sampler.py: -------------------------------------------------------------------------------- 1 | from rllab.sampler.base import BaseSampler 2 | from rllab.sampler import parallel_sampler 3 | from rllab.sampler.stateful_pool import singleton_pool 4 | import tensorflow as tf 5 | 6 | 7 | def worker_init_tf(G): 8 | G.sess = tf.Session() 9 | G.sess.__enter__() 10 | 11 | 12 | def worker_init_tf_vars(G): 13 | G.sess.run(tf.global_variables_initializer()) 14 | 15 | 16 | class BatchSampler(BaseSampler): 17 | def start_worker(self): 18 | if singleton_pool.n_parallel > 1: 19 | singleton_pool.run_each(worker_init_tf) 20 | parallel_sampler.populate_task(self.algo.env, self.algo.policy) 21 | if singleton_pool.n_parallel > 1: 22 | singleton_pool.run_each(worker_init_tf_vars) 23 | 24 | def shutdown_worker(self): 25 | parallel_sampler.terminate_task(scope=self.algo.scope) 26 | 27 | def obtain_samples(self, itr): 28 | cur_policy_params = self.algo.policy.get_param_values() 29 | cur_env_params = self.algo.env.get_param_values() 30 | paths = parallel_sampler.sample_paths( 31 | policy_params=cur_policy_params, 32 | env_params=cur_env_params, 33 | max_samples=self.algo.batch_size, 34 | max_path_length=self.algo.max_path_length, 35 | scope=self.algo.scope, 36 | ) 37 | if self.algo.whole_paths: 38 | return paths 39 | else: 40 | paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size) 41 | return paths_truncated 42 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/vectorized_sampler.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import tensorflow as tf 4 | from rllab.sampler.base import BaseSampler 5 | #from base import BaseSampler 6 | from sandbox.rocky.tf.envs.parallel_vec_env_executor import ParallelVecEnvExecutor 7 | from sandbox.rocky.tf.envs.vec_env_executor import VecEnvExecutor 8 | from rllab.misc import tensor_utils 9 | import numpy as np 10 | from 
rllab.sampler.stateful_pool import ProgBarCounter 11 | import rllab.misc.logger as logger 12 | import itertools 13 | 14 | 15 | class VectorizedSampler(BaseSampler): 16 | 17 | def __init__(self, algo, n_envs=None): 18 | super(VectorizedSampler, self).__init__(algo) 19 | self.n_envs = n_envs 20 | 21 | def start_worker(self): 22 | n_envs = self.n_envs 23 | if n_envs is None: 24 | n_envs = int(self.algo.batch_size / self.algo.max_path_length) 25 | n_envs = max(1, min(n_envs, 100)) 26 | 27 | if getattr(self.algo.env, 'vectorized', False): 28 | self.vec_env = self.algo.env.vec_env_executor(n_envs=n_envs, max_path_length=self.algo.max_path_length) 29 | else: 30 | envs = [pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs)] 31 | self.vec_env = VecEnvExecutor( 32 | envs=envs, 33 | max_path_length=self.algo.max_path_length 34 | ) 35 | self.env_spec = self.algo.env.spec 36 | 37 | def shutdown_worker(self): 38 | self.vec_env.terminate() 39 | 40 | def obtain_samples(self, itr): 41 | logger.log("Obtaining samples for iteration %d..." % itr) 42 | paths = [] 43 | n_samples = 0 44 | obses = self.vec_env.reset() 45 | dones = np.asarray([True] * self.vec_env.num_envs) 46 | running_paths = [None] * self.vec_env.num_envs 47 | 48 | pbar = ProgBarCounter(self.algo.batch_size) 49 | policy_time = 0 50 | env_time = 0 51 | process_time = 0 52 | 53 | policy = self.algo.policy 54 | import time 55 | while n_samples < self.algo.batch_size: 56 | t = time.time() 57 | policy.reset(dones) 58 | actions, agent_infos = policy.get_actions(obses) 59 | 60 | policy_time += time.time() - t 61 | t = time.time() 62 | next_obses, rewards, dones, env_infos = self.vec_env.step(actions) 63 | env_time += time.time() - t 64 | 65 | t = time.time() 66 | 67 | agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) 68 | env_infos = tensor_utils.split_tensor_dict_list(env_infos) 69 | if env_infos is None: 70 | env_infos = [dict() for _ in range(self.vec_env.num_envs)] 71 | if agent_infos is None: 72 | agent_infos = [dict() for _ in range(self.vec_env.num_envs)] 73 | for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions, 74 | rewards, env_infos, agent_infos, 75 | dones): 76 | if running_paths[idx] is None: 77 | running_paths[idx] = dict( 78 | observations=[], 79 | actions=[], 80 | rewards=[], 81 | env_infos=[], 82 | agent_infos=[], 83 | ) 84 | running_paths[idx]["observations"].append(observation) 85 | running_paths[idx]["actions"].append(action) 86 | running_paths[idx]["rewards"].append(reward) 87 | running_paths[idx]["env_infos"].append(env_info) 88 | running_paths[idx]["agent_infos"].append(agent_info) 89 | if done: 90 | paths.append(dict( 91 | observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]), 92 | actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]), 93 | rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]), 94 | env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]), 95 | agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]), 96 | )) 97 | n_samples += len(running_paths[idx]["rewards"]) 98 | running_paths[idx] = None 99 | process_time += time.time() - t 100 | pbar.inc(len(obses)) 101 | obses = next_obses 102 | 103 | pbar.stop() 104 | 105 | logger.record_tabular("PolicyExecTime", policy_time) 106 | logger.record_tabular("EnvExecTime", env_time) 107 | logger.record_tabular("ProcessExecTime", process_time) 108 | 109 | return paths 110 | 
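Usage sketch (not part of the repository): the sampler above is normally driven by a batch policy-optimization algorithm. The make_algo() factory below is hypothetical; the attribute names it must provide (env, policy, batch_size, max_path_length) are the ones VectorizedSampler actually reads from self.algo in the code above, and the call order shows the start_worker / obtain_samples / shutdown_worker lifecycle.

from sandbox.rocky.tf.samplers.vectorized_sampler import VectorizedSampler

algo = make_algo()  # hypothetical: any object exposing env, policy, batch_size, max_path_length
sampler = VectorizedSampler(algo, n_envs=8)

sampler.start_worker()                       # builds a VecEnvExecutor (or the env's own vectorized executor)
try:
    for itr in range(10):
        paths = sampler.obtain_samples(itr)  # list of path dicts: observations, actions, rewards, env_infos, agent_infos
        # hand `paths` to the algorithm's policy-update step here
finally:
    sampler.shutdown_worker()                # terminates the vectorized environments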
-------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/__init__.py: -------------------------------------------------------------------------------- 1 | from .product import Product 2 | from .discrete import Discrete 3 | from .box import Box 4 | 5 | __all__ = ["Product", "Discrete", "Box"] 6 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/spaces/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/__pycache__/box.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/spaces/__pycache__/box.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/__pycache__/discrete.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/spaces/__pycache__/discrete.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/__pycache__/product.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/spaces/__pycache__/product.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/box.py: -------------------------------------------------------------------------------- 1 | from rllab.spaces.box import Box as TheanoBox 2 | import tensorflow as tf 3 | 4 | 5 | class Box(TheanoBox): 6 | def new_tensor_variable(self, name, extra_dims, flatten=True): 7 | if flatten: 8 | return tf.placeholder(tf.float32, shape=[None] * extra_dims + [self.flat_dim], name=name) 9 | return tf.placeholder(tf.float32, shape=[None] * extra_dims + list(self.shape), name=name) 10 | 11 | @property 12 | def dtype(self): 13 | return tf.float32 14 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/discrete.py: -------------------------------------------------------------------------------- 1 | from rllab.spaces.base import Space 2 | import numpy as np 3 | from rllab.misc import special 4 | from rllab.misc import ext 5 | import tensorflow as tf 6 | 7 | 8 | class Discrete(Space): 9 | """ 10 | {0,1,...,n-1} 11 | """ 12 | 13 | def __init__(self, n): 14 | self._n = n 15 | 16 | @property 17 | def n(self): 18 | return self._n 19 | 20 | def sample(self): 21 | return np.random.randint(self.n) 22 | 23 | def sample_n(self, n): 24 | return np.random.randint(low=0, high=self.n, size=n) 25 | 26 | def contains(self, x): 27 | x = np.asarray(x) 28 | return x.shape == () and x.dtype.kind == 'i' and x >= 0 and x < self.n 29 | 30 | def __repr__(self): 31 | return "Discrete(%d)" % self.n 32 | 33 | def __eq__(self, other): 34 | return self.n == other.n 
35 | 36 | def flatten(self, x): 37 | return special.to_onehot(x, self.n) 38 | 39 | def unflatten(self, x): 40 | return special.from_onehot(x) 41 | 42 | def flatten_n(self, x): 43 | return special.to_onehot_n(x, self.n) 44 | 45 | def unflatten_n(self, x): 46 | return special.from_onehot_n(x) 47 | 48 | @property 49 | def default_value(self): 50 | return 0 51 | 52 | @property 53 | def flat_dim(self): 54 | return self.n 55 | 56 | def weighted_sample(self, weights): 57 | return special.weighted_sample(weights, range(self.n)) 58 | 59 | def new_tensor_variable(self, name, extra_dims): 60 | # needed for safe conversion to float32 61 | return tf.placeholder(dtype=tf.uint8, shape=[None] * extra_dims + [self.flat_dim], name=name) 62 | 63 | @property 64 | def dtype(self): 65 | return tf.uint8 66 | 67 | def __eq__(self, other): 68 | if not isinstance(other, Discrete): 69 | return False 70 | return self.n == other.n 71 | 72 | def __hash__(self): 73 | return hash(self.n) 74 | 75 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/product.py: -------------------------------------------------------------------------------- 1 | from rllab.spaces.base import Space 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | 6 | class Product(Space): 7 | def __init__(self, *components): 8 | if isinstance(components[0], (list, tuple)): 9 | assert len(components) == 1 10 | components = components[0] 11 | self._components = tuple(components) 12 | dtypes = [c.dtype for c in components] 13 | if len(dtypes) > 0 and hasattr(dtypes[0], "as_numpy_dtype"): 14 | dtypes = [d.as_numpy_dtype for d in dtypes] 15 | self._common_dtype = np.core.numerictypes.find_common_type([], dtypes) 16 | 17 | def sample(self): 18 | return tuple(x.sample() for x in self._components) 19 | 20 | @property 21 | def components(self): 22 | return self._components 23 | 24 | def contains(self, x): 25 | return isinstance(x, tuple) and all(c.contains(xi) for c, xi in zip(self._components, x)) 26 | 27 | def new_tensor_variable(self, name, extra_dims): 28 | return tf.placeholder( 29 | dtype=self._common_dtype, 30 | shape=[None] * extra_dims + [self.flat_dim], 31 | name=name, 32 | ) 33 | 34 | @property 35 | def dtype(self): 36 | return self._common_dtype 37 | 38 | @property 39 | def flat_dim(self): 40 | return int(np.sum([c.flat_dim for c in self._components])) 41 | 42 | def flatten(self, x): 43 | return np.concatenate([c.flatten(xi) for c, xi in zip(self._components, x)]) 44 | 45 | def flatten_n(self, xs): 46 | xs_regrouped = [[x[i] for x in xs] for i in range(len(xs[0]))] 47 | flat_regrouped = [c.flatten_n(xi) for c, xi in zip(self.components, xs_regrouped)] 48 | return np.concatenate(flat_regrouped, axis=-1) 49 | 50 | def unflatten(self, x): 51 | dims = [c.flat_dim for c in self._components] 52 | flat_xs = np.split(x, np.cumsum(dims)[:-1]) 53 | return tuple(c.unflatten(xi) for c, xi in zip(self._components, flat_xs)) 54 | 55 | def unflatten_n(self, xs): 56 | dims = [c.flat_dim for c in self._components] 57 | flat_xs = np.split(xs, np.cumsum(dims)[:-1], axis=-1) 58 | unflat_xs = [c.unflatten_n(xi) for c, xi in zip(self.components, flat_xs)] 59 | unflat_xs_grouped = list(zip(*unflat_xs)) 60 | return unflat_xs_grouped 61 | 62 | def __eq__(self, other): 63 | if not isinstance(other, Product): 64 | return False 65 | return tuple(self.components) == tuple(other.components) 66 | 67 | def __hash__(self): 68 | return hash(tuple(self.components)) 69 | 
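Usage sketch (not part of the repository): the space classes above share a flatten/unflatten contract that the samplers rely on when packing paths. In the round trip below, Discrete encodes a sample as a one-hot vector, Box keeps its float entries, and Product concatenates its components' encodings; the Box constructor arguments (low, high, shape) follow rllab's base Box and are assumed here.

import numpy as np
from sandbox.rocky.tf.spaces import Product, Discrete, Box

space = Product(Discrete(3), Box(low=-1.0, high=1.0, shape=(2,)))

sample = space.sample()                  # e.g. (2, array([ 0.41, -0.73]))
flat = space.flatten(sample)             # one-hot of length 3 followed by the 2 Box entries
assert flat.shape == (space.flat_dim,)   # flat_dim == 3 + 2

recovered = space.unflatten(flat)        # back to a (discrete index, box vector) tuple
assert recovered[0] == sample[0]
assert np.allclose(recovered[1], sample[1])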
-------------------------------------------------------------------------------- /sim_cpolicy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import joblib 3 | from rllab.misc import tensor_utils 4 | import time 5 | from contextlib import contextmanager 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from sac.envs import CrossMazeAntEnv, RandomGoalAntEnv,HalfCheetahHurdleEnv 10 | from rllab.envs.normalized_env import normalize 11 | from rllab.misc import tensor_utils 12 | from sac.misc import tf_utils 13 | 14 | def rollout(env, policy,sub_level_policies,path_length=1000, render=True, speedup=10, g=2): 15 | observation = env.reset() 16 | policy.reset() 17 | 18 | t = 0 19 | obs = observation 20 | for t in range(path_length): 21 | 22 | 23 | sub_level_actions=[] 24 | if g!=0: 25 | obs=observation[:-g] 26 | else: 27 | obs=observation 28 | for i in range(0,len(sub_level_policies)): 29 | action, _ = sub_level_policies[i].get_action(obs) 30 | sub_level_actions.append(action.reshape(1,-1)) 31 | sub_level_actions=np.stack(sub_level_actions,axis=0) 32 | sub_level_actions=np.transpose(sub_level_actions,(1,0,2)) 33 | 34 | action, agent_info = policy.get_action(observation,sub_level_actions) 35 | next_obs, reward, terminal, env_info = env.step(action) 36 | 37 | 38 | observation = next_obs 39 | 40 | if render: 41 | env.render() 42 | time_step = 0.05 43 | time.sleep(time_step / speedup) 44 | 45 | if terminal: 46 | break 47 | 48 | 49 | return 0 50 | 51 | 52 | def parse_args(): 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('file', type=str, help='Path to the snapshot file.') 55 | parser.add_argument('--max-path-length', '-l', type=int, default=1000) 56 | parser.add_argument('--speedup', '-s', type=float, default=10) 57 | parser.add_argument('--domain',type=str,default='ant-cross-maze') 58 | parser.add_argument('--deterministic', '-d', dest='deterministic', 59 | action='store_true') 60 | parser.add_argument('--no-deterministic', '-nd', dest='deterministic', 61 | action='store_false') 62 | parser.add_argument('--policy_h', type=int) 63 | parser.set_defaults(deterministic=True) 64 | 65 | args = parser.parse_args() 66 | 67 | return args 68 | 69 | def load_low_level_policy(policy_path=None,name=None): 70 | with tf_utils.get_default_session().as_default(): 71 | with tf.variable_scope(name, reuse=False): 72 | snapshot = joblib.load(policy_path) 73 | 74 | policy = snapshot["policy"] 75 | return policy 76 | 77 | 78 | def simulate_policy_ant(args): 79 | sub_level_policies=[] 80 | with tf.Session() as sess: 81 | with tf.variable_scope("fwrd", reuse=False): 82 | fwrd = joblib.load("primitive-policies/ant/fwrd/fwrd.pkl") 83 | with tf.variable_scope("bwrd", reuse=False): 84 | bwrd = joblib.load("primitive-policies/ant/bwrd/bwrd.pkl") 85 | with tf.variable_scope("uwrd", reuse=False): 86 | uwrd = joblib.load("primitive-policies/ant/uwrd/uwrd.pkl") 87 | with tf.variable_scope("dwrd", reuse=False): 88 | dwrd = joblib.load("primitive-policies/ant/dwrd/dwrd.pkl") 89 | sub_level_policies.append(fwrd["policy"]) 90 | sub_level_policies.append(bwrd["policy"]) 91 | sub_level_policies.append(uwrd["policy"]) 92 | sub_level_policies.append(dwrd["policy"]) 93 | data = joblib.load(args.file) 94 | if 'algo' in data.keys(): 95 | policy = data['algo'].policy 96 | env = data['algo'].env 97 | else: 98 | policy = data['policy'] 99 | env = data['env'] 100 | with policy.deterministic(args.deterministic): 101 | while True: 102 | path = rollout(env, 
policy,sub_level_policies,path_length=args.max_path_length,g=2) 103 | 104 | def simulate_policy_pusher(args): 105 | sub_level_policies=[] 106 | with tf.Session() as sess: 107 | with tf.variable_scope("bottom", reuse=False): 108 | btm = joblib.load("primitive-policies/pusher/bottom/bottom.pkl") 109 | with tf.variable_scope("jump", reuse=False): 110 | lft = joblib.load("primitive-policies/pusher/left/left.pkl") 111 | sub_level_policies.append(btm["policy"]) 112 | sub_level_policies.append(lft["policy"]) 113 | data = joblib.load(args.file) 114 | if 'algo' in data.keys(): 115 | policy = data['algo'].policy 116 | env = data['algo'].env 117 | else: 118 | policy = data['policy'] 119 | env =data['env'] 120 | with policy.deterministic(args.deterministic): 121 | while True: 122 | path = rollout(env, policy,sub_level_policies,path_length=args.max_path_length,g=0) 123 | 124 | def simulate_policy_hch(args): 125 | sub_level_policies=[] 126 | with tf.Session() as sess: 127 | with tf.variable_scope("fwrd", reuse=False): 128 | fwrd = joblib.load("primitive-policies/hc/fwd/fwd.pkl") 129 | with tf.variable_scope("jump", reuse=False): 130 | jmp = joblib.load("primitive-policies/hc/jp-longz/jump.pkl") 131 | sub_level_policies.append(fwrd["policy"]) 132 | sub_level_policies.append(jmp["policy"]) 133 | data = joblib.load(args.file) 134 | if 'algo' in data.keys(): 135 | policy = data['algo'].policy 136 | env = data['algo'].env 137 | else: 138 | policy = data['policy'] 139 | env = normalize(HalfCheetahHurdleEnv()) #data['env'] 140 | with policy.deterministic(args.deterministic): 141 | while True: 142 | path = rollout(env, policy,sub_level_policies,path_length=args.max_path_length, g=2) 143 | 144 | if __name__ == "__main__": 145 | args = parse_args() 146 | if args.domain=='ant-cross-maze' or args.domain=='ant-random-goal': 147 | simulate_policy_ant(args) 148 | if args.domain=='cheetah-hurdle': 149 | simulate_policy_hch(args) 150 | if args.domain=='pusher': 151 | simulate_policy_pusher(args) 152 | -------------------------------------------------------------------------------- /sim_policy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import joblib 4 | import tensorflow as tf 5 | 6 | from rllab.sampler.utils import rollout 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('file', type=str, help='Path to the snapshot file.') 11 | parser.add_argument('--max-path-length', '-l', type=int, default=1000) 12 | parser.add_argument('--speedup', '-s', type=float, default=10) 13 | parser.add_argument('--deterministic', '-d', dest='deterministic', 14 | action='store_true') 15 | parser.add_argument('--no-deterministic', '-nd', dest='deterministic', 16 | action='store_false') 17 | parser.add_argument('--policy_h', type=int) 18 | parser.set_defaults(deterministic=True) 19 | 20 | args = parser.parse_args() 21 | 22 | return args 23 | 24 | def simulate_policy(args): 25 | with tf.Session() as sess: 26 | data = joblib.load(args.file) 27 | print(data.keys()) 28 | if 'algo' in data.keys(): 29 | policy = data['algo'].policy 30 | env = data['algo'].env 31 | else: 32 | policy = data['policy'] 33 | env = data['env'] 34 | print(policy) 35 | with policy.deterministic(args.deterministic): 36 | while True: 37 | path = rollout(env, policy, 38 | max_path_length=args.max_path_length, 39 | animated=True, speedup=args.speedup) 40 | if __name__ == "__main__": 41 | args = parse_args() 42 | simulate_policy(args) 43 | 
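Usage sketch (not part of the repository): both scripts above are launched with a snapshot path, for example: python sim_cpolicy.py path/to/snapshot.pkl --domain cheetah-hurdle --max-path-length 1000. The core of sim_cpolicy's rollout is the action-stacking step: each primitive policy sees the observation with the trailing g goal entries removed, and the composite policy receives the stacked primitive actions. The snippet below isolates that step with a stand-in DummyPolicy and illustrative dimensions.

import numpy as np

class DummyPolicy:
    # Stand-in for a loaded primitive policy; only get_action is needed here.
    def __init__(self, action_dim):
        self.action_dim = action_dim

    def get_action(self, obs):
        return np.zeros(self.action_dim), {}

g = 2                                     # number of trailing goal coordinates to strip
observation = np.random.randn(29)         # illustrative observation with goal appended
sub_level_policies = [DummyPolicy(8) for _ in range(4)]

obs = observation[:-g] if g != 0 else observation
sub_level_actions = [p.get_action(obs)[0].reshape(1, -1) for p in sub_level_policies]
sub_level_actions = np.stack(sub_level_actions, axis=0)           # shape (K, 1, action_dim)
sub_level_actions = np.transpose(sub_level_actions, (1, 0, 2))    # shape (1, K, action_dim)
# The composite policy is then queried as in rollout():
#   action, agent_info = policy.get_action(observation, sub_level_actions)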
-------------------------------------------------------------------------------- /value_functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .value_function import NNVFunction, NNQFunction, NNDiscriminatorFunction 2 | -------------------------------------------------------------------------------- /value_functions/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/value_functions/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /value_functions/__pycache__/value_function.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/value_functions/__pycache__/value_function.cpython-35.pyc -------------------------------------------------------------------------------- /value_functions/value_function.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rllab.core.serializable import Serializable 4 | 5 | from sac.misc.mlp import MLPFunction 6 | from sac.misc import tf_utils 7 | 8 | class NNVFunction(MLPFunction): 9 | 10 | def __init__(self, env_spec, hidden_layer_sizes=(100, 100), name='vf'): 11 | Serializable.quick_init(self, locals()) 12 | 13 | self._Do = env_spec.observation_space.flat_dim 14 | self._obs_pl = tf.placeholder( 15 | tf.float32, 16 | shape=[None, self._Do], 17 | name='observation', 18 | ) 19 | 20 | super(NNVFunction, self).__init__( 21 | name, (self._obs_pl,), hidden_layer_sizes) 22 | 23 | 24 | class NNQFunction(MLPFunction): 25 | def __init__(self, env_spec, hidden_layer_sizes=(100, 100), name='qf'): 26 | Serializable.quick_init(self, locals()) 27 | 28 | self._Da = env_spec.action_space.flat_dim 29 | self._Do = env_spec.observation_space.flat_dim 30 | 31 | self._obs_pl = tf.placeholder( 32 | tf.float32, 33 | shape=[None, self._Do], 34 | name='observation', 35 | ) 36 | 37 | self._action_pl = tf.placeholder( 38 | tf.float32, 39 | shape=[None, self._Da], 40 | name='actions', 41 | ) 42 | 43 | super(NNQFunction, self).__init__( 44 | name, (self._obs_pl, self._action_pl), hidden_layer_sizes) 45 | 46 | 47 | class NNDiscriminatorFunction(MLPFunction): 48 | def __init__(self, env_spec, hidden_layer_sizes=(100, 100), num_skills=None): 49 | assert num_skills is not None 50 | Serializable.quick_init(self, locals()) 51 | Parameterized.__init__(self) 52 | 53 | self._Da = env_spec.action_space.flat_dim 54 | self._Do = env_spec.observation_space.flat_dim 55 | 56 | self._obs_pl = tf.placeholder( 57 | tf.float32, 58 | shape=[None, self._Do], 59 | name='observation', 60 | ) 61 | self._action_pl = tf.placeholder( 62 | tf.float32, 63 | shape=[None, self._Da], 64 | name='actions', 65 | ) 66 | 67 | self._name = 'discriminator' 68 | self._input_pls = (self._obs_pl, self._action_pl) 69 | self._layer_sizes = list(hidden_layer_sizes) + [num_skills] 70 | self._output_t = self.get_output_for(*self._input_pls) 71 | --------------------------------------------------------------------------------
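Usage sketch (not part of the repository): the value-function classes above only build input placeholders and delegate network construction to sac.misc.mlp.MLPFunction, so they are instantiated directly from an environment spec. The import paths below assume the repository is importable as the sac package, as its own modules (e.g. sim_cpolicy.py) already assume; the environment and layer sizes are illustrative.

import tensorflow as tf
from rllab.envs.normalized_env import normalize
from sac.envs import HalfCheetahHurdleEnv
from sac.value_functions import NNQFunction, NNVFunction

env = normalize(HalfCheetahHurdleEnv())

qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(100, 100), name='qf')
vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(100, 100), name='vf')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # An algorithm such as SAC feeds observations (and, for qf, actions)
    # through the placeholders these constructors created.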