├── LICENSE
├── README.md
├── algos
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   ├── base.cpython-35.pyc
│   │   ├── diayn.cpython-35.pyc
│   │   └── sac.cpython-35.pyc
│   ├── base.py
│   └── sac.py
├── core
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   └── serializable.cpython-35.pyc
│   └── serializable.py
├── distributions
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   ├── gmm.cpython-35.pyc
│   │   ├── normal.cpython-35.pyc
│   │   └── real_nvp_bijector.cpython-35.pyc
│   ├── gmm.py
│   ├── normal.py
│   └── real_nvp_bijector.py
├── environments
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   ├── delayed_env.cpython-35.pyc
│   │   ├── gym_env.cpython-35.pyc
│   │   ├── multigoal.cpython-35.pyc
│   │   └── pusher.cpython-35.pyc
│   ├── delayed_env.py
│   ├── gym_env.py
│   ├── multigoal.py
│   └── pusher.py
├── envs
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   ├── cheetah_hurdle_env.cpython-35.pyc
│   │   ├── cross_maze_ant_env.cpython-35.pyc
│   │   ├── gym_env.cpython-35.pyc
│   │   ├── helpers.cpython-35.pyc
│   │   ├── hierarchy_proxy_env.cpython-35.pyc
│   │   ├── multi_direction_env.cpython-35.pyc
│   │   ├── multigoal.cpython-35.pyc
│   │   ├── pusher.cpython-35.pyc
│   │   ├── random_goal_ant_env.cpython-35.pyc
│   │   └── simple_maze_ant_env.cpython-35.pyc
│   ├── cheetah_hurdle_env.py
│   ├── cross_maze_ant_env.py
│   ├── delayed_env.py
│   ├── gym_env.py
│   ├── helpers.py
│   ├── hierarchy_proxy_env.py
│   ├── meta_env.py
│   ├── multi_direction_env.py
│   ├── multigoal.py
│   ├── pusher.py
│   ├── random_goal_ant_env.py
│   └── simple_maze_ant_env.py
├── misc
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   ├── instrument.cpython-35.pyc
│   │   ├── mlp.cpython-35.pyc
│   │   ├── plotter.cpython-35.pyc
│   │   ├── sampler.cpython-35.pyc
│   │   ├── tf_utils.cpython-35.pyc
│   │   └── utils.cpython-35.pyc
│   ├── instrument.py
│   ├── mlp.py
│   ├── plotter.py
│   ├── remote_sampler.py
│   ├── replay_pool.py
│   ├── sampler.py
│   ├── tf_utils.py
│   └── utils.py
├── mujoco_am_sac.py
├── mujoco_models
│   ├── cross_maze_ant.xml
│   ├── half_cheetah_hurdle.xml
│   ├── pusher_2d.xml
│   └── simple_maze_ant.xml
├── policies
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   ├── base.cpython-35.pyc
│   │   ├── gaussian_policy.cpython-35.pyc
│   │   ├── gmm.cpython-35.pyc
│   │   ├── hierarchical_policy.cpython-35.pyc
│   │   ├── latent_space_policy.cpython-35.pyc
│   │   ├── nn_policy.cpython-35.pyc
│   │   ├── nn_policy2.cpython-35.pyc
│   │   ├── pointer_policy.cpython-35.pyc
│   │   └── uniform_policy.cpython-35.pyc
│   ├── base.py
│   ├── gaussian_policy.py
│   ├── nn_policy.py
│   ├── nn_policy2.py
│   ├── pointer_policy.py
│   └── uniform_policy.py
├── preprocessors
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   └── mlp_preprocessor.cpython-35.pyc
│   └── mlp_preprocessor.py
├── primitive-policies
│   ├── ant
│   │   ├── bwrd
│   │   │   └── bwrd.pkl
│   │   ├── dwrd
│   │   │   └── dwrd.pkl
│   │   ├── fwrd
│   │   │   └── fwrd.pkl
│   │   └── uwrd
│   │       └── uwrd.pkl
│   ├── hc
│   │   ├── fwd
│   │   │   └── fwd.pkl
│   │   └── jp-longz
│   │       └── jump.pkl
│   └── pusher
│       ├── bottom
│       │   └── bottom.pkl
│       └── left
│           └── left.pkl
├── replay_buffers
│   ├── __init__.py
│   ├── __pycache__
│   │   ├── __init__.cpython-35.pyc
│   │   ├── replay_buffer.cpython-35.pyc
│   │   └── simple_replay_buffer.cpython-35.pyc
│   ├── replay_buffer.py
│   └── simple_replay_buffer.py
├── sandbox
│   ├── __pycache__
│   │   └── __init__.cpython-35.pyc
│   └── rocky
│       ├── __pycache__
│       │   └── __init__.cpython-35.pyc
│       └── tf
│           ├── __pycache__
│           │   └── __init__.cpython-35.pyc
│           ├── algos
│           │   ├── __init__.py
│           │   ├── __pycache__
│           │   │   ├── __init__.cpython-35.pyc
│           │   │   ├── batch_polopt.cpython-35.pyc
│           │   │   ├── npo.cpython-35.pyc
│           │   │   └── trpo.cpython-35.pyc
│           │   ├── batch_polopt.py
│           │   ├── npg.py
│           │   ├── npo.py
│           │   ├── trpo.py
│           │   └── vpg.py
│           ├── core
│           │   ├── __init__.py
│           │   ├── __pycache__
│           │   │   ├── __init__.cpython-35.pyc
│           │   │   ├── layers.cpython-35.pyc
│           │   │   ├── layers_powered.cpython-35.pyc
│           │   │   ├── network.cpython-35.pyc
│           │   │   └── parameterized.cpython-35.pyc
│           │   ├── layers.py
│           │   ├── layers_powered.py
│           │   ├── network.py
│           │   └── parameterized.py
│           ├── distributions
│           │   ├── __init__.py
│           │   ├── __pycache__
│           │   │   ├── __init__.cpython-35.pyc
│           │   │   ├── base.cpython-35.pyc
│           │   │   └── diagonal_gaussian.cpython-35.pyc
│           │   ├── base.py
│           │   ├── bernoulli.py
│           │   ├── categorical.py
│           │   ├── diagonal_gaussian.py
│           │   ├── recurrent_categorical.py
│           │   └── recurrent_diagonal_gaussian.py
│           ├── envs
│           │   ├── __init__.py
│           │   ├── __pycache__
│           │   │   ├── __init__.cpython-35.pyc
│           │   │   ├── base.cpython-35.pyc
│           │   │   ├── parallel_vec_env_executor.cpython-35.pyc
│           │   │   └── vec_env_executor.cpython-35.pyc
│           │   ├── base.py
│           │   ├── parallel_vec_env_executor.py
│           │   └── vec_env_executor.py
│           ├── launchers
│           │   ├── __init__.py
│           │   ├── trpo_cartpole.py
│           │   ├── trpo_cartpole_recurrent.py
│           │   └── vpg_cartpole.py
│           ├── misc
│           │   ├── __init__.py
│           │   ├── __pycache__
│           │   │   ├── __init__.cpython-35.pyc
│           │   │   └── tensor_utils.cpython-35.pyc
│           │   └── tensor_utils.py
│           ├── optimizers
│           │   ├── __init__.py
│           │   ├── __pycache__
│           │   │   ├── __init__.cpython-35.pyc
│           │   │   ├── conjugate_gradient_optimizer.cpython-35.pyc
│           │   │   └── penalty_lbfgs_optimizer.cpython-35.pyc
│           │   ├── conjugate_gradient_optimizer.py
│           │   ├── first_order_optimizer.py
│           │   ├── lbfgs_optimizer.py
│           │   └── penalty_lbfgs_optimizer.py
│           ├── policies
│           │   ├── __init__.py
│           │   ├── __pycache__
│           │   │   ├── __init__.cpython-35.pyc
│           │   │   ├── base.cpython-35.pyc
│           │   │   ├── gaussian_mlp_inverse_policy.cpython-35.pyc
│           │   │   └── gaussian_mlp_policy.cpython-35.pyc
│           │   ├── base.py
│           │   ├── categorical_conv_policy.py
│           │   ├── categorical_gru_policy.py
│           │   ├── categorical_lstm_policy.py
│           │   ├── categorical_mlp_policy.py
│           │   ├── deterministic_mlp_policy.py
│           │   ├── gaussian_gru_policy.py
│           │   ├── gaussian_lstm_policy.py
│           │   ├── gaussian_mlp_inverse_policy.py
│           │   ├── gaussian_mlp_policy.py
│           │   └── uniform_control_policy.py
│           ├── q_functions
│           │   ├── base.py
│           │   └── continuous_mlp_q_function.py
│           ├── regressors
│           │   ├── __init__.py
│           │   ├── bernoulli_mlp_regressor.py
│           │   ├── categorical_mlp_regressor.py
│           │   ├── deterministic_mlp_regressor.py
│           │   └── gaussian_mlp_regressor.py
│           ├── samplers
│           │   ├── __init__.py
│           │   ├── __pycache__
│           │   │   ├── __init__.cpython-35.pyc
│           │   │   ├── base.cpython-35.pyc
│           │   │   ├── batch_sampler.cpython-35.pyc
│           │   │   └── vectorized_sampler.cpython-35.pyc
│           │   ├── batch_sampler.py
│           │   └── vectorized_sampler.py
│           └── spaces
│               ├── __init__.py
│               ├── __pycache__
│               │   ├── __init__.cpython-35.pyc
│               │   ├── box.cpython-35.pyc
│               │   ├── discrete.cpython-35.pyc
│               │   └── product.cpython-35.pyc
│               ├── box.py
│               ├── discrete.py
│               └── product.py
├── sim_cpolicy.py
├── sim_policy.py
└── value_functions
    ├── __init__.py
    ├── __pycache__
    │   ├── __init__.cpython-35.pyc
    │   └── value_function.cpython-35.pyc
    └── value_function.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Ahmed Qureshi
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # COMPOSING TASK-AGNOSTIC POLICIES WITH DEEP REINFORCEMENT LEARNING
2 |
3 |
4 | * Requirements:
 5 | 1. rllab
 6 | 2. TensorFlow
 7 | 3. MuJoCo
8 |
9 |
10 | ## To train the composite model from scratch:
11 |
12 | 1. To train in the "ant-cross-maze" domain, run:
13 |
14 | ```python mujoco_am_sac.py --log_dir="/path-to-crl-code-folder/composition_sac_code/ant-maze" --domain="ant-cross-maze"```
15 |
16 | 2. To train in the "ant-random-goal" domain, run:
17 |
18 | ```python mujoco_am_sac.py --log_dir="/path-to-crl-code-folder/composition_sac_code/ant-rgoal" --domain="ant-random-goal"```
19 |
20 | 3. To train in the "cheetah-hurdle" domain, run:
21 |
22 | ```python mujoco_am_sac.py --log_dir="/path-to-crl-code-folder/composition_sac_code/cheetah-hurdle" --domain="cheetah-hurdle"```
23 |
24 | 4. To train in the "pusher" domain, run:
25 |
26 | ```python mujoco_am_sac.py --log_dir="/path-to-crl-code-folder/composition_sac_code/pusher" --domain="pusher"```
27 |
28 |
29 |
30 |
31 |
32 | ## References
33 | ```
34 | @inproceedings{
35 | qureshi2020composing,
36 | title={Composing Task-Agnostic Policies with Deep Reinforcement Learning},
37 | author={Ahmed H. Qureshi and Jacob J. Johnson and Yuzhe Qin and Taylor Henderson and Byron Boots and Michael C. Yip},
38 | booktitle={International Conference on Learning Representations},
39 | year={2020},
40 | url={https://openreview.net/forum?id=H1ezFREtwH}
41 | }
42 | ```
43 |
--------------------------------------------------------------------------------
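Note on the README commands above: `mujoco_am_sac.py` itself is not reproduced in this section, so the snippet below is only a minimal sketch under the assumption that the script parses its two documented flags with `argparse`. The flag names `--log_dir` and `--domain` and the four domain strings come straight from the README; everything else is illustrative.

```python
# Hedged sketch only -- not the repository's actual argument handling.
import argparse

parser = argparse.ArgumentParser(description='Train a composite SAC policy.')
parser.add_argument('--log_dir', type=str, required=True,
                    help='Directory for training logs and snapshots.')
parser.add_argument('--domain', type=str, default='ant-cross-maze',
                    choices=['ant-cross-maze', 'ant-random-goal',
                             'cheetah-hurdle', 'pusher'],
                    help='Composition task to train on.')

# Parse one of the README commands programmatically, just to show the argument shape.
args = parser.parse_args(['--log_dir', '/tmp/ant-maze', '--domain', 'ant-cross-maze'])
print(args.log_dir, args.domain)
```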
/algos/__init__.py:
--------------------------------------------------------------------------------
1 | from .sac import SAC
2 |
--------------------------------------------------------------------------------
/algos/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/algos/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/algos/__pycache__/base.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/algos/__pycache__/base.cpython-35.pyc
--------------------------------------------------------------------------------
/algos/__pycache__/diayn.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/algos/__pycache__/diayn.cpython-35.pyc
--------------------------------------------------------------------------------
/algos/__pycache__/sac.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/algos/__pycache__/sac.cpython-35.pyc
--------------------------------------------------------------------------------
/core/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/core/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/core/__pycache__/serializable.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/core/__pycache__/serializable.cpython-35.pyc
--------------------------------------------------------------------------------
/core/serializable.py:
--------------------------------------------------------------------------------
1 | from rllab.core.serializable import Serializable
2 |
3 |
4 | def deep_clone(obj):
5 | assert isinstance(obj, Serializable)
6 |
7 | def maybe_deep_clone(o):
8 | if isinstance(o, Serializable):
9 | return deep_clone(o)
10 | else:
11 | return o
12 |
13 | d = obj.__getstate__()
14 | for key, val in d.items():
15 | d[key] = maybe_deep_clone(val)
16 |
17 | d['__args'] = list(d['__args']) # Make args mutable.
18 | for i, val in enumerate(d['__args']):
19 | d['__args'][i] = maybe_deep_clone(val)
20 |
21 | for key, val in d['__kwargs'].items():
22 | d['__kwargs'][key] = maybe_deep_clone(val)
23 |
24 | out = type(obj).__new__(type(obj))
25 | # noinspection PyArgumentList
26 | out.__setstate__(d)
27 |
28 | return out
29 |
--------------------------------------------------------------------------------
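`deep_clone` above rebuilds a `Serializable` object (and any `Serializable` values nested in its constructor arguments) from the state recorded by `Serializable.quick_init`. A minimal usage sketch, assuming rllab is installed and the repository is importable as the `sac` package (as its own `from sac.misc.mlp import ...` imports suggest); `ReplayPoolConfig` is a hypothetical class used purely for illustration:

```python
from rllab.core.serializable import Serializable

from sac.core.serializable import deep_clone  # import path assumes the repo is installed as `sac`


class ReplayPoolConfig(Serializable):
    """Hypothetical example class; any Serializable subclass works the same way."""

    def __init__(self, max_size, obs_dim):
        # quick_init records the constructor arguments so the object can be rebuilt later.
        Serializable.quick_init(self, locals())
        self.max_size = max_size
        self.obs_dim = obs_dim


config = ReplayPoolConfig(max_size=1000000, obs_dim=27)
clone = deep_clone(config)

assert clone is not config
assert clone.max_size == config.max_size and clone.obs_dim == config.obs_dim
```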
/distributions/__init__.py:
--------------------------------------------------------------------------------
1 | from .normal import Normal
2 | from .gmm import GMM
3 | from .real_nvp_bijector import RealNVPBijector
4 |
--------------------------------------------------------------------------------
/distributions/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/distributions/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/distributions/__pycache__/gmm.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/distributions/__pycache__/gmm.cpython-35.pyc
--------------------------------------------------------------------------------
/distributions/__pycache__/normal.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/distributions/__pycache__/normal.cpython-35.pyc
--------------------------------------------------------------------------------
/distributions/__pycache__/real_nvp_bijector.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/distributions/__pycache__/real_nvp_bijector.cpython-35.pyc
--------------------------------------------------------------------------------
/distributions/gmm.py:
--------------------------------------------------------------------------------
1 | """ Gaussian mixture model. """
2 |
3 | import tensorflow as tf
4 | import numpy as np
5 |
6 | from sac.misc.mlp import mlp
7 |
8 | LOG_SIG_CAP_MAX = 2
9 | LOG_SIG_CAP_MIN = -20
10 |
11 |
12 | class GMM(object):
13 | def __init__(
14 | self,
15 | K,
16 | Dx,
17 | hidden_layers_sizes=(100, 100),
18 | reg=0.001,
19 | reparameterize=True,
20 | cond_t_lst=(),
21 | ):
22 | self._cond_t_lst = cond_t_lst
23 | self._reg = reg
24 | self._layer_sizes = list(hidden_layers_sizes) + [K * (2 * Dx + 1)]
25 | self._reparameterize = reparameterize
26 |
27 | self._Dx = Dx
28 | self._K = K
29 |
30 | self._create_placeholders()
31 | self._create_graph()
32 |
33 | def _create_placeholders(self):
34 | self._N_pl = tf.placeholder(
35 | tf.int32,
36 | shape=(),
37 | name='N',
38 | )
39 |
40 | @staticmethod
41 | def _create_log_gaussian(mu_t, log_sig_t, t):
42 | normalized_dist_t = (t - mu_t) * tf.exp(-log_sig_t) # ... x D
43 | quadratic = - 0.5 * tf.reduce_sum(normalized_dist_t ** 2, axis=-1)
44 | # ... x (None)
45 |
46 | log_z = tf.reduce_sum(log_sig_t, axis=-1) # ... x (None)
47 | D_t = tf.cast(tf.shape(mu_t)[-1], tf.float32)
48 | log_z += 0.5 * D_t * np.log(2 * np.pi)
49 |
50 | log_p = quadratic - log_z
51 |
52 | return log_p # ... x (None)
53 |
54 | def _create_p_xz_params(self):
55 | K = self._K
56 | Dx = self._Dx
57 |
58 | if len(self._cond_t_lst) == 0:
59 | w_and_mu_and_logsig_t = tf.get_variable(
60 | 'params', self._layer_sizes[-1],
61 | initializer=tf.random_normal_initializer(0, 0.1)
62 | )
63 |
64 | else:
65 | w_and_mu_and_logsig_t = mlp(
66 | inputs=self._cond_t_lst,
67 | layer_sizes=self._layer_sizes,
68 | output_nonlinearity=None,
69 | ) # ... x K*Dx*2+K
70 |
71 | w_and_mu_and_logsig_t = tf.reshape(
72 | w_and_mu_and_logsig_t, shape=(-1, K, 2*Dx+1))
73 |
74 | log_w_t = w_and_mu_and_logsig_t[..., 0]
75 | mu_t = w_and_mu_and_logsig_t[..., 1:1+Dx]
76 | log_sig_t = w_and_mu_and_logsig_t[..., 1+Dx:]
77 |
78 | log_sig_t = tf.clip_by_value(log_sig_t, LOG_SIG_CAP_MIN, LOG_SIG_CAP_MAX)
79 |
80 | return log_w_t, mu_t, log_sig_t
81 |
82 | def _create_graph(self):
83 | Dx = self._Dx
84 |
85 | if len(self._cond_t_lst) > 0:
86 | N_t = tf.shape(self._cond_t_lst[0])[0]
87 | else:
88 | N_t = self._N_pl
89 |
90 | K = self._K
91 |
92 | # Create p(x|z).
93 | with tf.variable_scope('p'):
94 | log_ws_t, xz_mus_t, xz_log_sigs_t = self._create_p_xz_params()
95 | # (N x K), (N x K x Dx), (N x K x Dx)
96 | xz_sigs_t = tf.exp(xz_log_sigs_t)
97 |
98 | # Sample the latent code.
99 | z_t = tf.multinomial(logits=log_ws_t, num_samples=1) # N x 1
100 |
101 | # Choose mixture component corresponding to the latent.
102 | mask_t = tf.one_hot(
103 | z_t[:, 0], depth=K, dtype=tf.bool,
104 | on_value=True, off_value=False
105 | )
106 | xz_mu_t = tf.boolean_mask(xz_mus_t, mask_t) # N x Dx
107 | xz_sig_t = tf.boolean_mask(xz_sigs_t, mask_t) # N x Dx
108 |
109 | # Sample x.
110 | x_t = xz_mu_t + xz_sig_t * tf.random_normal((N_t, Dx)) # N x Dx
111 | if not self._reparameterize:
112 | x_t = tf.stop_gradient(x_t)
113 |
114 | # log p(x|z)
115 | log_p_xz_t = self._create_log_gaussian(
116 | xz_mus_t, xz_log_sigs_t, x_t[:, None, :]
117 | ) # N x K
118 |
119 | # log p(x)
120 | log_p_x_t = tf.reduce_logsumexp(log_p_xz_t + log_ws_t, axis=1)
121 | log_p_x_t -= tf.reduce_logsumexp(log_ws_t, axis=1) # N
122 |
123 | reg_loss_t = 0
124 | reg_loss_t += self._reg * 0.5 * tf.reduce_mean(xz_log_sigs_t ** 2)
125 | reg_loss_t += self._reg * 0.5 * tf.reduce_mean(xz_mus_t ** 2)
126 |
127 | self._log_p_x_t = log_p_x_t
128 | self._reg_loss_t = reg_loss_t
129 | self._x_t = x_t
130 |
131 | self._log_ws_t = log_ws_t
132 | self._mus_t = xz_mus_t
133 | self._log_sigs_t = xz_log_sigs_t
134 |
135 | @property
136 | def log_p_t(self):
137 | return self._log_p_x_t
138 |
139 | @property
140 | def reg_loss_t(self):
141 | return self._reg_loss_t
142 |
143 | @property
144 | def x_t(self):
145 | return self._x_t
146 |
147 | @property
148 | def mus_t(self):
149 | return self._mus_t
150 |
151 | @property
152 | def log_sigs_t(self):
153 | return self._log_sigs_t
154 |
155 | @property
156 | def log_ws_t(self):
157 | return self._log_ws_t
158 |
159 | @property
160 | def N_t(self):
161 | return self._N_pl
162 |
--------------------------------------------------------------------------------
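A short, hedged usage sketch for the conditional `GMM` above (TF 1.x; the `sac` import path is an assumption based on the module's own `from sac.misc.mlp import mlp`): condition the mixture on an observation placeholder, then sample actions together with their log-probabilities.

```python
import numpy as np
import tensorflow as tf

from sac.distributions.gmm import GMM  # import path assumed

# Condition the mixture on an 8-dimensional observation placeholder.
obs_ph = tf.placeholder(tf.float32, shape=(None, 8), name='observations')

with tf.variable_scope('example_gmm'):
    gmm = GMM(K=4, Dx=2, hidden_layers_sizes=(100, 100), cond_t_lst=(obs_ph,))

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    samples, log_p = sess.run(
        [gmm.x_t, gmm.log_p_t],
        feed_dict={obs_ph: np.zeros((16, 8), dtype=np.float32)})
    print(samples.shape, log_p.shape)  # (16, 2) (16,)
```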
/distributions/normal.py:
--------------------------------------------------------------------------------
1 | """ Multivariate normal distribution with mean and std deviation outputted by a neural net """
2 |
3 | import tensorflow as tf
4 | import numpy as np
5 |
6 | from sac.misc.mlp import mlp
7 |
8 | LOG_SIG_CAP_MAX = 2
9 | LOG_SIG_CAP_MIN = -20
10 |
11 |
12 | class Normal(object):
13 | def __init__(
14 | self,
15 | Dx,
16 | hidden_layers_sizes=(100, 100),
17 | reg=0.001,
18 | reparameterize=True,
19 | cond_t_lst=(),
20 | ):
21 | self._cond_t_lst = cond_t_lst
22 | self._reg = reg
23 | self._layer_sizes = list(hidden_layers_sizes) + [2 * Dx]
24 | print(self._layer_sizes)
25 | self._reparameterize = reparameterize
26 |
27 | self._Dx = Dx
28 |
29 | self._create_placeholders()
30 | self._create_graph()
31 |
32 | def _create_placeholders(self):
33 | self._N_pl = tf.placeholder(
34 | tf.int32,
35 | shape=(),
36 | name='N',
37 | )
38 |
39 | def _create_graph(self):
40 | Dx = self._Dx
41 |
42 | if len(self._cond_t_lst) == 0:
43 | mu_and_logsig_t = tf.get_variable(
44 | 'params', self._layer_sizes[-1],
45 | initializer=tf.random_normal_initializer(0, 0.1)
46 | )
47 | else:
48 | mu_and_logsig_t = mlp(
49 | inputs=self._cond_t_lst,
50 | layer_sizes=self._layer_sizes,
51 | output_nonlinearity=None,
52 | ) # ... x 2*Dx
53 |
54 | self._mu_t = mu_and_logsig_t[..., :Dx]
55 | self._log_sig_t = tf.clip_by_value(mu_and_logsig_t[..., Dx:], LOG_SIG_CAP_MIN, LOG_SIG_CAP_MAX)
56 |
57 | # Tensorflow's multivariate normal distribution supports reparameterization
58 | ds = tf.contrib.distributions
59 | dist = ds.MultivariateNormalDiag(loc=self._mu_t, scale_diag=tf.exp(self._log_sig_t))
60 | x_t = dist.sample()
61 | if not self._reparameterize:
62 | x_t = tf.stop_gradient(x_t)
63 | log_pi_t = dist.log_prob(x_t)
64 |
65 | self._dist = dist
66 | self._x_t = x_t
67 | self._log_pi_t = log_pi_t
68 |
69 | reg_loss_t = self._reg * 0.5 * tf.reduce_mean(self._log_sig_t ** 2)
70 | reg_loss_t += self._reg * 0.5 * tf.reduce_mean(self._mu_t ** 2)
71 | self._reg_loss_t = reg_loss_t
72 |
73 |
74 |
75 | @property
76 | def log_p_t(self):
77 | return self._log_pi_t
78 |
79 | @property
80 | def reg_loss_t(self):
81 | return self._reg_loss_t
82 |
83 | @property
84 | def x_t(self):
85 | return self._x_t
86 |
87 | @property
88 | def mu_t(self):
89 | return self._mu_t
90 |
91 | @property
92 | def log_sig_t(self):
93 | return self._log_sig_t
94 |
--------------------------------------------------------------------------------
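Because the sample `x_t` above is drawn with the reparameterization trick when `reparameterize=True`, gradients flow from any function of the sample back into the conditioning network; with `reparameterize=False` the `tf.stop_gradient` call blocks them. A small TF 1.x sketch of that property (the `sac` import path is assumed, as above):

```python
import tensorflow as tf

from sac.distributions.normal import Normal  # import path assumed

obs_ph = tf.placeholder(tf.float32, shape=(None, 8), name='observations')
with tf.variable_scope('example_normal'):
    dist = Normal(Dx=2, cond_t_lst=(obs_ph,), reparameterize=True)

params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope='example_normal')
grads = tf.gradients(tf.reduce_sum(dist.x_t), params)

# With reparameterize=True every gradient tensor is defined;
# with reparameterize=False they would all be None.
assert all(g is not None for g in grads)
```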
/environments/__init__.py:
--------------------------------------------------------------------------------
1 | from .multigoal import MultiGoalEnv
2 | from .gym_env import GymEnv
3 | from .delayed_env import DelayedEnv
--------------------------------------------------------------------------------
/environments/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/environments/__pycache__/delayed_env.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/delayed_env.cpython-35.pyc
--------------------------------------------------------------------------------
/environments/__pycache__/gym_env.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/gym_env.cpython-35.pyc
--------------------------------------------------------------------------------
/environments/__pycache__/multigoal.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/multigoal.cpython-35.pyc
--------------------------------------------------------------------------------
/environments/__pycache__/pusher.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/pusher.cpython-35.pyc
--------------------------------------------------------------------------------
/environments/delayed_env.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from rllab.envs.proxy_env import ProxyEnv
4 | from rllab.core.serializable import Serializable
5 |
6 |
7 | class DelayedEnv(ProxyEnv, Serializable):
8 | def __init__(self, env, delay=0.01):
9 | Serializable.quick_init(self, locals())
10 | ProxyEnv.__init__(self, env)
11 |
12 | self._delay = delay
13 |
14 | def step(self, action):
15 | time.sleep(self._delay)
16 | return self._wrapped_env.step(action)
17 |
--------------------------------------------------------------------------------
/environments/gym_env.py:
--------------------------------------------------------------------------------
1 | """ Rllab implementation with a HACK. See comment in `GymEnv.__init__`. """
2 | import gym
3 | import gym.wrappers
4 | import gym.envs
5 | import gym.spaces
6 | import traceback
7 | import logging
8 |
9 | try:
10 | from gym import logger as monitor_logger
11 |
12 | monitor_logger.setLevel(logging.WARNING)
13 | except Exception as e:
14 | traceback.print_exc()
15 |
16 | import os
17 | import os.path as osp
18 | from rllab.envs.base import Env, Step
19 | from rllab.core.serializable import Serializable
20 | from rllab.spaces.box import Box
21 | from rllab.spaces.discrete import Discrete
22 | from rllab.spaces.product import Product
23 | from rllab.misc import logger
24 |
25 |
26 | def convert_gym_space(space):
27 | if isinstance(space, gym.spaces.Box):
28 | return Box(low=space.low, high=space.high)
29 | elif isinstance(space, gym.spaces.Discrete):
30 | return Discrete(n=space.n)
31 | elif isinstance(space, gym.spaces.Tuple):
32 | return Product([convert_gym_space(x) for x in space.spaces])
33 | else:
34 | raise NotImplementedError
35 |
36 |
37 | class CappedCubicVideoSchedule(object):
38 | # Copied from gym, since this method is frequently moved around
39 | def __call__(self, count):
40 | if count < 1000:
41 | return int(round(count ** (1. / 3))) ** 3 == count
42 | else:
43 | return count % 1000 == 0
44 |
45 |
46 | class FixedIntervalVideoSchedule(object):
47 | def __init__(self, interval):
48 | self.interval = interval
49 |
50 | def __call__(self, count):
51 | return count % self.interval == 0
52 |
53 |
54 | class NoVideoSchedule(object):
55 | def __call__(self, count):
56 | return False
57 |
58 |
59 | class GymEnv(Env, Serializable):
60 | def __init__(self, env_name, record_video=False, video_schedule=None, log_dir=None, record_log=False,
61 | force_reset=True):
62 | if log_dir is None:
63 | if logger.get_snapshot_dir() is None:
64 | logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
65 | else:
66 | log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
67 | Serializable.quick_init(self, locals())
68 |
69 | env = gym.envs.make(env_name)
70 |
71 | # HACK: Gets rid of the TimeLimit wrapper that sets 'done = True' when
72 | # the time limit specified for each environment has been passed and
73 | # therefore the environment is not Markovian (terminal condition depends
74 | # on time rather than state).
75 | env = env.env
76 |
77 | self.env = env
78 | self.env_id = env.spec.id
79 |
80 | assert not (not record_log and record_video)
81 |
82 | if log_dir is None or record_log is False:
83 | self.monitoring = False
84 | else:
85 | if not record_video:
86 | video_schedule = NoVideoSchedule()
87 | else:
88 | if video_schedule is None:
89 | video_schedule = CappedCubicVideoSchedule()
90 | self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True)
91 | self.monitoring = True
92 |
93 | self._observation_space = convert_gym_space(env.observation_space)
94 | logger.log("observation space: {}".format(self._observation_space))
95 | self._action_space = convert_gym_space(env.action_space)
96 | logger.log("action space: {}".format(self._action_space))
97 | self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps']
98 | self._log_dir = log_dir
99 | self._force_reset = force_reset
100 |
101 | @property
102 | def observation_space(self):
103 | return self._observation_space
104 |
105 | @property
106 | def action_space(self):
107 | return self._action_space
108 |
109 | @property
110 | def horizon(self):
111 | return self._horizon
112 |
113 | def reset(self):
114 | if self._force_reset and self.monitoring:
115 | from gym.wrappers.monitoring import Monitor
116 | assert isinstance(self.env, Monitor)
117 | recorder = self.env.stats_recorder
118 | if recorder is not None:
119 | recorder.done = True
120 | return self.env.reset()
121 |
122 | def step(self, action):
123 | next_obs, reward, done, info = self.env.step(action)
124 | return Step(next_obs, reward, done, **info)
125 |
126 | def render(self, mode='human', close=False):
127 | return self.env._render(mode, close)
128 | # self.env.render()
129 |
130 | def terminate(self):
131 | if self.monitoring:
132 | self.env._close()
133 | if self._log_dir is not None:
134 | print("""
135 | ***************************
136 |
137 | Training finished! You can upload results to OpenAI Gym by running the following command:
138 |
139 | python scripts/submit_gym.py %s
140 |
141 | ***************************
142 | """ % self._log_dir)
143 |
144 |
--------------------------------------------------------------------------------
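A hedged usage sketch for the `GymEnv` wrapper above, assuming rllab plus an old gym version whose env specs still carry the `wrapper_config.TimeLimit.max_episode_steps` tag (which this class requires) and register `HalfCheetah-v1`; the `sac.environments` import path is likewise an assumption:

```python
from sac.environments.gym_env import GymEnv  # import path assumed

env = GymEnv('HalfCheetah-v1', record_video=False, record_log=False)
obs = env.reset()
for _ in range(10):
    action = env.action_space.sample()
    # Step behaves like an (observation, reward, done, info) namedtuple.
    obs, reward, done, info = env.step(action)
    if done:
        obs = env.reset()
env.terminate()
```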
/envs/__init__.py:
--------------------------------------------------------------------------------
1 | from .gym_env import GymEnv
2 | from .cheetah_hurdle_env import HalfCheetahHurdleEnv
3 | from .multi_direction_env import (
4 | MultiDirectionSwimmerEnv,
5 | MultiDirectionAntEnv,
6 | MultiDirectionHumanoidEnv)
7 |
8 | from .random_goal_ant_env import RandomGoalAntEnv
9 | from .pusher import PusherEnv
10 | from .cross_maze_ant_env import CrossMazeAntEnv
11 | from .simple_maze_ant_env import SimpleMazeAntEnv
12 | from .hierarchy_proxy_env import HierarchyProxyEnv
13 | from .multigoal import MultiGoalEnv
14 |
--------------------------------------------------------------------------------
/envs/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/envs/__pycache__/cheetah_hurdle_env.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/cheetah_hurdle_env.cpython-35.pyc
--------------------------------------------------------------------------------
/envs/__pycache__/cross_maze_ant_env.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/cross_maze_ant_env.cpython-35.pyc
--------------------------------------------------------------------------------
/envs/__pycache__/gym_env.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/gym_env.cpython-35.pyc
--------------------------------------------------------------------------------
/envs/__pycache__/helpers.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/helpers.cpython-35.pyc
--------------------------------------------------------------------------------
/envs/__pycache__/hierarchy_proxy_env.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/hierarchy_proxy_env.cpython-35.pyc
--------------------------------------------------------------------------------
/envs/__pycache__/multi_direction_env.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/multi_direction_env.cpython-35.pyc
--------------------------------------------------------------------------------
/envs/__pycache__/multigoal.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/multigoal.cpython-35.pyc
--------------------------------------------------------------------------------
/envs/__pycache__/pusher.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/pusher.cpython-35.pyc
--------------------------------------------------------------------------------
/envs/__pycache__/random_goal_ant_env.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/random_goal_ant_env.cpython-35.pyc
--------------------------------------------------------------------------------
/envs/__pycache__/simple_maze_ant_env.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/simple_maze_ant_env.cpython-35.pyc
--------------------------------------------------------------------------------
/envs/cheetah_hurdle_env.py:
--------------------------------------------------------------------------------
1 | """Implements a ant which is sparsely rewarded for reaching a goal"""
2 | #from gym.envs.mujoco.half_cheetah import HalfCheetahEnv
3 | #from gym.envs.mujoco.mujoco_env import MujocoEnv
4 |
5 |
6 | from rllab.core.serializable import Serializable
7 | from sac.misc.utils import PROJECT_PATH
8 | from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv
9 | from rllab.envs.mujoco.mujoco_env import MujocoEnv
10 | from rllab.envs.base import Step
11 | from gym import utils
12 | import os
13 | import numpy as np
14 |
15 | MODELS_PATH = os.path.abspath(os.path.join(PROJECT_PATH, 'sac/mujoco_models'))
16 |
17 | class HalfCheetahHurdleEnv(HalfCheetahEnv):
18 | def __init__(self):
19 | self.exteroceptive_observation =[12.0,0,0.5]
20 | self.hurdles_xpos=[-15.,-13.,-9.,-5.,-1.,3.,7.,11.,15.]#,19.,23.,27.]
21 | path = os.path.join(MODELS_PATH, 'half_cheetah_hurdle.xml')
22 | MujocoEnv.__init__(self,file_path=path)
23 | #MujocoEnv.__init__(self)
24 | Serializable.quick_init(self, locals())
25 |
26 | def get_current_obs(self):
27 | proprioceptive_observation = super().get_current_obs()
28 | x_pos1 =self.get_body_com('ffoot')[0]#self.model.data.qpos.flat[:1]
29 | x_pos2 =self.get_body_com('bfoot')[0]#self.model.data.qpos.flat[:1]
30 | matches = [x for x in self.hurdles_xpos if x >= x_pos2]
31 | next_hurdle_x_pos = [matches[0]]
32 | ff_dist_frm_next_hurdle=[np.linalg.norm(matches[0] - x_pos1)]
33 | bf_dist_frm_next_hurdle=[np.linalg.norm(matches[0] - x_pos2)]
34 | observation =np.concatenate([proprioceptive_observation,next_hurdle_x_pos,bf_dist_frm_next_hurdle]).reshape(-1)
35 | return observation
36 |
37 | def isincollision(self):
38 | hurdle_size=[0.05,1.0,0.03]
39 | x_pos =self.get_body_com('ffoot')[0]#self.model.data.qpos.flat[:1]
40 | matches = [x for x in self.hurdles_xpos if x >= x_pos]
41 | if len(matches)==0:
42 | return False
43 | hurdle_pos =[matches[0],0.0,0.20]
44 | #names=['fthigh','bthigh']
45 | #names=['torso','bthigh','bshin','bfoot']
46 | names=['ffoot']
47 | xyz_pos=[]
48 | for i in range(0,len(names)):
49 | xyz_pos.append(self.get_body_com(names[i]))
50 | for i in range(0,len(names)):
51 | #xyz_position = self.get_body_com(names[i])
52 | cf=True
53 | for j in range(0,1):
54 | if abs(hurdle_pos[j]-xyz_pos[i][j])>1.5*hurdle_size[j]:
55 | cf=False
56 | break
57 | if cf:
58 | return True
59 | return False
60 |
61 | def get_hurdle_reward(self):
62 | hurdle_size=[0.05,1.0,0.03]
63 | x_pos =self.get_body_com('bfoot')[0]#self.model.data.qpos.flat[:1]
64 | matches = [x for x in self.hurdles_xpos if x >= x_pos]
65 | hurdle_reward =-1.0*len(matches)
66 |
67 | return hurdle_reward
68 |
69 | def step(self, action):
70 | xyz_pos_before = self.get_body_com('bshin')
71 | self.forward_dynamics(action)
72 | xyz_pos_after = self.get_body_com('bshin')
73 | xyz_position = self.get_body_com('torso')
74 | jump_reward = np.abs(self.get_body_comvel("torso")[2])
75 | run_reward = self.get_body_comvel("torso")[0]
76 | next_obs= self.get_current_obs()
77 | if self.isincollision():# or (xyz_pos_after[0]-xyz_pos_before[0])<-0.01:#dist_from_hurdle < 1 and dist_from_hurdle > 0.3 and z_after<0.05:(xyz_pos_after[0]-xyz_pos_before[0])<-0.01: #
78 | collision_penality=-2.0
79 | #print("collision")
80 | else:
81 | collision_penality=0.0
82 | #print("not collisions")
83 | hurdle_reward = self.get_hurdle_reward()
84 | #print(hurdle_reward)
85 | done = False
86 | goal_reward=0
87 | goal_distance =np.linalg.norm(xyz_position - self.exteroceptive_observation)
88 | if (goal_distance)<1.0:
89 | done=True
90 | goal_reward=1000
91 | else:
92 | done=False
93 |
94 | reward=-1e-1*goal_distance+hurdle_reward+goal_reward+run_reward+3e-1*jump_reward+collision_penality#1e-1*goal_distance+run_reward+jump_reward+collision_penality
95 | info = {'goal_distance': goal_distance}
96 | return Step(next_obs, reward, done, **info)
97 |
--------------------------------------------------------------------------------
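For intuition about `get_hurdle_reward` above: the penalty is minus the number of hurdles whose x-position is still at or ahead of the back foot, so clearing hurdles monotonically increases the reward. A standalone numeric check of that logic:

```python
hurdles_xpos = [-15., -13., -9., -5., -1., 3., 7., 11., 15.]

bfoot_x = 4.0  # back foot has just cleared the hurdle at x = 3
matches = [x for x in hurdles_xpos if x >= bfoot_x]
hurdle_reward = -1.0 * len(matches)

print(matches)        # [7.0, 11.0, 15.0]
print(hurdle_reward)  # -3.0 -- fewer remaining hurdles means a higher (less negative) reward
```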
/envs/cross_maze_ant_env.py:
--------------------------------------------------------------------------------
1 | """Implements an ant whose goal is to reach a target in a maze"""
2 |
3 | import os
4 |
5 | import numpy as np
6 |
7 | from rllab.core.serializable import Serializable
8 | from sac.misc.utils import PROJECT_PATH
9 | from .helpers import random_point_in_circle, get_random_goal_logs
10 | from .random_goal_ant_env import RandomGoalAntEnv
11 |
12 | MODELS_PATH = os.path.abspath(
13 | os.path.join(PROJECT_PATH, 'sac/mujoco_models'))
14 |
15 | class CrossMazeAntEnv(RandomGoalAntEnv, Serializable):
16 | """Implements an ant whose goal is to reach a target in a maze"""
17 |
18 | FILE_PATH = os.path.join(MODELS_PATH, 'cross_maze_ant.xml')
19 |
20 | def __init__(self,
21 | reward_type='dense',
22 | terminate_at_goal=True,
23 | goal_reward_weight=3e-1,
24 | goal_radius=1,
25 | goal_distance=1,
26 | goal_angle_range=(0, 2*np.pi),
27 | velocity_reward_weight=0,
28 | ctrl_cost_coeff=1e-2,
29 | contact_cost_coeff=1e-3,
30 | survive_reward=5e-2,
31 | fixed_goal_position=None,
32 | *args,
33 | **kwargs):
34 | file_path = self.__class__.FILE_PATH
35 | kwargs.pop('file_path', None)
36 | self.fixed_goal_position = fixed_goal_position
37 |
38 | super(CrossMazeAntEnv, self).__init__(
39 | file_path=file_path,
40 | reward_type=reward_type,
41 | terminate_at_goal=terminate_at_goal,
42 | goal_reward_weight=goal_reward_weight,
43 | goal_radius=goal_radius,
44 | goal_distance=goal_distance,
45 | goal_angle_range=goal_angle_range,
46 | velocity_reward_weight=velocity_reward_weight,
47 | ctrl_cost_coeff=ctrl_cost_coeff,
48 | contact_cost_coeff=contact_cost_coeff,
49 | survive_reward=survive_reward,
50 | *args,
51 | **kwargs)
52 | self._serializable_initialized = False
53 |
54 | def reset(self, goal_position=None, *args, **kwargs):
55 | possible_goal_positions = [[6, -6], [6, 6], [12, 0]]
56 |
57 | if goal_position is None:
58 | if self.fixed_goal_position is not None:
59 | goal_position = self.fixed_goal_position
60 | else:
61 | goal_position = possible_goal_positions[
62 | np.random.choice(len(possible_goal_positions))]
63 |
64 | observation = super(CrossMazeAntEnv, self).reset(
65 | goal_position=np.array(goal_position), *args, **kwargs)
66 |
67 | return observation
68 |
69 | def get_current_obs(self):
70 | observation = super().get_current_obs()
71 |
72 | if self.fixed_goal_position is not None:
73 | return observation[:-2]
74 |
75 | return observation
76 |
77 | def render(self, *args, **kwargs):
78 | result = super(CrossMazeAntEnv, self).render(*args, **kwargs)
79 | self.viewer.cam.elevation = -55
80 | self.viewer.cam.lookat[0] = 7
81 | self.viewer.cam.lookat[2] = 0
82 | self.viewer.cam.distance = self.model.stat.extent * 0.9
83 | self.viewer.cam.azimuth = 0
84 | self.viewer.cam.trackbodyid = 0
85 |
86 | return result
87 |
--------------------------------------------------------------------------------
/envs/delayed_env.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from rllab.envs.proxy_env import ProxyEnv
4 | from rllab.core.serializable import Serializable
5 |
6 |
7 | class DelayedEnv(ProxyEnv, Serializable):
8 | def __init__(self, env, delay=0.01):
9 | Serializable.quick_init(self, locals())
10 | ProxyEnv.__init__(self, env)
11 |
12 | self._delay = delay
13 |
14 | def step(self, action):
15 | time.sleep(self._delay)
16 | return self._wrapped_env.step(action)
17 |
--------------------------------------------------------------------------------
/envs/gym_env.py:
--------------------------------------------------------------------------------
1 | """ Rllab implementation with a HACK. See comment in GymEnv.__init__(). """
2 | import gym
3 | import gym.wrappers
4 | import gym.envs
5 | import gym.spaces
6 | import traceback
7 | import logging
8 |
9 | try:
10 | from gym import logger as monitor_logger
11 |
12 | monitor_logger.setLevel(logging.WARNING)
13 | except Exception as e:
14 | traceback.print_exc()
15 |
16 | import os
17 | import os.path as osp
18 | from rllab.envs.base import Env, Step
19 | from rllab.core.serializable import Serializable
20 | from rllab.spaces.box import Box
21 | from rllab.spaces.discrete import Discrete
22 | from rllab.spaces.product import Product
23 | from rllab.misc import logger
24 |
25 |
26 | def convert_gym_space(space):
27 | if isinstance(space, gym.spaces.Box):
28 | return Box(low=space.low, high=space.high)
29 | elif isinstance(space, gym.spaces.Discrete):
30 | return Discrete(n=space.n)
31 | elif isinstance(space, gym.spaces.Tuple):
32 | return Product([convert_gym_space(x) for x in space.spaces])
33 | else:
34 | raise NotImplementedError
35 |
36 |
37 | class CappedCubicVideoSchedule(object):
38 | # Copied from gym, since this method is frequently moved around
39 | def __call__(self, count):
40 | if count < 1000:
41 | return int(round(count ** (1. / 3))) ** 3 == count
42 | else:
43 | return count % 1000 == 0
44 |
45 |
46 | class FixedIntervalVideoSchedule(object):
47 | def __init__(self, interval):
48 | self.interval = interval
49 |
50 | def __call__(self, count):
51 | return count % self.interval == 0
52 |
53 |
54 | class NoVideoSchedule(object):
55 | def __call__(self, count):
56 | return False
57 |
58 |
59 | class GymEnv(Env, Serializable):
60 | def __init__(self, env_name, record_video=False, video_schedule=None, log_dir=None, record_log=False,
61 | force_reset=True):
62 | if log_dir is None:
63 | if logger.get_snapshot_dir() is None:
64 | logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.")
65 | else:
66 | log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log")
67 | Serializable.quick_init(self, locals())
68 |
69 | env = gym.envs.make(env_name)
70 |
71 | # HACK: Gets rid of the TimeLimit wrapper that sets 'done = True' when
72 | # the time limit specified for each environment has been passed and
73 | # therefore the environment is not Markovian (terminal condition depends
74 | # on time rather than state).
75 | env = env.env
76 |
77 | self.env = env
78 | self.env_id = env.spec.id
79 |
80 | assert not (not record_log and record_video)
81 |
82 | if log_dir is None or record_log is False:
83 | self.monitoring = False
84 | else:
85 | if not record_video:
86 | video_schedule = NoVideoSchedule()
87 | else:
88 | if video_schedule is None:
89 | video_schedule = CappedCubicVideoSchedule()
90 | self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True)
91 | self.monitoring = True
92 |
93 | self._observation_space = convert_gym_space(env.observation_space)
94 | logger.log("observation space: {}".format(self._observation_space))
95 | self._action_space = convert_gym_space(env.action_space)
96 | logger.log("action space: {}".format(self._action_space))
97 | self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps']
98 | self._log_dir = log_dir
99 | self._force_reset = force_reset
100 |
101 | @property
102 | def observation_space(self):
103 | return self._observation_space
104 |
105 | @property
106 | def action_space(self):
107 | return self._action_space
108 |
109 | @property
110 | def horizon(self):
111 | return self._horizon
112 |
113 | def reset(self):
114 | if self._force_reset and self.monitoring:
115 | from gym.wrappers.monitoring import Monitor
116 | assert isinstance(self.env, Monitor)
117 | recorder = self.env.stats_recorder
118 | if recorder is not None:
119 | recorder.done = True
120 | return self.env.reset()
121 |
122 | def step(self, action):
123 | next_obs, reward, done, info = self.env.step(action)
124 | return Step(next_obs, reward, done, **info)
125 |
126 | def render(self, mode='human', close=False):
127 | return self.env._render(mode, close)
128 | # self.env.render()
129 |
130 | def terminate(self):
131 | if self.monitoring:
132 | self.env._close()
133 | if self._log_dir is not None:
134 | print("""
135 | ***************************
136 |
137 | Training finished! You can upload results to OpenAI Gym by running the following command:
138 |
139 | python scripts/submit_gym.py %s
140 |
141 | ***************************
142 | """ % self._log_dir)
143 |
--------------------------------------------------------------------------------
/envs/helpers.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def random_point_in_circle(angle_range=(0, 2*np.pi), radius=(0, 25)):
5 | angle = np.random.uniform(*angle_range)
6 | radius = radius if np.isscalar(radius) else np.random.uniform(*radius)
7 | x, y = np.cos(angle) * radius, np.sin(angle) * radius
8 | point = np.array([x, y])
9 | return point
10 |
11 | def get_random_goal_logs(paths, goal_radius, fixed_goal_position=False):
12 | if fixed_goal_position:
13 | position_slice = slice(-3, -1)
14 | else:
15 | position_slice = slice(-5, -3)
16 |
17 | logs = []
18 | if len(paths) > 0:
19 | progs = [
20 | np.linalg.norm(path["observations"][-1][position_slice]
21 | - path["observations"][0][position_slice])
22 | for path in paths
23 | ]
24 |
25 | time_in_goals = [
26 | np.sum(np.linalg.norm(
27 | (
28 | path['observations'][:, position_slice]
29 | - [path_goal['goal_position'] for path_goal in path['env_infos']]
30 | )
31 | , axis=1
32 | ) < goal_radius)
33 | for path in paths
34 | ]
35 |
36 | logs += [
37 | ('AverageProgress', np.mean(progs)),
38 | ('MaxProgress', np.max(progs)),
39 | ('MinProgress', np.min(progs)),
40 | ('StdProgress', np.std(progs)),
41 |
42 | ('AverageTimeInGoal', np.mean(time_in_goals)),
43 | ('MaxTimeInGoal', np.max(time_in_goals)),
44 | ('MinTimeInGoal', np.min(time_in_goals)),
45 | ('StdTimeInGoal', np.std(time_in_goals)),
46 | ]
47 |
48 | goal_positions, final_positions = zip(*[
49 | ([path_goal['goal_position'] for path_goal in p['env_infos']][-1],
50 | p['observations'][-1][position_slice])
51 | for p in paths
52 | ])
53 |
54 | begin_goal_distances = [
55 | np.linalg.norm(goal_position) for goal_position in goal_positions]
56 | final_goal_distances = [
57 | np.linalg.norm(goal_position - final_position)
58 | for goal_position, final_position in zip(goal_positions, final_positions)
59 | ]
60 | progress_towards_goals = [
61 | begin_goal_distance - final_goal_distance
62 | for (begin_goal_distance, final_goal_distance)
63 | in zip(begin_goal_distances, final_goal_distances)
64 | ]
65 |
66 |
67 | for series, name in zip((begin_goal_distances,
68 | final_goal_distances,
69 | progress_towards_goals),
70 | ('BeginGoalDistance',
71 | 'FinalGoalDistance',
72 | 'ProgressTowardsGoal')):
73 | for fn_name in ('mean', 'std', 'min', 'max'):
74 | fn = getattr(np, fn_name)
75 | logs.append((fn_name.capitalize() + name, fn(series)))
76 |
77 | return logs
78 |
79 | def get_multi_direction_logs(paths):
80 | progs = [
81 | np.linalg.norm(path["observations"][-1][-3:-1]
82 | - path["observations"][0][-3:-1])
83 | for path in paths
84 | ]
85 | logs = (
86 | ('AverageProgress', np.mean(progs)),
87 | ('MaxProgress', np.max(progs)),
88 | ('MinProgress', np.min(progs)),
89 | ('StdProgress', np.std(progs)),
90 | )
91 |
92 | return logs
93 |
--------------------------------------------------------------------------------
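A quick standalone example of `random_point_in_circle` above: with a scalar `radius` the sampled point always lands exactly on that circle, while a `(low, high)` tuple samples the radius uniformly first (the `sac.envs` import path is an assumption):

```python
import numpy as np

from sac.envs.helpers import random_point_in_circle  # import path assumed

np.random.seed(0)

# Goal on the circle of radius 5, restricted to the upper half-plane.
goal = random_point_in_circle(angle_range=(0, np.pi), radius=5)
print(goal, np.linalg.norm(goal))  # the norm is exactly 5.0

# Goal anywhere in the annulus between radius 2 and 6.
goal = random_point_in_circle(radius=(2, 6))
print(goal, np.linalg.norm(goal))  # the norm lies in [2, 6]
```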
/envs/hierarchy_proxy_env.py:
--------------------------------------------------------------------------------
1 | """Implements an environment proxy to test hierarchy policies"""
2 |
3 | from rllab.envs.proxy_env import ProxyEnv
4 | from rllab.core.serializable import Serializable
5 |
6 | class HierarchyProxyEnv(ProxyEnv):
7 | def __init__(self, low_level_policy, *args, **kwargs):
8 | Serializable.quick_init(self, locals())
9 | self._low_level_policy = low_level_policy
10 | super().__init__(*args, **kwargs)
11 |
12 | def step(self, high_level_action):
13 | current_observation = (
14 | # Our env might be double wrapped, e.g. around NormalizedEnv
15 | self._wrapped_env._wrapped_env.get_current_obs()
16 | if isinstance(self._wrapped_env, ProxyEnv)
17 | else self._wrapped_env.get_current_obs())
18 |
19 | with self._low_level_policy.deterministic(h=high_level_action[None]):
20 | action, _ = self._low_level_policy.get_action(
21 | observation=current_observation[:self._low_level_policy._Ds])
22 |
23 | return super().step(action)
24 |
--------------------------------------------------------------------------------
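`HierarchyProxyEnv` above assumes a specific low-level policy interface: an observation dimension `_Ds`, a `deterministic(h=...)` context manager that conditions the primitive on the high-level action, and `get_action(observation=...)` returning `(action, info)`. The following stand-in is hypothetical and exists only to make that interface explicit; the repository's real primitives live under `policies/`.

```python
import contextlib

import numpy as np


class DummyLowLevelPolicy(object):
    """Hypothetical primitive policy showing the interface HierarchyProxyEnv expects."""

    def __init__(self, obs_dim, action_dim):
        self._Ds = obs_dim      # HierarchyProxyEnv slices observations to this length
        self._Da = action_dim
        self._h = None

    @contextlib.contextmanager
    def deterministic(self, h):
        self._h = h             # latent / high-level action conditioning the primitive
        yield
        self._h = None

    def get_action(self, observation):
        # A real primitive would map (observation, self._h) to a control; zeros for illustration.
        return np.zeros(self._Da), {}


low_level_policy = DummyLowLevelPolicy(obs_dim=27, action_dim=8)
# Wrapping then looks like: HierarchyProxyEnv(low_level_policy, wrapped_env=some_rllab_env)
```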
/envs/meta_env.py:
--------------------------------------------------------------------------------
1 | from rllab import spaces
2 | from rllab.core.serializable import Serializable
3 | from rllab.envs.env_spec import EnvSpec
4 |
5 | from sac.misc.utils import concat_obs_z
6 |
7 | import numpy as np
8 |
9 | class MetaEnv(Serializable):
10 | def __init__(self, env, base_policy, num_skills, steps_per_option=100):
11 | Serializable.quick_init(self, locals())
12 | self._base_policy = base_policy
13 | self._env = env
14 | self._steps_per_option = steps_per_option
15 | self._num_skills = num_skills
16 | self.observation_space = self._env.observation_space
17 | self.action_space = spaces.Discrete(num_skills)
18 | self.spec = EnvSpec(self.observation_space, self.action_space)
19 | self._obs = self.reset()
20 |
21 | def step(self, meta_action):
22 | total_reward = 0
23 | for _ in range(self._steps_per_option):
24 | aug_obs = concat_obs_z(self._obs, meta_action, self._num_skills)
25 | (action, _) = self._base_policy.get_action(aug_obs)
26 | (self._obs, r, done, _) = self._env.step(action)
27 | total_reward += r
28 | if done: break
29 | # Normalize the total reward by number of steps
30 | return (self._obs, total_reward / float(self._steps_per_option), done, {})
31 |
32 | def reset(self):
33 | return self._env.reset()
34 |
35 | def log_diagnostics(self, paths):
36 | self._env.log_diagnostics(paths)
37 |
38 | def terminate(self):
39 | self._env.terminate()
40 |
41 |
42 | class FixedOptionEnv(Serializable):
43 | def __init__(self, env, num_skills, z):
44 | Serializable.quick_init(self, locals())
45 | self._env = env
46 | self._num_skills = num_skills
47 | self._z = z
48 | obs_space = self._env.observation_space
49 | low = np.hstack([obs_space.low, np.full(num_skills, 0)])
50 | high = np.hstack([obs_space.high, np.full(num_skills, 1)])
51 | self.observation_space = spaces.Box(low=low, high=high)
52 | self.action_space = self._env.action_space
53 | self.spec = EnvSpec(self.observation_space, self.action_space)
54 |
55 | def step(self, action):
56 | (obs, r, done, info) = self._env.step(action)
57 | aug_obs = concat_obs_z(obs, self._z, self._num_skills)
58 | return (aug_obs, r, done, info)
59 |
60 | def reset(self):
61 | obs = self._env.reset()
62 | aug_obs = concat_obs_z(obs, self._z, self._num_skills)
63 | return aug_obs
64 |
65 | def log_diagnostics(self, paths):
66 | self._env.log_diagnostics(paths)
67 |
68 | def terminate(self):
69 | self._env.terminate()
70 |
--------------------------------------------------------------------------------
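`concat_obs_z` is imported from `sac.misc.utils`, which is not reproduced in this section. Judging from how `FixedOptionEnv` widens its observation space by `num_skills` extra dimensions bounded in [0, 1], it evidently appends a one-hot skill encoding to the observation; the stand-in below is an assumption under that reading, not the repository's actual implementation:

```python
import numpy as np


def concat_obs_z_sketch(obs, z, num_skills):
    """Illustrative stand-in: append a one-hot encoding of skill index z to the observation."""
    z_one_hot = np.zeros(num_skills)
    z_one_hot[z] = 1.0
    return np.hstack([obs, z_one_hot])


print(concat_obs_z_sketch(np.array([0.3, -1.2]), z=1, num_skills=3))
# [ 0.3 -1.2  0.   1.   0. ]
```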
/envs/multi_direction_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from rllab.core.serializable import Serializable
4 | from rllab.envs.mujoco.swimmer_env import SwimmerEnv
5 | from rllab.envs.mujoco.ant_env import AntEnv
6 | from rllab.envs.mujoco.humanoid_env import HumanoidEnv
7 | from rllab.envs.base import Step
8 | from rllab.misc import logger
9 |
10 | from .helpers import get_multi_direction_logs
11 |
12 | class MultiDirectionBaseEnv(Serializable):
13 | def __init__(self,
14 | velocity_reward_weight=1.0,
15 | survive_reward=0,
16 | ctrl_cost_coeff=0,
17 | contact_cost_coeff=0,
18 | velocity_deviation_cost_coeff=0,
19 | *args, **kwargs):
20 | self._velocity_reward_weight = velocity_reward_weight
21 | self._survive_reward = survive_reward
22 |
23 | self._ctrl_cost_coeff = ctrl_cost_coeff
24 | self._contact_cost_coeff = contact_cost_coeff
25 | self._velocity_deviation_cost_coeff = velocity_deviation_cost_coeff
26 | Serializable.quick_init(self, locals())
27 |
28 | @property
29 | def velocity_reward(self):
30 | xy_velocities = self.get_body_comvel("torso")[:2]
31 | #xy_velocities = self.get_body_comvel("torso")[0]
32 | # rewards for speed on xy-plane (no matter which direction)
33 | xy_velocity = np.linalg.norm(xy_velocities)
34 |
35 | velocity_reward = self._velocity_reward_weight * xy_velocity
36 | return velocity_reward
37 |
38 | @property
39 | def survive_reward(self):
40 | return self._survive_reward
41 |
42 | def control_cost(self, action):
43 | lb, ub = self.action_bounds
44 | scaling = (ub - lb) / 2.0
45 |
46 | return 0.5 * self._ctrl_cost_coeff * np.sum(
47 | np.square(action / scaling))
48 |
49 | @property
50 | def contact_cost(self):
51 | return 0.5 * self._contact_cost_coeff * np.sum(
52 | np.square(np.clip(self.model.data.cfrc_ext, -1, 1)))  # no trailing comma: return a scalar, not a tuple
53 |
54 | @property
55 | def is_healthy(self):
56 | return True
57 |
58 | @property
59 | def velocity_deviation_cost(self):
60 | velocity_deviation_cost = (
61 | 0.5 *
62 | self._velocity_deviation_cost_coeff
63 | * np.sum(np.square(self.get_body_comvel("torso")[2:])))
64 | return velocity_deviation_cost
65 |
66 | @property
67 | def done(self):
68 | done = not self.is_healthy
69 | return done
70 |
71 |
72 | def step(self, action):
73 | self.forward_dynamics(action)
74 |
75 | reward = (
76 | self.velocity_reward
77 | + self.survive_reward
78 | - self.control_cost(action)
79 | - self.contact_cost
80 | - self.velocity_deviation_cost)
81 |
82 | next_observation = self.get_current_obs()
83 | #return Step(next_observation, float(reward), self.done)
84 | return Step(next_observation, float(reward), False)
85 |
86 | def log_diagnostics(self, paths, *args, **kwargs):
87 | logs = get_multi_direction_logs(paths)
88 | for row in logs:
89 | logger.record_tabular(*row)
90 |
91 |
92 | class MultiDirectionSwimmerEnv(MultiDirectionBaseEnv, SwimmerEnv):
93 | def __init__(self,
94 | ctrl_cost_coeff=1e-2,
95 | *args, **kwargs):
96 | MultiDirectionBaseEnv.__init__(
97 | self, ctrl_cost_coeff=ctrl_cost_coeff, *args, **kwargs)
98 | SwimmerEnv.__init__(
99 | self, ctrl_cost_coeff=ctrl_cost_coeff, *args, **kwargs)
100 |
101 | @property
102 | def velocity_reward(self):
103 | xy_velocities = self.get_body_comvel("torso")[:2]
104 |
105 | # rewards for speed on positive x direction
106 | xy_velocity = np.linalg.norm(xy_velocities)
107 | if xy_velocities[0] < 0:
108 | xy_velocity *= -1.0
109 |
110 | velocity_reward = self._velocity_reward_weight * xy_velocity
111 | return velocity_reward
112 |
113 | class MultiDirectionAntEnv(MultiDirectionBaseEnv, AntEnv):
114 | def __init__(self,
115 | ctrl_cost_coeff=1e-2,
116 | contact_cost_coeff=1e-3,
117 | survive_reward=5e-2,
118 | *args, **kwargs):
119 | MultiDirectionBaseEnv.__init__(
120 | self,
121 | ctrl_cost_coeff=ctrl_cost_coeff,
122 | contact_cost_coeff=contact_cost_coeff,
123 | survive_reward=survive_reward,
124 | *args, **kwargs)
125 | AntEnv.__init__(self, *args, **kwargs)
126 |
127 | @property
128 | def is_healthy(self):
129 | return (np.isfinite(self._state).all()
130 | and 0.2 <= self._state[2] <= 1.0)
131 |
132 | class MultiDirectionHumanoidEnv(MultiDirectionBaseEnv, HumanoidEnv):
133 | def __init__(self,
134 | survive_reward=2e-1,
135 | ctrl_cost_coeff=1e-3,
136 | contact_cost_coeff=1e-5,
137 | velocity_deviation_cost_coeff=1e-2,
138 | *args, **kwargs):
139 | MultiDirectionBaseEnv.__init__(
140 | self,
141 | survive_reward=survive_reward,
142 | ctrl_cost_coeff=ctrl_cost_coeff,
143 | contact_cost_coeff=contact_cost_coeff,
144 | velocity_deviation_cost_coeff=velocity_deviation_cost_coeff,
145 | *args, **kwargs)
146 | HumanoidEnv.__init__(
147 | self,
148 | # survive_reward=survive_reward,
149 | alive_bonus=survive_reward, # TODO: remove this
150 | ctrl_cost_coeff=ctrl_cost_coeff,
151 | # contact_cost_coeff=contact_cost_coeff,
152 | impact_cost_coeff=contact_cost_coeff, # TODO: remove this
153 | vel_deviation_cost_coeff=velocity_deviation_cost_coeff, # TODO: remove this
154 | *args, **kwargs)
155 |
156 | @property
157 | def is_healthy(self):
158 | return 0.8 < self.model.data.qpos[2] < 2.0
159 |
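A minimal rollout sketch for these multi-direction environments (illustrative only; it assumes a working rllab + MuJoCo installation and relies on the generic rllab Step and action-bound interfaces used above):

    import numpy as np

    # Hypothetical smoke test: roll MultiDirectionAntEnv for a few steps and print
    # the composite reward assembled in MultiDirectionBaseEnv.step().
    env = MultiDirectionAntEnv()
    obs = env.reset()
    for _ in range(5):
        lb, ub = env.action_bounds
        action = np.random.uniform(lb, ub)
        obs, reward, done, info = env.step(action)
        print(reward)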
--------------------------------------------------------------------------------
/envs/simple_maze_ant_env.py:
--------------------------------------------------------------------------------
1 | """Implements an ant whose goal is to reach a target in a maze"""
2 |
3 | import os
4 |
5 | import numpy as np
6 |
7 | from rllab.core.serializable import Serializable
8 | from sac.misc.utils import PROJECT_PATH
9 | from .helpers import random_point_in_circle, get_random_goal_logs
10 | from .random_goal_ant_env import RandomGoalAntEnv
11 |
12 | MODELS_PATH = os.path.abspath(
13 | os.path.join(PROJECT_PATH, 'sac/mujoco_models'))
14 |
15 | class SimpleMazeAntEnv(RandomGoalAntEnv, Serializable):
16 | """Implements an ant whose goal is to reach a target in a maze"""
17 |
18 | FILE_PATH = os.path.join(MODELS_PATH, 'simple_maze_ant.xml')
19 |
20 | def __init__(self,
21 | reward_type='dense',
22 | terminate_at_goal=True,
23 | goal_reward_weight=3e-1,
24 | goal_radius=1.0,
25 | goal_distance=1.0,
26 | goal_angle_range=(0, 2*np.pi),
27 | velocity_reward_weight=0,
28 | ctrl_cost_coeff=1e-2,
29 | contact_cost_coeff=1e-3,
30 | survive_reward=5e-2,
31 | *args,
32 | **kwargs):
33 | file_path = self.__class__.FILE_PATH
34 | kwargs.pop('file_path', None)
35 | super(SimpleMazeAntEnv, self).__init__(
36 | file_path=file_path,
37 | reward_type=reward_type,
38 | terminate_at_goal=terminate_at_goal,
39 | goal_reward_weight=goal_reward_weight,
40 | goal_radius=goal_radius,
41 | goal_distance=goal_distance,
42 | goal_angle_range=goal_angle_range,
43 | velocity_reward_weight=velocity_reward_weight,
44 | ctrl_cost_coeff=ctrl_cost_coeff,
45 | contact_cost_coeff=contact_cost_coeff,
46 | survive_reward=survive_reward,
47 | *args,
48 | **kwargs)
49 |
50 | def reset(self, *args, **kwargs):
51 | observation = super(SimpleMazeAntEnv, self).reset(
52 | goal_position=np.array([20, -13]), *args, **kwargs)
53 |
54 | return observation
55 |
--------------------------------------------------------------------------------
/misc/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/misc/__pycache__/instrument.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/instrument.cpython-35.pyc
--------------------------------------------------------------------------------
/misc/__pycache__/mlp.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/mlp.cpython-35.pyc
--------------------------------------------------------------------------------
/misc/__pycache__/plotter.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/plotter.cpython-35.pyc
--------------------------------------------------------------------------------
/misc/__pycache__/sampler.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/sampler.cpython-35.pyc
--------------------------------------------------------------------------------
/misc/__pycache__/tf_utils.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/tf_utils.cpython-35.pyc
--------------------------------------------------------------------------------
/misc/__pycache__/utils.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/utils.cpython-35.pyc
--------------------------------------------------------------------------------
/misc/instrument.py:
--------------------------------------------------------------------------------
1 | import os
2 | import uuid
3 |
4 | from rllab.misc.instrument import run_experiment_lite
5 | from sac.misc.utils import timestamp
6 |
7 | from sac.misc.utils import PROJECT_PATH
8 |
9 | DEFAULT_LOG_DIR = PROJECT_PATH + "/data"
10 |
11 | def _create_symlink(folder):
12 | # Create a symbolic link that points to the given folder so that it gets
13 | # included in the tarball shipped with the experiment.
14 |
15 | # Unique filename for the symlink.
16 | include_path = os.path.join('/tmp/', str(uuid.uuid4()))
17 | os.makedirs(include_path)
18 |
19 | os.symlink(os.path.join(PROJECT_PATH, folder),
20 | os.path.join(include_path, folder))
21 |
22 | return include_path
23 |
24 |
25 | def run_sac_experiment(main, mode, include_folders=None, log_dir=None,
26 | exp_prefix="experiment", exp_name=None, **kwargs):
27 | if exp_name is None:
28 | exp_name = timestamp()
29 |
30 | if log_dir is None:
31 | log_dir = os.path.join(
32 | DEFAULT_LOG_DIR,
33 | "local",
34 | exp_prefix.replace("_", "-"),
35 | exp_name)
36 |
37 | if include_folders is None:
38 | include_folders = list()
39 |
40 | if mode == 'ec2':
41 | include_folders.append('sac')
42 | all_symlinks = list()
43 |
44 | for folder in include_folders:
45 | all_symlinks.append(_create_symlink(folder))
46 |
47 | kwargs.update(added_project_directories=all_symlinks)
48 |
49 | run_experiment_lite(
50 | stub_method_call=main,
51 | mode=mode,
52 | exp_prefix=exp_prefix,
53 | exp_name=exp_name,
54 | log_dir=log_dir,
55 | **kwargs,
56 | )
57 |
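A hedged usage sketch for run_sac_experiment; the experiment function and keyword values below are illustrative placeholders rather than code from this repository, and extra keyword arguments are simply forwarded to rllab's run_experiment_lite:

    from sac.misc.instrument import run_sac_experiment

    def main():
        # Illustrative stub: build the env, policy, and algorithm here, then train.
        pass

    run_sac_experiment(
        main,                   # passed through as stub_method_call
        mode='local',           # 'ec2' additionally symlinks and ships the 'sac' folder
        exp_prefix='ant-maze',  # logs land under data/local/ant-maze/<exp_name>
        snapshot_mode='last',   # example of a kwarg forwarded to run_experiment_lite
    )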
--------------------------------------------------------------------------------
/misc/plotter.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import matplotlib.pyplot as plt
3 |
4 |
5 | class QFPolicyPlotter:
6 | def __init__(self, qf, policy, obs_lst, default_action, n_samples):
7 | self._qf = qf
8 | self._policy = policy
9 | self._obs_lst = obs_lst
10 | self._default_action = default_action
11 | self._n_samples = n_samples
12 |
13 | self._var_inds = np.where(np.isnan(default_action))[0]
14 | assert len(self._var_inds) == 2
15 |
16 | n_plots = len(obs_lst)
17 |
18 | x_size = 5 * n_plots
19 | y_size = 5
20 |
21 | fig = plt.figure(figsize=(x_size, y_size))
22 | self._ax_lst = []
23 | for i in range(n_plots):
24 | ax = fig.add_subplot(100 + n_plots * 10 + i + 1)
25 | ax.set_xlim((-1, 1))
26 | ax.set_ylim((-1, 1))
27 | ax.grid(True)
28 | self._ax_lst.append(ax)
29 |
30 | self._line_objects = list()
31 |
32 | def draw(self):
33 | # noinspection PyArgumentList
34 | [h.remove() for h in self._line_objects]
35 | self._line_objects = list()
36 |
37 | self._plot_level_curves()
38 | self._plot_action_samples()
39 |
40 | plt.draw()
41 | plt.pause(0.001)
42 |
43 | def _plot_level_curves(self):
44 | # Create mesh grid.
45 | xs = np.linspace(-1, 1, 50)
46 | ys = np.linspace(-1, 1, 50)
47 | xgrid, ygrid = np.meshgrid(xs, ys)
48 | N = len(xs)*len(ys)
49 |
50 | # Copy default values along the first axis and replace nans with
51 | # the mesh grid points.
52 | actions = np.tile(self._default_action, (N, 1))
53 | actions[:, self._var_inds[0]] = xgrid.ravel()
54 | actions[:, self._var_inds[1]] = ygrid.ravel()
55 |
56 | for ax, obs in zip(self._ax_lst, self._obs_lst):
57 | qs = self._qf.eval(obs[None], actions)
58 | qs = qs.reshape(xgrid.shape)
59 |
60 | cs = ax.contour(xgrid, ygrid, qs, 20)
61 | self._line_objects += cs.collections
62 | self._line_objects += ax.clabel(
63 | cs, inline=1, fontsize=10, fmt='%.2f')
64 |
65 | def _plot_action_samples(self):
66 | for ax, obs in zip(self._ax_lst, self._obs_lst):
67 | actions = self._policy.get_actions(
68 | np.ones((self._n_samples, 1)) * obs[None, :])
69 |
70 | x, y = actions[:, 0], actions[:, 1]
71 | self._line_objects += ax.plot(x, y, 'b*')
72 |
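A usage sketch for QFPolicyPlotter (illustrative; qf and policy stand for any objects exposing the qf.eval(obs, actions) and policy.get_actions(obs) interfaces called above, and the two NaNs mark which action dimensions get plotted):

    import numpy as np

    plotter = QFPolicyPlotter(
        qf=qf,
        policy=policy,
        obs_lst=np.array([[0.0, 0.0], [1.0, 1.0]]),   # one subplot per observation
        default_action=np.array([np.nan, np.nan]),    # NaN entries are swept over the grid
        n_samples=100,
    )
    plotter.draw()   # redraws Q level curves and sampled actions in place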
--------------------------------------------------------------------------------
/misc/remote_sampler.py:
--------------------------------------------------------------------------------
1 | import pickle
2 | import ray # TODO: Add ray to dependencies.
3 | import tensorflow as tf
4 | import numpy as np
5 |
6 | from rllab.misc.overrides import overrides
7 | from rllab.misc import logger
8 |
9 | from . import tf_utils
10 | from .sampler import Sampler, rollout
11 |
12 | # TODO: Make the remote sampler correctly use the initial exploration policy; as of now, using it will fail.
13 |
14 | class RemoteSampler(Sampler):
15 | def __init__(self, **kwargs):
16 | super(RemoteSampler, self).__init__(**kwargs)
17 |
18 | self._remote_environment = None
19 | self._remote_path = None
20 | self._n_episodes = 0
21 | self._total_samples = 0
22 | self._last_path_return = 0
23 | self._max_path_return = -np.inf
24 |
25 | @overrides
26 | def initialize(self, env, policy, pool):
27 | super(RemoteSampler, self).initialize(env, policy, pool)
28 |
29 | ray.init()
30 |
31 | env_pkl = pickle.dumps(env)
32 | policy_pkl = pickle.dumps(policy)
33 |
34 | self._remote_environment = _RemoteEnv.remote(env_pkl, policy_pkl)
35 |
36 | def sample(self):
37 | if self._remote_path is None:
38 | policy_params = self.policy.get_param_values()
39 | self._remote_path = self._remote_environment.rollout.remote(
40 | policy_params, self._max_path_length)
41 |
42 | path_ready, _ = ray.wait([self._remote_path], timeout=0)
43 |
44 | if len(path_ready) or not self.batch_ready():
45 | path = ray.get(self._remote_path)
46 | self.pool.add_path(path)
47 | self._remote_path = None
48 | self._total_samples += len(path['observations'])
49 | self._last_path_return = np.sum(path['rewards'])
50 | self._max_path_return = max(self._max_path_return,
51 | self._last_path_return)
52 | self._n_episodes += 1
53 |
54 | def log_diagnostics(self):
55 | logger.record_tabular('max-path-return', self._max_path_return)
56 | logger.record_tabular('last-path-return', self._last_path_return)
57 | logger.record_tabular('pool-size', self.pool.size)
58 | logger.record_tabular('episodes', self._n_episodes)
59 | logger.record_tabular('total-samples', self._total_samples)
60 |
61 |
62 | @ray.remote
63 | class _RemoteEnv(object):
64 | def __init__(self, env_pkl, policy_pkl):
65 | self._sess = tf_utils.create_session()
66 | self._sess.run(tf.global_variables_initializer())
67 |
68 | self._env = pickle.loads(env_pkl)
69 | self._policy = pickle.loads(policy_pkl)
70 |
71 | if hasattr(self._env, 'initialize'):
72 | self._env.initialize()
73 |
74 | def rollout(self, policy_params, path_length):
75 | self._policy.set_param_values(policy_params)
76 | path = rollout(self._env, self._policy, path_length)
77 |
78 | return path
79 |
--------------------------------------------------------------------------------
/misc/tf_utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | from rllab import config
3 |
4 |
5 | def get_default_session():
6 | return tf.get_default_session() or create_session()
7 |
8 |
9 | def create_session(**kwargs):
10 | """ Create new tensorflow session with given configuration. """
11 | if "config" not in kwargs:
12 | kwargs["config"] = get_configuration()
13 | return tf.InteractiveSession(**kwargs)
14 |
15 |
16 | def get_configuration():
17 | """ Returns personal tensorflow configuration. """
18 | if config.USE_GPU:
19 | raise NotImplementedError
20 |
21 | config_args = dict()
22 | return tf.ConfigProto(**config_args)
23 |
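A minimal sketch of how these helpers are typically used (TF1-style API, matching the rest of this codebase):

    import tensorflow as tf
    from sac.misc import tf_utils

    sess = tf_utils.get_default_session()   # reuses the active session or creates one
    x = tf.constant(1.0)
    print(sess.run(x))                      # 1.0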
--------------------------------------------------------------------------------
/misc/utils.py:
--------------------------------------------------------------------------------
1 | import collections
2 | import datetime
3 | import dateutil.tz
4 | import os
5 | import numpy as np
6 |
7 | PROJECT_PATH = os.path.dirname(
8 | os.path.realpath(os.path.join(__file__, '..', '..')))
9 |
10 | def timestamp():
11 | now = datetime.datetime.now(dateutil.tz.tzlocal())
12 | return now.strftime('%Y-%m-%d-%H-%M-%S-%f-%Z')
13 |
14 | def deep_update(d, u):
15 | for k, v in u.items():
16 | d[k] = (
17 | deep_update(d.get(k, {}), v)
18 | if isinstance(v, collections.Mapping)
19 | else v)
20 |
21 | return d
22 |
23 | def get_git_rev():
24 | try:
25 | import git
26 | repo = git.Repo(os.getcwd())
27 | git_rev = repo.active_branch.commit.name_rev
28 | except Exception:
29 | git_rev = None
30 |
31 | return git_rev
32 |
33 | def flatten(unflattened, parent_key='', separator='.'):
34 | items = []
35 | for k, v in unflattened.items():
36 | if separator in k:
37 | raise ValueError(
38 | "Found separator ({}) from key ({})".format(separator, k))
39 | new_key = parent_key + separator + k if parent_key else k
40 | if isinstance(v, collections.MutableMapping) and v:
41 | items.extend(flatten(v, new_key, separator=separator).items())
42 | else:
43 | items.append((new_key, v))
44 |
45 | return dict(items)
46 |
47 | def unflatten(flattened, separator='.'):
48 | result = {}
49 | for key, value in flattened.items():
50 | parts = key.split(separator)
51 | d = result
52 | for part in parts[:-1]:
53 | if part not in d:
54 | d[part] = {}
55 | d = d[part]
56 | d[parts[-1]] = value
57 |
58 | return result
59 |
60 | def concat_obs_z(obs, z, num_skills):
61 | """Concatenates the observation to a one-hot encoding of Z."""
62 | assert np.isscalar(z)
63 | z_one_hot = np.zeros(num_skills)
64 | z_one_hot[z] = 1
65 | return np.hstack([obs, z_one_hot])
66 |
67 | def split_aug_obs(aug_obs, num_skills):
68 | """Splits an augmented observation into the observation and Z."""
69 | (obs, z_one_hot) = (aug_obs[:-num_skills], aug_obs[-num_skills:])
70 | z = np.where(z_one_hot == 1)[0][0]
71 | return (obs, z)
72 |
73 | def _make_dir(filename):
74 | folder = os.path.dirname(filename)
75 | if not os.path.exists(folder):
76 | os.makedirs(folder)
77 |
78 | def _save_video(paths, filename):
79 | import cv2
80 | assert all(['ims' in path for path in paths])
81 | ims = [im for path in paths for im in path['ims']]
82 | _make_dir(filename)
83 |
84 | # Define the codec and create VideoWriter object
85 | fourcc = cv2.VideoWriter_fourcc(*'MJPG')
86 | fps = 30.0
87 | (height, width, _) = ims[0].shape
88 | writer = cv2.VideoWriter(filename, fourcc, fps, (width, height))
89 | for im in ims:
90 | writer.write(im)
91 | writer.release()
92 |
93 | def _softmax(x):
94 | max_x = np.max(x)
95 | exp_x = np.exp(x - max_x)
96 | return exp_x / np.sum(exp_x)
97 |
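Worked examples for the dictionary and skill-augmentation helpers above (values are illustrative):

    import numpy as np
    from sac.misc.utils import (
        flatten, unflatten, deep_update, concat_obs_z, split_aug_obs)

    nested = {'algo': {'lr': 3e-4, 'tau': 0.01}, 'seed': 1}
    flat = flatten(nested)            # {'algo.lr': 0.0003, 'algo.tau': 0.01, 'seed': 1}
    assert unflatten(flat) == nested  # round-trips back to the nested form

    merged = deep_update({'algo': {'lr': 3e-4}}, {'algo': {'tau': 0.01}})
    # -> {'algo': {'lr': 0.0003, 'tau': 0.01}} (nested merge instead of replacement)

    aug = concat_obs_z(np.zeros(3), z=2, num_skills=4)  # appends one-hot [0, 0, 1, 0]
    obs, z = split_aug_obs(aug, num_skills=4)           # recovers (obs, 2)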
--------------------------------------------------------------------------------
/mujoco_models/pusher_2d.xml:
--------------------------------------------------------------------------------
[MuJoCo XML model definition for the 2D pusher; the markup did not survive this text dump. See mujoco_models/pusher_2d.xml in the repository.]
--------------------------------------------------------------------------------
/mujoco_models/simple_maze_ant.xml:
--------------------------------------------------------------------------------
[MuJoCo XML model definition for the simple-maze ant; the markup did not survive this text dump. See mujoco_models/simple_maze_ant.xml in the repository.]
--------------------------------------------------------------------------------
/policies/__init__.py:
--------------------------------------------------------------------------------
1 | from .nn_policy import NNPolicy
2 | from .nn_policy2 import NNPolicy2
3 | from .uniform_policy import UniformPolicy
4 | from .gaussian_policy import GaussianPolicy
5 | from .pointer_policy import GaussianPtrPolicy
6 |
--------------------------------------------------------------------------------
/policies/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/policies/__pycache__/base.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/base.cpython-35.pyc
--------------------------------------------------------------------------------
/policies/__pycache__/gaussian_policy.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/gaussian_policy.cpython-35.pyc
--------------------------------------------------------------------------------
/policies/__pycache__/gmm.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/gmm.cpython-35.pyc
--------------------------------------------------------------------------------
/policies/__pycache__/hierarchical_policy.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/hierarchical_policy.cpython-35.pyc
--------------------------------------------------------------------------------
/policies/__pycache__/latent_space_policy.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/latent_space_policy.cpython-35.pyc
--------------------------------------------------------------------------------
/policies/__pycache__/nn_policy.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/nn_policy.cpython-35.pyc
--------------------------------------------------------------------------------
/policies/__pycache__/nn_policy2.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/nn_policy2.cpython-35.pyc
--------------------------------------------------------------------------------
/policies/__pycache__/pointer_policy.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/pointer_policy.cpython-35.pyc
--------------------------------------------------------------------------------
/policies/__pycache__/uniform_policy.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/uniform_policy.cpython-35.pyc
--------------------------------------------------------------------------------
/policies/base.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | from sandbox.rocky.tf.core.parameterized import Parameterized
5 |
6 |
7 | class Policy2(Parameterized):
8 | def __init__(self, env_spec):
9 | Parameterized.__init__(self)
10 | self._env_spec = env_spec
11 |
12 | # Should be implemented by all policies
13 |
14 | def get_action(self, observation, sub_level_actions):
15 | raise NotImplementedError
16 |
17 | def get_actions(self, observations, sub_level_actions):
18 | raise NotImplementedError
19 |
20 | def reset(self, dones=None):
21 | pass
22 |
23 | @property
24 | def vectorized(self):
25 | """
26 | Indicates whether the policy is vectorized. If True, it should implement get_actions(), and support resetting
27 | with multiple simultaneous states.
28 | """
29 | return False
30 |
31 | @property
32 | def observation_space(self):
33 | return self._env_spec.observation_space
34 |
35 | @property
36 | def action_space(self):
37 | return self._env_spec.action_space
38 |
39 | @property
40 | def env_spec(self):
41 | return self._env_spec
42 |
43 | @property
44 | def recurrent(self):
45 | """
46 | Indicates whether the policy is recurrent.
47 | :return:
48 | """
49 | return False
50 |
51 | def log_diagnostics(self, paths):
52 | """
53 | Log extra information per iteration based on the collected paths
54 | """
55 | pass
56 |
57 | @property
58 | def state_info_keys(self):
59 | """
60 | Return keys for the information related to the policy's state when taking an action.
61 | :return:
62 | """
63 | return [k for k, _ in self.state_info_specs]
64 |
65 | @property
66 | def state_info_specs(self):
67 | """
68 | Return keys and shapes for the information related to the policy's state when taking an action.
69 | :return:
70 | """
71 | return list()
72 |
73 | def terminate(self):
74 | """
75 | Clean up operation
76 | """
77 | pass
78 |
79 |
80 | class StochasticPolicy(Policy2):
81 | @property
82 | def distribution(self):
83 | """
84 | :rtype Distribution
85 | """
86 | raise NotImplementedError
87 |
88 | def dist_info_sym(self, obs_var, state_info_vars):
89 | """
90 | Return the symbolic distribution information about the actions.
91 | :param obs_var: symbolic variable for observations
92 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at
93 | the time it received the observation
94 | :return:
95 | """
96 | raise NotImplementedError
97 |
98 | def dist_info(self, obs, state_infos):
99 | """
100 | Return the distribution information about the actions.
101 | :param obs: observation values
102 | :param state_infos: a dictionary whose values should contain information about the state of the policy at
103 | the time it received the observation
104 | :return:
105 | """
106 | raise NotImplementedError
107 |
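A minimal illustrative subclass of Policy2 (not part of the repository) showing the two methods every concrete policy must provide; it ignores the sub-level actions supplied by the hierarchy and always returns a zero action:

    import numpy as np

    class ZeroPolicy(Policy2):
        def get_action(self, observation, sub_level_actions):
            return np.zeros(self.action_space.flat_dim), {}

        def get_actions(self, observations, sub_level_actions):
            return np.zeros((len(observations), self.action_space.flat_dim))

        def get_params_internal(self, **tags):
            return []   # no trainable parameters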
--------------------------------------------------------------------------------
/policies/nn_policy.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from rllab.core.serializable import Serializable
4 |
5 | from rllab.misc.overrides import overrides
6 | from sandbox.rocky.tf.policies.base import Policy
7 |
8 |
9 | class NNPolicy(Policy, Serializable):
10 | def __init__(self, env_spec, observation_ph, actions,
11 | scope_name=None):
12 | Serializable.quick_init(self, locals())
13 |
14 | self._observations_ph = observation_ph
15 | self._actions = actions
16 | self._scope_name = (
17 | tf.get_variable_scope().name if not scope_name else scope_name
18 | )
19 | super(NNPolicy, self).__init__(env_spec)
20 |
21 | @overrides
22 | def get_action(self, observation):
23 | """Sample single action based on the observations."""
24 | return self.get_actions(observation[None])[0], {}
25 |
26 | @overrides
27 | def get_actions(self, observations):
28 | """Sample actions based on the observations."""
29 | feed_dict = {self._observations_ph: observations}
30 | actions = tf.get_default_session().run(self._actions, feed_dict)
31 | return actions
32 |
33 | @overrides
34 | def log_diagnostics(self, paths):
35 | pass
36 |
37 | @overrides
38 | def get_params_internal(self, **tags):
39 | if tags:
40 | raise NotImplementedError
41 | scope = self._scope_name
42 | # Add "/" to 'scope' unless it's empty (otherwise get_collection will
43 | # return all parameters that start with 'scope').
44 | scope = scope if scope == '' else scope + '/'
45 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
46 |
--------------------------------------------------------------------------------
/policies/nn_policy2.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from rllab.core.serializable import Serializable
4 |
5 | from rllab.misc.overrides import overrides
6 | from sac.policies.base import Policy2
7 |
8 |
9 | class NNPolicy2(Policy2, Serializable):
10 | def __init__(self, env_spec, observation_ph, actions,
11 | scope_name=None):
12 | Serializable.quick_init(self, locals())
13 |
14 | self._observations_ph = observation_ph
15 | self._actions = actions
16 | self._scope_name = (
17 | tf.get_variable_scope().name if not scope_name else scope_name
18 | )
19 | super(NNPolicy2, self).__init__(env_spec)
20 |
21 | @overrides
22 | def get_action(self, observation, sub_level_actions):
23 | """Sample single action based on the observations."""
24 | return self.get_actions(observation[None], sub_level_actions)[0], {}
25 |
26 | @overrides
27 | def get_actions(self, observations, sub_level_actions):
28 | """Sample actions based on the observations."""
29 | feed_dict = {self._observations_ph: observations, self.sub_level_actions: sub_level_actions}  # the `sub_level_actions` placeholder is expected to be defined by the concrete subclass
30 | actions = tf.get_default_session().run(self._actions, feed_dict)
31 | return actions
32 |
33 | @overrides
34 | def log_diagnostics(self, paths):
35 | pass
36 |
37 | @overrides
38 | def get_params_internal(self, **tags):
39 | if tags:
40 | raise NotImplementedError
41 | scope = self._scope_name
42 | # Add "/" to 'scope' unless it's empty (otherwise get_collection will
43 | # return all parameters that start with 'scope').
44 | scope = scope if scope == '' else scope + '/'
45 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope)
46 |
47 |
48 |
--------------------------------------------------------------------------------
/policies/uniform_policy.py:
--------------------------------------------------------------------------------
1 | from rllab.core.serializable import Serializable
2 |
3 | from rllab.misc.overrides import overrides
4 | from sac.policies.base import Policy2
5 |
6 | import numpy as np
7 |
8 |
9 | class UniformPolicy(Policy2, Serializable):
10 | """
11 | Fixed policy that samples actions uniformly at random.
12 |
13 | Used for an initial exploration period instead of an undertrained policy.
14 | """
15 | def __init__(self, env_spec):
16 | Serializable.quick_init(self, locals())
17 | self._Da = env_spec.action_space.flat_dim
18 |
19 | super(UniformPolicy, self).__init__(env_spec)
20 |
21 | # Assumes action spaces are normalized to be the interval [-1, 1]
22 | @overrides
23 | def get_action(self, observation, sub_level_actions):
24 | return np.random.uniform(-1., 1., self._Da), None
25 | '''@overrides
26 | def get_action(self, observation,sub_level_actions):
27 | probs=np.random.uniform(0.0, 1., 4)
28 | probs=np.argmax(probs)#probs/sum(probs)
29 | #probs=np.array([1,0,0,0],dtype=np.float32)
30 | #probs=np.random.shuffle(probs)
31 | #actions_mean=np.sum(np.multiply(sub_level_actions[0],np.expand_dims(probs,2)),1)
32 | return sub_level_actions[0][0][probs], None
33 | #return np.random.uniform(-1., 1., self._Da), None'''
34 |
35 | @overrides
36 | def get_actions(self, observations, sub_level_actions):
37 | pass
38 |
39 | @overrides
40 | def log_diagnostics(self, paths):
41 | pass
42 |
43 | @overrides
44 | def get_params_internal(self, **tags):
45 | pass
46 |
47 |
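As the docstring notes, this policy is meant to seed the replay pool before training; a rough sketch of that warm-up loop, where env and env_spec are assumed to be pre-built rllab objects:

    initial_policy = UniformPolicy(env_spec)

    obs = env.reset()
    for _ in range(1000):     # warm-up steps before the learned policy takes over
        action, _ = initial_policy.get_action(obs, sub_level_actions=None)
        obs, reward, done, _ = env.step(action)
        if done:
            obs = env.reset()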
--------------------------------------------------------------------------------
/preprocessors/__init__.py:
--------------------------------------------------------------------------------
1 | from .mlp_preprocessor import MLPPreprocessor
2 |
--------------------------------------------------------------------------------
/preprocessors/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/preprocessors/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/preprocessors/__pycache__/mlp_preprocessor.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/preprocessors/__pycache__/mlp_preprocessor.cpython-35.pyc
--------------------------------------------------------------------------------
/preprocessors/mlp_preprocessor.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from rllab.core.serializable import Serializable
4 |
5 | from sandbox.rocky.tf.core.parameterized import Parameterized
6 |
7 | from sac.misc.mlp import MLPFunction
8 | from sac.misc import tf_utils
9 |
10 | class MLPPreprocessor(MLPFunction):
11 | def __init__(self, env_spec, layer_sizes=(128, 16),
12 | output_nonlinearity=None, name='observations_preprocessor'):
13 |
14 | Parameterized.__init__(self)
15 | Serializable.quick_init(self, locals())
16 |
17 | self._name = name
18 |
19 | self._Do = env_spec.observation_space.flat_dim
20 |
21 | obs_ph = tf.placeholder(
22 | tf.float32,
23 | shape=(None, self._Do),
24 | name='observations',
25 | )
26 |
27 | self._input_pls = (obs_ph, )
28 | self._layer_sizes = layer_sizes
29 | self._output_nonlinearity = output_nonlinearity
30 |
31 | self._output_t = self.get_output_for(obs_ph, reuse=tf.AUTO_REUSE)
32 |
--------------------------------------------------------------------------------
/primitive-policies/ant/bwrd/bwrd.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/ant/bwrd/bwrd.pkl
--------------------------------------------------------------------------------
/primitive-policies/ant/dwrd/dwrd.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/ant/dwrd/dwrd.pkl
--------------------------------------------------------------------------------
/primitive-policies/ant/fwrd/fwrd.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/ant/fwrd/fwrd.pkl
--------------------------------------------------------------------------------
/primitive-policies/ant/uwrd/uwrd.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/ant/uwrd/uwrd.pkl
--------------------------------------------------------------------------------
/primitive-policies/hc/fwd/fwd.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/hc/fwd/fwd.pkl
--------------------------------------------------------------------------------
/primitive-policies/hc/jp-longz/jump.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/hc/jp-longz/jump.pkl
--------------------------------------------------------------------------------
/primitive-policies/pusher/bottom/bottom.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/pusher/bottom/bottom.pkl
--------------------------------------------------------------------------------
/primitive-policies/pusher/left/left.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/pusher/left/left.pkl
--------------------------------------------------------------------------------
/replay_buffers/__init__.py:
--------------------------------------------------------------------------------
1 | from .simple_replay_buffer import SimpleReplayBuffer
--------------------------------------------------------------------------------
/replay_buffers/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/replay_buffers/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/replay_buffers/__pycache__/replay_buffer.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/replay_buffers/__pycache__/replay_buffer.cpython-35.pyc
--------------------------------------------------------------------------------
/replay_buffers/__pycache__/simple_replay_buffer.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/replay_buffers/__pycache__/simple_replay_buffer.cpython-35.pyc
--------------------------------------------------------------------------------
/replay_buffers/replay_buffer.py:
--------------------------------------------------------------------------------
1 | import abc
2 |
3 |
4 | class ReplayBuffer(object, metaclass=abc.ABCMeta):
5 | """
6 | A class used to save and replay data.
7 | """
8 |
9 | @abc.abstractmethod
10 | def add_sample(self, observation, action, reward, next_observation,
11 | terminal, **kwargs):
12 | """
13 | Add a transition tuple.
14 | """
15 | pass
16 |
17 | @abc.abstractmethod
18 | def terminate_episode(self):
19 | """
20 | Let the replay buffer know that the episode has terminated in case some
21 | special book-keeping has to happen.
22 | :return:
23 | """
24 | pass
25 |
26 | @property
27 | @abc.abstractmethod
28 | def size(self, **kwargs):
29 | """
30 | :return: # of unique items that can be sampled.
31 | """
32 | pass
33 |
34 | def add_path(self, path):
35 | """
36 | Add a path to the replay buffer.
37 |
38 | This default implementation naively goes through every step, but you
39 | may want to optimize this.
40 |
41 | NOTE: You should NOT call "terminate_episode" after calling add_path.
42 | It's assumed that this function handles the episode termination.
43 |
44 | :param path: Dict like one outputted by railrl.samplers.util.rollout
45 | """
46 | for i, (
47 | obs,
48 | sub_level_actions,
49 | action,
50 | reward,
51 | next_obs,
52 | terminal,
53 | agent_info,
54 | env_info
55 | ) in enumerate(zip(
56 | path["observations"],
57 | path["sub_level_actions"],
58 | path["actions"],
59 | path["rewards"],
60 | path["next_observations"],
61 | path["terminals"],
62 | path["agent_infos"],
63 | path["env_infos"],
64 | )):
65 | self.add_sample(
66 | obs,
67 | sub_level_actions,
68 | action,
69 | reward,
70 | terminal,
71 | next_obs,
72 | agent_info=agent_info,
73 | env_info=env_info,
74 | )
75 | self.terminate_episode()
76 |
77 | @abc.abstractmethod
78 | def random_batch(self, batch_size):
79 | """
80 | Return a batch of size `batch_size`.
81 | :param batch_size:
82 | :return:
83 | """
84 | pass
85 |
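A shape sketch (illustrative sizes) of the path dictionary that add_path expects, matching the keys unpacked above; seq_len corresponds to the number of sub-level/primitive actions stored per step in SimpleReplayBuffer:

    import numpy as np

    T, obs_dim, act_dim, seq_len = 5, 3, 2, 4   # illustrative sizes
    path = dict(
        observations=np.zeros((T, obs_dim)),
        sub_level_actions=np.zeros((T, seq_len, act_dim)),
        actions=np.zeros((T, act_dim)),
        rewards=np.zeros(T),
        next_observations=np.zeros((T, obs_dim)),
        terminals=np.zeros(T, dtype=bool),
        agent_infos=[{} for _ in range(T)],
        env_infos=[{} for _ in range(T)],
    )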
--------------------------------------------------------------------------------
/replay_buffers/simple_replay_buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from rllab.core.serializable import Serializable
4 |
5 | from .replay_buffer import ReplayBuffer
6 |
7 |
8 | class SimpleReplayBuffer(ReplayBuffer, Serializable):
9 | def __init__(self, env_spec, max_replay_buffer_size, seq_len):
10 | super(SimpleReplayBuffer, self).__init__()
11 | Serializable.quick_init(self, locals())
12 |
13 | max_replay_buffer_size = int(max_replay_buffer_size)
14 |
15 | self._env_spec = env_spec
16 | self._observation_dim = env_spec.observation_space.flat_dim
17 | self._action_dim = env_spec.action_space.flat_dim
18 | self._max_buffer_size = max_replay_buffer_size
19 | self._observations = np.zeros((max_replay_buffer_size,
20 | self._observation_dim))
21 | # It's a bit memory inefficient to save the observations twice,
22 | # but it makes the code *much* easier since you no longer have to
23 | # worry about termination conditions.
24 | self._next_obs = np.zeros((max_replay_buffer_size,
25 | self._observation_dim))
26 | self._sub_level_actions = np.zeros((max_replay_buffer_size, seq_len, self._action_dim))
27 | self._sub_level_probs = np.zeros((max_replay_buffer_size, seq_len, 1))
28 | self._actions = np.zeros((max_replay_buffer_size, self._action_dim))
29 | self._rewards = np.zeros(max_replay_buffer_size)
30 | # self._terminals[i] = a terminal was received at time i
31 | self._terminals = np.zeros(max_replay_buffer_size, dtype='uint8')
32 | self._top = 0
33 | self._size = 0
34 |
35 | def add_sample(self, observation, sub_level_actions, sub_level_probs, action, reward, terminal,
36 | next_observation, **kwargs):
37 | self._observations[self._top] = observation
38 | self._sub_level_actions[self._top] = sub_level_actions
39 | self._sub_level_probs[self._top] = sub_level_probs
40 | self._actions[self._top] = action
41 | self._rewards[self._top] = reward
42 | self._terminals[self._top] = terminal
43 | self._next_obs[self._top] = next_observation
44 |
45 | self._advance()
46 |
47 | def terminate_episode(self):
48 | pass
49 |
50 | def _advance(self):
51 | self._top = (self._top + 1) % self._max_buffer_size
52 | if self._size < self._max_buffer_size:
53 | self._size += 1
54 |
55 | def random_batch(self, batch_size):
56 | indices = np.random.randint(0, self._size, batch_size)
57 | return dict(
58 | observations=self._observations[indices],
59 | sub_level_actions=self._sub_level_actions[indices],
60 | sub_level_probs=self._sub_level_probs[indices],
61 | actions=self._actions[indices],
62 | rewards=self._rewards[indices],
63 | terminals=self._terminals[indices],
64 | next_observations=self._next_obs[indices],
65 | )
66 |
67 | @property
68 | def size(self):
69 | return self._size
70 |
71 | def __getstate__(self):
72 | d = super(SimpleReplayBuffer, self).__getstate__()
73 | d.update(dict(
74 | o=self._observations.tobytes(),
75 | sa=self._sub_level_actions.tobytes(),
76 | sp=self._sub_level_probs.tobytes(),
77 | a=self._actions.tobytes(),
78 | r=self._rewards.tobytes(),
79 | t=self._terminals.tobytes(),
80 | no=self._next_obs.tobytes(),
81 | top=self._top,
82 | size=self._size,
83 | ))
84 | return d
85 |
86 | def __setstate__(self, d):
87 | super(SimpleReplayBuffer, self).__setstate__(d)
88 | self._observations = np.fromstring(d['o']).reshape(
89 | self._max_buffer_size, -1
90 | )
91 | self._next_obs = np.fromstring(d['no']).reshape(
92 | self._max_buffer_size, -1
93 | )
94 | self._sub_level_actions = np.fromstring(d['sa']).reshape(self._max_buffer_size, -1, self._action_dim)  # seq_len is not stored on the instance, so infer it from the action dim
95 | self._sub_level_probs = np.fromstring(d['sp']).reshape(self._max_buffer_size, -1, 1)
96 | self._actions = np.fromstring(d['a']).reshape(self._max_buffer_size, -1)
97 | self._rewards = np.fromstring(d['r']).reshape(self._max_buffer_size)
98 | self._terminals = np.fromstring(d['t'], dtype=np.uint8)
99 | self._top = d['top']
100 | self._size = d['size']
101 |
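A small, self-contained usage sketch for SimpleReplayBuffer; the namedtuple spec below only stands in for an rllab EnvSpec, and all sizes are illustrative:

    import numpy as np
    from collections import namedtuple

    Space = namedtuple('Space', ['flat_dim'])
    Spec = namedtuple('Spec', ['observation_space', 'action_space'])
    env_spec = Spec(observation_space=Space(flat_dim=3), action_space=Space(flat_dim=2))

    pool = SimpleReplayBuffer(env_spec, max_replay_buffer_size=1000, seq_len=4)
    pool.add_sample(
        observation=np.zeros(3),
        sub_level_actions=np.zeros((4, 2)),
        sub_level_probs=np.zeros((4, 1)),
        action=np.zeros(2),
        reward=0.0,
        terminal=False,
        next_observation=np.zeros(3),
    )
    batch = pool.random_batch(1)   # dict with the keys returned by random_batch() above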
--------------------------------------------------------------------------------
/sandbox/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/algos/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/algos/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/algos/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/algos/__pycache__/batch_polopt.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/algos/__pycache__/batch_polopt.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/algos/__pycache__/npo.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/algos/__pycache__/npo.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/algos/__pycache__/trpo.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/algos/__pycache__/trpo.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/algos/npg.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/algos/npo.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | from rllab.misc import ext
5 | from rllab.misc.overrides import overrides
6 | import rllab.misc.logger as logger
7 | from sandbox.rocky.tf.optimizers.penalty_lbfgs_optimizer import PenaltyLbfgsOptimizer
8 | from sandbox.rocky.tf.algos.batch_polopt import BatchPolopt
9 | from sandbox.rocky.tf.misc import tensor_utils
10 | import tensorflow as tf
11 |
12 |
13 | class NPO(BatchPolopt):
14 | """
15 | Natural Policy Optimization.
16 | """
17 |
18 | def __init__(
19 | self,
20 | optimizer=None,
21 | optimizer_args=None,
22 | step_size=0.01,
23 | **kwargs):
24 | if optimizer is None:
25 | if optimizer_args is None:
26 | optimizer_args = dict()
27 | optimizer = PenaltyLbfgsOptimizer(**optimizer_args)
28 | self.optimizer = optimizer
29 | self.step_size = step_size
30 | super(NPO, self).__init__(**kwargs)
31 |
32 | @overrides
33 | def init_opt(self):
34 | is_recurrent = int(self.policy.recurrent)
35 | obs_var = self.env.observation_space.new_tensor_variable(
36 | 'obs',
37 | extra_dims=1 + is_recurrent,
38 | )
39 | action_var = self.env.action_space.new_tensor_variable(
40 | 'action',
41 | extra_dims=1 + is_recurrent,
42 | )
43 | advantage_var = tensor_utils.new_tensor(
44 | 'advantage',
45 | ndim=1 + is_recurrent,
46 | dtype=tf.float32,
47 | )
48 | dist = self.policy.distribution
49 |
50 | old_dist_info_vars = {
51 | k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k)
52 | for k, shape in dist.dist_info_specs
53 | }
54 | old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]
55 |
56 | state_info_vars = {
57 | k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k)
58 | for k, shape in self.policy.state_info_specs
59 | }
60 | state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]
61 |
62 | if is_recurrent:
63 | valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid")
64 | else:
65 | valid_var = None
66 |
67 | dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
68 | kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
69 | lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars)
70 | if is_recurrent:
71 | mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
72 | surr_loss = - tf.reduce_sum(lr * advantage_var * valid_var) / tf.reduce_sum(valid_var)
73 | else:
74 | mean_kl = tf.reduce_mean(kl)
75 | surr_loss = - tf.reduce_mean(lr * advantage_var)
76 |
77 | input_list = [
78 | obs_var,
79 | action_var,
80 | advantage_var,
81 | ] + state_info_vars_list + old_dist_info_vars_list
82 | if is_recurrent:
83 | input_list.append(valid_var)
84 |
85 | self.optimizer.update_opt(
86 | loss=surr_loss,
87 | target=self.policy,
88 | leq_constraint=(mean_kl, self.step_size),
89 | inputs=input_list,
90 | constraint_name="mean_kl"
91 | )
92 | return dict()
93 |
94 | @overrides
95 | def optimize_policy(self, itr, samples_data):
96 | all_input_values = tuple(ext.extract(
97 | samples_data,
98 | "observations", "actions", "advantages"
99 | ))
100 | agent_infos = samples_data["agent_infos"]
101 | state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
102 | dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
103 | all_input_values += tuple(state_info_list) + tuple(dist_info_list)
104 | if self.policy.recurrent:
105 | all_input_values += (samples_data["valids"],)
106 | logger.log("Computing loss before")
107 | loss_before = self.optimizer.loss(all_input_values)
108 | logger.log("Computing KL before")
109 | mean_kl_before = self.optimizer.constraint_val(all_input_values)
110 | logger.log("Optimizing")
111 | self.optimizer.optimize(all_input_values)
112 | logger.log("Computing KL after")
113 | mean_kl = self.optimizer.constraint_val(all_input_values)
114 | logger.log("Computing loss after")
115 | loss_after = self.optimizer.loss(all_input_values)
116 | logger.record_tabular('LossBefore', loss_before)
117 | logger.record_tabular('LossAfter', loss_after)
118 | logger.record_tabular('MeanKLBefore', mean_kl_before)
119 | logger.record_tabular('MeanKL', mean_kl)
120 | logger.record_tabular('dLoss', loss_before - loss_after)
121 | return dict()
122 |
123 | @overrides
124 | def get_itr_snapshot(self, itr, samples_data):
125 | return dict(
126 | itr=itr,
127 | policy=self.policy,
128 | baseline=self.baseline,
129 | env=self.env,
130 | )
131 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/algos/trpo.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from sandbox.rocky.tf.algos.npo import NPO
4 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer
5 |
6 |
7 | class TRPO(NPO):
8 | """
9 | Trust Region Policy Optimization
10 | """
11 |
12 | def __init__(
13 | self,
14 | optimizer=None,
15 | optimizer_args=None,
16 | **kwargs):
17 | if optimizer is None:
18 | if optimizer_args is None:
19 | optimizer_args = dict()
20 | optimizer = ConjugateGradientOptimizer(**optimizer_args)
21 | super(TRPO, self).__init__(optimizer=optimizer, **kwargs)
22 |
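A sketch of the standard rllab-style way to run this TRPO variant; the constructor keywords follow the usual BatchPolopt interface, so treat the exact names and values as indicative rather than definitive for this fork (env, policy, and baseline are assumed, pre-built rllab objects):

    algo = TRPO(
        env=env,
        policy=policy,
        baseline=baseline,
        batch_size=4000,
        max_path_length=500,
        n_itr=500,
        discount=0.99,
        step_size=0.01,   # trust-region KL bound used in NPO.init_opt above
    )
    algo.train()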
--------------------------------------------------------------------------------
/sandbox/rocky/tf/algos/vpg.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from rllab.misc import logger
4 | from rllab.misc import ext
5 | from rllab.misc.overrides import overrides
6 | from sandbox.rocky.tf.algos.batch_polopt import BatchPolopt
7 | from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer
8 | from sandbox.rocky.tf.misc import tensor_utils
9 | from rllab.core.serializable import Serializable
10 | import tensorflow as tf
11 |
12 |
13 | class VPG(BatchPolopt, Serializable):
14 | """
15 | Vanilla Policy Gradient.
16 | """
17 |
18 | def __init__(
19 | self,
20 | env,
21 | policy,
22 | baseline,
23 | optimizer=None,
24 | optimizer_args=None,
25 | **kwargs):
26 | Serializable.quick_init(self, locals())
27 | if optimizer is None:
28 | default_args = dict(
29 | batch_size=None,
30 | max_epochs=1,
31 | )
32 | if optimizer_args is None:
33 | optimizer_args = default_args
34 | else:
35 | optimizer_args = dict(default_args, **optimizer_args)
36 | optimizer = FirstOrderOptimizer(**optimizer_args)
37 | self.optimizer = optimizer
38 | self.opt_info = None
39 | super(VPG, self).__init__(env=env, policy=policy, baseline=baseline, **kwargs)
40 |
41 | @overrides
42 | def init_opt(self):
43 | is_recurrent = int(self.policy.recurrent)
44 |
45 | obs_var = self.env.observation_space.new_tensor_variable(
46 | 'obs',
47 | extra_dims=1 + is_recurrent,
48 | )
49 | action_var = self.env.action_space.new_tensor_variable(
50 | 'action',
51 | extra_dims=1 + is_recurrent,
52 | )
53 | advantage_var = tensor_utils.new_tensor(
54 | name='advantage',
55 | ndim=1 + is_recurrent,
56 | dtype=tf.float32,
57 | )
58 | dist = self.policy.distribution
59 |
60 | old_dist_info_vars = {
61 | k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k)
62 | for k, shape in dist.dist_info_specs
63 | }
64 | old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys]
65 |
66 | state_info_vars = {
67 | k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k)
68 | for k, shape in self.policy.state_info_specs
69 | }
70 | state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys]
71 |
72 | if is_recurrent:
73 | valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid")
74 | else:
75 | valid_var = None
76 |
77 | dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars)
78 | logli = dist.log_likelihood_sym(action_var, dist_info_vars)
79 | kl = dist.kl_sym(old_dist_info_vars, dist_info_vars)
80 |
81 | # formulate as a minimization problem
82 | # The gradient of the surrogate objective is the policy gradient
83 | if is_recurrent:
84 | surr_obj = - tf.reduce_sum(logli * advantage_var * valid_var) / tf.reduce_sum(valid_var)
85 | mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var)
86 | max_kl = tf.reduce_max(kl * valid_var)
87 | else:
88 | surr_obj = - tf.reduce_mean(logli * advantage_var)
89 | mean_kl = tf.reduce_mean(kl)
90 | max_kl = tf.reduce_max(kl)
91 |
92 | input_list = [obs_var, action_var, advantage_var] + state_info_vars_list
93 | if is_recurrent:
94 | input_list.append(valid_var)
95 |
96 | self.optimizer.update_opt(loss=surr_obj, target=self.policy, inputs=input_list)
97 |
98 | f_kl = tensor_utils.compile_function(
99 | inputs=input_list + old_dist_info_vars_list,
100 | outputs=[mean_kl, max_kl],
101 | )
102 | self.opt_info = dict(
103 | f_kl=f_kl,
104 | )
105 |
106 | @overrides
107 | def optimize_policy(self, itr, samples_data):
108 | logger.log("optimizing policy")
109 | inputs = ext.extract(
110 | samples_data,
111 | "observations", "actions", "advantages"
112 | )
113 | agent_infos = samples_data["agent_infos"]
114 | state_info_list = [agent_infos[k] for k in self.policy.state_info_keys]
115 | inputs += tuple(state_info_list)
116 | if self.policy.recurrent:
117 | inputs += (samples_data["valids"],)
118 | dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys]
119 | loss_before = self.optimizer.loss(inputs)
120 | self.optimizer.optimize(inputs)
121 | loss_after = self.optimizer.loss(inputs)
122 | logger.record_tabular("LossBefore", loss_before)
123 | logger.record_tabular("LossAfter", loss_after)
124 |
125 | mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list))
126 | logger.record_tabular('MeanKL', mean_kl)
127 | logger.record_tabular('MaxKL', max_kl)
128 |
129 | @overrides
130 | def get_itr_snapshot(self, itr, samples_data):
131 | return dict(
132 | itr=itr,
133 | policy=self.policy,
134 | baseline=self.baseline,
135 | env=self.env,
136 | )
137 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/core/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/core/layers_powered.py:
--------------------------------------------------------------------------------
1 | from sandbox.rocky.tf.core.parameterized import Parameterized
2 | import sandbox.rocky.tf.core.layers as L
3 | import itertools
4 |
5 |
6 | class LayersPowered(Parameterized):
7 |
8 | def __init__(self, output_layers, input_layers=None):
9 | self._output_layers = output_layers
10 | self._input_layers = input_layers
11 | Parameterized.__init__(self)
12 |
13 | def get_params_internal(self, **tags):
14 | layers = L.get_all_layers(self._output_layers, treat_as_input=self._input_layers)
15 | params = itertools.chain.from_iterable(l.get_params(**tags) for l in layers)
16 | return L.unique(params)
17 |
18 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/core/parameterized.py:
--------------------------------------------------------------------------------
1 | from contextlib import contextmanager
2 |
3 | from rllab.core.serializable import Serializable
4 | from rllab.misc.tensor_utils import flatten_tensors, unflatten_tensors
5 | import tensorflow as tf
6 |
7 |
8 | load_params = True
9 |
10 | @contextmanager
11 | def suppress_params_loading():
12 | global load_params
13 | load_params = False
14 | yield
15 | load_params = True
16 |
17 |
18 | class Parameterized(object):
19 | def __init__(self):
20 | self._cached_params = {}
21 | self._cached_param_dtypes = {}
22 | self._cached_param_shapes = {}
23 | self._cached_assign_ops = {}
24 | self._cached_assign_placeholders = {}
25 |
26 | def get_params_internal(self, **tags):
27 | """
28 | Internal method to be implemented which does not perform caching
29 | """
30 | raise NotImplementedError
31 |
32 | def get_params(self, **tags):
33 | """
34 | Get the list of parameters, filtered by the provided tags.
35 | Some common tags include 'regularizable' and 'trainable'
36 | """
37 | tag_tuple = tuple(sorted(list(tags.items()), key=lambda x: x[0]))
38 | if tag_tuple not in self._cached_params:
39 | self._cached_params[tag_tuple] = self.get_params_internal(**tags)
40 | return self._cached_params[tag_tuple]
41 |
42 | def get_param_dtypes(self, **tags):
43 | tag_tuple = tuple(sorted(list(tags.items()), key=lambda x: x[0]))
44 | if tag_tuple not in self._cached_param_dtypes:
45 | params = self.get_params(**tags)
46 | param_values = tf.get_default_session().run(params)
47 | self._cached_param_dtypes[tag_tuple] = [val.dtype for val in param_values]
48 | return self._cached_param_dtypes[tag_tuple]
49 |
50 | def get_param_shapes(self, **tags):
51 | tag_tuple = tuple(sorted(list(tags.items()), key=lambda x: x[0]))
52 | if tag_tuple not in self._cached_param_shapes:
53 | params = self.get_params(**tags)
54 | param_values = tf.get_default_session().run(params)
55 | self._cached_param_shapes[tag_tuple] = [val.shape for val in param_values]
56 | return self._cached_param_shapes[tag_tuple]
57 |
58 | def get_param_values(self, **tags):
59 | params = self.get_params(**tags)
60 | param_values = tf.get_default_session().run(params)
61 | return flatten_tensors(param_values)
62 |
63 | def set_param_values(self, flattened_params, **tags):
64 | debug = tags.pop("debug", False)
65 | param_values = unflatten_tensors(
66 | flattened_params, self.get_param_shapes(**tags))
67 | ops = []
68 | feed_dict = dict()
69 | for param, dtype, value in zip(
70 | self.get_params(**tags),
71 | self.get_param_dtypes(**tags),
72 | param_values):
73 | if param not in self._cached_assign_ops:
74 | assign_placeholder = tf.placeholder(dtype=param.dtype.base_dtype)
75 | assign_op = tf.assign(param, assign_placeholder)
76 | self._cached_assign_ops[param] = assign_op
77 | self._cached_assign_placeholders[param] = assign_placeholder
78 | ops.append(self._cached_assign_ops[param])
79 | feed_dict[self._cached_assign_placeholders[param]] = value.astype(dtype)
80 | if debug:
81 | print("setting value of %s" % param.name)
82 | tf.get_default_session().run(ops, feed_dict=feed_dict)
83 |
84 | def flat_to_params(self, flattened_params, **tags):
85 | return unflatten_tensors(flattened_params, self.get_param_shapes(**tags))
86 |
87 | def __getstate__(self):
88 | d = Serializable.__getstate__(self)
89 | global load_params
90 | if load_params:
91 | d["params"] = self.get_param_values()
92 | return d
93 |
94 | def __setstate__(self, d):
95 | Serializable.__setstate__(self, d)
96 | global load_params
97 | if load_params:
98 | tf.get_default_session().run(tf.variables_initializer(self.get_params()))
99 | self.set_param_values(d["params"])
100 |
101 |
102 | class JointParameterized(Parameterized):
103 | def __init__(self, components):
104 | super(JointParameterized, self).__init__()
105 | self.components = components
106 |
107 | def get_params_internal(self, **tags):
108 | params = [param for comp in self.components for param in comp.get_params_internal(**tags)]
109 | # only return unique parameters
110 | return sorted(set(params), key=hash)
111 |
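A minimal usage sketch of the parameter API above (illustrative only: the TwoWeights class and its simplified tag handling are assumptions made for this example, not part of the repository, and a TF1-style default session is required):

    import tensorflow as tf
    from rllab.core.serializable import Serializable
    from sandbox.rocky.tf.core.parameterized import Parameterized

    class TwoWeights(Parameterized, Serializable):
        """Hypothetical Parameterized subclass holding two TF variables."""
        def __init__(self):
            Serializable.quick_init(self, locals())
            self.w = tf.Variable([1.0, 2.0], name="w")
            self.b = tf.Variable([0.5], name="b")
            Parameterized.__init__(self)

        def get_params_internal(self, **tags):
            # A fuller implementation would filter by tags such as trainable=True.
            return [self.w, self.b]

    with tf.Session() as sess:
        target = TwoWeights()
        sess.run(tf.global_variables_initializer())
        flat = target.get_param_values()      # flat numpy vector: [1., 2., 0.5]
        target.set_param_values(flat * 0.0)   # assigns zeros back to w and b
        print(sess.run(target.w))             # [0. 0.]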
--------------------------------------------------------------------------------
/sandbox/rocky/tf/distributions/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/distributions/base.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | class Distribution(object):
6 | @property
7 | def dim(self):
8 | raise NotImplementedError
9 |
10 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
11 | """
12 | Compute the symbolic KL divergence of two distributions
13 | """
14 | raise NotImplementedError
15 |
16 | def kl(self, old_dist_info, new_dist_info):
17 | """
18 | Compute the KL divergence of two distributions
19 | """
20 | raise NotImplementedError
21 |
22 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars):
23 | raise NotImplementedError
24 |
25 | def entropy(self, dist_info):
26 | raise NotImplementedError
27 |
28 | def log_likelihood_sym(self, x_var, dist_info_vars):
29 | raise NotImplementedError
30 |
31 | def log_likelihood(self, xs, dist_info):
32 | raise NotImplementedError
33 |
34 | @property
35 | def dist_info_specs(self):
36 | raise NotImplementedError
37 |
38 | @property
39 | def dist_info_keys(self):
40 | return [k for k, _ in self.dist_info_specs]
41 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/distributions/bernoulli.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from .base import Distribution
4 | import tensorflow as tf
5 | import numpy as np
6 |
7 | TINY = 1e-8
8 |
9 |
10 | class Bernoulli(Distribution):
11 | def __init__(self, dim):
12 | self._dim = dim
13 |
14 | @property
15 | def dim(self):
16 | return self._dim
17 |
18 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
19 | old_p = old_dist_info_vars["p"]
20 | new_p = new_dist_info_vars["p"]
21 | kl = old_p * (tf.log(old_p + TINY) - tf.log(new_p + TINY)) + \
22 | (1 - old_p) * (tf.log(1 - old_p + TINY) - tf.log(1 - new_p + TINY))
23 | ndims = kl.get_shape().ndims
24 | return tf.reduce_sum(kl, axis=ndims - 1)
25 |
26 | def kl(self, old_dist_info, new_dist_info):
27 | old_p = old_dist_info["p"]
28 | new_p = new_dist_info["p"]
29 | kl = old_p * (np.log(old_p + TINY) - np.log(new_p + TINY)) + \
30 | (1 - old_p) * (np.log(1 - old_p + TINY) - np.log(1 - new_p + TINY))
31 | return np.sum(kl, axis=-1)
32 |
33 | def sample(self, dist_info):
34 | p = np.asarray(dist_info["p"])
35 | return np.cast['int'](np.random.uniform(low=0., high=1., size=p.shape) < p)
36 |
37 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars):
38 | old_p = old_dist_info_vars["p"]
39 | new_p = new_dist_info_vars["p"]
40 | ndims = old_p.get_shape().ndims
41 | return tf.reduce_prod(x_var * new_p / (old_p + TINY) + (1 - x_var) * (1 - new_p) / (1 - old_p + TINY),
42 | axis=ndims - 1)
43 |
44 | def log_likelihood_sym(self, x_var, dist_info_vars):
45 | p = dist_info_vars["p"]
46 | ndims = p.get_shape().ndims
47 | return tf.reduce_sum(x_var * tf.log(p + TINY) + (1 - x_var) * tf.log(1 - p + TINY), axis=ndims - 1)
48 |
49 | def log_likelihood(self, xs, dist_info):
50 | p = dist_info["p"]
51 | return np.sum(xs * np.log(p + TINY) + (1 - xs) * np.log(1 - p + TINY), axis=-1)
52 |
53 | def entropy(self, dist_info):
54 | p = dist_info["p"]
55 | return np.sum(- p * np.log(p + TINY) - (1 - p) * np.log(1 - p + TINY), axis=-1)
56 |
57 | @property
58 | def dist_info_keys(self):
59 | return ["p"]
60 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/distributions/categorical.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from .base import Distribution
3 | import tensorflow as tf
4 | from sandbox.rocky.tf.misc import tensor_utils
5 |
6 | TINY = 1e-8
7 |
8 |
9 | def from_onehot(x_var):
10 | ret = np.zeros((len(x_var),), 'int32')
11 | nonzero_n, nonzero_a = np.nonzero(x_var)
12 | ret[nonzero_n] = nonzero_a
13 | return ret
14 |
15 |
16 | class Categorical(Distribution):
17 | def __init__(self, dim):
18 | self._dim = dim
19 | weights_var = tf.placeholder(
20 | dtype=tf.float32,
21 | shape=(None, dim),
22 | name="weights"
23 | )
24 | self._f_sample = tensor_utils.compile_function(
25 | inputs=[weights_var],
26 | outputs=tf.multinomial(tf.log(weights_var + 1e-8), num_samples=1)[:, 0],
27 | )
28 |
29 | @property
30 | def dim(self):
31 | return self._dim
32 |
33 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
34 | """
35 | Compute the symbolic KL divergence of two categorical distributions
36 | """
37 | old_prob_var = old_dist_info_vars["prob"]
38 | new_prob_var = new_dist_info_vars["prob"]
39 | ndims = old_prob_var.get_shape().ndims
40 | # Assume layout is N * A
41 | return tf.reduce_sum(
42 | old_prob_var * (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)),
43 | axis=ndims - 1
44 | )
45 |
46 | def kl(self, old_dist_info, new_dist_info):
47 | """
48 | Compute the KL divergence of two categorical distributions
49 | """
50 | old_prob = old_dist_info["prob"]
51 | new_prob = new_dist_info["prob"]
52 | return np.sum(
53 | old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)),
54 | axis=-1
55 | )
56 |
57 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars):
58 | old_prob_var = old_dist_info_vars["prob"]
59 | new_prob_var = new_dist_info_vars["prob"]
60 | ndims = old_prob_var.get_shape().ndims
61 | x_var = tf.cast(x_var, tf.float32)
62 | # Assume layout is N * A
63 | return (tf.reduce_sum(new_prob_var * x_var, ndims - 1) + TINY) / \
64 | (tf.reduce_sum(old_prob_var * x_var, ndims - 1) + TINY)
65 |
66 | def entropy_sym(self, dist_info_vars):
67 | probs = dist_info_vars["prob"]
68 | return -tf.reduce_sum(probs * tf.log(probs + TINY), axis=1)
69 |
70 | def cross_entropy_sym(self, old_dist_info_vars, new_dist_info_vars):
71 | old_prob_var = old_dist_info_vars["prob"]
72 | new_prob_var = new_dist_info_vars["prob"]
73 | ndims = old_prob_var.get_shape().ndims
74 | # Assume layout is N * A
75 | return tf.reduce_sum(
76 | old_prob_var * (- tf.log(new_prob_var + TINY)),
77 | axis=ndims - 1
78 | )
79 |
80 | def entropy(self, info):
81 | probs = info["prob"]
82 | return -np.sum(probs * np.log(probs + TINY), axis=1)
83 |
84 | def log_likelihood_sym(self, x_var, dist_info_vars):
85 | probs = dist_info_vars["prob"]
86 | ndims = probs.get_shape().ndims
87 | return tf.log(tf.reduce_sum(probs * tf.cast(x_var, tf.float32), ndims - 1) + TINY)
88 |
89 | def log_likelihood(self, xs, dist_info):
90 | probs = dist_info["prob"]
91 | # Assume layout is N * A
92 | return np.log(np.sum(probs * xs, axis=-1) + TINY)
93 |
94 | @property
95 | def dist_info_specs(self):
96 | return [("prob", (self.dim,))]
97 |
98 | def sample(self, dist_info):
99 | return self._f_sample(dist_info["prob"])
100 |
101 | def sample_sym(self, dist_info):
102 | probs = dist_info["prob"]
103 | samples = tf.multinomial(tf.log(probs + 1e-8), num_samples=1)[:, 0]
104 | return tf.nn.embedding_lookup(np.eye(self.dim, dtype=np.float32), samples)
105 |
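A small illustrative use of the Categorical helpers above (a sketch only; sample() needs a TF1 default session because __init__ compiles a TF sampling function):

    import numpy as np
    import tensorflow as tf
    from sandbox.rocky.tf.distributions.categorical import Categorical

    with tf.Session().as_default():
        cat = Categorical(dim=3)
        probs = np.array([[0.1, 0.2, 0.7],
                          [0.5, 0.5, 0.0]])
        print(cat.sample(dict(prob=probs)))    # e.g. [2 0]; one sampled index per row
        print(cat.entropy(dict(prob=probs)))   # per-row entropy in nats
        print(cat.kl(dict(prob=probs), dict(prob=probs)))  # [0. 0.]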
--------------------------------------------------------------------------------
/sandbox/rocky/tf/distributions/diagonal_gaussian.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | import tensorflow as tf
5 | import numpy as np
6 | from sandbox.rocky.tf.distributions.base import Distribution
7 |
8 |
9 | class DiagonalGaussian(Distribution):
10 | def __init__(self, dim):
11 | self._dim = dim
12 |
13 | @property
14 | def dim(self):
15 | return self._dim
16 |
17 |     def kl(self, old_dist_info, new_dist_info):
18 |         """
19 |         Compute the KL divergence of two multivariate Gaussian distributions with
20 |         diagonal covariance matrices
21 |         """
22 |         old_means = old_dist_info["mean"]
23 |         old_log_stds = old_dist_info["log_std"]
24 |         new_means = new_dist_info["mean"]
25 |         new_log_stds = new_dist_info["log_std"]
26 | old_std = np.exp(old_log_stds)
27 | new_std = np.exp(new_log_stds)
28 | # means: (N*A)
29 | # std: (N*A)
30 | # formula:
31 | # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
32 | # ln(\sigma_2/\sigma_1)
33 | numerator = np.square(old_means - new_means) + \
34 | np.square(old_std) - np.square(new_std)
35 | denominator = 2 * np.square(new_std) + 1e-8
36 | return np.sum(
37 | numerator / denominator + new_log_stds - old_log_stds, axis=-1)
38 | # more lossy version
39 | # return TT.sum(
40 | # numerator / denominator + TT.log(new_std) - TT.log(old_std ), axis=-1)
41 |
42 |     def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
43 |         """
44 |         Compute the symbolic KL divergence of two multivariate Gaussian distributions with
45 |         diagonal covariance matrices
46 |         """
47 |         old_means = old_dist_info_vars["mean"]
48 |         old_log_stds = old_dist_info_vars["log_std"]
49 |         new_means = new_dist_info_vars["mean"]
50 |         new_log_stds = new_dist_info_vars["log_std"]
51 | old_std = tf.exp(old_log_stds)
52 | new_std = tf.exp(new_log_stds)
53 | # means: (N*A)
54 | # std: (N*A)
55 | # formula:
56 | # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) +
57 | # ln(\sigma_2/\sigma_1)
58 | numerator = tf.square(old_means - new_means) + \
59 | tf.square(old_std) - tf.square(new_std)
60 | denominator = 2 * tf.square(new_std) + 1e-8
61 | return tf.reduce_sum(
62 | numerator / denominator + new_log_stds - old_log_stds, axis=-1)
63 |
64 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars):
65 | logli_new = self.log_likelihood_sym(x_var, new_dist_info_vars)
66 | logli_old = self.log_likelihood_sym(x_var, old_dist_info_vars)
67 | return tf.exp(logli_new - logli_old)
68 |
69 | def log_likelihood_sym(self, x_var, dist_info_vars):
70 | means = dist_info_vars["mean"]
71 | log_stds = dist_info_vars["log_std"]
72 | zs = (x_var - means) / tf.exp(log_stds)
73 | return - tf.reduce_sum(log_stds, axis=-1) - \
74 | 0.5 * tf.reduce_sum(tf.square(zs), axis=-1) - \
75 | 0.5 * self.dim * np.log(2 * np.pi)
76 |
77 | def sample(self, dist_info):
78 | means = dist_info["mean"]
79 | log_stds = dist_info["log_std"]
80 | rnd = np.random.normal(size=means.shape)
81 | return rnd * np.exp(log_stds) + means
82 |
83 | def log_likelihood(self, xs, dist_info):
84 | means = dist_info["mean"]
85 | log_stds = dist_info["log_std"]
86 | zs = (xs - means) / np.exp(log_stds)
87 | return - np.sum(log_stds, axis=-1) - \
88 | 0.5 * np.sum(np.square(zs), axis=-1) - \
89 | 0.5 * self.dim * np.log(2 * np.pi)
90 |
91 | def entropy(self, dist_info):
92 | log_stds = dist_info["log_std"]
93 | return np.sum(log_stds + np.log(np.sqrt(2 * np.pi * np.e)), axis=-1)
94 |
95 | @property
96 | def dist_info_specs(self):
97 | return [("mean", (self.dim,)), ("log_std", (self.dim,))]
98 |
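A quick numeric sanity check of the diagonal-Gaussian KL above against the standard one-dimensional closed form (illustrative; the numbers are arbitrary):

    import numpy as np
    from sandbox.rocky.tf.distributions.diagonal_gaussian import DiagonalGaussian

    dist = DiagonalGaussian(dim=1)
    old = dict(mean=np.array([[0.0]]), log_std=np.array([[0.0]]))          # N(0, 1)
    new = dict(mean=np.array([[1.0]]), log_std=np.array([[np.log(2.0)]]))  # N(1, 4)

    # Closed form: log(s2/s1) + (s1^2 + (m1 - m2)^2) / (2 * s2^2) - 1/2
    expected = np.log(2.0) + (1.0 + 1.0) / (2.0 * 4.0) - 0.5
    print(dist.kl(old, new), expected)  # both approximately 0.4431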
--------------------------------------------------------------------------------
/sandbox/rocky/tf/distributions/recurrent_categorical.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 | from sandbox.rocky.tf.distributions.categorical import Categorical
4 | from sandbox.rocky.tf.distributions.base import Distribution
5 |
6 | TINY = 1e-8
7 |
8 |
9 | class RecurrentCategorical(Distribution):
10 | def __init__(self, dim):
11 | self._cat = Categorical(dim)
12 | self._dim = dim
13 |
14 | @property
15 | def dim(self):
16 | return self._dim
17 |
18 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars):
19 | """
20 | Compute the symbolic KL divergence of two categorical distributions
21 | """
22 | old_prob_var = old_dist_info_vars["prob"]
23 | new_prob_var = new_dist_info_vars["prob"]
24 | # Assume layout is N * T * A
25 | return tf.reduce_sum(
26 | old_prob_var * (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)),
27 | axis=2
28 | )
29 |
30 | def kl(self, old_dist_info, new_dist_info):
31 | """
32 | Compute the KL divergence of two categorical distributions
33 | """
34 | old_prob = old_dist_info["prob"]
35 | new_prob = new_dist_info["prob"]
36 | return np.sum(
37 | old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)),
38 | axis=2
39 | )
40 |
41 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars):
42 | old_prob_var = old_dist_info_vars["prob"]
43 | new_prob_var = new_dist_info_vars["prob"]
44 | # Assume layout is N * T * A
45 | a_dim = tf.shape(x_var)[2]
46 | flat_ratios = self._cat.likelihood_ratio_sym(
47 | tf.reshape(x_var, tf.stack([-1, a_dim])),
48 | dict(prob=tf.reshape(old_prob_var, tf.stack([-1, a_dim]))),
49 | dict(prob=tf.reshape(new_prob_var, tf.stack([-1, a_dim])))
50 | )
51 | return tf.reshape(flat_ratios, tf.shape(old_prob_var)[:2])
52 |
53 | def entropy(self, dist_info):
54 | probs = dist_info["prob"]
55 | return -np.sum(probs * np.log(probs + TINY), axis=2)
56 |
57 | def entropy_sym(self, dist_info_vars):
58 | probs = dist_info_vars["prob"]
59 | return -tf.reduce_sum(probs * tf.log(probs + TINY), 2)
60 |
61 | def log_likelihood_sym(self, xs, dist_info_vars):
62 | probs = dist_info_vars["prob"]
63 | # Assume layout is N * T * A
64 | a_dim = tf.shape(probs)[2]
65 | # a_dim = TT.printing.Print("lala")(a_dim)
66 | flat_logli = self._cat.log_likelihood_sym(
67 | tf.reshape(xs, tf.stack([-1, a_dim])),
68 | dict(prob=tf.reshape(probs, tf.stack((-1, a_dim))))
69 | )
70 | return tf.reshape(flat_logli, tf.shape(probs)[:2])
71 |
72 | def log_likelihood(self, xs, dist_info):
73 | probs = dist_info["prob"]
74 | # Assume layout is N * T * A
75 |         a_dim = probs.shape[-1]
76 |         flat_logli = self._cat.log_likelihood(
77 | xs.reshape((-1, a_dim)),
78 | dict(prob=probs.reshape((-1, a_dim)))
79 | )
80 | return flat_logli.reshape(probs.shape[:2])
81 |
82 | @property
83 | def dist_info_specs(self):
84 | return [("prob", (self.dim,))]
85 |
86 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/distributions/recurrent_diagonal_gaussian.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | from sandbox.rocky.tf.distributions.diagonal_gaussian import DiagonalGaussian
5 |
6 | RecurrentDiagonalGaussian = DiagonalGaussian
7 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/envs/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/envs/base.py:
--------------------------------------------------------------------------------
1 | from rllab.envs.proxy_env import ProxyEnv
2 | from rllab.envs.base import EnvSpec
3 | from rllab.spaces.box import Box as TheanoBox
4 | from rllab.spaces.discrete import Discrete as TheanoDiscrete
5 | from rllab.spaces.product import Product as TheanoProduct
6 | from sandbox.rocky.tf.spaces.discrete import Discrete
7 | from sandbox.rocky.tf.spaces.box import Box
8 | from sandbox.rocky.tf.spaces.product import Product
9 | from cached_property import cached_property
10 |
11 |
12 | def to_tf_space(space):
13 | if isinstance(space, TheanoBox):
14 | return Box(low=space.low, high=space.high)
15 | elif isinstance(space, TheanoDiscrete):
16 | return Discrete(space.n)
17 | elif isinstance(space, TheanoProduct):
18 | return Product(list(map(to_tf_space, space.components)))
19 | else:
20 | raise NotImplementedError
21 |
22 |
23 | class WrappedCls(object):
24 | def __init__(self, cls, env_cls, extra_kwargs):
25 | self.cls = cls
26 | self.env_cls = env_cls
27 | self.extra_kwargs = extra_kwargs
28 |
29 | def __call__(self, *args, **kwargs):
30 | return self.cls(self.env_cls(*args, **dict(self.extra_kwargs, **kwargs)))
31 |
32 |
33 | class TfEnv(ProxyEnv):
34 | @cached_property
35 | def observation_space(self):
36 | return to_tf_space(self.wrapped_env.observation_space)
37 |
38 | @cached_property
39 | def action_space(self):
40 | return to_tf_space(self.wrapped_env.action_space)
41 |
42 | @cached_property
43 | def spec(self):
44 | return EnvSpec(
45 | observation_space=self.observation_space,
46 | action_space=self.action_space,
47 | )
48 |
49 | @property
50 | def vectorized(self):
51 | return getattr(self.wrapped_env, "vectorized", False)
52 |
53 | def vec_env_executor(self, n_envs, max_path_length):
54 | return VecTfEnv(self.wrapped_env.vec_env_executor(n_envs=n_envs, max_path_length=max_path_length))
55 |
56 | @classmethod
57 | def wrap(cls, env_cls, **extra_kwargs):
58 | # Use a class wrapper rather than a lambda method for smoother serialization
59 | return WrappedCls(cls, env_cls, extra_kwargs)
60 |
61 |
62 | class VecTfEnv(object):
63 |
64 | def __init__(self, vec_env):
65 | self.vec_env = vec_env
66 |
67 | def reset(self):
68 | return self.vec_env.reset()
69 |
70 | @property
71 | def num_envs(self):
72 | return self.vec_env.num_envs
73 |
74 | def step(self, action_n):
75 | return self.vec_env.step(action_n)
76 |
77 | def terminate(self):
78 | self.vec_env.terminate()
79 |
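An illustrative use of TfEnv and TfEnv.wrap (a sketch; it assumes rllab's Box2D CartpoleEnv is available, as in the launcher scripts further below):

    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from rllab.envs.normalized_env import normalize
    from sandbox.rocky.tf.envs.base import TfEnv

    # Direct wrapping, as in the launchers:
    env = TfEnv(normalize(CartpoleEnv()))

    # wrap() returns a picklable WrappedCls instead of a lambda, so the wrapped
    # constructor itself can be serialized and instantiated later:
    env_cls = TfEnv.wrap(CartpoleEnv)
    env2 = env_cls()
    print(env.observation_space, env2.action_space)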
--------------------------------------------------------------------------------
/sandbox/rocky/tf/envs/vec_env_executor.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | import numpy as np
4 | import pickle
5 | from sandbox.rocky.tf.misc import tensor_utils
6 |
7 |
8 | class VecEnvExecutor(object):
9 | def __init__(self, envs, max_path_length):
10 | self.envs = envs
11 | self._action_space = envs[0].action_space
12 | self._observation_space = envs[0].observation_space
13 | self.ts = np.zeros(len(self.envs), dtype='int')
14 | self.max_path_length = max_path_length
15 |
16 | def step(self, action_n):
17 | all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)]
18 | obs, rewards, dones, env_infos = list(map(list, list(zip(*all_results))))
19 | dones = np.asarray(dones)
20 | rewards = np.asarray(rewards)
21 | self.ts += 1
22 | if self.max_path_length is not None:
23 | dones[self.ts >= self.max_path_length] = True
24 | for (i, done) in enumerate(dones):
25 | if done:
26 | obs[i] = self.envs[i].reset()
27 | self.ts[i] = 0
28 | return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos)
29 |
30 | def reset(self):
31 | results = [env.reset() for env in self.envs]
32 | self.ts[:] = 0
33 | return results
34 |
35 | @property
36 | def num_envs(self):
37 | return len(self.envs)
38 |
39 | @property
40 | def action_space(self):
41 | return self._action_space
42 |
43 | @property
44 | def observation_space(self):
45 | return self._observation_space
46 |
47 | def terminate(self):
48 | pass
49 |
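A short rollout sketch for VecEnvExecutor (illustrative; assumes rllab's Box2D CartpoleEnv, and uses random actions where a policy's get_actions() would normally go):

    from rllab.envs.box2d.cartpole_env import CartpoleEnv
    from sandbox.rocky.tf.envs.vec_env_executor import VecEnvExecutor

    vec_env = VecEnvExecutor(envs=[CartpoleEnv() for _ in range(4)],
                             max_path_length=100)
    observations = vec_env.reset()
    for _ in range(10):
        actions = [vec_env.action_space.sample() for _ in range(vec_env.num_envs)]
        observations, rewards, dones, env_infos = vec_env.step(actions)
    vec_env.terminate()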
--------------------------------------------------------------------------------
/sandbox/rocky/tf/launchers/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/launchers/trpo_cartpole.py:
--------------------------------------------------------------------------------
1 | from sandbox.rocky.tf.algos.trpo import TRPO
2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv
4 | from rllab.envs.normalized_env import normalize
5 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer
6 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp
7 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
8 | from sandbox.rocky.tf.envs.base import TfEnv
9 | from rllab.misc.instrument import stub, run_experiment_lite
10 |
11 | env = TfEnv(normalize(CartpoleEnv()))
12 |
13 | policy = GaussianMLPPolicy(
14 | name="policy",
15 | env_spec=env.spec,
16 | # The neural network policy should have two hidden layers, each with 32 hidden units.
17 | hidden_sizes=(32, 32)
18 | )
19 |
20 | baseline = LinearFeatureBaseline(env_spec=env.spec)
21 |
22 | algo = TRPO(
23 | env=env,
24 | policy=policy,
25 | baseline=baseline,
26 | batch_size=4000,
27 | max_path_length=100,
28 | n_itr=40,
29 | discount=0.99,
30 | step_size=0.01,
31 | # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
32 |
33 | )
34 | algo.train()
35 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/launchers/trpo_cartpole_recurrent.py:
--------------------------------------------------------------------------------
1 | from sandbox.rocky.tf.algos.trpo import TRPO
2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv
4 | from rllab.envs.normalized_env import normalize
5 | from sandbox.rocky.tf.policies.gaussian_gru_policy import GaussianGRUPolicy
6 | from sandbox.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy
7 | from sandbox.rocky.tf.envs.base import TfEnv
8 | import sandbox.rocky.tf.core.layers as L
9 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp
10 | from rllab.misc.instrument import stub, run_experiment_lite
11 |
12 | env = TfEnv(normalize(CartpoleEnv()))
13 |
14 | policy = GaussianLSTMPolicy(
15 | name="policy",
16 | env_spec=env.spec,
17 | lstm_layer_cls=L.TfBasicLSTMLayer,
18 | # gru_layer_cls=L.GRULayer,
19 | )
20 |
21 | baseline = LinearFeatureBaseline(env_spec=env.spec)
22 |
23 | algo = TRPO(
24 | env=env,
25 | policy=policy,
26 | baseline=baseline,
27 | batch_size=4000,
28 | max_path_length=100,
29 | n_itr=10,
30 | discount=0.99,
31 | step_size=0.01,
32 | optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5))
33 | )
34 | algo.train()
35 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/launchers/vpg_cartpole.py:
--------------------------------------------------------------------------------
1 | from sandbox.rocky.tf.algos.vpg import VPG
2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline
3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv
4 | from rllab.envs.normalized_env import normalize
5 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy
6 | from sandbox.rocky.tf.envs.base import TfEnv
7 | from rllab.misc.instrument import stub, run_experiment_lite
8 |
9 | env = TfEnv(normalize(CartpoleEnv()))
10 |
11 | policy = GaussianMLPPolicy(
12 | name="policy",
13 | env_spec=env.spec,
14 | # The neural network policy should have two hidden layers, each with 32 hidden units.
15 | hidden_sizes=(32, 32)
16 | )
17 |
18 | baseline = LinearFeatureBaseline(env_spec=env.spec)
19 |
20 | algo = VPG(
21 | env=env,
22 | policy=policy,
23 | baseline=baseline,
24 | batch_size=10000,
25 | max_path_length=100,
26 | n_itr=40,
27 | discount=0.99,
28 | optimizer_args=dict(
29 | tf_optimizer_args=dict(
30 | learning_rate=0.01,
31 | )
32 | )
33 | )
34 | algo.train()
35 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/misc/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/misc/tensor_utils.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 | import numpy as np
3 |
4 |
5 | def compile_function(inputs, outputs, log_name=None):
6 | def run(*input_vals):
7 | sess = tf.get_default_session()
8 | return sess.run(outputs, feed_dict=dict(list(zip(inputs, input_vals))))
9 |
10 | return run
11 |
12 |
13 | def flatten_tensor_variables(ts):
14 | return tf.concat(axis=0, values=[tf.reshape(x, [-1]) for x in ts])
15 |
16 |
17 | def unflatten_tensor_variables(flatarr, shapes, symb_arrs):
18 | arrs = []
19 | n = 0
20 | for (shape, symb_arr) in zip(shapes, symb_arrs):
21 | size = np.prod(list(shape))
22 | arr = tf.reshape(flatarr[n:n + size], shape)
23 | arrs.append(arr)
24 | n += size
25 | return arrs
26 |
27 |
28 | def new_tensor(name, ndim, dtype):
29 | return tf.placeholder(dtype=dtype, shape=[None] * ndim, name=name)
30 |
31 |
32 | def new_tensor_like(name, arr_like):
33 | return new_tensor(name, arr_like.get_shape().ndims, arr_like.dtype.base_dtype)
34 |
35 |
36 | def concat_tensor_list(tensor_list):
37 | return np.concatenate(tensor_list, axis=0)
38 |
39 |
40 | def concat_tensor_dict_list(tensor_dict_list):
41 | keys = list(tensor_dict_list[0].keys())
42 | ret = dict()
43 | for k in keys:
44 | example = tensor_dict_list[0][k]
45 | if isinstance(example, dict):
46 | v = concat_tensor_dict_list([x[k] for x in tensor_dict_list])
47 | else:
48 | v = concat_tensor_list([x[k] for x in tensor_dict_list])
49 | ret[k] = v
50 | return ret
51 |
52 |
53 | def stack_tensor_list(tensor_list):
54 | return np.array(tensor_list)
55 | # tensor_shape = np.array(tensor_list[0]).shape
56 | # if tensor_shape is tuple():
57 | # return np.array(tensor_list)
58 | # return np.vstack(tensor_list)
59 |
60 |
61 | def stack_tensor_dict_list(tensor_dict_list):
62 | """
63 | Stack a list of dictionaries of {tensors or dictionary of tensors}.
64 | :param tensor_dict_list: a list of dictionaries of {tensors or dictionary of tensors}.
65 | :return: a dictionary of {stacked tensors or dictionary of stacked tensors}
66 | """
67 | keys = list(tensor_dict_list[0].keys())
68 | ret = dict()
69 | for k in keys:
70 | example = tensor_dict_list[0][k]
71 | if isinstance(example, dict):
72 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list])
73 | else:
74 | v = stack_tensor_list([x[k] for x in tensor_dict_list])
75 | ret[k] = v
76 | return ret
77 |
78 |
79 | def split_tensor_dict_list(tensor_dict):
80 | keys = list(tensor_dict.keys())
81 | ret = None
82 | for k in keys:
83 | vals = tensor_dict[k]
84 | if isinstance(vals, dict):
85 | vals = split_tensor_dict_list(vals)
86 | if ret is None:
87 | ret = [{k: v} for v in vals]
88 | else:
89 | for v, cur_dict in zip(vals, ret):
90 | cur_dict[k] = v
91 | return ret
92 |
93 |
94 | def to_onehot_sym(inds, dim):
95 | return tf.one_hot(inds, depth=dim, on_value=1, off_value=0)
96 |
97 |
98 | def pad_tensor(x, max_len):
99 | return np.concatenate([
100 | x,
101 | np.tile(np.zeros_like(x[0]), (max_len - len(x),) + (1,) * np.ndim(x[0]))
102 | ])
103 |
104 |
105 | def pad_tensor_n(xs, max_len):
106 | ret = np.zeros((len(xs), max_len) + xs[0].shape[1:], dtype=xs[0].dtype)
107 | for idx, x in enumerate(xs):
108 | ret[idx][:len(x)] = x
109 | return ret
110 |
111 |
112 | def pad_tensor_dict(tensor_dict, max_len):
113 | keys = list(tensor_dict.keys())
114 | ret = dict()
115 | for k in keys:
116 | if isinstance(tensor_dict[k], dict):
117 | ret[k] = pad_tensor_dict(tensor_dict[k], max_len)
118 | else:
119 | ret[k] = pad_tensor(tensor_dict[k], max_len)
120 | return ret
121 |
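A minimal sketch of compile_function and pad_tensor from this module (illustrative; compile_function needs a TF1 default session at call time):

    import numpy as np
    import tensorflow as tf
    from sandbox.rocky.tf.misc import tensor_utils

    x = tf.placeholder(tf.float32, shape=(None,), name="x")
    y = tf.placeholder(tf.float32, shape=(None,), name="y")
    f_dot = tensor_utils.compile_function(inputs=[x, y], outputs=tf.reduce_sum(x * y))

    with tf.Session().as_default():
        print(f_dot([1.0, 2.0], [3.0, 4.0]))  # 11.0

    # pad_tensor pads a length-2 trajectory up to max_len=4 with zeros.
    print(tensor_utils.pad_tensor(np.array([[1.0], [2.0]]), max_len=4))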
--------------------------------------------------------------------------------
/sandbox/rocky/tf/optimizers/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/optimizers/first_order_optimizer.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | from rllab.misc import ext
5 | from rllab.misc import logger
6 | from rllab.core.serializable import Serializable
7 | from sandbox.rocky.tf.misc import tensor_utils
8 | # from rllab.algo.first_order_method import parse_update_method
9 | from rllab.optimizers.minibatch_dataset import BatchDataset
10 | from collections import OrderedDict
11 | import tensorflow as tf
12 | import time
13 | from functools import partial
14 | import pyprind
15 |
16 |
17 | class FirstOrderOptimizer(Serializable):
18 | """
19 | Performs (stochastic) gradient descent, possibly using fancier methods like adam etc.
20 | """
21 |
22 | def __init__(
23 | self,
24 | tf_optimizer_cls=None,
25 | tf_optimizer_args=None,
26 | # learning_rate=1e-3,
27 | max_epochs=1000,
28 | tolerance=1e-6,
29 | batch_size=32,
30 | callback=None,
31 | verbose=False,
32 | **kwargs):
33 | """
34 |
35 | :param max_epochs:
36 | :param tolerance:
37 | :param update_method:
38 | :param batch_size: None or an integer. If None the whole dataset will be used.
39 | :param callback:
40 | :param kwargs:
41 | :return:
42 | """
43 | Serializable.quick_init(self, locals())
44 | self._opt_fun = None
45 | self._target = None
46 | self._callback = callback
47 | if tf_optimizer_cls is None:
48 | tf_optimizer_cls = tf.train.AdamOptimizer
49 | if tf_optimizer_args is None:
50 | tf_optimizer_args = dict(learning_rate=1e-3)
51 | self._tf_optimizer = tf_optimizer_cls(**tf_optimizer_args)
52 | self._max_epochs = max_epochs
53 | self._tolerance = tolerance
54 | self._batch_size = batch_size
55 | self._verbose = verbose
56 | self._input_vars = None
57 | self._train_op = None
58 |
59 | def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs):
60 | """
61 | :param loss: Symbolic expression for the loss function.
62 | :param target: A parameterized object to optimize over. It should implement methods of the
63 |             :class:`rllab.core.parameterized.Parameterized` class.
64 | :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
65 | :param inputs: A list of symbolic variables as inputs
66 | :return: No return value.
67 | """
68 |
69 | self._target = target
70 |
71 | self._train_op = self._tf_optimizer.minimize(loss, var_list=target.get_params(trainable=True))
72 |
73 | # updates = OrderedDict([(k, v.astype(k.dtype)) for k, v in updates.iteritems()])
74 |
75 | if extra_inputs is None:
76 | extra_inputs = list()
77 | self._input_vars = inputs + extra_inputs
78 | self._opt_fun = ext.lazydict(
79 | f_loss=lambda: tensor_utils.compile_function(inputs + extra_inputs, loss),
80 | )
81 |
82 | def loss(self, inputs, extra_inputs=None):
83 | if extra_inputs is None:
84 | extra_inputs = tuple()
85 | return self._opt_fun["f_loss"](*(tuple(inputs) + extra_inputs))
86 |
87 | def optimize(self, inputs, extra_inputs=None, callback=None):
88 |
89 | if len(inputs) == 0:
90 | # Assumes that we should always sample mini-batches
91 | raise NotImplementedError
92 |
93 | f_loss = self._opt_fun["f_loss"]
94 |
95 | if extra_inputs is None:
96 | extra_inputs = tuple()
97 |
98 | last_loss = f_loss(*(tuple(inputs) + extra_inputs))
99 |
100 | start_time = time.time()
101 |
102 | dataset = BatchDataset(inputs, self._batch_size, extra_inputs=extra_inputs)
103 |
104 | sess = tf.get_default_session()
105 |
106 | for epoch in range(self._max_epochs):
107 | if self._verbose:
108 | logger.log("Epoch %d" % (epoch))
109 | progbar = pyprind.ProgBar(len(inputs[0]))
110 |
111 | for batch in dataset.iterate(update=True):
112 | sess.run(self._train_op, dict(list(zip(self._input_vars, batch))))
113 | if self._verbose:
114 | progbar.update(len(batch[0]))
115 |
116 | if self._verbose:
117 | if progbar.active:
118 | progbar.stop()
119 |
120 | new_loss = f_loss(*(tuple(inputs) + extra_inputs))
121 |
122 | if self._verbose:
123 | logger.log("Epoch: %d | Loss: %f" % (epoch, new_loss))
124 | if self._callback or callback:
125 | elapsed = time.time() - start_time
126 | callback_args = dict(
127 | loss=new_loss,
128 | params=self._target.get_param_values(trainable=True) if self._target else None,
129 | itr=epoch,
130 | elapsed=elapsed,
131 | )
132 | if self._callback:
133 | self._callback(callback_args)
134 | if callback:
135 | callback(**callback_args)
136 |
137 | if abs(last_loss - new_loss) < self._tolerance:
138 | break
139 | last_loss = new_loss
140 |
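A minimal end-to-end sketch of FirstOrderOptimizer on a toy least-squares problem (illustrative only; the LinearModel class and the chosen learning rate are assumptions made for this example, not part of the repository):

    import numpy as np
    import tensorflow as tf
    from rllab.core.serializable import Serializable
    from sandbox.rocky.tf.core.parameterized import Parameterized
    from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer

    class LinearModel(Parameterized, Serializable):
        """Hypothetical one-parameter model: y_hat = w * x."""
        def __init__(self):
            Serializable.quick_init(self, locals())
            self.w = tf.Variable(0.0, name="w")
            Parameterized.__init__(self)

        def get_params_internal(self, **tags):
            return [self.w]

    x_var = tf.placeholder(tf.float32, shape=(None,), name="x")
    y_var = tf.placeholder(tf.float32, shape=(None,), name="y")
    model = LinearModel()
    loss = tf.reduce_mean(tf.square(model.w * x_var - y_var))

    opt = FirstOrderOptimizer(tf_optimizer_args=dict(learning_rate=0.05),
                              max_epochs=500, batch_size=64)
    opt.update_opt(loss=loss, target=model, inputs=[x_var, y_var])

    xs = np.random.randn(256).astype(np.float32)
    ys = 3.0 * xs
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        opt.optimize([xs, ys])
        print(opt.loss([xs, ys]), sess.run(model.w))  # loss near 0, w near 3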
--------------------------------------------------------------------------------
/sandbox/rocky/tf/optimizers/lbfgs_optimizer.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | from rllab.misc import ext
4 | from sandbox.rocky.tf.misc import tensor_utils
5 | from rllab.core.serializable import Serializable
6 | import tensorflow as tf
7 | import scipy.optimize
8 | import time
9 |
10 |
11 | class LbfgsOptimizer(Serializable):
12 | """
13 | Performs unconstrained optimization via L-BFGS.
14 | """
15 |
16 | def __init__(self, name, max_opt_itr=20, callback=None):
17 | Serializable.quick_init(self, locals())
18 | self._name = name
19 | self._max_opt_itr = max_opt_itr
20 | self._opt_fun = None
21 | self._target = None
22 | self._callback = callback
23 |
24 | def update_opt(self, loss, target, inputs, extra_inputs=None, *args, **kwargs):
25 | """
26 | :param loss: Symbolic expression for the loss function.
27 | :param target: A parameterized object to optimize over. It should implement methods of the
28 |             :class:`rllab.core.parameterized.Parameterized` class.
29 | :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon.
30 | :param inputs: A list of symbolic variables as inputs
31 | :return: No return value.
32 | """
33 |
34 | self._target = target
35 |
36 | def get_opt_output():
37 | flat_grad = tensor_utils.flatten_tensor_variables(tf.gradients(loss, target.get_params(trainable=True)))
38 | return [tf.cast(loss, tf.float64), tf.cast(flat_grad, tf.float64)]
39 |
40 | if extra_inputs is None:
41 | extra_inputs = list()
42 |
43 | self._opt_fun = ext.lazydict(
44 | f_loss=lambda: tensor_utils.compile_function(inputs + extra_inputs, loss),
45 | f_opt=lambda: tensor_utils.compile_function(
46 | inputs=inputs + extra_inputs,
47 | outputs=get_opt_output(),
48 | )
49 | )
50 |
51 | def loss(self, inputs, extra_inputs=None):
52 | if extra_inputs is None:
53 | extra_inputs = list()
54 | return self._opt_fun["f_loss"](*(list(inputs) + list(extra_inputs)))
55 |
56 | def optimize(self, inputs, extra_inputs=None):
57 | f_opt = self._opt_fun["f_opt"]
58 |
59 | if extra_inputs is None:
60 | extra_inputs = list()
61 |
62 | def f_opt_wrapper(flat_params):
63 | self._target.set_param_values(flat_params, trainable=True)
64 |             ret = f_opt(*(list(inputs) + list(extra_inputs)))
65 | return ret
66 |
67 | itr = [0]
68 | start_time = time.time()
69 |
70 | if self._callback:
71 | def opt_callback(params):
72 | loss = self._opt_fun["f_loss"](*(inputs + extra_inputs))
73 | elapsed = time.time() - start_time
74 | self._callback(dict(
75 | loss=loss,
76 | params=params,
77 | itr=itr[0],
78 | elapsed=elapsed,
79 | ))
80 | itr[0] += 1
81 | else:
82 | opt_callback = None
83 |
84 | scipy.optimize.fmin_l_bfgs_b(
85 | func=f_opt_wrapper, x0=self._target.get_param_values(trainable=True),
86 | maxiter=self._max_opt_itr, callback=opt_callback,
87 | )
88 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/policies/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/policies/base.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | from sandbox.rocky.tf.core.parameterized import Parameterized
5 |
6 |
7 | class Policy(Parameterized):
8 | def __init__(self, env_spec):
9 | Parameterized.__init__(self)
10 | self._env_spec = env_spec
11 |
12 | # Should be implemented by all policies
13 |
14 | def get_action(self, observation):
15 | raise NotImplementedError
16 |
17 | def get_actions(self, observations):
18 | raise NotImplementedError
19 |
20 | def reset(self, dones=None):
21 | pass
22 |
23 | @property
24 | def vectorized(self):
25 | """
26 | Indicates whether the policy is vectorized. If True, it should implement get_actions(), and support resetting
27 | with multiple simultaneous states.
28 | """
29 | return False
30 |
31 | @property
32 | def observation_space(self):
33 | return self._env_spec.observation_space
34 |
35 | @property
36 | def action_space(self):
37 | return self._env_spec.action_space
38 |
39 | @property
40 | def env_spec(self):
41 | return self._env_spec
42 |
43 | @property
44 | def recurrent(self):
45 | """
46 | Indicates whether the policy is recurrent.
47 | :return:
48 | """
49 | return False
50 |
51 | def log_diagnostics(self, paths):
52 | """
53 | Log extra information per iteration based on the collected paths
54 | """
55 | pass
56 |
57 | @property
58 | def state_info_keys(self):
59 | """
60 | Return keys for the information related to the policy's state when taking an action.
61 | :return:
62 | """
63 | return [k for k, _ in self.state_info_specs]
64 |
65 | @property
66 | def state_info_specs(self):
67 | """
68 | Return keys and shapes for the information related to the policy's state when taking an action.
69 | :return:
70 | """
71 | return list()
72 |
73 | def terminate(self):
74 | """
75 | Clean up operation
76 | """
77 | pass
78 |
79 |
80 | class StochasticPolicy(Policy):
81 | @property
82 | def distribution(self):
83 | """
84 | :rtype Distribution
85 | """
86 | raise NotImplementedError
87 |
88 | def dist_info_sym(self, obs_var, state_info_vars):
89 | """
90 | Return the symbolic distribution information about the actions.
91 | :param obs_var: symbolic variable for observations
92 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at
93 | the time it received the observation
94 | :return:
95 | """
96 | raise NotImplementedError
97 |
98 | def dist_info(self, obs, state_infos):
99 | """
100 | Return the distribution information about the actions.
101 |         :param obs: observation values
102 |         :param state_infos: a dictionary whose values should contain information about the state of the policy at
103 | the time it received the observation
104 | :return:
105 | """
106 | raise NotImplementedError
107 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/policies/categorical_conv_policy.py:
--------------------------------------------------------------------------------
1 | from sandbox.rocky.tf.core.layers_powered import LayersPowered
2 | import sandbox.rocky.tf.core.layers as L
3 | from sandbox.rocky.tf.core.network import ConvNetwork
4 | from rllab.core.serializable import Serializable
5 | from sandbox.rocky.tf.distributions.categorical import Categorical
6 | from sandbox.rocky.tf.policies.base import StochasticPolicy
7 | from rllab.misc import ext
8 | from sandbox.rocky.tf.misc import tensor_utils
9 | from rllab.misc.overrides import overrides
10 | from sandbox.rocky.tf.spaces.discrete import Discrete
11 | import tensorflow as tf
12 |
13 |
14 | class CategoricalConvPolicy(StochasticPolicy, LayersPowered, Serializable):
15 | def __init__(
16 | self,
17 | name,
18 | env_spec,
19 | conv_filters, conv_filter_sizes, conv_strides, conv_pads,
20 | hidden_sizes=[],
21 | hidden_nonlinearity=tf.nn.relu,
22 | output_nonlinearity=tf.nn.softmax,
23 | prob_network=None,
24 | ):
25 | """
26 | :param env_spec: A spec for the mdp.
27 | :param hidden_sizes: list of sizes for the fully connected hidden layers
28 | :param hidden_nonlinearity: nonlinearity used for each hidden layer
29 | :param prob_network: manually specified network for this policy, other network params
30 | are ignored
31 | :return:
32 | """
33 | Serializable.quick_init(self, locals())
34 |
35 | assert isinstance(env_spec.action_space, Discrete)
36 |
37 | self._env_spec = env_spec
38 | # import pdb; pdb.set_trace()
39 | if prob_network is None:
40 | prob_network = ConvNetwork(
41 | input_shape=env_spec.observation_space.shape,
42 | output_dim=env_spec.action_space.n,
43 | conv_filters=conv_filters,
44 | conv_filter_sizes=conv_filter_sizes,
45 | conv_strides=conv_strides,
46 | conv_pads=conv_pads,
47 | hidden_sizes=hidden_sizes,
48 | hidden_nonlinearity=hidden_nonlinearity,
49 | output_nonlinearity=output_nonlinearity,
50 | name="prob_network",
51 | )
52 |
53 | self._l_prob = prob_network.output_layer
54 | self._l_obs = prob_network.input_layer
55 | self._f_prob = tensor_utils.compile_function(
56 | [prob_network.input_layer.input_var],
57 | L.get_output(prob_network.output_layer)
58 | )
59 |
60 | self._dist = Categorical(env_spec.action_space.n)
61 |
62 | super(CategoricalConvPolicy, self).__init__(env_spec)
63 | LayersPowered.__init__(self, [prob_network.output_layer])
64 |
65 | @property
66 | def vectorized(self):
67 | return True
68 |
69 | @overrides
70 | def dist_info_sym(self, obs_var, state_info_vars=None):
71 | return dict(prob=L.get_output(self._l_prob, {self._l_obs: tf.cast(obs_var, tf.float32)}))
72 |
73 | @overrides
74 | def dist_info(self, obs, state_infos=None):
75 | return dict(prob=self._f_prob(obs))
76 |
77 |     # get_action returns a pair: the sampled action for a single observation,
78 |     # plus a dict with the categorical probabilities it was drawn from.
79 |     # get_actions is the vectorized version, returning a list of N actions and
80 |     # a dict containing the (N, A) probability matrix under the current policy.
81 | @overrides
82 | def get_action(self, observation):
83 | flat_obs = self.observation_space.flatten(observation)
84 | prob = self._f_prob([flat_obs])[0]
85 | action = self.action_space.weighted_sample(prob)
86 | return action, dict(prob=prob)
87 |
88 | def get_actions(self, observations):
89 | flat_obs = self.observation_space.flatten_n(observations)
90 | probs = self._f_prob(flat_obs)
91 | actions = list(map(self.action_space.weighted_sample, probs))
92 | return actions, dict(prob=probs)
93 |
94 | @property
95 | def distribution(self):
96 | return self._dist
97 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/policies/categorical_mlp_policy.py:
--------------------------------------------------------------------------------
1 | from sandbox.rocky.tf.core.layers_powered import LayersPowered
2 | import sandbox.rocky.tf.core.layers as L
3 | from sandbox.rocky.tf.core.network import MLP
4 | from rllab.core.serializable import Serializable
5 | from sandbox.rocky.tf.distributions.categorical import Categorical
6 | from sandbox.rocky.tf.policies.base import StochasticPolicy
7 | from rllab.misc import ext
8 | from sandbox.rocky.tf.misc import tensor_utils
9 | from rllab.misc.overrides import overrides
10 | from sandbox.rocky.tf.spaces.discrete import Discrete
11 | import tensorflow as tf
12 |
13 |
14 | class CategoricalMLPPolicy(StochasticPolicy, LayersPowered, Serializable):
15 | def __init__(
16 | self,
17 | name,
18 | env_spec,
19 | hidden_sizes=(32, 32),
20 | hidden_nonlinearity=tf.nn.tanh,
21 | prob_network=None,
22 | ):
23 | """
24 | :param env_spec: A spec for the mdp.
25 | :param hidden_sizes: list of sizes for the fully connected hidden layers
26 | :param hidden_nonlinearity: nonlinearity used for each hidden layer
27 | :param prob_network: manually specified network for this policy, other network params
28 | are ignored
29 | :return:
30 | """
31 | Serializable.quick_init(self, locals())
32 |
33 | assert isinstance(env_spec.action_space, Discrete)
34 |
35 | with tf.variable_scope(name):
36 | if prob_network is None:
37 | prob_network = MLP(
38 | input_shape=(env_spec.observation_space.flat_dim,),
39 | output_dim=env_spec.action_space.n,
40 | hidden_sizes=hidden_sizes,
41 | hidden_nonlinearity=hidden_nonlinearity,
42 | output_nonlinearity=tf.nn.softmax,
43 | name="prob_network",
44 | )
45 |
46 | self._l_prob = prob_network.output_layer
47 | self._l_obs = prob_network.input_layer
48 | self._f_prob = tensor_utils.compile_function(
49 | [prob_network.input_layer.input_var],
50 | L.get_output(prob_network.output_layer)
51 | )
52 |
53 | self._dist = Categorical(env_spec.action_space.n)
54 |
55 | super(CategoricalMLPPolicy, self).__init__(env_spec)
56 | LayersPowered.__init__(self, [prob_network.output_layer])
57 |
58 | @property
59 | def vectorized(self):
60 | return True
61 |
62 | @overrides
63 | def dist_info_sym(self, obs_var, state_info_vars=None):
64 | return dict(prob=L.get_output(self._l_prob, {self._l_obs: tf.cast(obs_var, tf.float32)}))
65 |
66 | @overrides
67 | def dist_info(self, obs, state_infos=None):
68 | return dict(prob=self._f_prob(obs))
69 |
70 |     # get_action returns a pair: the sampled action for a single observation,
71 |     # plus a dict with the categorical probabilities it was drawn from.
72 |     # get_actions is the vectorized version, returning a list of N actions and
73 |     # a dict containing the (N, A) probability matrix under the current policy.
74 | @overrides
75 | def get_action(self, observation):
76 | flat_obs = self.observation_space.flatten(observation)
77 | prob = self._f_prob([flat_obs])[0]
78 | action = self.action_space.weighted_sample(prob)
79 | return action, dict(prob=prob)
80 |
81 | def get_actions(self, observations):
82 | flat_obs = self.observation_space.flatten_n(observations)
83 | probs = self._f_prob(flat_obs)
84 | actions = list(map(self.action_space.weighted_sample, probs))
85 | return actions, dict(prob=probs)
86 |
87 | @property
88 | def distribution(self):
89 | return self._dist
90 |
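For reference, get_action above simply samples an index from the softmax probabilities produced by prob_network. The standalone NumPy sketch below mirrors that weighted sampling outside of rllab; the helper name and the probability values are illustrative only, not the rllab implementation.

import numpy as np

def weighted_sample(prob, rng=np.random):
    # Sample an action index i with probability prob[i], as
    # Discrete.weighted_sample does for this policy.
    return rng.choice(len(prob), p=prob)

prob = np.array([0.1, 0.6, 0.3])        # softmax output for one observation
action = weighted_sample(prob)
print(action, dict(prob=prob))          # same (action, info) pair as get_action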
--------------------------------------------------------------------------------
/sandbox/rocky/tf/policies/deterministic_mlp_policy.py:
--------------------------------------------------------------------------------
1 | from rllab.core.serializable import Serializable
2 | from rllab.misc import ext
3 | from rllab.misc.overrides import overrides
4 | from sandbox.rocky.tf.core.layers_powered import LayersPowered
5 | from sandbox.rocky.tf.core.network import MLP
6 | from sandbox.rocky.tf.distributions.categorical import Categorical
7 | from sandbox.rocky.tf.policies.base import Policy
8 | from sandbox.rocky.tf.misc import tensor_utils
9 |
10 | import sandbox.rocky.tf.core.layers as L
11 | from sandbox.rocky.tf.core.layers import batch_norm
12 |
13 | from sandbox.rocky.tf.spaces.discrete import Discrete
14 | import tensorflow as tf
15 |
16 |
17 | class DeterministicMLPPolicy(Policy, LayersPowered, Serializable):
18 | def __init__(
19 | self,
20 | name,
21 | env_spec,
22 | hidden_sizes=(32, 32),
23 | hidden_nonlinearity=tf.nn.relu,
24 | output_nonlinearity=tf.nn.tanh,
25 | prob_network=None,
26 | bn=False):
27 | Serializable.quick_init(self, locals())
28 |
29 | with tf.variable_scope(name):
30 | if prob_network is None:
31 | prob_network = MLP(
32 | input_shape=(env_spec.observation_space.flat_dim,),
33 | output_dim=env_spec.action_space.flat_dim,
34 | hidden_sizes=hidden_sizes,
35 | hidden_nonlinearity=hidden_nonlinearity,
36 | output_nonlinearity=output_nonlinearity,
37 | # batch_normalization=True,
38 | name="prob_network",
39 | )
40 |
41 | self._l_prob = prob_network.output_layer
42 | self._l_obs = prob_network.input_layer
43 | self._f_prob = tensor_utils.compile_function(
44 | [prob_network.input_layer.input_var],
45 | L.get_output(prob_network.output_layer, deterministic=True)
46 | )
47 |
48 | self.prob_network = prob_network
49 |
50 | # Note the deterministic=True argument. It makes sure that when getting
51 | # actions from single observations, we do not update params in the
52 | # batch normalization layers.
53 | # TODO: this doesn't currently work properly in the tf version so we leave out batch_norm
54 | super(DeterministicMLPPolicy, self).__init__(env_spec)
55 | LayersPowered.__init__(self, [prob_network.output_layer])
56 |
57 | @property
58 | def vectorized(self):
59 | return True
60 |
61 | @overrides
62 | def get_action(self, observation):
63 | flat_obs = self.observation_space.flatten(observation)
64 | action = self._f_prob([flat_obs])[0]
65 | return action, dict()
66 |
67 | @overrides
68 | def get_actions(self, observations):
69 | flat_obs = self.observation_space.flatten_n(observations)
70 | actions = self._f_prob(flat_obs)
71 | return actions, dict()
72 |
73 | def get_action_sym(self, obs_var):
74 | return L.get_output(self.prob_network.output_layer, obs_var)
75 |
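With the default nonlinearities, DeterministicMLPPolicy is a ReLU network whose outputs are squashed by tanh into [-1, 1]. The toy NumPy forward pass below shows only that shape and bounding; the weights and dimensions are made up for illustration.

import numpy as np

obs_dim, action_dim = 10, 4                    # assumed dimensions
rng = np.random.RandomState(0)
W1 = rng.randn(obs_dim, 32) * 0.1
W2 = rng.randn(32, action_dim) * 0.1

obs = rng.randn(obs_dim)
hidden = np.maximum(0.0, obs @ W1)             # hidden_nonlinearity=tf.nn.relu
action = np.tanh(hidden @ W2)                  # output_nonlinearity=tf.nn.tanh
print(action)                                  # every entry lies in [-1, 1]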
--------------------------------------------------------------------------------
/sandbox/rocky/tf/policies/uniform_control_policy.py:
--------------------------------------------------------------------------------
1 | from sandbox.rocky.tf.policies.base import Policy
2 | from rllab.core.serializable import Serializable
3 |
4 |
5 | class UniformControlPolicy(Policy, Serializable):
6 | def __init__(
7 | self,
8 | env_spec,
9 | ):
10 | Serializable.quick_init(self, locals())
11 | super(UniformControlPolicy, self).__init__(env_spec=env_spec)
12 |
13 | @property
14 | def vectorized(self):
15 | return True
16 |
17 | def get_action(self, observation):
18 | return self.action_space.sample(), dict()
19 |
20 | def get_actions(self, observations):
21 | return self.action_space.sample_n(len(observations)), dict()
22 |
23 | def get_params_internal(self, **tags):
24 | return []
25 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/q_functions/base.py:
--------------------------------------------------------------------------------
1 | from sandbox.rocky.tf.core.parameterized import Parameterized
2 |
3 | class QFunction(Parameterized):
4 | pass
5 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/q_functions/continuous_mlp_q_function.py:
--------------------------------------------------------------------------------
1 | from sandbox.rocky.tf.q_functions.base import QFunction
2 | from rllab.core.serializable import Serializable
3 | from rllab.misc import ext
4 |
5 | from sandbox.rocky.tf.core.layers_powered import LayersPowered
6 | from sandbox.rocky.tf.core.network import MLP
7 | from sandbox.rocky.tf.core.layers import batch_norm
8 | from sandbox.rocky.tf.distributions.categorical import Categorical
9 | from sandbox.rocky.tf.policies.base import StochasticPolicy
10 | from sandbox.rocky.tf.misc import tensor_utils
11 |
12 | import tensorflow as tf
13 | import sandbox.rocky.tf.core.layers as L
14 |
15 |
16 | class ContinuousMLPQFunction(QFunction, LayersPowered, Serializable):
17 | def __init__(
18 | self,
19 | env_spec,
20 | hidden_sizes=(32, 32),
21 | hidden_nonlinearity=tf.nn.relu,
22 | action_merge_layer=-2,
23 | output_nonlinearity=None,
24 | bn=False):
25 | Serializable.quick_init(self, locals())
26 |
27 | l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim), name="obs")
28 | l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim), name="actions")
29 |
30 | n_layers = len(hidden_sizes) + 1
31 |
32 | if n_layers > 1:
33 | action_merge_layer = \
34 | (action_merge_layer % n_layers + n_layers) % n_layers
35 | else:
36 | action_merge_layer = 1
37 |
38 | l_hidden = l_obs
39 |
40 | for idx, size in enumerate(hidden_sizes):
41 | if bn:
42 | l_hidden = batch_norm(l_hidden)
43 |
44 | if idx == action_merge_layer:
45 | l_hidden = L.ConcatLayer([l_hidden, l_action])
46 |
47 | l_hidden = L.DenseLayer(
48 | l_hidden,
49 | num_units=size,
50 | nonlinearity=hidden_nonlinearity,
51 | name="h%d" % (idx + 1)
52 | )
53 |
54 | if action_merge_layer == n_layers:
55 | l_hidden = L.ConcatLayer([l_hidden, l_action])
56 |
57 | l_output = L.DenseLayer(
58 | l_hidden,
59 | num_units=1,
60 | nonlinearity=output_nonlinearity,
61 | name="output"
62 | )
63 |
64 | output_var = L.get_output(l_output, deterministic=True)
65 |
66 | self._f_qval = tensor_utils.compile_function([l_obs.input_var, l_action.input_var], output_var)
67 | self._output_layer = l_output
68 | self._obs_layer = l_obs
69 | self._action_layer = l_action
70 | self._output_nonlinearity = output_nonlinearity
71 |
72 | LayersPowered.__init__(self, [l_output])
73 |
74 | def get_qval(self, observations, actions):
75 | return self._f_qval(observations, actions)
76 |
77 | def get_qval_sym(self, obs_var, action_var, **kwargs):
78 | qvals = L.get_output(
79 | self._output_layer,
80 | {self._obs_layer: obs_var, self._action_layer: action_var},
81 | **kwargs
82 | )
83 | return tf.reshape(qvals, (-1,))
84 |
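The modular arithmetic on action_merge_layer converts a possibly negative index into a position among n_layers = len(hidden_sizes) + 1 layers; with the constructor defaults it resolves to 1, so actions are concatenated just before the second hidden layer. A quick sketch of that index computation, using the default values:

hidden_sizes = (32, 32)
n_layers = len(hidden_sizes) + 1        # hidden layers plus the output layer
action_merge_layer = -2                 # constructor default

if n_layers > 1:
    action_merge_layer = (action_merge_layer % n_layers + n_layers) % n_layers
else:
    action_merge_layer = 1

print(action_merge_layer)               # 1 -> actions join before hidden layer "h2"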
--------------------------------------------------------------------------------
/sandbox/rocky/tf/regressors/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/regressors/deterministic_mlp_regressor.py:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | import numpy as np
8 |
9 | import tensorflow as tf
10 | from sandbox.rocky.tf.core.layers_powered import LayersPowered
11 | from sandbox.rocky.tf.core.network import MLP
12 | from sandbox.rocky.tf.misc import tensor_utils
13 | from sandbox.rocky.tf.distributions.categorical import Categorical
14 | from sandbox.rocky.tf.optimizers.penalty_lbfgs_optimizer import PenaltyLbfgsOptimizer
15 | from sandbox.rocky.tf.optimizers.lbfgs_optimizer import LbfgsOptimizer
16 | import sandbox.rocky.tf.core.layers as L
17 | from rllab.core.serializable import Serializable
18 | from rllab.misc import ext
19 | from rllab.misc import logger
20 |
21 | NONE = list()
22 |
23 |
24 | class DeterministicMLPRegressor(LayersPowered, Serializable):
25 | """
26 | A class for performing nonlinear regression.
27 | """
28 |
29 | def __init__(
30 | self,
31 | name,
32 | input_shape,
33 | output_dim,
34 | network=None,
35 | hidden_sizes=(32, 32),
36 | hidden_nonlinearity=tf.nn.tanh,
37 | output_nonlinearity=None,
38 | optimizer=None,
39 | normalize_inputs=True,
40 | ):
41 | """
42 | :param input_shape: Shape of the input data.
43 | :param output_dim: Dimension of output.
44 | :param hidden_sizes: Number of hidden units of each layer of the mean network.
45 | :param hidden_nonlinearity: Non-linearity used for each layer of the mean network.
46 | :param optimizer: Optimizer for minimizing the negative log-likelihood.
47 | """
48 | Serializable.quick_init(self, locals())
49 |
50 | with tf.variable_scope(name):
51 |
52 | if optimizer is None:
53 | optimizer = LbfgsOptimizer(name="optimizer")
54 |
55 | self.output_dim = output_dim
56 | self.optimizer = optimizer
57 |
58 | if network is None:
59 | network = MLP(
60 | input_shape=input_shape,
61 | output_dim=output_dim,
62 | hidden_sizes=hidden_sizes,
63 | hidden_nonlinearity=hidden_nonlinearity,
64 | output_nonlinearity=output_nonlinearity,
65 | name="network"
66 | )
67 |
68 | l_out = network.output_layer
69 |
70 | LayersPowered.__init__(self, [l_out])
71 |
72 | xs_var = network.input_layer.input_var
73 | ys_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="ys")
74 |
75 | x_mean_var = tf.get_variable(
76 | name="x_mean",
77 | shape=(1,) + input_shape,
78 | initializer=tf.constant_initializer(0., dtype=tf.float32)
79 | )
80 | x_std_var = tf.get_variable(
81 | name="x_std",
82 | shape=(1,) + input_shape,
83 | initializer=tf.constant_initializer(1., dtype=tf.float32)
84 | )
85 |
86 | normalized_xs_var = (xs_var - x_mean_var) / x_std_var
87 |
88 | fit_ys_var = L.get_output(l_out, {network.input_layer: normalized_xs_var})
89 |
90 |             loss = tf.reduce_mean(tf.square(fit_ys_var - ys_var))  # mean squared error to be minimized
91 |
92 | self.f_predict = tensor_utils.compile_function([xs_var], fit_ys_var)
93 |
94 | optimizer_args = dict(
95 | loss=loss,
96 | target=self,
97 | network_outputs=[fit_ys_var],
98 | )
99 |
100 | optimizer_args["inputs"] = [xs_var, ys_var]
101 |
102 | self.optimizer.update_opt(**optimizer_args)
103 |
104 | self.name = name
105 | self.l_out = l_out
106 |
107 | self.normalize_inputs = normalize_inputs
108 | self.x_mean_var = x_mean_var
109 | self.x_std_var = x_std_var
110 |
111 | def predict_sym(self, xs):
112 | return L.get_output(self.l_out, xs)
113 |
114 | # def fit(self, xs, ys):
115 | # if self._normalize_inputs:
116 | # # recompute normalizing constants for inputs
117 | # new_mean = np.mean(xs, axis=0, keepdims=True)
118 | # new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
119 | # tf.get_default_session().run(tf.group(
120 | # tf.assign(self._x_mean_var, new_mean),
121 | # tf.assign(self._x_std_var, new_std),
122 | # ))
123 | # inputs = [xs, ys]
124 | # loss_before = self._optimizer.loss(inputs)
125 | # if self._name:
126 | # prefix = self._name + "_"
127 | # else:
128 | # prefix = ""
129 | # logger.record_tabular(prefix + 'LossBefore', loss_before)
130 | # self._optimizer.optimize(inputs)
131 | # loss_after = self._optimizer.loss(inputs)
132 | # logger.record_tabular(prefix + 'LossAfter', loss_after)
133 | # logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
134 |
135 | def predict(self, xs):
136 | return self.f_predict(np.asarray(xs))
137 |
138 | def get_param_values(self, **tags):
139 | return LayersPowered.get_param_values(self, **tags)
140 |
141 | def set_param_values(self, flattened_params, **tags):
142 | return LayersPowered.set_param_values(self, flattened_params, **tags)
143 |
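The regressor whitens inputs with the x_mean / x_std variables before feeding the MLP, and the commented-out fit() shows how those statistics would be refreshed from data. Below is a standalone NumPy sketch of that normalization step; the array shapes and values are illustrative.

import numpy as np

xs = np.random.randn(128, 4) * 5.0 + 2.0    # a batch of raw inputs

# Recompute the normalizing constants, as the commented-out fit() intends.
new_mean = np.mean(xs, axis=0, keepdims=True)
new_std = np.std(xs, axis=0, keepdims=True) + 1e-8

normalized_xs = (xs - new_mean) / new_std   # what the network actually sees
print(normalized_xs.mean(axis=0), normalized_xs.std(axis=0))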
--------------------------------------------------------------------------------
/sandbox/rocky/tf/samplers/__init__.py:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/samplers/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/samplers/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/samplers/__pycache__/base.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/samplers/__pycache__/base.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/samplers/__pycache__/batch_sampler.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/samplers/__pycache__/batch_sampler.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/samplers/__pycache__/vectorized_sampler.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/samplers/__pycache__/vectorized_sampler.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/samplers/batch_sampler.py:
--------------------------------------------------------------------------------
1 | from rllab.sampler.base import BaseSampler
2 | from rllab.sampler import parallel_sampler
3 | from rllab.sampler.stateful_pool import singleton_pool
4 | import tensorflow as tf
5 |
6 |
7 | def worker_init_tf(G):
8 | G.sess = tf.Session()
9 | G.sess.__enter__()
10 |
11 |
12 | def worker_init_tf_vars(G):
13 | G.sess.run(tf.global_variables_initializer())
14 |
15 |
16 | class BatchSampler(BaseSampler):
17 | def start_worker(self):
18 | if singleton_pool.n_parallel > 1:
19 | singleton_pool.run_each(worker_init_tf)
20 | parallel_sampler.populate_task(self.algo.env, self.algo.policy)
21 | if singleton_pool.n_parallel > 1:
22 | singleton_pool.run_each(worker_init_tf_vars)
23 |
24 | def shutdown_worker(self):
25 | parallel_sampler.terminate_task(scope=self.algo.scope)
26 |
27 | def obtain_samples(self, itr):
28 | cur_policy_params = self.algo.policy.get_param_values()
29 | cur_env_params = self.algo.env.get_param_values()
30 | paths = parallel_sampler.sample_paths(
31 | policy_params=cur_policy_params,
32 | env_params=cur_env_params,
33 | max_samples=self.algo.batch_size,
34 | max_path_length=self.algo.max_path_length,
35 | scope=self.algo.scope,
36 | )
37 | if self.algo.whole_paths:
38 | return paths
39 | else:
40 | paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size)
41 | return paths_truncated
42 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/samplers/vectorized_sampler.py:
--------------------------------------------------------------------------------
1 | import pickle
2 |
3 | import tensorflow as tf
4 | from rllab.sampler.base import BaseSampler
5 | #from base import BaseSampler
6 | from sandbox.rocky.tf.envs.parallel_vec_env_executor import ParallelVecEnvExecutor
7 | from sandbox.rocky.tf.envs.vec_env_executor import VecEnvExecutor
8 | from rllab.misc import tensor_utils
9 | import numpy as np
10 | from rllab.sampler.stateful_pool import ProgBarCounter
11 | import rllab.misc.logger as logger
12 | import itertools
13 |
14 |
15 | class VectorizedSampler(BaseSampler):
16 |
17 | def __init__(self, algo, n_envs=None):
18 | super(VectorizedSampler, self).__init__(algo)
19 | self.n_envs = n_envs
20 |
21 | def start_worker(self):
22 | n_envs = self.n_envs
23 | if n_envs is None:
24 | n_envs = int(self.algo.batch_size / self.algo.max_path_length)
25 | n_envs = max(1, min(n_envs, 100))
26 |
27 | if getattr(self.algo.env, 'vectorized', False):
28 | self.vec_env = self.algo.env.vec_env_executor(n_envs=n_envs, max_path_length=self.algo.max_path_length)
29 | else:
30 | envs = [pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs)]
31 | self.vec_env = VecEnvExecutor(
32 | envs=envs,
33 | max_path_length=self.algo.max_path_length
34 | )
35 | self.env_spec = self.algo.env.spec
36 |
37 | def shutdown_worker(self):
38 | self.vec_env.terminate()
39 |
40 | def obtain_samples(self, itr):
41 | logger.log("Obtaining samples for iteration %d..." % itr)
42 | paths = []
43 | n_samples = 0
44 | obses = self.vec_env.reset()
45 | dones = np.asarray([True] * self.vec_env.num_envs)
46 | running_paths = [None] * self.vec_env.num_envs
47 |
48 | pbar = ProgBarCounter(self.algo.batch_size)
49 | policy_time = 0
50 | env_time = 0
51 | process_time = 0
52 |
53 | policy = self.algo.policy
54 | import time
55 | while n_samples < self.algo.batch_size:
56 | t = time.time()
57 | policy.reset(dones)
58 | actions, agent_infos = policy.get_actions(obses)
59 |
60 | policy_time += time.time() - t
61 | t = time.time()
62 | next_obses, rewards, dones, env_infos = self.vec_env.step(actions)
63 | env_time += time.time() - t
64 |
65 | t = time.time()
66 |
67 | agent_infos = tensor_utils.split_tensor_dict_list(agent_infos)
68 | env_infos = tensor_utils.split_tensor_dict_list(env_infos)
69 | if env_infos is None:
70 | env_infos = [dict() for _ in range(self.vec_env.num_envs)]
71 | if agent_infos is None:
72 | agent_infos = [dict() for _ in range(self.vec_env.num_envs)]
73 | for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions,
74 | rewards, env_infos, agent_infos,
75 | dones):
76 | if running_paths[idx] is None:
77 | running_paths[idx] = dict(
78 | observations=[],
79 | actions=[],
80 | rewards=[],
81 | env_infos=[],
82 | agent_infos=[],
83 | )
84 | running_paths[idx]["observations"].append(observation)
85 | running_paths[idx]["actions"].append(action)
86 | running_paths[idx]["rewards"].append(reward)
87 | running_paths[idx]["env_infos"].append(env_info)
88 | running_paths[idx]["agent_infos"].append(agent_info)
89 | if done:
90 | paths.append(dict(
91 | observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]),
92 | actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]),
93 | rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]),
94 | env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]),
95 | agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]),
96 | ))
97 | n_samples += len(running_paths[idx]["rewards"])
98 | running_paths[idx] = None
99 | process_time += time.time() - t
100 | pbar.inc(len(obses))
101 | obses = next_obses
102 |
103 | pbar.stop()
104 |
105 | logger.record_tabular("PolicyExecTime", policy_time)
106 | logger.record_tabular("EnvExecTime", env_time)
107 | logger.record_tabular("ProcessExecTime", process_time)
108 |
109 | return paths
110 |
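The loop in obtain_samples keeps one running path per environment and moves it into paths once that environment signals done. The standalone sketch below reproduces that bookkeeping on a fake transition stream; all values are illustrative.

import numpy as np

num_envs = 2
running_paths = [None] * num_envs
paths = []

# A fake stream of (env index, reward, done) transitions.
transitions = [(0, 1.0, False), (1, 0.5, False), (0, 2.0, True), (1, 1.5, True)]

for idx, reward, done in transitions:
    if running_paths[idx] is None:
        running_paths[idx] = dict(rewards=[])
    running_paths[idx]["rewards"].append(reward)
    if done:
        paths.append(dict(rewards=np.asarray(running_paths[idx]["rewards"])))
        running_paths[idx] = None

print([p["rewards"] for p in paths])  # [array([1., 2.]), array([0.5, 1.5])]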
--------------------------------------------------------------------------------
/sandbox/rocky/tf/spaces/__init__.py:
--------------------------------------------------------------------------------
1 | from .product import Product
2 | from .discrete import Discrete
3 | from .box import Box
4 |
5 | __all__ = ["Product", "Discrete", "Box"]
6 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/spaces/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/spaces/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/spaces/__pycache__/box.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/spaces/__pycache__/box.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/spaces/__pycache__/discrete.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/spaces/__pycache__/discrete.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/spaces/__pycache__/product.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/spaces/__pycache__/product.cpython-35.pyc
--------------------------------------------------------------------------------
/sandbox/rocky/tf/spaces/box.py:
--------------------------------------------------------------------------------
1 | from rllab.spaces.box import Box as TheanoBox
2 | import tensorflow as tf
3 |
4 |
5 | class Box(TheanoBox):
6 | def new_tensor_variable(self, name, extra_dims, flatten=True):
7 | if flatten:
8 | return tf.placeholder(tf.float32, shape=[None] * extra_dims + [self.flat_dim], name=name)
9 | return tf.placeholder(tf.float32, shape=[None] * extra_dims + list(self.shape), name=name)
10 |
11 | @property
12 | def dtype(self):
13 | return tf.float32
14 |
--------------------------------------------------------------------------------
/sandbox/rocky/tf/spaces/discrete.py:
--------------------------------------------------------------------------------
1 | from rllab.spaces.base import Space
2 | import numpy as np
3 | from rllab.misc import special
4 | from rllab.misc import ext
5 | import tensorflow as tf
6 |
7 |
8 | class Discrete(Space):
9 | """
10 | {0,1,...,n-1}
11 | """
12 |
13 | def __init__(self, n):
14 | self._n = n
15 |
16 | @property
17 | def n(self):
18 | return self._n
19 |
20 | def sample(self):
21 | return np.random.randint(self.n)
22 |
23 | def sample_n(self, n):
24 | return np.random.randint(low=0, high=self.n, size=n)
25 |
26 | def contains(self, x):
27 | x = np.asarray(x)
28 | return x.shape == () and x.dtype.kind == 'i' and x >= 0 and x < self.n
29 |
30 | def __repr__(self):
31 | return "Discrete(%d)" % self.n
32 |
33 |     def __eq__(self, other):
34 |         return isinstance(other, Discrete) and self.n == other.n
35 |
36 | def flatten(self, x):
37 | return special.to_onehot(x, self.n)
38 |
39 | def unflatten(self, x):
40 | return special.from_onehot(x)
41 |
42 | def flatten_n(self, x):
43 | return special.to_onehot_n(x, self.n)
44 |
45 | def unflatten_n(self, x):
46 | return special.from_onehot_n(x)
47 |
48 | @property
49 | def default_value(self):
50 | return 0
51 |
52 | @property
53 | def flat_dim(self):
54 | return self.n
55 |
56 | def weighted_sample(self, weights):
57 | return special.weighted_sample(weights, range(self.n))
58 |
59 | def new_tensor_variable(self, name, extra_dims):
60 | # needed for safe conversion to float32
61 | return tf.placeholder(dtype=tf.uint8, shape=[None] * extra_dims + [self.flat_dim], name=name)
62 |
63 | @property
64 | def dtype(self):
65 | return tf.uint8
66 |
67 |     def __hash__(self):
68 |         return hash(self.n)
69 |
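flatten/unflatten map integer actions to and from one-hot vectors (via rllab.misc.special), and weighted_sample draws an index according to a probability vector. A self-contained NumPy sketch of the same operations; the helpers are illustrative, not the rllab implementations.

import numpy as np

n = 4

def to_onehot(x, n):
    out = np.zeros(n)
    out[x] = 1.0
    return out

def from_onehot(v):
    return int(np.argmax(v))

def weighted_sample(weights, rng=np.random):
    return rng.choice(len(weights), p=weights)

a = 2
assert from_onehot(to_onehot(a, n)) == a
print(weighted_sample(np.array([0.1, 0.2, 0.3, 0.4])))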
--------------------------------------------------------------------------------
/sandbox/rocky/tf/spaces/product.py:
--------------------------------------------------------------------------------
1 | from rllab.spaces.base import Space
2 | import tensorflow as tf
3 | import numpy as np
4 |
5 |
6 | class Product(Space):
7 | def __init__(self, *components):
8 | if isinstance(components[0], (list, tuple)):
9 | assert len(components) == 1
10 | components = components[0]
11 | self._components = tuple(components)
12 | dtypes = [c.dtype for c in components]
13 | if len(dtypes) > 0 and hasattr(dtypes[0], "as_numpy_dtype"):
14 | dtypes = [d.as_numpy_dtype for d in dtypes]
15 | self._common_dtype = np.core.numerictypes.find_common_type([], dtypes)
16 |
17 | def sample(self):
18 | return tuple(x.sample() for x in self._components)
19 |
20 | @property
21 | def components(self):
22 | return self._components
23 |
24 | def contains(self, x):
25 | return isinstance(x, tuple) and all(c.contains(xi) for c, xi in zip(self._components, x))
26 |
27 | def new_tensor_variable(self, name, extra_dims):
28 | return tf.placeholder(
29 | dtype=self._common_dtype,
30 | shape=[None] * extra_dims + [self.flat_dim],
31 | name=name,
32 | )
33 |
34 | @property
35 | def dtype(self):
36 | return self._common_dtype
37 |
38 | @property
39 | def flat_dim(self):
40 | return int(np.sum([c.flat_dim for c in self._components]))
41 |
42 | def flatten(self, x):
43 | return np.concatenate([c.flatten(xi) for c, xi in zip(self._components, x)])
44 |
45 | def flatten_n(self, xs):
46 | xs_regrouped = [[x[i] for x in xs] for i in range(len(xs[0]))]
47 | flat_regrouped = [c.flatten_n(xi) for c, xi in zip(self.components, xs_regrouped)]
48 | return np.concatenate(flat_regrouped, axis=-1)
49 |
50 | def unflatten(self, x):
51 | dims = [c.flat_dim for c in self._components]
52 | flat_xs = np.split(x, np.cumsum(dims)[:-1])
53 | return tuple(c.unflatten(xi) for c, xi in zip(self._components, flat_xs))
54 |
55 | def unflatten_n(self, xs):
56 | dims = [c.flat_dim for c in self._components]
57 | flat_xs = np.split(xs, np.cumsum(dims)[:-1], axis=-1)
58 | unflat_xs = [c.unflatten_n(xi) for c, xi in zip(self.components, flat_xs)]
59 | unflat_xs_grouped = list(zip(*unflat_xs))
60 | return unflat_xs_grouped
61 |
62 | def __eq__(self, other):
63 | if not isinstance(other, Product):
64 | return False
65 | return tuple(self.components) == tuple(other.components)
66 |
67 | def __hash__(self):
68 | return hash(tuple(self.components))
69 |
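Product.flatten concatenates the flattened components and unflatten splits the flat vector back at the cumulative component dimensions. The NumPy sketch below walks through that split logic with two toy components; the dimensions are chosen only for illustration.

import numpy as np

# Two toy components: a 3-dim box vector and a Discrete(4) action as one-hot.
dims = [3, 4]
box_part = np.array([0.1, -0.2, 0.3])
onehot_part = np.array([0.0, 0.0, 1.0, 0.0])

flat = np.concatenate([box_part, onehot_part])   # Product.flatten
splits = np.cumsum(dims)[:-1]                    # -> [3]
recovered = np.split(flat, splits)               # Product.unflatten
print([r.tolist() for r in recovered])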
--------------------------------------------------------------------------------
/sim_cpolicy.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import time
3 |
4 | import joblib
5 | import numpy as np
6 | import tensorflow as tf
7 |
8 | from rllab.envs.normalized_env import normalize
9 | from rllab.misc import tensor_utils
10 | from sac.envs import CrossMazeAntEnv, RandomGoalAntEnv, HalfCheetahHurdleEnv
11 | from sac.misc import tf_utils
12 |
13 |
14 | def rollout(env, policy, sub_level_policies, path_length=1000, render=True, speedup=10, g=2):
15 | observation = env.reset()
16 | policy.reset()
17 |
18 | t = 0
19 | obs = observation
20 | for t in range(path_length):
21 |
22 |
23 | sub_level_actions=[]
24 | if g!=0:
25 | obs=observation[:-g]
26 | else:
27 | obs=observation
28 | for i in range(0,len(sub_level_policies)):
29 | action, _ = sub_level_policies[i].get_action(obs)
30 | sub_level_actions.append(action.reshape(1,-1))
31 | sub_level_actions=np.stack(sub_level_actions,axis=0)
32 | sub_level_actions=np.transpose(sub_level_actions,(1,0,2))
33 |
34 | action, agent_info = policy.get_action(observation,sub_level_actions)
35 | next_obs, reward, terminal, env_info = env.step(action)
36 |
37 |
38 | observation = next_obs
39 |
40 | if render:
41 | env.render()
42 | time_step = 0.05
43 | time.sleep(time_step / speedup)
44 |
45 | if terminal:
46 | break
47 |
48 |
49 | return 0
50 |
51 |
52 | def parse_args():
53 | parser = argparse.ArgumentParser()
54 | parser.add_argument('file', type=str, help='Path to the snapshot file.')
55 | parser.add_argument('--max-path-length', '-l', type=int, default=1000)
56 | parser.add_argument('--speedup', '-s', type=float, default=10)
57 | parser.add_argument('--domain',type=str,default='ant-cross-maze')
58 | parser.add_argument('--deterministic', '-d', dest='deterministic',
59 | action='store_true')
60 | parser.add_argument('--no-deterministic', '-nd', dest='deterministic',
61 | action='store_false')
62 | parser.add_argument('--policy_h', type=int)
63 | parser.set_defaults(deterministic=True)
64 |
65 | args = parser.parse_args()
66 |
67 | return args
68 |
69 | def load_low_level_policy(policy_path=None,name=None):
70 | with tf_utils.get_default_session().as_default():
71 | with tf.variable_scope(name, reuse=False):
72 | snapshot = joblib.load(policy_path)
73 |
74 | policy = snapshot["policy"]
75 | return policy
76 |
77 |
78 | def simulate_policy_ant(args):
79 | sub_level_policies=[]
80 | with tf.Session() as sess:
81 | with tf.variable_scope("fwrd", reuse=False):
82 | fwrd = joblib.load("primitive-policies/ant/fwrd/fwrd.pkl")
83 | with tf.variable_scope("bwrd", reuse=False):
84 | bwrd = joblib.load("primitive-policies/ant/bwrd/bwrd.pkl")
85 | with tf.variable_scope("uwrd", reuse=False):
86 | uwrd = joblib.load("primitive-policies/ant/uwrd/uwrd.pkl")
87 | with tf.variable_scope("dwrd", reuse=False):
88 | dwrd = joblib.load("primitive-policies/ant/dwrd/dwrd.pkl")
89 | sub_level_policies.append(fwrd["policy"])
90 | sub_level_policies.append(bwrd["policy"])
91 | sub_level_policies.append(uwrd["policy"])
92 | sub_level_policies.append(dwrd["policy"])
93 | data = joblib.load(args.file)
94 | if 'algo' in data.keys():
95 | policy = data['algo'].policy
96 | env = data['algo'].env
97 | else:
98 | policy = data['policy']
99 | env = data['env']
100 | with policy.deterministic(args.deterministic):
101 | while True:
102 | path = rollout(env, policy,sub_level_policies,path_length=args.max_path_length,g=2)
103 |
104 | def simulate_policy_pusher(args):
105 | sub_level_policies=[]
106 | with tf.Session() as sess:
107 | with tf.variable_scope("bottom", reuse=False):
108 | btm = joblib.load("primitive-policies/pusher/bottom/bottom.pkl")
109 | with tf.variable_scope("jump", reuse=False):
110 | lft = joblib.load("primitive-policies/pusher/left/left.pkl")
111 | sub_level_policies.append(btm["policy"])
112 | sub_level_policies.append(lft["policy"])
113 | data = joblib.load(args.file)
114 | if 'algo' in data.keys():
115 | policy = data['algo'].policy
116 | env = data['algo'].env
117 | else:
118 | policy = data['policy']
119 |             env = data['env']
120 | with policy.deterministic(args.deterministic):
121 | while True:
122 | path = rollout(env, policy,sub_level_policies,path_length=args.max_path_length,g=0)
123 |
124 | def simulate_policy_hch(args):
125 | sub_level_policies=[]
126 | with tf.Session() as sess:
127 | with tf.variable_scope("fwrd", reuse=False):
128 | fwrd = joblib.load("primitive-policies/hc/fwd/fwd.pkl")
129 | with tf.variable_scope("jump", reuse=False):
130 | jmp = joblib.load("primitive-policies/hc/jp-longz/jump.pkl")
131 | sub_level_policies.append(fwrd["policy"])
132 | sub_level_policies.append(jmp["policy"])
133 | data = joblib.load(args.file)
134 | if 'algo' in data.keys():
135 | policy = data['algo'].policy
136 | env = data['algo'].env
137 | else:
138 | policy = data['policy']
139 | env = normalize(HalfCheetahHurdleEnv()) #data['env']
140 | with policy.deterministic(args.deterministic):
141 | while True:
142 | path = rollout(env, policy,sub_level_policies,path_length=args.max_path_length, g=2)
143 |
144 | if __name__ == "__main__":
145 | args = parse_args()
146 | if args.domain=='ant-cross-maze' or args.domain=='ant-random-goal':
147 | simulate_policy_ant(args)
148 | if args.domain=='cheetah-hurdle':
149 | simulate_policy_hch(args)
150 | if args.domain=='pusher':
151 | simulate_policy_pusher(args)
152 |
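Inside rollout, each primitive policy contributes one action for the (possibly truncated) observation; the actions are then stacked and transposed so the composite policy receives a (1, num_primitives, action_dim) array. A NumPy sketch of that reshaping follows; the action dimension and number of primitives are assumptions.

import numpy as np

action_dim = 8                      # e.g. an ant torque dimension (assumed)
primitive_actions = [np.random.uniform(-1, 1, action_dim) for _ in range(4)]

stacked = [a.reshape(1, -1) for a in primitive_actions]          # each (1, action_dim)
sub_level_actions = np.stack(stacked, axis=0)                    # (4, 1, action_dim)
sub_level_actions = np.transpose(sub_level_actions, (1, 0, 2))   # (1, 4, action_dim)
print(sub_level_actions.shape)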
--------------------------------------------------------------------------------
/sim_policy.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import joblib
4 | import tensorflow as tf
5 |
6 | from rllab.sampler.utils import rollout
7 |
8 | def parse_args():
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('file', type=str, help='Path to the snapshot file.')
11 | parser.add_argument('--max-path-length', '-l', type=int, default=1000)
12 | parser.add_argument('--speedup', '-s', type=float, default=10)
13 | parser.add_argument('--deterministic', '-d', dest='deterministic',
14 | action='store_true')
15 | parser.add_argument('--no-deterministic', '-nd', dest='deterministic',
16 | action='store_false')
17 | parser.add_argument('--policy_h', type=int)
18 | parser.set_defaults(deterministic=True)
19 |
20 | args = parser.parse_args()
21 |
22 | return args
23 |
24 | def simulate_policy(args):
25 | with tf.Session() as sess:
26 | data = joblib.load(args.file)
27 | print(data.keys())
28 | if 'algo' in data.keys():
29 | policy = data['algo'].policy
30 | env = data['algo'].env
31 | else:
32 | policy = data['policy']
33 | env = data['env']
34 | print(policy)
35 | with policy.deterministic(args.deterministic):
36 | while True:
37 | path = rollout(env, policy,
38 | max_path_length=args.max_path_length,
39 | animated=True, speedup=args.speedup)
40 | if __name__ == "__main__":
41 | args = parse_args()
42 | simulate_policy(args)
43 |
--------------------------------------------------------------------------------
/value_functions/__init__.py:
--------------------------------------------------------------------------------
1 | from .value_function import NNVFunction, NNQFunction, NNDiscriminatorFunction
2 |
--------------------------------------------------------------------------------
/value_functions/__pycache__/__init__.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/value_functions/__pycache__/__init__.cpython-35.pyc
--------------------------------------------------------------------------------
/value_functions/__pycache__/value_function.cpython-35.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/value_functions/__pycache__/value_function.cpython-35.pyc
--------------------------------------------------------------------------------
/value_functions/value_function.py:
--------------------------------------------------------------------------------
1 | import tensorflow as tf
2 |
3 | from rllab.core.serializable import Serializable
4 | from sandbox.rocky.tf.core.parameterized import Parameterized
5 | from sac.misc.mlp import MLPFunction
6 | from sac.misc import tf_utils
7 |
8 | class NNVFunction(MLPFunction):
9 |
10 | def __init__(self, env_spec, hidden_layer_sizes=(100, 100), name='vf'):
11 | Serializable.quick_init(self, locals())
12 |
13 | self._Do = env_spec.observation_space.flat_dim
14 | self._obs_pl = tf.placeholder(
15 | tf.float32,
16 | shape=[None, self._Do],
17 | name='observation',
18 | )
19 |
20 | super(NNVFunction, self).__init__(
21 | name, (self._obs_pl,), hidden_layer_sizes)
22 |
23 |
24 | class NNQFunction(MLPFunction):
25 | def __init__(self, env_spec, hidden_layer_sizes=(100, 100), name='qf'):
26 | Serializable.quick_init(self, locals())
27 |
28 | self._Da = env_spec.action_space.flat_dim
29 | self._Do = env_spec.observation_space.flat_dim
30 |
31 | self._obs_pl = tf.placeholder(
32 | tf.float32,
33 | shape=[None, self._Do],
34 | name='observation',
35 | )
36 |
37 | self._action_pl = tf.placeholder(
38 | tf.float32,
39 | shape=[None, self._Da],
40 | name='actions',
41 | )
42 |
43 | super(NNQFunction, self).__init__(
44 | name, (self._obs_pl, self._action_pl), hidden_layer_sizes)
45 |
46 |
47 | class NNDiscriminatorFunction(MLPFunction):
48 | def __init__(self, env_spec, hidden_layer_sizes=(100, 100), num_skills=None):
49 | assert num_skills is not None
50 | Serializable.quick_init(self, locals())
51 | Parameterized.__init__(self)
52 |
53 | self._Da = env_spec.action_space.flat_dim
54 | self._Do = env_spec.observation_space.flat_dim
55 |
56 | self._obs_pl = tf.placeholder(
57 | tf.float32,
58 | shape=[None, self._Do],
59 | name='observation',
60 | )
61 | self._action_pl = tf.placeholder(
62 | tf.float32,
63 | shape=[None, self._Da],
64 | name='actions',
65 | )
66 |
67 | self._name = 'discriminator'
68 | self._input_pls = (self._obs_pl, self._action_pl)
69 | self._layer_sizes = list(hidden_layer_sizes) + [num_skills]
70 | self._output_t = self.get_output_for(*self._input_pls)
71 |
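For context, NNVFunction and NNQFunction mainly differ in which placeholders they hand to MLPFunction. The TF1-style sketch below sets up the same observation/action placeholders and the concatenation an MLP critic would consume; the dimensions are assumptions, and the concatenation merely stands in for whatever sac.misc.mlp actually builds.

import tensorflow as tf

obs_dim, action_dim = 27, 8    # assumed flat dims

obs_pl = tf.placeholder(tf.float32, shape=[None, obs_dim], name='observation')
action_pl = tf.placeholder(tf.float32, shape=[None, action_dim], name='actions')

# A Q-network would map this concatenation to a single value per sample.
qf_input = tf.concat([obs_pl, action_pl], axis=1)
print(qf_input.get_shape())    # (?, 35)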
--------------------------------------------------------------------------------