├── LICENSE ├── README.md ├── algos ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── base.cpython-35.pyc │ ├── diayn.cpython-35.pyc │ └── sac.cpython-35.pyc ├── base.py └── sac.py ├── core ├── __pycache__ │ ├── __init__.cpython-35.pyc │ └── serializable.cpython-35.pyc └── serializable.py ├── distributions ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── gmm.cpython-35.pyc │ ├── normal.cpython-35.pyc │ └── real_nvp_bijector.cpython-35.pyc ├── gmm.py ├── normal.py └── real_nvp_bijector.py ├── environments ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── delayed_env.cpython-35.pyc │ ├── gym_env.cpython-35.pyc │ ├── multigoal.cpython-35.pyc │ └── pusher.cpython-35.pyc ├── delayed_env.py ├── gym_env.py ├── multigoal.py └── pusher.py ├── envs ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── cheetah_hurdle_env.cpython-35.pyc │ ├── cross_maze_ant_env.cpython-35.pyc │ ├── gym_env.cpython-35.pyc │ ├── helpers.cpython-35.pyc │ ├── hierarchy_proxy_env.cpython-35.pyc │ ├── multi_direction_env.cpython-35.pyc │ ├── multigoal.cpython-35.pyc │ ├── pusher.cpython-35.pyc │ ├── random_goal_ant_env.cpython-35.pyc │ └── simple_maze_ant_env.cpython-35.pyc ├── cheetah_hurdle_env.py ├── cross_maze_ant_env.py ├── delayed_env.py ├── gym_env.py ├── helpers.py ├── hierarchy_proxy_env.py ├── meta_env.py ├── multi_direction_env.py ├── multigoal.py ├── pusher.py ├── random_goal_ant_env.py └── simple_maze_ant_env.py ├── misc ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── instrument.cpython-35.pyc │ ├── mlp.cpython-35.pyc │ ├── plotter.cpython-35.pyc │ ├── sampler.cpython-35.pyc │ ├── tf_utils.cpython-35.pyc │ └── utils.cpython-35.pyc ├── instrument.py ├── mlp.py ├── plotter.py ├── remote_sampler.py ├── replay_pool.py ├── sampler.py ├── tf_utils.py └── utils.py ├── mujoco_am_sac.py ├── mujoco_models ├── cross_maze_ant.xml ├── half_cheetah_hurdle.xml ├── pusher_2d.xml └── simple_maze_ant.xml ├── policies ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── base.cpython-35.pyc │ ├── gaussian_policy.cpython-35.pyc │ ├── gmm.cpython-35.pyc │ ├── hierarchical_policy.cpython-35.pyc │ ├── latent_space_policy.cpython-35.pyc │ ├── nn_policy.cpython-35.pyc │ ├── nn_policy2.cpython-35.pyc │ ├── pointer_policy.cpython-35.pyc │ └── uniform_policy.cpython-35.pyc ├── base.py ├── gaussian_policy.py ├── nn_policy.py ├── nn_policy2.py ├── pointer_policy.py └── uniform_policy.py ├── preprocessors ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ └── mlp_preprocessor.cpython-35.pyc └── mlp_preprocessor.py ├── primitive-policies ├── ant │ ├── bwrd │ │ └── bwrd.pkl │ ├── dwrd │ │ └── dwrd.pkl │ ├── fwrd │ │ └── fwrd.pkl │ └── uwrd │ │ └── uwrd.pkl ├── hc │ ├── fwd │ │ └── fwd.pkl │ └── jp-longz │ │ └── jump.pkl └── pusher │ ├── bottom │ └── bottom.pkl │ └── left │ └── left.pkl ├── replay_buffers ├── __init__.py ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── replay_buffer.cpython-35.pyc │ └── simple_replay_buffer.cpython-35.pyc ├── replay_buffer.py └── simple_replay_buffer.py ├── sandbox ├── __pycache__ │ └── __init__.cpython-35.pyc └── rocky │ ├── __pycache__ │ └── __init__.cpython-35.pyc │ └── tf │ ├── __pycache__ │ └── __init__.cpython-35.pyc │ ├── algos │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── batch_polopt.cpython-35.pyc │ │ ├── npo.cpython-35.pyc │ │ └── trpo.cpython-35.pyc │ ├── batch_polopt.py │ ├── npg.py │ ├── npo.py │ ├── trpo.py │ └── vpg.py │ ├── core │ ├── __init__.py │ ├── __pycache__ 
│ │ ├── __init__.cpython-35.pyc │ │ ├── layers.cpython-35.pyc │ │ ├── layers_powered.cpython-35.pyc │ │ ├── network.cpython-35.pyc │ │ └── parameterized.cpython-35.pyc │ ├── layers.py │ ├── layers_powered.py │ ├── network.py │ └── parameterized.py │ ├── distributions │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── base.cpython-35.pyc │ │ └── diagonal_gaussian.cpython-35.pyc │ ├── base.py │ ├── bernoulli.py │ ├── categorical.py │ ├── diagonal_gaussian.py │ ├── recurrent_categorical.py │ └── recurrent_diagonal_gaussian.py │ ├── envs │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── base.cpython-35.pyc │ │ ├── parallel_vec_env_executor.cpython-35.pyc │ │ └── vec_env_executor.cpython-35.pyc │ ├── base.py │ ├── parallel_vec_env_executor.py │ └── vec_env_executor.py │ ├── launchers │ ├── __init__.py │ ├── trpo_cartpole.py │ ├── trpo_cartpole_recurrent.py │ └── vpg_cartpole.py │ ├── misc │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ └── tensor_utils.cpython-35.pyc │ └── tensor_utils.py │ ├── optimizers │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── conjugate_gradient_optimizer.cpython-35.pyc │ │ └── penalty_lbfgs_optimizer.cpython-35.pyc │ ├── conjugate_gradient_optimizer.py │ ├── first_order_optimizer.py │ ├── lbfgs_optimizer.py │ └── penalty_lbfgs_optimizer.py │ ├── policies │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── base.cpython-35.pyc │ │ ├── gaussian_mlp_inverse_policy.cpython-35.pyc │ │ └── gaussian_mlp_policy.cpython-35.pyc │ ├── base.py │ ├── categorical_conv_policy.py │ ├── categorical_gru_policy.py │ ├── categorical_lstm_policy.py │ ├── categorical_mlp_policy.py │ ├── deterministic_mlp_policy.py │ ├── gaussian_gru_policy.py │ ├── gaussian_lstm_policy.py │ ├── gaussian_mlp_inverse_policy.py │ ├── gaussian_mlp_policy.py │ └── uniform_control_policy.py │ ├── q_functions │ ├── base.py │ └── continuous_mlp_q_function.py │ ├── regressors │ ├── __init__.py │ ├── bernoulli_mlp_regressor.py │ ├── categorical_mlp_regressor.py │ ├── deterministic_mlp_regressor.py │ └── gaussian_mlp_regressor.py │ ├── samplers │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-35.pyc │ │ ├── base.cpython-35.pyc │ │ ├── batch_sampler.cpython-35.pyc │ │ └── vectorized_sampler.cpython-35.pyc │ ├── batch_sampler.py │ └── vectorized_sampler.py │ └── spaces │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-35.pyc │ ├── box.cpython-35.pyc │ ├── discrete.cpython-35.pyc │ └── product.cpython-35.pyc │ ├── box.py │ ├── discrete.py │ └── product.py ├── sim_cpolicy.py ├── sim_policy.py └── value_functions ├── __init__.py ├── __pycache__ ├── __init__.cpython-35.pyc └── value_function.cpython-35.pyc └── value_function.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Ahmed Qureshi 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # COMPOSING TASK-AGNOSTIC POLICIES WITH DEEP REINFORCEMENT LEARNING 2 | 3 | 4 | * Requirements: 5 | 1. Rllab 6 | 2. Tensorflow 7 | 3. mujoco 8 | 9 | 10 | ## To train composite model from scratch, run: 11 | 12 | 1. To simulate "ant-cross-maze", run: 13 | 14 | ```python mujoco_am_sac.py --log_dir="/path-to-crl-code-folder/composition_sac_code/ant-maze" --domain="ant-cross-maze"``` 15 | 16 | 2. To simulate "ant-random-goal", run: 17 | 18 | ```python mujoco_am_sac.py --log_dir="/path-to-crl-code-folder/composition_sac_code/ant-rgoal" --domain="ant-random-goal"``` 19 | 20 | 3. To simulate "cheetah-hurdle", run: 21 | 22 | ```python mujoco_am_sac.py --log_dir="/path-to-crl-code-folder/composition_sac_code/cheetah-hurdle" --domain="cheetah-hurdle"``` 23 | 24 | 4. To simulate "pusher", run: 25 | 26 | ```python mujoco_am_sac.py --log_dir="/path-to-crl-code-folder/composition_sac_code/pusher" --domain="pusher"``` 27 | 28 | 29 | 30 | 31 | 32 | ## References 33 | ``` 34 | @inproceedings{ 35 | qureshi2020composing, 36 | title={Composing Task-Agnostic Policies with Deep Reinforcement Learning}, 37 | author={Ahmed H. Qureshi and Jacob J. Johnson and Yuzhe Qin and Taylor Henderson and Byron Boots and Michael C. 
Yip}, 38 | booktitle={International Conference on Learning Representations}, 39 | year={2020}, 40 | url={https://openreview.net/forum?id=H1ezFREtwH} 41 | } 42 | ``` 43 | -------------------------------------------------------------------------------- /algos/__init__.py: -------------------------------------------------------------------------------- 1 | from .sac import SAC 2 | -------------------------------------------------------------------------------- /algos/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/algos/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /algos/__pycache__/base.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/algos/__pycache__/base.cpython-35.pyc -------------------------------------------------------------------------------- /algos/__pycache__/diayn.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/algos/__pycache__/diayn.cpython-35.pyc -------------------------------------------------------------------------------- /algos/__pycache__/sac.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/algos/__pycache__/sac.cpython-35.pyc -------------------------------------------------------------------------------- /core/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/core/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /core/__pycache__/serializable.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/core/__pycache__/serializable.cpython-35.pyc -------------------------------------------------------------------------------- /core/serializable.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | 3 | 4 | def deep_clone(obj): 5 | assert isinstance(obj, Serializable) 6 | 7 | def maybe_deep_clone(o): 8 | if isinstance(o, Serializable): 9 | return deep_clone(o) 10 | else: 11 | return o 12 | 13 | d = obj.__getstate__() 14 | for key, val in d.items(): 15 | d[key] = maybe_deep_clone(val) 16 | 17 | d['__args'] = list(d['__args']) # Make args mutable. 
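# Positional constructor args are cloned element-wise below, so nested
# Serializable objects are deep-copied rather than shared with the original.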
18 | for i, val in enumerate(d['__args']): 19 | d['__args'][i] = maybe_deep_clone(val) 20 | 21 | for key, val in d['__kwargs'].items(): 22 | d['__kwargs'][key] = maybe_deep_clone(val) 23 | 24 | out = type(obj).__new__(type(obj)) 25 | # noinspection PyArgumentList 26 | out.__setstate__(d) 27 | 28 | return out 29 | -------------------------------------------------------------------------------- /distributions/__init__.py: -------------------------------------------------------------------------------- 1 | from .normal import Normal 2 | from .gmm import GMM 3 | from .real_nvp_bijector import RealNVPBijector 4 | -------------------------------------------------------------------------------- /distributions/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/distributions/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /distributions/__pycache__/gmm.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/distributions/__pycache__/gmm.cpython-35.pyc -------------------------------------------------------------------------------- /distributions/__pycache__/normal.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/distributions/__pycache__/normal.cpython-35.pyc -------------------------------------------------------------------------------- /distributions/__pycache__/real_nvp_bijector.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/distributions/__pycache__/real_nvp_bijector.cpython-35.pyc -------------------------------------------------------------------------------- /distributions/gmm.py: -------------------------------------------------------------------------------- 1 | """ Gaussian mixture model. """ 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | from sac.misc.mlp import mlp 7 | 8 | LOG_SIG_CAP_MAX = 2 9 | LOG_SIG_CAP_MIN = -20 10 | 11 | 12 | class GMM(object): 13 | def __init__( 14 | self, 15 | K, 16 | Dx, 17 | hidden_layers_sizes=(100, 100), 18 | reg=0.001, 19 | reparameterize=True, 20 | cond_t_lst=(), 21 | ): 22 | self._cond_t_lst = cond_t_lst 23 | self._reg = reg 24 | self._layer_sizes = list(hidden_layers_sizes) + [K * (2 * Dx + 1)] 25 | self._reparameterize = reparameterize 26 | 27 | self._Dx = Dx 28 | self._K = K 29 | 30 | self._create_placeholders() 31 | self._create_graph() 32 | 33 | def _create_placeholders(self): 34 | self._N_pl = tf.placeholder( 35 | tf.int32, 36 | shape=(), 37 | name='N', 38 | ) 39 | 40 | @staticmethod 41 | def _create_log_gaussian(mu_t, log_sig_t, t): 42 | normalized_dist_t = (t - mu_t) * tf.exp(-log_sig_t) # ... x D 43 | quadratic = - 0.5 * tf.reduce_sum(normalized_dist_t ** 2, axis=-1) 44 | # ... x (None) 45 | 46 | log_z = tf.reduce_sum(log_sig_t, axis=-1) # ... 
x (None) 47 | D_t = tf.cast(tf.shape(mu_t)[-1], tf.float32) 48 | log_z += 0.5 * D_t * np.log(2 * np.pi) 49 | 50 | log_p = quadratic - log_z 51 | 52 | return log_p # ... x (None) 53 | 54 | def _create_p_xz_params(self): 55 | K = self._K 56 | Dx = self._Dx 57 | 58 | if len(self._cond_t_lst) == 0: 59 | w_and_mu_and_logsig_t = tf.get_variable( 60 | 'params', self._layer_sizes[-1], 61 | initializer=tf.random_normal_initializer(0, 0.1) 62 | ) 63 | 64 | else: 65 | w_and_mu_and_logsig_t = mlp( 66 | inputs=self._cond_t_lst, 67 | layer_sizes=self._layer_sizes, 68 | output_nonlinearity=None, 69 | ) # ... x K*Dx*2+K 70 | 71 | w_and_mu_and_logsig_t = tf.reshape( 72 | w_and_mu_and_logsig_t, shape=(-1, K, 2*Dx+1)) 73 | 74 | log_w_t = w_and_mu_and_logsig_t[..., 0] 75 | mu_t = w_and_mu_and_logsig_t[..., 1:1+Dx] 76 | log_sig_t = w_and_mu_and_logsig_t[..., 1+Dx:] 77 | 78 | log_sig_t = tf.clip_by_value(log_sig_t, LOG_SIG_CAP_MIN, LOG_SIG_CAP_MAX) 79 | 80 | return log_w_t, mu_t, log_sig_t 81 | 82 | def _create_graph(self): 83 | Dx = self._Dx 84 | 85 | if len(self._cond_t_lst) > 0: 86 | N_t = tf.shape(self._cond_t_lst[0])[0] 87 | else: 88 | N_t = self._N_pl 89 | 90 | K = self._K 91 | 92 | # Create p(x|z). 93 | with tf.variable_scope('p'): 94 | log_ws_t, xz_mus_t, xz_log_sigs_t = self._create_p_xz_params() 95 | # (N x K), (N x K x Dx), (N x K x Dx) 96 | xz_sigs_t = tf.exp(xz_log_sigs_t) 97 | 98 | # Sample the latent code. 99 | z_t = tf.multinomial(logits=log_ws_t, num_samples=1) # N x 1 100 | 101 | # Choose mixture component corresponding to the latent. 102 | mask_t = tf.one_hot( 103 | z_t[:, 0], depth=K, dtype=tf.bool, 104 | on_value=True, off_value=False 105 | ) 106 | xz_mu_t = tf.boolean_mask(xz_mus_t, mask_t) # N x Dx 107 | xz_sig_t = tf.boolean_mask(xz_sigs_t, mask_t) # N x Dx 108 | 109 | # Sample x. 
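# Reparameterized draw from the selected component: x = mu_k + sigma_k * eps,
# eps ~ N(0, I); tf.stop_gradient below blocks gradients through the sample
# when reparameterization is disabled.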
110 | x_t = xz_mu_t + xz_sig_t * tf.random_normal((N_t, Dx)) # N x Dx 111 | if not self._reparameterize: 112 | x_t = tf.stop_gradient(x_t) 113 | 114 | # log p(x|z) 115 | log_p_xz_t = self._create_log_gaussian( 116 | xz_mus_t, xz_log_sigs_t, x_t[:, None, :] 117 | ) # N x K 118 | 119 | # log p(x) 120 | log_p_x_t = tf.reduce_logsumexp(log_p_xz_t + log_ws_t, axis=1) 121 | log_p_x_t -= tf.reduce_logsumexp(log_ws_t, axis=1) # N 122 | 123 | reg_loss_t = 0 124 | reg_loss_t += self._reg * 0.5 * tf.reduce_mean(xz_log_sigs_t ** 2) 125 | reg_loss_t += self._reg * 0.5 * tf.reduce_mean(xz_mus_t ** 2) 126 | 127 | self._log_p_x_t = log_p_x_t 128 | self._reg_loss_t = reg_loss_t 129 | self._x_t = x_t 130 | 131 | self._log_ws_t = log_ws_t 132 | self._mus_t = xz_mus_t 133 | self._log_sigs_t = xz_log_sigs_t 134 | 135 | @property 136 | def log_p_t(self): 137 | return self._log_p_x_t 138 | 139 | @property 140 | def reg_loss_t(self): 141 | return self._reg_loss_t 142 | 143 | @property 144 | def x_t(self): 145 | return self._x_t 146 | 147 | @property 148 | def mus_t(self): 149 | return self._mus_t 150 | 151 | @property 152 | def log_sigs_t(self): 153 | return self._log_sigs_t 154 | 155 | @property 156 | def log_ws_t(self): 157 | return self._log_ws_t 158 | 159 | @property 160 | def N_t(self): 161 | return self._N_pl 162 | -------------------------------------------------------------------------------- /distributions/normal.py: -------------------------------------------------------------------------------- 1 | """ Multivariate normal distribution with mean and std deviation outputted by a neural net """ 2 | 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | from sac.misc.mlp import mlp 7 | 8 | LOG_SIG_CAP_MAX = 2 9 | LOG_SIG_CAP_MIN = -20 10 | 11 | 12 | class Normal(object): 13 | def __init__( 14 | self, 15 | Dx, 16 | hidden_layers_sizes=(100, 100), 17 | reg=0.001, 18 | reparameterize=True, 19 | cond_t_lst=(), 20 | ): 21 | self._cond_t_lst = cond_t_lst 22 | self._reg = reg 23 | self._layer_sizes = list(hidden_layers_sizes) + [2 * Dx] 24 | print(self._layer_sizes) 25 | self._reparameterize = reparameterize 26 | 27 | self._Dx = Dx 28 | 29 | self._create_placeholders() 30 | self._create_graph() 31 | 32 | def _create_placeholders(self): 33 | self._N_pl = tf.placeholder( 34 | tf.int32, 35 | shape=(), 36 | name='N', 37 | ) 38 | 39 | def _create_graph(self): 40 | Dx = self._Dx 41 | 42 | if len(self._cond_t_lst) == 0: 43 | mu_and_logsig_t = tf.get_variable( 44 | 'params', self._layer_sizes[-1], 45 | initializer=tf.random_normal_initializer(0, 0.1) 46 | ) 47 | else: 48 | mu_and_logsig_t = mlp( 49 | inputs=self._cond_t_lst, 50 | layer_sizes=self._layer_sizes, 51 | output_nonlinearity=None, 52 | ) # ... 
x K*Dx*2+K 53 | 54 | self._mu_t = mu_and_logsig_t[..., :Dx] 55 | self._log_sig_t = tf.clip_by_value(mu_and_logsig_t[..., Dx:], LOG_SIG_CAP_MIN, LOG_SIG_CAP_MAX) 56 | 57 | # Tensorflow's multivariate normal distribution supports reparameterization 58 | ds = tf.contrib.distributions 59 | dist = ds.MultivariateNormalDiag(loc=self._mu_t, scale_diag=tf.exp(self._log_sig_t)) 60 | x_t = dist.sample() 61 | if not self._reparameterize: 62 | x_t = tf.stop_gradient(x_t) 63 | log_pi_t = dist.log_prob(x_t) 64 | 65 | self._dist = dist 66 | self._x_t = x_t 67 | self._log_pi_t = log_pi_t 68 | 69 | reg_loss_t = self._reg * 0.5 * tf.reduce_mean(self._log_sig_t ** 2) 70 | reg_loss_t += self._reg * 0.5 * tf.reduce_mean(self._mu_t ** 2) 71 | self._reg_loss_t = reg_loss_t 72 | 73 | 74 | 75 | @property 76 | def log_p_t(self): 77 | return self._log_pi_t 78 | 79 | @property 80 | def reg_loss_t(self): 81 | return self._reg_loss_t 82 | 83 | @property 84 | def x_t(self): 85 | return self._x_t 86 | 87 | @property 88 | def mu_t(self): 89 | return self._mu_t 90 | 91 | @property 92 | def log_sig_t(self): 93 | return self._log_sig_t 94 | -------------------------------------------------------------------------------- /environments/__init__.py: -------------------------------------------------------------------------------- 1 | from .multigoal import MultiGoalEnv 2 | from .gym_env import GymEnv 3 | from .delayed_env import DelayedEnv -------------------------------------------------------------------------------- /environments/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /environments/__pycache__/delayed_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/delayed_env.cpython-35.pyc -------------------------------------------------------------------------------- /environments/__pycache__/gym_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/gym_env.cpython-35.pyc -------------------------------------------------------------------------------- /environments/__pycache__/multigoal.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/multigoal.cpython-35.pyc -------------------------------------------------------------------------------- /environments/__pycache__/pusher.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/environments/__pycache__/pusher.cpython-35.pyc -------------------------------------------------------------------------------- /environments/delayed_env.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | 
from rllab.envs.proxy_env import ProxyEnv 4 | from rllab.core.serializable import Serializable 5 | 6 | 7 | class DelayedEnv(ProxyEnv, Serializable): 8 | def __init__(self, env, delay=0.01): 9 | Serializable.quick_init(self, locals()) 10 | ProxyEnv.__init__(self, env) 11 | 12 | self._delay = delay 13 | 14 | def step(self, action): 15 | time.sleep(self._delay) 16 | return self._wrapped_env.step(action) 17 | -------------------------------------------------------------------------------- /environments/gym_env.py: -------------------------------------------------------------------------------- 1 | """ Rllab implementation with a HACK. See comment in `GymEnv.__init__`. """ 2 | import gym 3 | import gym.wrappers 4 | import gym.envs 5 | import gym.spaces 6 | import traceback 7 | import logging 8 | 9 | try: 10 | from gym import logger as monitor_logger 11 | 12 | monitor_logger.setLevel(logging.WARNING) 13 | except Exception as e: 14 | traceback.print_exc() 15 | 16 | import os 17 | import os.path as osp 18 | from rllab.envs.base import Env, Step 19 | from rllab.core.serializable import Serializable 20 | from rllab.spaces.box import Box 21 | from rllab.spaces.discrete import Discrete 22 | from rllab.spaces.product import Product 23 | from rllab.misc import logger 24 | 25 | 26 | def convert_gym_space(space): 27 | if isinstance(space, gym.spaces.Box): 28 | return Box(low=space.low, high=space.high) 29 | elif isinstance(space, gym.spaces.Discrete): 30 | return Discrete(n=space.n) 31 | elif isinstance(space, gym.spaces.Tuple): 32 | return Product([convert_gym_space(x) for x in space.spaces]) 33 | else: 34 | raise NotImplementedError 35 | 36 | 37 | class CappedCubicVideoSchedule(object): 38 | # Copied from gym, since this method is frequently moved around 39 | def __call__(self, count): 40 | if count < 1000: 41 | return int(round(count ** (1. / 3))) ** 3 == count 42 | else: 43 | return count % 1000 == 0 44 | 45 | 46 | class FixedIntervalVideoSchedule(object): 47 | def __init__(self, interval): 48 | self.interval = interval 49 | 50 | def __call__(self, count): 51 | return count % self.interval == 0 52 | 53 | 54 | class NoVideoSchedule(object): 55 | def __call__(self, count): 56 | return False 57 | 58 | 59 | class GymEnv(Env, Serializable): 60 | def __init__(self, env_name, record_video=False, video_schedule=None, log_dir=None, record_log=False, 61 | force_reset=True): 62 | if log_dir is None: 63 | if logger.get_snapshot_dir() is None: 64 | logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.") 65 | else: 66 | log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log") 67 | Serializable.quick_init(self, locals()) 68 | 69 | env = gym.envs.make(env_name) 70 | 71 | # HACK: Gets rid of the TimeLimit wrapper that sets 'done = True' when 72 | # the time limit specified for each environment has been passed and 73 | # therefore the environment is not Markovian (terminal condition depends 74 | # on time rather than state). 
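# gym.envs.make() returns the environment wrapped in TimeLimit; accessing
# .env strips that outermost wrapper.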
75 | env = env.env 76 | 77 | self.env = env 78 | self.env_id = env.spec.id 79 | 80 | assert not (not record_log and record_video) 81 | 82 | if log_dir is None or record_log is False: 83 | self.monitoring = False 84 | else: 85 | if not record_video: 86 | video_schedule = NoVideoSchedule() 87 | else: 88 | if video_schedule is None: 89 | video_schedule = CappedCubicVideoSchedule() 90 | self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True) 91 | self.monitoring = True 92 | 93 | self._observation_space = convert_gym_space(env.observation_space) 94 | logger.log("observation space: {}".format(self._observation_space)) 95 | self._action_space = convert_gym_space(env.action_space) 96 | logger.log("action space: {}".format(self._action_space)) 97 | self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps'] 98 | self._log_dir = log_dir 99 | self._force_reset = force_reset 100 | 101 | @property 102 | def observation_space(self): 103 | return self._observation_space 104 | 105 | @property 106 | def action_space(self): 107 | return self._action_space 108 | 109 | @property 110 | def horizon(self): 111 | return self._horizon 112 | 113 | def reset(self): 114 | if self._force_reset and self.monitoring: 115 | from gym.wrappers.monitoring import Monitor 116 | assert isinstance(self.env, Monitor) 117 | recorder = self.env.stats_recorder 118 | if recorder is not None: 119 | recorder.done = True 120 | return self.env.reset() 121 | 122 | def step(self, action): 123 | next_obs, reward, done, info = self.env.step(action) 124 | return Step(next_obs, reward, done, **info) 125 | 126 | def render(self, mode='human', close=False): 127 | return self.env._render(mode, close) 128 | # self.env.render() 129 | 130 | def terminate(self): 131 | if self.monitoring: 132 | self.env._close() 133 | if self._log_dir is not None: 134 | print(""" 135 | *************************** 136 | 137 | Training finished! 
You can upload results to OpenAI Gym by running the following command: 138 | 139 | python scripts/submit_gym.py %s 140 | 141 | *************************** 142 | """ % self._log_dir) 143 | 144 | -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- 1 | from .gym_env import GymEnv 2 | from .cheetah_hurdle_env import HalfCheetahHurdleEnv 3 | from .multi_direction_env import ( 4 | MultiDirectionSwimmerEnv, 5 | MultiDirectionAntEnv, 6 | MultiDirectionHumanoidEnv) 7 | 8 | from .random_goal_ant_env import RandomGoalAntEnv 9 | from .pusher import PusherEnv 10 | from .cross_maze_ant_env import CrossMazeAntEnv 11 | from .simple_maze_ant_env import SimpleMazeAntEnv 12 | from .hierarchy_proxy_env import HierarchyProxyEnv 13 | from .multigoal import MultiGoalEnv 14 | -------------------------------------------------------------------------------- /envs/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/cheetah_hurdle_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/cheetah_hurdle_env.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/cross_maze_ant_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/cross_maze_ant_env.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/gym_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/gym_env.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/helpers.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/helpers.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/hierarchy_proxy_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/hierarchy_proxy_env.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/multi_direction_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/multi_direction_env.cpython-35.pyc 
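The envs package exported above (envs/__init__.py) provides the composition environments used by mujoco_am_sac.py. A minimal usage sketch, assuming rllab, mujoco-py, and this repository are importable and the MuJoCo XML models resolve via PROJECT_PATH:

```
from envs.cross_maze_ant_env import CrossMazeAntEnv

# The goal is sampled from [[6, -6], [6, 6], [12, 0]] unless
# fixed_goal_position is passed to the constructor.
env = CrossMazeAntEnv()
observation = env.reset()

# rllab-style Step namedtuple: (observation, reward, done, info).
next_observation, reward, done, info = env.step(env.action_space.sample())
```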
-------------------------------------------------------------------------------- /envs/__pycache__/multigoal.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/multigoal.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/pusher.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/pusher.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/random_goal_ant_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/random_goal_ant_env.cpython-35.pyc -------------------------------------------------------------------------------- /envs/__pycache__/simple_maze_ant_env.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/envs/__pycache__/simple_maze_ant_env.cpython-35.pyc -------------------------------------------------------------------------------- /envs/cheetah_hurdle_env.py: -------------------------------------------------------------------------------- 1 | """Implements a ant which is sparsely rewarded for reaching a goal""" 2 | #from gym.envs.mujoco.half_cheetah import HalfCheetahEnv 3 | #from gym.envs.mujoco.mujoco_env import MujocoEnv 4 | 5 | 6 | from rllab.core.serializable import Serializable 7 | from sac.misc.utils import PROJECT_PATH 8 | from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv 9 | from rllab.envs.mujoco.mujoco_env import MujocoEnv 10 | from rllab.envs.base import Step 11 | from gym import utils 12 | import os 13 | import numpy as np 14 | 15 | MODELS_PATH = os.path.abspath(os.path.join(PROJECT_PATH, 'sac/mujoco_models')) 16 | 17 | class HalfCheetahHurdleEnv(HalfCheetahEnv): 18 | def __init__(self): 19 | self.exteroceptive_observation =[12.0,0,0.5] 20 | self.hurdles_xpos=[-15.,-13.,-9.,-5.,-1.,3.,7.,11.,15.]#,19.,23.,27.] 
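# x-coordinates of the hurdles along the track; get_current_obs(), the
# collision check, and the hurdle reward below use them to find the next
# hurdle ahead of the cheetah's feet.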
21 | path = os.path.join(MODELS_PATH, 'half_cheetah_hurdle.xml') 22 | MujocoEnv.__init__(self,file_path=path) 23 | #MujocoEnv.__init__(self) 24 | Serializable.quick_init(self, locals()) 25 | 26 | def get_current_obs(self): 27 | proprioceptive_observation = super().get_current_obs() 28 | x_pos1 =self.get_body_com('ffoot')[0]#self.model.data.qpos.flat[:1] 29 | x_pos2 =self.get_body_com('bfoot')[0]#self.model.data.qpos.flat[:1] 30 | matches = [x for x in self.hurdles_xpos if x >= x_pos2] 31 | next_hurdle_x_pos = [matches[0]] 32 | ff_dist_frm_next_hurdle=[np.linalg.norm(matches[0] - x_pos1)] 33 | bf_dist_frm_next_hurdle=[np.linalg.norm(matches[0] - x_pos2)] 34 | observation =np.concatenate([proprioceptive_observation,next_hurdle_x_pos,bf_dist_frm_next_hurdle]).reshape(-1) 35 | return observation 36 | 37 | def isincollision(self): 38 | hurdle_size=[0.05,1.0,0.03] 39 | x_pos =self.get_body_com('ffoot')[0]#self.model.data.qpos.flat[:1] 40 | matches = [x for x in self.hurdles_xpos if x >= x_pos] 41 | if len(matches)==0: 42 | return False 43 | hurdle_pos =[matches[0],0.0,0.20] 44 | #names=['fthigh','bthigh'] 45 | #names=['torso','bthigh','bshin','bfoot'] 46 | names=['ffoot'] 47 | xyz_pos=[] 48 | for i in range(0,len(names)): 49 | xyz_pos.append(self.get_body_com(names[i])) 50 | for i in range(0,len(names)): 51 | #xyz_position = self.get_body_com(names[i]) 52 | cf=True 53 | for j in range(0,1): 54 | if abs(hurdle_pos[j]-xyz_pos[i][j])>1.5*hurdle_size[j]: 55 | cf=False 56 | break 57 | if cf: 58 | return True 59 | return False 60 | 61 | def get_hurdle_reward(self): 62 | hurdle_size=[0.05,1.0,0.03] 63 | x_pos =self.get_body_com('bfoot')[0]#self.model.data.qpos.flat[:1] 64 | matches = [x for x in self.hurdles_xpos if x >= x_pos] 65 | hurdle_reward =-1.0*len(matches) 66 | 67 | return hurdle_reward 68 | 69 | def step(self, action): 70 | xyz_pos_before = self.get_body_com('bshin') 71 | self.forward_dynamics(action) 72 | xyz_pos_after = self.get_body_com('bshin') 73 | xyz_position = self.get_body_com('torso') 74 | jump_reward = np.abs(self.get_body_comvel("torso")[2]) 75 | run_reward = self.get_body_comvel("torso")[0] 76 | next_obs= self.get_current_obs() 77 | if self.isincollision():# or (xyz_pos_after[0]-xyz_pos_before[0])<-0.01:#dist_from_hurdle < 1 and dist_from_hurdle > 0.3 and z_after<0.05:(xyz_pos_after[0]-xyz_pos_before[0])<-0.01: # 78 | collision_penality=-2.0 79 | #print("collision") 80 | else: 81 | collision_penality=0.0 82 | #print("not collisions") 83 | hurdle_reward = self.get_hurdle_reward() 84 | #print(hurdle_reward) 85 | done = False 86 | goal_reward=0 87 | goal_distance =np.linalg.norm(xyz_position - self.exteroceptive_observation) 88 | if (goal_distance)<1.0: 89 | done=True 90 | goal_reward=1000 91 | else: 92 | done=False 93 | 94 | reward=-1e-1*goal_distance+hurdle_reward+goal_reward+run_reward+3e-1*jump_reward+collision_penality#1e-1*goal_distance+run_reward+jump_reward+collision_penality 95 | info = {'goal_distance': goal_distance} 96 | return Step(next_obs, reward, done, **info) 97 | -------------------------------------------------------------------------------- /envs/cross_maze_ant_env.py: -------------------------------------------------------------------------------- 1 | """Implements an ant whose goal is to reach a target in a maze""" 2 | 3 | import os 4 | 5 | import numpy as np 6 | 7 | from rllab.core.serializable import Serializable 8 | from sac.misc.utils import PROJECT_PATH 9 | from .helpers import random_point_in_circle, get_random_goal_logs 10 | from .random_goal_ant_env 
import RandomGoalAntEnv 11 | 12 | MODELS_PATH = os.path.abspath( 13 | os.path.join(PROJECT_PATH, 'sac/mujoco_models')) 14 | 15 | class CrossMazeAntEnv(RandomGoalAntEnv, Serializable): 16 | """Implements an ant whose goal is to reach a target in a maze""" 17 | 18 | FILE_PATH = os.path.join(MODELS_PATH, 'cross_maze_ant.xml') 19 | 20 | def __init__(self, 21 | reward_type='dense', 22 | terminate_at_goal=True, 23 | goal_reward_weight=3e-1, 24 | goal_radius=1, 25 | goal_distance=1, 26 | goal_angle_range=(0, 2*np.pi), 27 | velocity_reward_weight=0, 28 | ctrl_cost_coeff=1e-2, 29 | contact_cost_coeff=1e-3, 30 | survive_reward=5e-2, 31 | fixed_goal_position=None, 32 | *args, 33 | **kwargs): 34 | file_path = self.__class__.FILE_PATH 35 | kwargs.pop('file_path', None) 36 | self.fixed_goal_position = fixed_goal_position 37 | 38 | super(CrossMazeAntEnv, self).__init__( 39 | file_path=file_path, 40 | reward_type=reward_type, 41 | terminate_at_goal=terminate_at_goal, 42 | goal_reward_weight=goal_reward_weight, 43 | goal_radius=goal_radius, 44 | goal_distance=goal_distance, 45 | goal_angle_range=goal_angle_range, 46 | velocity_reward_weight=velocity_reward_weight, 47 | ctrl_cost_coeff=ctrl_cost_coeff, 48 | contact_cost_coeff=contact_cost_coeff, 49 | survive_reward=survive_reward, 50 | *args, 51 | **kwargs) 52 | self._serializable_initialized = False 53 | 54 | def reset(self, goal_position=None, *args, **kwargs): 55 | possible_goal_positions = [[6, -6], [6, 6], [12, 0]] 56 | 57 | if goal_position is None: 58 | if self.fixed_goal_position is not None: 59 | goal_position = self.fixed_goal_position 60 | else: 61 | goal_position = possible_goal_positions[ 62 | np.random.choice(len(possible_goal_positions))] 63 | 64 | observation = super(CrossMazeAntEnv, self).reset( 65 | goal_position=np.array(goal_position), *args, **kwargs) 66 | 67 | return observation 68 | 69 | def get_current_obs(self): 70 | observation = super().get_current_obs() 71 | 72 | if self.fixed_goal_position is not None: 73 | return observation[:-2] 74 | 75 | return observation 76 | 77 | def render(self, *args, **kwargs): 78 | result = super(CrossMazeAntEnv, self).render(*args, **kwargs) 79 | self.viewer.cam.elevation = -55 80 | self.viewer.cam.lookat[0] = 7 81 | self.viewer.cam.lookat[2] = 0 82 | self.viewer.cam.distance = self.model.stat.extent * 0.9 83 | self.viewer.cam.azimuth = 0 84 | self.viewer.cam.trackbodyid = 0 85 | 86 | return result 87 | -------------------------------------------------------------------------------- /envs/delayed_env.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from rllab.envs.proxy_env import ProxyEnv 4 | from rllab.core.serializable import Serializable 5 | 6 | 7 | class DelayedEnv(ProxyEnv, Serializable): 8 | def __init__(self, env, delay=0.01): 9 | Serializable.quick_init(self, locals()) 10 | ProxyEnv.__init__(self, env) 11 | 12 | self._delay = delay 13 | 14 | def step(self, action): 15 | time.sleep(self._delay) 16 | return self._wrapped_env.step(action) 17 | -------------------------------------------------------------------------------- /envs/gym_env.py: -------------------------------------------------------------------------------- 1 | """ Rllab implementation with a HACK. See comment in GymEnv.__init__(). 
""" 2 | import gym 3 | import gym.wrappers 4 | import gym.envs 5 | import gym.spaces 6 | import traceback 7 | import logging 8 | 9 | try: 10 | from gym import logger as monitor_logger 11 | 12 | monitor_logger.setLevel(logging.WARNING) 13 | except Exception as e: 14 | traceback.print_exc() 15 | 16 | import os 17 | import os.path as osp 18 | from rllab.envs.base import Env, Step 19 | from rllab.core.serializable import Serializable 20 | from rllab.spaces.box import Box 21 | from rllab.spaces.discrete import Discrete 22 | from rllab.spaces.product import Product 23 | from rllab.misc import logger 24 | 25 | 26 | def convert_gym_space(space): 27 | if isinstance(space, gym.spaces.Box): 28 | return Box(low=space.low, high=space.high) 29 | elif isinstance(space, gym.spaces.Discrete): 30 | return Discrete(n=space.n) 31 | elif isinstance(space, gym.spaces.Tuple): 32 | return Product([convert_gym_space(x) for x in space.spaces]) 33 | else: 34 | raise NotImplementedError 35 | 36 | 37 | class CappedCubicVideoSchedule(object): 38 | # Copied from gym, since this method is frequently moved around 39 | def __call__(self, count): 40 | if count < 1000: 41 | return int(round(count ** (1. / 3))) ** 3 == count 42 | else: 43 | return count % 1000 == 0 44 | 45 | 46 | class FixedIntervalVideoSchedule(object): 47 | def __init__(self, interval): 48 | self.interval = interval 49 | 50 | def __call__(self, count): 51 | return count % self.interval == 0 52 | 53 | 54 | class NoVideoSchedule(object): 55 | def __call__(self, count): 56 | return False 57 | 58 | 59 | class GymEnv(Env, Serializable): 60 | def __init__(self, env_name, record_video=False, video_schedule=None, log_dir=None, record_log=False, 61 | force_reset=True): 62 | if log_dir is None: 63 | if logger.get_snapshot_dir() is None: 64 | logger.log("Warning: skipping Gym environment monitoring since snapshot_dir not configured.") 65 | else: 66 | log_dir = os.path.join(logger.get_snapshot_dir(), "gym_log") 67 | Serializable.quick_init(self, locals()) 68 | 69 | env = gym.envs.make(env_name) 70 | 71 | # HACK: Gets rid of the TimeLimit wrapper that sets 'done = True' when 72 | # the time limit specified for each environment has been passed and 73 | # therefore the environment is not Markovian (terminal condition depends 74 | # on time rather than state). 
75 | env = env.env 76 | 77 | self.env = env 78 | self.env_id = env.spec.id 79 | 80 | assert not (not record_log and record_video) 81 | 82 | if log_dir is None or record_log is False: 83 | self.monitoring = False 84 | else: 85 | if not record_video: 86 | video_schedule = NoVideoSchedule() 87 | else: 88 | if video_schedule is None: 89 | video_schedule = CappedCubicVideoSchedule() 90 | self.env = gym.wrappers.Monitor(self.env, log_dir, video_callable=video_schedule, force=True) 91 | self.monitoring = True 92 | 93 | self._observation_space = convert_gym_space(env.observation_space) 94 | logger.log("observation space: {}".format(self._observation_space)) 95 | self._action_space = convert_gym_space(env.action_space) 96 | logger.log("action space: {}".format(self._action_space)) 97 | self._horizon = env.spec.tags['wrapper_config.TimeLimit.max_episode_steps'] 98 | self._log_dir = log_dir 99 | self._force_reset = force_reset 100 | 101 | @property 102 | def observation_space(self): 103 | return self._observation_space 104 | 105 | @property 106 | def action_space(self): 107 | return self._action_space 108 | 109 | @property 110 | def horizon(self): 111 | return self._horizon 112 | 113 | def reset(self): 114 | if self._force_reset and self.monitoring: 115 | from gym.wrappers.monitoring import Monitor 116 | assert isinstance(self.env, Monitor) 117 | recorder = self.env.stats_recorder 118 | if recorder is not None: 119 | recorder.done = True 120 | return self.env.reset() 121 | 122 | def step(self, action): 123 | next_obs, reward, done, info = self.env.step(action) 124 | return Step(next_obs, reward, done, **info) 125 | 126 | def render(self, mode='human', close=False): 127 | return self.env._render(mode, close) 128 | # self.env.render() 129 | 130 | def terminate(self): 131 | if self.monitoring: 132 | self.env._close() 133 | if self._log_dir is not None: 134 | print(""" 135 | *************************** 136 | 137 | Training finished! 
You can upload results to OpenAI Gym by running the following command: 138 | 139 | python scripts/submit_gym.py %s 140 | 141 | *************************** 142 | """ % self._log_dir) 143 | -------------------------------------------------------------------------------- /envs/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def random_point_in_circle(angle_range=(0, 2*np.pi), radius=(0, 25)): 5 | angle = np.random.uniform(*angle_range) 6 | radius = radius if np.isscalar(radius) else np.random.uniform(*radius) 7 | x, y = np.cos(angle) * radius, np.sin(angle) * radius 8 | point = np.array([x, y]) 9 | return point 10 | 11 | def get_random_goal_logs(paths, goal_radius, fixed_goal_position=False): 12 | if fixed_goal_position: 13 | position_slice = slice(-3, -1) 14 | else: 15 | position_slice = slice(-5, -3) 16 | 17 | logs = [] 18 | if len(paths) > 0: 19 | progs = [ 20 | np.linalg.norm(path["observations"][-1][position_slice] 21 | - path["observations"][0][position_slice]) 22 | for path in paths 23 | ] 24 | 25 | time_in_goals = [ 26 | np.sum(np.linalg.norm( 27 | ( 28 | path['observations'][:, position_slice] 29 | - [path_goal['goal_position'] for path_goal in path['env_infos']] 30 | ) 31 | , axis=1 32 | ) < goal_radius) 33 | for path in paths 34 | ] 35 | 36 | logs += [ 37 | ('AverageProgress', np.mean(progs)), 38 | ('MaxProgress', np.max(progs)), 39 | ('MinProgress', np.min(progs)), 40 | ('StdProgress', np.std(progs)), 41 | 42 | ('AverageTimeInGoal', np.mean(time_in_goals)), 43 | ('MaxTimeInGoal', np.max(time_in_goals)), 44 | ('MinTimeInGoal', np.min(time_in_goals)), 45 | ('StdTimeInGoal', np.std(time_in_goals)), 46 | ] 47 | 48 | goal_positions, final_positions = zip(*[ 49 | ([path_goal['goal_position'] for path_goal in p['env_infos']][-1], 50 | p['observations'][-1][position_slice]) 51 | for p in paths 52 | ]) 53 | 54 | begin_goal_distances = [ 55 | np.linalg.norm(goal_position) for goal_position in goal_positions] 56 | final_goal_distances = [ 57 | np.linalg.norm(goal_position - final_position) 58 | for goal_position, final_position in zip(goal_positions, final_positions) 59 | ] 60 | progress_towards_goals = [ 61 | begin_goal_distance - final_goal_distance 62 | for (begin_goal_distance, final_goal_distance) 63 | in zip(begin_goal_distances, final_goal_distances) 64 | ] 65 | 66 | 67 | for series, name in zip((begin_goal_distances, 68 | final_goal_distances, 69 | progress_towards_goals), 70 | ('BeginGoalDistance', 71 | 'FinalGoalDistance', 72 | 'ProgressTowardsGoal')): 73 | for fn_name in ('mean', 'std', 'min', 'max'): 74 | fn = getattr(np, fn_name) 75 | logs.append((fn_name.capitalize() + name, fn(series))) 76 | 77 | return logs 78 | 79 | def get_multi_direction_logs(paths): 80 | progs = [ 81 | np.linalg.norm(path["observations"][-1][-3:-1] 82 | - path["observations"][0][-3:-1]) 83 | for path in paths 84 | ] 85 | logs = ( 86 | ('AverageProgress', np.mean(progs)), 87 | ('MaxProgress', np.max(progs)), 88 | ('MinProgress', np.min(progs)), 89 | ('StdProgress', np.std(progs)), 90 | ) 91 | 92 | return logs 93 | -------------------------------------------------------------------------------- /envs/hierarchy_proxy_env.py: -------------------------------------------------------------------------------- 1 | """Implements an environment proxy to test hierarchy policies""" 2 | 3 | from rllab.envs.proxy_env import ProxyEnv 4 | from rllab.core.serializable import Serializable 5 | 6 | class HierarchyProxyEnv(ProxyEnv): 7 | def 
__init__(self, low_level_policy, *args, **kwargs): 8 | Serializable.quick_init(self, locals()) 9 | self._low_level_policy = low_level_policy 10 | super().__init__(*args, **kwargs) 11 | 12 | def step(self, high_level_action): 13 | current_observation = ( 14 | # Our env might be double wrapped, e.g. around NormalizedEnv 15 | self._wrapped_env._wrapped_env.get_current_obs() 16 | if isinstance(self._wrapped_env, ProxyEnv) 17 | else self._wrapped_env.get_current_obs()) 18 | 19 | with self._low_level_policy.deterministic(h=high_level_action[None]): 20 | action, _ = self._low_level_policy.get_action( 21 | observation=current_observation[:self._low_level_policy._Ds]) 22 | 23 | return super().step(action) 24 | -------------------------------------------------------------------------------- /envs/meta_env.py: -------------------------------------------------------------------------------- 1 | from rllab import spaces 2 | from rllab.core.serializable import Serializable 3 | from rllab.envs.env_spec import EnvSpec 4 | 5 | from sac.misc.utils import concat_obs_z 6 | 7 | import numpy as np 8 | 9 | class MetaEnv(Serializable): 10 | def __init__(self, env, base_policy, num_skills, steps_per_option=100): 11 | Serializable.quick_init(self, locals()) 12 | self._base_policy = base_policy 13 | self._env = env 14 | self._steps_per_option = steps_per_option 15 | self._num_skills = num_skills 16 | self.observation_space = self._env.observation_space 17 | self.action_space = spaces.Discrete(num_skills) 18 | self.spec = EnvSpec(self.observation_space, self.action_space) 19 | self._obs = self.reset() 20 | 21 | def step(self, meta_action): 22 | total_reward = 0 23 | for _ in range(self._steps_per_option): 24 | aug_obs = concat_obs_z(self._obs, meta_action, self._num_skills) 25 | (action, _) = self._base_policy.get_action(aug_obs) 26 | (self._obs, r, done, _) = self._env.step(action) 27 | total_reward += r 28 | if done: break 29 | # Normalize the total reward by number of steps 30 | return (self._obs, total_reward / float(self._steps_per_option), done, {}) 31 | 32 | def reset(self): 33 | return self._env.reset() 34 | 35 | def log_diagnostics(self, paths): 36 | self._env.log_diagnostics(paths) 37 | 38 | def terminate(self): 39 | self._env.terminate() 40 | 41 | 42 | class FixedOptionEnv(Serializable): 43 | def __init__(self, env, num_skills, z): 44 | Serializable.quick_init(self, locals()) 45 | self._env = env 46 | self._num_skills = num_skills 47 | self._z = z 48 | obs_space = self._env.observation_space 49 | low = np.hstack([obs_space.low, np.full(num_skills, 0)]) 50 | high = np.hstack([obs_space.high, np.full(num_skills, 1)]) 51 | self.observation_space = spaces.Box(low=low, high=high) 52 | self.action_space = self._env.action_space 53 | self.spec = EnvSpec(self.observation_space, self.action_space) 54 | 55 | def step(self, action): 56 | (obs, r, done, info) = self._env.step(action) 57 | aug_obs = concat_obs_z(obs, self._z, self._num_skills) 58 | return (aug_obs, r, done, info) 59 | 60 | def reset(self): 61 | obs = self._env.reset() 62 | aug_obs = concat_obs_z(obs, self._z, self._num_skills) 63 | return aug_obs 64 | 65 | def log_diagnostics(self, paths): 66 | self._env.log_diagnostics(paths) 67 | 68 | def terminate(self): 69 | self._env.terminate() 70 | -------------------------------------------------------------------------------- /envs/multi_direction_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import 
Serializable 4 | from rllab.envs.mujoco.swimmer_env import SwimmerEnv 5 | from rllab.envs.mujoco.ant_env import AntEnv 6 | from rllab.envs.mujoco.humanoid_env import HumanoidEnv 7 | from rllab.envs.base import Step 8 | from rllab.misc import logger 9 | 10 | from .helpers import get_multi_direction_logs 11 | 12 | class MultiDirectionBaseEnv(Serializable): 13 | def __init__(self, 14 | velocity_reward_weight=1.0, 15 | survive_reward=0, 16 | ctrl_cost_coeff=0, 17 | contact_cost_coeff=0, 18 | velocity_deviation_cost_coeff=0, 19 | *args, **kwargs): 20 | self._velocity_reward_weight = velocity_reward_weight 21 | self._survive_reward = survive_reward 22 | 23 | self._ctrl_cost_coeff = ctrl_cost_coeff 24 | self._contact_cost_coeff = contact_cost_coeff 25 | self._velocity_deviation_cost_coeff = velocity_deviation_cost_coeff 26 | Serializable.quick_init(self, locals()) 27 | 28 | @property 29 | def velocity_reward(self): 30 | xy_velocities = self.get_body_comvel("torso")[:2] 31 | #xy_velocities = self.get_body_comvel("torso")[0] 32 | # rewards for speed on xy-plane (no matter which direction) 33 | xy_velocity = np.linalg.norm(xy_velocities) 34 | 35 | velocity_reward = self._velocity_reward_weight * xy_velocity 36 | return velocity_reward 37 | 38 | @property 39 | def survive_reward(self): 40 | return self._survive_reward 41 | 42 | def control_cost(self, action): 43 | lb, ub = self.action_bounds 44 | scaling = (ub - lb) / 2.0 45 | 46 | return 0.5 * self._ctrl_cost_coeff * np.sum( 47 | np.square(action / scaling)) 48 | 49 | @property 50 | def contact_cost(self): 51 | return 0.5 * self._contact_cost_coeff * np.sum( 52 | np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), 53 | 54 | @property 55 | def is_healthy(self): 56 | return True 57 | 58 | @property 59 | def velocity_deviation_cost(self): 60 | velocity_deviation_cost = ( 61 | 0.5 * 62 | self._velocity_deviation_cost_coeff 63 | * np.sum(np.square(self.get_body_comvel("torso")[2:]))) 64 | return velocity_deviation_cost 65 | 66 | @property 67 | def done(self): 68 | done = not self.is_healthy 69 | return done 70 | 71 | 72 | def step(self, action): 73 | self.forward_dynamics(action) 74 | 75 | reward = ( 76 | self.velocity_reward 77 | + self.survive_reward 78 | - self.control_cost(action) 79 | - self.contact_cost 80 | - self.velocity_deviation_cost) 81 | 82 | next_observation = self.get_current_obs() 83 | #return Step(next_observation, float(reward), self.done) 84 | return Step(next_observation, float(reward), False) 85 | 86 | def log_diagnostics(self, paths, *args, **kwargs): 87 | logs = get_multi_direction_logs(paths) 88 | for row in logs: 89 | logger.record_tabular(*row) 90 | 91 | 92 | class MultiDirectionSwimmerEnv(MultiDirectionBaseEnv, SwimmerEnv): 93 | def __init__(self, 94 | ctrl_cost_coeff=1e-2, 95 | *args, **kwargs): 96 | MultiDirectionBaseEnv.__init__( 97 | self, ctrl_cost_coeff=ctrl_cost_coeff, *args, **kwargs) 98 | SwimmerEnv.__init__( 99 | self, ctrl_cost_coeff=ctrl_cost_coeff, *args, **kwargs) 100 | 101 | @property 102 | def velocity_reward(self): 103 | xy_velocities = self.get_body_comvel("torso")[:2] 104 | 105 | # rewards for speed on positive x direction 106 | xy_velocity = np.linalg.norm(xy_velocities) 107 | if xy_velocities[0] < 0: 108 | xy_velocity *= -1.0 109 | 110 | velocity_reward = self._velocity_reward_weight * xy_velocity 111 | return velocity_reward 112 | 113 | class MultiDirectionAntEnv(MultiDirectionBaseEnv, AntEnv): 114 | def __init__(self, 115 | ctrl_cost_coeff=1e-2, 116 | contact_cost_coeff=1e-3, 117 | survive_reward=5e-2, 
118 | *args, **kwargs): 119 | MultiDirectionBaseEnv.__init__( 120 | self, 121 | ctrl_cost_coeff=ctrl_cost_coeff, 122 | contact_cost_coeff=contact_cost_coeff, 123 | survive_reward=survive_reward, 124 | *args, **kwargs) 125 | AntEnv.__init__(self, *args, **kwargs) 126 | 127 | @property 128 | def is_healthy(self): 129 | return (np.isfinite(self._state).all() 130 | and 0.2 <= self._state[2] <= 1.0) 131 | 132 | class MultiDirectionHumanoidEnv(MultiDirectionBaseEnv, HumanoidEnv): 133 | def __init__(self, 134 | survive_reward=2e-1, 135 | ctrl_cost_coeff=1e-3, 136 | contact_cost_coeff=1e-5, 137 | velocity_deviation_cost_coeff=1e-2, 138 | *args, **kwargs): 139 | MultiDirectionBaseEnv.__init__( 140 | self, 141 | survive_reward=survive_reward, 142 | ctrl_cost_coeff=ctrl_cost_coeff, 143 | contact_cost_coeff=contact_cost_coeff, 144 | velocity_deviation_cost_coeff=velocity_deviation_cost_coeff, 145 | *args, **kwargs) 146 | HumanoidEnv.__init__( 147 | self, 148 | # survive_reward=survive_reward, 149 | alive_bonus=survive_reward, # TODO: remove this 150 | ctrl_cost_coeff=ctrl_cost_coeff, 151 | # contact_cost_coeff=contact_cost_coeff, 152 | impact_cost_coeff=contact_cost_coeff, # TODO: remove this 153 | vel_deviation_cost_coeff=velocity_deviation_cost_coeff, # TODO: remove this 154 | *args, **kwargs) 155 | 156 | @property 157 | def is_healthy(self): 158 | return 0.8 < self.model.data.qpos[2] < 2.0 159 | -------------------------------------------------------------------------------- /envs/simple_maze_ant_env.py: -------------------------------------------------------------------------------- 1 | """Implements an ant whose goal is to reach a target in a maze""" 2 | 3 | import os 4 | 5 | import numpy as np 6 | 7 | from rllab.core.serializable import Serializable 8 | from sac.misc.utils import PROJECT_PATH 9 | from .helpers import random_point_in_circle, get_random_goal_logs 10 | from .random_goal_ant_env import RandomGoalAntEnv 11 | 12 | MODELS_PATH = os.path.abspath( 13 | os.path.join(PROJECT_PATH, 'sac/mujoco_models')) 14 | 15 | class SimpleMazeAntEnv(RandomGoalAntEnv, Serializable): 16 | """Implements an ant whose goal is to reach a target in a maze""" 17 | 18 | FILE_PATH = os.path.join(MODELS_PATH, 'simple_maze_ant.xml') 19 | 20 | def __init__(self, 21 | reward_type='dense', 22 | terminate_at_goal=True, 23 | goal_reward_weight=3e-1, 24 | goal_radius=1.0, 25 | goal_distance=1.0, 26 | goal_angle_range=(0, 2*np.pi), 27 | velocity_reward_weight=0, 28 | ctrl_cost_coeff=1e-2, 29 | contact_cost_coeff=1e-3, 30 | survive_reward=5e-2, 31 | *args, 32 | **kwargs): 33 | file_path = self.__class__.FILE_PATH 34 | kwargs.pop('file_path', None) 35 | super(SimpleMazeAntEnv, self).__init__( 36 | file_path=file_path, 37 | reward_type=reward_type, 38 | terminate_at_goal=terminate_at_goal, 39 | goal_reward_weight=goal_reward_weight, 40 | goal_radius=goal_radius, 41 | goal_distance=goal_distance, 42 | goal_angle_range=goal_angle_range, 43 | velocity_reward_weight=velocity_reward_weight, 44 | ctrl_cost_coeff=ctrl_cost_coeff, 45 | contact_cost_coeff=contact_cost_coeff, 46 | survive_reward=survive_reward, 47 | *args, 48 | **kwargs) 49 | 50 | def reset(self, *args, **kwargs): 51 | observation = super(SimpleMazeAntEnv, self).reset( 52 | goal_position=np.array([20, -13]), *args, **kwargs) 53 | 54 | return observation 55 | -------------------------------------------------------------------------------- /misc/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /misc/__pycache__/instrument.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/instrument.cpython-35.pyc -------------------------------------------------------------------------------- /misc/__pycache__/mlp.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/mlp.cpython-35.pyc -------------------------------------------------------------------------------- /misc/__pycache__/plotter.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/plotter.cpython-35.pyc -------------------------------------------------------------------------------- /misc/__pycache__/sampler.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/sampler.cpython-35.pyc -------------------------------------------------------------------------------- /misc/__pycache__/tf_utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/tf_utils.cpython-35.pyc -------------------------------------------------------------------------------- /misc/__pycache__/utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/misc/__pycache__/utils.cpython-35.pyc -------------------------------------------------------------------------------- /misc/instrument.py: -------------------------------------------------------------------------------- 1 | import os 2 | import uuid 3 | 4 | from rllab.misc.instrument import run_experiment_lite 5 | from sac.misc.utils import timestamp 6 | 7 | from sac.misc.utils import PROJECT_PATH 8 | 9 | DEFAULT_LOG_DIR = PROJECT_PATH + "/data" 10 | 11 | def _create_symlink(folder): 12 | # Create a symbolic link that points to the sac folder and include it 13 | # in the tarball. 14 | 15 | # Unique filename for the symlink. 
16 | include_path = os.path.join('/tmp/', str(uuid.uuid4())) 17 | os.makedirs(include_path) 18 | 19 | os.symlink(os.path.join(PROJECT_PATH, folder), 20 | os.path.join(include_path, folder)) 21 | 22 | return include_path 23 | 24 | 25 | def run_sac_experiment(main, mode, include_folders=None, log_dir=None, 26 | exp_prefix="experiment", exp_name=None, **kwargs): 27 | if exp_name is None: 28 | exp_name = timestamp() 29 | 30 | if log_dir is None: 31 | log_dir = os.path.join( 32 | DEFAULT_LOG_DIR, 33 | "local", 34 | exp_prefix.replace("_", "-"), 35 | exp_name) 36 | 37 | if include_folders is None: 38 | include_folders = list() 39 | 40 | if mode == 'ec2': 41 | include_folders.append('sac') 42 | all_symlinks = list() 43 | 44 | for folder in include_folders: 45 | all_symlinks.append(_create_symlink(folder)) 46 | 47 | kwargs.update(added_project_directories=all_symlinks) 48 | 49 | run_experiment_lite( 50 | stub_method_call=main, 51 | mode=mode, 52 | exp_prefix=exp_prefix, 53 | exp_name=exp_name, 54 | log_dir=log_dir, 55 | **kwargs, 56 | ) 57 | -------------------------------------------------------------------------------- /misc/plotter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | 5 | class QFPolicyPlotter: 6 | def __init__(self, qf, policy, obs_lst, default_action, n_samples): 7 | self._qf = qf 8 | self._policy = policy 9 | self._obs_lst = obs_lst 10 | self._default_action = default_action 11 | self._n_samples = n_samples 12 | 13 | self._var_inds = np.where(np.isnan(default_action))[0] 14 | assert len(self._var_inds) == 2 15 | 16 | n_plots = len(obs_lst) 17 | 18 | x_size = 5 * n_plots 19 | y_size = 5 20 | 21 | fig = plt.figure(figsize=(x_size, y_size)) 22 | self._ax_lst = [] 23 | for i in range(n_plots): 24 | ax = fig.add_subplot(100 + n_plots * 10 + i + 1) 25 | ax.set_xlim((-1, 1)) 26 | ax.set_ylim((-1, 1)) 27 | ax.grid(True) 28 | self._ax_lst.append(ax) 29 | 30 | self._line_objects = list() 31 | 32 | def draw(self): 33 | # noinspection PyArgumentList 34 | [h.remove() for h in self._line_objects] 35 | self._line_objects = list() 36 | 37 | self._plot_level_curves() 38 | self._plot_action_samples() 39 | 40 | plt.draw() 41 | plt.pause(0.001) 42 | 43 | def _plot_level_curves(self): 44 | # Create mesh grid. 45 | xs = np.linspace(-1, 1, 50) 46 | ys = np.linspace(-1, 1, 50) 47 | xgrid, ygrid = np.meshgrid(xs, ys) 48 | N = len(xs)*len(ys) 49 | 50 | # Copy default values along the first axis and replace nans with 51 | # the mesh grid points. 
52 | actions = np.tile(self._default_action, (N, 1)) 53 | actions[:, self._var_inds[0]] = xgrid.ravel() 54 | actions[:, self._var_inds[1]] = ygrid.ravel() 55 | 56 | for ax, obs in zip(self._ax_lst, self._obs_lst): 57 | qs = self._qf.eval(obs[None], actions) 58 | qs = qs.reshape(xgrid.shape) 59 | 60 | cs = ax.contour(xgrid, ygrid, qs, 20) 61 | self._line_objects += cs.collections 62 | self._line_objects += ax.clabel( 63 | cs, inline=1, fontsize=10, fmt='%.2f') 64 | 65 | def _plot_action_samples(self): 66 | for ax, obs in zip(self._ax_lst, self._obs_lst): 67 | actions = self._policy.get_actions( 68 | np.ones((self._n_samples, 1)) * obs[None, :]) 69 | 70 | x, y = actions[:, 0], actions[:, 1] 71 | self._line_objects += ax.plot(x, y, 'b*') 72 | -------------------------------------------------------------------------------- /misc/remote_sampler.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import ray # TODO: Add ray to dependencies. 3 | import tensorflow as tf 4 | import numpy as np 5 | 6 | from rllab.misc.overrides import overrides 7 | from rllab.misc import logger 8 | 9 | from . import tf_utils 10 | from .sampler import Sampler, rollout 11 | 12 | # TODO: Make the remote sampler correctly use the initial exploration policy, as of now, using this will fail 13 | 14 | class RemoteSampler(Sampler): 15 | def __init__(self, **kwargs): 16 | super(RemoteSampler, self).__init__(**kwargs) 17 | 18 | self._remote_environment = None 19 | self._remote_path = None 20 | self._n_episodes = 0 21 | self._total_samples = 0 22 | self._last_path_return = 0 23 | self._max_path_return = -np.inf 24 | 25 | @overrides 26 | def initialize(self, env, policy, pool): 27 | super(RemoteSampler, self).initialize(env, policy, pool) 28 | 29 | ray.init() 30 | 31 | env_pkl = pickle.dumps(env) 32 | policy_pkl = pickle.dumps(policy) 33 | 34 | self._remote_environment = _RemoteEnv.remote(env_pkl, policy_pkl) 35 | 36 | def sample(self): 37 | if self._remote_path is None: 38 | policy_params = self.policy.get_param_values() 39 | self._remote_path = self._remote_environment.rollout.remote( 40 | policy_params, self._max_path_length) 41 | 42 | path_ready, _ = ray.wait([self._remote_path], timeout=0) 43 | 44 | if len(path_ready) or not self.batch_ready(): 45 | path = ray.get(self._remote_path) 46 | self.pool.add_path(path) 47 | self._remote_path = None 48 | self._total_samples += len(path['observations']) 49 | self._last_path_return = np.sum(path['rewards']) 50 | self._max_path_return = max(self._max_path_return, 51 | self._last_path_return) 52 | self._n_episodes += 1 53 | 54 | def log_diagnostics(self): 55 | logger.record_tabular('max-path-return', self._max_path_return) 56 | logger.record_tabular('last-path-return', self._last_path_return) 57 | logger.record_tabular('pool-size', self.pool.size) 58 | logger.record_tabular('episodes', self._n_episodes) 59 | logger.record_tabular('total-samples', self._total_samples) 60 | 61 | 62 | @ray.remote 63 | class _RemoteEnv(object): 64 | def __init__(self, env_pkl, policy_pkl): 65 | self._sess = tf_utils.create_session() 66 | self._sess.run(tf.global_variables_initializer()) 67 | 68 | self._env = pickle.loads(env_pkl) 69 | self._policy = pickle.loads(policy_pkl) 70 | 71 | if hasattr(self._env, 'initialize'): 72 | self._env.initialize() 73 | 74 | def rollout(self, policy_params, path_length): 75 | self._policy.set_param_values(policy_params) 76 | path = rollout(self._env, self._policy, path_length) 77 | 78 | return path 79 | 
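The RemoteSampler above submits a rollout to a Ray actor and then polls for it without blocking, so training can continue while the episode is collected. Below is a self-contained toy sketch of that polling pattern; the slow_rollout task and its path dict are illustrative stand-ins, not part of this repository.

import time
import ray

ray.init()

@ray.remote
def slow_rollout(path_length):
    # stand-in for the env.step() loop that _RemoteEnv.rollout runs
    time.sleep(1.0)
    return {'rewards': [1.0] * path_length}

pending = slow_rollout.remote(10)  # analogous to self._remote_environment.rollout.remote(...)
while True:
    ready, _ = ray.wait([pending], timeout=0)  # non-blocking check, as in RemoteSampler.sample()
    if ready:
        path = ray.get(pending)
        print('path return:', sum(path['rewards']))
        break
    time.sleep(0.1)  # a real trainer would run gradient updates here instead of sleeping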
-------------------------------------------------------------------------------- /misc/tf_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from rllab import config 3 | 4 | 5 | def get_default_session(): 6 | return tf.get_default_session() or create_session() 7 | 8 | 9 | def create_session(**kwargs): 10 | """ Create new tensorflow session with given configuration. """ 11 | if "config" not in kwargs: 12 | kwargs["config"] = get_configuration() 13 | return tf.InteractiveSession(**kwargs) 14 | 15 | 16 | def get_configuration(): 17 | """ Returns personal tensorflow configuration. """ 18 | if config.USE_GPU: 19 | raise NotImplementedError 20 | 21 | config_args = dict() 22 | return tf.ConfigProto(**config_args) 23 | -------------------------------------------------------------------------------- /misc/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import datetime 3 | import dateutil.tz 4 | import os 5 | import numpy as np 6 | 7 | PROJECT_PATH = os.path.dirname( 8 | os.path.realpath(os.path.join(__file__, '..', '..'))) 9 | 10 | def timestamp(): 11 | now = datetime.datetime.now(dateutil.tz.tzlocal()) 12 | return now.strftime('%Y-%m-%d-%H-%M-%S-%f-%Z') 13 | 14 | def deep_update(d, u): 15 | for k, v in u.items(): 16 | d[k] = ( 17 | deep_update(d.get(k, {}), v) 18 | if isinstance(v, collections.Mapping) 19 | else v) 20 | 21 | return d 22 | 23 | def get_git_rev(): 24 | try: 25 | import git 26 | repo = git.Repo(os.getcwd()) 27 | git_rev = repo.active_branch.commit.name_rev 28 | except: 29 | git_rev = None 30 | 31 | return git_rev 32 | 33 | def flatten(unflattened, parent_key='', separator='.'): 34 | items = [] 35 | for k, v in unflattened.items(): 36 | if separator in k: 37 | raise ValueError( 38 | "Found separator ({}) from key ({})".format(separator, k)) 39 | new_key = parent_key + separator + k if parent_key else k 40 | if isinstance(v, collections.MutableMapping) and v: 41 | items.extend(flatten(v, new_key, separator=separator).items()) 42 | else: 43 | items.append((new_key, v)) 44 | 45 | return dict(items) 46 | 47 | def unflatten(flattened, separator='.'): 48 | result = {} 49 | for key, value in flattened.items(): 50 | parts = key.split(separator) 51 | d = result 52 | for part in parts[:-1]: 53 | if part not in d: 54 | d[part] = {} 55 | d = d[part] 56 | d[parts[-1]] = value 57 | 58 | return result 59 | 60 | def concat_obs_z(obs, z, num_skills): 61 | """Concatenates the observation to a one-hot encoding of Z.""" 62 | assert np.isscalar(z) 63 | z_one_hot = np.zeros(num_skills) 64 | z_one_hot[z] = 1 65 | return np.hstack([obs, z_one_hot]) 66 | 67 | def split_aug_obs(aug_obs, num_skills): 68 | """Splits an augmented observation into the observation and Z.""" 69 | (obs, z_one_hot) = (aug_obs[:-num_skills], aug_obs[-num_skills:]) 70 | z = np.where(z_one_hot == 1)[0][0] 71 | return (obs, z) 72 | 73 | def _make_dir(filename): 74 | folder = os.path.dirname(filename) 75 | if not os.path.exists(folder): 76 | os.makedirs(folder) 77 | 78 | def _save_video(paths, filename): 79 | import cv2 80 | assert all(['ims' in path for path in paths]) 81 | ims = [im for path in paths for im in path['ims']] 82 | _make_dir(filename) 83 | 84 | # Define the codec and create VideoWriter object 85 | fourcc = cv2.VideoWriter_fourcc(*'MJPG') 86 | fps = 30.0 87 | (height, width, _) = ims[0].shape 88 | writer = cv2.VideoWriter(filename, fourcc, fps, (width, height)) 89 | for im in ims: 90 | 
writer.write(im) 91 | writer.release() 92 | 93 | def _softmax(x): 94 | max_x = np.max(x) 95 | exp_x = np.exp(x - max_x) 96 | return exp_x / np.sum(exp_x) 97 | -------------------------------------------------------------------------------- /mujoco_models/pusher_2d.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 72 | -------------------------------------------------------------------------------- /mujoco_models/simple_maze_ant.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 92 | -------------------------------------------------------------------------------- /policies/__init__.py: -------------------------------------------------------------------------------- 1 | from .nn_policy import NNPolicy 2 | from .nn_policy2 import NNPolicy2 3 | from .uniform_policy import UniformPolicy 4 | from .gaussian_policy import GaussianPolicy 5 | from .pointer_policy import GaussianPtrPolicy 6 | -------------------------------------------------------------------------------- /policies/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/base.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/base.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/gaussian_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/gaussian_policy.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/gmm.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/gmm.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/hierarchical_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/hierarchical_policy.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/latent_space_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/latent_space_policy.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/nn_policy.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/nn_policy.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/nn_policy2.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/nn_policy2.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/pointer_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/pointer_policy.cpython-35.pyc -------------------------------------------------------------------------------- /policies/__pycache__/uniform_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/policies/__pycache__/uniform_policy.cpython-35.pyc -------------------------------------------------------------------------------- /policies/base.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from sandbox.rocky.tf.core.parameterized import Parameterized 5 | 6 | 7 | class Policy2(Parameterized): 8 | def __init__(self, env_spec): 9 | Parameterized.__init__(self) 10 | self._env_spec = env_spec 11 | 12 | # Should be implemented by all policies 13 | 14 | def get_action(self, observation, sub_level_actions): 15 | raise NotImplementedError 16 | 17 | def get_actions(self, observations,sub_level_actions): 18 | raise NotImplementedError 19 | 20 | def reset(self, dones=None): 21 | pass 22 | 23 | @property 24 | def vectorized(self): 25 | """ 26 | Indicates whether the policy is vectorized. If True, it should implement get_actions(), and support resetting 27 | with multiple simultaneous states. 28 | """ 29 | return False 30 | 31 | @property 32 | def observation_space(self): 33 | return self._env_spec.observation_space 34 | 35 | @property 36 | def action_space(self): 37 | return self._env_spec.action_space 38 | 39 | @property 40 | def env_spec(self): 41 | return self._env_spec 42 | 43 | @property 44 | def recurrent(self): 45 | """ 46 | Indicates whether the policy is recurrent. 47 | :return: 48 | """ 49 | return False 50 | 51 | def log_diagnostics(self, paths): 52 | """ 53 | Log extra information per iteration based on the collected paths 54 | """ 55 | pass 56 | 57 | @property 58 | def state_info_keys(self): 59 | """ 60 | Return keys for the information related to the policy's state when taking an action. 61 | :return: 62 | """ 63 | return [k for k, _ in self.state_info_specs] 64 | 65 | @property 66 | def state_info_specs(self): 67 | """ 68 | Return keys and shapes for the information related to the policy's state when taking an action. 
69 | :return: 70 | """ 71 | return list() 72 | 73 | def terminate(self): 74 | """ 75 | Clean up operation 76 | """ 77 | pass 78 | 79 | 80 | class StochasticPolicy(Policy2): 81 | @property 82 | def distribution(self): 83 | """ 84 | :rtype Distribution 85 | """ 86 | raise NotImplementedError 87 | 88 | def dist_info_sym(self, obs_var, state_info_vars): 89 | """ 90 | Return the symbolic distribution information about the actions. 91 | :param obs_var: symbolic variable for observations 92 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at 93 | the time it received the observation 94 | :return: 95 | """ 96 | raise NotImplementedError 97 | 98 | def dist_info(self, obs, state_infos): 99 | """ 100 | Return the distribution information about the actions. 101 | :param obs_var: observation values 102 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at 103 | the time it received the observation 104 | :return: 105 | """ 106 | raise NotImplementedError 107 | -------------------------------------------------------------------------------- /policies/nn_policy.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rllab.core.serializable import Serializable 4 | 5 | from rllab.misc.overrides import overrides 6 | from sandbox.rocky.tf.policies.base import Policy 7 | 8 | 9 | class NNPolicy(Policy, Serializable): 10 | def __init__(self, env_spec, observation_ph, actions, 11 | scope_name=None): 12 | Serializable.quick_init(self, locals()) 13 | 14 | self._observations_ph = observation_ph 15 | self._actions = actions 16 | self._scope_name = ( 17 | tf.get_variable_scope().name if not scope_name else scope_name 18 | ) 19 | super(NNPolicy, self).__init__(env_spec) 20 | 21 | @overrides 22 | def get_action(self, observation): 23 | """Sample single action based on the observations.""" 24 | return self.get_actions(observation[None])[0], {} 25 | 26 | @overrides 27 | def get_actions(self, observations): 28 | """Sample actions based on the observations.""" 29 | feed_dict = {self._observations_ph: observations} 30 | actions = tf.get_default_session().run(self._actions, feed_dict) 31 | return actions 32 | 33 | @overrides 34 | def log_diagnostics(self, paths): 35 | pass 36 | 37 | @overrides 38 | def get_params_internal(self, **tags): 39 | if tags: 40 | raise NotImplementedError 41 | scope = self._scope_name 42 | # Add "/" to 'scope' unless it's empty (otherwise get_collection will 43 | # return all parameters that start with 'scope'. 
44 | scope = scope if scope == '' else scope + '/' 45 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 46 | -------------------------------------------------------------------------------- /policies/nn_policy2.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rllab.core.serializable import Serializable 4 | 5 | from rllab.misc.overrides import overrides 6 | from sac.policies.base import Policy2 7 | 8 | 9 | class NNPolicy2(Policy2, Serializable): 10 | def __init__(self, env_spec, observation_ph, actions, 11 | scope_name=None): 12 | Serializable.quick_init(self, locals()) 13 | 14 | self._observations_ph = observation_ph 15 | self._actions = actions 16 | self._scope_name = ( 17 | tf.get_variable_scope().name if not scope_name else scope_name 18 | ) 19 | super(NNPolicy2, self).__init__(env_spec) 20 | 21 | @overrides 22 | def get_action(self, observation,sub_level_actions): 23 | """Sample single action based on the observations.""" 24 | return self.get_actions(observation[None],sub_level_actions)[0], {} 25 | 26 | @overrides 27 | def get_actions(self, observations,sub_level_actions): 28 | """Sample actions based on the observations.""" 29 | feed_dict = {self._observations_ph: observations,self.sub_level_actions:sub_level_actions} 30 | actions = tf.get_default_session().run(self._actions, feed_dict) 31 | return actions 32 | 33 | @overrides 34 | def log_diagnostics(self, paths): 35 | pass 36 | 37 | @overrides 38 | def get_params_internal(self, **tags): 39 | if tags: 40 | raise NotImplementedError 41 | scope = self._scope_name 42 | # Add "/" to 'scope' unless it's empty (otherwise get_collection will 43 | # return all parameters that start with 'scope'. 44 | scope = scope if scope == '' else scope + '/' 45 | return tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope) 46 | 47 | 48 | -------------------------------------------------------------------------------- /policies/uniform_policy.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | 3 | from rllab.misc.overrides import overrides 4 | from sac.policies.base import Policy2 5 | 6 | import numpy as np 7 | 8 | 9 | class UniformPolicy(Policy2, Serializable): 10 | """ 11 | Fixed policy that randomly samples actions uniformly at random. 12 | 13 | Used for an initial exploration period instead of an undertrained policy. 
14 | """ 15 | def __init__(self, env_spec): 16 | Serializable.quick_init(self, locals()) 17 | self._Da = env_spec.action_space.flat_dim 18 | 19 | super(UniformPolicy, self).__init__(env_spec) 20 | 21 | # Assumes action spaces are normalized to be the interval [-1, 1] 22 | @overrides 23 | def get_action(self, observation,sub_level_actions): 24 | return np.random.uniform(-1., 1., self._Da), None 25 | '''@overrides 26 | def get_action(self, observation,sub_level_actions): 27 | probs=np.random.uniform(0.0, 1., 4) 28 | probs=np.argmax(probs)#probs/sum(probs) 29 | #probs=np.array([1,0,0,0],dtype=np.float32) 30 | #probs=np.random.shuffle(probs) 31 | #actions_mean=np.sum(np.multiply(sub_level_actions[0],np.expand_dims(probs,2)),1) 32 | return sub_level_actions[0][0][probs], None 33 | #return np.random.uniform(-1., 1., self._Da), None''' 34 | 35 | @overrides 36 | def get_actions(self, observations,sub_level_actions): 37 | pass 38 | 39 | @overrides 40 | def log_diagnostics(self, paths): 41 | pass 42 | 43 | @overrides 44 | def get_params_internal(self, **tags): 45 | pass 46 | 47 | -------------------------------------------------------------------------------- /preprocessors/__init__.py: -------------------------------------------------------------------------------- 1 | from .mlp_preprocessor import MLPPreprocessor 2 | -------------------------------------------------------------------------------- /preprocessors/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/preprocessors/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /preprocessors/__pycache__/mlp_preprocessor.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/preprocessors/__pycache__/mlp_preprocessor.cpython-35.pyc -------------------------------------------------------------------------------- /preprocessors/mlp_preprocessor.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rllab.core.serializable import Serializable 4 | 5 | from sandbox.rocky.tf.core.parameterized import Parameterized 6 | 7 | from sac.misc.mlp import MLPFunction 8 | from sac.misc import tf_utils 9 | 10 | class MLPPreprocessor(MLPFunction): 11 | def __init__(self, env_spec, layer_sizes=(128, 16), 12 | output_nonlinearity=None, name='observations_preprocessor'): 13 | 14 | Parameterized.__init__(self) 15 | Serializable.quick_init(self, locals()) 16 | 17 | self._name = name 18 | 19 | self._Do = env_spec.observation_space.flat_dim 20 | 21 | obs_ph = tf.placeholder( 22 | tf.float32, 23 | shape=(None, self._Do), 24 | name='observations', 25 | ) 26 | 27 | self._input_pls = (obs_ph, ) 28 | self._layer_sizes = layer_sizes 29 | self._output_nonlinearity = output_nonlinearity 30 | 31 | self._output_t = self.get_output_for(obs_ph, reuse=tf.AUTO_REUSE) 32 | -------------------------------------------------------------------------------- /primitive-policies/ant/bwrd/bwrd.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/ant/bwrd/bwrd.pkl -------------------------------------------------------------------------------- /primitive-policies/ant/dwrd/dwrd.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/ant/dwrd/dwrd.pkl -------------------------------------------------------------------------------- /primitive-policies/ant/fwrd/fwrd.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/ant/fwrd/fwrd.pkl -------------------------------------------------------------------------------- /primitive-policies/ant/uwrd/uwrd.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/ant/uwrd/uwrd.pkl -------------------------------------------------------------------------------- /primitive-policies/hc/fwd/fwd.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/hc/fwd/fwd.pkl -------------------------------------------------------------------------------- /primitive-policies/hc/jp-longz/jump.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/hc/jp-longz/jump.pkl -------------------------------------------------------------------------------- /primitive-policies/pusher/bottom/bottom.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/pusher/bottom/bottom.pkl -------------------------------------------------------------------------------- /primitive-policies/pusher/left/left.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/primitive-policies/pusher/left/left.pkl -------------------------------------------------------------------------------- /replay_buffers/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_replay_buffer import SimpleReplayBuffer -------------------------------------------------------------------------------- /replay_buffers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/replay_buffers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /replay_buffers/__pycache__/replay_buffer.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/replay_buffers/__pycache__/replay_buffer.cpython-35.pyc -------------------------------------------------------------------------------- /replay_buffers/__pycache__/simple_replay_buffer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/replay_buffers/__pycache__/simple_replay_buffer.cpython-35.pyc -------------------------------------------------------------------------------- /replay_buffers/replay_buffer.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class ReplayBuffer(object, metaclass=abc.ABCMeta): 5 | """ 6 | A class used to save and replay data. 7 | """ 8 | 9 | @abc.abstractmethod 10 | def add_sample(self, observation, action, reward, next_observation, 11 | terminal, **kwargs): 12 | """ 13 | Add a transition tuple. 14 | """ 15 | pass 16 | 17 | @abc.abstractmethod 18 | def terminate_episode(self): 19 | """ 20 | Let the replay buffer know that the episode has terminated in case some 21 | special book-keeping has to happen. 22 | :return: 23 | """ 24 | pass 25 | 26 | @property 27 | @abc.abstractmethod 28 | def size(self, **kwargs): 29 | """ 30 | :return: # of unique items that can be sampled. 31 | """ 32 | pass 33 | 34 | def add_path(self, path): 35 | """ 36 | Add a path to the replay buffer. 37 | 38 | This default implementation naively goes through every step, but you 39 | may want to optimize this. 40 | 41 | NOTE: You should NOT call "terminate_episode" after calling add_path. 42 | It's assumed that this function handles the episode termination. 43 | 44 | :param path: Dict like one outputted by railrl.samplers.util.rollout 45 | """ 46 | for i, ( 47 | obs, 48 | sub_level_actions, 49 | action, 50 | reward, 51 | next_obs, 52 | terminal, 53 | agent_info, 54 | env_info 55 | ) in enumerate(zip( 56 | path["observations"], 57 | path["sub_level_actions"], 58 | path["actions"], 59 | path["rewards"], 60 | path["next_observations"], 61 | path["terminals"], 62 | path["agent_infos"], 63 | path["env_infos"], 64 | )): 65 | self.add_sample( 66 | obs, 67 | sub_level_actions, 68 | action, 69 | reward, 70 | terminal, 71 | next_obs, 72 | agent_info=agent_info, 73 | env_info=env_info, 74 | ) 75 | self.terminate_episode() 76 | 77 | @abc.abstractmethod 78 | def random_batch(self, batch_size): 79 | """ 80 | Return a batch of size `batch_size`. 
81 | :param batch_size: 82 | :return: 83 | """ 84 | pass 85 | -------------------------------------------------------------------------------- /replay_buffers/simple_replay_buffer.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | 5 | from .replay_buffer import ReplayBuffer 6 | 7 | 8 | class SimpleReplayBuffer(ReplayBuffer, Serializable): 9 | def __init__(self, env_spec, max_replay_buffer_size, seq_len): 10 | super(SimpleReplayBuffer, self).__init__() 11 | Serializable.quick_init(self, locals()) 12 | 13 | max_replay_buffer_size = int(max_replay_buffer_size) 14 | 15 | self._env_spec = env_spec 16 | self._observation_dim = env_spec.observation_space.flat_dim 17 | self._action_dim = env_spec.action_space.flat_dim 18 | self._max_buffer_size = max_replay_buffer_size 19 | self._observations = np.zeros((max_replay_buffer_size, 20 | self._observation_dim)) 21 | # It's a bit memory inefficient to save the observations twice, 22 | # but it makes the code *much* easier since you no longer have to 23 | # worry about termination conditions. 24 | self._next_obs = np.zeros((max_replay_buffer_size, 25 | self._observation_dim)) 26 | self._sub_level_actions = np.zeros((max_replay_buffer_size,seq_len, self._action_dim)) 27 | self._sub_level_probs = np.zeros((max_replay_buffer_size,seq_len, 1)) 28 | self._actions = np.zeros((max_replay_buffer_size, self._action_dim)) 29 | self._rewards = np.zeros(max_replay_buffer_size) 30 | # self._terminals[i] = a terminal was received at time i 31 | self._terminals = np.zeros(max_replay_buffer_size, dtype='uint8') 32 | self._top = 0 33 | self._size = 0 34 | 35 | def add_sample(self, observation,sub_level_actions,sub_level_probs, action, reward, terminal, 36 | next_observation, **kwargs): 37 | self._observations[self._top] = observation 38 | self._sub_level_actions[self._top] = sub_level_actions 39 | self._sub_level_probs[self._top] = sub_level_probs 40 | self._actions[self._top] = action 41 | self._rewards[self._top] = reward 42 | self._terminals[self._top] = terminal 43 | self._next_obs[self._top] = next_observation 44 | 45 | self._advance() 46 | 47 | def terminate_episode(self): 48 | pass 49 | 50 | def _advance(self): 51 | self._top = (self._top + 1) % self._max_buffer_size 52 | if self._size < self._max_buffer_size: 53 | self._size += 1 54 | 55 | def random_batch(self, batch_size): 56 | indices = np.random.randint(0, self._size, batch_size) 57 | return dict( 58 | observations=self._observations[indices], 59 | sub_level_actions=self._sub_level_actions[indices], 60 | sub_level_probs=self._sub_level_probs[indices], 61 | actions=self._actions[indices], 62 | rewards=self._rewards[indices], 63 | terminals=self._terminals[indices], 64 | next_observations=self._next_obs[indices], 65 | ) 66 | 67 | @property 68 | def size(self): 69 | return self._size 70 | 71 | def __getstate__(self): 72 | d = super(SimpleReplayBuffer, self).__getstate__() 73 | d.update(dict( 74 | o=self._observations.tobytes(), 75 | sa=self._sub_level_actions.tobytes(), 76 | sp=self._sub_level_probs.tobytes(), 77 | a=self._actions.tobytes(), 78 | r=self._rewards.tobytes(), 79 | t=self._terminals.tobytes(), 80 | no=self._next_obs.tobytes(), 81 | top=self._top, 82 | size=self._size, 83 | )) 84 | return d 85 | 86 | def __setstate__(self, d): 87 | super(SimpleReplayBuffer, self).__setstate__(d) 88 | self._observations = np.fromstring(d['o']).reshape( 89 | self._max_buffer_size, -1 90 | ) 91 | 
self._next_obs = np.fromstring(d['no']).reshape( 92 | self._max_buffer_size, -1 93 | ) 94 | self._sub_level_actions = np.fromstring(d['sa']).reshape(self._max_buffer_size,seq_len, -1) 95 | self._sub_level_probs = np.fromstring(d['sp']).reshape(self._max_buffer_size,seq_len, -1) 96 | self._actions = np.fromstring(d['a']).reshape(self._max_buffer_size, -1) 97 | self._rewards = np.fromstring(d['r']).reshape(self._max_buffer_size) 98 | self._terminals = np.fromstring(d['t'], dtype=np.uint8) 99 | self._top = d['top'] 100 | self._size = d['size'] 101 | -------------------------------------------------------------------------------- /sandbox/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/algos/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/__pycache__/batch_polopt.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/algos/__pycache__/batch_polopt.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/__pycache__/npo.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/algos/__pycache__/npo.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/__pycache__/trpo.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/algos/__pycache__/trpo.cpython-35.pyc 
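A minimal usage sketch of the SimpleReplayBuffer defined above. It assumes rllab is installed and that this repository is importable as the sac package (as the other modules' imports suggest); the SimpleNamespace spec is a hypothetical stand-in for an rllab EnvSpec, exposing only the two fields the buffer reads.

import numpy as np
from types import SimpleNamespace
from sac.replay_buffers import SimpleReplayBuffer  # import path assumed from replay_buffers/__init__.py

spec = SimpleNamespace(
    observation_space=SimpleNamespace(flat_dim=4),
    action_space=SimpleNamespace(flat_dim=2))

pool = SimpleReplayBuffer(spec, max_replay_buffer_size=1000, seq_len=3)

for _ in range(100):
    pool.add_sample(
        observation=np.random.randn(4),
        sub_level_actions=np.random.uniform(-1, 1, (3, 2)),   # one action per sub-level primitive
        sub_level_probs=np.random.dirichlet(np.ones(3))[:, None],
        action=np.random.uniform(-1, 1, 2),
        reward=0.0,
        terminal=False,
        next_observation=np.random.randn(4))

batch = pool.random_batch(32)
print(batch['observations'].shape)       # (32, 4)
print(batch['sub_level_actions'].shape)  # (32, 3, 2)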
-------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/npg.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/npo.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.misc import ext 5 | from rllab.misc.overrides import overrides 6 | import rllab.misc.logger as logger 7 | from sandbox.rocky.tf.optimizers.penalty_lbfgs_optimizer import PenaltyLbfgsOptimizer 8 | from sandbox.rocky.tf.algos.batch_polopt import BatchPolopt 9 | from sandbox.rocky.tf.misc import tensor_utils 10 | import tensorflow as tf 11 | 12 | 13 | class NPO(BatchPolopt): 14 | """ 15 | Natural Policy Optimization. 16 | """ 17 | 18 | def __init__( 19 | self, 20 | optimizer=None, 21 | optimizer_args=None, 22 | step_size=0.01, 23 | **kwargs): 24 | if optimizer is None: 25 | if optimizer_args is None: 26 | optimizer_args = dict() 27 | optimizer = PenaltyLbfgsOptimizer(**optimizer_args) 28 | self.optimizer = optimizer 29 | self.step_size = step_size 30 | super(NPO, self).__init__(**kwargs) 31 | 32 | @overrides 33 | def init_opt(self): 34 | is_recurrent = int(self.policy.recurrent) 35 | obs_var = self.env.observation_space.new_tensor_variable( 36 | 'obs', 37 | extra_dims=1 + is_recurrent, 38 | ) 39 | action_var = self.env.action_space.new_tensor_variable( 40 | 'action', 41 | extra_dims=1 + is_recurrent, 42 | ) 43 | advantage_var = tensor_utils.new_tensor( 44 | 'advantage', 45 | ndim=1 + is_recurrent, 46 | dtype=tf.float32, 47 | ) 48 | dist = self.policy.distribution 49 | 50 | old_dist_info_vars = { 51 | k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k) 52 | for k, shape in dist.dist_info_specs 53 | } 54 | old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys] 55 | 56 | state_info_vars = { 57 | k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k) 58 | for k, shape in self.policy.state_info_specs 59 | } 60 | state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys] 61 | 62 | if is_recurrent: 63 | valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid") 64 | else: 65 | valid_var = None 66 | 67 | dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) 68 | kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) 69 | lr = dist.likelihood_ratio_sym(action_var, old_dist_info_vars, dist_info_vars) 70 | if is_recurrent: 71 | mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var) 72 | surr_loss = - tf.reduce_sum(lr * advantage_var * valid_var) / tf.reduce_sum(valid_var) 73 | else: 74 | mean_kl = tf.reduce_mean(kl) 75 | surr_loss = - tf.reduce_mean(lr * advantage_var) 76 | 77 | input_list = [ 78 | obs_var, 79 | action_var, 80 | advantage_var, 81 | ] + state_info_vars_list + old_dist_info_vars_list 82 | if is_recurrent: 83 | input_list.append(valid_var) 84 | 85 | self.optimizer.update_opt( 86 | loss=surr_loss, 87 | target=self.policy, 88 | leq_constraint=(mean_kl, self.step_size), 89 | inputs=input_list, 90 | constraint_name="mean_kl" 91 | ) 92 | return dict() 93 | 94 | @overrides 95 | def optimize_policy(self, itr, samples_data): 96 | all_input_values = tuple(ext.extract( 97 | samples_data, 98 | "observations", "actions", "advantages" 99 | )) 100 | agent_infos = samples_data["agent_infos"] 101 | state_info_list = 
[agent_infos[k] for k in self.policy.state_info_keys] 102 | dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] 103 | all_input_values += tuple(state_info_list) + tuple(dist_info_list) 104 | if self.policy.recurrent: 105 | all_input_values += (samples_data["valids"],) 106 | logger.log("Computing loss before") 107 | loss_before = self.optimizer.loss(all_input_values) 108 | logger.log("Computing KL before") 109 | mean_kl_before = self.optimizer.constraint_val(all_input_values) 110 | logger.log("Optimizing") 111 | self.optimizer.optimize(all_input_values) 112 | logger.log("Computing KL after") 113 | mean_kl = self.optimizer.constraint_val(all_input_values) 114 | logger.log("Computing loss after") 115 | loss_after = self.optimizer.loss(all_input_values) 116 | logger.record_tabular('LossBefore', loss_before) 117 | logger.record_tabular('LossAfter', loss_after) 118 | logger.record_tabular('MeanKLBefore', mean_kl_before) 119 | logger.record_tabular('MeanKL', mean_kl) 120 | logger.record_tabular('dLoss', loss_before - loss_after) 121 | return dict() 122 | 123 | @overrides 124 | def get_itr_snapshot(self, itr, samples_data): 125 | return dict( 126 | itr=itr, 127 | policy=self.policy, 128 | baseline=self.baseline, 129 | env=self.env, 130 | ) 131 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/trpo.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from sandbox.rocky.tf.algos.npo import NPO 4 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 5 | 6 | 7 | class TRPO(NPO): 8 | """ 9 | Trust Region Policy Optimization 10 | """ 11 | 12 | def __init__( 13 | self, 14 | optimizer=None, 15 | optimizer_args=None, 16 | **kwargs): 17 | if optimizer is None: 18 | if optimizer_args is None: 19 | optimizer_args = dict() 20 | optimizer = ConjugateGradientOptimizer(**optimizer_args) 21 | super(TRPO, self).__init__(optimizer=optimizer, **kwargs) 22 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/vpg.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from rllab.misc import logger 4 | from rllab.misc import ext 5 | from rllab.misc.overrides import overrides 6 | from sandbox.rocky.tf.algos.batch_polopt import BatchPolopt 7 | from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer 8 | from sandbox.rocky.tf.misc import tensor_utils 9 | from rllab.core.serializable import Serializable 10 | import tensorflow as tf 11 | 12 | 13 | class VPG(BatchPolopt, Serializable): 14 | """ 15 | Vanilla Policy Gradient. 
16 | """ 17 | 18 | def __init__( 19 | self, 20 | env, 21 | policy, 22 | baseline, 23 | optimizer=None, 24 | optimizer_args=None, 25 | **kwargs): 26 | Serializable.quick_init(self, locals()) 27 | if optimizer is None: 28 | default_args = dict( 29 | batch_size=None, 30 | max_epochs=1, 31 | ) 32 | if optimizer_args is None: 33 | optimizer_args = default_args 34 | else: 35 | optimizer_args = dict(default_args, **optimizer_args) 36 | optimizer = FirstOrderOptimizer(**optimizer_args) 37 | self.optimizer = optimizer 38 | self.opt_info = None 39 | super(VPG, self).__init__(env=env, policy=policy, baseline=baseline, **kwargs) 40 | 41 | @overrides 42 | def init_opt(self): 43 | is_recurrent = int(self.policy.recurrent) 44 | 45 | obs_var = self.env.observation_space.new_tensor_variable( 46 | 'obs', 47 | extra_dims=1 + is_recurrent, 48 | ) 49 | action_var = self.env.action_space.new_tensor_variable( 50 | 'action', 51 | extra_dims=1 + is_recurrent, 52 | ) 53 | advantage_var = tensor_utils.new_tensor( 54 | name='advantage', 55 | ndim=1 + is_recurrent, 56 | dtype=tf.float32, 57 | ) 58 | dist = self.policy.distribution 59 | 60 | old_dist_info_vars = { 61 | k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name='old_%s' % k) 62 | for k, shape in dist.dist_info_specs 63 | } 64 | old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys] 65 | 66 | state_info_vars = { 67 | k: tf.placeholder(tf.float32, shape=[None] * (1 + is_recurrent) + list(shape), name=k) 68 | for k, shape in self.policy.state_info_specs 69 | } 70 | state_info_vars_list = [state_info_vars[k] for k in self.policy.state_info_keys] 71 | 72 | if is_recurrent: 73 | valid_var = tf.placeholder(tf.float32, shape=[None, None], name="valid") 74 | else: 75 | valid_var = None 76 | 77 | dist_info_vars = self.policy.dist_info_sym(obs_var, state_info_vars) 78 | logli = dist.log_likelihood_sym(action_var, dist_info_vars) 79 | kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) 80 | 81 | # formulate as a minimization problem 82 | # The gradient of the surrogate objective is the policy gradient 83 | if is_recurrent: 84 | surr_obj = - tf.reduce_sum(logli * advantage_var * valid_var) / tf.reduce_sum(valid_var) 85 | mean_kl = tf.reduce_sum(kl * valid_var) / tf.reduce_sum(valid_var) 86 | max_kl = tf.reduce_max(kl * valid_var) 87 | else: 88 | surr_obj = - tf.reduce_mean(logli * advantage_var) 89 | mean_kl = tf.reduce_mean(kl) 90 | max_kl = tf.reduce_max(kl) 91 | 92 | input_list = [obs_var, action_var, advantage_var] + state_info_vars_list 93 | if is_recurrent: 94 | input_list.append(valid_var) 95 | 96 | self.optimizer.update_opt(loss=surr_obj, target=self.policy, inputs=input_list) 97 | 98 | f_kl = tensor_utils.compile_function( 99 | inputs=input_list + old_dist_info_vars_list, 100 | outputs=[mean_kl, max_kl], 101 | ) 102 | self.opt_info = dict( 103 | f_kl=f_kl, 104 | ) 105 | 106 | @overrides 107 | def optimize_policy(self, itr, samples_data): 108 | logger.log("optimizing policy") 109 | inputs = ext.extract( 110 | samples_data, 111 | "observations", "actions", "advantages" 112 | ) 113 | agent_infos = samples_data["agent_infos"] 114 | state_info_list = [agent_infos[k] for k in self.policy.state_info_keys] 115 | inputs += tuple(state_info_list) 116 | if self.policy.recurrent: 117 | inputs += (samples_data["valids"],) 118 | dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] 119 | loss_before = self.optimizer.loss(inputs) 120 | self.optimizer.optimize(inputs) 121 | loss_after = 
self.optimizer.loss(inputs) 122 | logger.record_tabular("LossBefore", loss_before) 123 | logger.record_tabular("LossAfter", loss_after) 124 | 125 | mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list)) 126 | logger.record_tabular('MeanKL', mean_kl) 127 | logger.record_tabular('MaxKL', max_kl) 128 | 129 | @overrides 130 | def get_itr_snapshot(self, itr, samples_data): 131 | return dict( 132 | itr=itr, 133 | policy=self.policy, 134 | baseline=self.baseline, 135 | env=self.env, 136 | ) 137 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/core/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__pycache__/layers.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/core/__pycache__/layers.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__pycache__/layers_powered.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/core/__pycache__/layers_powered.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__pycache__/network.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/core/__pycache__/network.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__pycache__/parameterized.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/core/__pycache__/parameterized.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/layers_powered.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.core.parameterized import Parameterized 2 | import sandbox.rocky.tf.core.layers as L 3 | import itertools 4 | 5 | 6 | class LayersPowered(Parameterized): 7 | 8 | def __init__(self, output_layers, input_layers=None): 9 | self._output_layers = output_layers 10 | self._input_layers = input_layers 11 | Parameterized.__init__(self) 12 | 13 | def get_params_internal(self, **tags): 14 | layers = L.get_all_layers(self._output_layers, treat_as_input=self._input_layers) 15 | params = itertools.chain.from_iterable(l.get_params(**tags) for l in layers) 16 | return L.unique(params) 17 | 18 | 
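VPG.init_opt above builds the surrogate objective -mean(log pi(a|s) * advantage) and notes that its gradient is the policy gradient. The toy numpy check below illustrates that identity for a state-independent softmax policy; all names and numbers are illustrative and not part of the repository.

import numpy as np

theta = np.array([0.2, -0.1, 0.4])            # logits of a toy 3-action softmax policy
actions = np.array([0, 2, 1, 2])              # sampled actions
advantages = np.array([1.0, -0.5, 2.0, 0.3])  # their advantage estimates

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

def surrogate(th):
    p = softmax(th)
    return -np.mean(np.log(p[actions]) * advantages)

# analytic gradient of the surrogate: d log pi(a) / d theta_k = 1{k == a} - pi_k
pi = softmax(theta)
grad = np.zeros_like(theta)
for a, adv in zip(actions, advantages):
    dlogpi = -pi.copy()
    dlogpi[a] += 1.0
    grad += -adv * dlogpi
grad /= len(actions)

# central finite differences on the surrogate recover the same gradient
eps = 1e-6
fd = np.array([(surrogate(theta + eps * np.eye(3)[k])
                - surrogate(theta - eps * np.eye(3)[k])) / (2 * eps) for k in range(3)])
print(np.allclose(grad, fd))  # True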
-------------------------------------------------------------------------------- /sandbox/rocky/tf/core/parameterized.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.misc.tensor_utils import flatten_tensors, unflatten_tensors 5 | import tensorflow as tf 6 | 7 | 8 | load_params = True 9 | 10 | @contextmanager 11 | def suppress_params_loading(): 12 | global load_params 13 | load_params = False 14 | yield 15 | load_params = True 16 | 17 | 18 | class Parameterized(object): 19 | def __init__(self): 20 | self._cached_params = {} 21 | self._cached_param_dtypes = {} 22 | self._cached_param_shapes = {} 23 | self._cached_assign_ops = {} 24 | self._cached_assign_placeholders = {} 25 | 26 | def get_params_internal(self, **tags): 27 | """ 28 | Internal method to be implemented which does not perform caching 29 | """ 30 | raise NotImplementedError 31 | 32 | def get_params(self, **tags): 33 | """ 34 | Get the list of parameters, filtered by the provided tags. 35 | Some common tags include 'regularizable' and 'trainable' 36 | """ 37 | tag_tuple = tuple(sorted(list(tags.items()), key=lambda x: x[0])) 38 | if tag_tuple not in self._cached_params: 39 | self._cached_params[tag_tuple] = self.get_params_internal(**tags) 40 | return self._cached_params[tag_tuple] 41 | 42 | def get_param_dtypes(self, **tags): 43 | tag_tuple = tuple(sorted(list(tags.items()), key=lambda x: x[0])) 44 | if tag_tuple not in self._cached_param_dtypes: 45 | params = self.get_params(**tags) 46 | param_values = tf.get_default_session().run(params) 47 | self._cached_param_dtypes[tag_tuple] = [val.dtype for val in param_values] 48 | return self._cached_param_dtypes[tag_tuple] 49 | 50 | def get_param_shapes(self, **tags): 51 | tag_tuple = tuple(sorted(list(tags.items()), key=lambda x: x[0])) 52 | if tag_tuple not in self._cached_param_shapes: 53 | params = self.get_params(**tags) 54 | param_values = tf.get_default_session().run(params) 55 | self._cached_param_shapes[tag_tuple] = [val.shape for val in param_values] 56 | return self._cached_param_shapes[tag_tuple] 57 | 58 | def get_param_values(self, **tags): 59 | params = self.get_params(**tags) 60 | param_values = tf.get_default_session().run(params) 61 | return flatten_tensors(param_values) 62 | 63 | def set_param_values(self, flattened_params, **tags): 64 | debug = tags.pop("debug", False) 65 | param_values = unflatten_tensors( 66 | flattened_params, self.get_param_shapes(**tags)) 67 | ops = [] 68 | feed_dict = dict() 69 | for param, dtype, value in zip( 70 | self.get_params(**tags), 71 | self.get_param_dtypes(**tags), 72 | param_values): 73 | if param not in self._cached_assign_ops: 74 | assign_placeholder = tf.placeholder(dtype=param.dtype.base_dtype) 75 | assign_op = tf.assign(param, assign_placeholder) 76 | self._cached_assign_ops[param] = assign_op 77 | self._cached_assign_placeholders[param] = assign_placeholder 78 | ops.append(self._cached_assign_ops[param]) 79 | feed_dict[self._cached_assign_placeholders[param]] = value.astype(dtype) 80 | if debug: 81 | print("setting value of %s" % param.name) 82 | tf.get_default_session().run(ops, feed_dict=feed_dict) 83 | 84 | def flat_to_params(self, flattened_params, **tags): 85 | return unflatten_tensors(flattened_params, self.get_param_shapes(**tags)) 86 | 87 | def __getstate__(self): 88 | d = Serializable.__getstate__(self) 89 | global load_params 90 | if load_params: 91 | d["params"] = 
self.get_param_values() 92 | return d 93 | 94 | def __setstate__(self, d): 95 | Serializable.__setstate__(self, d) 96 | global load_params 97 | if load_params: 98 | tf.get_default_session().run(tf.variables_initializer(self.get_params())) 99 | self.set_param_values(d["params"]) 100 | 101 | 102 | class JointParameterized(Parameterized): 103 | def __init__(self, components): 104 | super(JointParameterized, self).__init__() 105 | self.components = components 106 | 107 | def get_params_internal(self, **tags): 108 | params = [param for comp in self.components for param in comp.get_params_internal(**tags)] 109 | # only return unique parameters 110 | return sorted(set(params), key=hash) 111 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/distributions/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/__pycache__/base.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/distributions/__pycache__/base.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/__pycache__/diagonal_gaussian.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/distributions/__pycache__/diagonal_gaussian.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/base.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | class Distribution(object): 6 | @property 7 | def dim(self): 8 | raise NotImplementedError 9 | 10 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 11 | """ 12 | Compute the symbolic KL divergence of two distributions 13 | """ 14 | raise NotImplementedError 15 | 16 | def kl(self, old_dist_info, new_dist_info): 17 | """ 18 | Compute the KL divergence of two distributions 19 | """ 20 | raise NotImplementedError 21 | 22 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 23 | raise NotImplementedError 24 | 25 | def entropy(self, dist_info): 26 | raise NotImplementedError 27 | 28 | def log_likelihood_sym(self, x_var, dist_info_vars): 29 | raise NotImplementedError 30 | 31 | def log_likelihood(self, xs, dist_info): 32 | raise NotImplementedError 33 | 34 | @property 35 | def dist_info_specs(self): 36 | raise NotImplementedError 37 | 38 | @property 39 | def dist_info_keys(self): 40 | return [k for k, _ in self.dist_info_specs] 41 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/bernoulli.py: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | from .base import Distribution 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | TINY = 1e-8 8 | 9 | 10 | class Bernoulli(Distribution): 11 | def __init__(self, dim): 12 | self._dim = dim 13 | 14 | @property 15 | def dim(self): 16 | return self._dim 17 | 18 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 19 | old_p = old_dist_info_vars["p"] 20 | new_p = new_dist_info_vars["p"] 21 | kl = old_p * (tf.log(old_p + TINY) - tf.log(new_p + TINY)) + \ 22 | (1 - old_p) * (tf.log(1 - old_p + TINY) - tf.log(1 - new_p + TINY)) 23 | ndims = kl.get_shape().ndims 24 | return tf.reduce_sum(kl, axis=ndims - 1) 25 | 26 | def kl(self, old_dist_info, new_dist_info): 27 | old_p = old_dist_info["p"] 28 | new_p = new_dist_info["p"] 29 | kl = old_p * (np.log(old_p + TINY) - np.log(new_p + TINY)) + \ 30 | (1 - old_p) * (np.log(1 - old_p + TINY) - np.log(1 - new_p + TINY)) 31 | return np.sum(kl, axis=-1) 32 | 33 | def sample(self, dist_info): 34 | p = np.asarray(dist_info["p"]) 35 | return np.cast['int'](np.random.uniform(low=0., high=1., size=p.shape) < p) 36 | 37 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 38 | old_p = old_dist_info_vars["p"] 39 | new_p = new_dist_info_vars["p"] 40 | ndims = old_p.get_shape().ndims 41 | return tf.reduce_prod(x_var * new_p / (old_p + TINY) + (1 - x_var) * (1 - new_p) / (1 - old_p + TINY), 42 | axis=ndims - 1) 43 | 44 | def log_likelihood_sym(self, x_var, dist_info_vars): 45 | p = dist_info_vars["p"] 46 | ndims = p.get_shape().ndims 47 | return tf.reduce_sum(x_var * tf.log(p + TINY) + (1 - x_var) * tf.log(1 - p + TINY), axis=ndims - 1) 48 | 49 | def log_likelihood(self, xs, dist_info): 50 | p = dist_info["p"] 51 | return np.sum(xs * np.log(p + TINY) + (1 - xs) * np.log(1 - p + TINY), axis=-1) 52 | 53 | def entropy(self, dist_info): 54 | p = dist_info["p"] 55 | return np.sum(- p * np.log(p + TINY) - (1 - p) * np.log(1 - p + TINY), axis=-1) 56 | 57 | @property 58 | def dist_info_keys(self): 59 | return ["p"] 60 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/categorical.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .base import Distribution 3 | import tensorflow as tf 4 | from sandbox.rocky.tf.misc import tensor_utils 5 | 6 | TINY = 1e-8 7 | 8 | 9 | def from_onehot(x_var): 10 | ret = np.zeros((len(x_var),), 'int32') 11 | nonzero_n, nonzero_a = np.nonzero(x_var) 12 | ret[nonzero_n] = nonzero_a 13 | return ret 14 | 15 | 16 | class Categorical(Distribution): 17 | def __init__(self, dim): 18 | self._dim = dim 19 | weights_var = tf.placeholder( 20 | dtype=tf.float32, 21 | shape=(None, dim), 22 | name="weights" 23 | ) 24 | self._f_sample = tensor_utils.compile_function( 25 | inputs=[weights_var], 26 | outputs=tf.multinomial(tf.log(weights_var + 1e-8), num_samples=1)[:, 0], 27 | ) 28 | 29 | @property 30 | def dim(self): 31 | return self._dim 32 | 33 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 34 | """ 35 | Compute the symbolic KL divergence of two categorical distributions 36 | """ 37 | old_prob_var = old_dist_info_vars["prob"] 38 | new_prob_var = new_dist_info_vars["prob"] 39 | ndims = old_prob_var.get_shape().ndims 40 | # Assume layout is N * A 41 | return tf.reduce_sum( 42 | old_prob_var * (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)), 43 | axis=ndims - 1 44 | ) 45 | 46 | 
def kl(self, old_dist_info, new_dist_info): 47 | """ 48 | Compute the KL divergence of two categorical distributions 49 | """ 50 | old_prob = old_dist_info["prob"] 51 | new_prob = new_dist_info["prob"] 52 | return np.sum( 53 | old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)), 54 | axis=-1 55 | ) 56 | 57 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 58 | old_prob_var = old_dist_info_vars["prob"] 59 | new_prob_var = new_dist_info_vars["prob"] 60 | ndims = old_prob_var.get_shape().ndims 61 | x_var = tf.cast(x_var, tf.float32) 62 | # Assume layout is N * A 63 | return (tf.reduce_sum(new_prob_var * x_var, ndims - 1) + TINY) / \ 64 | (tf.reduce_sum(old_prob_var * x_var, ndims - 1) + TINY) 65 | 66 | def entropy_sym(self, dist_info_vars): 67 | probs = dist_info_vars["prob"] 68 | return -tf.reduce_sum(probs * tf.log(probs + TINY), axis=1) 69 | 70 | def cross_entropy_sym(self, old_dist_info_vars, new_dist_info_vars): 71 | old_prob_var = old_dist_info_vars["prob"] 72 | new_prob_var = new_dist_info_vars["prob"] 73 | ndims = old_prob_var.get_shape().ndims 74 | # Assume layout is N * A 75 | return tf.reduce_sum( 76 | old_prob_var * (- tf.log(new_prob_var + TINY)), 77 | axis=ndims - 1 78 | ) 79 | 80 | def entropy(self, info): 81 | probs = info["prob"] 82 | return -np.sum(probs * np.log(probs + TINY), axis=1) 83 | 84 | def log_likelihood_sym(self, x_var, dist_info_vars): 85 | probs = dist_info_vars["prob"] 86 | ndims = probs.get_shape().ndims 87 | return tf.log(tf.reduce_sum(probs * tf.cast(x_var, tf.float32), ndims - 1) + TINY) 88 | 89 | def log_likelihood(self, xs, dist_info): 90 | probs = dist_info["prob"] 91 | # Assume layout is N * A 92 | return np.log(np.sum(probs * xs, axis=-1) + TINY) 93 | 94 | @property 95 | def dist_info_specs(self): 96 | return [("prob", (self.dim,))] 97 | 98 | def sample(self, dist_info): 99 | return self._f_sample(dist_info["prob"]) 100 | 101 | def sample_sym(self, dist_info): 102 | probs = dist_info["prob"] 103 | samples = tf.multinomial(tf.log(probs + 1e-8), num_samples=1)[:, 0] 104 | return tf.nn.embedding_lookup(np.eye(self.dim, dtype=np.float32), samples) 105 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import tensorflow as tf 5 | import numpy as np 6 | from sandbox.rocky.tf.distributions.base import Distribution 7 | 8 | 9 | class DiagonalGaussian(Distribution): 10 | def __init__(self, dim): 11 | self._dim = dim 12 | 13 | @property 14 | def dim(self): 15 | return self._dim 16 | 17 | def kl(self, old_dist_info, new_dist_info): 18 | old_means = old_dist_info["mean"] 19 | old_log_stds = old_dist_info["log_std"] 20 | new_means = new_dist_info["mean"] 21 | new_log_stds = new_dist_info["log_std"] 22 | """ 23 | Compute the KL divergence of two multivariate Gaussian distribution with 24 | diagonal covariance matrices 25 | """ 26 | old_std = np.exp(old_log_stds) 27 | new_std = np.exp(new_log_stds) 28 | # means: (N*A) 29 | # std: (N*A) 30 | # formula: 31 | # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) + 32 | # ln(\sigma_2/\sigma_1) 33 | numerator = np.square(old_means - new_means) + \ 34 | np.square(old_std) - np.square(new_std) 35 | denominator = 2 * np.square(new_std) + 1e-8 36 | return np.sum( 37 | numerator / denominator + new_log_stds - old_log_stds, axis=-1) 38 | # more lossy version 39 | # return TT.sum( 40 | # numerator 
/ denominator + TT.log(new_std) - TT.log(old_std ), axis=-1) 41 | 42 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 43 | old_means = old_dist_info_vars["mean"] 44 | old_log_stds = old_dist_info_vars["log_std"] 45 | new_means = new_dist_info_vars["mean"] 46 | new_log_stds = new_dist_info_vars["log_std"] 47 | """ 48 | Compute the KL divergence of two multivariate Gaussian distribution with 49 | diagonal covariance matrices 50 | """ 51 | old_std = tf.exp(old_log_stds) 52 | new_std = tf.exp(new_log_stds) 53 | # means: (N*A) 54 | # std: (N*A) 55 | # formula: 56 | # { (\mu_1 - \mu_2)^2 + \sigma_1^2 - \sigma_2^2 } / (2\sigma_2^2) + 57 | # ln(\sigma_2/\sigma_1) 58 | numerator = tf.square(old_means - new_means) + \ 59 | tf.square(old_std) - tf.square(new_std) 60 | denominator = 2 * tf.square(new_std) + 1e-8 61 | return tf.reduce_sum( 62 | numerator / denominator + new_log_stds - old_log_stds, axis=-1) 63 | 64 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 65 | logli_new = self.log_likelihood_sym(x_var, new_dist_info_vars) 66 | logli_old = self.log_likelihood_sym(x_var, old_dist_info_vars) 67 | return tf.exp(logli_new - logli_old) 68 | 69 | def log_likelihood_sym(self, x_var, dist_info_vars): 70 | means = dist_info_vars["mean"] 71 | log_stds = dist_info_vars["log_std"] 72 | zs = (x_var - means) / tf.exp(log_stds) 73 | return - tf.reduce_sum(log_stds, axis=-1) - \ 74 | 0.5 * tf.reduce_sum(tf.square(zs), axis=-1) - \ 75 | 0.5 * self.dim * np.log(2 * np.pi) 76 | 77 | def sample(self, dist_info): 78 | means = dist_info["mean"] 79 | log_stds = dist_info["log_std"] 80 | rnd = np.random.normal(size=means.shape) 81 | return rnd * np.exp(log_stds) + means 82 | 83 | def log_likelihood(self, xs, dist_info): 84 | means = dist_info["mean"] 85 | log_stds = dist_info["log_std"] 86 | zs = (xs - means) / np.exp(log_stds) 87 | return - np.sum(log_stds, axis=-1) - \ 88 | 0.5 * np.sum(np.square(zs), axis=-1) - \ 89 | 0.5 * self.dim * np.log(2 * np.pi) 90 | 91 | def entropy(self, dist_info): 92 | log_stds = dist_info["log_std"] 93 | return np.sum(log_stds + np.log(np.sqrt(2 * np.pi * np.e)), axis=-1) 94 | 95 | @property 96 | def dist_info_specs(self): 97 | return [("mean", (self.dim,)), ("log_std", (self.dim,))] 98 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/recurrent_categorical.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | from sandbox.rocky.tf.distributions.categorical import Categorical 4 | from sandbox.rocky.tf.distributions.base import Distribution 5 | 6 | TINY = 1e-8 7 | 8 | 9 | class RecurrentCategorical(Distribution): 10 | def __init__(self, dim): 11 | self._cat = Categorical(dim) 12 | self._dim = dim 13 | 14 | @property 15 | def dim(self): 16 | return self._dim 17 | 18 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 19 | """ 20 | Compute the symbolic KL divergence of two categorical distributions 21 | """ 22 | old_prob_var = old_dist_info_vars["prob"] 23 | new_prob_var = new_dist_info_vars["prob"] 24 | # Assume layout is N * T * A 25 | return tf.reduce_sum( 26 | old_prob_var * (tf.log(old_prob_var + TINY) - tf.log(new_prob_var + TINY)), 27 | axis=2 28 | ) 29 | 30 | def kl(self, old_dist_info, new_dist_info): 31 | """ 32 | Compute the KL divergence of two categorical distributions 33 | """ 34 | old_prob = old_dist_info["prob"] 35 | new_prob = new_dist_info["prob"] 36 | return np.sum( 37 | 
old_prob * (np.log(old_prob + TINY) - np.log(new_prob + TINY)), 38 | axis=2 39 | ) 40 | 41 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 42 | old_prob_var = old_dist_info_vars["prob"] 43 | new_prob_var = new_dist_info_vars["prob"] 44 | # Assume layout is N * T * A 45 | a_dim = tf.shape(x_var)[2] 46 | flat_ratios = self._cat.likelihood_ratio_sym( 47 | tf.reshape(x_var, tf.stack([-1, a_dim])), 48 | dict(prob=tf.reshape(old_prob_var, tf.stack([-1, a_dim]))), 49 | dict(prob=tf.reshape(new_prob_var, tf.stack([-1, a_dim]))) 50 | ) 51 | return tf.reshape(flat_ratios, tf.shape(old_prob_var)[:2]) 52 | 53 | def entropy(self, dist_info): 54 | probs = dist_info["prob"] 55 | return -np.sum(probs * np.log(probs + TINY), axis=2) 56 | 57 | def entropy_sym(self, dist_info_vars): 58 | probs = dist_info_vars["prob"] 59 | return -tf.reduce_sum(probs * tf.log(probs + TINY), 2) 60 | 61 | def log_likelihood_sym(self, xs, dist_info_vars): 62 | probs = dist_info_vars["prob"] 63 | # Assume layout is N * T * A 64 | a_dim = tf.shape(probs)[2] 65 | # a_dim = TT.printing.Print("lala")(a_dim) 66 | flat_logli = self._cat.log_likelihood_sym( 67 | tf.reshape(xs, tf.stack([-1, a_dim])), 68 | dict(prob=tf.reshape(probs, tf.stack((-1, a_dim)))) 69 | ) 70 | return tf.reshape(flat_logli, tf.shape(probs)[:2]) 71 | 72 | def log_likelihood(self, xs, dist_info): 73 | probs = dist_info["prob"] 74 | # Assume layout is N * T * A 75 | a_dim = tf.shape(probs)[2] 76 | flat_logli = self._cat.log_likelihood_sym( 77 | xs.reshape((-1, a_dim)), 78 | dict(prob=probs.reshape((-1, a_dim))) 79 | ) 80 | return flat_logli.reshape(probs.shape[:2]) 81 | 82 | @property 83 | def dist_info_specs(self): 84 | return [("prob", (self.dim,))] 85 | 86 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/recurrent_diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from sandbox.rocky.tf.distributions.diagonal_gaussian import DiagonalGaussian 5 | 6 | RecurrentDiagonalGaussian = DiagonalGaussian 7 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/envs/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/__pycache__/base.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/envs/__pycache__/base.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/__pycache__/parallel_vec_env_executor.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/envs/__pycache__/parallel_vec_env_executor.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/__pycache__/vec_env_executor.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/envs/__pycache__/vec_env_executor.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/base.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.proxy_env import ProxyEnv 2 | from rllab.envs.base import EnvSpec 3 | from rllab.spaces.box import Box as TheanoBox 4 | from rllab.spaces.discrete import Discrete as TheanoDiscrete 5 | from rllab.spaces.product import Product as TheanoProduct 6 | from sandbox.rocky.tf.spaces.discrete import Discrete 7 | from sandbox.rocky.tf.spaces.box import Box 8 | from sandbox.rocky.tf.spaces.product import Product 9 | from cached_property import cached_property 10 | 11 | 12 | def to_tf_space(space): 13 | if isinstance(space, TheanoBox): 14 | return Box(low=space.low, high=space.high) 15 | elif isinstance(space, TheanoDiscrete): 16 | return Discrete(space.n) 17 | elif isinstance(space, TheanoProduct): 18 | return Product(list(map(to_tf_space, space.components))) 19 | else: 20 | raise NotImplementedError 21 | 22 | 23 | class WrappedCls(object): 24 | def __init__(self, cls, env_cls, extra_kwargs): 25 | self.cls = cls 26 | self.env_cls = env_cls 27 | self.extra_kwargs = extra_kwargs 28 | 29 | def __call__(self, *args, **kwargs): 30 | return self.cls(self.env_cls(*args, **dict(self.extra_kwargs, **kwargs))) 31 | 32 | 33 | class TfEnv(ProxyEnv): 34 | @cached_property 35 | def observation_space(self): 36 | return to_tf_space(self.wrapped_env.observation_space) 37 | 38 | @cached_property 39 | def action_space(self): 40 | return to_tf_space(self.wrapped_env.action_space) 41 | 42 | @cached_property 43 | def spec(self): 44 | return EnvSpec( 45 | observation_space=self.observation_space, 46 | action_space=self.action_space, 47 | ) 48 | 49 | @property 50 | def vectorized(self): 51 | return getattr(self.wrapped_env, "vectorized", False) 52 | 53 | def vec_env_executor(self, n_envs, max_path_length): 54 | return VecTfEnv(self.wrapped_env.vec_env_executor(n_envs=n_envs, max_path_length=max_path_length)) 55 | 56 | @classmethod 57 | def wrap(cls, env_cls, **extra_kwargs): 58 | # Use a class wrapper rather than a lambda method for smoother serialization 59 | return WrappedCls(cls, env_cls, extra_kwargs) 60 | 61 | 62 | class VecTfEnv(object): 63 | 64 | def __init__(self, vec_env): 65 | self.vec_env = vec_env 66 | 67 | def reset(self): 68 | return self.vec_env.reset() 69 | 70 | @property 71 | def num_envs(self): 72 | return self.vec_env.num_envs 73 | 74 | def step(self, action_n): 75 | return self.vec_env.step(action_n) 76 | 77 | def terminate(self): 78 | self.vec_env.terminate() 79 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/vec_env_executor.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | import pickle as pickle 5 | from sandbox.rocky.tf.misc import tensor_utils 6 | 7 | 8 | class 
VecEnvExecutor(object): 9 | def __init__(self, envs, max_path_length): 10 | self.envs = envs 11 | self._action_space = envs[0].action_space 12 | self._observation_space = envs[0].observation_space 13 | self.ts = np.zeros(len(self.envs), dtype='int') 14 | self.max_path_length = max_path_length 15 | 16 | def step(self, action_n): 17 | all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)] 18 | obs, rewards, dones, env_infos = list(map(list, list(zip(*all_results)))) 19 | dones = np.asarray(dones) 20 | rewards = np.asarray(rewards) 21 | self.ts += 1 22 | if self.max_path_length is not None: 23 | dones[self.ts >= self.max_path_length] = True 24 | for (i, done) in enumerate(dones): 25 | if done: 26 | obs[i] = self.envs[i].reset() 27 | self.ts[i] = 0 28 | return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos) 29 | 30 | def reset(self): 31 | results = [env.reset() for env in self.envs] 32 | self.ts[:] = 0 33 | return results 34 | 35 | @property 36 | def num_envs(self): 37 | return len(self.envs) 38 | 39 | @property 40 | def action_space(self): 41 | return self._action_space 42 | 43 | @property 44 | def observation_space(self): 45 | return self._observation_space 46 | 47 | def terminate(self): 48 | pass 49 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/trpo_cartpole.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.algos.trpo import TRPO 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 6 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp 7 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 8 | from sandbox.rocky.tf.envs.base import TfEnv 9 | from rllab.misc.instrument import stub, run_experiment_lite 10 | 11 | env = TfEnv(normalize(CartpoleEnv())) 12 | 13 | policy = GaussianMLPPolicy( 14 | name="policy", 15 | env_spec=env.spec, 16 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
17 | hidden_sizes=(32, 32) 18 | ) 19 | 20 | baseline = LinearFeatureBaseline(env_spec=env.spec) 21 | 22 | algo = TRPO( 23 | env=env, 24 | policy=policy, 25 | baseline=baseline, 26 | batch_size=4000, 27 | max_path_length=100, 28 | n_itr=40, 29 | discount=0.99, 30 | step_size=0.01, 31 | # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) 32 | 33 | ) 34 | algo.train() 35 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/trpo_cartpole_recurrent.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.algos.trpo import TRPO 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from sandbox.rocky.tf.policies.gaussian_gru_policy import GaussianGRUPolicy 6 | from sandbox.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy 7 | from sandbox.rocky.tf.envs.base import TfEnv 8 | import sandbox.rocky.tf.core.layers as L 9 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp 10 | from rllab.misc.instrument import stub, run_experiment_lite 11 | 12 | env = TfEnv(normalize(CartpoleEnv())) 13 | 14 | policy = GaussianLSTMPolicy( 15 | name="policy", 16 | env_spec=env.spec, 17 | lstm_layer_cls=L.TfBasicLSTMLayer, 18 | # gru_layer_cls=L.GRULayer, 19 | ) 20 | 21 | baseline = LinearFeatureBaseline(env_spec=env.spec) 22 | 23 | algo = TRPO( 24 | env=env, 25 | policy=policy, 26 | baseline=baseline, 27 | batch_size=4000, 28 | max_path_length=100, 29 | n_itr=10, 30 | discount=0.99, 31 | step_size=0.01, 32 | optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) 33 | ) 34 | algo.train() 35 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/vpg_cartpole.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.algos.vpg import VPG 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 6 | from sandbox.rocky.tf.envs.base import TfEnv 7 | from rllab.misc.instrument import stub, run_experiment_lite 8 | 9 | env = TfEnv(normalize(CartpoleEnv())) 10 | 11 | policy = GaussianMLPPolicy( 12 | name="policy", 13 | env_spec=env.spec, 14 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
15 | hidden_sizes=(32, 32) 16 | ) 17 | 18 | baseline = LinearFeatureBaseline(env_spec=env.spec) 19 | 20 | algo = VPG( 21 | env=env, 22 | policy=policy, 23 | baseline=baseline, 24 | batch_size=10000, 25 | max_path_length=100, 26 | n_itr=40, 27 | discount=0.99, 28 | optimizer_args=dict( 29 | tf_optimizer_args=dict( 30 | learning_rate=0.01, 31 | ) 32 | ) 33 | ) 34 | algo.train() 35 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/misc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/misc/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/misc/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/misc/__pycache__/tensor_utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/misc/__pycache__/tensor_utils.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/misc/tensor_utils.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | 5 | def compile_function(inputs, outputs, log_name=None): 6 | def run(*input_vals): 7 | sess = tf.get_default_session() 8 | return sess.run(outputs, feed_dict=dict(list(zip(inputs, input_vals)))) 9 | 10 | return run 11 | 12 | 13 | def flatten_tensor_variables(ts): 14 | return tf.concat(axis=0, values=[tf.reshape(x, [-1]) for x in ts]) 15 | 16 | 17 | def unflatten_tensor_variables(flatarr, shapes, symb_arrs): 18 | arrs = [] 19 | n = 0 20 | for (shape, symb_arr) in zip(shapes, symb_arrs): 21 | size = np.prod(list(shape)) 22 | arr = tf.reshape(flatarr[n:n + size], shape) 23 | arrs.append(arr) 24 | n += size 25 | return arrs 26 | 27 | 28 | def new_tensor(name, ndim, dtype): 29 | return tf.placeholder(dtype=dtype, shape=[None] * ndim, name=name) 30 | 31 | 32 | def new_tensor_like(name, arr_like): 33 | return new_tensor(name, arr_like.get_shape().ndims, arr_like.dtype.base_dtype) 34 | 35 | 36 | def concat_tensor_list(tensor_list): 37 | return np.concatenate(tensor_list, axis=0) 38 | 39 | 40 | def concat_tensor_dict_list(tensor_dict_list): 41 | keys = list(tensor_dict_list[0].keys()) 42 | ret = dict() 43 | for k in keys: 44 | example = tensor_dict_list[0][k] 45 | if isinstance(example, dict): 46 | v = concat_tensor_dict_list([x[k] for x in tensor_dict_list]) 47 | else: 48 | v = concat_tensor_list([x[k] for x in tensor_dict_list]) 49 | ret[k] = v 50 | return ret 51 | 52 | 53 | def stack_tensor_list(tensor_list): 54 | return np.array(tensor_list) 55 | # tensor_shape = np.array(tensor_list[0]).shape 56 | # if tensor_shape is tuple(): 57 | # return np.array(tensor_list) 58 | # return np.vstack(tensor_list) 59 | 60 | 61 | def stack_tensor_dict_list(tensor_dict_list): 62 | """ 63 | Stack a list of dictionaries of {tensors or dictionary of tensors}. 64 | :param tensor_dict_list: a list of dictionaries of {tensors or dictionary of tensors}. 
65 | :return: a dictionary of {stacked tensors or dictionary of stacked tensors} 66 | """ 67 | keys = list(tensor_dict_list[0].keys()) 68 | ret = dict() 69 | for k in keys: 70 | example = tensor_dict_list[0][k] 71 | if isinstance(example, dict): 72 | v = stack_tensor_dict_list([x[k] for x in tensor_dict_list]) 73 | else: 74 | v = stack_tensor_list([x[k] for x in tensor_dict_list]) 75 | ret[k] = v 76 | return ret 77 | 78 | 79 | def split_tensor_dict_list(tensor_dict): 80 | keys = list(tensor_dict.keys()) 81 | ret = None 82 | for k in keys: 83 | vals = tensor_dict[k] 84 | if isinstance(vals, dict): 85 | vals = split_tensor_dict_list(vals) 86 | if ret is None: 87 | ret = [{k: v} for v in vals] 88 | else: 89 | for v, cur_dict in zip(vals, ret): 90 | cur_dict[k] = v 91 | return ret 92 | 93 | 94 | def to_onehot_sym(inds, dim): 95 | return tf.one_hot(inds, depth=dim, on_value=1, off_value=0) 96 | 97 | 98 | def pad_tensor(x, max_len): 99 | return np.concatenate([ 100 | x, 101 | np.tile(np.zeros_like(x[0]), (max_len - len(x),) + (1,) * np.ndim(x[0])) 102 | ]) 103 | 104 | 105 | def pad_tensor_n(xs, max_len): 106 | ret = np.zeros((len(xs), max_len) + xs[0].shape[1:], dtype=xs[0].dtype) 107 | for idx, x in enumerate(xs): 108 | ret[idx][:len(x)] = x 109 | return ret 110 | 111 | 112 | def pad_tensor_dict(tensor_dict, max_len): 113 | keys = list(tensor_dict.keys()) 114 | ret = dict() 115 | for k in keys: 116 | if isinstance(tensor_dict[k], dict): 117 | ret[k] = pad_tensor_dict(tensor_dict[k], max_len) 118 | else: 119 | ret[k] = pad_tensor(tensor_dict[k], max_len) 120 | return ret 121 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/optimizers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/__pycache__/conjugate_gradient_optimizer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/optimizers/__pycache__/conjugate_gradient_optimizer.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/__pycache__/penalty_lbfgs_optimizer.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/optimizers/__pycache__/penalty_lbfgs_optimizer.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/first_order_optimizer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.misc import ext 5 | from rllab.misc import logger 6 | from rllab.core.serializable import Serializable 7 | from sandbox.rocky.tf.misc import tensor_utils 8 | # from rllab.algo.first_order_method 
import parse_update_method 9 | from rllab.optimizers.minibatch_dataset import BatchDataset 10 | from collections import OrderedDict 11 | import tensorflow as tf 12 | import time 13 | from functools import partial 14 | import pyprind 15 | 16 | 17 | class FirstOrderOptimizer(Serializable): 18 | """ 19 | Performs (stochastic) gradient descent, possibly using fancier methods like adam etc. 20 | """ 21 | 22 | def __init__( 23 | self, 24 | tf_optimizer_cls=None, 25 | tf_optimizer_args=None, 26 | # learning_rate=1e-3, 27 | max_epochs=1000, 28 | tolerance=1e-6, 29 | batch_size=32, 30 | callback=None, 31 | verbose=False, 32 | **kwargs): 33 | """ 34 | 35 | :param max_epochs: 36 | :param tolerance: 37 | :param update_method: 38 | :param batch_size: None or an integer. If None the whole dataset will be used. 39 | :param callback: 40 | :param kwargs: 41 | :return: 42 | """ 43 | Serializable.quick_init(self, locals()) 44 | self._opt_fun = None 45 | self._target = None 46 | self._callback = callback 47 | if tf_optimizer_cls is None: 48 | tf_optimizer_cls = tf.train.AdamOptimizer 49 | if tf_optimizer_args is None: 50 | tf_optimizer_args = dict(learning_rate=1e-3) 51 | self._tf_optimizer = tf_optimizer_cls(**tf_optimizer_args) 52 | self._max_epochs = max_epochs 53 | self._tolerance = tolerance 54 | self._batch_size = batch_size 55 | self._verbose = verbose 56 | self._input_vars = None 57 | self._train_op = None 58 | 59 | def update_opt(self, loss, target, inputs, extra_inputs=None, **kwargs): 60 | """ 61 | :param loss: Symbolic expression for the loss function. 62 | :param target: A parameterized object to optimize over. It should implement methods of the 63 | :class:`rllab.core.paramerized.Parameterized` class. 64 | :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon. 65 | :param inputs: A list of symbolic variables as inputs 66 | :return: No return value. 
67 | """ 68 | 69 | self._target = target 70 | 71 | self._train_op = self._tf_optimizer.minimize(loss, var_list=target.get_params(trainable=True)) 72 | 73 | # updates = OrderedDict([(k, v.astype(k.dtype)) for k, v in updates.iteritems()]) 74 | 75 | if extra_inputs is None: 76 | extra_inputs = list() 77 | self._input_vars = inputs + extra_inputs 78 | self._opt_fun = ext.lazydict( 79 | f_loss=lambda: tensor_utils.compile_function(inputs + extra_inputs, loss), 80 | ) 81 | 82 | def loss(self, inputs, extra_inputs=None): 83 | if extra_inputs is None: 84 | extra_inputs = tuple() 85 | return self._opt_fun["f_loss"](*(tuple(inputs) + extra_inputs)) 86 | 87 | def optimize(self, inputs, extra_inputs=None, callback=None): 88 | 89 | if len(inputs) == 0: 90 | # Assumes that we should always sample mini-batches 91 | raise NotImplementedError 92 | 93 | f_loss = self._opt_fun["f_loss"] 94 | 95 | if extra_inputs is None: 96 | extra_inputs = tuple() 97 | 98 | last_loss = f_loss(*(tuple(inputs) + extra_inputs)) 99 | 100 | start_time = time.time() 101 | 102 | dataset = BatchDataset(inputs, self._batch_size, extra_inputs=extra_inputs) 103 | 104 | sess = tf.get_default_session() 105 | 106 | for epoch in range(self._max_epochs): 107 | if self._verbose: 108 | logger.log("Epoch %d" % (epoch)) 109 | progbar = pyprind.ProgBar(len(inputs[0])) 110 | 111 | for batch in dataset.iterate(update=True): 112 | sess.run(self._train_op, dict(list(zip(self._input_vars, batch)))) 113 | if self._verbose: 114 | progbar.update(len(batch[0])) 115 | 116 | if self._verbose: 117 | if progbar.active: 118 | progbar.stop() 119 | 120 | new_loss = f_loss(*(tuple(inputs) + extra_inputs)) 121 | 122 | if self._verbose: 123 | logger.log("Epoch: %d | Loss: %f" % (epoch, new_loss)) 124 | if self._callback or callback: 125 | elapsed = time.time() - start_time 126 | callback_args = dict( 127 | loss=new_loss, 128 | params=self._target.get_param_values(trainable=True) if self._target else None, 129 | itr=epoch, 130 | elapsed=elapsed, 131 | ) 132 | if self._callback: 133 | self._callback(callback_args) 134 | if callback: 135 | callback(**callback_args) 136 | 137 | if abs(last_loss - new_loss) < self._tolerance: 138 | break 139 | last_loss = new_loss 140 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/lbfgs_optimizer.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from rllab.misc import ext 4 | from sandbox.rocky.tf.misc import tensor_utils 5 | from rllab.core.serializable import Serializable 6 | import tensorflow as tf 7 | import scipy.optimize 8 | import time 9 | 10 | 11 | class LbfgsOptimizer(Serializable): 12 | """ 13 | Performs unconstrained optimization via L-BFGS. 14 | """ 15 | 16 | def __init__(self, name, max_opt_itr=20, callback=None): 17 | Serializable.quick_init(self, locals()) 18 | self._name = name 19 | self._max_opt_itr = max_opt_itr 20 | self._opt_fun = None 21 | self._target = None 22 | self._callback = callback 23 | 24 | def update_opt(self, loss, target, inputs, extra_inputs=None, *args, **kwargs): 25 | """ 26 | :param loss: Symbolic expression for the loss function. 27 | :param target: A parameterized object to optimize over. It should implement methods of the 28 | :class:`rllab.core.paramerized.Parameterized` class. 29 | :param leq_constraint: A constraint provided as a tuple (f, epsilon), of the form f(*inputs) <= epsilon. 30 | :param inputs: A list of symbolic variables as inputs 31 | :return: No return value. 
32 | """ 33 | 34 | self._target = target 35 | 36 | def get_opt_output(): 37 | flat_grad = tensor_utils.flatten_tensor_variables(tf.gradients(loss, target.get_params(trainable=True))) 38 | return [tf.cast(loss, tf.float64), tf.cast(flat_grad, tf.float64)] 39 | 40 | if extra_inputs is None: 41 | extra_inputs = list() 42 | 43 | self._opt_fun = ext.lazydict( 44 | f_loss=lambda: tensor_utils.compile_function(inputs + extra_inputs, loss), 45 | f_opt=lambda: tensor_utils.compile_function( 46 | inputs=inputs + extra_inputs, 47 | outputs=get_opt_output(), 48 | ) 49 | ) 50 | 51 | def loss(self, inputs, extra_inputs=None): 52 | if extra_inputs is None: 53 | extra_inputs = list() 54 | return self._opt_fun["f_loss"](*(list(inputs) + list(extra_inputs))) 55 | 56 | def optimize(self, inputs, extra_inputs=None): 57 | f_opt = self._opt_fun["f_opt"] 58 | 59 | if extra_inputs is None: 60 | extra_inputs = list() 61 | 62 | def f_opt_wrapper(flat_params): 63 | self._target.set_param_values(flat_params, trainable=True) 64 | ret = f_opt(*inputs) 65 | return ret 66 | 67 | itr = [0] 68 | start_time = time.time() 69 | 70 | if self._callback: 71 | def opt_callback(params): 72 | loss = self._opt_fun["f_loss"](*(inputs + extra_inputs)) 73 | elapsed = time.time() - start_time 74 | self._callback(dict( 75 | loss=loss, 76 | params=params, 77 | itr=itr[0], 78 | elapsed=elapsed, 79 | )) 80 | itr[0] += 1 81 | else: 82 | opt_callback = None 83 | 84 | scipy.optimize.fmin_l_bfgs_b( 85 | func=f_opt_wrapper, x0=self._target.get_param_values(trainable=True), 86 | maxiter=self._max_opt_itr, callback=opt_callback, 87 | ) 88 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/policies/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/__pycache__/base.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/policies/__pycache__/base.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/__pycache__/gaussian_mlp_inverse_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/policies/__pycache__/gaussian_mlp_inverse_policy.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/__pycache__/gaussian_mlp_policy.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/policies/__pycache__/gaussian_mlp_policy.cpython-35.pyc 
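Like FirstOrderOptimizer above, LbfgsOptimizer follows the same three-call contract: update_opt(loss, target, inputs) compiles the loss (and, here, its flattened gradient) against a Parameterized target, after which loss(inputs) and optimize(inputs) operate on concrete numpy arrays. The sketch below is a hypothetical usage example rather than repository code; it assumes the MLP network class and TF1/rllab stack used elsewhere in this dump, the TinyRegressor name is invented, and the mean-squared-error loss is supplied by the example itself.

import numpy as np
import tensorflow as tf
import sandbox.rocky.tf.core.layers as L
from sandbox.rocky.tf.core.layers_powered import LayersPowered
from sandbox.rocky.tf.core.network import MLP
from sandbox.rocky.tf.optimizers.lbfgs_optimizer import LbfgsOptimizer


class TinyRegressor(LayersPowered):
    # Hypothetical regression target exposing its input/output tensors
    # so a symbolic loss can be built against it.
    def __init__(self):
        net = MLP(name="tiny_regressor", input_shape=(1,), output_dim=1,
                  hidden_sizes=(16,), hidden_nonlinearity=tf.nn.tanh,
                  output_nonlinearity=None)
        self.x_var = net.input_layer.input_var
        self.y_var = tf.placeholder(tf.float32, shape=(None, 1), name="y")
        self.y_hat = L.get_output(net.output_layer)
        LayersPowered.__init__(self, [net.output_layer])


with tf.Session() as sess:
    model = TinyRegressor()
    loss = tf.reduce_mean(tf.square(model.y_hat - model.y_var))

    optimizer = LbfgsOptimizer(name="lbfgs", max_opt_itr=20)
    # Compile the loss and its flat gradient once, against the target's
    # trainable parameters and the symbolic inputs.
    optimizer.update_opt(loss=loss, target=model,
                         inputs=[model.x_var, model.y_var])

    sess.run(tf.global_variables_initializer())
    xs = np.linspace(-1.0, 1.0, 64).reshape(-1, 1).astype(np.float32)
    ys = np.sin(3 * xs)
    print("loss before:", optimizer.loss([xs, ys]))
    optimizer.optimize([xs, ys])
    print("loss after:", optimizer.loss([xs, ys]))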
-------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/base.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from sandbox.rocky.tf.core.parameterized import Parameterized 5 | 6 | 7 | class Policy(Parameterized): 8 | def __init__(self, env_spec): 9 | Parameterized.__init__(self) 10 | self._env_spec = env_spec 11 | 12 | # Should be implemented by all policies 13 | 14 | def get_action(self, observation): 15 | raise NotImplementedError 16 | 17 | def get_actions(self, observations): 18 | raise NotImplementedError 19 | 20 | def reset(self, dones=None): 21 | pass 22 | 23 | @property 24 | def vectorized(self): 25 | """ 26 | Indicates whether the policy is vectorized. If True, it should implement get_actions(), and support resetting 27 | with multiple simultaneous states. 28 | """ 29 | return False 30 | 31 | @property 32 | def observation_space(self): 33 | return self._env_spec.observation_space 34 | 35 | @property 36 | def action_space(self): 37 | return self._env_spec.action_space 38 | 39 | @property 40 | def env_spec(self): 41 | return self._env_spec 42 | 43 | @property 44 | def recurrent(self): 45 | """ 46 | Indicates whether the policy is recurrent. 47 | :return: 48 | """ 49 | return False 50 | 51 | def log_diagnostics(self, paths): 52 | """ 53 | Log extra information per iteration based on the collected paths 54 | """ 55 | pass 56 | 57 | @property 58 | def state_info_keys(self): 59 | """ 60 | Return keys for the information related to the policy's state when taking an action. 61 | :return: 62 | """ 63 | return [k for k, _ in self.state_info_specs] 64 | 65 | @property 66 | def state_info_specs(self): 67 | """ 68 | Return keys and shapes for the information related to the policy's state when taking an action. 69 | :return: 70 | """ 71 | return list() 72 | 73 | def terminate(self): 74 | """ 75 | Clean up operation 76 | """ 77 | pass 78 | 79 | 80 | class StochasticPolicy(Policy): 81 | @property 82 | def distribution(self): 83 | """ 84 | :rtype Distribution 85 | """ 86 | raise NotImplementedError 87 | 88 | def dist_info_sym(self, obs_var, state_info_vars): 89 | """ 90 | Return the symbolic distribution information about the actions. 91 | :param obs_var: symbolic variable for observations 92 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at 93 | the time it received the observation 94 | :return: 95 | """ 96 | raise NotImplementedError 97 | 98 | def dist_info(self, obs, state_infos): 99 | """ 100 | Return the distribution information about the actions. 
101 | :param obs_var: observation values 102 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at 103 | the time it received the observation 104 | :return: 105 | """ 106 | raise NotImplementedError 107 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/categorical_conv_policy.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 2 | import sandbox.rocky.tf.core.layers as L 3 | from sandbox.rocky.tf.core.network import ConvNetwork 4 | from rllab.core.serializable import Serializable 5 | from sandbox.rocky.tf.distributions.categorical import Categorical 6 | from sandbox.rocky.tf.policies.base import StochasticPolicy 7 | from rllab.misc import ext 8 | from sandbox.rocky.tf.misc import tensor_utils 9 | from rllab.misc.overrides import overrides 10 | from sandbox.rocky.tf.spaces.discrete import Discrete 11 | import tensorflow as tf 12 | 13 | 14 | class CategoricalConvPolicy(StochasticPolicy, LayersPowered, Serializable): 15 | def __init__( 16 | self, 17 | name, 18 | env_spec, 19 | conv_filters, conv_filter_sizes, conv_strides, conv_pads, 20 | hidden_sizes=[], 21 | hidden_nonlinearity=tf.nn.relu, 22 | output_nonlinearity=tf.nn.softmax, 23 | prob_network=None, 24 | ): 25 | """ 26 | :param env_spec: A spec for the mdp. 27 | :param hidden_sizes: list of sizes for the fully connected hidden layers 28 | :param hidden_nonlinearity: nonlinearity used for each hidden layer 29 | :param prob_network: manually specified network for this policy, other network params 30 | are ignored 31 | :return: 32 | """ 33 | Serializable.quick_init(self, locals()) 34 | 35 | assert isinstance(env_spec.action_space, Discrete) 36 | 37 | self._env_spec = env_spec 38 | # import pdb; pdb.set_trace() 39 | if prob_network is None: 40 | prob_network = ConvNetwork( 41 | input_shape=env_spec.observation_space.shape, 42 | output_dim=env_spec.action_space.n, 43 | conv_filters=conv_filters, 44 | conv_filter_sizes=conv_filter_sizes, 45 | conv_strides=conv_strides, 46 | conv_pads=conv_pads, 47 | hidden_sizes=hidden_sizes, 48 | hidden_nonlinearity=hidden_nonlinearity, 49 | output_nonlinearity=output_nonlinearity, 50 | name="prob_network", 51 | ) 52 | 53 | self._l_prob = prob_network.output_layer 54 | self._l_obs = prob_network.input_layer 55 | self._f_prob = tensor_utils.compile_function( 56 | [prob_network.input_layer.input_var], 57 | L.get_output(prob_network.output_layer) 58 | ) 59 | 60 | self._dist = Categorical(env_spec.action_space.n) 61 | 62 | super(CategoricalConvPolicy, self).__init__(env_spec) 63 | LayersPowered.__init__(self, [prob_network.output_layer]) 64 | 65 | @property 66 | def vectorized(self): 67 | return True 68 | 69 | @overrides 70 | def dist_info_sym(self, obs_var, state_info_vars=None): 71 | return dict(prob=L.get_output(self._l_prob, {self._l_obs: tf.cast(obs_var, tf.float32)})) 72 | 73 | @overrides 74 | def dist_info(self, obs, state_infos=None): 75 | return dict(prob=self._f_prob(obs)) 76 | 77 | # The return value is a pair. The first item is a matrix (N, A), where each 78 | # entry corresponds to the action value taken. 
The second item is a vector 79 | # of length N, where each entry is the density value for that action, under 80 | # the current policy 81 | @overrides 82 | def get_action(self, observation): 83 | flat_obs = self.observation_space.flatten(observation) 84 | prob = self._f_prob([flat_obs])[0] 85 | action = self.action_space.weighted_sample(prob) 86 | return action, dict(prob=prob) 87 | 88 | def get_actions(self, observations): 89 | flat_obs = self.observation_space.flatten_n(observations) 90 | probs = self._f_prob(flat_obs) 91 | actions = list(map(self.action_space.weighted_sample, probs)) 92 | return actions, dict(prob=probs) 93 | 94 | @property 95 | def distribution(self): 96 | return self._dist 97 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/categorical_mlp_policy.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 2 | import sandbox.rocky.tf.core.layers as L 3 | from sandbox.rocky.tf.core.network import MLP 4 | from rllab.core.serializable import Serializable 5 | from sandbox.rocky.tf.distributions.categorical import Categorical 6 | from sandbox.rocky.tf.policies.base import StochasticPolicy 7 | from rllab.misc import ext 8 | from sandbox.rocky.tf.misc import tensor_utils 9 | from rllab.misc.overrides import overrides 10 | from sandbox.rocky.tf.spaces.discrete import Discrete 11 | import tensorflow as tf 12 | 13 | 14 | class CategoricalMLPPolicy(StochasticPolicy, LayersPowered, Serializable): 15 | def __init__( 16 | self, 17 | name, 18 | env_spec, 19 | hidden_sizes=(32, 32), 20 | hidden_nonlinearity=tf.nn.tanh, 21 | prob_network=None, 22 | ): 23 | """ 24 | :param env_spec: A spec for the mdp. 25 | :param hidden_sizes: list of sizes for the fully connected hidden layers 26 | :param hidden_nonlinearity: nonlinearity used for each hidden layer 27 | :param prob_network: manually specified network for this policy, other network params 28 | are ignored 29 | :return: 30 | """ 31 | Serializable.quick_init(self, locals()) 32 | 33 | assert isinstance(env_spec.action_space, Discrete) 34 | 35 | with tf.variable_scope(name): 36 | if prob_network is None: 37 | prob_network = MLP( 38 | input_shape=(env_spec.observation_space.flat_dim,), 39 | output_dim=env_spec.action_space.n, 40 | hidden_sizes=hidden_sizes, 41 | hidden_nonlinearity=hidden_nonlinearity, 42 | output_nonlinearity=tf.nn.softmax, 43 | name="prob_network", 44 | ) 45 | 46 | self._l_prob = prob_network.output_layer 47 | self._l_obs = prob_network.input_layer 48 | self._f_prob = tensor_utils.compile_function( 49 | [prob_network.input_layer.input_var], 50 | L.get_output(prob_network.output_layer) 51 | ) 52 | 53 | self._dist = Categorical(env_spec.action_space.n) 54 | 55 | super(CategoricalMLPPolicy, self).__init__(env_spec) 56 | LayersPowered.__init__(self, [prob_network.output_layer]) 57 | 58 | @property 59 | def vectorized(self): 60 | return True 61 | 62 | @overrides 63 | def dist_info_sym(self, obs_var, state_info_vars=None): 64 | return dict(prob=L.get_output(self._l_prob, {self._l_obs: tf.cast(obs_var, tf.float32)})) 65 | 66 | @overrides 67 | def dist_info(self, obs, state_infos=None): 68 | return dict(prob=self._f_prob(obs)) 69 | 70 | # The return value is a pair. The first item is a matrix (N, A), where each 71 | # entry corresponds to the action value taken. 
The second item is a vector 72 | # of length N, where each entry is the density value for that action, under 73 | # the current policy 74 | @overrides 75 | def get_action(self, observation): 76 | flat_obs = self.observation_space.flatten(observation) 77 | prob = self._f_prob([flat_obs])[0] 78 | action = self.action_space.weighted_sample(prob) 79 | return action, dict(prob=prob) 80 | 81 | def get_actions(self, observations): 82 | flat_obs = self.observation_space.flatten_n(observations) 83 | probs = self._f_prob(flat_obs) 84 | actions = list(map(self.action_space.weighted_sample, probs)) 85 | return actions, dict(prob=probs) 86 | 87 | @property 88 | def distribution(self): 89 | return self._dist 90 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/deterministic_mlp_policy.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | from rllab.misc import ext 3 | from rllab.misc.overrides import overrides 4 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 5 | from sandbox.rocky.tf.core.network import MLP 6 | from sandbox.rocky.tf.distributions.categorical import Categorical 7 | from sandbox.rocky.tf.policies.base import Policy 8 | from sandbox.rocky.tf.misc import tensor_utils 9 | 10 | import sandbox.rocky.tf.core.layers as L 11 | from sandbox.rocky.tf.core.layers import batch_norm 12 | 13 | from sandbox.rocky.tf.spaces.discrete import Discrete 14 | import tensorflow as tf 15 | 16 | 17 | class DeterministicMLPPolicy(Policy, LayersPowered, Serializable): 18 | def __init__( 19 | self, 20 | name, 21 | env_spec, 22 | hidden_sizes=(32, 32), 23 | hidden_nonlinearity=tf.nn.relu, 24 | output_nonlinearity=tf.nn.tanh, 25 | prob_network=None, 26 | bn=False): 27 | Serializable.quick_init(self, locals()) 28 | 29 | with tf.variable_scope(name): 30 | if prob_network is None: 31 | prob_network = MLP( 32 | input_shape=(env_spec.observation_space.flat_dim,), 33 | output_dim=env_spec.action_space.flat_dim, 34 | hidden_sizes=hidden_sizes, 35 | hidden_nonlinearity=hidden_nonlinearity, 36 | output_nonlinearity=output_nonlinearity, 37 | # batch_normalization=True, 38 | name="prob_network", 39 | ) 40 | 41 | self._l_prob = prob_network.output_layer 42 | self._l_obs = prob_network.input_layer 43 | self._f_prob = tensor_utils.compile_function( 44 | [prob_network.input_layer.input_var], 45 | L.get_output(prob_network.output_layer, deterministic=True) 46 | ) 47 | 48 | self.prob_network = prob_network 49 | 50 | # Note the deterministic=True argument. It makes sure that when getting 51 | # actions from single observations, we do not update params in the 52 | # batch normalization layers. 
53 | # TODO: this doesn't currently work properly in the tf version so we leave out batch_norm 54 | super(DeterministicMLPPolicy, self).__init__(env_spec) 55 | LayersPowered.__init__(self, [prob_network.output_layer]) 56 | 57 | @property 58 | def vectorized(self): 59 | return True 60 | 61 | @overrides 62 | def get_action(self, observation): 63 | flat_obs = self.observation_space.flatten(observation) 64 | action = self._f_prob([flat_obs])[0] 65 | return action, dict() 66 | 67 | @overrides 68 | def get_actions(self, observations): 69 | flat_obs = self.observation_space.flatten_n(observations) 70 | actions = self._f_prob(flat_obs) 71 | return actions, dict() 72 | 73 | def get_action_sym(self, obs_var): 74 | return L.get_output(self.prob_network.output_layer, obs_var) 75 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/uniform_control_policy.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.policies.base import Policy 2 | from rllab.core.serializable import Serializable 3 | 4 | 5 | class UniformControlPolicy(Policy, Serializable): 6 | def __init__( 7 | self, 8 | env_spec, 9 | ): 10 | Serializable.quick_init(self, locals()) 11 | super(UniformControlPolicy, self).__init__(env_spec=env_spec) 12 | 13 | @property 14 | def vectorized(self): 15 | return True 16 | 17 | def get_action(self, observation): 18 | return self.action_space.sample(), dict() 19 | 20 | def get_actions(self, observations): 21 | return self.action_space.sample_n(len(observations)), dict() 22 | 23 | def get_params_internal(self, **tags): 24 | return [] 25 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/q_functions/base.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.core.parameterized import Parameterized 2 | 3 | class QFunction(Parameterized): 4 | pass 5 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/q_functions/continuous_mlp_q_function.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.q_functions.base import QFunction 2 | from rllab.core.serializable import Serializable 3 | from rllab.misc import ext 4 | 5 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 6 | from sandbox.rocky.tf.core.network import MLP 7 | from sandbox.rocky.tf.core.layers import batch_norm 8 | from sandbox.rocky.tf.distributions.categorical import Categorical 9 | from sandbox.rocky.tf.policies.base import StochasticPolicy 10 | from sandbox.rocky.tf.misc import tensor_utils 11 | 12 | import tensorflow as tf 13 | import sandbox.rocky.tf.core.layers as L 14 | 15 | 16 | class ContinuousMLPQFunction(QFunction, LayersPowered, Serializable): 17 | def __init__( 18 | self, 19 | env_spec, 20 | hidden_sizes=(32, 32), 21 | hidden_nonlinearity=tf.nn.relu, 22 | action_merge_layer=-2, 23 | output_nonlinearity=None, 24 | bn=False): 25 | Serializable.quick_init(self, locals()) 26 | 27 | l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim), name="obs") 28 | l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim), name="actions") 29 | 30 | n_layers = len(hidden_sizes) + 1 31 | 32 | if n_layers > 1: 33 | action_merge_layer = \ 34 | (action_merge_layer % n_layers + n_layers) % n_layers 35 | else: 36 | action_merge_layer = 1 37 | 38 | l_hidden = l_obs 39 | 40 | for idx, size in 
enumerate(hidden_sizes): 41 | if bn: 42 | l_hidden = batch_norm(l_hidden) 43 | 44 | if idx == action_merge_layer: 45 | l_hidden = L.ConcatLayer([l_hidden, l_action]) 46 | 47 | l_hidden = L.DenseLayer( 48 | l_hidden, 49 | num_units=size, 50 | nonlinearity=hidden_nonlinearity, 51 | name="h%d" % (idx + 1) 52 | ) 53 | 54 | if action_merge_layer == n_layers: 55 | l_hidden = L.ConcatLayer([l_hidden, l_action]) 56 | 57 | l_output = L.DenseLayer( 58 | l_hidden, 59 | num_units=1, 60 | nonlinearity=output_nonlinearity, 61 | name="output" 62 | ) 63 | 64 | output_var = L.get_output(l_output, deterministic=True) 65 | 66 | self._f_qval = tensor_utils.compile_function([l_obs.input_var, l_action.input_var], output_var) 67 | self._output_layer = l_output 68 | self._obs_layer = l_obs 69 | self._action_layer = l_action 70 | self._output_nonlinearity = output_nonlinearity 71 | 72 | LayersPowered.__init__(self, [l_output]) 73 | 74 | def get_qval(self, observations, actions): 75 | return self._f_qval(observations, actions) 76 | 77 | def get_qval_sym(self, obs_var, action_var, **kwargs): 78 | qvals = L.get_output( 79 | self._output_layer, 80 | {self._obs_layer: obs_var, self._action_layer: action_var}, 81 | **kwargs 82 | ) 83 | return tf.reshape(qvals, (-1,)) 84 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/regressors/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/regressors/deterministic_mlp_regressor.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | import numpy as np 8 | 9 | import tensorflow as tf 10 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 11 | from sandbox.rocky.tf.core.network import MLP 12 | from sandbox.rocky.tf.misc import tensor_utils 13 | from sandbox.rocky.tf.distributions.categorical import Categorical 14 | from sandbox.rocky.tf.optimizers.penalty_lbfgs_optimizer import PenaltyLbfgsOptimizer 15 | from sandbox.rocky.tf.optimizers.lbfgs_optimizer import LbfgsOptimizer 16 | import sandbox.rocky.tf.core.layers as L 17 | from rllab.core.serializable import Serializable 18 | from rllab.misc import ext 19 | from rllab.misc import logger 20 | 21 | NONE = list() 22 | 23 | 24 | class DeterministicMLPRegressor(LayersPowered, Serializable): 25 | """ 26 | A class for performing nonlinear regression. 27 | """ 28 | 29 | def __init__( 30 | self, 31 | name, 32 | input_shape, 33 | output_dim, 34 | network=None, 35 | hidden_sizes=(32, 32), 36 | hidden_nonlinearity=tf.nn.tanh, 37 | output_nonlinearity=None, 38 | optimizer=None, 39 | normalize_inputs=True, 40 | ): 41 | """ 42 | :param input_shape: Shape of the input data. 43 | :param output_dim: Dimension of output. 44 | :param hidden_sizes: Number of hidden units of each layer of the mean network. 45 | :param hidden_nonlinearity: Non-linearity used for each layer of the mean network. 46 | :param optimizer: Optimizer for minimizing the negative log-likelihood. 
47 | """ 48 | Serializable.quick_init(self, locals()) 49 | 50 | with tf.variable_scope(name): 51 | 52 | if optimizer is None: 53 | optimizer = LbfgsOptimizer(name="optimizer") 54 | 55 | self.output_dim = output_dim 56 | self.optimizer = optimizer 57 | 58 | if network is None: 59 | network = MLP( 60 | input_shape=input_shape, 61 | output_dim=output_dim, 62 | hidden_sizes=hidden_sizes, 63 | hidden_nonlinearity=hidden_nonlinearity, 64 | output_nonlinearity=output_nonlinearity, 65 | name="network" 66 | ) 67 | 68 | l_out = network.output_layer 69 | 70 | LayersPowered.__init__(self, [l_out]) 71 | 72 | xs_var = network.input_layer.input_var 73 | ys_var = tf.placeholder(dtype=tf.float32, shape=[None, output_dim], name="ys") 74 | 75 | x_mean_var = tf.get_variable( 76 | name="x_mean", 77 | shape=(1,) + input_shape, 78 | initializer=tf.constant_initializer(0., dtype=tf.float32) 79 | ) 80 | x_std_var = tf.get_variable( 81 | name="x_std", 82 | shape=(1,) + input_shape, 83 | initializer=tf.constant_initializer(1., dtype=tf.float32) 84 | ) 85 | 86 | normalized_xs_var = (xs_var - x_mean_var) / x_std_var 87 | 88 | fit_ys_var = L.get_output(l_out, {network.input_layer: normalized_xs_var}) 89 | 90 | loss = - tf.reduce_mean(tf.square(fit_ys_var - ys_var)) 91 | 92 | self.f_predict = tensor_utils.compile_function([xs_var], fit_ys_var) 93 | 94 | optimizer_args = dict( 95 | loss=loss, 96 | target=self, 97 | network_outputs=[fit_ys_var], 98 | ) 99 | 100 | optimizer_args["inputs"] = [xs_var, ys_var] 101 | 102 | self.optimizer.update_opt(**optimizer_args) 103 | 104 | self.name = name 105 | self.l_out = l_out 106 | 107 | self.normalize_inputs = normalize_inputs 108 | self.x_mean_var = x_mean_var 109 | self.x_std_var = x_std_var 110 | 111 | def predict_sym(self, xs): 112 | return L.get_output(self.l_out, xs) 113 | 114 | # def fit(self, xs, ys): 115 | # if self._normalize_inputs: 116 | # # recompute normalizing constants for inputs 117 | # new_mean = np.mean(xs, axis=0, keepdims=True) 118 | # new_std = np.std(xs, axis=0, keepdims=True) + 1e-8 119 | # tf.get_default_session().run(tf.group( 120 | # tf.assign(self._x_mean_var, new_mean), 121 | # tf.assign(self._x_std_var, new_std), 122 | # )) 123 | # inputs = [xs, ys] 124 | # loss_before = self._optimizer.loss(inputs) 125 | # if self._name: 126 | # prefix = self._name + "_" 127 | # else: 128 | # prefix = "" 129 | # logger.record_tabular(prefix + 'LossBefore', loss_before) 130 | # self._optimizer.optimize(inputs) 131 | # loss_after = self._optimizer.loss(inputs) 132 | # logger.record_tabular(prefix + 'LossAfter', loss_after) 133 | # logger.record_tabular(prefix + 'dLoss', loss_before - loss_after) 134 | 135 | def predict(self, xs): 136 | return self.f_predict(np.asarray(xs)) 137 | 138 | def get_param_values(self, **tags): 139 | return LayersPowered.get_param_values(self, **tags) 140 | 141 | def set_param_values(self, flattened_params, **tags): 142 | return LayersPowered.set_param_values(self, flattened_params, **tags) 143 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/samplers/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/__pycache__/base.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/samplers/__pycache__/base.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/__pycache__/batch_sampler.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/samplers/__pycache__/batch_sampler.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/__pycache__/vectorized_sampler.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/samplers/__pycache__/vectorized_sampler.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/batch_sampler.py: -------------------------------------------------------------------------------- 1 | from rllab.sampler.base import BaseSampler 2 | from rllab.sampler import parallel_sampler 3 | from rllab.sampler.stateful_pool import singleton_pool 4 | import tensorflow as tf 5 | 6 | 7 | def worker_init_tf(G): 8 | G.sess = tf.Session() 9 | G.sess.__enter__() 10 | 11 | 12 | def worker_init_tf_vars(G): 13 | G.sess.run(tf.global_variables_initializer()) 14 | 15 | 16 | class BatchSampler(BaseSampler): 17 | def start_worker(self): 18 | if singleton_pool.n_parallel > 1: 19 | singleton_pool.run_each(worker_init_tf) 20 | parallel_sampler.populate_task(self.algo.env, self.algo.policy) 21 | if singleton_pool.n_parallel > 1: 22 | singleton_pool.run_each(worker_init_tf_vars) 23 | 24 | def shutdown_worker(self): 25 | parallel_sampler.terminate_task(scope=self.algo.scope) 26 | 27 | def obtain_samples(self, itr): 28 | cur_policy_params = self.algo.policy.get_param_values() 29 | cur_env_params = self.algo.env.get_param_values() 30 | paths = parallel_sampler.sample_paths( 31 | policy_params=cur_policy_params, 32 | env_params=cur_env_params, 33 | max_samples=self.algo.batch_size, 34 | max_path_length=self.algo.max_path_length, 35 | scope=self.algo.scope, 36 | ) 37 | if self.algo.whole_paths: 38 | return paths 39 | else: 40 | paths_truncated = parallel_sampler.truncate_paths(paths, self.algo.batch_size) 41 | return paths_truncated 42 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/vectorized_sampler.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | 3 | import tensorflow as tf 4 | from rllab.sampler.base import BaseSampler 5 | #from base import BaseSampler 6 | from sandbox.rocky.tf.envs.parallel_vec_env_executor import ParallelVecEnvExecutor 7 | from sandbox.rocky.tf.envs.vec_env_executor import VecEnvExecutor 8 | from rllab.misc import tensor_utils 9 | import numpy as np 10 | from 
rllab.sampler.stateful_pool import ProgBarCounter 11 | import rllab.misc.logger as logger 12 | import itertools 13 | 14 | 15 | class VectorizedSampler(BaseSampler): 16 | 17 | def __init__(self, algo, n_envs=None): 18 | super(VectorizedSampler, self).__init__(algo) 19 | self.n_envs = n_envs 20 | 21 | def start_worker(self): 22 | n_envs = self.n_envs 23 | if n_envs is None: 24 | n_envs = int(self.algo.batch_size / self.algo.max_path_length) 25 | n_envs = max(1, min(n_envs, 100)) 26 | 27 | if getattr(self.algo.env, 'vectorized', False): 28 | self.vec_env = self.algo.env.vec_env_executor(n_envs=n_envs, max_path_length=self.algo.max_path_length) 29 | else: 30 | envs = [pickle.loads(pickle.dumps(self.algo.env)) for _ in range(n_envs)] 31 | self.vec_env = VecEnvExecutor( 32 | envs=envs, 33 | max_path_length=self.algo.max_path_length 34 | ) 35 | self.env_spec = self.algo.env.spec 36 | 37 | def shutdown_worker(self): 38 | self.vec_env.terminate() 39 | 40 | def obtain_samples(self, itr): 41 | logger.log("Obtaining samples for iteration %d..." % itr) 42 | paths = [] 43 | n_samples = 0 44 | obses = self.vec_env.reset() 45 | dones = np.asarray([True] * self.vec_env.num_envs) 46 | running_paths = [None] * self.vec_env.num_envs 47 | 48 | pbar = ProgBarCounter(self.algo.batch_size) 49 | policy_time = 0 50 | env_time = 0 51 | process_time = 0 52 | 53 | policy = self.algo.policy 54 | import time 55 | while n_samples < self.algo.batch_size: 56 | t = time.time() 57 | policy.reset(dones) 58 | actions, agent_infos = policy.get_actions(obses) 59 | 60 | policy_time += time.time() - t 61 | t = time.time() 62 | next_obses, rewards, dones, env_infos = self.vec_env.step(actions) 63 | env_time += time.time() - t 64 | 65 | t = time.time() 66 | 67 | agent_infos = tensor_utils.split_tensor_dict_list(agent_infos) 68 | env_infos = tensor_utils.split_tensor_dict_list(env_infos) 69 | if env_infos is None: 70 | env_infos = [dict() for _ in range(self.vec_env.num_envs)] 71 | if agent_infos is None: 72 | agent_infos = [dict() for _ in range(self.vec_env.num_envs)] 73 | for idx, observation, action, reward, env_info, agent_info, done in zip(itertools.count(), obses, actions, 74 | rewards, env_infos, agent_infos, 75 | dones): 76 | if running_paths[idx] is None: 77 | running_paths[idx] = dict( 78 | observations=[], 79 | actions=[], 80 | rewards=[], 81 | env_infos=[], 82 | agent_infos=[], 83 | ) 84 | running_paths[idx]["observations"].append(observation) 85 | running_paths[idx]["actions"].append(action) 86 | running_paths[idx]["rewards"].append(reward) 87 | running_paths[idx]["env_infos"].append(env_info) 88 | running_paths[idx]["agent_infos"].append(agent_info) 89 | if done: 90 | paths.append(dict( 91 | observations=self.env_spec.observation_space.flatten_n(running_paths[idx]["observations"]), 92 | actions=self.env_spec.action_space.flatten_n(running_paths[idx]["actions"]), 93 | rewards=tensor_utils.stack_tensor_list(running_paths[idx]["rewards"]), 94 | env_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["env_infos"]), 95 | agent_infos=tensor_utils.stack_tensor_dict_list(running_paths[idx]["agent_infos"]), 96 | )) 97 | n_samples += len(running_paths[idx]["rewards"]) 98 | running_paths[idx] = None 99 | process_time += time.time() - t 100 | pbar.inc(len(obses)) 101 | obses = next_obses 102 | 103 | pbar.stop() 104 | 105 | logger.record_tabular("PolicyExecTime", policy_time) 106 | logger.record_tabular("EnvExecTime", env_time) 107 | logger.record_tabular("ProcessExecTime", process_time) 108 | 109 | return paths 110 | 
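Usage sketch (not part of the repository): the sampler above is normally driven by a batch policy-optimization algorithm. The make_algo() factory below is hypothetical; the attribute names it must provide (env, policy, batch_size, max_path_length) are the ones VectorizedSampler actually reads from self.algo in the code above, and the call order shows the start_worker / obtain_samples / shutdown_worker lifecycle.

from sandbox.rocky.tf.samplers.vectorized_sampler import VectorizedSampler

algo = make_algo()  # hypothetical: any object exposing env, policy, batch_size, max_path_length
sampler = VectorizedSampler(algo, n_envs=8)

sampler.start_worker()                       # builds a VecEnvExecutor (or the env's own vectorized executor)
try:
    for itr in range(10):
        paths = sampler.obtain_samples(itr)  # list of path dicts: observations, actions, rewards, env_infos, agent_infos
        # hand `paths` to the algorithm's policy-update step here
finally:
    sampler.shutdown_worker()                # terminates the vectorized environments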
-------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/__init__.py: -------------------------------------------------------------------------------- 1 | from .product import Product 2 | from .discrete import Discrete 3 | from .box import Box 4 | 5 | __all__ = ["Product", "Discrete", "Box"] 6 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/spaces/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/__pycache__/box.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/spaces/__pycache__/box.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/__pycache__/discrete.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/spaces/__pycache__/discrete.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/__pycache__/product.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/sandbox/rocky/tf/spaces/__pycache__/product.cpython-35.pyc -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/box.py: -------------------------------------------------------------------------------- 1 | from rllab.spaces.box import Box as TheanoBox 2 | import tensorflow as tf 3 | 4 | 5 | class Box(TheanoBox): 6 | def new_tensor_variable(self, name, extra_dims, flatten=True): 7 | if flatten: 8 | return tf.placeholder(tf.float32, shape=[None] * extra_dims + [self.flat_dim], name=name) 9 | return tf.placeholder(tf.float32, shape=[None] * extra_dims + list(self.shape), name=name) 10 | 11 | @property 12 | def dtype(self): 13 | return tf.float32 14 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/discrete.py: -------------------------------------------------------------------------------- 1 | from rllab.spaces.base import Space 2 | import numpy as np 3 | from rllab.misc import special 4 | from rllab.misc import ext 5 | import tensorflow as tf 6 | 7 | 8 | class Discrete(Space): 9 | """ 10 | {0,1,...,n-1} 11 | """ 12 | 13 | def __init__(self, n): 14 | self._n = n 15 | 16 | @property 17 | def n(self): 18 | return self._n 19 | 20 | def sample(self): 21 | return np.random.randint(self.n) 22 | 23 | def sample_n(self, n): 24 | return np.random.randint(low=0, high=self.n, size=n) 25 | 26 | def contains(self, x): 27 | x = np.asarray(x) 28 | return x.shape == () and x.dtype.kind == 'i' and x >= 0 and x < self.n 29 | 30 | def __repr__(self): 31 | return "Discrete(%d)" % self.n 32 | 33 | def __eq__(self, other): 34 | return self.n == other.n 
35 | 36 | def flatten(self, x): 37 | return special.to_onehot(x, self.n) 38 | 39 | def unflatten(self, x): 40 | return special.from_onehot(x) 41 | 42 | def flatten_n(self, x): 43 | return special.to_onehot_n(x, self.n) 44 | 45 | def unflatten_n(self, x): 46 | return special.from_onehot_n(x) 47 | 48 | @property 49 | def default_value(self): 50 | return 0 51 | 52 | @property 53 | def flat_dim(self): 54 | return self.n 55 | 56 | def weighted_sample(self, weights): 57 | return special.weighted_sample(weights, range(self.n)) 58 | 59 | def new_tensor_variable(self, name, extra_dims): 60 | # needed for safe conversion to float32 61 | return tf.placeholder(dtype=tf.uint8, shape=[None] * extra_dims + [self.flat_dim], name=name) 62 | 63 | @property 64 | def dtype(self): 65 | return tf.uint8 66 | 67 | def __eq__(self, other): 68 | if not isinstance(other, Discrete): 69 | return False 70 | return self.n == other.n 71 | 72 | def __hash__(self): 73 | return hash(self.n) 74 | 75 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/product.py: -------------------------------------------------------------------------------- 1 | from rllab.spaces.base import Space 2 | import tensorflow as tf 3 | import numpy as np 4 | 5 | 6 | class Product(Space): 7 | def __init__(self, *components): 8 | if isinstance(components[0], (list, tuple)): 9 | assert len(components) == 1 10 | components = components[0] 11 | self._components = tuple(components) 12 | dtypes = [c.dtype for c in components] 13 | if len(dtypes) > 0 and hasattr(dtypes[0], "as_numpy_dtype"): 14 | dtypes = [d.as_numpy_dtype for d in dtypes] 15 | self._common_dtype = np.core.numerictypes.find_common_type([], dtypes) 16 | 17 | def sample(self): 18 | return tuple(x.sample() for x in self._components) 19 | 20 | @property 21 | def components(self): 22 | return self._components 23 | 24 | def contains(self, x): 25 | return isinstance(x, tuple) and all(c.contains(xi) for c, xi in zip(self._components, x)) 26 | 27 | def new_tensor_variable(self, name, extra_dims): 28 | return tf.placeholder( 29 | dtype=self._common_dtype, 30 | shape=[None] * extra_dims + [self.flat_dim], 31 | name=name, 32 | ) 33 | 34 | @property 35 | def dtype(self): 36 | return self._common_dtype 37 | 38 | @property 39 | def flat_dim(self): 40 | return int(np.sum([c.flat_dim for c in self._components])) 41 | 42 | def flatten(self, x): 43 | return np.concatenate([c.flatten(xi) for c, xi in zip(self._components, x)]) 44 | 45 | def flatten_n(self, xs): 46 | xs_regrouped = [[x[i] for x in xs] for i in range(len(xs[0]))] 47 | flat_regrouped = [c.flatten_n(xi) for c, xi in zip(self.components, xs_regrouped)] 48 | return np.concatenate(flat_regrouped, axis=-1) 49 | 50 | def unflatten(self, x): 51 | dims = [c.flat_dim for c in self._components] 52 | flat_xs = np.split(x, np.cumsum(dims)[:-1]) 53 | return tuple(c.unflatten(xi) for c, xi in zip(self._components, flat_xs)) 54 | 55 | def unflatten_n(self, xs): 56 | dims = [c.flat_dim for c in self._components] 57 | flat_xs = np.split(xs, np.cumsum(dims)[:-1], axis=-1) 58 | unflat_xs = [c.unflatten_n(xi) for c, xi in zip(self.components, flat_xs)] 59 | unflat_xs_grouped = list(zip(*unflat_xs)) 60 | return unflat_xs_grouped 61 | 62 | def __eq__(self, other): 63 | if not isinstance(other, Product): 64 | return False 65 | return tuple(self.components) == tuple(other.components) 66 | 67 | def __hash__(self): 68 | return hash(tuple(self.components)) 69 | 
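Usage sketch (not part of the repository): the space classes above share a flatten/unflatten contract that the samplers rely on when packing paths. In the round trip below, Discrete encodes a sample as a one-hot vector, Box keeps its float entries, and Product concatenates its components' encodings; the Box constructor arguments (low, high, shape) follow rllab's base Box and are assumed here.

import numpy as np
from sandbox.rocky.tf.spaces import Product, Discrete, Box

space = Product(Discrete(3), Box(low=-1.0, high=1.0, shape=(2,)))

sample = space.sample()                  # e.g. (2, array([ 0.41, -0.73]))
flat = space.flatten(sample)             # one-hot of length 3 followed by the 2 Box entries
assert flat.shape == (space.flat_dim,)   # flat_dim == 3 + 2

recovered = space.unflatten(flat)        # back to a (discrete index, box vector) tuple
assert recovered[0] == sample[0]
assert np.allclose(recovered[1], sample[1])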
-------------------------------------------------------------------------------- /sim_cpolicy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import joblib 3 | from rllab.misc import tensor_utils 4 | import time 5 | from contextlib import contextmanager 6 | import numpy as np 7 | import tensorflow as tf 8 | 9 | from sac.envs import CrossMazeAntEnv, RandomGoalAntEnv,HalfCheetahHurdleEnv 10 | from rllab.envs.normalized_env import normalize 11 | from rllab.misc import tensor_utils 12 | from sac.misc import tf_utils 13 | 14 | def rollout(env, policy,sub_level_policies,path_length=1000, render=True, speedup=10, g=2): 15 | observation = env.reset() 16 | policy.reset() 17 | 18 | t = 0 19 | obs = observation 20 | for t in range(path_length): 21 | 22 | 23 | sub_level_actions=[] 24 | if g!=0: 25 | obs=observation[:-g] 26 | else: 27 | obs=observation 28 | for i in range(0,len(sub_level_policies)): 29 | action, _ = sub_level_policies[i].get_action(obs) 30 | sub_level_actions.append(action.reshape(1,-1)) 31 | sub_level_actions=np.stack(sub_level_actions,axis=0) 32 | sub_level_actions=np.transpose(sub_level_actions,(1,0,2)) 33 | 34 | action, agent_info = policy.get_action(observation,sub_level_actions) 35 | next_obs, reward, terminal, env_info = env.step(action) 36 | 37 | 38 | observation = next_obs 39 | 40 | if render: 41 | env.render() 42 | time_step = 0.05 43 | time.sleep(time_step / speedup) 44 | 45 | if terminal: 46 | break 47 | 48 | 49 | return 0 50 | 51 | 52 | def parse_args(): 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument('file', type=str, help='Path to the snapshot file.') 55 | parser.add_argument('--max-path-length', '-l', type=int, default=1000) 56 | parser.add_argument('--speedup', '-s', type=float, default=10) 57 | parser.add_argument('--domain',type=str,default='ant-cross-maze') 58 | parser.add_argument('--deterministic', '-d', dest='deterministic', 59 | action='store_true') 60 | parser.add_argument('--no-deterministic', '-nd', dest='deterministic', 61 | action='store_false') 62 | parser.add_argument('--policy_h', type=int) 63 | parser.set_defaults(deterministic=True) 64 | 65 | args = parser.parse_args() 66 | 67 | return args 68 | 69 | def load_low_level_policy(policy_path=None,name=None): 70 | with tf_utils.get_default_session().as_default(): 71 | with tf.variable_scope(name, reuse=False): 72 | snapshot = joblib.load(policy_path) 73 | 74 | policy = snapshot["policy"] 75 | return policy 76 | 77 | 78 | def simulate_policy_ant(args): 79 | sub_level_policies=[] 80 | with tf.Session() as sess: 81 | with tf.variable_scope("fwrd", reuse=False): 82 | fwrd = joblib.load("primitive-policies/ant/fwrd/fwrd.pkl") 83 | with tf.variable_scope("bwrd", reuse=False): 84 | bwrd = joblib.load("primitive-policies/ant/bwrd/bwrd.pkl") 85 | with tf.variable_scope("uwrd", reuse=False): 86 | uwrd = joblib.load("primitive-policies/ant/uwrd/uwrd.pkl") 87 | with tf.variable_scope("dwrd", reuse=False): 88 | dwrd = joblib.load("primitive-policies/ant/dwrd/dwrd.pkl") 89 | sub_level_policies.append(fwrd["policy"]) 90 | sub_level_policies.append(bwrd["policy"]) 91 | sub_level_policies.append(uwrd["policy"]) 92 | sub_level_policies.append(dwrd["policy"]) 93 | data = joblib.load(args.file) 94 | if 'algo' in data.keys(): 95 | policy = data['algo'].policy 96 | env = data['algo'].env 97 | else: 98 | policy = data['policy'] 99 | env = data['env'] 100 | with policy.deterministic(args.deterministic): 101 | while True: 102 | path = rollout(env, 
policy,sub_level_policies,path_length=args.max_path_length,g=2) 103 | 104 | def simulate_policy_pusher(args): 105 | sub_level_policies=[] 106 | with tf.Session() as sess: 107 | with tf.variable_scope("bottom", reuse=False): 108 | btm = joblib.load("primitive-policies/pusher/bottom/bottom.pkl") 109 | with tf.variable_scope("jump", reuse=False): 110 | lft = joblib.load("primitive-policies/pusher/left/left.pkl") 111 | sub_level_policies.append(btm["policy"]) 112 | sub_level_policies.append(lft["policy"]) 113 | data = joblib.load(args.file) 114 | if 'algo' in data.keys(): 115 | policy = data['algo'].policy 116 | env = data['algo'].env 117 | else: 118 | policy = data['policy'] 119 | env =data['env'] 120 | with policy.deterministic(args.deterministic): 121 | while True: 122 | path = rollout(env, policy,sub_level_policies,path_length=args.max_path_length,g=0) 123 | 124 | def simulate_policy_hch(args): 125 | sub_level_policies=[] 126 | with tf.Session() as sess: 127 | with tf.variable_scope("fwrd", reuse=False): 128 | fwrd = joblib.load("primitive-policies/hc/fwd/fwd.pkl") 129 | with tf.variable_scope("jump", reuse=False): 130 | jmp = joblib.load("primitive-policies/hc/jp-longz/jump.pkl") 131 | sub_level_policies.append(fwrd["policy"]) 132 | sub_level_policies.append(jmp["policy"]) 133 | data = joblib.load(args.file) 134 | if 'algo' in data.keys(): 135 | policy = data['algo'].policy 136 | env = data['algo'].env 137 | else: 138 | policy = data['policy'] 139 | env = normalize(HalfCheetahHurdleEnv()) #data['env'] 140 | with policy.deterministic(args.deterministic): 141 | while True: 142 | path = rollout(env, policy,sub_level_policies,path_length=args.max_path_length, g=2) 143 | 144 | if __name__ == "__main__": 145 | args = parse_args() 146 | if args.domain=='ant-cross-maze' or args.domain=='ant-random-goal': 147 | simulate_policy_ant(args) 148 | if args.domain=='cheetah-hurdle': 149 | simulate_policy_hch(args) 150 | if args.domain=='pusher': 151 | simulate_policy_pusher(args) 152 | -------------------------------------------------------------------------------- /sim_policy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import joblib 4 | import tensorflow as tf 5 | 6 | from rllab.sampler.utils import rollout 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('file', type=str, help='Path to the snapshot file.') 11 | parser.add_argument('--max-path-length', '-l', type=int, default=1000) 12 | parser.add_argument('--speedup', '-s', type=float, default=10) 13 | parser.add_argument('--deterministic', '-d', dest='deterministic', 14 | action='store_true') 15 | parser.add_argument('--no-deterministic', '-nd', dest='deterministic', 16 | action='store_false') 17 | parser.add_argument('--policy_h', type=int) 18 | parser.set_defaults(deterministic=True) 19 | 20 | args = parser.parse_args() 21 | 22 | return args 23 | 24 | def simulate_policy(args): 25 | with tf.Session() as sess: 26 | data = joblib.load(args.file) 27 | print(data.keys()) 28 | if 'algo' in data.keys(): 29 | policy = data['algo'].policy 30 | env = data['algo'].env 31 | else: 32 | policy = data['policy'] 33 | env = data['env'] 34 | print(policy) 35 | with policy.deterministic(args.deterministic): 36 | while True: 37 | path = rollout(env, policy, 38 | max_path_length=args.max_path_length, 39 | animated=True, speedup=args.speedup) 40 | if __name__ == "__main__": 41 | args = parse_args() 42 | simulate_policy(args) 43 | 
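Usage sketch (not part of the repository): both scripts above are launched with a snapshot path, for example: python sim_cpolicy.py path/to/snapshot.pkl --domain cheetah-hurdle --max-path-length 1000. The core of sim_cpolicy's rollout is the action-stacking step: each primitive policy sees the observation with the trailing g goal entries removed, and the composite policy receives the stacked primitive actions. The snippet below isolates that step with a stand-in DummyPolicy and illustrative dimensions.

import numpy as np

class DummyPolicy:
    # Stand-in for a loaded primitive policy; only get_action is needed here.
    def __init__(self, action_dim):
        self.action_dim = action_dim

    def get_action(self, obs):
        return np.zeros(self.action_dim), {}

g = 2                                     # number of trailing goal coordinates to strip
observation = np.random.randn(29)         # illustrative observation with goal appended
sub_level_policies = [DummyPolicy(8) for _ in range(4)]

obs = observation[:-g] if g != 0 else observation
sub_level_actions = [p.get_action(obs)[0].reshape(1, -1) for p in sub_level_policies]
sub_level_actions = np.stack(sub_level_actions, axis=0)           # shape (K, 1, action_dim)
sub_level_actions = np.transpose(sub_level_actions, (1, 0, 2))    # shape (1, K, action_dim)
# The composite policy is then queried as in rollout():
#   action, agent_info = policy.get_action(observation, sub_level_actions)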
-------------------------------------------------------------------------------- /value_functions/__init__.py: -------------------------------------------------------------------------------- 1 | from .value_function import NNVFunction, NNQFunction, NNDiscriminatorFunction 2 | -------------------------------------------------------------------------------- /value_functions/__pycache__/__init__.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/value_functions/__pycache__/__init__.cpython-35.pyc -------------------------------------------------------------------------------- /value_functions/__pycache__/value_function.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ahq1993/compositional_reinforcement_learning/cd19d217b28af43525887ddc83bd4f11368d7398/value_functions/__pycache__/value_function.cpython-35.pyc -------------------------------------------------------------------------------- /value_functions/value_function.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rllab.core.serializable import Serializable 4 | 5 | from sac.misc.mlp import MLPFunction 6 | from sac.misc import tf_utils 7 | 8 | class NNVFunction(MLPFunction): 9 | 10 | def __init__(self, env_spec, hidden_layer_sizes=(100, 100), name='vf'): 11 | Serializable.quick_init(self, locals()) 12 | 13 | self._Do = env_spec.observation_space.flat_dim 14 | self._obs_pl = tf.placeholder( 15 | tf.float32, 16 | shape=[None, self._Do], 17 | name='observation', 18 | ) 19 | 20 | super(NNVFunction, self).__init__( 21 | name, (self._obs_pl,), hidden_layer_sizes) 22 | 23 | 24 | class NNQFunction(MLPFunction): 25 | def __init__(self, env_spec, hidden_layer_sizes=(100, 100), name='qf'): 26 | Serializable.quick_init(self, locals()) 27 | 28 | self._Da = env_spec.action_space.flat_dim 29 | self._Do = env_spec.observation_space.flat_dim 30 | 31 | self._obs_pl = tf.placeholder( 32 | tf.float32, 33 | shape=[None, self._Do], 34 | name='observation', 35 | ) 36 | 37 | self._action_pl = tf.placeholder( 38 | tf.float32, 39 | shape=[None, self._Da], 40 | name='actions', 41 | ) 42 | 43 | super(NNQFunction, self).__init__( 44 | name, (self._obs_pl, self._action_pl), hidden_layer_sizes) 45 | 46 | 47 | class NNDiscriminatorFunction(MLPFunction): 48 | def __init__(self, env_spec, hidden_layer_sizes=(100, 100), num_skills=None): 49 | assert num_skills is not None 50 | Serializable.quick_init(self, locals()) 51 | Parameterized.__init__(self) 52 | 53 | self._Da = env_spec.action_space.flat_dim 54 | self._Do = env_spec.observation_space.flat_dim 55 | 56 | self._obs_pl = tf.placeholder( 57 | tf.float32, 58 | shape=[None, self._Do], 59 | name='observation', 60 | ) 61 | self._action_pl = tf.placeholder( 62 | tf.float32, 63 | shape=[None, self._Da], 64 | name='actions', 65 | ) 66 | 67 | self._name = 'discriminator' 68 | self._input_pls = (self._obs_pl, self._action_pl) 69 | self._layer_sizes = list(hidden_layer_sizes) + [num_skills] 70 | self._output_t = self.get_output_for(*self._input_pls) 71 | --------------------------------------------------------------------------------
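Usage sketch (not part of the repository): the value-function classes above only build input placeholders and delegate network construction to sac.misc.mlp.MLPFunction, so they are instantiated directly from an environment spec. The import paths below assume the repository is importable as the sac package, as its own modules (e.g. sim_cpolicy.py) already assume; the environment and layer sizes are illustrative.

import tensorflow as tf
from rllab.envs.normalized_env import normalize
from sac.envs import HalfCheetahHurdleEnv
from sac.value_functions import NNQFunction, NNVFunction

env = normalize(HalfCheetahHurdleEnv())

qf = NNQFunction(env_spec=env.spec, hidden_layer_sizes=(100, 100), name='qf')
vf = NNVFunction(env_spec=env.spec, hidden_layer_sizes=(100, 100), name='vf')

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # An algorithm such as SAC feeds observations (and, for qf, actions)
    # through the placeholders these constructors created.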