├── bayesian_ddpg ├── params.pkl ├── progress.csv ├── params.json └── debug.log ├── __pycache__ ├── ddpg_bayesian.cpython-35.pyc ├── sampling_utils.cpython-35.pyc ├── ddpg_bayesian_mean.cpython-35.pyc ├── dropout_exploration.cpython-35.pyc ├── ddpg_bayesian_thompson.cpython-35.pyc ├── deterministic_mlp_policy_bayesian.cpython-35.pyc └── continuous_mlp_q_function_bayesian.cpython-35.pyc ├── deterministic_mlp_policy_bayesian.py ├── dropout_exploration.py ├── run_bayesian_ddpg.py ├── continuous_mlp_q_function_bayesian.py ├── sampling_utils.py ├── dropout_gal_neuralnet_tf_example.py ├── variational_inference_examples.py ├── ddpg_bayesian_thompson.py ├── ddpg_bayesian_mean.py ├── ddpg_bayesian.py └── bayesian_network.py /bayesian_ddpg/params.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/bayesian_ddpg/params.pkl -------------------------------------------------------------------------------- /__pycache__/ddpg_bayesian.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/ddpg_bayesian.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/sampling_utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/sampling_utils.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/ddpg_bayesian_mean.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/ddpg_bayesian_mean.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/dropout_exploration.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/dropout_exploration.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/ddpg_bayesian_thompson.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/ddpg_bayesian_thompson.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/deterministic_mlp_policy_bayesian.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/deterministic_mlp_policy_bayesian.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/continuous_mlp_q_function_bayesian.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/continuous_mlp_q_function_bayesian.cpython-35.pyc -------------------------------------------------------------------------------- /bayesian_ddpg/progress.csv: -------------------------------------------------------------------------------- 1 | 
AverageAbsQYDiff,MaxReturn,Epoch,AverageAbsQ,MaxEsReturn,StdReturn,PolicyRegParamNorm,AverageQLoss,AverageDiscountedReturn,AverageY,Iteration,StdEsReturn,QFunRegParamNorm,AverageEsReturn,MinEsReturn,AverageAbsY,AverageAction,AverageReturn,AverageQ,MinReturn,AveragePolicySurr 2 | 0.758090012217,9.31850444693,9,0.365071,54.8695015601,0.04653903996,11.23,0.74919,8.76006204415,0.48040966073,9,7.05844821432,11.13,9.9070100333,1.67898668029,0.593880576329,0.218841,9.18140387997,-0.243523,9.07004472335,-0.053608 3 | 0.177415410708,1.91244584333,10,0.823965,8.61305045778,0.0811045362752,13.2281,0.0712594,1.7088114456,0.805172051351,10,1.56573821189,11.3933,2.15915361844,0.0122472149906,0.845080383496,0.999596,1.70341682201,0.795785,1.49563580939,-0.672253 4 | 0.12357864363,3.89928951199,11,1.19482,6.41710902887,0.115736916112,13.4963,0.0288876,3.53009868395,1.14473330561,11,0.737778156092,11.7883,4.05680174472,1.23751043098,1.20288642826,1.0,3.58419018176,1.1393,3.26894831589,-0.974544 5 | 0.144135365208,3.89855122419,12,1.66405,6.2912818238,0.113467534219,13.4963,0.0412518,3.53067386378,1.59586680965,12,0.57573980753,12.3922,4.13937903654,2.10169953317,1.6723605747,1.0,3.58477456358,1.59025,3.29676145871,-1.35587 6 | -------------------------------------------------------------------------------- /deterministic_mlp_policy_bayesian.py: -------------------------------------------------------------------------------- 1 | import lasagne 2 | import lasagne.layers as L 3 | import lasagne.nonlinearities as NL 4 | import lasagne.init as LI 5 | from rllab.core.serializable import Serializable 6 | from rllab.misc import ext 7 | from rllab.misc.overrides import overrides 8 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 9 | from sandbox.rocky.tf.core.network import MLP 10 | from sandbox.rocky.tf.distributions.categorical import Categorical 11 | from sandbox.rocky.tf.policies.base import Policy 12 | from sandbox.rocky.tf.misc import tensor_utils 13 | 14 | import sandbox.rocky.tf.core.layers as L 15 | from sandbox.rocky.tf.core.layers import batch_norm 16 | 17 | from sandbox.rocky.tf.spaces.discrete import Discrete 18 | import tensorflow as tf 19 | 20 | 21 | class DeterministicMLPPolicy(Policy, LayersPowered, Serializable): 22 | def __init__( 23 | self, 24 | name, 25 | env_spec, 26 | hidden_sizes=(32, 32), 27 | hidden_nonlinearity=tf.nn.relu, 28 | output_nonlinearity=tf.nn.tanh, 29 | prob_network=None, 30 | bn=False): 31 | Serializable.quick_init(self, locals()) 32 | 33 | ## Apply MC Dropout on the MLP networks here 34 | 35 | with tf.variable_scope(name): 36 | if prob_network is None: 37 | prob_network = MLP( 38 | input_shape=(env_spec.observation_space.flat_dim,), 39 | output_dim=env_spec.action_space.flat_dim, 40 | hidden_sizes=hidden_sizes, 41 | hidden_nonlinearity=hidden_nonlinearity, 42 | output_nonlinearity=output_nonlinearity, 43 | # batch_normalization=True, 44 | name="prob_network", 45 | ) 46 | 47 | self._l_prob = prob_network.output_layer 48 | self._l_obs = prob_network.input_layer 49 | self._f_prob = tensor_utils.compile_function( 50 | [prob_network.input_layer.input_var], 51 | L.get_output(prob_network.output_layer, deterministic=True) 52 | ) 53 | 54 | self.prob_network = prob_network 55 | 56 | # Note the deterministic=True argument. It makes sure that when getting 57 | # actions from single observations, we do not update params in the 58 | # batch normalization layers. 
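# A minimal sketch of the MC-dropout hook intended above, assuming dropout layers are
# added inside prob_network (mirroring the L.DropoutLayer usage in
# ContinuousMLPQFunction below): compiling a second forward pass with
# deterministic=False keeps the dropout masks active, so repeated calls yield
# stochastic action samples.
#
# self._f_prob_drop = tensor_utils.compile_function(
#     [prob_network.input_layer.input_var],
#     L.get_output(prob_network.output_layer, deterministic=False)
# )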
59 | # TODO: this doesn't currently work properly in the tf version so we leave out batch_norm 60 | super(DeterministicMLPPolicy, self).__init__(env_spec) 61 | LayersPowered.__init__(self, [prob_network.output_layer]) 62 | 63 | 64 | 65 | @property 66 | def vectorized(self): 67 | return True 68 | 69 | @overrides 70 | def get_action(self, observation): 71 | flat_obs = self.observation_space.flatten(observation) 72 | action = self._f_prob([flat_obs])[0] 73 | return action, dict() 74 | 75 | @overrides 76 | def get_actions(self, observations): 77 | flat_obs = self.observation_space.flatten_n(observations) 78 | actions = self._f_prob(flat_obs) 79 | return actions, dict() 80 | 81 | def get_action_sym(self, obs_var): 82 | return L.get_output(self.prob_network.output_layer, obs_var) 83 | -------------------------------------------------------------------------------- /dropout_exploration.py: -------------------------------------------------------------------------------- 1 | from rllab.misc.overrides import overrides 2 | from rllab.misc.ext import AttrDict 3 | from rllab.core.serializable import Serializable 4 | from rllab.spaces.box import Box 5 | from rllab.exploration_strategies.base import ExplorationStrategy 6 | import numpy as np 7 | import numpy.random as nr 8 | 9 | 10 | class MCDropout(ExplorationStrategy, Serializable): 11 | """ 12 | This strategy implements the Ornstein-Uhlenbeck process, which adds 13 | time-correlated noise to the actions taken by the deterministic policy. 14 | The OU process satisfies the following stochastic differential equation: 15 | dxt = theta*(mu - xt)*dt + sigma*dWt 16 | where Wt denotes the Wiener process 17 | """ 18 | 19 | def __init__(self, env_spec, mu=0, theta=0.15, sigma=0.3, **kwargs): 20 | assert isinstance(env_spec.action_space, Box) 21 | assert len(env_spec.action_space.shape) == 1 22 | Serializable.quick_init(self, locals()) 23 | self.mu = mu 24 | self.theta = theta 25 | self.sigma = sigma 26 | self.action_space = env_spec.action_space 27 | self.state = np.ones(self.action_space.flat_dim) * self.mu 28 | self.reset() 29 | 30 | def __getstate__(self): 31 | d = Serializable.__getstate__(self) 32 | d["state"] = self.state 33 | return d 34 | 35 | def __setstate__(self, d): 36 | Serializable.__setstate__(self, d) 37 | self.state = d["state"] 38 | 39 | @overrides 40 | def reset(self): 41 | self.state = np.ones(self.action_space.flat_dim) * self.mu 42 | 43 | def evolve_state(self): 44 | x = self.state 45 | dx = self.theta * (self.mu - x) + self.sigma * nr.randn(len(x)) 46 | self.state = x + dx 47 | return self.state 48 | 49 | @overrides 50 | def get_action(self, t, observation, policy, **kwargs): 51 | action, _ = policy.get_action(observation) 52 | ou_state = self.evolve_state() 53 | return np.clip(action + ou_state, self.action_space.low, self.action_space.high) 54 | 55 | 56 | # def get_action(self, t, observation, policy, **kwargs): 57 | # #applying MC Dropout and taking the mean action? 
58 | # action, _ = policy.get_action(observation) 59 | 60 | # mc_dropout = 10 61 | # all_actions = np.zeros(shape=(mc_dropout, action.shape[0])) 62 | # for d in range(mc_dropout): 63 | # action, _ = policy.get_action(observation) 64 | # all_actions[d, :] = action 65 | # mean_action = np.mean(all_actions, axis=0) 66 | 67 | # return mean_action 68 | 69 | 70 | # def get_stochastic_action(self, t, observation, policy, **kwargs): 71 | # action, _ = policy.get_action(observation) 72 | # mc_dropout = 10 73 | # all_actions = np.zeros(shape=(mc_dropout, action.shape[0])) 74 | 75 | # for d in range(mc_dropout): 76 | # action, _ = policy.get_action(observation) 77 | # all_actions[d, :] = action 78 | 79 | # mean_action = np.mean(all_actions, axis=0) 80 | 81 | 82 | # return mean_action 83 | 84 | 85 | 86 | if __name__ == "__main__": 87 | ou = MCDropout(env_spec=AttrDict(action_space=Box(low=-1, high=1, shape=(1,))), mu=0, theta=0.15, sigma=0.3) 88 | states = [] 89 | for i in range(1000): 90 | states.append(ou.evolve_state()[0]) 91 | import matplotlib.pyplot as plt 92 | 93 | plt.plot(states) 94 | plt.show() 95 | -------------------------------------------------------------------------------- /run_bayesian_ddpg.py: -------------------------------------------------------------------------------- 1 | from ddpg_bayesian_thompson import DDPG as DDPG_Thompson 2 | from ddpg_bayesian_mean import DDPG as DDPG_Mean 3 | from ddpg_bayesian import DDPG as DDPG_Bayesian 4 | from dropout_exploration import MCDropout 5 | from deterministic_mlp_policy_bayesian import DeterministicMLPPolicy 6 | from continuous_mlp_q_function_bayesian import ContinuousMLPQFunction 7 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 8 | from rllab.envs.normalized_env import normalize 9 | from rllab.misc.instrument import stub, run_experiment_lite 10 | from sandbox.rocky.tf.envs.base import TfEnv 11 | from rllab.envs.gym_env import GymEnv 12 | from rllab.misc import ext 13 | import pickle 14 | import tensorflow as tf 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 19 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 20 | parser.add_argument("--num_epochs", default=100, type=int) 21 | parser.add_argument("--plot", action="store_true") 22 | # parser.add_argument("--data_dir", default="./data/") 23 | args = parser.parse_args() 24 | 25 | stub(globals()) 26 | ext.set_seed(1) 27 | 28 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 29 | 30 | other_env_class_map = { "Cartpole" : CartpoleEnv} 31 | 32 | if args.env in supported_gym_envs: 33 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 34 | # gymenv.env.seed(1) 35 | else: 36 | gymenv = other_env_class_map[args.env]() 37 | 38 | 39 | env = TfEnv(normalize(gymenv)) 40 | 41 | policy = DeterministicMLPPolicy( 42 | env_spec=env.spec, 43 | name="policy", 44 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
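# (Note: the hidden_sizes below actually give three hidden layers of 100, 50 and 25
# units, not the two 32-unit layers described in the comment above.)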
45 | hidden_sizes=(100, 50, 25), 46 | hidden_nonlinearity=tf.nn.relu, 47 | ) 48 | 49 | es = MCDropout(env_spec=env.spec) 50 | 51 | qf = ContinuousMLPQFunction(env_spec=env.spec, 52 | hidden_sizes=(100,100), 53 | hidden_nonlinearity=tf.nn.relu,) 54 | 55 | 56 | ddpg_type_map = {"Thompson" : DDPG_Thompson, "Mean" : DDPG_Mean, "Bayesian" : DDPG_Bayesian} 57 | 58 | ddpg_class = ddpg_type_map[args.type] 59 | 60 | ## loops: 61 | num_experiments = 1 62 | batch_size_values = [64] 63 | 64 | 65 | 66 | 67 | for b in range(len(batch_size_values)): 68 | 69 | for e in range(num_experiments): 70 | 71 | algo = ddpg_class( 72 | env=env, 73 | policy=policy, 74 | es=es, 75 | qf=qf, 76 | batch_size=64, 77 | max_path_length=env.horizon, 78 | epoch_length=1000, 79 | min_pool_size=10000, 80 | n_epochs=args.num_epochs, 81 | discount=0.99, 82 | scale_reward=1.0, 83 | qf_learning_rate=1e-3, 84 | policy_learning_rate=1e-4, 85 | # Uncomment both lines (this and the plot parameter below) to enable plotting 86 | plot=args.plot, 87 | ) 88 | 89 | 90 | run_experiment_lite( 91 | algo.train(), 92 | # log_dir=args.data_dir, 93 | # Number of parallel workers for sampling 94 | n_parallel=1, 95 | # Only keep the snapshot parameters for the last iteration 96 | snapshot_mode="last", 97 | # Specifies the seed for the experiment. If this is not provided, a random seed 98 | # will be used 99 | exp_name="Trial_Bayesian_Exploration/", 100 | seed=1, 101 | plot=args.plot, 102 | ) 103 | -------------------------------------------------------------------------------- /continuous_mlp_q_function_bayesian.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.q_functions.base import QFunction 2 | from rllab.core.serializable import Serializable 3 | from rllab.misc import ext 4 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 5 | from sandbox.rocky.tf.core.network import MLP 6 | from sandbox.rocky.tf.core.layers import batch_norm 7 | from sandbox.rocky.tf.distributions.categorical import Categorical 8 | from sandbox.rocky.tf.policies.base import StochasticPolicy 9 | from sandbox.rocky.tf.misc import tensor_utils 10 | import tensorflow as tf 11 | import sandbox.rocky.tf.core.layers as L 12 | 13 | import numpy as np 14 | 15 | 16 | class ContinuousMLPQFunction(QFunction, LayersPowered, Serializable): 17 | def __init__( 18 | self, 19 | env_spec, 20 | hidden_sizes=(32, 32), 21 | hidden_nonlinearity=tf.nn.relu, 22 | action_merge_layer=-2, 23 | output_nonlinearity=None, 24 | bn=False, 25 | dropout=0.1): 26 | Serializable.quick_init(self, locals()) 27 | 28 | l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim), name="obs") 29 | l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim), name="actions") 30 | 31 | n_layers = len(hidden_sizes) + 1 32 | 33 | if n_layers > 1: 34 | action_merge_layer = \ 35 | (action_merge_layer % n_layers + n_layers) % n_layers 36 | else: 37 | action_merge_layer = 1 38 | 39 | l_hidden = l_obs 40 | 41 | for idx, size in enumerate(hidden_sizes): 42 | if bn: 43 | l_hidden = batch_norm(l_hidden) 44 | 45 | if idx == action_merge_layer: 46 | l_hidden = L.ConcatLayer([l_hidden, l_action]) 47 | 48 | l_hidden = L.DenseLayer( 49 | l_hidden, 50 | num_units=size, 51 | nonlinearity=hidden_nonlinearity, 52 | name="h%d" % (idx + 1) 53 | ) 54 | 55 | l_hidden = L.DropoutLayer(l_hidden, dropout) 56 | 57 | 58 | 59 | if action_merge_layer == n_layers: 60 | l_hidden = L.ConcatLayer([l_hidden, l_action]) 61 | 62 | l_output = L.DenseLayer( 63 | 
l_hidden, 64 | num_units=1, 65 | nonlinearity=output_nonlinearity, 66 | name="output" 67 | ) 68 | 69 | output_var = L.get_output(l_output, deterministic=True) 70 | output_var_drop = L.get_output(l_output, deterministic=False) 71 | 72 | self._f_qval = tensor_utils.compile_function([l_obs.input_var, l_action.input_var], output_var) 73 | self._f_qval_drop = tensor_utils.compile_function([l_obs.input_var, l_action.input_var], output_var_drop) 74 | 75 | 76 | self._output_layer = l_output 77 | self._obs_layer = l_obs 78 | self._action_layer = l_action 79 | self._output_nonlinearity = output_nonlinearity 80 | 81 | LayersPowered.__init__(self, [l_output]) 82 | 83 | def get_qval(self, observations, actions): 84 | return self._f_qval(observations, actions) 85 | 86 | 87 | def get_qval_dropout(self, observations, actions): 88 | return self._f_qval_drop(observations, actions) 89 | 90 | 91 | def get_qval_sym(self, obs_var, action_var, **kwargs): 92 | qvals = L.get_output(self._output_layer, {self._obs_layer: obs_var, self._action_layer: action_var}, **kwargs) 93 | return tf.reshape(qvals, (-1,)) 94 | 95 | 96 | """ 97 | want this to return mean qvals + lamba * variance as the output 98 | """ 99 | def get_qval_plus_var_sym(self, obs_var, action_var, **kwargs): 100 | 101 | """ 102 | TO DO HERE 103 | """ 104 | 105 | mc_dropout = 5 106 | all_qvals = [] 107 | for m in range(mc_dropout): 108 | qvals = L.get_output(self._output_layer, {self._obs_layer: obs_var, self._action_layer: action_var}, **kwargs) 109 | all_qvals = np.append(all_qvals, qvals) 110 | 111 | return tf.reshape(qvals, (-1,)) 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /sampling_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | import numpy as np 5 | import rllab.misc.logger as logger 6 | 7 | class SimpleReplayPool(object): 8 | """ 9 | Used from https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/rllab/pool/simple_pool.py 10 | """ 11 | def __init__( 12 | self, max_pool_size, observation_dim, action_dim, 13 | replacement_policy='stochastic', replacement_prob=1.0, 14 | max_skip_episode=10): 15 | self._observation_dim = observation_dim 16 | self._action_dim = action_dim 17 | self._max_pool_size = max_pool_size 18 | self._replacement_policy = replacement_policy 19 | self._replacement_prob = replacement_prob 20 | self._max_skip_episode = max_skip_episode 21 | self._observations = np.zeros( 22 | (max_pool_size, observation_dim), 23 | ) 24 | self._actions = np.zeros( 25 | (max_pool_size, action_dim), 26 | ) 27 | self._rewards = np.zeros(max_pool_size) 28 | self._terminals = np.zeros(max_pool_size, dtype='uint8') 29 | self._initials = np.zeros(max_pool_size, dtype='uint8') 30 | self._bottom = 0 31 | self._top = 0 32 | self._size = 0 33 | 34 | def add_sample(self, observation, action, reward, terminal, initial): 35 | self.check_replacement() 36 | self._observations[self._top] = observation 37 | self._actions[self._top] = action 38 | self._rewards[self._top] = reward 39 | self._terminals[self._top] = terminal 40 | self._initials[self._top] = initial 41 | self.advance() 42 | 43 | def check_replacement(self): 44 | if self._replacement_prob < 1.0: 45 | if self._size < self._max_pool_size or \ 46 | not self._initials[self._top]: return 47 | self.advance_until_terminate() 48 | 49 | def get_skip_flag(self): 50 | if self._replacement_policy == 'full': skip = False 51 | elif 
self._replacement_policy == 'stochastic': 52 | skip = np.random.uniform() > self._replacement_prob 53 | else: raise NotImplementedError 54 | return skip 55 | 56 | def advance_until_terminate(self): 57 | skip = self.get_skip_flag() 58 | n_skips = 0 59 | old_top = self._top 60 | new_top = (old_top + 1) % self._max_pool_size 61 | while skip and old_top != new_top and n_skips < self._max_skip_episode: 62 | n_skips += 1 63 | self.advance() 64 | while not self._initials[self._top]: 65 | self.advance() 66 | skip = self.get_skip_flag() 67 | new_top = self._top 68 | logger.log("add_sample, skipped %d episodes, top=%d->%d"%( 69 | n_skips, old_top, new_top)) 70 | 71 | def advance(self): 72 | self._top = (self._top + 1) % self._max_pool_size 73 | if self._size >= self._max_pool_size: 74 | self._bottom = (self._bottom + 1) % self._max_pool_size 75 | else: 76 | self._size += 1 77 | 78 | def random_batch(self, batch_size): 79 | assert self._size > batch_size 80 | indices = np.zeros(batch_size, dtype='uint64') 81 | transition_indices = np.zeros(batch_size, dtype='uint64') 82 | count = 0 83 | while count < batch_size: 84 | index = np.random.randint(self._bottom, self._bottom + self._size) % self._max_pool_size 85 | # make sure that the transition is valid: if we are at the end of the pool, we need to discard 86 | # this sample 87 | if index == self._size - 1 and self._size <= self._max_pool_size: 88 | continue 89 | # if self._terminals[index]: 90 | # continue 91 | transition_index = (index + 1) % self._max_pool_size 92 | # make sure that the transition is valid: discard the transition if it crosses horizon-triggered resets 93 | if not self._terminals[index] and self._initials[transition_index]: 94 | continue 95 | indices[count] = index 96 | transition_indices[count] = transition_index 97 | count += 1 98 | return dict( 99 | observations=self._observations[indices], 100 | actions=self._actions[indices], 101 | rewards=self._rewards[indices], 102 | terminals=self._terminals[indices], 103 | initials=self._initials[indices], 104 | next_observations=self._observations[transition_indices] 105 | ) 106 | 107 | @property 108 | def size(self): 109 | return self._size 110 | -------------------------------------------------------------------------------- /bayesian_ddpg/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "args_data": 
"gANjcmxsYWIubWlzYy5pbnN0cnVtZW50ClN0dWJNZXRob2RDYWxsCnEAKYFxAX1xAihYCAAAAF9fa3dhcmdzcQN9cQRYBgAAAF9fYXJnc3EFKGNybGxhYi5taXNjLmluc3RydW1lbnQKU3R1Yk9iamVjdApxBimBcQd9cQgoWAsAAABwcm94eV9jbGFzc3EJY2RkcGdfYmF5ZXNpYW5fdGhvbXBzb24KRERQRwpxClgGAAAAa3dhcmdzcQt9cQwoWAgAAABkaXNjb3VudHENRz/vrhR64UeuWAgAAABuX2Vwb2Noc3EOSxRYAgAAAGVzcQ9oBimBcRB9cREoaAljZHJvcG91dF9leHBsb3JhdGlvbgpNQ0Ryb3BvdXQKcRJoC31xE1gIAAAAZW52X3NwZWNxFGNybGxhYi5taXNjLmluc3RydW1lbnQKU3R1YkF0dHIKcRUpgXEWfXEXKFgKAAAAX2F0dHJfbmFtZXEYWAQAAABzcGVjcRlYBAAAAF9vYmpxGmgGKYFxG31xHChoCWNzYW5kYm94LnJvY2t5LnRmLmVudnMuYmFzZQpUZkVudgpxHWgLfXEeWAsAAAB3cmFwcGVkX2VudnEfaAYpgXEgfXEhKGgJY3JsbGFiLmVudnMubm9ybWFsaXplZF9lbnYKTm9ybWFsaXplZEVudgpxImgLfXEjWAMAAABlbnZxJGgGKYFxJX1xJihoCWNybGxhYi5lbnZzLmd5bV9lbnYKR3ltRW52CnEnaAt9cSgoWAoAAAByZWNvcmRfbG9ncSmJWAgAAABlbnZfbmFtZXEqWAkAAABIb3BwZXItdjFxK1gLAAAAZm9yY2VfcmVzZXRxLIhYDAAAAHJlY29yZF92aWRlb3EtiXVYBAAAAGFyZ3NxLil1YnNoLil1YnNoLil1YnVic2guKXViWA0AAABtaW5fcG9vbF9zaXplcS9NECdYDAAAAHNjYWxlX3Jld2FyZHEwRz/wAAAAAAAAWAoAAABiYXRjaF9zaXplcTFLQFgQAAAAcWZfbGVhcm5pbmdfcmF0ZXEyRz9QYk3S8an8WAQAAABwbG90cTOJWAIAAABxZnE0aAYpgXE1fXE2KGgJY2NvbnRpbnVvdXNfbWxwX3FfZnVuY3Rpb25fYmF5ZXNpYW4KQ29udGludW91c01MUFFGdW5jdGlvbgpxN2gLfXE4KFgMAAAAaGlkZGVuX3NpemVzcTlLZEtkhnE6aBRoFSmBcTt9cTwoaBhoGWgaaBt1YlgTAAAAaGlkZGVuX25vbmxpbmVhcml0eXE9Y3RlbnNvcmZsb3cucHl0aG9uLm9wcy5nZW5fbm5fb3BzCnJlbHUKcT51aC4pdWJYFAAAAHBvbGljeV9sZWFybmluZ19yYXRlcT9HPxo24uscQy1YBgAAAHBvbGljeXFAaAYpgXFBfXFCKGgJY2RldGVybWluaXN0aWNfbWxwX3BvbGljeV9iYXllc2lhbgpEZXRlcm1pbmlzdGljTUxQUG9saWN5CnFDaAt9cUQoWAQAAABuYW1lcUVoQGg5S2RLMksZh3FGaBRoFSmBcUd9cUgoaBhoGWgaaBt1Ymg9aD51aC4pdWJYDAAAAGVwb2NoX2xlbmd0aHFJTegDWA8AAABtYXhfcGF0aF9sZW5ndGhxSmgVKYFxS31xTChoGFgHAAAAaG9yaXpvbnFNaBpoG3ViaCRoG3VoLil1YlgFAAAAdHJhaW5xTil9cU90cVB1Yi4=", 3 | "exp_name": "Trial_Bayesian_Exploration/", 4 | "json_args": { 5 | "algo": { 6 | "_name": "ddpg_bayesian_thompson.DDPG", 7 | "batch_size": 64, 8 | "discount": 0.99, 9 | "epoch_length": 1000, 10 | "es": { 11 | "_name": "dropout_exploration.MCDropout", 12 | "env_spec": { 13 | "attr": "spec", 14 | "obj": { 15 | "_name": "sandbox.rocky.tf.envs.base.TfEnv", 16 | "wrapped_env": { 17 | "_name": "rllab.envs.normalized_env.NormalizedEnv", 18 | "env": { 19 | "_name": "rllab.envs.gym_env.GymEnv", 20 | "env_name": "Hopper-v1", 21 | "force_reset": true, 22 | "record_log": false, 23 | "record_video": false 24 | } 25 | } 26 | } 27 | } 28 | }, 29 | "max_path_length": { 30 | "attr": "horizon", 31 | "obj": { 32 | "_name": "sandbox.rocky.tf.envs.base.TfEnv", 33 | "wrapped_env": { 34 | "_name": "rllab.envs.normalized_env.NormalizedEnv", 35 | "env": { 36 | "_name": "rllab.envs.gym_env.GymEnv", 37 | "env_name": "Hopper-v1", 38 | "force_reset": true, 39 | "record_log": false, 40 | "record_video": false 41 | } 42 | } 43 | } 44 | }, 45 | "min_pool_size": 10000, 46 | "n_epochs": 20, 47 | "plot": false, 48 | "policy_learning_rate": 0.0001, 49 | "qf": { 50 | "_name": "continuous_mlp_q_function_bayesian.ContinuousMLPQFunction", 51 | "env_spec": { 52 | "attr": "spec", 53 | "obj": { 54 | "_name": "sandbox.rocky.tf.envs.base.TfEnv", 55 | "wrapped_env": { 56 | "_name": "rllab.envs.normalized_env.NormalizedEnv", 57 | "env": { 58 | "_name": "rllab.envs.gym_env.GymEnv", 59 | "env_name": "Hopper-v1", 60 | "force_reset": true, 61 | "record_log": false, 62 | "record_video": false 63 | } 64 | } 65 | } 66 | }, 67 | "hidden_nonlinearity": "tensorflow.python.ops.gen_nn_ops.relu", 68 | "hidden_sizes": [ 69 | 100, 70 | 100 71 | ] 72 | }, 73 | "qf_learning_rate": 0.001, 74 | "scale_reward": 1.0 75 | }, 76 | 
"env": { 77 | "_name": "sandbox.rocky.tf.envs.base.TfEnv", 78 | "wrapped_env": { 79 | "_name": "rllab.envs.normalized_env.NormalizedEnv", 80 | "env": { 81 | "_name": "rllab.envs.gym_env.GymEnv", 82 | "env_name": "Hopper-v1", 83 | "force_reset": true, 84 | "record_log": false, 85 | "record_video": false 86 | } 87 | } 88 | }, 89 | "policy": { 90 | "_name": "deterministic_mlp_policy_bayesian.DeterministicMLPPolicy", 91 | "env_spec": { 92 | "attr": "spec", 93 | "obj": { 94 | "_name": "sandbox.rocky.tf.envs.base.TfEnv", 95 | "wrapped_env": { 96 | "_name": "rllab.envs.normalized_env.NormalizedEnv", 97 | "env": { 98 | "_name": "rllab.envs.gym_env.GymEnv", 99 | "env_name": "Hopper-v1", 100 | "force_reset": true, 101 | "record_log": false, 102 | "record_video": false 103 | } 104 | } 105 | } 106 | }, 107 | "hidden_nonlinearity": "tensorflow.python.ops.gen_nn_ops.relu", 108 | "hidden_sizes": [ 109 | 100, 110 | 50, 111 | 25 112 | ], 113 | "name": "policy" 114 | } 115 | }, 116 | "log_dir": "./bayesian_ddpg/", 117 | "log_tabular_only": false, 118 | "n_parallel": 1, 119 | "params_log_file": "params.json", 120 | "plot": false, 121 | "resume_from": null, 122 | "seed": 1, 123 | "snapshot_gap": 1, 124 | "snapshot_mode": "last", 125 | "tabular_log_file": "progress.csv", 126 | "text_log_file": "debug.log", 127 | "use_cloudpickle": false, 128 | "variant_data": null, 129 | "variant_log_file": "variant.json" 130 | } -------------------------------------------------------------------------------- /dropout_gal_neuralnet_tf_example.py: -------------------------------------------------------------------------------- 1 | #### Uncertainty in Deep Learning 2 | #### To keep the dropout during test time : 3 | #### https://medium.com/towards-data-science/adding-uncertainty-to-deep-learning-ecc2401f2013 4 | 5 | #### Another useful link 6 | #### https://github.com/tensorflow/tensorflow/issues/97 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import math 13 | import os 14 | 15 | import tensorflow as tf 16 | from tensorflow.examples.tutorials.mnist import input_data 17 | import pdb 18 | import numpy as np 19 | 20 | 21 | 22 | ### create TF graph session 23 | tf.reset_default_graph() 24 | sess = tf.Session() 25 | 26 | LOGDIR = './graphs' 27 | mnist = input_data.read_data_sets('/tmp/data', one_hot=True) 28 | 29 | 30 | #defining the model structure 31 | # number of neurons in each hidden layer 32 | HIDDEN1_SIZE = 500 33 | HIDDEN2_SIZE = 250 34 | NUM_CLASSES = 10 35 | NUM_PIXELS = 28 * 28 36 | 37 | # experiment with the nubmer of training steps to 38 | # see the effect 39 | TRAIN_STEPS = 2000 40 | BATCH_SIZE = 100 41 | 42 | LEARNING_RATE = 0.01 43 | 44 | ### creating the inputs for the model 45 | with tf.name_scope('input'): 46 | # Define inputs 47 | images = tf.placeholder(dtype=tf.float32, shape=[None, NUM_PIXELS]) 48 | labels = tf.placeholder(dtype=tf.float32, shape=[None, NUM_CLASSES]) 49 | 50 | 51 | # Function to create a fully connected layer 52 | def fc_layer(input, size_out, name="fc", activation=None): 53 | with tf.name_scope(name): 54 | size_in = int(input.shape[1]) 55 | w = tf.Variable(tf.truncated_normal([size_in, size_out], stddev=0.1), name="weights") 56 | b = tf.Variable(tf.constant(0.1, shape=[size_out]), name="bias") 57 | 58 | #output of the network 59 | wx_plus_b = tf.matmul(input, w) + b 60 | 61 | if activation: return activation(wx_plus_b) 62 | return wx_plus_b 63 | 64 | 65 | ### defining the network here 66 | fc1 = 
fc_layer(images, HIDDEN1_SIZE, "fc1", activation=tf.nn.relu) 67 | fc2 = fc_layer(fc1, HIDDEN2_SIZE, "fc2", activation=tf.nn.relu) 68 | dropped = tf.nn.dropout(fc2, keep_prob=0.9) 69 | #### network output 70 | y = fc_layer(dropped, NUM_CLASSES, name="output") 71 | 72 | 73 | 74 | with tf.name_scope("loss"): 75 | loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=labels)) 76 | tf.summary.scalar('loss', loss) 77 | 78 | 79 | with tf.name_scope("optimizer"): 80 | train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss) 81 | 82 | # Define evaluation 83 | with tf.name_scope("evaluation"): 84 | prediction = tf.argmax(y, 1) 85 | 86 | correct_prediction = tf.equal(prediction, tf.argmax(labels, 1)) 87 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 88 | tf.summary.scalar('accuracy', accuracy) 89 | 90 | 91 | train_writer = tf.summary.FileWriter(os.path.join(LOGDIR, "train")) 92 | train_writer.add_graph(sess.graph) 93 | test_writer = tf.summary.FileWriter(os.path.join(LOGDIR, "test")) 94 | summary_op = tf.summary.merge_all() 95 | 96 | ##### Constructing TF graph upto this 97 | 98 | 99 | sess.run(tf.global_variables_initializer()) 100 | 101 | 102 | MC_SAMPLES = 20 103 | for step in range(TRAIN_STEPS): 104 | batch_xs, batch_ys = mnist.train.next_batch(BATCH_SIZE) 105 | 106 | ### training results 107 | summary_result, _ = sess.run([summary_op, train], feed_dict={images: batch_xs, labels: batch_ys}) 108 | 109 | 110 | orig_predicted_y = sess.run([y], feed_dict = {images : mnist.test.images}) 111 | 112 | 113 | ## use a placeholder here? 114 | #All_MC_Predicted_Classes = tf.placeholder(dtype=tf.float32, shape=[10000, MC_SAMPLES]) 115 | All_MC_Predicted_Classes = np.zeros(shape=(10000, MC_SAMPLES)) 116 | 117 | 118 | for m in range(MC_SAMPLES): 119 | predicted_y = sess.run([y], feed_dict = {images : mnist.test.images}) 120 | 121 | ### using numpy` 122 | predicted_y = np.asarray(predicted_y) 123 | predicted_y = predicted_y[0, :, :] 124 | predicted_class = np.argmax(predicted_y, 1) 125 | pred = np.array([predicted_class]).T 126 | All_MC_Predicted_Classes[:, m] = predicted_class 127 | 128 | 129 | #predicted_class = tf.argmax(predicted_y, 1) 130 | # All_MC_Predicted_Classes[:, m] = tf.argmax(predicted_y, 1) 131 | 132 | Mean_Predicted_Classes = np.mean(All_MC_Predicted_Classes, axis=1) 133 | Variance_Predicted_Classes = np.var(All_MC_Predicted_Classes, axis=1) 134 | # Mean_Predicted_Classes = tf.reduce_mean(All_MC_Predicted_Classes, axis=1) 135 | # Mean_Predicted_Classes = Mean_Predicted_Classes.tolist() 136 | 137 | 138 | correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(labels,1)) 139 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) 140 | 141 | #acc = sess.run(accuracy, feed_dict={images: mnist.test.images, labels: mnist.test.labels}) 142 | 143 | 144 | print ("Step", step) 145 | # print ("Test Accuracy", acc) 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | """ 157 | Doing computation on the TF graph 158 | """ 159 | 160 | # sess.run(tf.global_variables_initializer()) 161 | # ###### Example to calculate MC Sample based accuracy using MCDropout 162 | # MC_SAMPLES = 10 163 | # for step in range(TRAIN_STEPS): 164 | # batch_xs, batch_ys = mnist.train.next_batch(BATCH_SIZE) 165 | # summary_result, train_result = sess.run([summary_op, train], feed_dict={images: batch_xs, labels: batch_ys}) 166 | 167 | # # calculate accuracy on the test set, every 100 steps. 
168 | # # we're using the entire test set here, so this will be a bit slow 169 | # if step % 100 == 0: 170 | # print ("Step Number", step) 171 | # #_, acc = sess.run([summary_op, accuracy], feed_dict={images: mnist.test.images, labels: mnist.test.labels}) 172 | 173 | # All_Accuracy = np.zeros(shape=([MC_SAMPLES])) 174 | # for m in range(MC_SAMPLES): 175 | # _, acc = sess.run([summary_op, accuracy], feed_dict={images: mnist.test.images, labels: mnist.test.labels}) 176 | # acc = np.array([acc]) 177 | 178 | # All_Accuracy[m] = acc 179 | 180 | # print ("All Accuracy", All_Accuracy) 181 | # print ("Mean Accuracy", np.mean(All_Accuracy)) 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | """ 190 | To keep the dropout at test time 191 | """ 192 | 193 | # keep the dropout during test time 194 | #mc_post = [sess.run(nn, feed_dict={x: data}) for _ in range(100)] 195 | 196 | #and then we need sample variance + inverse precision 197 | # def _tau_inv(keep_prob, N, l2=0.005, lambda_=0.00001): 198 | # tau = keep_prob * l2 / (2. * N * lambda_) 199 | # return 1. / tau 200 | 201 | # np.var(mc_post) + _tau_inv(0.5, 100) 202 | -------------------------------------------------------------------------------- /variational_inference_examples.py: -------------------------------------------------------------------------------- 1 | ## from : https://gist.github.com/tokestermw/a9de2ef498a09747bbf673ddf6ea4843 2 | ## tokestermw/tf_ed_vi_tutorial.py 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import sys 8 | import json 9 | import tqdm 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | import edward as ed 14 | 15 | N_FEATURES = 2 16 | DATA_LENGTH = 3 17 | 18 | REAL_WEIGHT = 0.7 19 | REAL_BIAS = 2.5 20 | REAL_DATA = np.array([2., 4., 6.]) 21 | REAL_LABELS = REAL_WEIGHT * REAL_DATA + REAL_BIAS 22 | 23 | MC_SAMPLES = 100 24 | 25 | 26 | def create_fake_data(): 27 | data = np.random.randint(0, 10, DATA_LENGTH).astype(np.float32) 28 | noise = np.random.randn(DATA_LENGTH).astype(np.float32) 29 | labels = REAL_WEIGHT * data + REAL_BIAS + noise 30 | return data, labels 31 | 32 | 33 | class Model: 34 | def __init__(self, model_name, dropout_keep_prob=1.0): 35 | self.model_name = model_name 36 | self.dropout_keep_prob = dropout_keep_prob 37 | 38 | self.x = tf.placeholder(tf.float32, (DATA_LENGTH, )) 39 | self.y = tf.placeholder(tf.float32, (DATA_LENGTH, )) 40 | 41 | if self.model_name == "simple_linear": 42 | # -- set parameters 43 | self.bias = tf.get_variable("bias", [1]) 44 | self.weight = tf.get_variable("weight", [1]) # should be 1. 
45 | 46 | # -- set dropout (optional) 47 | self.add_dropout() 48 | 49 | # -- set model 50 | self.nn = self.weight * self.x + self.bias 51 | 52 | # -- set loss 53 | self.loss = tf.reduce_mean((self.y - self.nn) ** 2) 54 | 55 | elif self.model_name == "bayesian_simple_linear": 56 | # -- set priors 57 | self.weight_mu = tf.zeros(1) # tf.get_variable("weight_mu", [1]) 58 | self.weight_sigma = tf.ones(1) # fixed hyperparameters 59 | self.weight = ed.models.Normal(mu=self.weight_mu, sigma=self.weight_sigma) 60 | 61 | self.bias_mu = tf.zeros(1) # tf.get_variable("bias_mu", [1]) 62 | self.bias_sigma = tf.ones(1) # fixed hyperparameters 63 | self.bias = ed.models.Normal(mu=self.bias_mu, sigma=self.bias_sigma) 64 | 65 | # -- set model 66 | self.nn_mean = self.weight * self.x + self.bias 67 | self.nn_sigma = tf.ones(1) # fixed hyperparameters 68 | self.nn = ed.models.Normal(mu=self.nn_mean, sigma=self.nn_sigma) 69 | 70 | # -- set variational parameters 71 | self.qweight = ed.models.Normal( 72 | mu=tf.get_variable("qweight_mu", initializer=tf.random_normal([1])), 73 | sigma=tf.nn.softplus(tf.get_variable("qweight_sigma", initializer=tf.random_normal([1])))) 74 | 75 | self.qbias = ed.models.Normal( 76 | mu=tf.get_variable("qbias_mu", initializer=tf.random_normal([1])), 77 | sigma=tf.nn.softplus(tf.get_variable("qbias_sigma", initializer=tf.random_normal([1])))) 78 | 79 | # -- inference 80 | self.latent_vars = {self.weight: self.qweight, self.bias: self.qbias} 81 | self.data = {self.nn: self.y} 82 | 83 | self.loss = (self.latent_vars, self.data) 84 | 85 | else: 86 | raise ValueError("Wrong model error.") 87 | 88 | def add_dropout(self): 89 | self._keep_prob = tf.Variable(name="keep_prob", initial_value=self.dropout_keep_prob, trainable=False) 90 | 91 | self.bias = tf.cond( 92 | self._keep_prob < 1.0, lambda: tf.nn.dropout(self.bias, keep_prob=self._keep_prob), lambda: self.bias) 93 | self.weight = tf.cond( 94 | self._keep_prob < 1.0, lambda: tf.nn.dropout(self.weight, keep_prob=self._keep_prob), lambda: self.weight) 95 | 96 | @property 97 | def keep_prob(self): 98 | return self._keep_prob 99 | 100 | def optimize(self): 101 | 102 | if _is_loss_function(self.loss): 103 | # loss optimization 104 | self.optimizer = tf.train.GradientDescentOptimizer(.005) 105 | self.train_op = self.optimizer.minimize(self.loss) 106 | 107 | else: 108 | # variational inference 109 | latent_vars, data = self.loss 110 | self.inference = ed.KLqp(latent_vars=latent_vars, data=data) 111 | self.inference.initialize() 112 | self.train_op = self.inference 113 | 114 | return self.train_op 115 | 116 | 117 | def _is_loss_function(loss): 118 | return isinstance(loss, tf.Tensor) 119 | 120 | 121 | def _section(text): 122 | print("-"*10 + " ", text.upper()) 123 | 124 | 125 | def _tau_inv(keep_prob, N, l2=0.005, lambda_=0.00001): 126 | # -- Variational Dropout Uncertainty Interval by Gal 127 | # https://github.com/yaringal/DropoutUncertaintyDemos/blob/master/convnetjs/regression_uncertainty.js 128 | tau = keep_prob * l2 / (2. * N * lambda_) 129 | return 1. 
/ tau 130 | 131 | 132 | def main(model_name, dropout_keep_prob=1.0): 133 | _section("set model") 134 | model = Model(model_name, dropout_keep_prob) 135 | train_op = model.optimize() 136 | 137 | local_init_op = tf.local_variables_initializer() 138 | global_init_op = tf.global_variables_initializer() 139 | 140 | tvars = tf.trainable_variables() 141 | 142 | _section("train") 143 | with tf.Session() as sess: 144 | sess.run([local_init_op, global_init_op]) 145 | 146 | tq = tqdm.trange(2000) 147 | for it in tq: 148 | data, labels = create_fake_data() 149 | 150 | if _is_loss_function(model.loss): 151 | sess.run(train_op, feed_dict={ 152 | model.x: data, 153 | model.y: labels, 154 | }) 155 | weight_, bias_ = sess.run([model.weight, model.bias]) 156 | tq.set_postfix(weight=weight_, bias=bias_) 157 | 158 | else: 159 | train_op.update(feed_dict={ 160 | model.x: data, 161 | model.y: labels, 162 | }) 163 | weight_, bias_ = sess.run([model.weight.value(), model.bias.value()]) 164 | tq.set_postfix(weight=weight_, bias=bias_) 165 | 166 | print("trainable variables:", json.dumps({t.name: sess.run(t).tolist() for t in tvars}, indent=4)) 167 | 168 | _section("predict on sample data") 169 | print("real weight", REAL_WEIGHT) 170 | print("real bias", REAL_BIAS) 171 | print("real data", REAL_DATA) 172 | print("real labels", REAL_LABELS) 173 | 174 | # -- checking out the variable distribution 175 | if _is_loss_function(model.loss): 176 | if dropout_keep_prob < 1.0: 177 | # don't do dropout for point estimate 178 | sess.run(model.keep_prob.assign(1.0)) 179 | _section("loss optimization w/ dropout") 180 | else: 181 | _section("loss optimization") 182 | 183 | nn_point_estimate, weight_point_estimate, bias_point_estimate = \ 184 | sess.run([model.nn, model.weight, model.bias], feed_dict={ 185 | model.x: REAL_DATA, 186 | model.y: REAL_LABELS, 187 | }) 188 | print("weight point estimate", weight_point_estimate) 189 | print("bias point estimate", bias_point_estimate) 190 | print("nn point estimate", nn_point_estimate) 191 | print("mean absolute error", np.mean(np.absolute(nn_point_estimate - REAL_LABELS))) 192 | 193 | if dropout_keep_prob < 1.0: 194 | _section("monte carlo simulations") 195 | 196 | sess.run(model.keep_prob.assign(dropout_keep_prob)) 197 | 198 | nn_mc = [] 199 | for _ in range(MC_SAMPLES): 200 | nn_mc.append(sess.run([model.nn], feed_dict={model.x: REAL_DATA, model.y: REAL_LABELS})) 201 | nn_mc = np.array(nn_mc) 202 | 203 | print("monte carlo nn mean", np.mean(nn_mc, axis=0)) 204 | print("monte carlo nn variance", np.var(nn_mc, axis=0)) 205 | print("+ Gal inverse precision", np.var(nn_mc, axis=0) + _tau_inv(dropout_keep_prob, MC_SAMPLES)) 206 | print("mean absolute error", np.mean(np.absolute(np.mean(nn_mc, axis=0) - REAL_LABELS))) 207 | 208 | else: 209 | _section("variational inference") 210 | 211 | weight_mean, weight_var = sess.run(tf.nn.moments(model.qweight.sample(MC_SAMPLES), axes=[0])) 212 | print("weight posterior mean and variance", weight_mean, weight_var) 213 | bias_mean, bias_var = sess.run(tf.nn.moments(model.qbias.sample(MC_SAMPLES), axes=[0])) 214 | print("bias posterior mean and variance", bias_mean, bias_var) 215 | 216 | _section("prior predictive checks") 217 | prior = ed.copy(model.nn, { 218 | model.weight: model.weight.mean(), model.bias: model.bias.mean(), 219 | }, scope="copied/prior") 220 | 221 | nn_prior = [] 222 | for _ in range(MC_SAMPLES): 223 | nn_prior.append(sess.run(prior.value(), feed_dict={model.x: REAL_DATA, model.y: REAL_LABELS})) 224 | nn_prior = np.array(nn_prior) 225 
| 226 | print("nn prior mean and variance", np.mean(nn_prior, axis=0), np.var(nn_prior, axis=0)) 227 | print("mean absolute error", np.mean(np.absolute(np.mean(nn_prior, axis=0) - REAL_LABELS))) 228 | 229 | _section("posterior predictive checks") 230 | posterior = ed.copy(model.nn, dict_swap={ 231 | model.weight: model.qweight.mean(), model.bias: model.qbias.mean(), 232 | }, scope="copied/posterior") 233 | 234 | nn_post = sess.run(posterior.sample(MC_SAMPLES), feed_dict={model.x: REAL_DATA}) 235 | 236 | print("nn posterior mean and variance", np.mean(nn_post, axis=0), np.var(nn_post, axis=0)) 237 | print("mean absolute error", np.mean(np.absolute(np.mean(nn_post, axis=0) - REAL_LABELS))) 238 | 239 | 240 | # TODO: plot? 241 | 242 | 243 | if __name__ == '__main__': 244 | """ Try the following: 245 | >>> python tf_vi_tutorial.py simple_linear 246 | >>> python tf_vi_tutorial.py simple_linear .9 247 | >>> python tf_vi_tutorial.py bayesian_simple_linear 248 | """ 249 | args = sys.argv 250 | 251 | if len(args) == 1: 252 | main("simple_linear") 253 | elif len(args) == 2: 254 | _, model_name = args 255 | main(model_name) 256 | elif len(args) == 3: 257 | _, model_name, dropout_keep_prob = args 258 | dropout_keep_prob = float(dropout_keep_prob) 259 | assert 0 < dropout_keep_prob <= 1.0, "keep it real" 260 | main(model_name, float(dropout_keep_prob)) -------------------------------------------------------------------------------- /bayesian_ddpg/debug.log: -------------------------------------------------------------------------------- 1 | 2017-07-12 09:22:32.919231 EDT | [Trial_Bayesian_Exploration/] observation space: Box(11,) 2 | 2017-07-12 09:22:32.919512 EDT | [Trial_Bayesian_Exploration/] action space: Box(3,) 3 | 2017-07-12 09:22:33.415071 EDT | [Trial_Bayesian_Exploration/] Populating workers... 
4 | 2017-07-12 09:22:33.415311 EDT | [Trial_Bayesian_Exploration/] Populated 5 | 2017-07-12 09:22:35.323080 EDT | [Trial_Bayesian_Exploration/] epoch #0 | Training started 6 | 2017-07-12 09:22:36.167981 EDT | [Trial_Bayesian_Exploration/] epoch #0 | Training finished 7 | 2017-07-12 09:22:36.168245 EDT | [Trial_Bayesian_Exploration/] epoch #0 | Trained qf 0 steps, policy 0 steps 8 | 2017-07-12 09:22:36.168460 EDT | [Trial_Bayesian_Exploration/] epoch #1 | Training started 9 | 2017-07-12 09:22:37.006537 EDT | [Trial_Bayesian_Exploration/] epoch #1 | Training finished 10 | 2017-07-12 09:22:37.006788 EDT | [Trial_Bayesian_Exploration/] epoch #1 | Trained qf 0 steps, policy 0 steps 11 | 2017-07-12 09:22:37.006989 EDT | [Trial_Bayesian_Exploration/] epoch #2 | Training started 12 | 2017-07-12 09:22:37.828863 EDT | [Trial_Bayesian_Exploration/] epoch #2 | Training finished 13 | 2017-07-12 09:22:37.829077 EDT | [Trial_Bayesian_Exploration/] epoch #2 | Trained qf 0 steps, policy 0 steps 14 | 2017-07-12 09:22:37.829240 EDT | [Trial_Bayesian_Exploration/] epoch #3 | Training started 15 | 2017-07-12 09:22:38.652386 EDT | [Trial_Bayesian_Exploration/] epoch #3 | Training finished 16 | 2017-07-12 09:22:38.652639 EDT | [Trial_Bayesian_Exploration/] epoch #3 | Trained qf 0 steps, policy 0 steps 17 | 2017-07-12 09:22:38.652842 EDT | [Trial_Bayesian_Exploration/] epoch #4 | Training started 18 | 2017-07-12 09:22:39.469835 EDT | [Trial_Bayesian_Exploration/] epoch #4 | Training finished 19 | 2017-07-12 09:22:39.470033 EDT | [Trial_Bayesian_Exploration/] epoch #4 | Trained qf 0 steps, policy 0 steps 20 | 2017-07-12 09:22:39.470186 EDT | [Trial_Bayesian_Exploration/] epoch #5 | Training started 21 | 2017-07-12 09:22:40.297185 EDT | [Trial_Bayesian_Exploration/] epoch #5 | Training finished 22 | 2017-07-12 09:22:40.297431 EDT | [Trial_Bayesian_Exploration/] epoch #5 | Trained qf 0 steps, policy 0 steps 23 | 2017-07-12 09:22:40.297639 EDT | [Trial_Bayesian_Exploration/] epoch #6 | Training started 24 | 2017-07-12 09:22:41.124994 EDT | [Trial_Bayesian_Exploration/] epoch #6 | Training finished 25 | 2017-07-12 09:22:41.125214 EDT | [Trial_Bayesian_Exploration/] epoch #6 | Trained qf 0 steps, policy 0 steps 26 | 2017-07-12 09:22:41.125367 EDT | [Trial_Bayesian_Exploration/] epoch #7 | Training started 27 | 2017-07-12 09:22:41.955415 EDT | [Trial_Bayesian_Exploration/] epoch #7 | Training finished 28 | 2017-07-12 09:22:41.955668 EDT | [Trial_Bayesian_Exploration/] epoch #7 | Trained qf 0 steps, policy 0 steps 29 | 2017-07-12 09:22:41.955873 EDT | [Trial_Bayesian_Exploration/] epoch #8 | Training started 30 | 2017-07-12 09:22:42.786357 EDT | [Trial_Bayesian_Exploration/] epoch #8 | Training finished 31 | 2017-07-12 09:22:42.786557 EDT | [Trial_Bayesian_Exploration/] epoch #8 | Trained qf 0 steps, policy 0 steps 32 | 2017-07-12 09:22:42.786709 EDT | [Trial_Bayesian_Exploration/] epoch #9 | Training started 33 | 2017-07-12 09:22:43.779252 EDT | [Trial_Bayesian_Exploration/] epoch #9 | Training finished 34 | 2017-07-12 09:22:43.779507 EDT | [Trial_Bayesian_Exploration/] epoch #9 | Trained qf 1 steps, policy 1 steps 35 | 2017-07-12 09:22:43.779679 EDT | [Trial_Bayesian_Exploration/] epoch #9 | Collecting samples for evaluation 36 | 2017-07-12 09:22:51.221690 EDT | ----------------------- --------- 37 | 2017-07-12 09:22:51.221938 EDT | Epoch 9 38 | 2017-07-12 09:22:51.222175 EDT | Iteration 9 39 | 2017-07-12 09:22:51.222309 EDT | AverageReturn 9.1814 40 | 2017-07-12 09:22:51.222531 EDT | StdReturn 0.046539 41 | 2017-07-12 
09:22:51.222720 EDT | MaxReturn 9.3185 42 | 2017-07-12 09:22:51.222896 EDT | MinReturn 9.07004 43 | 2017-07-12 09:22:51.223063 EDT | AverageEsReturn 9.90701 44 | 2017-07-12 09:22:51.223227 EDT | StdEsReturn 7.05845 45 | 2017-07-12 09:22:51.223392 EDT | MaxEsReturn 54.8695 46 | 2017-07-12 09:22:51.223556 EDT | MinEsReturn 1.67899 47 | 2017-07-12 09:22:51.223720 EDT | AverageDiscountedReturn 8.76006 48 | 2017-07-12 09:22:51.223900 EDT | AverageQLoss 0.74919 49 | 2017-07-12 09:22:51.224063 EDT | AveragePolicySurr -0.053608 50 | 2017-07-12 09:22:51.224257 EDT | AverageQ -0.243523 51 | 2017-07-12 09:22:51.224422 EDT | AverageAbsQ 0.365071 52 | 2017-07-12 09:22:51.224587 EDT | AverageY 0.48041 53 | 2017-07-12 09:22:51.224750 EDT | AverageAbsY 0.593881 54 | 2017-07-12 09:22:51.224914 EDT | AverageAbsQYDiff 0.75809 55 | 2017-07-12 09:22:51.225078 EDT | AverageAction 0.218841 56 | 2017-07-12 09:22:51.225240 EDT | PolicyRegParamNorm 11.23 57 | 2017-07-12 09:22:51.225402 EDT | QFunRegParamNorm 11.13 58 | 2017-07-12 09:22:51.225564 EDT | ----------------------- --------- 59 | 2017-07-12 09:22:51.225882 EDT | [Trial_Bayesian_Exploration/] epoch #10 | Training started 60 | 2017-07-12 09:24:18.240014 EDT | [Trial_Bayesian_Exploration/] epoch #10 | Training finished 61 | 2017-07-12 09:24:18.240239 EDT | [Trial_Bayesian_Exploration/] epoch #10 | Trained qf 1000 steps, policy 1000 steps 62 | 2017-07-12 09:24:18.240429 EDT | [Trial_Bayesian_Exploration/] epoch #10 | Collecting samples for evaluation 63 | 2017-07-12 09:24:26.551050 EDT | ----------------------- ---------- 64 | 2017-07-12 09:24:26.551356 EDT | Epoch 10 65 | 2017-07-12 09:24:26.551567 EDT | Iteration 10 66 | 2017-07-12 09:24:26.551764 EDT | AverageReturn 1.70342 67 | 2017-07-12 09:24:26.552028 EDT | StdReturn 0.0811045 68 | 2017-07-12 09:24:26.552227 EDT | MaxReturn 1.91245 69 | 2017-07-12 09:24:26.552417 EDT | MinReturn 1.49564 70 | 2017-07-12 09:24:26.552608 EDT | AverageEsReturn 2.15915 71 | 2017-07-12 09:24:26.552795 EDT | StdEsReturn 1.56574 72 | 2017-07-12 09:24:26.552983 EDT | MaxEsReturn 8.61305 73 | 2017-07-12 09:24:26.553170 EDT | MinEsReturn 0.0122472 74 | 2017-07-12 09:24:26.553358 EDT | AverageDiscountedReturn 1.70881 75 | 2017-07-12 09:24:26.553544 EDT | AverageQLoss 0.0712594 76 | 2017-07-12 09:24:26.553731 EDT | AveragePolicySurr -0.672253 77 | 2017-07-12 09:24:26.553918 EDT | AverageQ 0.795785 78 | 2017-07-12 09:24:26.554105 EDT | AverageAbsQ 0.823965 79 | 2017-07-12 09:24:26.554291 EDT | AverageY 0.805172 80 | 2017-07-12 09:24:26.554476 EDT | AverageAbsY 0.84508 81 | 2017-07-12 09:24:26.554662 EDT | AverageAbsQYDiff 0.177415 82 | 2017-07-12 09:24:26.554848 EDT | AverageAction 0.999596 83 | 2017-07-12 09:24:26.555033 EDT | PolicyRegParamNorm 13.2281 84 | 2017-07-12 09:24:26.555220 EDT | QFunRegParamNorm 11.3933 85 | 2017-07-12 09:24:26.555406 EDT | ----------------------- ---------- 86 | 2017-07-12 09:24:26.555707 EDT | [Trial_Bayesian_Exploration/] epoch #11 | Training started 87 | 2017-07-12 09:25:46.611911 EDT | [Trial_Bayesian_Exploration/] epoch #11 | Training finished 88 | 2017-07-12 09:25:46.612170 EDT | [Trial_Bayesian_Exploration/] epoch #11 | Trained qf 1000 steps, policy 1000 steps 89 | 2017-07-12 09:25:46.612352 EDT | [Trial_Bayesian_Exploration/] epoch #11 | Collecting samples for evaluation 90 | 2017-07-12 09:25:54.435698 EDT | ----------------------- ---------- 91 | 2017-07-12 09:25:54.436028 EDT | Epoch 11 92 | 2017-07-12 09:25:54.436392 EDT | Iteration 11 93 | 2017-07-12 09:25:54.436696 EDT | AverageReturn 
3.58419 94 | 2017-07-12 09:25:54.436997 EDT | StdReturn 0.115737 95 | 2017-07-12 09:25:54.437365 EDT | MaxReturn 3.89929 96 | 2017-07-12 09:25:54.437606 EDT | MinReturn 3.26895 97 | 2017-07-12 09:25:54.437795 EDT | AverageEsReturn 4.0568 98 | 2017-07-12 09:25:54.437981 EDT | StdEsReturn 0.737778 99 | 2017-07-12 09:25:54.438166 EDT | MaxEsReturn 6.41711 100 | 2017-07-12 09:25:54.438353 EDT | MinEsReturn 1.23751 101 | 2017-07-12 09:25:54.438526 EDT | AverageDiscountedReturn 3.5301 102 | 2017-07-12 09:25:54.438680 EDT | AverageQLoss 0.0288876 103 | 2017-07-12 09:25:54.438924 EDT | AveragePolicySurr -0.974544 104 | 2017-07-12 09:25:54.439177 EDT | AverageQ 1.1393 105 | 2017-07-12 09:25:54.439456 EDT | AverageAbsQ 1.19482 106 | 2017-07-12 09:25:54.439660 EDT | AverageY 1.14473 107 | 2017-07-12 09:25:54.439862 EDT | AverageAbsY 1.20289 108 | 2017-07-12 09:25:54.440104 EDT | AverageAbsQYDiff 0.123579 109 | 2017-07-12 09:25:54.440304 EDT | AverageAction 1 110 | 2017-07-12 09:25:54.440497 EDT | PolicyRegParamNorm 13.4963 111 | 2017-07-12 09:25:54.440679 EDT | QFunRegParamNorm 11.7883 112 | 2017-07-12 09:25:54.440858 EDT | ----------------------- ---------- 113 | 2017-07-12 09:25:54.441183 EDT | [Trial_Bayesian_Exploration/] epoch #12 | Training started 114 | 2017-07-12 09:27:13.633224 EDT | [Trial_Bayesian_Exploration/] epoch #12 | Training finished 115 | 2017-07-12 09:27:13.633485 EDT | [Trial_Bayesian_Exploration/] epoch #12 | Trained qf 1000 steps, policy 1000 steps 116 | 2017-07-12 09:27:13.633668 EDT | [Trial_Bayesian_Exploration/] epoch #12 | Collecting samples for evaluation 117 | 2017-07-12 09:27:21.520496 EDT | ----------------------- ---------- 118 | 2017-07-12 09:27:21.520771 EDT | Epoch 12 119 | 2017-07-12 09:27:21.521064 EDT | Iteration 12 120 | 2017-07-12 09:27:21.521281 EDT | AverageReturn 3.58477 121 | 2017-07-12 09:27:21.521496 EDT | StdReturn 0.113468 122 | 2017-07-12 09:27:21.521707 EDT | MaxReturn 3.89855 123 | 2017-07-12 09:27:21.521917 EDT | MinReturn 3.29676 124 | 2017-07-12 09:27:21.522127 EDT | AverageEsReturn 4.13938 125 | 2017-07-12 09:27:21.522378 EDT | StdEsReturn 0.57574 126 | 2017-07-12 09:27:21.522593 EDT | MaxEsReturn 6.29128 127 | 2017-07-12 09:27:21.522809 EDT | MinEsReturn 2.1017 128 | 2017-07-12 09:27:21.523011 EDT | AverageDiscountedReturn 3.53067 129 | 2017-07-12 09:27:21.523216 EDT | AverageQLoss 0.0412518 130 | 2017-07-12 09:27:21.523416 EDT | AveragePolicySurr -1.35587 131 | 2017-07-12 09:27:21.523616 EDT | AverageQ 1.59025 132 | 2017-07-12 09:27:21.523829 EDT | AverageAbsQ 1.66405 133 | 2017-07-12 09:27:21.524035 EDT | AverageY 1.59587 134 | 2017-07-12 09:27:21.524242 EDT | AverageAbsY 1.67236 135 | 2017-07-12 09:27:21.524449 EDT | AverageAbsQYDiff 0.144135 136 | 2017-07-12 09:27:21.524655 EDT | AverageAction 1 137 | 2017-07-12 09:27:21.524875 EDT | PolicyRegParamNorm 13.4963 138 | 2017-07-12 09:27:21.525082 EDT | QFunRegParamNorm 12.3922 139 | 2017-07-12 09:27:21.525289 EDT | ----------------------- ---------- 140 | 2017-07-12 09:27:21.525633 EDT | [Trial_Bayesian_Exploration/] epoch #13 | Training started 141 | -------------------------------------------------------------------------------- /ddpg_bayesian_thompson.py: -------------------------------------------------------------------------------- 1 | # FROM: https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/sandbox/rocky/tf/algos/ddpg.py 2 | from rllab.algos.base import RLAlgorithm 3 | from rllab.misc.overrides import overrides 4 | from rllab.misc import special 5 | from 
sandbox.rocky.tf.misc import tensor_utils 6 | from rllab.sampler import parallel_sampler 7 | from rllab.plotter import plotter 8 | from rllab.misc import ext 9 | import rllab.misc.logger as logger 10 | #import pickle as pickle 11 | import numpy as np 12 | import pyprind 13 | import tensorflow as tf 14 | from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer 15 | #from sandbox.rocky.tf.core.parameterized import suppress_params_loading 16 | from rllab.core.serializable import Serializable 17 | from sampling_utils import SimpleReplayPool 18 | 19 | class DDPG(RLAlgorithm): 20 | """ 21 | Deep Deterministic Policy Gradient. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | env, 27 | policy, 28 | qf, 29 | es, 30 | batch_size=32, 31 | n_epochs=200, 32 | epoch_length=1000, 33 | min_pool_size=10000, 34 | replay_pool_size=1000000, 35 | replacement_prob=1.0, 36 | discount=0.99, 37 | max_path_length=250, 38 | qf_weight_decay=0., 39 | qf_update_method='adam', 40 | qf_learning_rate=1e-3, 41 | policy_weight_decay=0, 42 | policy_update_method='adam', 43 | policy_learning_rate=1e-3, 44 | policy_updates_ratio=1.0, 45 | eval_samples=10000, 46 | soft_target=True, 47 | soft_target_tau=0.001, 48 | n_updates_per_sample=1, 49 | scale_reward=1.0, 50 | include_horizon_terminal_transitions=False, 51 | plot=False, 52 | pause_for_plot=False): 53 | """ 54 | :param env: Environment 55 | :param policy: Policy 56 | :param qf: Q function 57 | :param es: Exploration strategy 58 | :param batch_size: Number of samples for each minibatch. 59 | :param n_epochs: Number of epochs. Policy will be evaluated after each epoch. 60 | :param epoch_length: How many timesteps for each epoch. 61 | :param min_pool_size: Minimum size of the pool to start training. 62 | :param replay_pool_size: Size of the experience replay pool. 63 | :param discount: Discount factor for the cumulative return. 64 | :param max_path_length: Discount factor for the cumulative return. 65 | :param qf_weight_decay: Weight decay factor for parameters of the Q function. 66 | :param qf_update_method: Online optimization method for training Q function. 67 | :param qf_learning_rate: Learning rate for training Q function. 68 | :param policy_weight_decay: Weight decay factor for parameters of the policy. 69 | :param policy_update_method: Online optimization method for training the policy. 70 | :param policy_learning_rate: Learning rate for training the policy. 71 | :param eval_samples: Number of samples (timesteps) for evaluating the policy. 72 | :param soft_target_tau: Interpolation parameter for doing the soft target update. 73 | :param n_updates_per_sample: Number of Q function and policy updates per new sample obtained 74 | :param scale_reward: The scaling factor applied to the rewards when training 75 | :param include_horizon_terminal_transitions: whether to include transitions with terminal=True because the 76 | horizon was reached. This might make the Q value back up less stable for certain tasks. 77 | :param plot: Whether to visualize the policy performance after each eval_interval. 78 | :param pause_for_plot: Whether to pause before continuing when plotting. 
79 | :return: 80 | """ 81 | self.env = env 82 | self.policy = policy 83 | self.qf = qf 84 | self.es = es 85 | self.batch_size = batch_size 86 | self.n_epochs = n_epochs 87 | self.epoch_length = epoch_length 88 | self.min_pool_size = min_pool_size 89 | self.replay_pool_size = replay_pool_size 90 | self.replacement_prob = replacement_prob 91 | self.discount = discount 92 | self.max_path_length = max_path_length 93 | self.qf_weight_decay = qf_weight_decay 94 | self.qf_update_method = \ 95 | FirstOrderOptimizer( 96 | update_method=qf_update_method, 97 | learning_rate=qf_learning_rate, 98 | ) 99 | self.qf_learning_rate = qf_learning_rate 100 | self.policy_weight_decay = policy_weight_decay 101 | 102 | 103 | self.policy_update_method = \ 104 | FirstOrderOptimizer( 105 | update_method=policy_update_method, 106 | learning_rate=policy_learning_rate, 107 | ) 108 | self.policy_learning_rate = policy_learning_rate 109 | self.policy_updates_ratio = policy_updates_ratio 110 | self.eval_samples = eval_samples 111 | self.soft_target_tau = soft_target_tau 112 | self.n_updates_per_sample = n_updates_per_sample 113 | self.include_horizon_terminal_transitions = include_horizon_terminal_transitions 114 | self.plot = plot 115 | self.pause_for_plot = pause_for_plot 116 | 117 | self.qf_loss_averages = [] 118 | self.policy_surr_averages = [] 119 | self.q_averages = [] 120 | self.y_averages = [] 121 | self.paths = [] 122 | self.es_path_returns = [] 123 | self.paths_samples_cnt = 0 124 | 125 | self.scale_reward = scale_reward 126 | 127 | self.train_policy_itr = 0 128 | 129 | self.opt_info = None 130 | 131 | def start_worker(self): 132 | parallel_sampler.populate_task(self.env, self.policy) 133 | if self.plot: 134 | plotter.init_plot(self.env, self.policy) 135 | 136 | @overrides 137 | def train(self): 138 | with tf.Session() as sess: 139 | sess.run(tf.global_variables_initializer()) 140 | # This seems like a rather sequential method 141 | pool = SimpleReplayPool( 142 | max_pool_size=self.replay_pool_size, 143 | observation_dim=self.env.observation_space.flat_dim, 144 | action_dim=self.env.action_space.flat_dim, 145 | replacement_prob=self.replacement_prob, 146 | ) 147 | self.start_worker() 148 | 149 | self.init_opt() 150 | # This initializes the optimizer parameters 151 | sess.run(tf.global_variables_initializer()) 152 | itr = 0 153 | path_length = 0 154 | path_return = 0 155 | terminal = False 156 | initial = False 157 | observation = self.env.reset() 158 | 159 | with tf.variable_scope("sample_policy"): 160 | sample_policy = Serializable.clone(self.policy) 161 | 162 | for epoch in range(self.n_epochs): 163 | logger.push_prefix('epoch #%d | ' % epoch) 164 | logger.log("Training started") 165 | train_qf_itr, train_policy_itr = 0, 0 166 | 167 | #sample a policy function from the posterior at every episode 168 | #move in the entire episode with the sampled policy function? 
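# Illustrative answer to the question above (not code from this repo): with MC dropout,
# "sampling a policy function from the posterior" would mean drawing one set of dropout
# masks when an episode starts and holding it fixed for the whole rollout, e.g.
#
#     if terminal:
#         sample_policy.resample_dropout_masks()   # hypothetical helper, not in this repo
#
# As written, the loop below instead resets sample_policy and self.es on terminal and lets
# the exploration strategy self.es decide how actions are drawn from sample_policy.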
169 | 170 | for epoch_itr in pyprind.prog_bar(range(self.epoch_length)): 171 | # Execute policy 172 | if terminal: # or path_length > self.max_path_length: 173 | # Note that if the last time step ends an episode, the very 174 | # last state and observation will be ignored and not added 175 | # to the replay pool 176 | observation = self.env.reset() 177 | self.es.reset() 178 | sample_policy.reset() 179 | self.es_path_returns.append(path_return) 180 | path_length = 0 181 | path_return = 0 182 | initial = True 183 | else: 184 | initial = False 185 | 186 | action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf) 187 | 188 | 189 | next_observation, reward, terminal, _ = self.env.step(action) 190 | path_length += 1 191 | path_return += reward 192 | 193 | 194 | if not terminal and path_length >= self.max_path_length: 195 | terminal = True 196 | # only include the terminal transition in this case if the flag was set 197 | if self.include_horizon_terminal_transitions: 198 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 199 | else: 200 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 201 | 202 | observation = next_observation 203 | 204 | if pool.size >= self.min_pool_size: 205 | for update_itr in range(self.n_updates_per_sample): 206 | # Train policy 207 | batch = pool.random_batch(self.batch_size) 208 | itrs = self.do_training(itr, epoch, batch) 209 | train_qf_itr += itrs[0] 210 | train_policy_itr += itrs[1] 211 | sample_policy.set_param_values(self.policy.get_param_values()) 212 | 213 | itr += 1 214 | 215 | logger.log("Training finished") 216 | logger.log("Trained qf %d steps, policy %d steps"%(train_qf_itr, train_policy_itr)) 217 | if pool.size >= self.min_pool_size: 218 | self.evaluate(epoch, pool) 219 | params = self.get_epoch_snapshot(epoch) 220 | logger.save_itr_params(epoch, params) 221 | logger.dump_tabular(with_prefix=False) 222 | logger.pop_prefix() 223 | if self.plot: 224 | self.update_plot() 225 | if self.pause_for_plot: 226 | input("Plotting evaluation run: Press Enter to " 227 | "continue...") 228 | self.env.terminate() 229 | self.policy.terminate() 230 | 231 | def init_opt(self): 232 | 233 | # First, create "target" policy and Q functions 234 | with tf.variable_scope("target_policy"): 235 | target_policy = Serializable.clone(self.policy) 236 | with tf.variable_scope("target_qf"): 237 | target_qf = Serializable.clone(self.qf) 238 | 239 | # y need to be computed first 240 | obs = self.env.observation_space.new_tensor_variable( 241 | 'obs', 242 | extra_dims=1, 243 | ) 244 | 245 | # The yi values are computed separately as above and then passed to 246 | # the training functions below 247 | action = self.env.action_space.new_tensor_variable( 248 | 'action', 249 | extra_dims=1, 250 | ) 251 | 252 | yvar = tensor_utils.new_tensor( 253 | 'ys', 254 | ndim=1, 255 | dtype=tf.float32, 256 | ) 257 | 258 | qf_weight_decay_term = 0.5 * self.qf_weight_decay * \ 259 | sum([tf.reduce_sum(tf.square(param)) for param in 260 | self.qf.get_params(regularizable=True)]) 261 | 262 | qval = self.qf.get_qval_sym(obs, action) 263 | 264 | qf_loss = tf.reduce_mean(tf.square(yvar - qval)) 265 | qf_reg_loss = qf_loss + qf_weight_decay_term 266 | 267 | policy_weight_decay_term = 0.5 * self.policy_weight_decay * \ 268 | sum([tf.reduce_sum(tf.square(param)) 269 | for param in self.policy.get_params(regularizable=True)]) 270 | 271 | 272 | policy_qval = self.qf.get_qval_sym( 273 | obs, self.policy.get_action_sym(obs), 274 | 
deterministic=True 275 | ) 276 | 277 | 278 | policy_surr = -tf.reduce_mean(policy_qval) 279 | 280 | policy_reg_surr = policy_surr + policy_weight_decay_term 281 | 282 | qf_input_list = [yvar, obs, action] 283 | policy_input_list = [obs] 284 | 285 | self.qf_update_method.update_opt( 286 | loss=qf_reg_loss, target=self.qf, inputs=qf_input_list) 287 | 288 | 289 | self.policy_update_method.update_opt( 290 | loss=policy_reg_surr, target=self.policy, inputs=policy_input_list) 291 | 292 | f_train_qf = tensor_utils.compile_function( 293 | inputs=qf_input_list, 294 | outputs=[qf_loss, qval, self.qf_update_method._train_op], 295 | ) 296 | 297 | f_train_policy = tensor_utils.compile_function( 298 | inputs=policy_input_list, 299 | outputs=[policy_surr, self.policy_update_method._train_op], 300 | ) 301 | 302 | self.opt_info = dict( 303 | f_train_qf=f_train_qf, 304 | f_train_policy=f_train_policy, 305 | target_qf=target_qf, 306 | target_policy=target_policy, 307 | ) 308 | 309 | def do_training(self, itr, epoch, batch): 310 | 311 | obs, actions, rewards, next_obs, terminals = ext.extract( 312 | batch, 313 | "observations", "actions", "rewards", "next_observations", 314 | "terminals" 315 | ) 316 | 317 | # compute the on-policy y values 318 | target_qf = self.opt_info["target_qf"] 319 | target_policy = self.opt_info["target_policy"] 320 | 321 | next_actions, _ = target_policy.get_actions(next_obs) 322 | next_qvals = target_qf.get_qval(next_obs, next_actions) 323 | 324 | 325 | """ 326 | Uncertainty in Critic Networks for exploration 327 | - Thompson Sampling with the critic target networks 328 | """ 329 | 330 | """ 331 | Possible way (b) : for targets, use max(Q) 332 | - take the max (Q_0, Q_1, Q_2, ... Q_k) from the MC Dropout Q networks 333 | """ 334 | 335 | mc_dropout = 50 336 | all_posterior_qvals = np.zeros(shape=(next_obs.shape[0], mc_dropout)) 337 | for d in range(mc_dropout): 338 | posterior_qvals = target_qf.get_qval_dropout(next_obs, next_actions) 339 | 340 | all_posterior_qvals[:, d] = posterior_qvals[:, 0] 341 | 342 | 343 | sum_all_posterior_qvals = np.sum(all_posterior_qvals, axis=0) 344 | max_Q_ind = np.argmax(sum_all_posterior_qvals) 345 | 346 | max_Q = all_posterior_qvals[:, max_Q_ind] 347 | variance_next_qvals = np.std(all_posterior_qvals, axis=1) 348 | 349 | lambda_expl = 10 / epoch 350 | qval_bayesian = max_Q + lambda_expl * variance_next_qvals 351 | 352 | 353 | ys = rewards + (1. 
- terminals) * self.discount * qval_bayesian.reshape(-1) 354 | 355 | f_train_qf = self.opt_info["f_train_qf"] 356 | 357 | qf_loss, qval, _ = f_train_qf(ys, obs, actions) 358 | 359 | target_qf.set_param_values( 360 | target_qf.get_param_values() * (1.0 - self.soft_target_tau) + 361 | self.qf.get_param_values() * self.soft_target_tau) 362 | self.qf_loss_averages.append(qf_loss) 363 | self.q_averages.append(qval) 364 | self.y_averages.append(ys) 365 | 366 | self.train_policy_itr += self.policy_updates_ratio 367 | train_policy_itr = 0 368 | 369 | 370 | while self.train_policy_itr > 0: 371 | 372 | f_train_policy = self.opt_info["f_train_policy"] 373 | policy_surr, _ = f_train_policy(obs) 374 | 375 | 376 | target_policy.set_param_values( 377 | target_policy.get_param_values() * (1.0 - self.soft_target_tau) + 378 | self.policy.get_param_values() * self.soft_target_tau) 379 | self.policy_surr_averages.append(policy_surr) 380 | self.train_policy_itr -= 1 381 | train_policy_itr += 1 382 | return 1, train_policy_itr # number of itrs qf, policy are trained 383 | 384 | 385 | 386 | 387 | 388 | 389 | def evaluate(self, epoch, pool): 390 | logger.log("Collecting samples for evaluation") 391 | paths = parallel_sampler.sample_paths( 392 | policy_params=self.policy.get_param_values(), 393 | max_samples=self.eval_samples, 394 | max_path_length=self.max_path_length, 395 | ) 396 | 397 | average_discounted_return = np.mean( 398 | [special.discount_return(path["rewards"], self.discount) for path in paths] 399 | ) 400 | 401 | returns = [sum(path["rewards"]) for path in paths] 402 | 403 | all_qs = np.concatenate(self.q_averages) 404 | all_ys = np.concatenate(self.y_averages) 405 | 406 | average_q_loss = np.mean(self.qf_loss_averages) 407 | average_policy_surr = np.mean(self.policy_surr_averages) 408 | average_action = np.mean(np.square(np.concatenate( 409 | [path["actions"] for path in paths] 410 | ))) 411 | 412 | policy_reg_param_norm = np.linalg.norm( 413 | self.policy.get_param_values(regularizable=True) 414 | ) 415 | qfun_reg_param_norm = np.linalg.norm( 416 | self.qf.get_param_values(regularizable=True) 417 | ) 418 | 419 | logger.record_tabular('Epoch', epoch) 420 | logger.record_tabular('Iteration', epoch) 421 | logger.record_tabular('AverageReturn', np.mean(returns)) 422 | logger.record_tabular('StdReturn', 423 | np.std(returns)) 424 | logger.record_tabular('MaxReturn', 425 | np.max(returns)) 426 | logger.record_tabular('MinReturn', 427 | np.min(returns)) 428 | if len(self.es_path_returns) > 0: 429 | logger.record_tabular('AverageEsReturn', 430 | np.mean(self.es_path_returns)) 431 | logger.record_tabular('StdEsReturn', 432 | np.std(self.es_path_returns)) 433 | logger.record_tabular('MaxEsReturn', 434 | np.max(self.es_path_returns)) 435 | logger.record_tabular('MinEsReturn', 436 | np.min(self.es_path_returns)) 437 | logger.record_tabular('AverageDiscountedReturn', 438 | average_discounted_return) 439 | logger.record_tabular('AverageQLoss', average_q_loss) 440 | logger.record_tabular('AveragePolicySurr', average_policy_surr) 441 | logger.record_tabular('AverageQ', np.mean(all_qs)) 442 | logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) 443 | logger.record_tabular('AverageY', np.mean(all_ys)) 444 | logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) 445 | logger.record_tabular('AverageAbsQYDiff', 446 | np.mean(np.abs(all_qs - all_ys))) 447 | logger.record_tabular('AverageAction', average_action) 448 | 449 | logger.record_tabular('PolicyRegParamNorm', 450 | policy_reg_param_norm) 451 | 
logger.record_tabular('QFunRegParamNorm', 452 | qfun_reg_param_norm) 453 | 454 | self.env.log_diagnostics(paths) 455 | self.policy.log_diagnostics(paths) 456 | 457 | self.qf_loss_averages = [] 458 | self.policy_surr_averages = [] 459 | 460 | self.q_averages = [] 461 | self.y_averages = [] 462 | self.es_path_returns = [] 463 | 464 | def update_plot(self): 465 | if self.plot: 466 | plotter.update_plot(self.policy, self.max_path_length) 467 | 468 | def get_epoch_snapshot(self, epoch): 469 | return dict( 470 | env=self.env, 471 | epoch=epoch, 472 | qf=self.qf, 473 | policy=self.policy, 474 | target_qf=self.opt_info["target_qf"], 475 | target_policy=self.opt_info["target_policy"], 476 | es=self.es, 477 | ) 478 | -------------------------------------------------------------------------------- /ddpg_bayesian_mean.py: -------------------------------------------------------------------------------- 1 | # FROM: https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/sandbox/rocky/tf/algos/ddpg.py 2 | from rllab.algos.base import RLAlgorithm 3 | from rllab.misc.overrides import overrides 4 | from rllab.misc import special 5 | from sandbox.rocky.tf.misc import tensor_utils 6 | from rllab.sampler import parallel_sampler 7 | from rllab.plotter import plotter 8 | from rllab.misc import ext 9 | import rllab.misc.logger as logger 10 | #import pickle as pickle 11 | import numpy as np 12 | import pyprind 13 | import tensorflow as tf 14 | from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer 15 | #from sandbox.rocky.tf.core.parameterized import suppress_params_loading 16 | from rllab.core.serializable import Serializable 17 | from sampling_utils import SimpleReplayPool 18 | 19 | class DDPG(RLAlgorithm): 20 | """ 21 | Deep Deterministic Policy Gradient. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | env, 27 | policy, 28 | qf, 29 | es, 30 | batch_size=32, 31 | n_epochs=200, 32 | epoch_length=1000, 33 | min_pool_size=10000, 34 | replay_pool_size=1000000, 35 | replacement_prob=1.0, 36 | discount=0.99, 37 | max_path_length=250, 38 | qf_weight_decay=0., 39 | qf_update_method='adam', 40 | qf_learning_rate=1e-3, 41 | policy_weight_decay=0, 42 | policy_update_method='adam', 43 | policy_learning_rate=1e-3, 44 | policy_updates_ratio=1.0, 45 | eval_samples=10000, 46 | soft_target=True, 47 | soft_target_tau=0.001, 48 | n_updates_per_sample=1, 49 | scale_reward=1.0, 50 | include_horizon_terminal_transitions=False, 51 | plot=False, 52 | pause_for_plot=False): 53 | """ 54 | :param env: Environment 55 | :param policy: Policy 56 | :param qf: Q function 57 | :param es: Exploration strategy 58 | :param batch_size: Number of samples for each minibatch. 59 | :param n_epochs: Number of epochs. Policy will be evaluated after each epoch. 60 | :param epoch_length: How many timesteps for each epoch. 61 | :param min_pool_size: Minimum size of the pool to start training. 62 | :param replay_pool_size: Size of the experience replay pool. 63 | :param discount: Discount factor for the cumulative return. 64 | :param max_path_length: Discount factor for the cumulative return. 65 | :param qf_weight_decay: Weight decay factor for parameters of the Q function. 66 | :param qf_update_method: Online optimization method for training Q function. 67 | :param qf_learning_rate: Learning rate for training Q function. 68 | :param policy_weight_decay: Weight decay factor for parameters of the policy. 69 | :param policy_update_method: Online optimization method for training the policy. 
70 | :param policy_learning_rate: Learning rate for training the policy. 71 | :param eval_samples: Number of samples (timesteps) for evaluating the policy. 72 | :param soft_target_tau: Interpolation parameter for doing the soft target update. 73 | :param n_updates_per_sample: Number of Q function and policy updates per new sample obtained 74 | :param scale_reward: The scaling factor applied to the rewards when training 75 | :param include_horizon_terminal_transitions: whether to include transitions with terminal=True because the 76 | horizon was reached. This might make the Q value back up less stable for certain tasks. 77 | :param plot: Whether to visualize the policy performance after each eval_interval. 78 | :param pause_for_plot: Whether to pause before continuing when plotting. 79 | :return: 80 | """ 81 | self.env = env 82 | self.policy = policy 83 | self.qf = qf 84 | self.es = es 85 | self.batch_size = batch_size 86 | self.n_epochs = n_epochs 87 | self.epoch_length = epoch_length 88 | self.min_pool_size = min_pool_size 89 | self.replay_pool_size = replay_pool_size 90 | self.replacement_prob = replacement_prob 91 | self.discount = discount 92 | self.max_path_length = max_path_length 93 | self.qf_weight_decay = qf_weight_decay 94 | self.qf_update_method = \ 95 | FirstOrderOptimizer( 96 | update_method=qf_update_method, 97 | learning_rate=qf_learning_rate, 98 | ) 99 | self.qf_learning_rate = qf_learning_rate 100 | self.policy_weight_decay = policy_weight_decay 101 | 102 | 103 | self.policy_update_method = \ 104 | FirstOrderOptimizer( 105 | update_method=policy_update_method, 106 | learning_rate=policy_learning_rate, 107 | ) 108 | self.policy_learning_rate = policy_learning_rate 109 | self.policy_updates_ratio = policy_updates_ratio 110 | self.eval_samples = eval_samples 111 | self.soft_target_tau = soft_target_tau 112 | self.n_updates_per_sample = n_updates_per_sample 113 | self.include_horizon_terminal_transitions = include_horizon_terminal_transitions 114 | self.plot = plot 115 | self.pause_for_plot = pause_for_plot 116 | 117 | self.qf_loss_averages = [] 118 | self.policy_surr_averages = [] 119 | self.q_averages = [] 120 | self.y_averages = [] 121 | self.paths = [] 122 | self.es_path_returns = [] 123 | self.paths_samples_cnt = 0 124 | 125 | self.scale_reward = scale_reward 126 | 127 | self.train_policy_itr = 0 128 | 129 | self.opt_info = None 130 | 131 | def start_worker(self): 132 | parallel_sampler.populate_task(self.env, self.policy) 133 | if self.plot: 134 | plotter.init_plot(self.env, self.policy) 135 | 136 | @overrides 137 | def train(self): 138 | with tf.Session() as sess: 139 | sess.run(tf.global_variables_initializer()) 140 | # This seems like a rather sequential method 141 | pool = SimpleReplayPool( 142 | max_pool_size=self.replay_pool_size, 143 | observation_dim=self.env.observation_space.flat_dim, 144 | action_dim=self.env.action_space.flat_dim, 145 | replacement_prob=self.replacement_prob, 146 | ) 147 | self.start_worker() 148 | 149 | self.init_opt() 150 | # This initializes the optimizer parameters 151 | sess.run(tf.global_variables_initializer()) 152 | itr = 0 153 | path_length = 0 154 | path_return = 0 155 | terminal = False 156 | initial = False 157 | observation = self.env.reset() 158 | 159 | with tf.variable_scope("sample_policy"): 160 | sample_policy = Serializable.clone(self.policy) 161 | 162 | for epoch in range(self.n_epochs): 163 | logger.push_prefix('epoch #%d | ' % epoch) 164 | logger.log("Training started") 165 | train_qf_itr, train_policy_itr = 0, 0 166 | 
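# Variant note: this file mirrors ddpg_bayesian_thompson.py; the substantive difference is in
# do_training() below, where the bootstrap target is mean(Q) + lambda * std(Q) over the
# MC-dropout passes of the target critic ("possible way (a)"), rather than the argmax-selected
# dropout sample used in the Thompson-style variant.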
167 | #sample a policy function from the posterior at every episode 168 | #move in the entire episode with the sampled policy function? 169 | 170 | for epoch_itr in pyprind.prog_bar(range(self.epoch_length)): 171 | # Execute policy 172 | if terminal: # or path_length > self.max_path_length: 173 | # Note that if the last time step ends an episode, the very 174 | # last state and observation will be ignored and not added 175 | # to the replay pool 176 | observation = self.env.reset() 177 | self.es.reset() 178 | sample_policy.reset() 179 | self.es_path_returns.append(path_return) 180 | path_length = 0 181 | path_return = 0 182 | initial = True 183 | else: 184 | initial = False 185 | 186 | action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf) 187 | 188 | 189 | next_observation, reward, terminal, _ = self.env.step(action) 190 | path_length += 1 191 | path_return += reward 192 | 193 | 194 | if not terminal and path_length >= self.max_path_length: 195 | terminal = True 196 | # only include the terminal transition in this case if the flag was set 197 | if self.include_horizon_terminal_transitions: 198 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 199 | else: 200 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 201 | 202 | observation = next_observation 203 | 204 | if pool.size >= self.min_pool_size: 205 | for update_itr in range(self.n_updates_per_sample): 206 | # Train policy 207 | batch = pool.random_batch(self.batch_size) 208 | itrs = self.do_training(itr, epoch, batch) 209 | train_qf_itr += itrs[0] 210 | train_policy_itr += itrs[1] 211 | sample_policy.set_param_values(self.policy.get_param_values()) 212 | 213 | itr += 1 214 | 215 | logger.log("Training finished") 216 | logger.log("Trained qf %d steps, policy %d steps"%(train_qf_itr, train_policy_itr)) 217 | if pool.size >= self.min_pool_size: 218 | self.evaluate(epoch, pool) 219 | params = self.get_epoch_snapshot(epoch) 220 | logger.save_itr_params(epoch, params) 221 | logger.dump_tabular(with_prefix=False) 222 | logger.pop_prefix() 223 | if self.plot: 224 | self.update_plot() 225 | if self.pause_for_plot: 226 | input("Plotting evaluation run: Press Enter to " 227 | "continue...") 228 | self.env.terminate() 229 | self.policy.terminate() 230 | 231 | def init_opt(self): 232 | 233 | # First, create "target" policy and Q functions 234 | with tf.variable_scope("target_policy"): 235 | target_policy = Serializable.clone(self.policy) 236 | with tf.variable_scope("target_qf"): 237 | target_qf = Serializable.clone(self.qf) 238 | 239 | # y need to be computed first 240 | obs = self.env.observation_space.new_tensor_variable( 241 | 'obs', 242 | extra_dims=1, 243 | ) 244 | 245 | # The yi values are computed separately as above and then passed to 246 | # the training functions below 247 | action = self.env.action_space.new_tensor_variable( 248 | 'action', 249 | extra_dims=1, 250 | ) 251 | 252 | yvar = tensor_utils.new_tensor( 253 | 'ys', 254 | ndim=1, 255 | dtype=tf.float32, 256 | ) 257 | 258 | qf_weight_decay_term = 0.5 * self.qf_weight_decay * \ 259 | sum([tf.reduce_sum(tf.square(param)) for param in 260 | self.qf.get_params(regularizable=True)]) 261 | 262 | qval = self.qf.get_qval_sym(obs, action) 263 | 264 | qf_loss = tf.reduce_mean(tf.square(yvar - qval)) 265 | qf_reg_loss = qf_loss + qf_weight_decay_term 266 | 267 | policy_weight_decay_term = 0.5 * self.policy_weight_decay * \ 268 | sum([tf.reduce_sum(tf.square(param)) 269 | for param in 
self.policy.get_params(regularizable=True)]) 270 | 271 | 272 | policy_qval = self.qf.get_qval_sym( 273 | obs, self.policy.get_action_sym(obs), 274 | deterministic=True 275 | ) 276 | 277 | 278 | policy_surr = -tf.reduce_mean(policy_qval) 279 | 280 | policy_reg_surr = policy_surr + policy_weight_decay_term 281 | 282 | qf_input_list = [yvar, obs, action] 283 | policy_input_list = [obs] 284 | 285 | self.qf_update_method.update_opt( 286 | loss=qf_reg_loss, target=self.qf, inputs=qf_input_list) 287 | 288 | 289 | self.policy_update_method.update_opt( 290 | loss=policy_reg_surr, target=self.policy, inputs=policy_input_list) 291 | 292 | f_train_qf = tensor_utils.compile_function( 293 | inputs=qf_input_list, 294 | outputs=[qf_loss, qval, self.qf_update_method._train_op], 295 | ) 296 | 297 | f_train_policy = tensor_utils.compile_function( 298 | inputs=policy_input_list, 299 | outputs=[policy_surr, self.policy_update_method._train_op], 300 | ) 301 | 302 | self.opt_info = dict( 303 | f_train_qf=f_train_qf, 304 | f_train_policy=f_train_policy, 305 | target_qf=target_qf, 306 | target_policy=target_policy, 307 | ) 308 | 309 | def do_training(self, itr, epoch, batch): 310 | 311 | obs, actions, rewards, next_obs, terminals = ext.extract( 312 | batch, 313 | "observations", "actions", "rewards", "next_observations", 314 | "terminals" 315 | ) 316 | 317 | # compute the on-policy y values 318 | target_qf = self.opt_info["target_qf"] 319 | target_policy = self.opt_info["target_policy"] 320 | 321 | next_actions, _ = target_policy.get_actions(next_obs) 322 | next_qvals = target_qf.get_qval(next_obs, next_actions) 323 | 324 | 325 | """ 326 | Uncertainty in Critic Networks for exploration 327 | - Thompson Sampling with the critic target networks 328 | """ 329 | 330 | """ 331 | Possible way (a) : for targets, take mean(Q) + lambda * variance(Q) over all Q evaluations 332 | """ 333 | 334 | """ 335 | Apply MCDropout here - get the mean of Q and the variance over Q 336 | """ 337 | mc_dropout = 50 338 | all_posterior_qvals = np.zeros(shape=(next_obs.shape[0], mc_dropout)) 339 | for d in range(mc_dropout): 340 | posterior_qvals = target_qf.get_qval_dropout(next_obs, next_actions) 341 | all_posterior_qvals[:, d] = posterior_qvals[:, 0] 342 | 343 | ## mean of the Q function posterior 344 | # mean_next_qvals = np.array([np.mean(all_posterior_qvals, axis=1)]).T 345 | mean_next_qvals = np.mean(all_posterior_qvals, axis=1) 346 | variance_next_qvals = np.std(all_posterior_qvals, axis=1) 347 | 348 | 349 | #### lambda parameter to tune between optimistic/pessimistic exploration 350 | lambda_expl = 10 / epoch 351 | qval_bayesian = mean_next_qvals + lambda_expl * variance_next_qvals 352 | 353 | ys = rewards + (1. 
- terminals) * self.discount * qval_bayesian.reshape(-1) 354 | 355 | 356 | f_train_qf = self.opt_info["f_train_qf"] 357 | 358 | qf_loss, qval, _ = f_train_qf(ys, obs, actions) 359 | 360 | target_qf.set_param_values( 361 | target_qf.get_param_values() * (1.0 - self.soft_target_tau) + 362 | self.qf.get_param_values() * self.soft_target_tau) 363 | self.qf_loss_averages.append(qf_loss) 364 | self.q_averages.append(qval) 365 | self.y_averages.append(ys) 366 | 367 | self.train_policy_itr += self.policy_updates_ratio 368 | train_policy_itr = 0 369 | 370 | 371 | while self.train_policy_itr > 0: 372 | 373 | f_train_policy = self.opt_info["f_train_policy"] 374 | policy_surr, _ = f_train_policy(obs) 375 | 376 | 377 | target_policy.set_param_values( 378 | target_policy.get_param_values() * (1.0 - self.soft_target_tau) + 379 | self.policy.get_param_values() * self.soft_target_tau) 380 | self.policy_surr_averages.append(policy_surr) 381 | self.train_policy_itr -= 1 382 | train_policy_itr += 1 383 | return 1, train_policy_itr # number of itrs qf, policy are trained 384 | 385 | 386 | 387 | 388 | 389 | 390 | def evaluate(self, epoch, pool): 391 | logger.log("Collecting samples for evaluation") 392 | paths = parallel_sampler.sample_paths( 393 | policy_params=self.policy.get_param_values(), 394 | max_samples=self.eval_samples, 395 | max_path_length=self.max_path_length, 396 | ) 397 | 398 | average_discounted_return = np.mean( 399 | [special.discount_return(path["rewards"], self.discount) for path in paths] 400 | ) 401 | 402 | returns = [sum(path["rewards"]) for path in paths] 403 | 404 | all_qs = np.concatenate(self.q_averages) 405 | all_ys = np.concatenate(self.y_averages) 406 | 407 | average_q_loss = np.mean(self.qf_loss_averages) 408 | average_policy_surr = np.mean(self.policy_surr_averages) 409 | average_action = np.mean(np.square(np.concatenate( 410 | [path["actions"] for path in paths] 411 | ))) 412 | 413 | policy_reg_param_norm = np.linalg.norm( 414 | self.policy.get_param_values(regularizable=True) 415 | ) 416 | qfun_reg_param_norm = np.linalg.norm( 417 | self.qf.get_param_values(regularizable=True) 418 | ) 419 | 420 | logger.record_tabular('Epoch', epoch) 421 | logger.record_tabular('Iteration', epoch) 422 | logger.record_tabular('AverageReturn', np.mean(returns)) 423 | logger.record_tabular('StdReturn', 424 | np.std(returns)) 425 | logger.record_tabular('MaxReturn', 426 | np.max(returns)) 427 | logger.record_tabular('MinReturn', 428 | np.min(returns)) 429 | if len(self.es_path_returns) > 0: 430 | logger.record_tabular('AverageEsReturn', 431 | np.mean(self.es_path_returns)) 432 | logger.record_tabular('StdEsReturn', 433 | np.std(self.es_path_returns)) 434 | logger.record_tabular('MaxEsReturn', 435 | np.max(self.es_path_returns)) 436 | logger.record_tabular('MinEsReturn', 437 | np.min(self.es_path_returns)) 438 | logger.record_tabular('AverageDiscountedReturn', 439 | average_discounted_return) 440 | logger.record_tabular('AverageQLoss', average_q_loss) 441 | logger.record_tabular('AveragePolicySurr', average_policy_surr) 442 | logger.record_tabular('AverageQ', np.mean(all_qs)) 443 | logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) 444 | logger.record_tabular('AverageY', np.mean(all_ys)) 445 | logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) 446 | logger.record_tabular('AverageAbsQYDiff', 447 | np.mean(np.abs(all_qs - all_ys))) 448 | logger.record_tabular('AverageAction', average_action) 449 | 450 | logger.record_tabular('PolicyRegParamNorm', 451 | policy_reg_param_norm) 452 
| logger.record_tabular('QFunRegParamNorm', 453 | qfun_reg_param_norm) 454 | 455 | self.env.log_diagnostics(paths) 456 | self.policy.log_diagnostics(paths) 457 | 458 | self.qf_loss_averages = [] 459 | self.policy_surr_averages = [] 460 | 461 | self.q_averages = [] 462 | self.y_averages = [] 463 | self.es_path_returns = [] 464 | 465 | def update_plot(self): 466 | if self.plot: 467 | plotter.update_plot(self.policy, self.max_path_length) 468 | 469 | def get_epoch_snapshot(self, epoch): 470 | return dict( 471 | env=self.env, 472 | epoch=epoch, 473 | qf=self.qf, 474 | policy=self.policy, 475 | target_qf=self.opt_info["target_qf"], 476 | target_policy=self.opt_info["target_policy"], 477 | es=self.es, 478 | ) 479 | -------------------------------------------------------------------------------- /ddpg_bayesian.py: -------------------------------------------------------------------------------- 1 | # FROM: https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/sandbox/rocky/tf/algos/ddpg.py 2 | from rllab.algos.base import RLAlgorithm 3 | from rllab.misc.overrides import overrides 4 | from rllab.misc import special 5 | from sandbox.rocky.tf.misc import tensor_utils 6 | from rllab.sampler import parallel_sampler 7 | from rllab.plotter import plotter 8 | from rllab.misc import ext 9 | import rllab.misc.logger as logger 10 | #import pickle as pickle 11 | import numpy as np 12 | import pyprind 13 | import tensorflow as tf 14 | from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer 15 | #from sandbox.rocky.tf.core.parameterized import suppress_params_loading 16 | from rllab.core.serializable import Serializable 17 | from sampling_utils import SimpleReplayPool 18 | 19 | class DDPG(RLAlgorithm): 20 | """ 21 | Deep Deterministic Policy Gradient. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | env, 27 | policy, 28 | qf, 29 | es, 30 | batch_size=32, 31 | n_epochs=200, 32 | epoch_length=1000, 33 | min_pool_size=10000, 34 | replay_pool_size=1000000, 35 | replacement_prob=1.0, 36 | discount=0.99, 37 | max_path_length=250, 38 | qf_weight_decay=0., 39 | qf_update_method='adam', 40 | qf_learning_rate=1e-3, 41 | policy_weight_decay=0, 42 | policy_update_method='adam', 43 | policy_learning_rate=1e-3, 44 | policy_updates_ratio=1.0, 45 | eval_samples=10000, 46 | soft_target=True, 47 | soft_target_tau=0.001, 48 | n_updates_per_sample=1, 49 | scale_reward=1.0, 50 | include_horizon_terminal_transitions=False, 51 | plot=False, 52 | pause_for_plot=False): 53 | """ 54 | :param env: Environment 55 | :param policy: Policy 56 | :param qf: Q function 57 | :param es: Exploration strategy 58 | :param batch_size: Number of samples for each minibatch. 59 | :param n_epochs: Number of epochs. Policy will be evaluated after each epoch. 60 | :param epoch_length: How many timesteps for each epoch. 61 | :param min_pool_size: Minimum size of the pool to start training. 62 | :param replay_pool_size: Size of the experience replay pool. 63 | :param discount: Discount factor for the cumulative return. 64 | :param max_path_length: Discount factor for the cumulative return. 65 | :param qf_weight_decay: Weight decay factor for parameters of the Q function. 66 | :param qf_update_method: Online optimization method for training Q function. 67 | :param qf_learning_rate: Learning rate for training Q function. 68 | :param policy_weight_decay: Weight decay factor for parameters of the policy. 69 | :param policy_update_method: Online optimization method for training the policy. 
70 | :param policy_learning_rate: Learning rate for training the policy. 71 | :param eval_samples: Number of samples (timesteps) for evaluating the policy. 72 | :param soft_target_tau: Interpolation parameter for doing the soft target update. 73 | :param n_updates_per_sample: Number of Q function and policy updates per new sample obtained 74 | :param scale_reward: The scaling factor applied to the rewards when training 75 | :param include_horizon_terminal_transitions: whether to include transitions with terminal=True because the 76 | horizon was reached. This might make the Q value back up less stable for certain tasks. 77 | :param plot: Whether to visualize the policy performance after each eval_interval. 78 | :param pause_for_plot: Whether to pause before continuing when plotting. 79 | :return: 80 | """ 81 | self.env = env 82 | self.policy = policy 83 | self.qf = qf 84 | self.es = es 85 | self.batch_size = batch_size 86 | self.n_epochs = n_epochs 87 | self.epoch_length = epoch_length 88 | self.min_pool_size = min_pool_size 89 | self.replay_pool_size = replay_pool_size 90 | self.replacement_prob = replacement_prob 91 | self.discount = discount 92 | self.max_path_length = max_path_length 93 | self.qf_weight_decay = qf_weight_decay 94 | self.qf_update_method = \ 95 | FirstOrderOptimizer( 96 | update_method=qf_update_method, 97 | learning_rate=qf_learning_rate, 98 | ) 99 | self.qf_learning_rate = qf_learning_rate 100 | self.policy_weight_decay = policy_weight_decay 101 | 102 | 103 | self.policy_update_method = \ 104 | FirstOrderOptimizer( 105 | update_method=policy_update_method, 106 | learning_rate=policy_learning_rate, 107 | ) 108 | self.policy_learning_rate = policy_learning_rate 109 | self.policy_updates_ratio = policy_updates_ratio 110 | self.eval_samples = eval_samples 111 | self.soft_target_tau = soft_target_tau 112 | self.n_updates_per_sample = n_updates_per_sample 113 | self.include_horizon_terminal_transitions = include_horizon_terminal_transitions 114 | self.plot = plot 115 | self.pause_for_plot = pause_for_plot 116 | 117 | self.qf_loss_averages = [] 118 | self.policy_surr_averages = [] 119 | self.q_averages = [] 120 | self.y_averages = [] 121 | self.paths = [] 122 | self.es_path_returns = [] 123 | self.paths_samples_cnt = 0 124 | 125 | self.scale_reward = scale_reward 126 | 127 | self.train_policy_itr = 0 128 | 129 | self.opt_info = None 130 | 131 | def start_worker(self): 132 | parallel_sampler.populate_task(self.env, self.policy) 133 | if self.plot: 134 | plotter.init_plot(self.env, self.policy) 135 | 136 | @overrides 137 | def train(self): 138 | with tf.Session() as sess: 139 | sess.run(tf.global_variables_initializer()) 140 | # This seems like a rather sequential method 141 | pool = SimpleReplayPool( 142 | max_pool_size=self.replay_pool_size, 143 | observation_dim=self.env.observation_space.flat_dim, 144 | action_dim=self.env.action_space.flat_dim, 145 | replacement_prob=self.replacement_prob, 146 | ) 147 | self.start_worker() 148 | 149 | self.init_opt() 150 | # This initializes the optimizer parameters 151 | sess.run(tf.global_variables_initializer()) 152 | itr = 0 153 | path_length = 0 154 | path_return = 0 155 | terminal = False 156 | initial = False 157 | observation = self.env.reset() 158 | 159 | with tf.variable_scope("sample_policy"): 160 | sample_policy = Serializable.clone(self.policy) 161 | 162 | for epoch in range(self.n_epochs): 163 | logger.push_prefix('epoch #%d | ' % epoch) 164 | logger.log("Training started") 165 | train_qf_itr, train_policy_itr = 0, 0 166 | 
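# Variant note: unlike the other two files, this variant keeps the standard critic target
# (next_qvals from the target networks) and moves the uncertainty into the actor update:
# do_training() runs MC_SAMPLES extra passes of f_train_qf to form mean(Q) + var(Q) and feeds
# that into f_train_policy. Two caveats visible in the code below: each of those passes also
# executes the critic train op, and the compiled policy loss still depends only on obs, so the
# fed Q values do not yet enter the surrogate (see the TO DO / CHANGES comments in init_opt()).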
167 | #sample a policy function from the posterior at every episode 168 | #move in the entire episode with the sampled policy function? 169 | 170 | for epoch_itr in pyprind.prog_bar(range(self.epoch_length)): 171 | # Execute policy 172 | if terminal: # or path_length > self.max_path_length: 173 | # Note that if the last time step ends an episode, the very 174 | # last state and observation will be ignored and not added 175 | # to the replay pool 176 | observation = self.env.reset() 177 | self.es.reset() 178 | sample_policy.reset() 179 | self.es_path_returns.append(path_return) 180 | path_length = 0 181 | path_return = 0 182 | initial = True 183 | else: 184 | initial = False 185 | 186 | action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf) 187 | 188 | 189 | next_observation, reward, terminal, _ = self.env.step(action) 190 | path_length += 1 191 | path_return += reward 192 | 193 | 194 | if not terminal and path_length >= self.max_path_length: 195 | terminal = True 196 | # only include the terminal transition in this case if the flag was set 197 | if self.include_horizon_terminal_transitions: 198 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 199 | else: 200 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 201 | 202 | observation = next_observation 203 | 204 | if pool.size >= self.min_pool_size: 205 | for update_itr in range(self.n_updates_per_sample): 206 | # Train policy 207 | batch = pool.random_batch(self.batch_size) 208 | itrs = self.do_training(itr, batch) 209 | train_qf_itr += itrs[0] 210 | train_policy_itr += itrs[1] 211 | sample_policy.set_param_values(self.policy.get_param_values()) 212 | 213 | itr += 1 214 | 215 | logger.log("Training finished") 216 | logger.log("Trained qf %d steps, policy %d steps"%(train_qf_itr, train_policy_itr)) 217 | if pool.size >= self.min_pool_size: 218 | self.evaluate(epoch, pool) 219 | params = self.get_epoch_snapshot(epoch) 220 | logger.save_itr_params(epoch, params) 221 | logger.dump_tabular(with_prefix=False) 222 | logger.pop_prefix() 223 | if self.plot: 224 | self.update_plot() 225 | if self.pause_for_plot: 226 | input("Plotting evaluation run: Press Enter to " 227 | "continue...") 228 | self.env.terminate() 229 | self.policy.terminate() 230 | 231 | 232 | 233 | def init_opt(self): 234 | 235 | # First, create "target" policy and Q functions 236 | with tf.variable_scope("target_policy"): 237 | target_policy = Serializable.clone(self.policy) 238 | with tf.variable_scope("target_qf"): 239 | target_qf = Serializable.clone(self.qf) 240 | 241 | # y need to be computed first 242 | obs = self.env.observation_space.new_tensor_variable( 243 | 'obs', 244 | extra_dims=1, 245 | ) 246 | 247 | # The yi values are computed separately as above and then passed to 248 | # the training functions below 249 | action = self.env.action_space.new_tensor_variable( 250 | 'action', 251 | extra_dims=1, 252 | ) 253 | 254 | yvar = tensor_utils.new_tensor( 255 | 'ys', 256 | ndim=1, 257 | dtype=tf.float32, 258 | ) 259 | 260 | qf_weight_decay_term = 0.5 * self.qf_weight_decay * \ 261 | sum([tf.reduce_sum(tf.square(param)) for param in 262 | self.qf.get_params(regularizable=True)]) 263 | 264 | qval = self.qf.get_qval_sym(obs, action) 265 | 266 | qf_loss = tf.reduce_mean(tf.square(yvar - qval)) 267 | qf_reg_loss = qf_loss + qf_weight_decay_term 268 | 269 | policy_weight_decay_term = 0.5 * self.policy_weight_decay * \ 270 | sum([tf.reduce_sum(tf.square(param)) 271 | for param in 
self.policy.get_params(regularizable=True)]) 272 | 273 | 274 | """ 275 | Commenting these out below 276 | """ 277 | # import pdb; pdb.set_trace() 278 | policy_qval = self.qf.get_qval_sym(obs, self.policy.get_action_sym(obs), deterministic=True) 279 | # policy_surr = -tf.reduce_mean(policy_qval) 280 | 281 | ############################# 282 | 283 | """ 284 | TO DO HERE 285 | """ 286 | # Yes implement something like get_qval_plus_var_sym(state, policy, lambda): where it returns mean + lambda*variance 287 | # as tf output. variance is a function of multiple forward passes of dropout Q function. 288 | # get_qval_plus_var_sym(state, policy, lambda) 289 | 290 | # policy_qval_plus_var = self.qf.get_qval_plus_var_sym(obs, self.policy.get_action_sym(obs), lambda, deterministic=True) 291 | # policy_qval = self.qf.get_qval_plus_var_sym(obs, self.policy.get_action_sym(obs), deterministic=False) 292 | 293 | ############################ 294 | 295 | 296 | ############################# 297 | 298 | """ 299 | CHANGES HERE 300 | """ 301 | 302 | policy_surr = -tf.reduce_mean(policy_qval) 303 | policy_input_list = [obs, qval] 304 | 305 | 306 | ############################ 307 | 308 | policy_reg_surr = policy_surr + policy_weight_decay_term 309 | 310 | qf_input_list = [yvar, obs, action] 311 | 312 | """ 313 | Commented out 314 | """ 315 | # policy_input_list = [obs] 316 | 317 | self.qf_update_method.update_opt( 318 | loss=qf_reg_loss, target=self.qf, inputs=qf_input_list) 319 | 320 | 321 | self.policy_update_method.update_opt( 322 | loss=policy_reg_surr, target=self.policy, inputs=policy_input_list) 323 | 324 | f_train_qf = tensor_utils.compile_function( 325 | inputs=qf_input_list, 326 | outputs=[qf_loss, qval, self.qf_update_method._train_op], 327 | ) 328 | 329 | f_train_policy = tensor_utils.compile_function( 330 | inputs=policy_input_list, 331 | outputs=[policy_surr, self.policy_update_method._train_op], 332 | ) 333 | 334 | self.opt_info = dict( 335 | f_train_qf=f_train_qf, 336 | f_train_policy=f_train_policy, 337 | target_qf=target_qf, 338 | target_policy=target_policy, 339 | ) 340 | 341 | 342 | def do_training(self, itr, batch): 343 | 344 | obs, actions, rewards, next_obs, terminals = ext.extract( 345 | batch, 346 | "observations", "actions", "rewards", "next_observations", 347 | "terminals" 348 | ) 349 | 350 | # compute the on-policy y values 351 | target_qf = self.opt_info["target_qf"] 352 | target_policy = self.opt_info["target_policy"] 353 | 354 | next_actions, _ = target_policy.get_actions(next_obs) 355 | next_qvals = target_qf.get_qval(next_obs, next_actions) 356 | 357 | ys = rewards + (1. 
- terminals) * self.discount * next_qvals.reshape(-1) 358 | 359 | ### for critic update step 360 | f_train_qf = self.opt_info["f_train_qf"] 361 | qf_loss, qval, _ = f_train_qf(ys, obs, actions) 362 | 363 | target_qf.set_param_values( 364 | target_qf.get_param_values() * (1.0 - self.soft_target_tau) + 365 | self.qf.get_param_values() * self.soft_target_tau) 366 | self.qf_loss_averages.append(qf_loss) 367 | self.q_averages.append(qval) 368 | self.y_averages.append(ys) 369 | 370 | self.train_policy_itr += self.policy_updates_ratio 371 | train_policy_itr = 0 372 | 373 | 374 | while self.train_policy_itr > 0: 375 | 376 | ### for actor update step 377 | f_train_policy = self.opt_info["f_train_policy"] 378 | 379 | 380 | MC_SAMPLES=20 381 | all_qval = np.zeros(shape=(obs.shape[0], MC_SAMPLES)) 382 | for m in range(MC_SAMPLES): 383 | 384 | _, qval, _ = f_train_qf(ys, obs, actions) 385 | all_qval[:, m] = qval 386 | 387 | mean_qval = np.mean(all_qval, axis=1) 388 | var_qval = np.var(all_qval, axis=1) 389 | 390 | qval_uncertain = mean_qval + var_qval 391 | 392 | #policy_surr, _ = f_train_policy(obs) 393 | policy_surr, _ = f_train_policy(obs, qval_uncertain) 394 | 395 | target_policy.set_param_values( 396 | target_policy.get_param_values() * (1.0 - self.soft_target_tau) + 397 | self.policy.get_param_values() * self.soft_target_tau) 398 | self.policy_surr_averages.append(policy_surr) 399 | self.train_policy_itr -= 1 400 | train_policy_itr += 1 401 | return 1, train_policy_itr # number of itrs qf, policy are trained 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | def evaluate(self, epoch, pool): 411 | logger.log("Collecting samples for evaluation") 412 | paths = parallel_sampler.sample_paths( 413 | policy_params=self.policy.get_param_values(), 414 | max_samples=self.eval_samples, 415 | max_path_length=self.max_path_length, 416 | ) 417 | 418 | average_discounted_return = np.mean( 419 | [special.discount_return(path["rewards"], self.discount) for path in paths] 420 | ) 421 | 422 | returns = [sum(path["rewards"]) for path in paths] 423 | 424 | all_qs = np.concatenate(self.q_averages) 425 | all_ys = np.concatenate(self.y_averages) 426 | 427 | average_q_loss = np.mean(self.qf_loss_averages) 428 | average_policy_surr = np.mean(self.policy_surr_averages) 429 | average_action = np.mean(np.square(np.concatenate( 430 | [path["actions"] for path in paths] 431 | ))) 432 | 433 | policy_reg_param_norm = np.linalg.norm( 434 | self.policy.get_param_values(regularizable=True) 435 | ) 436 | qfun_reg_param_norm = np.linalg.norm( 437 | self.qf.get_param_values(regularizable=True) 438 | ) 439 | 440 | logger.record_tabular('Epoch', epoch) 441 | logger.record_tabular('Iteration', epoch) 442 | logger.record_tabular('AverageReturn', np.mean(returns)) 443 | logger.record_tabular('StdReturn', 444 | np.std(returns)) 445 | logger.record_tabular('MaxReturn', 446 | np.max(returns)) 447 | logger.record_tabular('MinReturn', 448 | np.min(returns)) 449 | if len(self.es_path_returns) > 0: 450 | logger.record_tabular('AverageEsReturn', 451 | np.mean(self.es_path_returns)) 452 | logger.record_tabular('StdEsReturn', 453 | np.std(self.es_path_returns)) 454 | logger.record_tabular('MaxEsReturn', 455 | np.max(self.es_path_returns)) 456 | logger.record_tabular('MinEsReturn', 457 | np.min(self.es_path_returns)) 458 | logger.record_tabular('AverageDiscountedReturn', 459 | average_discounted_return) 460 | logger.record_tabular('AverageQLoss', average_q_loss) 461 | logger.record_tabular('AveragePolicySurr', average_policy_surr) 462 | 
logger.record_tabular('AverageQ', np.mean(all_qs)) 463 | logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) 464 | logger.record_tabular('AverageY', np.mean(all_ys)) 465 | logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) 466 | logger.record_tabular('AverageAbsQYDiff', 467 | np.mean(np.abs(all_qs - all_ys))) 468 | logger.record_tabular('AverageAction', average_action) 469 | 470 | logger.record_tabular('PolicyRegParamNorm', 471 | policy_reg_param_norm) 472 | logger.record_tabular('QFunRegParamNorm', 473 | qfun_reg_param_norm) 474 | 475 | self.env.log_diagnostics(paths) 476 | self.policy.log_diagnostics(paths) 477 | 478 | self.qf_loss_averages = [] 479 | self.policy_surr_averages = [] 480 | 481 | self.q_averages = [] 482 | self.y_averages = [] 483 | self.es_path_returns = [] 484 | 485 | def update_plot(self): 486 | if self.plot: 487 | plotter.update_plot(self.policy, self.max_path_length) 488 | 489 | def get_epoch_snapshot(self, epoch): 490 | return dict( 491 | env=self.env, 492 | epoch=epoch, 493 | qf=self.qf, 494 | policy=self.policy, 495 | target_qf=self.opt_info["target_qf"], 496 | target_policy=self.opt_info["target_policy"], 497 | es=self.es, 498 | ) 499 | -------------------------------------------------------------------------------- /bayesian_network.py: -------------------------------------------------------------------------------- 1 | import sandbox.rocky.tf.core.layers as L 2 | import tensorflow as tf 3 | import numpy as np 4 | import itertools 5 | from rllab.core.serializable import Serializable 6 | from sandbox.rocky.tf.core.parameterized import Parameterized 7 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 8 | 9 | 10 | class MLP(LayersPowered, Serializable): 11 | def __init__(self, name, output_dim, hidden_sizes, hidden_nonlinearity, dropout_prob, 12 | output_nonlinearity, hidden_W_init=L.XavierUniformInitializer(), hidden_b_init=tf.zeros_initializer(), 13 | output_W_init=L.XavierUniformInitializer(), output_b_init=tf.zeros_initializer(), 14 | input_var=None, input_layer=None, input_shape=None, batch_normalization=False, weight_normalization=False, 15 | ): 16 | 17 | Serializable.quick_init(self, locals()) 18 | 19 | with tf.variable_scope(name): 20 | if input_layer is None: 21 | l_in = L.InputLayer(shape=(None,) + input_shape, input_var=input_var, name="input") 22 | else: 23 | l_in = input_layer 24 | self._layers = [l_in] 25 | 26 | ##applying dropout on all layers? 27 | l_hid_dropout_input = L.DropoutLayer(l_in, p = dropout_prob) 28 | l_hid = l_hid_dropout_input 29 | 30 | 31 | # l_hid = l_in 32 | if batch_normalization: 33 | l_hid = L.batch_norm(l_hid) 34 | for idx, hidden_size in enumerate(hidden_sizes): 35 | l_hid = L.DenseLayer( 36 | l_hid, 37 | num_units=hidden_size, 38 | nonlinearity=hidden_nonlinearity, 39 | name="hidden_%d" % idx, 40 | W=hidden_W_init, 41 | b=hidden_b_init, 42 | weight_normalization=weight_normalization 43 | ) 44 | if batch_normalization: 45 | l_hid = L.batch_norm(l_hid) 46 | self._layers.append(l_hid) 47 | 48 | 49 | ###applying dropout to the last hidden layer? 
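# Context for the dropout placement in this network: these DropoutLayers are meant to stay
# stochastic at prediction time (i.e. L.get_output called with deterministic=False), which is
# what makes the MLP an MC-dropout approximation of a Bayesian network. Each stochastic
# forward pass is one draw from the approximate posterior, so the mean over several passes
# gives the prediction and their spread gives the uncertainty used by the DDPG variants above.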
50 | l_hid_dropout = L.DropoutLayer(l_hid, p=dropout_prob) 51 | 52 | l_out = L.DenseLayer( 53 | l_hid_dropout, 54 | num_units=output_dim, 55 | nonlinearity=output_nonlinearity, 56 | name="output", 57 | W=output_W_init, 58 | b=output_b_init, 59 | weight_normalization=weight_normalization 60 | ) 61 | 62 | # l_out = L.DenseLayer( 63 | # l_hid, 64 | # num_units=output_dim, 65 | # nonlinearity=output_nonlinearity, 66 | # name="output", 67 | # W=output_W_init, 68 | # b=output_b_init, 69 | # weight_normalization=weight_normalization 70 | # ) 71 | 72 | #Alternative, making output layer the dropout layer 73 | # l_out = L.DropoutLayer(l_hid, p=dropout_prob) 74 | 75 | if batch_normalization: 76 | l_out = L.batch_norm(l_out) 77 | 78 | 79 | self._layers.append(l_out) 80 | self._l_in = l_in 81 | self._l_out = l_out 82 | # self._input_var = l_in.input_var 83 | self._output = L.get_output(l_out) 84 | 85 | LayersPowered.__init__(self, l_out) 86 | 87 | @property 88 | def input_layer(self): 89 | return self._l_in 90 | 91 | @property 92 | def output_layer(self): 93 | return self._l_out 94 | 95 | @property 96 | def input_var(self): 97 | return self._l_in.input_var 98 | 99 | @property 100 | def layers(self): 101 | return self._layers 102 | 103 | @property 104 | def output(self): 105 | return self._output 106 | 107 | 108 | class ConvNetwork(LayersPowered, Serializable): 109 | def __init__(self, name, input_shape, output_dim, 110 | conv_filters, conv_filter_sizes, conv_strides, conv_pads, 111 | hidden_sizes, hidden_nonlinearity, output_nonlinearity, 112 | hidden_W_init=L.XavierUniformInitializer(), hidden_b_init=tf.zeros_initializer(), 113 | output_W_init=L.XavierUniformInitializer(), output_b_init=tf.zeros_initializer(), 114 | input_var=None, input_layer=None, batch_normalization=False, weight_normalization=False): 115 | Serializable.quick_init(self, locals()) 116 | """ 117 | A network composed of several convolution layers followed by some fc layers. 118 | input_shape: (width,height,channel) 119 | HOWEVER, network inputs are assumed flattened. This network will first unflatten the inputs and then apply the standard convolutions and so on. 
120 | conv_filters: a list of numbers of convolution kernel 121 | conv_filter_sizes: a list of sizes (int) of the convolution kernels 122 | conv_strides: a list of strides (int) of the conv kernels 123 | conv_pads: a list of pad formats (either 'SAME' or 'VALID') 124 | hidden_nonlinearity: a nonlinearity from tf.nn, shared by all conv and fc layers 125 | hidden_sizes: a list of numbers of hidden units for all fc layers 126 | """ 127 | with tf.variable_scope(name): 128 | if input_layer is not None: 129 | l_in = input_layer 130 | l_hid = l_in 131 | elif len(input_shape) == 3: 132 | l_in = L.InputLayer(shape=(None, np.prod(input_shape)), input_var=input_var, name="input") 133 | l_hid = L.reshape(l_in, ([0],) + input_shape, name="reshape_input") 134 | elif len(input_shape) == 2: 135 | l_in = L.InputLayer(shape=(None, np.prod(input_shape)), input_var=input_var, name="input") 136 | input_shape = (1,) + input_shape 137 | l_hid = L.reshape(l_in, ([0],) + input_shape, name="reshape_input") 138 | else: 139 | l_in = L.InputLayer(shape=(None,) + input_shape, input_var=input_var, name="input") 140 | l_hid = l_in 141 | 142 | if batch_normalization: 143 | l_hid = L.batch_norm(l_hid) 144 | for idx, conv_filter, filter_size, stride, pad in zip( 145 | range(len(conv_filters)), 146 | conv_filters, 147 | conv_filter_sizes, 148 | conv_strides, 149 | conv_pads, 150 | ): 151 | l_hid = L.Conv2DLayer( 152 | l_hid, 153 | num_filters=conv_filter, 154 | filter_size=filter_size, 155 | stride=(stride, stride), 156 | pad=pad, 157 | nonlinearity=hidden_nonlinearity, 158 | name="conv_hidden_%d" % idx, 159 | weight_normalization=weight_normalization, 160 | ) 161 | if batch_normalization: 162 | l_hid = L.batch_norm(l_hid) 163 | 164 | if output_nonlinearity == L.spatial_expected_softmax: 165 | assert len(hidden_sizes) == 0 166 | assert output_dim == conv_filters[-1] * 2 167 | l_hid.nonlinearity = tf.identity 168 | l_out = L.SpatialExpectedSoftmaxLayer(l_hid) 169 | else: 170 | l_hid = L.flatten(l_hid, name="conv_flatten") 171 | for idx, hidden_size in enumerate(hidden_sizes): 172 | l_hid = L.DenseLayer( 173 | l_hid, 174 | num_units=hidden_size, 175 | nonlinearity=hidden_nonlinearity, 176 | name="hidden_%d" % idx, 177 | W=hidden_W_init, 178 | b=hidden_b_init, 179 | weight_normalization=weight_normalization, 180 | ) 181 | if batch_normalization: 182 | l_hid = L.batch_norm(l_hid) 183 | l_out = L.DenseLayer( 184 | l_hid, 185 | num_units=output_dim, 186 | nonlinearity=output_nonlinearity, 187 | name="output", 188 | W=output_W_init, 189 | b=output_b_init, 190 | weight_normalization=weight_normalization, 191 | ) 192 | if batch_normalization: 193 | l_out = L.batch_norm(l_out) 194 | self._l_in = l_in 195 | self._l_out = l_out 196 | # self._input_var = l_in.input_var 197 | 198 | LayersPowered.__init__(self, l_out) 199 | 200 | @property 201 | def input_layer(self): 202 | return self._l_in 203 | 204 | @property 205 | def output_layer(self): 206 | return self._l_out 207 | 208 | @property 209 | def input_var(self): 210 | return self._l_in.input_var 211 | 212 | 213 | class GRUNetwork(object): 214 | def __init__(self, name, input_shape, output_dim, hidden_dim, hidden_nonlinearity=tf.nn.relu, 215 | gru_layer_cls=L.GRULayer, 216 | output_nonlinearity=None, input_var=None, input_layer=None, layer_args=None): 217 | with tf.variable_scope(name): 218 | if input_layer is None: 219 | l_in = L.InputLayer(shape=(None, None) + input_shape, input_var=input_var, name="input") 220 | else: 221 | l_in = input_layer 222 | l_step_input = 
L.InputLayer(shape=(None,) + input_shape, name="step_input") 223 | l_step_prev_state = L.InputLayer(shape=(None, hidden_dim), name="step_prev_state") 224 | if layer_args is None: 225 | layer_args = dict() 226 | l_gru = gru_layer_cls(l_in, num_units=hidden_dim, hidden_nonlinearity=hidden_nonlinearity, 227 | hidden_init_trainable=False, name="gru", **layer_args) 228 | l_gru_flat = L.ReshapeLayer( 229 | l_gru, shape=(-1, hidden_dim), 230 | name="gru_flat" 231 | ) 232 | l_output_flat = L.DenseLayer( 233 | l_gru_flat, 234 | num_units=output_dim, 235 | nonlinearity=output_nonlinearity, 236 | name="output_flat" 237 | ) 238 | l_output = L.OpLayer( 239 | l_output_flat, 240 | op=lambda flat_output, l_input: 241 | tf.reshape(flat_output, tf.stack((tf.shape(l_input)[0], tf.shape(l_input)[1], -1))), 242 | shape_op=lambda flat_output_shape, l_input_shape: 243 | (l_input_shape[0], l_input_shape[1], flat_output_shape[-1]), 244 | extras=[l_in], 245 | name="output" 246 | ) 247 | l_step_state = l_gru.get_step_layer(l_step_input, l_step_prev_state, name="step_state") 248 | l_step_hidden = l_step_state 249 | l_step_output = L.DenseLayer( 250 | l_step_hidden, 251 | num_units=output_dim, 252 | nonlinearity=output_nonlinearity, 253 | W=l_output_flat.W, 254 | b=l_output_flat.b, 255 | name="step_output" 256 | ) 257 | 258 | self._l_in = l_in 259 | self._hid_init_param = l_gru.h0 260 | self._l_gru = l_gru 261 | self._l_out = l_output 262 | self._l_step_input = l_step_input 263 | self._l_step_prev_state = l_step_prev_state 264 | self._l_step_hidden = l_step_hidden 265 | self._l_step_state = l_step_state 266 | self._l_step_output = l_step_output 267 | self._hidden_dim = hidden_dim 268 | 269 | @property 270 | def state_dim(self): 271 | return self._hidden_dim 272 | 273 | @property 274 | def hidden_dim(self): 275 | return self._hidden_dim 276 | 277 | @property 278 | def input_layer(self): 279 | return self._l_in 280 | 281 | @property 282 | def input_var(self): 283 | return self._l_in.input_var 284 | 285 | @property 286 | def output_layer(self): 287 | return self._l_out 288 | 289 | @property 290 | def recurrent_layer(self): 291 | return self._l_gru 292 | 293 | @property 294 | def step_input_layer(self): 295 | return self._l_step_input 296 | 297 | @property 298 | def step_prev_state_layer(self): 299 | return self._l_step_prev_state 300 | 301 | @property 302 | def step_hidden_layer(self): 303 | return self._l_step_hidden 304 | 305 | @property 306 | def step_state_layer(self): 307 | return self._l_step_state 308 | 309 | @property 310 | def step_output_layer(self): 311 | return self._l_step_output 312 | 313 | @property 314 | def hid_init_param(self): 315 | return self._hid_init_param 316 | 317 | @property 318 | def state_init_param(self): 319 | return self._hid_init_param 320 | 321 | 322 | class LSTMNetwork(object): 323 | def __init__(self, name, input_shape, output_dim, hidden_dim, hidden_nonlinearity=tf.nn.relu, 324 | lstm_layer_cls=L.LSTMLayer, 325 | output_nonlinearity=None, input_var=None, input_layer=None, forget_bias=1.0, use_peepholes=False, 326 | layer_args=None): 327 | with tf.variable_scope(name): 328 | if input_layer is None: 329 | l_in = L.InputLayer(shape=(None, None) + input_shape, input_var=input_var, name="input") 330 | else: 331 | l_in = input_layer 332 | l_step_input = L.InputLayer(shape=(None,) + input_shape, name="step_input") 333 | # contains previous hidden and cell state 334 | l_step_prev_state = L.InputLayer(shape=(None, hidden_dim * 2), name="step_prev_state") 335 | if layer_args is None: 336 | 
layer_args = dict() 337 | l_lstm = lstm_layer_cls(l_in, num_units=hidden_dim, hidden_nonlinearity=hidden_nonlinearity, 338 | hidden_init_trainable=False, name="lstm", forget_bias=forget_bias, 339 | cell_init_trainable=False, use_peepholes=use_peepholes, **layer_args) 340 | l_lstm_flat = L.ReshapeLayer( 341 | l_lstm, shape=(-1, hidden_dim), 342 | name="lstm_flat" 343 | ) 344 | l_output_flat = L.DenseLayer( 345 | l_lstm_flat, 346 | num_units=output_dim, 347 | nonlinearity=output_nonlinearity, 348 | name="output_flat" 349 | ) 350 | l_output = L.OpLayer( 351 | l_output_flat, 352 | op=lambda flat_output, l_input: 353 | tf.reshape(flat_output, tf.stack((tf.shape(l_input)[0], tf.shape(l_input)[1], -1))), 354 | shape_op=lambda flat_output_shape, l_input_shape: 355 | (l_input_shape[0], l_input_shape[1], flat_output_shape[-1]), 356 | extras=[l_in], 357 | name="output" 358 | ) 359 | l_step_state = l_lstm.get_step_layer(l_step_input, l_step_prev_state, name="step_state") 360 | l_step_hidden = L.SliceLayer(l_step_state, indices=slice(hidden_dim), name="step_hidden") 361 | l_step_cell = L.SliceLayer(l_step_state, indices=slice(hidden_dim, None), name="step_cell") 362 | l_step_output = L.DenseLayer( 363 | l_step_hidden, 364 | num_units=output_dim, 365 | nonlinearity=output_nonlinearity, 366 | W=l_output_flat.W, 367 | b=l_output_flat.b, 368 | name="step_output" 369 | ) 370 | 371 | self._l_in = l_in 372 | self._hid_init_param = l_lstm.h0 373 | self._cell_init_param = l_lstm.c0 374 | self._l_lstm = l_lstm 375 | self._l_out = l_output 376 | self._l_step_input = l_step_input 377 | self._l_step_prev_state = l_step_prev_state 378 | self._l_step_hidden = l_step_hidden 379 | self._l_step_cell = l_step_cell 380 | self._l_step_state = l_step_state 381 | self._l_step_output = l_step_output 382 | self._hidden_dim = hidden_dim 383 | 384 | @property 385 | def state_dim(self): 386 | return self._hidden_dim * 2 387 | 388 | @property 389 | def input_layer(self): 390 | return self._l_in 391 | 392 | @property 393 | def input_var(self): 394 | return self._l_in.input_var 395 | 396 | @property 397 | def output_layer(self): 398 | return self._l_out 399 | 400 | @property 401 | def recurrent_layer(self): 402 | return self._l_lstm 403 | 404 | @property 405 | def step_input_layer(self): 406 | return self._l_step_input 407 | 408 | @property 409 | def step_prev_state_layer(self): 410 | return self._l_step_prev_state 411 | 412 | @property 413 | def step_hidden_layer(self): 414 | return self._l_step_hidden 415 | 416 | @property 417 | def step_state_layer(self): 418 | return self._l_step_state 419 | 420 | @property 421 | def step_cell_layer(self): 422 | return self._l_step_cell 423 | 424 | @property 425 | def step_output_layer(self): 426 | return self._l_step_output 427 | 428 | @property 429 | def hid_init_param(self): 430 | return self._hid_init_param 431 | 432 | @property 433 | def cell_init_param(self): 434 | return self._cell_init_param 435 | 436 | @property 437 | def state_init_param(self): 438 | return tf.concat(axis=0, values=[self._hid_init_param, self._cell_init_param]) 439 | 440 | 441 | class ConvMergeNetwork(LayersPowered, Serializable): 442 | """ 443 | This network allows the input to consist of a convolution-friendly component, plus a non-convolution-friendly 444 | component. These two components will be concatenated in the fully connected layers. There can also be a list of 445 | optional layers for the non-convolution-friendly component alone. 
446 | 447 | 448 | The input to the network should be a matrix where each row is a single input entry, with both the aforementioned 449 | components flattened out and then concatenated together 450 | """ 451 | 452 | def __init__(self, name, input_shape, extra_input_shape, output_dim, hidden_sizes, 453 | conv_filters, conv_filter_sizes, conv_strides, conv_pads, 454 | extra_hidden_sizes=None, 455 | hidden_W_init=L.XavierUniformInitializer(), hidden_b_init=tf.zeros_initializer(), 456 | output_W_init=L.XavierUniformInitializer(), output_b_init=tf.zeros_initializer(), 457 | hidden_nonlinearity=tf.nn.relu, 458 | output_nonlinearity=None, 459 | input_var=None, input_layer=None): 460 | Serializable.quick_init(self, locals()) 461 | 462 | if extra_hidden_sizes is None: 463 | extra_hidden_sizes = [] 464 | 465 | with tf.variable_scope(name): 466 | 467 | input_flat_dim = np.prod(input_shape) 468 | extra_input_flat_dim = np.prod(extra_input_shape) 469 | total_input_flat_dim = input_flat_dim + extra_input_flat_dim 470 | 471 | if input_layer is None: 472 | l_in = L.InputLayer(shape=(None, total_input_flat_dim), input_var=input_var, name="input") 473 | else: 474 | l_in = input_layer 475 | 476 | l_conv_in = L.reshape( 477 | L.SliceLayer( 478 | l_in, 479 | indices=slice(input_flat_dim), 480 | name="conv_slice" 481 | ), 482 | ([0],) + input_shape, 483 | name="conv_reshaped" 484 | ) 485 | l_extra_in = L.reshape( 486 | L.SliceLayer( 487 | l_in, 488 | indices=slice(input_flat_dim, None), 489 | name="extra_slice" 490 | ), 491 | ([0],) + extra_input_shape, 492 | name="extra_reshaped" 493 | ) 494 | 495 | l_conv_hid = l_conv_in 496 | for idx, conv_filter, filter_size, stride, pad in zip( 497 | range(len(conv_filters)), 498 | conv_filters, 499 | conv_filter_sizes, 500 | conv_strides, 501 | conv_pads, 502 | ): 503 | l_conv_hid = L.Conv2DLayer( 504 | l_conv_hid, 505 | num_filters=conv_filter, 506 | filter_size=filter_size, 507 | stride=(stride, stride), 508 | pad=pad, 509 | nonlinearity=hidden_nonlinearity, 510 | name="conv_hidden_%d" % idx, 511 | ) 512 | 513 | l_extra_hid = l_extra_in 514 | for idx, hidden_size in enumerate(extra_hidden_sizes): 515 | l_extra_hid = L.DenseLayer( 516 | l_extra_hid, 517 | num_units=hidden_size, 518 | nonlinearity=hidden_nonlinearity, 519 | name="extra_hidden_%d" % idx, 520 | W=hidden_W_init, 521 | b=hidden_b_init, 522 | ) 523 | 524 | l_joint_hid = L.concat( 525 | [L.flatten(l_conv_hid, name="conv_hidden_flat"), l_extra_hid], 526 | name="joint_hidden" 527 | ) 528 | 529 | for idx, hidden_size in enumerate(hidden_sizes): 530 | l_joint_hid = L.DenseLayer( 531 | l_joint_hid, 532 | num_units=hidden_size, 533 | nonlinearity=hidden_nonlinearity, 534 | name="joint_hidden_%d" % idx, 535 | W=hidden_W_init, 536 | b=hidden_b_init, 537 | ) 538 | l_out = L.DenseLayer( 539 | l_joint_hid, 540 | num_units=output_dim, 541 | nonlinearity=output_nonlinearity, 542 | name="output", 543 | W=output_W_init, 544 | b=output_b_init, 545 | ) 546 | self._l_in = l_in 547 | self._l_out = l_out 548 | 549 | LayersPowered.__init__(self, [l_out], input_layers=[l_in]) 550 | 551 | @property 552 | def input_layer(self): 553 | return self._l_in 554 | 555 | @property 556 | def output_layer(self): 557 | return self._l_out 558 | 559 | @property 560 | def input_var(self): 561 | return self._l_in.input_var 562 | --------------------------------------------------------------------------------
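The ConvMergeNetwork docstring above describes an input matrix in which each row is a flattened convolution-friendly component concatenated with a flattened non-convolutional component. As a usage illustration only — the shapes, variable names, channels-last image layout, and the TF1-style session below are assumptions about a standard rllab `sandbox.rocky.tf` setup, not something this repository pins down — a minimal instantiation sketch might look like:

    # Usage sketch (illustrative only): build a ConvMergeNetwork on an input that
    # concatenates a flattened image with a flattened low-dimensional vector.
    # Assumes the rllab sandbox.rocky.tf stack and a TF1-style graph/session;
    # the shapes, names, and channels-last image layout are assumptions.
    import numpy as np
    import tensorflow as tf

    import sandbox.rocky.tf.core.layers as L
    from sandbox.rocky.tf.core.network import ConvMergeNetwork

    image_shape = (32, 32, 3)   # convolution-friendly component (assumed H, W, C)
    extra_shape = (10,)         # non-convolution-friendly component (e.g. joint angles)

    net = ConvMergeNetwork(
        name="conv_merge_example",
        input_shape=image_shape,
        extra_input_shape=extra_shape,
        output_dim=4,
        hidden_sizes=(64,),          # joint fully connected layers after concatenation
        conv_filters=(16, 16),
        conv_filter_sizes=(3, 3),
        conv_strides=(2, 2),
        conv_pads=("SAME", "SAME"),
        extra_hidden_sizes=(32,),    # optional layers for the extra component alone
        hidden_nonlinearity=tf.nn.relu,
    )

    # Each input row is the flattened image concatenated with the flattened extras,
    # matching the (None, total_input_flat_dim) InputLayer built by the constructor.
    flat_dim = int(np.prod(image_shape) + np.prod(extra_shape))
    batch = np.zeros((5, flat_dim), dtype=np.float32)

    output_sym = L.get_output(net.output_layer)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        out = sess.run(output_sym, feed_dict={net.input_var: batch})
        # out has shape (5, 4): one output_dim-sized row per input entry

A related design choice is visible in the recurrent classes above: GRUNetwork and LSTMNetwork expose step_input_layer, step_prev_state_layer, and step_output_layer so a policy can advance one timestep at a time during rollouts, and the per-step output layer reuses l_output_flat's W and b, which keeps the step-wise outputs consistent with the batch-unrolled output head.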