├── bayesian_ddpg ├── params.pkl ├── progress.csv ├── params.json └── debug.log ├── __pycache__ ├── ddpg_bayesian.cpython-35.pyc ├── sampling_utils.cpython-35.pyc ├── ddpg_bayesian_mean.cpython-35.pyc ├── dropout_exploration.cpython-35.pyc ├── ddpg_bayesian_thompson.cpython-35.pyc ├── deterministic_mlp_policy_bayesian.cpython-35.pyc └── continuous_mlp_q_function_bayesian.cpython-35.pyc ├── deterministic_mlp_policy_bayesian.py ├── dropout_exploration.py ├── run_bayesian_ddpg.py ├── continuous_mlp_q_function_bayesian.py ├── sampling_utils.py ├── dropout_gal_neuralnet_tf_example.py ├── variational_inference_examples.py ├── ddpg_bayesian_thompson.py ├── ddpg_bayesian_mean.py ├── ddpg_bayesian.py └── bayesian_network.py /bayesian_ddpg/params.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/bayesian_ddpg/params.pkl -------------------------------------------------------------------------------- /__pycache__/ddpg_bayesian.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/ddpg_bayesian.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/sampling_utils.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/sampling_utils.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/ddpg_bayesian_mean.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/ddpg_bayesian_mean.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/dropout_exploration.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/dropout_exploration.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/ddpg_bayesian_thompson.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/ddpg_bayesian_thompson.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/deterministic_mlp_policy_bayesian.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/deterministic_mlp_policy_bayesian.cpython-35.pyc -------------------------------------------------------------------------------- /__pycache__/continuous_mlp_q_function_bayesian.cpython-35.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Riashat/Bayesian-Exploration-Deep-RL/HEAD/__pycache__/continuous_mlp_q_function_bayesian.cpython-35.pyc -------------------------------------------------------------------------------- /bayesian_ddpg/progress.csv: -------------------------------------------------------------------------------- 1 | 
AverageAbsQYDiff,MaxReturn,Epoch,AverageAbsQ,MaxEsReturn,StdReturn,PolicyRegParamNorm,AverageQLoss,AverageDiscountedReturn,AverageY,Iteration,StdEsReturn,QFunRegParamNorm,AverageEsReturn,MinEsReturn,AverageAbsY,AverageAction,AverageReturn,AverageQ,MinReturn,AveragePolicySurr 2 | 0.758090012217,9.31850444693,9,0.365071,54.8695015601,0.04653903996,11.23,0.74919,8.76006204415,0.48040966073,9,7.05844821432,11.13,9.9070100333,1.67898668029,0.593880576329,0.218841,9.18140387997,-0.243523,9.07004472335,-0.053608 3 | 0.177415410708,1.91244584333,10,0.823965,8.61305045778,0.0811045362752,13.2281,0.0712594,1.7088114456,0.805172051351,10,1.56573821189,11.3933,2.15915361844,0.0122472149906,0.845080383496,0.999596,1.70341682201,0.795785,1.49563580939,-0.672253 4 | 0.12357864363,3.89928951199,11,1.19482,6.41710902887,0.115736916112,13.4963,0.0288876,3.53009868395,1.14473330561,11,0.737778156092,11.7883,4.05680174472,1.23751043098,1.20288642826,1.0,3.58419018176,1.1393,3.26894831589,-0.974544 5 | 0.144135365208,3.89855122419,12,1.66405,6.2912818238,0.113467534219,13.4963,0.0412518,3.53067386378,1.59586680965,12,0.57573980753,12.3922,4.13937903654,2.10169953317,1.6723605747,1.0,3.58477456358,1.59025,3.29676145871,-1.35587 6 | -------------------------------------------------------------------------------- /deterministic_mlp_policy_bayesian.py: -------------------------------------------------------------------------------- 1 | import lasagne 2 | import lasagne.layers as L 3 | import lasagne.nonlinearities as NL 4 | import lasagne.init as LI 5 | from rllab.core.serializable import Serializable 6 | from rllab.misc import ext 7 | from rllab.misc.overrides import overrides 8 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 9 | from sandbox.rocky.tf.core.network import MLP 10 | from sandbox.rocky.tf.distributions.categorical import Categorical 11 | from sandbox.rocky.tf.policies.base import Policy 12 | from sandbox.rocky.tf.misc import tensor_utils 13 | 14 | import sandbox.rocky.tf.core.layers as L 15 | from sandbox.rocky.tf.core.layers import batch_norm 16 | 17 | from sandbox.rocky.tf.spaces.discrete import Discrete 18 | import tensorflow as tf 19 | 20 | 21 | class DeterministicMLPPolicy(Policy, LayersPowered, Serializable): 22 | def __init__( 23 | self, 24 | name, 25 | env_spec, 26 | hidden_sizes=(32, 32), 27 | hidden_nonlinearity=tf.nn.relu, 28 | output_nonlinearity=tf.nn.tanh, 29 | prob_network=None, 30 | bn=False): 31 | Serializable.quick_init(self, locals()) 32 | 33 | ## Apply MC Dropout on the MLP networks here 34 | 35 | with tf.variable_scope(name): 36 | if prob_network is None: 37 | prob_network = MLP( 38 | input_shape=(env_spec.observation_space.flat_dim,), 39 | output_dim=env_spec.action_space.flat_dim, 40 | hidden_sizes=hidden_sizes, 41 | hidden_nonlinearity=hidden_nonlinearity, 42 | output_nonlinearity=output_nonlinearity, 43 | # batch_normalization=True, 44 | name="prob_network", 45 | ) 46 | 47 | self._l_prob = prob_network.output_layer 48 | self._l_obs = prob_network.input_layer 49 | self._f_prob = tensor_utils.compile_function( 50 | [prob_network.input_layer.input_var], 51 | L.get_output(prob_network.output_layer, deterministic=True) 52 | ) 53 | 54 | self.prob_network = prob_network 55 | 56 | # Note the deterministic=True argument. It makes sure that when getting 57 | # actions from single observations, we do not update params in the 58 | # batch normalization layers. 
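# A minimal sketch of the MC-dropout hook intended above, assuming dropout layers are
# added inside prob_network (mirroring the L.DropoutLayer usage in
# ContinuousMLPQFunction below): compiling a second forward pass with
# deterministic=False keeps the dropout masks active, so repeated calls yield
# stochastic action samples.
#
# self._f_prob_drop = tensor_utils.compile_function(
#     [prob_network.input_layer.input_var],
#     L.get_output(prob_network.output_layer, deterministic=False)
# )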
59 | # TODO: this doesn't currently work properly in the tf version so we leave out batch_norm 60 | super(DeterministicMLPPolicy, self).__init__(env_spec) 61 | LayersPowered.__init__(self, [prob_network.output_layer]) 62 | 63 | 64 | 65 | @property 66 | def vectorized(self): 67 | return True 68 | 69 | @overrides 70 | def get_action(self, observation): 71 | flat_obs = self.observation_space.flatten(observation) 72 | action = self._f_prob([flat_obs])[0] 73 | return action, dict() 74 | 75 | @overrides 76 | def get_actions(self, observations): 77 | flat_obs = self.observation_space.flatten_n(observations) 78 | actions = self._f_prob(flat_obs) 79 | return actions, dict() 80 | 81 | def get_action_sym(self, obs_var): 82 | return L.get_output(self.prob_network.output_layer, obs_var) 83 | -------------------------------------------------------------------------------- /dropout_exploration.py: -------------------------------------------------------------------------------- 1 | from rllab.misc.overrides import overrides 2 | from rllab.misc.ext import AttrDict 3 | from rllab.core.serializable import Serializable 4 | from rllab.spaces.box import Box 5 | from rllab.exploration_strategies.base import ExplorationStrategy 6 | import numpy as np 7 | import numpy.random as nr 8 | 9 | 10 | class MCDropout(ExplorationStrategy, Serializable): 11 | """ 12 | This strategy implements the Ornstein-Uhlenbeck process, which adds 13 | time-correlated noise to the actions taken by the deterministic policy. 14 | The OU process satisfies the following stochastic differential equation: 15 | dxt = theta*(mu - xt)*dt + sigma*dWt 16 | where Wt denotes the Wiener process 17 | """ 18 | 19 | def __init__(self, env_spec, mu=0, theta=0.15, sigma=0.3, **kwargs): 20 | assert isinstance(env_spec.action_space, Box) 21 | assert len(env_spec.action_space.shape) == 1 22 | Serializable.quick_init(self, locals()) 23 | self.mu = mu 24 | self.theta = theta 25 | self.sigma = sigma 26 | self.action_space = env_spec.action_space 27 | self.state = np.ones(self.action_space.flat_dim) * self.mu 28 | self.reset() 29 | 30 | def __getstate__(self): 31 | d = Serializable.__getstate__(self) 32 | d["state"] = self.state 33 | return d 34 | 35 | def __setstate__(self, d): 36 | Serializable.__setstate__(self, d) 37 | self.state = d["state"] 38 | 39 | @overrides 40 | def reset(self): 41 | self.state = np.ones(self.action_space.flat_dim) * self.mu 42 | 43 | def evolve_state(self): 44 | x = self.state 45 | dx = self.theta * (self.mu - x) + self.sigma * nr.randn(len(x)) 46 | self.state = x + dx 47 | return self.state 48 | 49 | @overrides 50 | def get_action(self, t, observation, policy, **kwargs): 51 | action, _ = policy.get_action(observation) 52 | ou_state = self.evolve_state() 53 | return np.clip(action + ou_state, self.action_space.low, self.action_space.high) 54 | 55 | 56 | # def get_action(self, t, observation, policy, **kwargs): 57 | # #applying MC Dropout and taking the mean action? 
58 | # action, _ = policy.get_action(observation) 59 | 60 | # mc_dropout = 10 61 | # all_actions = np.zeros(shape=(mc_dropout, action.shape[0])) 62 | # for d in range(mc_dropout): 63 | # action, _ = policy.get_action(observation) 64 | # all_actions[d, :] = action 65 | # mean_action = np.mean(all_actions, axis=0) 66 | 67 | # return mean_action 68 | 69 | 70 | # def get_stochastic_action(self, t, observation, policy, **kwargs): 71 | # action, _ = policy.get_action(observation) 72 | # mc_dropout = 10 73 | # all_actions = np.zeros(shape=(mc_dropout, action.shape[0])) 74 | 75 | # for d in range(mc_dropout): 76 | # action, _ = policy.get_action(observation) 77 | # all_actions[d, :] = action 78 | 79 | # mean_action = np.mean(all_actions, axis=0) 80 | 81 | 82 | # return mean_action 83 | 84 | 85 | 86 | if __name__ == "__main__": 87 | ou = MCDropout(env_spec=AttrDict(action_space=Box(low=-1, high=1, shape=(1,))), mu=0, theta=0.15, sigma=0.3) 88 | states = [] 89 | for i in range(1000): 90 | states.append(ou.evolve_state()[0]) 91 | import matplotlib.pyplot as plt 92 | 93 | plt.plot(states) 94 | plt.show() 95 | -------------------------------------------------------------------------------- /run_bayesian_ddpg.py: -------------------------------------------------------------------------------- 1 | from ddpg_bayesian_thompson import DDPG as DDPG_Thompson 2 | from ddpg_bayesian_mean import DDPG as DDPG_Mean 3 | from ddpg_bayesian import DDPG as DDPG_Bayesian 4 | from dropout_exploration import MCDropout 5 | from deterministic_mlp_policy_bayesian import DeterministicMLPPolicy 6 | from continuous_mlp_q_function_bayesian import ContinuousMLPQFunction 7 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 8 | from rllab.envs.normalized_env import normalize 9 | from rllab.misc.instrument import stub, run_experiment_lite 10 | from sandbox.rocky.tf.envs.base import TfEnv 11 | from rllab.envs.gym_env import GymEnv 12 | from rllab.misc import ext 13 | import pickle 14 | import tensorflow as tf 15 | import argparse 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("type", help="Type of DDPG to run: unified, unified-gated, regular") 19 | parser.add_argument("env", help="The environment name from OpenAIGym environments") 20 | parser.add_argument("--num_epochs", default=100, type=int) 21 | parser.add_argument("--plot", action="store_true") 22 | # parser.add_argument("--data_dir", default="./data/") 23 | args = parser.parse_args() 24 | 25 | stub(globals()) 26 | ext.set_seed(1) 27 | 28 | supported_gym_envs = ["MountainCarContinuous-v0", "Hopper-v1", "Walker2d-v1", "Humanoid-v1", "Reacher-v1", "HalfCheetah-v1", "Swimmer-v1", "HumanoidStandup-v1"] 29 | 30 | other_env_class_map = { "Cartpole" : CartpoleEnv} 31 | 32 | if args.env in supported_gym_envs: 33 | gymenv = GymEnv(args.env, force_reset=True, record_video=False, record_log=False) 34 | # gymenv.env.seed(1) 35 | else: 36 | gymenv = other_env_class_map[args.env]() 37 | 38 | 39 | env = TfEnv(normalize(gymenv)) 40 | 41 | policy = DeterministicMLPPolicy( 42 | env_spec=env.spec, 43 | name="policy", 44 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
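# (Note: the hidden_sizes below actually give three hidden layers of 100, 50 and 25
# units, not the two 32-unit layers described in the comment above.)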
45 | hidden_sizes=(100, 50, 25), 46 | hidden_nonlinearity=tf.nn.relu, 47 | ) 48 | 49 | es = MCDropout(env_spec=env.spec) 50 | 51 | qf = ContinuousMLPQFunction(env_spec=env.spec, 52 | hidden_sizes=(100,100), 53 | hidden_nonlinearity=tf.nn.relu,) 54 | 55 | 56 | ddpg_type_map = {"Thompson" : DDPG_Thompson, "Mean" : DDPG_Mean, "Bayesian" : DDPG_Bayesian} 57 | 58 | ddpg_class = ddpg_type_map[args.type] 59 | 60 | ## loops: 61 | num_experiments = 1 62 | batch_size_values = [64] 63 | 64 | 65 | 66 | 67 | for b in range(len(batch_size_values)): 68 | 69 | for e in range(num_experiments): 70 | 71 | algo = ddpg_class( 72 | env=env, 73 | policy=policy, 74 | es=es, 75 | qf=qf, 76 | batch_size=64, 77 | max_path_length=env.horizon, 78 | epoch_length=1000, 79 | min_pool_size=10000, 80 | n_epochs=args.num_epochs, 81 | discount=0.99, 82 | scale_reward=1.0, 83 | qf_learning_rate=1e-3, 84 | policy_learning_rate=1e-4, 85 | # Uncomment both lines (this and the plot parameter below) to enable plotting 86 | plot=args.plot, 87 | ) 88 | 89 | 90 | run_experiment_lite( 91 | algo.train(), 92 | # log_dir=args.data_dir, 93 | # Number of parallel workers for sampling 94 | n_parallel=1, 95 | # Only keep the snapshot parameters for the last iteration 96 | snapshot_mode="last", 97 | # Specifies the seed for the experiment. If this is not provided, a random seed 98 | # will be used 99 | exp_name="Trial_Bayesian_Exploration/", 100 | seed=1, 101 | plot=args.plot, 102 | ) 103 | -------------------------------------------------------------------------------- /continuous_mlp_q_function_bayesian.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.q_functions.base import QFunction 2 | from rllab.core.serializable import Serializable 3 | from rllab.misc import ext 4 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 5 | from sandbox.rocky.tf.core.network import MLP 6 | from sandbox.rocky.tf.core.layers import batch_norm 7 | from sandbox.rocky.tf.distributions.categorical import Categorical 8 | from sandbox.rocky.tf.policies.base import StochasticPolicy 9 | from sandbox.rocky.tf.misc import tensor_utils 10 | import tensorflow as tf 11 | import sandbox.rocky.tf.core.layers as L 12 | 13 | import numpy as np 14 | 15 | 16 | class ContinuousMLPQFunction(QFunction, LayersPowered, Serializable): 17 | def __init__( 18 | self, 19 | env_spec, 20 | hidden_sizes=(32, 32), 21 | hidden_nonlinearity=tf.nn.relu, 22 | action_merge_layer=-2, 23 | output_nonlinearity=None, 24 | bn=False, 25 | dropout=0.1): 26 | Serializable.quick_init(self, locals()) 27 | 28 | l_obs = L.InputLayer(shape=(None, env_spec.observation_space.flat_dim), name="obs") 29 | l_action = L.InputLayer(shape=(None, env_spec.action_space.flat_dim), name="actions") 30 | 31 | n_layers = len(hidden_sizes) + 1 32 | 33 | if n_layers > 1: 34 | action_merge_layer = \ 35 | (action_merge_layer % n_layers + n_layers) % n_layers 36 | else: 37 | action_merge_layer = 1 38 | 39 | l_hidden = l_obs 40 | 41 | for idx, size in enumerate(hidden_sizes): 42 | if bn: 43 | l_hidden = batch_norm(l_hidden) 44 | 45 | if idx == action_merge_layer: 46 | l_hidden = L.ConcatLayer([l_hidden, l_action]) 47 | 48 | l_hidden = L.DenseLayer( 49 | l_hidden, 50 | num_units=size, 51 | nonlinearity=hidden_nonlinearity, 52 | name="h%d" % (idx + 1) 53 | ) 54 | 55 | l_hidden = L.DropoutLayer(l_hidden, dropout) 56 | 57 | 58 | 59 | if action_merge_layer == n_layers: 60 | l_hidden = L.ConcatLayer([l_hidden, l_action]) 61 | 62 | l_output = L.DenseLayer( 63 | 
l_hidden, 64 | num_units=1, 65 | nonlinearity=output_nonlinearity, 66 | name="output" 67 | ) 68 | 69 | output_var = L.get_output(l_output, deterministic=True) 70 | output_var_drop = L.get_output(l_output, deterministic=False) 71 | 72 | self._f_qval = tensor_utils.compile_function([l_obs.input_var, l_action.input_var], output_var) 73 | self._f_qval_drop = tensor_utils.compile_function([l_obs.input_var, l_action.input_var], output_var_drop) 74 | 75 | 76 | self._output_layer = l_output 77 | self._obs_layer = l_obs 78 | self._action_layer = l_action 79 | self._output_nonlinearity = output_nonlinearity 80 | 81 | LayersPowered.__init__(self, [l_output]) 82 | 83 | def get_qval(self, observations, actions): 84 | return self._f_qval(observations, actions) 85 | 86 | 87 | def get_qval_dropout(self, observations, actions): 88 | return self._f_qval_drop(observations, actions) 89 | 90 | 91 | def get_qval_sym(self, obs_var, action_var, **kwargs): 92 | qvals = L.get_output(self._output_layer, {self._obs_layer: obs_var, self._action_layer: action_var}, **kwargs) 93 | return tf.reshape(qvals, (-1,)) 94 | 95 | 96 | """ 97 | want this to return mean qvals + lamba * variance as the output 98 | """ 99 | def get_qval_plus_var_sym(self, obs_var, action_var, **kwargs): 100 | 101 | """ 102 | TO DO HERE 103 | """ 104 | 105 | mc_dropout = 5 106 | all_qvals = [] 107 | for m in range(mc_dropout): 108 | qvals = L.get_output(self._output_layer, {self._obs_layer: obs_var, self._action_layer: action_var}, **kwargs) 109 | all_qvals = np.append(all_qvals, qvals) 110 | 111 | return tf.reshape(qvals, (-1,)) 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /sampling_utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | import numpy as np 5 | import rllab.misc.logger as logger 6 | 7 | class SimpleReplayPool(object): 8 | """ 9 | Used from https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/rllab/pool/simple_pool.py 10 | """ 11 | def __init__( 12 | self, max_pool_size, observation_dim, action_dim, 13 | replacement_policy='stochastic', replacement_prob=1.0, 14 | max_skip_episode=10): 15 | self._observation_dim = observation_dim 16 | self._action_dim = action_dim 17 | self._max_pool_size = max_pool_size 18 | self._replacement_policy = replacement_policy 19 | self._replacement_prob = replacement_prob 20 | self._max_skip_episode = max_skip_episode 21 | self._observations = np.zeros( 22 | (max_pool_size, observation_dim), 23 | ) 24 | self._actions = np.zeros( 25 | (max_pool_size, action_dim), 26 | ) 27 | self._rewards = np.zeros(max_pool_size) 28 | self._terminals = np.zeros(max_pool_size, dtype='uint8') 29 | self._initials = np.zeros(max_pool_size, dtype='uint8') 30 | self._bottom = 0 31 | self._top = 0 32 | self._size = 0 33 | 34 | def add_sample(self, observation, action, reward, terminal, initial): 35 | self.check_replacement() 36 | self._observations[self._top] = observation 37 | self._actions[self._top] = action 38 | self._rewards[self._top] = reward 39 | self._terminals[self._top] = terminal 40 | self._initials[self._top] = initial 41 | self.advance() 42 | 43 | def check_replacement(self): 44 | if self._replacement_prob < 1.0: 45 | if self._size < self._max_pool_size or \ 46 | not self._initials[self._top]: return 47 | self.advance_until_terminate() 48 | 49 | def get_skip_flag(self): 50 | if self._replacement_policy == 'full': skip = False 51 | elif 
self._replacement_policy == 'stochastic': 52 | skip = np.random.uniform() > self._replacement_prob 53 | else: raise NotImplementedError 54 | return skip 55 | 56 | def advance_until_terminate(self): 57 | skip = self.get_skip_flag() 58 | n_skips = 0 59 | old_top = self._top 60 | new_top = (old_top + 1) % self._max_pool_size 61 | while skip and old_top != new_top and n_skips < self._max_skip_episode: 62 | n_skips += 1 63 | self.advance() 64 | while not self._initials[self._top]: 65 | self.advance() 66 | skip = self.get_skip_flag() 67 | new_top = self._top 68 | logger.log("add_sample, skipped %d episodes, top=%d->%d"%( 69 | n_skips, old_top, new_top)) 70 | 71 | def advance(self): 72 | self._top = (self._top + 1) % self._max_pool_size 73 | if self._size >= self._max_pool_size: 74 | self._bottom = (self._bottom + 1) % self._max_pool_size 75 | else: 76 | self._size += 1 77 | 78 | def random_batch(self, batch_size): 79 | assert self._size > batch_size 80 | indices = np.zeros(batch_size, dtype='uint64') 81 | transition_indices = np.zeros(batch_size, dtype='uint64') 82 | count = 0 83 | while count < batch_size: 84 | index = np.random.randint(self._bottom, self._bottom + self._size) % self._max_pool_size 85 | # make sure that the transition is valid: if we are at the end of the pool, we need to discard 86 | # this sample 87 | if index == self._size - 1 and self._size <= self._max_pool_size: 88 | continue 89 | # if self._terminals[index]: 90 | # continue 91 | transition_index = (index + 1) % self._max_pool_size 92 | # make sure that the transition is valid: discard the transition if it crosses horizon-triggered resets 93 | if not self._terminals[index] and self._initials[transition_index]: 94 | continue 95 | indices[count] = index 96 | transition_indices[count] = transition_index 97 | count += 1 98 | return dict( 99 | observations=self._observations[indices], 100 | actions=self._actions[indices], 101 | rewards=self._rewards[indices], 102 | terminals=self._terminals[indices], 103 | initials=self._initials[indices], 104 | next_observations=self._observations[transition_indices] 105 | ) 106 | 107 | @property 108 | def size(self): 109 | return self._size 110 | -------------------------------------------------------------------------------- /bayesian_ddpg/params.json: -------------------------------------------------------------------------------- 1 | { 2 | "args_data": 
"gANjcmxsYWIubWlzYy5pbnN0cnVtZW50ClN0dWJNZXRob2RDYWxsCnEAKYFxAX1xAihYCAAAAF9fa3dhcmdzcQN9cQRYBgAAAF9fYXJnc3EFKGNybGxhYi5taXNjLmluc3RydW1lbnQKU3R1Yk9iamVjdApxBimBcQd9cQgoWAsAAABwcm94eV9jbGFzc3EJY2RkcGdfYmF5ZXNpYW5fdGhvbXBzb24KRERQRwpxClgGAAAAa3dhcmdzcQt9cQwoWAgAAABkaXNjb3VudHENRz/vrhR64UeuWAgAAABuX2Vwb2Noc3EOSxRYAgAAAGVzcQ9oBimBcRB9cREoaAljZHJvcG91dF9leHBsb3JhdGlvbgpNQ0Ryb3BvdXQKcRJoC31xE1gIAAAAZW52X3NwZWNxFGNybGxhYi5taXNjLmluc3RydW1lbnQKU3R1YkF0dHIKcRUpgXEWfXEXKFgKAAAAX2F0dHJfbmFtZXEYWAQAAABzcGVjcRlYBAAAAF9vYmpxGmgGKYFxG31xHChoCWNzYW5kYm94LnJvY2t5LnRmLmVudnMuYmFzZQpUZkVudgpxHWgLfXEeWAsAAAB3cmFwcGVkX2VudnEfaAYpgXEgfXEhKGgJY3JsbGFiLmVudnMubm9ybWFsaXplZF9lbnYKTm9ybWFsaXplZEVudgpxImgLfXEjWAMAAABlbnZxJGgGKYFxJX1xJihoCWNybGxhYi5lbnZzLmd5bV9lbnYKR3ltRW52CnEnaAt9cSgoWAoAAAByZWNvcmRfbG9ncSmJWAgAAABlbnZfbmFtZXEqWAkAAABIb3BwZXItdjFxK1gLAAAAZm9yY2VfcmVzZXRxLIhYDAAAAHJlY29yZF92aWRlb3EtiXVYBAAAAGFyZ3NxLil1YnNoLil1YnNoLil1YnVic2guKXViWA0AAABtaW5fcG9vbF9zaXplcS9NECdYDAAAAHNjYWxlX3Jld2FyZHEwRz/wAAAAAAAAWAoAAABiYXRjaF9zaXplcTFLQFgQAAAAcWZfbGVhcm5pbmdfcmF0ZXEyRz9QYk3S8an8WAQAAABwbG90cTOJWAIAAABxZnE0aAYpgXE1fXE2KGgJY2NvbnRpbnVvdXNfbWxwX3FfZnVuY3Rpb25fYmF5ZXNpYW4KQ29udGludW91c01MUFFGdW5jdGlvbgpxN2gLfXE4KFgMAAAAaGlkZGVuX3NpemVzcTlLZEtkhnE6aBRoFSmBcTt9cTwoaBhoGWgaaBt1YlgTAAAAaGlkZGVuX25vbmxpbmVhcml0eXE9Y3RlbnNvcmZsb3cucHl0aG9uLm9wcy5nZW5fbm5fb3BzCnJlbHUKcT51aC4pdWJYFAAAAHBvbGljeV9sZWFybmluZ19yYXRlcT9HPxo24uscQy1YBgAAAHBvbGljeXFAaAYpgXFBfXFCKGgJY2RldGVybWluaXN0aWNfbWxwX3BvbGljeV9iYXllc2lhbgpEZXRlcm1pbmlzdGljTUxQUG9saWN5CnFDaAt9cUQoWAQAAABuYW1lcUVoQGg5S2RLMksZh3FGaBRoFSmBcUd9cUgoaBhoGWgaaBt1Ymg9aD51aC4pdWJYDAAAAGVwb2NoX2xlbmd0aHFJTegDWA8AAABtYXhfcGF0aF9sZW5ndGhxSmgVKYFxS31xTChoGFgHAAAAaG9yaXpvbnFNaBpoG3ViaCRoG3VoLil1YlgFAAAAdHJhaW5xTil9cU90cVB1Yi4=", 3 | "exp_name": "Trial_Bayesian_Exploration/", 4 | "json_args": { 5 | "algo": { 6 | "_name": "ddpg_bayesian_thompson.DDPG", 7 | "batch_size": 64, 8 | "discount": 0.99, 9 | "epoch_length": 1000, 10 | "es": { 11 | "_name": "dropout_exploration.MCDropout", 12 | "env_spec": { 13 | "attr": "spec", 14 | "obj": { 15 | "_name": "sandbox.rocky.tf.envs.base.TfEnv", 16 | "wrapped_env": { 17 | "_name": "rllab.envs.normalized_env.NormalizedEnv", 18 | "env": { 19 | "_name": "rllab.envs.gym_env.GymEnv", 20 | "env_name": "Hopper-v1", 21 | "force_reset": true, 22 | "record_log": false, 23 | "record_video": false 24 | } 25 | } 26 | } 27 | } 28 | }, 29 | "max_path_length": { 30 | "attr": "horizon", 31 | "obj": { 32 | "_name": "sandbox.rocky.tf.envs.base.TfEnv", 33 | "wrapped_env": { 34 | "_name": "rllab.envs.normalized_env.NormalizedEnv", 35 | "env": { 36 | "_name": "rllab.envs.gym_env.GymEnv", 37 | "env_name": "Hopper-v1", 38 | "force_reset": true, 39 | "record_log": false, 40 | "record_video": false 41 | } 42 | } 43 | } 44 | }, 45 | "min_pool_size": 10000, 46 | "n_epochs": 20, 47 | "plot": false, 48 | "policy_learning_rate": 0.0001, 49 | "qf": { 50 | "_name": "continuous_mlp_q_function_bayesian.ContinuousMLPQFunction", 51 | "env_spec": { 52 | "attr": "spec", 53 | "obj": { 54 | "_name": "sandbox.rocky.tf.envs.base.TfEnv", 55 | "wrapped_env": { 56 | "_name": "rllab.envs.normalized_env.NormalizedEnv", 57 | "env": { 58 | "_name": "rllab.envs.gym_env.GymEnv", 59 | "env_name": "Hopper-v1", 60 | "force_reset": true, 61 | "record_log": false, 62 | "record_video": false 63 | } 64 | } 65 | } 66 | }, 67 | "hidden_nonlinearity": "tensorflow.python.ops.gen_nn_ops.relu", 68 | "hidden_sizes": [ 69 | 100, 70 | 100 71 | ] 72 | }, 73 | "qf_learning_rate": 0.001, 74 | "scale_reward": 1.0 75 | }, 76 | 
"env": { 77 | "_name": "sandbox.rocky.tf.envs.base.TfEnv", 78 | "wrapped_env": { 79 | "_name": "rllab.envs.normalized_env.NormalizedEnv", 80 | "env": { 81 | "_name": "rllab.envs.gym_env.GymEnv", 82 | "env_name": "Hopper-v1", 83 | "force_reset": true, 84 | "record_log": false, 85 | "record_video": false 86 | } 87 | } 88 | }, 89 | "policy": { 90 | "_name": "deterministic_mlp_policy_bayesian.DeterministicMLPPolicy", 91 | "env_spec": { 92 | "attr": "spec", 93 | "obj": { 94 | "_name": "sandbox.rocky.tf.envs.base.TfEnv", 95 | "wrapped_env": { 96 | "_name": "rllab.envs.normalized_env.NormalizedEnv", 97 | "env": { 98 | "_name": "rllab.envs.gym_env.GymEnv", 99 | "env_name": "Hopper-v1", 100 | "force_reset": true, 101 | "record_log": false, 102 | "record_video": false 103 | } 104 | } 105 | } 106 | }, 107 | "hidden_nonlinearity": "tensorflow.python.ops.gen_nn_ops.relu", 108 | "hidden_sizes": [ 109 | 100, 110 | 50, 111 | 25 112 | ], 113 | "name": "policy" 114 | } 115 | }, 116 | "log_dir": "./bayesian_ddpg/", 117 | "log_tabular_only": false, 118 | "n_parallel": 1, 119 | "params_log_file": "params.json", 120 | "plot": false, 121 | "resume_from": null, 122 | "seed": 1, 123 | "snapshot_gap": 1, 124 | "snapshot_mode": "last", 125 | "tabular_log_file": "progress.csv", 126 | "text_log_file": "debug.log", 127 | "use_cloudpickle": false, 128 | "variant_data": null, 129 | "variant_log_file": "variant.json" 130 | } -------------------------------------------------------------------------------- /dropout_gal_neuralnet_tf_example.py: -------------------------------------------------------------------------------- 1 | #### Uncertainty in Deep Learning 2 | #### To keep the dropout during test time : 3 | #### https://medium.com/towards-data-science/adding-uncertainty-to-deep-learning-ecc2401f2013 4 | 5 | #### Another useful link 6 | #### https://github.com/tensorflow/tensorflow/issues/97 7 | 8 | from __future__ import absolute_import 9 | from __future__ import division 10 | from __future__ import print_function 11 | 12 | import math 13 | import os 14 | 15 | import tensorflow as tf 16 | from tensorflow.examples.tutorials.mnist import input_data 17 | import pdb 18 | import numpy as np 19 | 20 | 21 | 22 | ### create TF graph session 23 | tf.reset_default_graph() 24 | sess = tf.Session() 25 | 26 | LOGDIR = './graphs' 27 | mnist = input_data.read_data_sets('/tmp/data', one_hot=True) 28 | 29 | 30 | #defining the model structure 31 | # number of neurons in each hidden layer 32 | HIDDEN1_SIZE = 500 33 | HIDDEN2_SIZE = 250 34 | NUM_CLASSES = 10 35 | NUM_PIXELS = 28 * 28 36 | 37 | # experiment with the nubmer of training steps to 38 | # see the effect 39 | TRAIN_STEPS = 2000 40 | BATCH_SIZE = 100 41 | 42 | LEARNING_RATE = 0.01 43 | 44 | ### creating the inputs for the model 45 | with tf.name_scope('input'): 46 | # Define inputs 47 | images = tf.placeholder(dtype=tf.float32, shape=[None, NUM_PIXELS]) 48 | labels = tf.placeholder(dtype=tf.float32, shape=[None, NUM_CLASSES]) 49 | 50 | 51 | # Function to create a fully connected layer 52 | def fc_layer(input, size_out, name="fc", activation=None): 53 | with tf.name_scope(name): 54 | size_in = int(input.shape[1]) 55 | w = tf.Variable(tf.truncated_normal([size_in, size_out], stddev=0.1), name="weights") 56 | b = tf.Variable(tf.constant(0.1, shape=[size_out]), name="bias") 57 | 58 | #output of the network 59 | wx_plus_b = tf.matmul(input, w) + b 60 | 61 | if activation: return activation(wx_plus_b) 62 | return wx_plus_b 63 | 64 | 65 | ### defining the network here 66 | fc1 = 
fc_layer(images, HIDDEN1_SIZE, "fc1", activation=tf.nn.relu) 67 | fc2 = fc_layer(fc1, HIDDEN2_SIZE, "fc2", activation=tf.nn.relu) 68 | dropped = tf.nn.dropout(fc2, keep_prob=0.9) 69 | #### network output 70 | y = fc_layer(dropped, NUM_CLASSES, name="output") 71 | 72 | 73 | 74 | with tf.name_scope("loss"): 75 | loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=y, labels=labels)) 76 | tf.summary.scalar('loss', loss) 77 | 78 | 79 | with tf.name_scope("optimizer"): 80 | train = tf.train.AdamOptimizer(LEARNING_RATE).minimize(loss) 81 | 82 | # Define evaluation 83 | with tf.name_scope("evaluation"): 84 | prediction = tf.argmax(y, 1) 85 | 86 | correct_prediction = tf.equal(prediction, tf.argmax(labels, 1)) 87 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32)) 88 | tf.summary.scalar('accuracy', accuracy) 89 | 90 | 91 | train_writer = tf.summary.FileWriter(os.path.join(LOGDIR, "train")) 92 | train_writer.add_graph(sess.graph) 93 | test_writer = tf.summary.FileWriter(os.path.join(LOGDIR, "test")) 94 | summary_op = tf.summary.merge_all() 95 | 96 | ##### Constructing TF graph upto this 97 | 98 | 99 | sess.run(tf.global_variables_initializer()) 100 | 101 | 102 | MC_SAMPLES = 20 103 | for step in range(TRAIN_STEPS): 104 | batch_xs, batch_ys = mnist.train.next_batch(BATCH_SIZE) 105 | 106 | ### training results 107 | summary_result, _ = sess.run([summary_op, train], feed_dict={images: batch_xs, labels: batch_ys}) 108 | 109 | 110 | orig_predicted_y = sess.run([y], feed_dict = {images : mnist.test.images}) 111 | 112 | 113 | ## use a placeholder here? 114 | #All_MC_Predicted_Classes = tf.placeholder(dtype=tf.float32, shape=[10000, MC_SAMPLES]) 115 | All_MC_Predicted_Classes = np.zeros(shape=(10000, MC_SAMPLES)) 116 | 117 | 118 | for m in range(MC_SAMPLES): 119 | predicted_y = sess.run([y], feed_dict = {images : mnist.test.images}) 120 | 121 | ### using numpy` 122 | predicted_y = np.asarray(predicted_y) 123 | predicted_y = predicted_y[0, :, :] 124 | predicted_class = np.argmax(predicted_y, 1) 125 | pred = np.array([predicted_class]).T 126 | All_MC_Predicted_Classes[:, m] = predicted_class 127 | 128 | 129 | #predicted_class = tf.argmax(predicted_y, 1) 130 | # All_MC_Predicted_Classes[:, m] = tf.argmax(predicted_y, 1) 131 | 132 | Mean_Predicted_Classes = np.mean(All_MC_Predicted_Classes, axis=1) 133 | Variance_Predicted_Classes = np.var(All_MC_Predicted_Classes, axis=1) 134 | # Mean_Predicted_Classes = tf.reduce_mean(All_MC_Predicted_Classes, axis=1) 135 | # Mean_Predicted_Classes = Mean_Predicted_Classes.tolist() 136 | 137 | 138 | correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(labels,1)) 139 | accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float")) 140 | 141 | #acc = sess.run(accuracy, feed_dict={images: mnist.test.images, labels: mnist.test.labels}) 142 | 143 | 144 | print ("Step", step) 145 | # print ("Test Accuracy", acc) 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | """ 157 | Doing computation on the TF graph 158 | """ 159 | 160 | # sess.run(tf.global_variables_initializer()) 161 | # ###### Example to calculate MC Sample based accuracy using MCDropout 162 | # MC_SAMPLES = 10 163 | # for step in range(TRAIN_STEPS): 164 | # batch_xs, batch_ys = mnist.train.next_batch(BATCH_SIZE) 165 | # summary_result, train_result = sess.run([summary_op, train], feed_dict={images: batch_xs, labels: batch_ys}) 166 | 167 | # # calculate accuracy on the test set, every 100 steps. 
168 | # # we're using the entire test set here, so this will be a bit slow 169 | # if step % 100 == 0: 170 | # print ("Step Number", step) 171 | # #_, acc = sess.run([summary_op, accuracy], feed_dict={images: mnist.test.images, labels: mnist.test.labels}) 172 | 173 | # All_Accuracy = np.zeros(shape=([MC_SAMPLES])) 174 | # for m in range(MC_SAMPLES): 175 | # _, acc = sess.run([summary_op, accuracy], feed_dict={images: mnist.test.images, labels: mnist.test.labels}) 176 | # acc = np.array([acc]) 177 | 178 | # All_Accuracy[m] = acc 179 | 180 | # print ("All Accuracy", All_Accuracy) 181 | # print ("Mean Accuracy", np.mean(All_Accuracy)) 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | """ 190 | To keep the dropout at test time 191 | """ 192 | 193 | # keep the dropout during test time 194 | #mc_post = [sess.run(nn, feed_dict={x: data}) for _ in range(100)] 195 | 196 | #and then we need sample variance + inverse precision 197 | # def _tau_inv(keep_prob, N, l2=0.005, lambda_=0.00001): 198 | # tau = keep_prob * l2 / (2. * N * lambda_) 199 | # return 1. / tau 200 | 201 | # np.var(mc_post) + _tau_inv(0.5, 100) 202 | -------------------------------------------------------------------------------- /variational_inference_examples.py: -------------------------------------------------------------------------------- 1 | ## from : https://gist.github.com/tokestermw/a9de2ef498a09747bbf673ddf6ea4843 2 | ## tokestermw/tf_ed_vi_tutorial.py 3 | from __future__ import absolute_import 4 | from __future__ import division 5 | from __future__ import print_function 6 | 7 | import sys 8 | import json 9 | import tqdm 10 | 11 | import numpy as np 12 | import tensorflow as tf 13 | import edward as ed 14 | 15 | N_FEATURES = 2 16 | DATA_LENGTH = 3 17 | 18 | REAL_WEIGHT = 0.7 19 | REAL_BIAS = 2.5 20 | REAL_DATA = np.array([2., 4., 6.]) 21 | REAL_LABELS = REAL_WEIGHT * REAL_DATA + REAL_BIAS 22 | 23 | MC_SAMPLES = 100 24 | 25 | 26 | def create_fake_data(): 27 | data = np.random.randint(0, 10, DATA_LENGTH).astype(np.float32) 28 | noise = np.random.randn(DATA_LENGTH).astype(np.float32) 29 | labels = REAL_WEIGHT * data + REAL_BIAS + noise 30 | return data, labels 31 | 32 | 33 | class Model: 34 | def __init__(self, model_name, dropout_keep_prob=1.0): 35 | self.model_name = model_name 36 | self.dropout_keep_prob = dropout_keep_prob 37 | 38 | self.x = tf.placeholder(tf.float32, (DATA_LENGTH, )) 39 | self.y = tf.placeholder(tf.float32, (DATA_LENGTH, )) 40 | 41 | if self.model_name == "simple_linear": 42 | # -- set parameters 43 | self.bias = tf.get_variable("bias", [1]) 44 | self.weight = tf.get_variable("weight", [1]) # should be 1. 
45 | 46 | # -- set dropout (optional) 47 | self.add_dropout() 48 | 49 | # -- set model 50 | self.nn = self.weight * self.x + self.bias 51 | 52 | # -- set loss 53 | self.loss = tf.reduce_mean((self.y - self.nn) ** 2) 54 | 55 | elif self.model_name == "bayesian_simple_linear": 56 | # -- set priors 57 | self.weight_mu = tf.zeros(1) # tf.get_variable("weight_mu", [1]) 58 | self.weight_sigma = tf.ones(1) # fixed hyperparameters 59 | self.weight = ed.models.Normal(mu=self.weight_mu, sigma=self.weight_sigma) 60 | 61 | self.bias_mu = tf.zeros(1) # tf.get_variable("bias_mu", [1]) 62 | self.bias_sigma = tf.ones(1) # fixed hyperparameters 63 | self.bias = ed.models.Normal(mu=self.bias_mu, sigma=self.bias_sigma) 64 | 65 | # -- set model 66 | self.nn_mean = self.weight * self.x + self.bias 67 | self.nn_sigma = tf.ones(1) # fixed hyperparameters 68 | self.nn = ed.models.Normal(mu=self.nn_mean, sigma=self.nn_sigma) 69 | 70 | # -- set variational parameters 71 | self.qweight = ed.models.Normal( 72 | mu=tf.get_variable("qweight_mu", initializer=tf.random_normal([1])), 73 | sigma=tf.nn.softplus(tf.get_variable("qweight_sigma", initializer=tf.random_normal([1])))) 74 | 75 | self.qbias = ed.models.Normal( 76 | mu=tf.get_variable("qbias_mu", initializer=tf.random_normal([1])), 77 | sigma=tf.nn.softplus(tf.get_variable("qbias_sigma", initializer=tf.random_normal([1])))) 78 | 79 | # -- inference 80 | self.latent_vars = {self.weight: self.qweight, self.bias: self.qbias} 81 | self.data = {self.nn: self.y} 82 | 83 | self.loss = (self.latent_vars, self.data) 84 | 85 | else: 86 | raise ValueError("Wrong model error.") 87 | 88 | def add_dropout(self): 89 | self._keep_prob = tf.Variable(name="keep_prob", initial_value=self.dropout_keep_prob, trainable=False) 90 | 91 | self.bias = tf.cond( 92 | self._keep_prob < 1.0, lambda: tf.nn.dropout(self.bias, keep_prob=self._keep_prob), lambda: self.bias) 93 | self.weight = tf.cond( 94 | self._keep_prob < 1.0, lambda: tf.nn.dropout(self.weight, keep_prob=self._keep_prob), lambda: self.weight) 95 | 96 | @property 97 | def keep_prob(self): 98 | return self._keep_prob 99 | 100 | def optimize(self): 101 | 102 | if _is_loss_function(self.loss): 103 | # loss optimization 104 | self.optimizer = tf.train.GradientDescentOptimizer(.005) 105 | self.train_op = self.optimizer.minimize(self.loss) 106 | 107 | else: 108 | # variational inference 109 | latent_vars, data = self.loss 110 | self.inference = ed.KLqp(latent_vars=latent_vars, data=data) 111 | self.inference.initialize() 112 | self.train_op = self.inference 113 | 114 | return self.train_op 115 | 116 | 117 | def _is_loss_function(loss): 118 | return isinstance(loss, tf.Tensor) 119 | 120 | 121 | def _section(text): 122 | print("-"*10 + " ", text.upper()) 123 | 124 | 125 | def _tau_inv(keep_prob, N, l2=0.005, lambda_=0.00001): 126 | # -- Variational Dropout Uncertainty Interval by Gal 127 | # https://github.com/yaringal/DropoutUncertaintyDemos/blob/master/convnetjs/regression_uncertainty.js 128 | tau = keep_prob * l2 / (2. * N * lambda_) 129 | return 1. 
/ tau 130 | 131 | 132 | def main(model_name, dropout_keep_prob=1.0): 133 | _section("set model") 134 | model = Model(model_name, dropout_keep_prob) 135 | train_op = model.optimize() 136 | 137 | local_init_op = tf.local_variables_initializer() 138 | global_init_op = tf.global_variables_initializer() 139 | 140 | tvars = tf.trainable_variables() 141 | 142 | _section("train") 143 | with tf.Session() as sess: 144 | sess.run([local_init_op, global_init_op]) 145 | 146 | tq = tqdm.trange(2000) 147 | for it in tq: 148 | data, labels = create_fake_data() 149 | 150 | if _is_loss_function(model.loss): 151 | sess.run(train_op, feed_dict={ 152 | model.x: data, 153 | model.y: labels, 154 | }) 155 | weight_, bias_ = sess.run([model.weight, model.bias]) 156 | tq.set_postfix(weight=weight_, bias=bias_) 157 | 158 | else: 159 | train_op.update(feed_dict={ 160 | model.x: data, 161 | model.y: labels, 162 | }) 163 | weight_, bias_ = sess.run([model.weight.value(), model.bias.value()]) 164 | tq.set_postfix(weight=weight_, bias=bias_) 165 | 166 | print("trainable variables:", json.dumps({t.name: sess.run(t).tolist() for t in tvars}, indent=4)) 167 | 168 | _section("predict on sample data") 169 | print("real weight", REAL_WEIGHT) 170 | print("real bias", REAL_BIAS) 171 | print("real data", REAL_DATA) 172 | print("real labels", REAL_LABELS) 173 | 174 | # -- checking out the variable distribution 175 | if _is_loss_function(model.loss): 176 | if dropout_keep_prob < 1.0: 177 | # don't do dropout for point estimate 178 | sess.run(model.keep_prob.assign(1.0)) 179 | _section("loss optimization w/ dropout") 180 | else: 181 | _section("loss optimization") 182 | 183 | nn_point_estimate, weight_point_estimate, bias_point_estimate = \ 184 | sess.run([model.nn, model.weight, model.bias], feed_dict={ 185 | model.x: REAL_DATA, 186 | model.y: REAL_LABELS, 187 | }) 188 | print("weight point estimate", weight_point_estimate) 189 | print("bias point estimate", bias_point_estimate) 190 | print("nn point estimate", nn_point_estimate) 191 | print("mean absolute error", np.mean(np.absolute(nn_point_estimate - REAL_LABELS))) 192 | 193 | if dropout_keep_prob < 1.0: 194 | _section("monte carlo simulations") 195 | 196 | sess.run(model.keep_prob.assign(dropout_keep_prob)) 197 | 198 | nn_mc = [] 199 | for _ in range(MC_SAMPLES): 200 | nn_mc.append(sess.run([model.nn], feed_dict={model.x: REAL_DATA, model.y: REAL_LABELS})) 201 | nn_mc = np.array(nn_mc) 202 | 203 | print("monte carlo nn mean", np.mean(nn_mc, axis=0)) 204 | print("monte carlo nn variance", np.var(nn_mc, axis=0)) 205 | print("+ Gal inverse precision", np.var(nn_mc, axis=0) + _tau_inv(dropout_keep_prob, MC_SAMPLES)) 206 | print("mean absolute error", np.mean(np.absolute(np.mean(nn_mc, axis=0) - REAL_LABELS))) 207 | 208 | else: 209 | _section("variational inference") 210 | 211 | weight_mean, weight_var = sess.run(tf.nn.moments(model.qweight.sample(MC_SAMPLES), axes=[0])) 212 | print("weight posterior mean and variance", weight_mean, weight_var) 213 | bias_mean, bias_var = sess.run(tf.nn.moments(model.qbias.sample(MC_SAMPLES), axes=[0])) 214 | print("bias posterior mean and variance", bias_mean, bias_var) 215 | 216 | _section("prior predictive checks") 217 | prior = ed.copy(model.nn, { 218 | model.weight: model.weight.mean(), model.bias: model.bias.mean(), 219 | }, scope="copied/prior") 220 | 221 | nn_prior = [] 222 | for _ in range(MC_SAMPLES): 223 | nn_prior.append(sess.run(prior.value(), feed_dict={model.x: REAL_DATA, model.y: REAL_LABELS})) 224 | nn_prior = np.array(nn_prior) 225 
| 226 | print("nn prior mean and variance", np.mean(nn_prior, axis=0), np.var(nn_prior, axis=0)) 227 | print("mean absolute error", np.mean(np.absolute(np.mean(nn_prior, axis=0) - REAL_LABELS))) 228 | 229 | _section("posterior predictive checks") 230 | posterior = ed.copy(model.nn, dict_swap={ 231 | model.weight: model.qweight.mean(), model.bias: model.qbias.mean(), 232 | }, scope="copied/posterior") 233 | 234 | nn_post = sess.run(posterior.sample(MC_SAMPLES), feed_dict={model.x: REAL_DATA}) 235 | 236 | print("nn posterior mean and variance", np.mean(nn_post, axis=0), np.var(nn_post, axis=0)) 237 | print("mean absolute error", np.mean(np.absolute(np.mean(nn_post, axis=0) - REAL_LABELS))) 238 | 239 | 240 | # TODO: plot? 241 | 242 | 243 | if __name__ == '__main__': 244 | """ Try the following: 245 | >>> python tf_vi_tutorial.py simple_linear 246 | >>> python tf_vi_tutorial.py simple_linear .9 247 | >>> python tf_vi_tutorial.py bayesian_simple_linear 248 | """ 249 | args = sys.argv 250 | 251 | if len(args) == 1: 252 | main("simple_linear") 253 | elif len(args) == 2: 254 | _, model_name = args 255 | main(model_name) 256 | elif len(args) == 3: 257 | _, model_name, dropout_keep_prob = args 258 | dropout_keep_prob = float(dropout_keep_prob) 259 | assert 0 < dropout_keep_prob <= 1.0, "keep it real" 260 | main(model_name, float(dropout_keep_prob)) -------------------------------------------------------------------------------- /bayesian_ddpg/debug.log: -------------------------------------------------------------------------------- 1 | 2017-07-12 09:22:32.919231 EDT | [Trial_Bayesian_Exploration/] observation space: Box(11,) 2 | 2017-07-12 09:22:32.919512 EDT | [Trial_Bayesian_Exploration/] action space: Box(3,) 3 | 2017-07-12 09:22:33.415071 EDT | [Trial_Bayesian_Exploration/] Populating workers... 
4 | 2017-07-12 09:22:33.415311 EDT | [Trial_Bayesian_Exploration/] Populated 5 | 2017-07-12 09:22:35.323080 EDT | [Trial_Bayesian_Exploration/] epoch #0 | Training started 6 | 2017-07-12 09:22:36.167981 EDT | [Trial_Bayesian_Exploration/] epoch #0 | Training finished 7 | 2017-07-12 09:22:36.168245 EDT | [Trial_Bayesian_Exploration/] epoch #0 | Trained qf 0 steps, policy 0 steps 8 | 2017-07-12 09:22:36.168460 EDT | [Trial_Bayesian_Exploration/] epoch #1 | Training started 9 | 2017-07-12 09:22:37.006537 EDT | [Trial_Bayesian_Exploration/] epoch #1 | Training finished 10 | 2017-07-12 09:22:37.006788 EDT | [Trial_Bayesian_Exploration/] epoch #1 | Trained qf 0 steps, policy 0 steps 11 | 2017-07-12 09:22:37.006989 EDT | [Trial_Bayesian_Exploration/] epoch #2 | Training started 12 | 2017-07-12 09:22:37.828863 EDT | [Trial_Bayesian_Exploration/] epoch #2 | Training finished 13 | 2017-07-12 09:22:37.829077 EDT | [Trial_Bayesian_Exploration/] epoch #2 | Trained qf 0 steps, policy 0 steps 14 | 2017-07-12 09:22:37.829240 EDT | [Trial_Bayesian_Exploration/] epoch #3 | Training started 15 | 2017-07-12 09:22:38.652386 EDT | [Trial_Bayesian_Exploration/] epoch #3 | Training finished 16 | 2017-07-12 09:22:38.652639 EDT | [Trial_Bayesian_Exploration/] epoch #3 | Trained qf 0 steps, policy 0 steps 17 | 2017-07-12 09:22:38.652842 EDT | [Trial_Bayesian_Exploration/] epoch #4 | Training started 18 | 2017-07-12 09:22:39.469835 EDT | [Trial_Bayesian_Exploration/] epoch #4 | Training finished 19 | 2017-07-12 09:22:39.470033 EDT | [Trial_Bayesian_Exploration/] epoch #4 | Trained qf 0 steps, policy 0 steps 20 | 2017-07-12 09:22:39.470186 EDT | [Trial_Bayesian_Exploration/] epoch #5 | Training started 21 | 2017-07-12 09:22:40.297185 EDT | [Trial_Bayesian_Exploration/] epoch #5 | Training finished 22 | 2017-07-12 09:22:40.297431 EDT | [Trial_Bayesian_Exploration/] epoch #5 | Trained qf 0 steps, policy 0 steps 23 | 2017-07-12 09:22:40.297639 EDT | [Trial_Bayesian_Exploration/] epoch #6 | Training started 24 | 2017-07-12 09:22:41.124994 EDT | [Trial_Bayesian_Exploration/] epoch #6 | Training finished 25 | 2017-07-12 09:22:41.125214 EDT | [Trial_Bayesian_Exploration/] epoch #6 | Trained qf 0 steps, policy 0 steps 26 | 2017-07-12 09:22:41.125367 EDT | [Trial_Bayesian_Exploration/] epoch #7 | Training started 27 | 2017-07-12 09:22:41.955415 EDT | [Trial_Bayesian_Exploration/] epoch #7 | Training finished 28 | 2017-07-12 09:22:41.955668 EDT | [Trial_Bayesian_Exploration/] epoch #7 | Trained qf 0 steps, policy 0 steps 29 | 2017-07-12 09:22:41.955873 EDT | [Trial_Bayesian_Exploration/] epoch #8 | Training started 30 | 2017-07-12 09:22:42.786357 EDT | [Trial_Bayesian_Exploration/] epoch #8 | Training finished 31 | 2017-07-12 09:22:42.786557 EDT | [Trial_Bayesian_Exploration/] epoch #8 | Trained qf 0 steps, policy 0 steps 32 | 2017-07-12 09:22:42.786709 EDT | [Trial_Bayesian_Exploration/] epoch #9 | Training started 33 | 2017-07-12 09:22:43.779252 EDT | [Trial_Bayesian_Exploration/] epoch #9 | Training finished 34 | 2017-07-12 09:22:43.779507 EDT | [Trial_Bayesian_Exploration/] epoch #9 | Trained qf 1 steps, policy 1 steps 35 | 2017-07-12 09:22:43.779679 EDT | [Trial_Bayesian_Exploration/] epoch #9 | Collecting samples for evaluation 36 | 2017-07-12 09:22:51.221690 EDT | ----------------------- --------- 37 | 2017-07-12 09:22:51.221938 EDT | Epoch 9 38 | 2017-07-12 09:22:51.222175 EDT | Iteration 9 39 | 2017-07-12 09:22:51.222309 EDT | AverageReturn 9.1814 40 | 2017-07-12 09:22:51.222531 EDT | StdReturn 0.046539 41 | 2017-07-12 
09:22:51.222720 EDT | MaxReturn 9.3185 42 | 2017-07-12 09:22:51.222896 EDT | MinReturn 9.07004 43 | 2017-07-12 09:22:51.223063 EDT | AverageEsReturn 9.90701 44 | 2017-07-12 09:22:51.223227 EDT | StdEsReturn 7.05845 45 | 2017-07-12 09:22:51.223392 EDT | MaxEsReturn 54.8695 46 | 2017-07-12 09:22:51.223556 EDT | MinEsReturn 1.67899 47 | 2017-07-12 09:22:51.223720 EDT | AverageDiscountedReturn 8.76006 48 | 2017-07-12 09:22:51.223900 EDT | AverageQLoss 0.74919 49 | 2017-07-12 09:22:51.224063 EDT | AveragePolicySurr -0.053608 50 | 2017-07-12 09:22:51.224257 EDT | AverageQ -0.243523 51 | 2017-07-12 09:22:51.224422 EDT | AverageAbsQ 0.365071 52 | 2017-07-12 09:22:51.224587 EDT | AverageY 0.48041 53 | 2017-07-12 09:22:51.224750 EDT | AverageAbsY 0.593881 54 | 2017-07-12 09:22:51.224914 EDT | AverageAbsQYDiff 0.75809 55 | 2017-07-12 09:22:51.225078 EDT | AverageAction 0.218841 56 | 2017-07-12 09:22:51.225240 EDT | PolicyRegParamNorm 11.23 57 | 2017-07-12 09:22:51.225402 EDT | QFunRegParamNorm 11.13 58 | 2017-07-12 09:22:51.225564 EDT | ----------------------- --------- 59 | 2017-07-12 09:22:51.225882 EDT | [Trial_Bayesian_Exploration/] epoch #10 | Training started 60 | 2017-07-12 09:24:18.240014 EDT | [Trial_Bayesian_Exploration/] epoch #10 | Training finished 61 | 2017-07-12 09:24:18.240239 EDT | [Trial_Bayesian_Exploration/] epoch #10 | Trained qf 1000 steps, policy 1000 steps 62 | 2017-07-12 09:24:18.240429 EDT | [Trial_Bayesian_Exploration/] epoch #10 | Collecting samples for evaluation 63 | 2017-07-12 09:24:26.551050 EDT | ----------------------- ---------- 64 | 2017-07-12 09:24:26.551356 EDT | Epoch 10 65 | 2017-07-12 09:24:26.551567 EDT | Iteration 10 66 | 2017-07-12 09:24:26.551764 EDT | AverageReturn 1.70342 67 | 2017-07-12 09:24:26.552028 EDT | StdReturn 0.0811045 68 | 2017-07-12 09:24:26.552227 EDT | MaxReturn 1.91245 69 | 2017-07-12 09:24:26.552417 EDT | MinReturn 1.49564 70 | 2017-07-12 09:24:26.552608 EDT | AverageEsReturn 2.15915 71 | 2017-07-12 09:24:26.552795 EDT | StdEsReturn 1.56574 72 | 2017-07-12 09:24:26.552983 EDT | MaxEsReturn 8.61305 73 | 2017-07-12 09:24:26.553170 EDT | MinEsReturn 0.0122472 74 | 2017-07-12 09:24:26.553358 EDT | AverageDiscountedReturn 1.70881 75 | 2017-07-12 09:24:26.553544 EDT | AverageQLoss 0.0712594 76 | 2017-07-12 09:24:26.553731 EDT | AveragePolicySurr -0.672253 77 | 2017-07-12 09:24:26.553918 EDT | AverageQ 0.795785 78 | 2017-07-12 09:24:26.554105 EDT | AverageAbsQ 0.823965 79 | 2017-07-12 09:24:26.554291 EDT | AverageY 0.805172 80 | 2017-07-12 09:24:26.554476 EDT | AverageAbsY 0.84508 81 | 2017-07-12 09:24:26.554662 EDT | AverageAbsQYDiff 0.177415 82 | 2017-07-12 09:24:26.554848 EDT | AverageAction 0.999596 83 | 2017-07-12 09:24:26.555033 EDT | PolicyRegParamNorm 13.2281 84 | 2017-07-12 09:24:26.555220 EDT | QFunRegParamNorm 11.3933 85 | 2017-07-12 09:24:26.555406 EDT | ----------------------- ---------- 86 | 2017-07-12 09:24:26.555707 EDT | [Trial_Bayesian_Exploration/] epoch #11 | Training started 87 | 2017-07-12 09:25:46.611911 EDT | [Trial_Bayesian_Exploration/] epoch #11 | Training finished 88 | 2017-07-12 09:25:46.612170 EDT | [Trial_Bayesian_Exploration/] epoch #11 | Trained qf 1000 steps, policy 1000 steps 89 | 2017-07-12 09:25:46.612352 EDT | [Trial_Bayesian_Exploration/] epoch #11 | Collecting samples for evaluation 90 | 2017-07-12 09:25:54.435698 EDT | ----------------------- ---------- 91 | 2017-07-12 09:25:54.436028 EDT | Epoch 11 92 | 2017-07-12 09:25:54.436392 EDT | Iteration 11 93 | 2017-07-12 09:25:54.436696 EDT | AverageReturn 
3.58419 94 | 2017-07-12 09:25:54.436997 EDT | StdReturn 0.115737 95 | 2017-07-12 09:25:54.437365 EDT | MaxReturn 3.89929 96 | 2017-07-12 09:25:54.437606 EDT | MinReturn 3.26895 97 | 2017-07-12 09:25:54.437795 EDT | AverageEsReturn 4.0568 98 | 2017-07-12 09:25:54.437981 EDT | StdEsReturn 0.737778 99 | 2017-07-12 09:25:54.438166 EDT | MaxEsReturn 6.41711 100 | 2017-07-12 09:25:54.438353 EDT | MinEsReturn 1.23751 101 | 2017-07-12 09:25:54.438526 EDT | AverageDiscountedReturn 3.5301 102 | 2017-07-12 09:25:54.438680 EDT | AverageQLoss 0.0288876 103 | 2017-07-12 09:25:54.438924 EDT | AveragePolicySurr -0.974544 104 | 2017-07-12 09:25:54.439177 EDT | AverageQ 1.1393 105 | 2017-07-12 09:25:54.439456 EDT | AverageAbsQ 1.19482 106 | 2017-07-12 09:25:54.439660 EDT | AverageY 1.14473 107 | 2017-07-12 09:25:54.439862 EDT | AverageAbsY 1.20289 108 | 2017-07-12 09:25:54.440104 EDT | AverageAbsQYDiff 0.123579 109 | 2017-07-12 09:25:54.440304 EDT | AverageAction 1 110 | 2017-07-12 09:25:54.440497 EDT | PolicyRegParamNorm 13.4963 111 | 2017-07-12 09:25:54.440679 EDT | QFunRegParamNorm 11.7883 112 | 2017-07-12 09:25:54.440858 EDT | ----------------------- ---------- 113 | 2017-07-12 09:25:54.441183 EDT | [Trial_Bayesian_Exploration/] epoch #12 | Training started 114 | 2017-07-12 09:27:13.633224 EDT | [Trial_Bayesian_Exploration/] epoch #12 | Training finished 115 | 2017-07-12 09:27:13.633485 EDT | [Trial_Bayesian_Exploration/] epoch #12 | Trained qf 1000 steps, policy 1000 steps 116 | 2017-07-12 09:27:13.633668 EDT | [Trial_Bayesian_Exploration/] epoch #12 | Collecting samples for evaluation 117 | 2017-07-12 09:27:21.520496 EDT | ----------------------- ---------- 118 | 2017-07-12 09:27:21.520771 EDT | Epoch 12 119 | 2017-07-12 09:27:21.521064 EDT | Iteration 12 120 | 2017-07-12 09:27:21.521281 EDT | AverageReturn 3.58477 121 | 2017-07-12 09:27:21.521496 EDT | StdReturn 0.113468 122 | 2017-07-12 09:27:21.521707 EDT | MaxReturn 3.89855 123 | 2017-07-12 09:27:21.521917 EDT | MinReturn 3.29676 124 | 2017-07-12 09:27:21.522127 EDT | AverageEsReturn 4.13938 125 | 2017-07-12 09:27:21.522378 EDT | StdEsReturn 0.57574 126 | 2017-07-12 09:27:21.522593 EDT | MaxEsReturn 6.29128 127 | 2017-07-12 09:27:21.522809 EDT | MinEsReturn 2.1017 128 | 2017-07-12 09:27:21.523011 EDT | AverageDiscountedReturn 3.53067 129 | 2017-07-12 09:27:21.523216 EDT | AverageQLoss 0.0412518 130 | 2017-07-12 09:27:21.523416 EDT | AveragePolicySurr -1.35587 131 | 2017-07-12 09:27:21.523616 EDT | AverageQ 1.59025 132 | 2017-07-12 09:27:21.523829 EDT | AverageAbsQ 1.66405 133 | 2017-07-12 09:27:21.524035 EDT | AverageY 1.59587 134 | 2017-07-12 09:27:21.524242 EDT | AverageAbsY 1.67236 135 | 2017-07-12 09:27:21.524449 EDT | AverageAbsQYDiff 0.144135 136 | 2017-07-12 09:27:21.524655 EDT | AverageAction 1 137 | 2017-07-12 09:27:21.524875 EDT | PolicyRegParamNorm 13.4963 138 | 2017-07-12 09:27:21.525082 EDT | QFunRegParamNorm 12.3922 139 | 2017-07-12 09:27:21.525289 EDT | ----------------------- ---------- 140 | 2017-07-12 09:27:21.525633 EDT | [Trial_Bayesian_Exploration/] epoch #13 | Training started 141 | -------------------------------------------------------------------------------- /ddpg_bayesian_thompson.py: -------------------------------------------------------------------------------- 1 | # FROM: https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/sandbox/rocky/tf/algos/ddpg.py 2 | from rllab.algos.base import RLAlgorithm 3 | from rllab.misc.overrides import overrides 4 | from rllab.misc import special 5 | from 
sandbox.rocky.tf.misc import tensor_utils 6 | from rllab.sampler import parallel_sampler 7 | from rllab.plotter import plotter 8 | from rllab.misc import ext 9 | import rllab.misc.logger as logger 10 | #import pickle as pickle 11 | import numpy as np 12 | import pyprind 13 | import tensorflow as tf 14 | from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer 15 | #from sandbox.rocky.tf.core.parameterized import suppress_params_loading 16 | from rllab.core.serializable import Serializable 17 | from sampling_utils import SimpleReplayPool 18 | 19 | class DDPG(RLAlgorithm): 20 | """ 21 | Deep Deterministic Policy Gradient. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | env, 27 | policy, 28 | qf, 29 | es, 30 | batch_size=32, 31 | n_epochs=200, 32 | epoch_length=1000, 33 | min_pool_size=10000, 34 | replay_pool_size=1000000, 35 | replacement_prob=1.0, 36 | discount=0.99, 37 | max_path_length=250, 38 | qf_weight_decay=0., 39 | qf_update_method='adam', 40 | qf_learning_rate=1e-3, 41 | policy_weight_decay=0, 42 | policy_update_method='adam', 43 | policy_learning_rate=1e-3, 44 | policy_updates_ratio=1.0, 45 | eval_samples=10000, 46 | soft_target=True, 47 | soft_target_tau=0.001, 48 | n_updates_per_sample=1, 49 | scale_reward=1.0, 50 | include_horizon_terminal_transitions=False, 51 | plot=False, 52 | pause_for_plot=False): 53 | """ 54 | :param env: Environment 55 | :param policy: Policy 56 | :param qf: Q function 57 | :param es: Exploration strategy 58 | :param batch_size: Number of samples for each minibatch. 59 | :param n_epochs: Number of epochs. Policy will be evaluated after each epoch. 60 | :param epoch_length: How many timesteps for each epoch. 61 | :param min_pool_size: Minimum size of the pool to start training. 62 | :param replay_pool_size: Size of the experience replay pool. 63 | :param discount: Discount factor for the cumulative return. 64 | :param max_path_length: Discount factor for the cumulative return. 65 | :param qf_weight_decay: Weight decay factor for parameters of the Q function. 66 | :param qf_update_method: Online optimization method for training Q function. 67 | :param qf_learning_rate: Learning rate for training Q function. 68 | :param policy_weight_decay: Weight decay factor for parameters of the policy. 69 | :param policy_update_method: Online optimization method for training the policy. 70 | :param policy_learning_rate: Learning rate for training the policy. 71 | :param eval_samples: Number of samples (timesteps) for evaluating the policy. 72 | :param soft_target_tau: Interpolation parameter for doing the soft target update. 73 | :param n_updates_per_sample: Number of Q function and policy updates per new sample obtained 74 | :param scale_reward: The scaling factor applied to the rewards when training 75 | :param include_horizon_terminal_transitions: whether to include transitions with terminal=True because the 76 | horizon was reached. This might make the Q value back up less stable for certain tasks. 77 | :param plot: Whether to visualize the policy performance after each eval_interval. 78 | :param pause_for_plot: Whether to pause before continuing when plotting. 
79 | :return: 80 | """ 81 | self.env = env 82 | self.policy = policy 83 | self.qf = qf 84 | self.es = es 85 | self.batch_size = batch_size 86 | self.n_epochs = n_epochs 87 | self.epoch_length = epoch_length 88 | self.min_pool_size = min_pool_size 89 | self.replay_pool_size = replay_pool_size 90 | self.replacement_prob = replacement_prob 91 | self.discount = discount 92 | self.max_path_length = max_path_length 93 | self.qf_weight_decay = qf_weight_decay 94 | self.qf_update_method = \ 95 | FirstOrderOptimizer( 96 | update_method=qf_update_method, 97 | learning_rate=qf_learning_rate, 98 | ) 99 | self.qf_learning_rate = qf_learning_rate 100 | self.policy_weight_decay = policy_weight_decay 101 | 102 | 103 | self.policy_update_method = \ 104 | FirstOrderOptimizer( 105 | update_method=policy_update_method, 106 | learning_rate=policy_learning_rate, 107 | ) 108 | self.policy_learning_rate = policy_learning_rate 109 | self.policy_updates_ratio = policy_updates_ratio 110 | self.eval_samples = eval_samples 111 | self.soft_target_tau = soft_target_tau 112 | self.n_updates_per_sample = n_updates_per_sample 113 | self.include_horizon_terminal_transitions = include_horizon_terminal_transitions 114 | self.plot = plot 115 | self.pause_for_plot = pause_for_plot 116 | 117 | self.qf_loss_averages = [] 118 | self.policy_surr_averages = [] 119 | self.q_averages = [] 120 | self.y_averages = [] 121 | self.paths = [] 122 | self.es_path_returns = [] 123 | self.paths_samples_cnt = 0 124 | 125 | self.scale_reward = scale_reward 126 | 127 | self.train_policy_itr = 0 128 | 129 | self.opt_info = None 130 | 131 | def start_worker(self): 132 | parallel_sampler.populate_task(self.env, self.policy) 133 | if self.plot: 134 | plotter.init_plot(self.env, self.policy) 135 | 136 | @overrides 137 | def train(self): 138 | with tf.Session() as sess: 139 | sess.run(tf.global_variables_initializer()) 140 | # This seems like a rather sequential method 141 | pool = SimpleReplayPool( 142 | max_pool_size=self.replay_pool_size, 143 | observation_dim=self.env.observation_space.flat_dim, 144 | action_dim=self.env.action_space.flat_dim, 145 | replacement_prob=self.replacement_prob, 146 | ) 147 | self.start_worker() 148 | 149 | self.init_opt() 150 | # This initializes the optimizer parameters 151 | sess.run(tf.global_variables_initializer()) 152 | itr = 0 153 | path_length = 0 154 | path_return = 0 155 | terminal = False 156 | initial = False 157 | observation = self.env.reset() 158 | 159 | with tf.variable_scope("sample_policy"): 160 | sample_policy = Serializable.clone(self.policy) 161 | 162 | for epoch in range(self.n_epochs): 163 | logger.push_prefix('epoch #%d | ' % epoch) 164 | logger.log("Training started") 165 | train_qf_itr, train_policy_itr = 0, 0 166 | 167 | #sample a policy function from the posterior at every episode 168 | #move in the entire episode with the sampled policy function? 
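# Illustrative answer to the question above (not code from this repo): with MC dropout,
# "sampling a policy function from the posterior" would mean drawing one set of dropout
# masks when an episode starts and holding it fixed for the whole rollout, e.g.
#
#     if terminal:
#         sample_policy.resample_dropout_masks()   # hypothetical helper, not in this repo
#
# As written, the loop below instead resets sample_policy and self.es on terminal and lets
# the exploration strategy self.es decide how actions are drawn from sample_policy.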
169 | 170 | for epoch_itr in pyprind.prog_bar(range(self.epoch_length)): 171 | # Execute policy 172 | if terminal: # or path_length > self.max_path_length: 173 | # Note that if the last time step ends an episode, the very 174 | # last state and observation will be ignored and not added 175 | # to the replay pool 176 | observation = self.env.reset() 177 | self.es.reset() 178 | sample_policy.reset() 179 | self.es_path_returns.append(path_return) 180 | path_length = 0 181 | path_return = 0 182 | initial = True 183 | else: 184 | initial = False 185 | 186 | action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf) 187 | 188 | 189 | next_observation, reward, terminal, _ = self.env.step(action) 190 | path_length += 1 191 | path_return += reward 192 | 193 | 194 | if not terminal and path_length >= self.max_path_length: 195 | terminal = True 196 | # only include the terminal transition in this case if the flag was set 197 | if self.include_horizon_terminal_transitions: 198 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 199 | else: 200 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 201 | 202 | observation = next_observation 203 | 204 | if pool.size >= self.min_pool_size: 205 | for update_itr in range(self.n_updates_per_sample): 206 | # Train policy 207 | batch = pool.random_batch(self.batch_size) 208 | itrs = self.do_training(itr, epoch, batch) 209 | train_qf_itr += itrs[0] 210 | train_policy_itr += itrs[1] 211 | sample_policy.set_param_values(self.policy.get_param_values()) 212 | 213 | itr += 1 214 | 215 | logger.log("Training finished") 216 | logger.log("Trained qf %d steps, policy %d steps"%(train_qf_itr, train_policy_itr)) 217 | if pool.size >= self.min_pool_size: 218 | self.evaluate(epoch, pool) 219 | params = self.get_epoch_snapshot(epoch) 220 | logger.save_itr_params(epoch, params) 221 | logger.dump_tabular(with_prefix=False) 222 | logger.pop_prefix() 223 | if self.plot: 224 | self.update_plot() 225 | if self.pause_for_plot: 226 | input("Plotting evaluation run: Press Enter to " 227 | "continue...") 228 | self.env.terminate() 229 | self.policy.terminate() 230 | 231 | def init_opt(self): 232 | 233 | # First, create "target" policy and Q functions 234 | with tf.variable_scope("target_policy"): 235 | target_policy = Serializable.clone(self.policy) 236 | with tf.variable_scope("target_qf"): 237 | target_qf = Serializable.clone(self.qf) 238 | 239 | # y need to be computed first 240 | obs = self.env.observation_space.new_tensor_variable( 241 | 'obs', 242 | extra_dims=1, 243 | ) 244 | 245 | # The yi values are computed separately as above and then passed to 246 | # the training functions below 247 | action = self.env.action_space.new_tensor_variable( 248 | 'action', 249 | extra_dims=1, 250 | ) 251 | 252 | yvar = tensor_utils.new_tensor( 253 | 'ys', 254 | ndim=1, 255 | dtype=tf.float32, 256 | ) 257 | 258 | qf_weight_decay_term = 0.5 * self.qf_weight_decay * \ 259 | sum([tf.reduce_sum(tf.square(param)) for param in 260 | self.qf.get_params(regularizable=True)]) 261 | 262 | qval = self.qf.get_qval_sym(obs, action) 263 | 264 | qf_loss = tf.reduce_mean(tf.square(yvar - qval)) 265 | qf_reg_loss = qf_loss + qf_weight_decay_term 266 | 267 | policy_weight_decay_term = 0.5 * self.policy_weight_decay * \ 268 | sum([tf.reduce_sum(tf.square(param)) 269 | for param in self.policy.get_params(regularizable=True)]) 270 | 271 | 272 | policy_qval = self.qf.get_qval_sym( 273 | obs, self.policy.get_action_sym(obs), 274 | 
deterministic=True 275 | ) 276 | 277 | 278 | policy_surr = -tf.reduce_mean(policy_qval) 279 | 280 | policy_reg_surr = policy_surr + policy_weight_decay_term 281 | 282 | qf_input_list = [yvar, obs, action] 283 | policy_input_list = [obs] 284 | 285 | self.qf_update_method.update_opt( 286 | loss=qf_reg_loss, target=self.qf, inputs=qf_input_list) 287 | 288 | 289 | self.policy_update_method.update_opt( 290 | loss=policy_reg_surr, target=self.policy, inputs=policy_input_list) 291 | 292 | f_train_qf = tensor_utils.compile_function( 293 | inputs=qf_input_list, 294 | outputs=[qf_loss, qval, self.qf_update_method._train_op], 295 | ) 296 | 297 | f_train_policy = tensor_utils.compile_function( 298 | inputs=policy_input_list, 299 | outputs=[policy_surr, self.policy_update_method._train_op], 300 | ) 301 | 302 | self.opt_info = dict( 303 | f_train_qf=f_train_qf, 304 | f_train_policy=f_train_policy, 305 | target_qf=target_qf, 306 | target_policy=target_policy, 307 | ) 308 | 309 | def do_training(self, itr, epoch, batch): 310 | 311 | obs, actions, rewards, next_obs, terminals = ext.extract( 312 | batch, 313 | "observations", "actions", "rewards", "next_observations", 314 | "terminals" 315 | ) 316 | 317 | # compute the on-policy y values 318 | target_qf = self.opt_info["target_qf"] 319 | target_policy = self.opt_info["target_policy"] 320 | 321 | next_actions, _ = target_policy.get_actions(next_obs) 322 | next_qvals = target_qf.get_qval(next_obs, next_actions) 323 | 324 | 325 | """ 326 | Uncertainty in Critic Networks for exploration 327 | - Thompson Sampling with the critic target networks 328 | """ 329 | 330 | """ 331 | Possible way (b) : for targets, use max(Q) 332 | - take the max (Q_0, Q_1, Q_2, ... Q_k) from the MC Dropout Q networks 333 | """ 334 | 335 | mc_dropout = 50 336 | all_posterior_qvals = np.zeros(shape=(next_obs.shape[0], mc_dropout)) 337 | for d in range(mc_dropout): 338 | posterior_qvals = target_qf.get_qval_dropout(next_obs, next_actions) 339 | 340 | all_posterior_qvals[:, d] = posterior_qvals[:, 0] 341 | 342 | 343 | sum_all_posterior_qvals = np.sum(all_posterior_qvals, axis=0) 344 | max_Q_ind = np.argmax(sum_all_posterior_qvals) 345 | 346 | max_Q = all_posterior_qvals[:, max_Q_ind] 347 | variance_next_qvals = np.std(all_posterior_qvals, axis=1) 348 | 349 | lambda_expl = 10 / epoch 350 | qval_bayesian = max_Q + lambda_expl * variance_next_qvals 351 | 352 | 353 | ys = rewards + (1. 
- terminals) * self.discount * qval_bayesian.reshape(-1) 354 | 355 | f_train_qf = self.opt_info["f_train_qf"] 356 | 357 | qf_loss, qval, _ = f_train_qf(ys, obs, actions) 358 | 359 | target_qf.set_param_values( 360 | target_qf.get_param_values() * (1.0 - self.soft_target_tau) + 361 | self.qf.get_param_values() * self.soft_target_tau) 362 | self.qf_loss_averages.append(qf_loss) 363 | self.q_averages.append(qval) 364 | self.y_averages.append(ys) 365 | 366 | self.train_policy_itr += self.policy_updates_ratio 367 | train_policy_itr = 0 368 | 369 | 370 | while self.train_policy_itr > 0: 371 | 372 | f_train_policy = self.opt_info["f_train_policy"] 373 | policy_surr, _ = f_train_policy(obs) 374 | 375 | 376 | target_policy.set_param_values( 377 | target_policy.get_param_values() * (1.0 - self.soft_target_tau) + 378 | self.policy.get_param_values() * self.soft_target_tau) 379 | self.policy_surr_averages.append(policy_surr) 380 | self.train_policy_itr -= 1 381 | train_policy_itr += 1 382 | return 1, train_policy_itr # number of itrs qf, policy are trained 383 | 384 | 385 | 386 | 387 | 388 | 389 | def evaluate(self, epoch, pool): 390 | logger.log("Collecting samples for evaluation") 391 | paths = parallel_sampler.sample_paths( 392 | policy_params=self.policy.get_param_values(), 393 | max_samples=self.eval_samples, 394 | max_path_length=self.max_path_length, 395 | ) 396 | 397 | average_discounted_return = np.mean( 398 | [special.discount_return(path["rewards"], self.discount) for path in paths] 399 | ) 400 | 401 | returns = [sum(path["rewards"]) for path in paths] 402 | 403 | all_qs = np.concatenate(self.q_averages) 404 | all_ys = np.concatenate(self.y_averages) 405 | 406 | average_q_loss = np.mean(self.qf_loss_averages) 407 | average_policy_surr = np.mean(self.policy_surr_averages) 408 | average_action = np.mean(np.square(np.concatenate( 409 | [path["actions"] for path in paths] 410 | ))) 411 | 412 | policy_reg_param_norm = np.linalg.norm( 413 | self.policy.get_param_values(regularizable=True) 414 | ) 415 | qfun_reg_param_norm = np.linalg.norm( 416 | self.qf.get_param_values(regularizable=True) 417 | ) 418 | 419 | logger.record_tabular('Epoch', epoch) 420 | logger.record_tabular('Iteration', epoch) 421 | logger.record_tabular('AverageReturn', np.mean(returns)) 422 | logger.record_tabular('StdReturn', 423 | np.std(returns)) 424 | logger.record_tabular('MaxReturn', 425 | np.max(returns)) 426 | logger.record_tabular('MinReturn', 427 | np.min(returns)) 428 | if len(self.es_path_returns) > 0: 429 | logger.record_tabular('AverageEsReturn', 430 | np.mean(self.es_path_returns)) 431 | logger.record_tabular('StdEsReturn', 432 | np.std(self.es_path_returns)) 433 | logger.record_tabular('MaxEsReturn', 434 | np.max(self.es_path_returns)) 435 | logger.record_tabular('MinEsReturn', 436 | np.min(self.es_path_returns)) 437 | logger.record_tabular('AverageDiscountedReturn', 438 | average_discounted_return) 439 | logger.record_tabular('AverageQLoss', average_q_loss) 440 | logger.record_tabular('AveragePolicySurr', average_policy_surr) 441 | logger.record_tabular('AverageQ', np.mean(all_qs)) 442 | logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) 443 | logger.record_tabular('AverageY', np.mean(all_ys)) 444 | logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) 445 | logger.record_tabular('AverageAbsQYDiff', 446 | np.mean(np.abs(all_qs - all_ys))) 447 | logger.record_tabular('AverageAction', average_action) 448 | 449 | logger.record_tabular('PolicyRegParamNorm', 450 | policy_reg_param_norm) 451 | 
logger.record_tabular('QFunRegParamNorm', 452 | qfun_reg_param_norm) 453 | 454 | self.env.log_diagnostics(paths) 455 | self.policy.log_diagnostics(paths) 456 | 457 | self.qf_loss_averages = [] 458 | self.policy_surr_averages = [] 459 | 460 | self.q_averages = [] 461 | self.y_averages = [] 462 | self.es_path_returns = [] 463 | 464 | def update_plot(self): 465 | if self.plot: 466 | plotter.update_plot(self.policy, self.max_path_length) 467 | 468 | def get_epoch_snapshot(self, epoch): 469 | return dict( 470 | env=self.env, 471 | epoch=epoch, 472 | qf=self.qf, 473 | policy=self.policy, 474 | target_qf=self.opt_info["target_qf"], 475 | target_policy=self.opt_info["target_policy"], 476 | es=self.es, 477 | ) 478 | -------------------------------------------------------------------------------- /ddpg_bayesian_mean.py: -------------------------------------------------------------------------------- 1 | # FROM: https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/sandbox/rocky/tf/algos/ddpg.py 2 | from rllab.algos.base import RLAlgorithm 3 | from rllab.misc.overrides import overrides 4 | from rllab.misc import special 5 | from sandbox.rocky.tf.misc import tensor_utils 6 | from rllab.sampler import parallel_sampler 7 | from rllab.plotter import plotter 8 | from rllab.misc import ext 9 | import rllab.misc.logger as logger 10 | #import pickle as pickle 11 | import numpy as np 12 | import pyprind 13 | import tensorflow as tf 14 | from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer 15 | #from sandbox.rocky.tf.core.parameterized import suppress_params_loading 16 | from rllab.core.serializable import Serializable 17 | from sampling_utils import SimpleReplayPool 18 | 19 | class DDPG(RLAlgorithm): 20 | """ 21 | Deep Deterministic Policy Gradient. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | env, 27 | policy, 28 | qf, 29 | es, 30 | batch_size=32, 31 | n_epochs=200, 32 | epoch_length=1000, 33 | min_pool_size=10000, 34 | replay_pool_size=1000000, 35 | replacement_prob=1.0, 36 | discount=0.99, 37 | max_path_length=250, 38 | qf_weight_decay=0., 39 | qf_update_method='adam', 40 | qf_learning_rate=1e-3, 41 | policy_weight_decay=0, 42 | policy_update_method='adam', 43 | policy_learning_rate=1e-3, 44 | policy_updates_ratio=1.0, 45 | eval_samples=10000, 46 | soft_target=True, 47 | soft_target_tau=0.001, 48 | n_updates_per_sample=1, 49 | scale_reward=1.0, 50 | include_horizon_terminal_transitions=False, 51 | plot=False, 52 | pause_for_plot=False): 53 | """ 54 | :param env: Environment 55 | :param policy: Policy 56 | :param qf: Q function 57 | :param es: Exploration strategy 58 | :param batch_size: Number of samples for each minibatch. 59 | :param n_epochs: Number of epochs. Policy will be evaluated after each epoch. 60 | :param epoch_length: How many timesteps for each epoch. 61 | :param min_pool_size: Minimum size of the pool to start training. 62 | :param replay_pool_size: Size of the experience replay pool. 63 | :param discount: Discount factor for the cumulative return. 64 | :param max_path_length: Discount factor for the cumulative return. 65 | :param qf_weight_decay: Weight decay factor for parameters of the Q function. 66 | :param qf_update_method: Online optimization method for training Q function. 67 | :param qf_learning_rate: Learning rate for training Q function. 68 | :param policy_weight_decay: Weight decay factor for parameters of the policy. 69 | :param policy_update_method: Online optimization method for training the policy. 
70 | :param policy_learning_rate: Learning rate for training the policy. 71 | :param eval_samples: Number of samples (timesteps) for evaluating the policy. 72 | :param soft_target_tau: Interpolation parameter for doing the soft target update. 73 | :param n_updates_per_sample: Number of Q function and policy updates per new sample obtained 74 | :param scale_reward: The scaling factor applied to the rewards when training 75 | :param include_horizon_terminal_transitions: whether to include transitions with terminal=True because the 76 | horizon was reached. This might make the Q value back up less stable for certain tasks. 77 | :param plot: Whether to visualize the policy performance after each eval_interval. 78 | :param pause_for_plot: Whether to pause before continuing when plotting. 79 | :return: 80 | """ 81 | self.env = env 82 | self.policy = policy 83 | self.qf = qf 84 | self.es = es 85 | self.batch_size = batch_size 86 | self.n_epochs = n_epochs 87 | self.epoch_length = epoch_length 88 | self.min_pool_size = min_pool_size 89 | self.replay_pool_size = replay_pool_size 90 | self.replacement_prob = replacement_prob 91 | self.discount = discount 92 | self.max_path_length = max_path_length 93 | self.qf_weight_decay = qf_weight_decay 94 | self.qf_update_method = \ 95 | FirstOrderOptimizer( 96 | update_method=qf_update_method, 97 | learning_rate=qf_learning_rate, 98 | ) 99 | self.qf_learning_rate = qf_learning_rate 100 | self.policy_weight_decay = policy_weight_decay 101 | 102 | 103 | self.policy_update_method = \ 104 | FirstOrderOptimizer( 105 | update_method=policy_update_method, 106 | learning_rate=policy_learning_rate, 107 | ) 108 | self.policy_learning_rate = policy_learning_rate 109 | self.policy_updates_ratio = policy_updates_ratio 110 | self.eval_samples = eval_samples 111 | self.soft_target_tau = soft_target_tau 112 | self.n_updates_per_sample = n_updates_per_sample 113 | self.include_horizon_terminal_transitions = include_horizon_terminal_transitions 114 | self.plot = plot 115 | self.pause_for_plot = pause_for_plot 116 | 117 | self.qf_loss_averages = [] 118 | self.policy_surr_averages = [] 119 | self.q_averages = [] 120 | self.y_averages = [] 121 | self.paths = [] 122 | self.es_path_returns = [] 123 | self.paths_samples_cnt = 0 124 | 125 | self.scale_reward = scale_reward 126 | 127 | self.train_policy_itr = 0 128 | 129 | self.opt_info = None 130 | 131 | def start_worker(self): 132 | parallel_sampler.populate_task(self.env, self.policy) 133 | if self.plot: 134 | plotter.init_plot(self.env, self.policy) 135 | 136 | @overrides 137 | def train(self): 138 | with tf.Session() as sess: 139 | sess.run(tf.global_variables_initializer()) 140 | # This seems like a rather sequential method 141 | pool = SimpleReplayPool( 142 | max_pool_size=self.replay_pool_size, 143 | observation_dim=self.env.observation_space.flat_dim, 144 | action_dim=self.env.action_space.flat_dim, 145 | replacement_prob=self.replacement_prob, 146 | ) 147 | self.start_worker() 148 | 149 | self.init_opt() 150 | # This initializes the optimizer parameters 151 | sess.run(tf.global_variables_initializer()) 152 | itr = 0 153 | path_length = 0 154 | path_return = 0 155 | terminal = False 156 | initial = False 157 | observation = self.env.reset() 158 | 159 | with tf.variable_scope("sample_policy"): 160 | sample_policy = Serializable.clone(self.policy) 161 | 162 | for epoch in range(self.n_epochs): 163 | logger.push_prefix('epoch #%d | ' % epoch) 164 | logger.log("Training started") 165 | train_qf_itr, train_policy_itr = 0, 0 166 | 
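# Variant note: this file mirrors ddpg_bayesian_thompson.py; the substantive difference is in
# do_training() below, where the bootstrap target is mean(Q) + lambda * std(Q) over the
# MC-dropout passes of the target critic ("possible way (a)"), rather than the argmax-selected
# dropout sample used in the Thompson-style variant.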
167 | #sample a policy function from the posterior at every episode 168 | #move in the entire episode with the sampled policy function? 169 | 170 | for epoch_itr in pyprind.prog_bar(range(self.epoch_length)): 171 | # Execute policy 172 | if terminal: # or path_length > self.max_path_length: 173 | # Note that if the last time step ends an episode, the very 174 | # last state and observation will be ignored and not added 175 | # to the replay pool 176 | observation = self.env.reset() 177 | self.es.reset() 178 | sample_policy.reset() 179 | self.es_path_returns.append(path_return) 180 | path_length = 0 181 | path_return = 0 182 | initial = True 183 | else: 184 | initial = False 185 | 186 | action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf) 187 | 188 | 189 | next_observation, reward, terminal, _ = self.env.step(action) 190 | path_length += 1 191 | path_return += reward 192 | 193 | 194 | if not terminal and path_length >= self.max_path_length: 195 | terminal = True 196 | # only include the terminal transition in this case if the flag was set 197 | if self.include_horizon_terminal_transitions: 198 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 199 | else: 200 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 201 | 202 | observation = next_observation 203 | 204 | if pool.size >= self.min_pool_size: 205 | for update_itr in range(self.n_updates_per_sample): 206 | # Train policy 207 | batch = pool.random_batch(self.batch_size) 208 | itrs = self.do_training(itr, epoch, batch) 209 | train_qf_itr += itrs[0] 210 | train_policy_itr += itrs[1] 211 | sample_policy.set_param_values(self.policy.get_param_values()) 212 | 213 | itr += 1 214 | 215 | logger.log("Training finished") 216 | logger.log("Trained qf %d steps, policy %d steps"%(train_qf_itr, train_policy_itr)) 217 | if pool.size >= self.min_pool_size: 218 | self.evaluate(epoch, pool) 219 | params = self.get_epoch_snapshot(epoch) 220 | logger.save_itr_params(epoch, params) 221 | logger.dump_tabular(with_prefix=False) 222 | logger.pop_prefix() 223 | if self.plot: 224 | self.update_plot() 225 | if self.pause_for_plot: 226 | input("Plotting evaluation run: Press Enter to " 227 | "continue...") 228 | self.env.terminate() 229 | self.policy.terminate() 230 | 231 | def init_opt(self): 232 | 233 | # First, create "target" policy and Q functions 234 | with tf.variable_scope("target_policy"): 235 | target_policy = Serializable.clone(self.policy) 236 | with tf.variable_scope("target_qf"): 237 | target_qf = Serializable.clone(self.qf) 238 | 239 | # y need to be computed first 240 | obs = self.env.observation_space.new_tensor_variable( 241 | 'obs', 242 | extra_dims=1, 243 | ) 244 | 245 | # The yi values are computed separately as above and then passed to 246 | # the training functions below 247 | action = self.env.action_space.new_tensor_variable( 248 | 'action', 249 | extra_dims=1, 250 | ) 251 | 252 | yvar = tensor_utils.new_tensor( 253 | 'ys', 254 | ndim=1, 255 | dtype=tf.float32, 256 | ) 257 | 258 | qf_weight_decay_term = 0.5 * self.qf_weight_decay * \ 259 | sum([tf.reduce_sum(tf.square(param)) for param in 260 | self.qf.get_params(regularizable=True)]) 261 | 262 | qval = self.qf.get_qval_sym(obs, action) 263 | 264 | qf_loss = tf.reduce_mean(tf.square(yvar - qval)) 265 | qf_reg_loss = qf_loss + qf_weight_decay_term 266 | 267 | policy_weight_decay_term = 0.5 * self.policy_weight_decay * \ 268 | sum([tf.reduce_sum(tf.square(param)) 269 | for param in 
self.policy.get_params(regularizable=True)]) 270 | 271 | 272 | policy_qval = self.qf.get_qval_sym( 273 | obs, self.policy.get_action_sym(obs), 274 | deterministic=True 275 | ) 276 | 277 | 278 | policy_surr = -tf.reduce_mean(policy_qval) 279 | 280 | policy_reg_surr = policy_surr + policy_weight_decay_term 281 | 282 | qf_input_list = [yvar, obs, action] 283 | policy_input_list = [obs] 284 | 285 | self.qf_update_method.update_opt( 286 | loss=qf_reg_loss, target=self.qf, inputs=qf_input_list) 287 | 288 | 289 | self.policy_update_method.update_opt( 290 | loss=policy_reg_surr, target=self.policy, inputs=policy_input_list) 291 | 292 | f_train_qf = tensor_utils.compile_function( 293 | inputs=qf_input_list, 294 | outputs=[qf_loss, qval, self.qf_update_method._train_op], 295 | ) 296 | 297 | f_train_policy = tensor_utils.compile_function( 298 | inputs=policy_input_list, 299 | outputs=[policy_surr, self.policy_update_method._train_op], 300 | ) 301 | 302 | self.opt_info = dict( 303 | f_train_qf=f_train_qf, 304 | f_train_policy=f_train_policy, 305 | target_qf=target_qf, 306 | target_policy=target_policy, 307 | ) 308 | 309 | def do_training(self, itr, epoch, batch): 310 | 311 | obs, actions, rewards, next_obs, terminals = ext.extract( 312 | batch, 313 | "observations", "actions", "rewards", "next_observations", 314 | "terminals" 315 | ) 316 | 317 | # compute the on-policy y values 318 | target_qf = self.opt_info["target_qf"] 319 | target_policy = self.opt_info["target_policy"] 320 | 321 | next_actions, _ = target_policy.get_actions(next_obs) 322 | next_qvals = target_qf.get_qval(next_obs, next_actions) 323 | 324 | 325 | """ 326 | Uncertainty in Critic Networks for exploration 327 | - Thompson Sampling with the critic target networks 328 | """ 329 | 330 | """ 331 | Possible way (a) : for targets, take mean(Q) + lambda * variance(Q) over all Q evaluations 332 | """ 333 | 334 | """ 335 | Apply MCDropout here - get the mean of Q and the variance over Q 336 | """ 337 | mc_dropout = 50 338 | all_posterior_qvals = np.zeros(shape=(next_obs.shape[0], mc_dropout)) 339 | for d in range(mc_dropout): 340 | posterior_qvals = target_qf.get_qval_dropout(next_obs, next_actions) 341 | all_posterior_qvals[:, d] = posterior_qvals[:, 0] 342 | 343 | ## mean of the Q function posterior 344 | # mean_next_qvals = np.array([np.mean(all_posterior_qvals, axis=1)]).T 345 | mean_next_qvals = np.mean(all_posterior_qvals, axis=1) 346 | variance_next_qvals = np.std(all_posterior_qvals, axis=1) 347 | 348 | 349 | #### lambda parameter to tune between optimistic/pessimistic exploration 350 | lambda_expl = 10 / epoch 351 | qval_bayesian = mean_next_qvals + lambda_expl * variance_next_qvals 352 | 353 | ys = rewards + (1. 
- terminals) * self.discount * qval_bayesian.reshape(-1) 354 | 355 | 356 | f_train_qf = self.opt_info["f_train_qf"] 357 | 358 | qf_loss, qval, _ = f_train_qf(ys, obs, actions) 359 | 360 | target_qf.set_param_values( 361 | target_qf.get_param_values() * (1.0 - self.soft_target_tau) + 362 | self.qf.get_param_values() * self.soft_target_tau) 363 | self.qf_loss_averages.append(qf_loss) 364 | self.q_averages.append(qval) 365 | self.y_averages.append(ys) 366 | 367 | self.train_policy_itr += self.policy_updates_ratio 368 | train_policy_itr = 0 369 | 370 | 371 | while self.train_policy_itr > 0: 372 | 373 | f_train_policy = self.opt_info["f_train_policy"] 374 | policy_surr, _ = f_train_policy(obs) 375 | 376 | 377 | target_policy.set_param_values( 378 | target_policy.get_param_values() * (1.0 - self.soft_target_tau) + 379 | self.policy.get_param_values() * self.soft_target_tau) 380 | self.policy_surr_averages.append(policy_surr) 381 | self.train_policy_itr -= 1 382 | train_policy_itr += 1 383 | return 1, train_policy_itr # number of itrs qf, policy are trained 384 | 385 | 386 | 387 | 388 | 389 | 390 | def evaluate(self, epoch, pool): 391 | logger.log("Collecting samples for evaluation") 392 | paths = parallel_sampler.sample_paths( 393 | policy_params=self.policy.get_param_values(), 394 | max_samples=self.eval_samples, 395 | max_path_length=self.max_path_length, 396 | ) 397 | 398 | average_discounted_return = np.mean( 399 | [special.discount_return(path["rewards"], self.discount) for path in paths] 400 | ) 401 | 402 | returns = [sum(path["rewards"]) for path in paths] 403 | 404 | all_qs = np.concatenate(self.q_averages) 405 | all_ys = np.concatenate(self.y_averages) 406 | 407 | average_q_loss = np.mean(self.qf_loss_averages) 408 | average_policy_surr = np.mean(self.policy_surr_averages) 409 | average_action = np.mean(np.square(np.concatenate( 410 | [path["actions"] for path in paths] 411 | ))) 412 | 413 | policy_reg_param_norm = np.linalg.norm( 414 | self.policy.get_param_values(regularizable=True) 415 | ) 416 | qfun_reg_param_norm = np.linalg.norm( 417 | self.qf.get_param_values(regularizable=True) 418 | ) 419 | 420 | logger.record_tabular('Epoch', epoch) 421 | logger.record_tabular('Iteration', epoch) 422 | logger.record_tabular('AverageReturn', np.mean(returns)) 423 | logger.record_tabular('StdReturn', 424 | np.std(returns)) 425 | logger.record_tabular('MaxReturn', 426 | np.max(returns)) 427 | logger.record_tabular('MinReturn', 428 | np.min(returns)) 429 | if len(self.es_path_returns) > 0: 430 | logger.record_tabular('AverageEsReturn', 431 | np.mean(self.es_path_returns)) 432 | logger.record_tabular('StdEsReturn', 433 | np.std(self.es_path_returns)) 434 | logger.record_tabular('MaxEsReturn', 435 | np.max(self.es_path_returns)) 436 | logger.record_tabular('MinEsReturn', 437 | np.min(self.es_path_returns)) 438 | logger.record_tabular('AverageDiscountedReturn', 439 | average_discounted_return) 440 | logger.record_tabular('AverageQLoss', average_q_loss) 441 | logger.record_tabular('AveragePolicySurr', average_policy_surr) 442 | logger.record_tabular('AverageQ', np.mean(all_qs)) 443 | logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) 444 | logger.record_tabular('AverageY', np.mean(all_ys)) 445 | logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) 446 | logger.record_tabular('AverageAbsQYDiff', 447 | np.mean(np.abs(all_qs - all_ys))) 448 | logger.record_tabular('AverageAction', average_action) 449 | 450 | logger.record_tabular('PolicyRegParamNorm', 451 | policy_reg_param_norm) 452 
| logger.record_tabular('QFunRegParamNorm', 453 | qfun_reg_param_norm) 454 | 455 | self.env.log_diagnostics(paths) 456 | self.policy.log_diagnostics(paths) 457 | 458 | self.qf_loss_averages = [] 459 | self.policy_surr_averages = [] 460 | 461 | self.q_averages = [] 462 | self.y_averages = [] 463 | self.es_path_returns = [] 464 | 465 | def update_plot(self): 466 | if self.plot: 467 | plotter.update_plot(self.policy, self.max_path_length) 468 | 469 | def get_epoch_snapshot(self, epoch): 470 | return dict( 471 | env=self.env, 472 | epoch=epoch, 473 | qf=self.qf, 474 | policy=self.policy, 475 | target_qf=self.opt_info["target_qf"], 476 | target_policy=self.opt_info["target_policy"], 477 | es=self.es, 478 | ) 479 | -------------------------------------------------------------------------------- /ddpg_bayesian.py: -------------------------------------------------------------------------------- 1 | # FROM: https://raw.githubusercontent.com/shaneshixiang/rllabplusplus/master/sandbox/rocky/tf/algos/ddpg.py 2 | from rllab.algos.base import RLAlgorithm 3 | from rllab.misc.overrides import overrides 4 | from rllab.misc import special 5 | from sandbox.rocky.tf.misc import tensor_utils 6 | from rllab.sampler import parallel_sampler 7 | from rllab.plotter import plotter 8 | from rllab.misc import ext 9 | import rllab.misc.logger as logger 10 | #import pickle as pickle 11 | import numpy as np 12 | import pyprind 13 | import tensorflow as tf 14 | from sandbox.rocky.tf.optimizers.first_order_optimizer import FirstOrderOptimizer 15 | #from sandbox.rocky.tf.core.parameterized import suppress_params_loading 16 | from rllab.core.serializable import Serializable 17 | from sampling_utils import SimpleReplayPool 18 | 19 | class DDPG(RLAlgorithm): 20 | """ 21 | Deep Deterministic Policy Gradient. 22 | """ 23 | 24 | def __init__( 25 | self, 26 | env, 27 | policy, 28 | qf, 29 | es, 30 | batch_size=32, 31 | n_epochs=200, 32 | epoch_length=1000, 33 | min_pool_size=10000, 34 | replay_pool_size=1000000, 35 | replacement_prob=1.0, 36 | discount=0.99, 37 | max_path_length=250, 38 | qf_weight_decay=0., 39 | qf_update_method='adam', 40 | qf_learning_rate=1e-3, 41 | policy_weight_decay=0, 42 | policy_update_method='adam', 43 | policy_learning_rate=1e-3, 44 | policy_updates_ratio=1.0, 45 | eval_samples=10000, 46 | soft_target=True, 47 | soft_target_tau=0.001, 48 | n_updates_per_sample=1, 49 | scale_reward=1.0, 50 | include_horizon_terminal_transitions=False, 51 | plot=False, 52 | pause_for_plot=False): 53 | """ 54 | :param env: Environment 55 | :param policy: Policy 56 | :param qf: Q function 57 | :param es: Exploration strategy 58 | :param batch_size: Number of samples for each minibatch. 59 | :param n_epochs: Number of epochs. Policy will be evaluated after each epoch. 60 | :param epoch_length: How many timesteps for each epoch. 61 | :param min_pool_size: Minimum size of the pool to start training. 62 | :param replay_pool_size: Size of the experience replay pool. 63 | :param discount: Discount factor for the cumulative return. 64 | :param max_path_length: Discount factor for the cumulative return. 65 | :param qf_weight_decay: Weight decay factor for parameters of the Q function. 66 | :param qf_update_method: Online optimization method for training Q function. 67 | :param qf_learning_rate: Learning rate for training Q function. 68 | :param policy_weight_decay: Weight decay factor for parameters of the policy. 69 | :param policy_update_method: Online optimization method for training the policy. 
70 | :param policy_learning_rate: Learning rate for training the policy. 71 | :param eval_samples: Number of samples (timesteps) for evaluating the policy. 72 | :param soft_target_tau: Interpolation parameter for doing the soft target update. 73 | :param n_updates_per_sample: Number of Q function and policy updates per new sample obtained 74 | :param scale_reward: The scaling factor applied to the rewards when training 75 | :param include_horizon_terminal_transitions: whether to include transitions with terminal=True because the 76 | horizon was reached. This might make the Q value back up less stable for certain tasks. 77 | :param plot: Whether to visualize the policy performance after each eval_interval. 78 | :param pause_for_plot: Whether to pause before continuing when plotting. 79 | :return: 80 | """ 81 | self.env = env 82 | self.policy = policy 83 | self.qf = qf 84 | self.es = es 85 | self.batch_size = batch_size 86 | self.n_epochs = n_epochs 87 | self.epoch_length = epoch_length 88 | self.min_pool_size = min_pool_size 89 | self.replay_pool_size = replay_pool_size 90 | self.replacement_prob = replacement_prob 91 | self.discount = discount 92 | self.max_path_length = max_path_length 93 | self.qf_weight_decay = qf_weight_decay 94 | self.qf_update_method = \ 95 | FirstOrderOptimizer( 96 | update_method=qf_update_method, 97 | learning_rate=qf_learning_rate, 98 | ) 99 | self.qf_learning_rate = qf_learning_rate 100 | self.policy_weight_decay = policy_weight_decay 101 | 102 | 103 | self.policy_update_method = \ 104 | FirstOrderOptimizer( 105 | update_method=policy_update_method, 106 | learning_rate=policy_learning_rate, 107 | ) 108 | self.policy_learning_rate = policy_learning_rate 109 | self.policy_updates_ratio = policy_updates_ratio 110 | self.eval_samples = eval_samples 111 | self.soft_target_tau = soft_target_tau 112 | self.n_updates_per_sample = n_updates_per_sample 113 | self.include_horizon_terminal_transitions = include_horizon_terminal_transitions 114 | self.plot = plot 115 | self.pause_for_plot = pause_for_plot 116 | 117 | self.qf_loss_averages = [] 118 | self.policy_surr_averages = [] 119 | self.q_averages = [] 120 | self.y_averages = [] 121 | self.paths = [] 122 | self.es_path_returns = [] 123 | self.paths_samples_cnt = 0 124 | 125 | self.scale_reward = scale_reward 126 | 127 | self.train_policy_itr = 0 128 | 129 | self.opt_info = None 130 | 131 | def start_worker(self): 132 | parallel_sampler.populate_task(self.env, self.policy) 133 | if self.plot: 134 | plotter.init_plot(self.env, self.policy) 135 | 136 | @overrides 137 | def train(self): 138 | with tf.Session() as sess: 139 | sess.run(tf.global_variables_initializer()) 140 | # This seems like a rather sequential method 141 | pool = SimpleReplayPool( 142 | max_pool_size=self.replay_pool_size, 143 | observation_dim=self.env.observation_space.flat_dim, 144 | action_dim=self.env.action_space.flat_dim, 145 | replacement_prob=self.replacement_prob, 146 | ) 147 | self.start_worker() 148 | 149 | self.init_opt() 150 | # This initializes the optimizer parameters 151 | sess.run(tf.global_variables_initializer()) 152 | itr = 0 153 | path_length = 0 154 | path_return = 0 155 | terminal = False 156 | initial = False 157 | observation = self.env.reset() 158 | 159 | with tf.variable_scope("sample_policy"): 160 | sample_policy = Serializable.clone(self.policy) 161 | 162 | for epoch in range(self.n_epochs): 163 | logger.push_prefix('epoch #%d | ' % epoch) 164 | logger.log("Training started") 165 | train_qf_itr, train_policy_itr = 0, 0 166 | 
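# Variant note: unlike the other two files, this variant keeps the standard critic target
# (next_qvals from the target networks) and moves the uncertainty into the actor update:
# do_training() runs MC_SAMPLES extra passes of f_train_qf to form mean(Q) + var(Q) and feeds
# that into f_train_policy. Two caveats visible in the code below: each of those passes also
# executes the critic train op, and the compiled policy loss still depends only on obs, so the
# fed Q values do not yet enter the surrogate (see the TO DO / CHANGES comments in init_opt()).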
167 | #sample a policy function from the posterior at every episode 168 | #move in the entire episode with the sampled policy function? 169 | 170 | for epoch_itr in pyprind.prog_bar(range(self.epoch_length)): 171 | # Execute policy 172 | if terminal: # or path_length > self.max_path_length: 173 | # Note that if the last time step ends an episode, the very 174 | # last state and observation will be ignored and not added 175 | # to the replay pool 176 | observation = self.env.reset() 177 | self.es.reset() 178 | sample_policy.reset() 179 | self.es_path_returns.append(path_return) 180 | path_length = 0 181 | path_return = 0 182 | initial = True 183 | else: 184 | initial = False 185 | 186 | action = self.es.get_action(itr, observation, policy=sample_policy) # qf=qf) 187 | 188 | 189 | next_observation, reward, terminal, _ = self.env.step(action) 190 | path_length += 1 191 | path_return += reward 192 | 193 | 194 | if not terminal and path_length >= self.max_path_length: 195 | terminal = True 196 | # only include the terminal transition in this case if the flag was set 197 | if self.include_horizon_terminal_transitions: 198 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 199 | else: 200 | pool.add_sample(observation, action, reward * self.scale_reward, terminal, initial) 201 | 202 | observation = next_observation 203 | 204 | if pool.size >= self.min_pool_size: 205 | for update_itr in range(self.n_updates_per_sample): 206 | # Train policy 207 | batch = pool.random_batch(self.batch_size) 208 | itrs = self.do_training(itr, batch) 209 | train_qf_itr += itrs[0] 210 | train_policy_itr += itrs[1] 211 | sample_policy.set_param_values(self.policy.get_param_values()) 212 | 213 | itr += 1 214 | 215 | logger.log("Training finished") 216 | logger.log("Trained qf %d steps, policy %d steps"%(train_qf_itr, train_policy_itr)) 217 | if pool.size >= self.min_pool_size: 218 | self.evaluate(epoch, pool) 219 | params = self.get_epoch_snapshot(epoch) 220 | logger.save_itr_params(epoch, params) 221 | logger.dump_tabular(with_prefix=False) 222 | logger.pop_prefix() 223 | if self.plot: 224 | self.update_plot() 225 | if self.pause_for_plot: 226 | input("Plotting evaluation run: Press Enter to " 227 | "continue...") 228 | self.env.terminate() 229 | self.policy.terminate() 230 | 231 | 232 | 233 | def init_opt(self): 234 | 235 | # First, create "target" policy and Q functions 236 | with tf.variable_scope("target_policy"): 237 | target_policy = Serializable.clone(self.policy) 238 | with tf.variable_scope("target_qf"): 239 | target_qf = Serializable.clone(self.qf) 240 | 241 | # y need to be computed first 242 | obs = self.env.observation_space.new_tensor_variable( 243 | 'obs', 244 | extra_dims=1, 245 | ) 246 | 247 | # The yi values are computed separately as above and then passed to 248 | # the training functions below 249 | action = self.env.action_space.new_tensor_variable( 250 | 'action', 251 | extra_dims=1, 252 | ) 253 | 254 | yvar = tensor_utils.new_tensor( 255 | 'ys', 256 | ndim=1, 257 | dtype=tf.float32, 258 | ) 259 | 260 | qf_weight_decay_term = 0.5 * self.qf_weight_decay * \ 261 | sum([tf.reduce_sum(tf.square(param)) for param in 262 | self.qf.get_params(regularizable=True)]) 263 | 264 | qval = self.qf.get_qval_sym(obs, action) 265 | 266 | qf_loss = tf.reduce_mean(tf.square(yvar - qval)) 267 | qf_reg_loss = qf_loss + qf_weight_decay_term 268 | 269 | policy_weight_decay_term = 0.5 * self.policy_weight_decay * \ 270 | sum([tf.reduce_sum(tf.square(param)) 271 | for param in 
self.policy.get_params(regularizable=True)]) 272 | 273 | 274 | """ 275 | Commenting these out below 276 | """ 277 | # import pdb; pdb.set_trace() 278 | policy_qval = self.qf.get_qval_sym(obs, self.policy.get_action_sym(obs), deterministic=True) 279 | # policy_surr = -tf.reduce_mean(policy_qval) 280 | 281 | ############################# 282 | 283 | """ 284 | TO DO HERE 285 | """ 286 | # Yes implement something like get_qval_plus_var_sym(state, policy, lambda): where it returns mean + lambda*variance 287 | # as tf output. variance is a function of multiple forward passes of dropout Q function. 288 | # get_qval_plus_var_sym(state, policy, lambda) 289 | 290 | # policy_qval_plus_var = self.qf.get_qval_plus_var_sym(obs, self.policy.get_action_sym(obs), lambda, deterministic=True) 291 | # policy_qval = self.qf.get_qval_plus_var_sym(obs, self.policy.get_action_sym(obs), deterministic=False) 292 | 293 | ############################ 294 | 295 | 296 | ############################# 297 | 298 | """ 299 | CHANGES HERE 300 | """ 301 | 302 | policy_surr = -tf.reduce_mean(policy_qval) 303 | policy_input_list = [obs, qval] 304 | 305 | 306 | ############################ 307 | 308 | policy_reg_surr = policy_surr + policy_weight_decay_term 309 | 310 | qf_input_list = [yvar, obs, action] 311 | 312 | """ 313 | Commented out 314 | """ 315 | # policy_input_list = [obs] 316 | 317 | self.qf_update_method.update_opt( 318 | loss=qf_reg_loss, target=self.qf, inputs=qf_input_list) 319 | 320 | 321 | self.policy_update_method.update_opt( 322 | loss=policy_reg_surr, target=self.policy, inputs=policy_input_list) 323 | 324 | f_train_qf = tensor_utils.compile_function( 325 | inputs=qf_input_list, 326 | outputs=[qf_loss, qval, self.qf_update_method._train_op], 327 | ) 328 | 329 | f_train_policy = tensor_utils.compile_function( 330 | inputs=policy_input_list, 331 | outputs=[policy_surr, self.policy_update_method._train_op], 332 | ) 333 | 334 | self.opt_info = dict( 335 | f_train_qf=f_train_qf, 336 | f_train_policy=f_train_policy, 337 | target_qf=target_qf, 338 | target_policy=target_policy, 339 | ) 340 | 341 | 342 | def do_training(self, itr, batch): 343 | 344 | obs, actions, rewards, next_obs, terminals = ext.extract( 345 | batch, 346 | "observations", "actions", "rewards", "next_observations", 347 | "terminals" 348 | ) 349 | 350 | # compute the on-policy y values 351 | target_qf = self.opt_info["target_qf"] 352 | target_policy = self.opt_info["target_policy"] 353 | 354 | next_actions, _ = target_policy.get_actions(next_obs) 355 | next_qvals = target_qf.get_qval(next_obs, next_actions) 356 | 357 | ys = rewards + (1. 
- terminals) * self.discount * next_qvals.reshape(-1) 358 | 359 | ### for critic update step 360 | f_train_qf = self.opt_info["f_train_qf"] 361 | qf_loss, qval, _ = f_train_qf(ys, obs, actions) 362 | 363 | target_qf.set_param_values( 364 | target_qf.get_param_values() * (1.0 - self.soft_target_tau) + 365 | self.qf.get_param_values() * self.soft_target_tau) 366 | self.qf_loss_averages.append(qf_loss) 367 | self.q_averages.append(qval) 368 | self.y_averages.append(ys) 369 | 370 | self.train_policy_itr += self.policy_updates_ratio 371 | train_policy_itr = 0 372 | 373 | 374 | while self.train_policy_itr > 0: 375 | 376 | ### for actor update step 377 | f_train_policy = self.opt_info["f_train_policy"] 378 | 379 | 380 | MC_SAMPLES=20 381 | all_qval = np.zeros(shape=(obs.shape[0], MC_SAMPLES)) 382 | for m in range(MC_SAMPLES): 383 | 384 | _, qval, _ = f_train_qf(ys, obs, actions) 385 | all_qval[:, m] = qval 386 | 387 | mean_qval = np.mean(all_qval, axis=1) 388 | var_qval = np.var(all_qval, axis=1) 389 | 390 | qval_uncertain = mean_qval + var_qval 391 | 392 | #policy_surr, _ = f_train_policy(obs) 393 | policy_surr, _ = f_train_policy(obs, qval_uncertain) 394 | 395 | target_policy.set_param_values( 396 | target_policy.get_param_values() * (1.0 - self.soft_target_tau) + 397 | self.policy.get_param_values() * self.soft_target_tau) 398 | self.policy_surr_averages.append(policy_surr) 399 | self.train_policy_itr -= 1 400 | train_policy_itr += 1 401 | return 1, train_policy_itr # number of itrs qf, policy are trained 402 | 403 | 404 | 405 | 406 | 407 | 408 | 409 | 410 | def evaluate(self, epoch, pool): 411 | logger.log("Collecting samples for evaluation") 412 | paths = parallel_sampler.sample_paths( 413 | policy_params=self.policy.get_param_values(), 414 | max_samples=self.eval_samples, 415 | max_path_length=self.max_path_length, 416 | ) 417 | 418 | average_discounted_return = np.mean( 419 | [special.discount_return(path["rewards"], self.discount) for path in paths] 420 | ) 421 | 422 | returns = [sum(path["rewards"]) for path in paths] 423 | 424 | all_qs = np.concatenate(self.q_averages) 425 | all_ys = np.concatenate(self.y_averages) 426 | 427 | average_q_loss = np.mean(self.qf_loss_averages) 428 | average_policy_surr = np.mean(self.policy_surr_averages) 429 | average_action = np.mean(np.square(np.concatenate( 430 | [path["actions"] for path in paths] 431 | ))) 432 | 433 | policy_reg_param_norm = np.linalg.norm( 434 | self.policy.get_param_values(regularizable=True) 435 | ) 436 | qfun_reg_param_norm = np.linalg.norm( 437 | self.qf.get_param_values(regularizable=True) 438 | ) 439 | 440 | logger.record_tabular('Epoch', epoch) 441 | logger.record_tabular('Iteration', epoch) 442 | logger.record_tabular('AverageReturn', np.mean(returns)) 443 | logger.record_tabular('StdReturn', 444 | np.std(returns)) 445 | logger.record_tabular('MaxReturn', 446 | np.max(returns)) 447 | logger.record_tabular('MinReturn', 448 | np.min(returns)) 449 | if len(self.es_path_returns) > 0: 450 | logger.record_tabular('AverageEsReturn', 451 | np.mean(self.es_path_returns)) 452 | logger.record_tabular('StdEsReturn', 453 | np.std(self.es_path_returns)) 454 | logger.record_tabular('MaxEsReturn', 455 | np.max(self.es_path_returns)) 456 | logger.record_tabular('MinEsReturn', 457 | np.min(self.es_path_returns)) 458 | logger.record_tabular('AverageDiscountedReturn', 459 | average_discounted_return) 460 | logger.record_tabular('AverageQLoss', average_q_loss) 461 | logger.record_tabular('AveragePolicySurr', average_policy_surr) 462 | 
logger.record_tabular('AverageQ', np.mean(all_qs)) 463 | logger.record_tabular('AverageAbsQ', np.mean(np.abs(all_qs))) 464 | logger.record_tabular('AverageY', np.mean(all_ys)) 465 | logger.record_tabular('AverageAbsY', np.mean(np.abs(all_ys))) 466 | logger.record_tabular('AverageAbsQYDiff', 467 | np.mean(np.abs(all_qs - all_ys))) 468 | logger.record_tabular('AverageAction', average_action) 469 | 470 | logger.record_tabular('PolicyRegParamNorm', 471 | policy_reg_param_norm) 472 | logger.record_tabular('QFunRegParamNorm', 473 | qfun_reg_param_norm) 474 | 475 | self.env.log_diagnostics(paths) 476 | self.policy.log_diagnostics(paths) 477 | 478 | self.qf_loss_averages = [] 479 | self.policy_surr_averages = [] 480 | 481 | self.q_averages = [] 482 | self.y_averages = [] 483 | self.es_path_returns = [] 484 | 485 | def update_plot(self): 486 | if self.plot: 487 | plotter.update_plot(self.policy, self.max_path_length) 488 | 489 | def get_epoch_snapshot(self, epoch): 490 | return dict( 491 | env=self.env, 492 | epoch=epoch, 493 | qf=self.qf, 494 | policy=self.policy, 495 | target_qf=self.opt_info["target_qf"], 496 | target_policy=self.opt_info["target_policy"], 497 | es=self.es, 498 | ) 499 | -------------------------------------------------------------------------------- /bayesian_network.py: -------------------------------------------------------------------------------- 1 | import sandbox.rocky.tf.core.layers as L 2 | import tensorflow as tf 3 | import numpy as np 4 | import itertools 5 | from rllab.core.serializable import Serializable 6 | from sandbox.rocky.tf.core.parameterized import Parameterized 7 | from sandbox.rocky.tf.core.layers_powered import LayersPowered 8 | 9 | 10 | class MLP(LayersPowered, Serializable): 11 | def __init__(self, name, output_dim, hidden_sizes, hidden_nonlinearity, dropout_prob, 12 | output_nonlinearity, hidden_W_init=L.XavierUniformInitializer(), hidden_b_init=tf.zeros_initializer(), 13 | output_W_init=L.XavierUniformInitializer(), output_b_init=tf.zeros_initializer(), 14 | input_var=None, input_layer=None, input_shape=None, batch_normalization=False, weight_normalization=False, 15 | ): 16 | 17 | Serializable.quick_init(self, locals()) 18 | 19 | with tf.variable_scope(name): 20 | if input_layer is None: 21 | l_in = L.InputLayer(shape=(None,) + input_shape, input_var=input_var, name="input") 22 | else: 23 | l_in = input_layer 24 | self._layers = [l_in] 25 | 26 | ##applying dropout on all layers? 27 | l_hid_dropout_input = L.DropoutLayer(l_in, p = dropout_prob) 28 | l_hid = l_hid_dropout_input 29 | 30 | 31 | # l_hid = l_in 32 | if batch_normalization: 33 | l_hid = L.batch_norm(l_hid) 34 | for idx, hidden_size in enumerate(hidden_sizes): 35 | l_hid = L.DenseLayer( 36 | l_hid, 37 | num_units=hidden_size, 38 | nonlinearity=hidden_nonlinearity, 39 | name="hidden_%d" % idx, 40 | W=hidden_W_init, 41 | b=hidden_b_init, 42 | weight_normalization=weight_normalization 43 | ) 44 | if batch_normalization: 45 | l_hid = L.batch_norm(l_hid) 46 | self._layers.append(l_hid) 47 | 48 | 49 | ###applying dropout to the last hidden layer? 
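# Context for the dropout placement in this network: these DropoutLayers are meant to stay
# stochastic at prediction time (i.e. L.get_output called with deterministic=False), which is
# what makes the MLP an MC-dropout approximation of a Bayesian network. Each stochastic
# forward pass is one draw from the approximate posterior, so the mean over several passes
# gives the prediction and their spread gives the uncertainty used by the DDPG variants above.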
50 | l_hid_dropout = L.DropoutLayer(l_hid, p=dropout_prob) 51 | 52 | l_out = L.DenseLayer( 53 | l_hid_dropout, 54 | num_units=output_dim, 55 | nonlinearity=output_nonlinearity, 56 | name="output", 57 | W=output_W_init, 58 | b=output_b_init, 59 | weight_normalization=weight_normalization 60 | ) 61 | 62 | # l_out = L.DenseLayer( 63 | # l_hid, 64 | # num_units=output_dim, 65 | # nonlinearity=output_nonlinearity, 66 | # name="output", 67 | # W=output_W_init, 68 | # b=output_b_init, 69 | # weight_normalization=weight_normalization 70 | # ) 71 | 72 | #Alternative, making output layer the dropout layer 73 | # l_out = L.DropoutLayer(l_hid, p=dropout_prob) 74 | 75 | if batch_normalization: 76 | l_out = L.batch_norm(l_out) 77 | 78 | 79 | self._layers.append(l_out) 80 | self._l_in = l_in 81 | self._l_out = l_out 82 | # self._input_var = l_in.input_var 83 | self._output = L.get_output(l_out) 84 | 85 | LayersPowered.__init__(self, l_out) 86 | 87 | @property 88 | def input_layer(self): 89 | return self._l_in 90 | 91 | @property 92 | def output_layer(self): 93 | return self._l_out 94 | 95 | @property 96 | def input_var(self): 97 | return self._l_in.input_var 98 | 99 | @property 100 | def layers(self): 101 | return self._layers 102 | 103 | @property 104 | def output(self): 105 | return self._output 106 | 107 | 108 | class ConvNetwork(LayersPowered, Serializable): 109 | def __init__(self, name, input_shape, output_dim, 110 | conv_filters, conv_filter_sizes, conv_strides, conv_pads, 111 | hidden_sizes, hidden_nonlinearity, output_nonlinearity, 112 | hidden_W_init=L.XavierUniformInitializer(), hidden_b_init=tf.zeros_initializer(), 113 | output_W_init=L.XavierUniformInitializer(), output_b_init=tf.zeros_initializer(), 114 | input_var=None, input_layer=None, batch_normalization=False, weight_normalization=False): 115 | Serializable.quick_init(self, locals()) 116 | """ 117 | A network composed of several convolution layers followed by some fc layers. 118 | input_shape: (width,height,channel) 119 | HOWEVER, network inputs are assumed flattened. This network will first unflatten the inputs and then apply the standard convolutions and so on. 
120 | conv_filters: a list of numbers of convolution kernel 121 | conv_filter_sizes: a list of sizes (int) of the convolution kernels 122 | conv_strides: a list of strides (int) of the conv kernels 123 | conv_pads: a list of pad formats (either 'SAME' or 'VALID') 124 | hidden_nonlinearity: a nonlinearity from tf.nn, shared by all conv and fc layers 125 | hidden_sizes: a list of numbers of hidden units for all fc layers 126 | """ 127 | with tf.variable_scope(name): 128 | if input_layer is not None: 129 | l_in = input_layer 130 | l_hid = l_in 131 | elif len(input_shape) == 3: 132 | l_in = L.InputLayer(shape=(None, np.prod(input_shape)), input_var=input_var, name="input") 133 | l_hid = L.reshape(l_in, ([0],) + input_shape, name="reshape_input") 134 | elif len(input_shape) == 2: 135 | l_in = L.InputLayer(shape=(None, np.prod(input_shape)), input_var=input_var, name="input") 136 | input_shape = (1,) + input_shape 137 | l_hid = L.reshape(l_in, ([0],) + input_shape, name="reshape_input") 138 | else: 139 | l_in = L.InputLayer(shape=(None,) + input_shape, input_var=input_var, name="input") 140 | l_hid = l_in 141 | 142 | if batch_normalization: 143 | l_hid = L.batch_norm(l_hid) 144 | for idx, conv_filter, filter_size, stride, pad in zip( 145 | range(len(conv_filters)), 146 | conv_filters, 147 | conv_filter_sizes, 148 | conv_strides, 149 | conv_pads, 150 | ): 151 | l_hid = L.Conv2DLayer( 152 | l_hid, 153 | num_filters=conv_filter, 154 | filter_size=filter_size, 155 | stride=(stride, stride), 156 | pad=pad, 157 | nonlinearity=hidden_nonlinearity, 158 | name="conv_hidden_%d" % idx, 159 | weight_normalization=weight_normalization, 160 | ) 161 | if batch_normalization: 162 | l_hid = L.batch_norm(l_hid) 163 | 164 | if output_nonlinearity == L.spatial_expected_softmax: 165 | assert len(hidden_sizes) == 0 166 | assert output_dim == conv_filters[-1] * 2 167 | l_hid.nonlinearity = tf.identity 168 | l_out = L.SpatialExpectedSoftmaxLayer(l_hid) 169 | else: 170 | l_hid = L.flatten(l_hid, name="conv_flatten") 171 | for idx, hidden_size in enumerate(hidden_sizes): 172 | l_hid = L.DenseLayer( 173 | l_hid, 174 | num_units=hidden_size, 175 | nonlinearity=hidden_nonlinearity, 176 | name="hidden_%d" % idx, 177 | W=hidden_W_init, 178 | b=hidden_b_init, 179 | weight_normalization=weight_normalization, 180 | ) 181 | if batch_normalization: 182 | l_hid = L.batch_norm(l_hid) 183 | l_out = L.DenseLayer( 184 | l_hid, 185 | num_units=output_dim, 186 | nonlinearity=output_nonlinearity, 187 | name="output", 188 | W=output_W_init, 189 | b=output_b_init, 190 | weight_normalization=weight_normalization, 191 | ) 192 | if batch_normalization: 193 | l_out = L.batch_norm(l_out) 194 | self._l_in = l_in 195 | self._l_out = l_out 196 | # self._input_var = l_in.input_var 197 | 198 | LayersPowered.__init__(self, l_out) 199 | 200 | @property 201 | def input_layer(self): 202 | return self._l_in 203 | 204 | @property 205 | def output_layer(self): 206 | return self._l_out 207 | 208 | @property 209 | def input_var(self): 210 | return self._l_in.input_var 211 | 212 | 213 | class GRUNetwork(object): 214 | def __init__(self, name, input_shape, output_dim, hidden_dim, hidden_nonlinearity=tf.nn.relu, 215 | gru_layer_cls=L.GRULayer, 216 | output_nonlinearity=None, input_var=None, input_layer=None, layer_args=None): 217 | with tf.variable_scope(name): 218 | if input_layer is None: 219 | l_in = L.InputLayer(shape=(None, None) + input_shape, input_var=input_var, name="input") 220 | else: 221 | l_in = input_layer 222 | l_step_input = 
L.InputLayer(shape=(None,) + input_shape, name="step_input") 223 | l_step_prev_state = L.InputLayer(shape=(None, hidden_dim), name="step_prev_state") 224 | if layer_args is None: 225 | layer_args = dict() 226 | l_gru = gru_layer_cls(l_in, num_units=hidden_dim, hidden_nonlinearity=hidden_nonlinearity, 227 | hidden_init_trainable=False, name="gru", **layer_args) 228 | l_gru_flat = L.ReshapeLayer( 229 | l_gru, shape=(-1, hidden_dim), 230 | name="gru_flat" 231 | ) 232 | l_output_flat = L.DenseLayer( 233 | l_gru_flat, 234 | num_units=output_dim, 235 | nonlinearity=output_nonlinearity, 236 | name="output_flat" 237 | ) 238 | l_output = L.OpLayer( 239 | l_output_flat, 240 | op=lambda flat_output, l_input: 241 | tf.reshape(flat_output, tf.stack((tf.shape(l_input)[0], tf.shape(l_input)[1], -1))), 242 | shape_op=lambda flat_output_shape, l_input_shape: 243 | (l_input_shape[0], l_input_shape[1], flat_output_shape[-1]), 244 | extras=[l_in], 245 | name="output" 246 | ) 247 | l_step_state = l_gru.get_step_layer(l_step_input, l_step_prev_state, name="step_state") 248 | l_step_hidden = l_step_state 249 | l_step_output = L.DenseLayer( 250 | l_step_hidden, 251 | num_units=output_dim, 252 | nonlinearity=output_nonlinearity, 253 | W=l_output_flat.W, 254 | b=l_output_flat.b, 255 | name="step_output" 256 | ) 257 | 258 | self._l_in = l_in 259 | self._hid_init_param = l_gru.h0 260 | self._l_gru = l_gru 261 | self._l_out = l_output 262 | self._l_step_input = l_step_input 263 | self._l_step_prev_state = l_step_prev_state 264 | self._l_step_hidden = l_step_hidden 265 | self._l_step_state = l_step_state 266 | self._l_step_output = l_step_output 267 | self._hidden_dim = hidden_dim 268 | 269 | @property 270 | def state_dim(self): 271 | return self._hidden_dim 272 | 273 | @property 274 | def hidden_dim(self): 275 | return self._hidden_dim 276 | 277 | @property 278 | def input_layer(self): 279 | return self._l_in 280 | 281 | @property 282 | def input_var(self): 283 | return self._l_in.input_var 284 | 285 | @property 286 | def output_layer(self): 287 | return self._l_out 288 | 289 | @property 290 | def recurrent_layer(self): 291 | return self._l_gru 292 | 293 | @property 294 | def step_input_layer(self): 295 | return self._l_step_input 296 | 297 | @property 298 | def step_prev_state_layer(self): 299 | return self._l_step_prev_state 300 | 301 | @property 302 | def step_hidden_layer(self): 303 | return self._l_step_hidden 304 | 305 | @property 306 | def step_state_layer(self): 307 | return self._l_step_state 308 | 309 | @property 310 | def step_output_layer(self): 311 | return self._l_step_output 312 | 313 | @property 314 | def hid_init_param(self): 315 | return self._hid_init_param 316 | 317 | @property 318 | def state_init_param(self): 319 | return self._hid_init_param 320 | 321 | 322 | class LSTMNetwork(object): 323 | def __init__(self, name, input_shape, output_dim, hidden_dim, hidden_nonlinearity=tf.nn.relu, 324 | lstm_layer_cls=L.LSTMLayer, 325 | output_nonlinearity=None, input_var=None, input_layer=None, forget_bias=1.0, use_peepholes=False, 326 | layer_args=None): 327 | with tf.variable_scope(name): 328 | if input_layer is None: 329 | l_in = L.InputLayer(shape=(None, None) + input_shape, input_var=input_var, name="input") 330 | else: 331 | l_in = input_layer 332 | l_step_input = L.InputLayer(shape=(None,) + input_shape, name="step_input") 333 | # contains previous hidden and cell state 334 | l_step_prev_state = L.InputLayer(shape=(None, hidden_dim * 2), name="step_prev_state") 335 | if layer_args is None: 336 | 
layer_args = dict() 337 | l_lstm = lstm_layer_cls(l_in, num_units=hidden_dim, hidden_nonlinearity=hidden_nonlinearity, 338 | hidden_init_trainable=False, name="lstm", forget_bias=forget_bias, 339 | cell_init_trainable=False, use_peepholes=use_peepholes, **layer_args) 340 | l_lstm_flat = L.ReshapeLayer( 341 | l_lstm, shape=(-1, hidden_dim), 342 | name="lstm_flat" 343 | ) 344 | l_output_flat = L.DenseLayer( 345 | l_lstm_flat, 346 | num_units=output_dim, 347 | nonlinearity=output_nonlinearity, 348 | name="output_flat" 349 | ) 350 | l_output = L.OpLayer( 351 | l_output_flat, 352 | op=lambda flat_output, l_input: 353 | tf.reshape(flat_output, tf.stack((tf.shape(l_input)[0], tf.shape(l_input)[1], -1))), 354 | shape_op=lambda flat_output_shape, l_input_shape: 355 | (l_input_shape[0], l_input_shape[1], flat_output_shape[-1]), 356 | extras=[l_in], 357 | name="output" 358 | ) 359 | l_step_state = l_lstm.get_step_layer(l_step_input, l_step_prev_state, name="step_state") 360 | l_step_hidden = L.SliceLayer(l_step_state, indices=slice(hidden_dim), name="step_hidden") 361 | l_step_cell = L.SliceLayer(l_step_state, indices=slice(hidden_dim, None), name="step_cell") 362 | l_step_output = L.DenseLayer( 363 | l_step_hidden, 364 | num_units=output_dim, 365 | nonlinearity=output_nonlinearity, 366 | W=l_output_flat.W, 367 | b=l_output_flat.b, 368 | name="step_output" 369 | ) 370 | 371 | self._l_in = l_in 372 | self._hid_init_param = l_lstm.h0 373 | self._cell_init_param = l_lstm.c0 374 | self._l_lstm = l_lstm 375 | self._l_out = l_output 376 | self._l_step_input = l_step_input 377 | self._l_step_prev_state = l_step_prev_state 378 | self._l_step_hidden = l_step_hidden 379 | self._l_step_cell = l_step_cell 380 | self._l_step_state = l_step_state 381 | self._l_step_output = l_step_output 382 | self._hidden_dim = hidden_dim 383 | 384 | @property 385 | def state_dim(self): 386 | return self._hidden_dim * 2 387 | 388 | @property 389 | def input_layer(self): 390 | return self._l_in 391 | 392 | @property 393 | def input_var(self): 394 | return self._l_in.input_var 395 | 396 | @property 397 | def output_layer(self): 398 | return self._l_out 399 | 400 | @property 401 | def recurrent_layer(self): 402 | return self._l_lstm 403 | 404 | @property 405 | def step_input_layer(self): 406 | return self._l_step_input 407 | 408 | @property 409 | def step_prev_state_layer(self): 410 | return self._l_step_prev_state 411 | 412 | @property 413 | def step_hidden_layer(self): 414 | return self._l_step_hidden 415 | 416 | @property 417 | def step_state_layer(self): 418 | return self._l_step_state 419 | 420 | @property 421 | def step_cell_layer(self): 422 | return self._l_step_cell 423 | 424 | @property 425 | def step_output_layer(self): 426 | return self._l_step_output 427 | 428 | @property 429 | def hid_init_param(self): 430 | return self._hid_init_param 431 | 432 | @property 433 | def cell_init_param(self): 434 | return self._cell_init_param 435 | 436 | @property 437 | def state_init_param(self): 438 | return tf.concat(axis=0, values=[self._hid_init_param, self._cell_init_param]) 439 | 440 | 441 | class ConvMergeNetwork(LayersPowered, Serializable): 442 | """ 443 | This network allows the input to consist of a convolution-friendly component, plus a non-convolution-friendly 444 | component. These two components will be concatenated in the fully connected layers. There can also be a list of 445 | optional layers for the non-convolution-friendly component alone. 
446 | 447 | 448 | The input to the network should be a matrix where each row is a single input entry, with both the aforementioned 449 | components flattened out and then concatenated together 450 | """ 451 | 452 | def __init__(self, name, input_shape, extra_input_shape, output_dim, hidden_sizes, 453 | conv_filters, conv_filter_sizes, conv_strides, conv_pads, 454 | extra_hidden_sizes=None, 455 | hidden_W_init=L.XavierUniformInitializer(), hidden_b_init=tf.zeros_initializer(), 456 | output_W_init=L.XavierUniformInitializer(), output_b_init=tf.zeros_initializer(), 457 | hidden_nonlinearity=tf.nn.relu, 458 | output_nonlinearity=None, 459 | input_var=None, input_layer=None): 460 | Serializable.quick_init(self, locals()) 461 | 462 | if extra_hidden_sizes is None: 463 | extra_hidden_sizes = [] 464 | 465 | with tf.variable_scope(name): 466 | 467 | input_flat_dim = np.prod(input_shape) 468 | extra_input_flat_dim = np.prod(extra_input_shape) 469 | total_input_flat_dim = input_flat_dim + extra_input_flat_dim 470 | 471 | if input_layer is None: 472 | l_in = L.InputLayer(shape=(None, total_input_flat_dim), input_var=input_var, name="input") 473 | else: 474 | l_in = input_layer 475 | 476 | l_conv_in = L.reshape( 477 | L.SliceLayer( 478 | l_in, 479 | indices=slice(input_flat_dim), 480 | name="conv_slice" 481 | ), 482 | ([0],) + input_shape, 483 | name="conv_reshaped" 484 | ) 485 | l_extra_in = L.reshape( 486 | L.SliceLayer( 487 | l_in, 488 | indices=slice(input_flat_dim, None), 489 | name="extra_slice" 490 | ), 491 | ([0],) + extra_input_shape, 492 | name="extra_reshaped" 493 | ) 494 | 495 | l_conv_hid = l_conv_in 496 | for idx, conv_filter, filter_size, stride, pad in zip( 497 | range(len(conv_filters)), 498 | conv_filters, 499 | conv_filter_sizes, 500 | conv_strides, 501 | conv_pads, 502 | ): 503 | l_conv_hid = L.Conv2DLayer( 504 | l_conv_hid, 505 | num_filters=conv_filter, 506 | filter_size=filter_size, 507 | stride=(stride, stride), 508 | pad=pad, 509 | nonlinearity=hidden_nonlinearity, 510 | name="conv_hidden_%d" % idx, 511 | ) 512 | 513 | l_extra_hid = l_extra_in 514 | for idx, hidden_size in enumerate(extra_hidden_sizes): 515 | l_extra_hid = L.DenseLayer( 516 | l_extra_hid, 517 | num_units=hidden_size, 518 | nonlinearity=hidden_nonlinearity, 519 | name="extra_hidden_%d" % idx, 520 | W=hidden_W_init, 521 | b=hidden_b_init, 522 | ) 523 | 524 | l_joint_hid = L.concat( 525 | [L.flatten(l_conv_hid, name="conv_hidden_flat"), l_extra_hid], 526 | name="joint_hidden" 527 | ) 528 | 529 | for idx, hidden_size in enumerate(hidden_sizes): 530 | l_joint_hid = L.DenseLayer( 531 | l_joint_hid, 532 | num_units=hidden_size, 533 | nonlinearity=hidden_nonlinearity, 534 | name="joint_hidden_%d" % idx, 535 | W=hidden_W_init, 536 | b=hidden_b_init, 537 | ) 538 | l_out = L.DenseLayer( 539 | l_joint_hid, 540 | num_units=output_dim, 541 | nonlinearity=output_nonlinearity, 542 | name="output", 543 | W=output_W_init, 544 | b=output_b_init, 545 | ) 546 | self._l_in = l_in 547 | self._l_out = l_out 548 | 549 | LayersPowered.__init__(self, [l_out], input_layers=[l_in]) 550 | 551 | @property 552 | def input_layer(self): 553 | return self._l_in 554 | 555 | @property 556 | def output_layer(self): 557 | return self._l_out 558 | 559 | @property 560 | def input_var(self): 561 | return self._l_in.input_var 562 | --------------------------------------------------------------------------------
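The ConvMergeNetwork docstring above describes an input matrix in which each row is a flattened convolution-friendly component concatenated with a flattened non-convolutional component. As a usage illustration only — the shapes, variable names, channels-last image layout, and the TF1-style session below are assumptions about a standard rllab `sandbox.rocky.tf` setup, not something this repository pins down — a minimal instantiation sketch might look like:

    # Usage sketch (illustrative only): build a ConvMergeNetwork on an input that
    # concatenates a flattened image with a flattened low-dimensional vector.
    # Assumes the rllab sandbox.rocky.tf stack and a TF1-style graph/session;
    # the shapes, names, and channels-last image layout are assumptions.
    import numpy as np
    import tensorflow as tf

    import sandbox.rocky.tf.core.layers as L
    from sandbox.rocky.tf.core.network import ConvMergeNetwork

    image_shape = (32, 32, 3)   # convolution-friendly component (assumed H, W, C)
    extra_shape = (10,)         # non-convolution-friendly component (e.g. joint angles)

    net = ConvMergeNetwork(
        name="conv_merge_example",
        input_shape=image_shape,
        extra_input_shape=extra_shape,
        output_dim=4,
        hidden_sizes=(64,),          # joint fully connected layers after concatenation
        conv_filters=(16, 16),
        conv_filter_sizes=(3, 3),
        conv_strides=(2, 2),
        conv_pads=("SAME", "SAME"),
        extra_hidden_sizes=(32,),    # optional layers for the extra component alone
        hidden_nonlinearity=tf.nn.relu,
    )

    # Each input row is the flattened image concatenated with the flattened extras,
    # matching the (None, total_input_flat_dim) InputLayer built by the constructor.
    flat_dim = int(np.prod(image_shape) + np.prod(extra_shape))
    batch = np.zeros((5, flat_dim), dtype=np.float32)

    output_sym = L.get_output(net.output_layer)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        out = sess.run(output_sym, feed_dict={net.input_var: batch})
        # out has shape (5, 4): one output_dim-sized row per input entry

A related design choice is visible in the recurrent classes above: GRUNetwork and LSTMNetwork expose step_input_layer, step_prev_state_layer, and step_output_layer so a policy can advance one timestep at a time during rollouts, and the per-step output layer reuses l_output_flat's W and b, which keeps the step-wise outputs consistent with the batch-unrolled output head.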