├── .gitignore
├── README.md
├── __init__.py
├── algos
│   ├── __init__.py
│   ├── batch_polopt_expl.py
│   ├── erwr_expl.py
│   ├── npo_expl.py
│   ├── trpo_expl.py
│   └── vpg_expl.py
├── dynamics
│   ├── __init__.py
│   └── bnn.py
├── envs
│   ├── __init__.py
│   ├── cartpole_swingup_env_x.py
│   ├── double_pendulum_env_x.py
│   ├── half_cheetah_env_x.py
│   └── mountain_car_env_x.py
├── experiments
│   ├── __init__.py
│   ├── run_experiment_lite.py
│   ├── run_trpo.py
│   └── run_trpo_expl.py
└── sampler
    ├── __init__.py
    └── parallel_sampler_expl.py

/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | *.pyc
3 | *-checkpoint.ipynb
4 | .DS_Store
5 | *.h5
6 | *.log
7 | *.npz
8 | secrets.py
9 | *.avi
10 | *.mp4
11 | build
12 | build_linux
13 | .idea
14 | .sublime-project
15 | run_experiment.sh
16 | scratch-notebooks
17 | launch_scripts
18 | *.sh.e*
19 | *.sh.o*
20 | MUJOCO_LOG.TXT
21 | vendor/mujoco
22 | .project
23 | .pydevproject
24 | *.pdf
25 | .env
26 | snippets
27 | private
28 | lua
29 | iterate.dat
30 | .env
31 | src/
32 | .settings
33 | .pods
34 | docs/_build
35 | *.swp
36 | *.dat
37 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **Status:** Archive (code is provided as-is, no updates expected)
2 |
3 | # How to run VIME
4 |
5 | Variational Information Maximizing Exploration (VIME) as presented in Curiosity-driven Exploration in Deep Reinforcement Learning via Bayesian Neural Networks by *R. Houthooft, X. Chen, Y. Duan, J. Schulman, F. De Turck, P. Abbeel* (http://arxiv.org/abs/1605.09674).
6 |
7 | To reproduce the results, you should first have [rllab](https://github.com/rllab/rllab) and Mujoco v1.31 configured. Then, run the following commands in the root folder of `rllab`:
8 |
9 | ```bash
10 | git submodule add -f git@github.com:openai/vime.git sandbox/vime
11 | touch sandbox/__init__.py
12 | ```
13 |
14 | Then you can do the following:
15 | - Execute TRPO+VIME on the hierarchical SwimmerGather environment via `python sandbox/vime/experiments/run_trpo_expl.py`.
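For readers skimming the code that follows, the core of VIME is a reward-shaping step implemented in `algos/batch_polopt_expl.py` (see `process_samples`): each step's external reward is augmented with `eta` times the KL divergence between the BNN dynamics model's parameters after and before updating on that transition, optionally normalized by an average of recent KL values (the `use_kl_ratio_q` option). The snippet below is a minimal illustrative sketch of that shaping, not code from this repository; `vime_shaped_rewards` is a hypothetical helper name.

```python
import numpy as np

def vime_shaped_rewards(rewards, kls, eta, kl_history=None):
    """Hypothetical helper mirroring the reward shaping in batch_polopt_expl.py.

    rewards, kls -- per-step external rewards and BNN KL divergences for one path
    eta          -- weight of the intrinsic (information-gain) reward
    kl_history   -- optional collection of recent median KL values used for
                    normalization, as with the `use_kl_ratio_q` option
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    kls = np.asarray(kls, dtype=np.float64)
    if kl_history:
        # Divide by a running average of past KLs to counter exploding KL values.
        kls = kls / np.mean(kl_history)
    return rewards + eta * kls
```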
16 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/vime/ad6ca976ef07e8c1fd4c0a353716bbd0b52539fb/__init__.py -------------------------------------------------------------------------------- /algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/vime/ad6ca976ef07e8c1fd4c0a353716bbd0b52539fb/algos/__init__.py -------------------------------------------------------------------------------- /algos/batch_polopt_expl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.algos.base import RLAlgorithm 4 | from sandbox.vime.sampler import parallel_sampler_expl as parallel_sampler 5 | from rllab.misc import special 6 | from rllab.misc import tensor_utils 7 | from rllab.algos import util 8 | import rllab.misc.logger as logger 9 | import rllab.plotter as plotter 10 | 11 | # exploration imports 12 | # ------------------- 13 | import theano 14 | import lasagne 15 | from collections import deque 16 | import time 17 | from sandbox.vime.dynamics import bnn 18 | # ------------------- 19 | 20 | 21 | class SimpleReplayPool(object): 22 | """Replay pool""" 23 | 24 | def __init__( 25 | self, max_pool_size, observation_shape, action_dim, 26 | observation_dtype=theano.config.floatX, # @UndefinedVariable 27 | action_dtype=theano.config.floatX): # @UndefinedVariable 28 | self._observation_shape = observation_shape 29 | self._action_dim = action_dim 30 | self._observation_dtype = observation_dtype 31 | self._action_dtype = action_dtype 32 | self._max_pool_size = max_pool_size 33 | 34 | self._observations = np.zeros( 35 | (max_pool_size,) + observation_shape, 36 | dtype=observation_dtype 37 | ) 38 | self._actions = np.zeros( 39 | (max_pool_size, action_dim), 40 | dtype=action_dtype 41 | ) 42 | self._rewards = np.zeros(max_pool_size, dtype='float32') 43 | self._terminals = np.zeros(max_pool_size, dtype='uint8') 44 | self._bottom = 0 45 | self._top = 0 46 | self._size = 0 47 | 48 | def add_sample(self, observation, action, reward, terminal): 49 | self._observations[self._top] = observation 50 | self._actions[self._top] = action 51 | self._rewards[self._top] = reward 52 | self._terminals[self._top] = terminal 53 | self._top = (self._top + 1) % self._max_pool_size 54 | if self._size >= self._max_pool_size: 55 | self._bottom = (self._bottom + 1) % self._max_pool_size 56 | else: 57 | self._size = self._size + 1 58 | 59 | def random_batch(self, batch_size): 60 | assert self._size > batch_size 61 | indices = np.zeros(batch_size, dtype='uint64') 62 | transition_indices = np.zeros(batch_size, dtype='uint64') 63 | count = 0 64 | while count < batch_size: 65 | index = np.random.randint( 66 | self._bottom, self._bottom + self._size) % self._max_pool_size 67 | # make sure that the transition is valid: if we are at the end of the pool, we need to discard 68 | # this sample 69 | if index == self._size - 1 and self._size <= self._max_pool_size: 70 | continue 71 | transition_index = (index + 1) % self._max_pool_size 72 | indices[count] = index 73 | transition_indices[count] = transition_index 74 | count += 1 75 | return dict( 76 | observations=self._observations[indices], 77 | actions=self._actions[indices], 78 | rewards=self._rewards[indices], 79 | terminals=self._terminals[indices], 80 | 
next_observations=self._observations[transition_indices] 81 | ) 82 | 83 | def mean_obs_act(self): 84 | if self._size >= self._max_pool_size: 85 | obs = self._observations 86 | act = self._actions 87 | else: 88 | obs = self._observations[:self._top + 1] 89 | act = self._actions[:self._top + 1] 90 | obs_mean = np.mean(obs, axis=0) 91 | obs_std = np.std(obs, axis=0) 92 | act_mean = np.mean(act, axis=0) 93 | act_std = np.std(act, axis=0) 94 | return obs_mean, obs_std, act_mean, act_std 95 | 96 | @property 97 | def size(self): 98 | return self._size 99 | 100 | 101 | class BatchPolopt(RLAlgorithm): 102 | """ 103 | Base class for batch sampling-based policy optimization methods. 104 | This includes various policy gradient methods like vpg, npg, ppo, trpo, etc. 105 | """ 106 | 107 | def __init__( 108 | self, 109 | env, 110 | policy, 111 | baseline, 112 | n_itr=500, 113 | start_itr=0, 114 | batch_size=5000, 115 | max_path_length=500, 116 | discount=0.99, 117 | gae_lambda=1, 118 | plot=False, 119 | pause_for_plot=False, 120 | whole_paths=True, 121 | center_adv=True, 122 | positive_adv=False, 123 | record_states=False, 124 | store_paths=False, 125 | algorithm_parallelized=False, 126 | # exploration params 127 | eta=1., 128 | snn_n_samples=10, 129 | prior_sd=0.5, 130 | use_kl_ratio=False, 131 | kl_q_len=10, 132 | use_reverse_kl_reg=False, 133 | reverse_kl_reg_factor=1e-3, 134 | use_replay_pool=True, 135 | replay_pool_size=100000, 136 | min_pool_size=500, 137 | n_updates_per_sample=500, 138 | pool_batch_size=10, 139 | eta_discount=1.0, 140 | n_itr_update=5, 141 | reward_alpha=0.001, 142 | kl_alpha=0.001, 143 | normalize_reward=False, 144 | kl_batch_size=1, 145 | use_kl_ratio_q=False, 146 | unn_n_hidden=[32], 147 | unn_layers_type=[1, 1], 148 | unn_learning_rate=0.001, 149 | second_order_update=False, 150 | compression=False, 151 | information_gain=True, 152 | **kwargs 153 | ): 154 | """ 155 | :param env: Environment 156 | :param policy: Policy 157 | :param baseline: Baseline 158 | :param n_itr: Number of iterations. 159 | :param start_itr: Starting iteration. 160 | :param batch_size: Number of samples per iteration. 161 | :param max_path_length: Maximum length of a single rollout. 162 | :param discount: Discount. 163 | :param gae_lambda: Lambda used for generalized advantage estimation. 164 | :param plot: Plot evaluation run after each iteration. 165 | :param pause_for_plot: Whether to pause before contiuing when plotting. 166 | :param whole_paths: Make sure that the samples contain whole trajectories, even if the actual batch size is 167 | slightly larger than the specified batch_size. 168 | :param center_adv: Whether to rescale the advantages so that they have mean 0 and standard deviation 1. 169 | :param positive_adv: Whether to shift the advantages so that they are always positive. When used in 170 | conjunction with center_adv the advantages will be standardized before shifting. 171 | :param store_paths: Whether to save all paths data to the snapshot. 
172 | :return: 173 | """ 174 | self.env = env 175 | self.policy = policy 176 | self.baseline = baseline 177 | self.n_itr = n_itr 178 | self.start_itr = start_itr 179 | self.batch_size = batch_size 180 | self.max_path_length = max_path_length 181 | self.discount = discount 182 | self.gae_lambda = gae_lambda 183 | self.plot = plot 184 | self.pause_for_plot = pause_for_plot 185 | self.whole_paths = whole_paths 186 | self.center_adv = center_adv 187 | self.positive_adv = positive_adv 188 | self.store_paths = store_paths 189 | 190 | # Set exploration params 191 | # ---------------------- 192 | self.eta = eta 193 | self.snn_n_samples = snn_n_samples 194 | self.prior_sd = prior_sd 195 | self.use_kl_ratio = use_kl_ratio 196 | self.kl_q_len = kl_q_len 197 | self.use_reverse_kl_reg = use_reverse_kl_reg 198 | self.reverse_kl_reg_factor = reverse_kl_reg_factor 199 | self.use_replay_pool = use_replay_pool 200 | self.replay_pool_size = replay_pool_size 201 | self.min_pool_size = min_pool_size 202 | self.n_updates_per_sample = n_updates_per_sample 203 | self.pool_batch_size = pool_batch_size 204 | self.eta_discount = eta_discount 205 | self.n_itr_update = n_itr_update 206 | self.reward_alpha = reward_alpha 207 | self.kl_alpha = kl_alpha 208 | self.normalize_reward = normalize_reward 209 | self.kl_batch_size = kl_batch_size 210 | self.use_kl_ratio_q = use_kl_ratio_q 211 | self.unn_n_hidden = unn_n_hidden 212 | self.unn_layers_type = unn_layers_type 213 | self.unn_learning_rate = unn_learning_rate 214 | self.second_order_update = second_order_update 215 | self.compression = compression 216 | self.information_gain = information_gain 217 | # ---------------------- 218 | 219 | if self.second_order_update: 220 | assert self.kl_batch_size == 1 221 | assert self.n_itr_update == 1 222 | 223 | # Params to keep track of moving average (both intrinsic and external 224 | # reward) mean/var. 225 | if self.normalize_reward: 226 | self._reward_mean = deque(maxlen=self.kl_q_len) 227 | self._reward_std = deque(maxlen=self.kl_q_len) 228 | if self.use_kl_ratio: 229 | self._kl_mean = deque(maxlen=self.kl_q_len) 230 | self._kl_std = deque(maxlen=self.kl_q_len) 231 | 232 | if self.use_kl_ratio_q: 233 | # Add Queue here to keep track of N last kl values, compute average 234 | # over them and divide current kl values by it. This counters the 235 | # exploding kl value problem. 236 | self.kl_previous = deque(maxlen=self.kl_q_len) 237 | 238 | def start_worker(self): 239 | parallel_sampler.populate_task(self.env, self.policy, self.bnn) 240 | if self.plot: 241 | plotter.init_plot(self.env, self.policy) 242 | 243 | def shutdown_worker(self): 244 | pass 245 | 246 | def train(self): 247 | 248 | # Bayesian neural network (BNN) initialization. 249 | # ------------------------------------------------ 250 | batch_size = 1 # Redundant 251 | n_batches = 5 # Hardcode or annealing scheme \pi_i. 252 | 253 | # MDP observation and action dimensions. 
254 | obs_dim = np.prod(self.env.observation_space.shape) 255 | act_dim = np.prod(self.env.action_space.shape) 256 | 257 | logger.log("Building BNN model (eta={}) ...".format(self.eta)) 258 | start_time = time.time() 259 | 260 | self.bnn = bnn.BNN( 261 | n_in=(obs_dim + act_dim), 262 | n_hidden=self.unn_n_hidden, 263 | n_out=obs_dim, 264 | n_batches=n_batches, 265 | layers_type=self.unn_layers_type, 266 | trans_func=lasagne.nonlinearities.rectify, 267 | out_func=lasagne.nonlinearities.linear, 268 | batch_size=batch_size, 269 | n_samples=self.snn_n_samples, 270 | prior_sd=self.prior_sd, 271 | use_reverse_kl_reg=self.use_reverse_kl_reg, 272 | reverse_kl_reg_factor=self.reverse_kl_reg_factor, 273 | # stochastic_output=self.stochastic_output, 274 | second_order_update=self.second_order_update, 275 | learning_rate=self.unn_learning_rate, 276 | compression=self.compression, 277 | information_gain=self.information_gain 278 | ) 279 | 280 | logger.log( 281 | "Model built ({:.1f} sec).".format((time.time() - start_time))) 282 | 283 | if self.use_replay_pool: 284 | self.pool = SimpleReplayPool( 285 | max_pool_size=self.replay_pool_size, 286 | observation_shape=self.env.observation_space.shape, 287 | action_dim=act_dim 288 | ) 289 | # ------------------------------------------------ 290 | 291 | self.start_worker() 292 | self.init_opt() 293 | episode_rewards = [] 294 | episode_lengths = [] 295 | for itr in xrange(self.start_itr, self.n_itr): 296 | logger.push_prefix('itr #%d | ' % itr) 297 | 298 | paths = self.obtain_samples(itr) 299 | samples_data = self.process_samples(itr, paths) 300 | 301 | # Exploration code 302 | # ---------------- 303 | if self.use_replay_pool: 304 | # Fill replay pool. 305 | logger.log("Fitting dynamics model using replay pool ...") 306 | for path in samples_data['paths']: 307 | path_len = len(path['rewards']) 308 | for i in xrange(path_len): 309 | obs = path['observations'][i] 310 | act = path['actions'][i] 311 | rew = path['rewards'][i] 312 | term = (i == path_len - 1) 313 | self.pool.add_sample(obs, act, rew, term) 314 | 315 | # Now we train the dynamics model using the replay self.pool; only 316 | # if self.pool is large enough. 317 | if self.pool.size >= self.min_pool_size: 318 | obs_mean, obs_std, act_mean, act_std = self.pool.mean_obs_act() 319 | _inputss = [] 320 | _targetss = [] 321 | for _ in xrange(self.n_updates_per_sample): 322 | batch = self.pool.random_batch( 323 | self.pool_batch_size) 324 | obs = (batch['observations'] - obs_mean) / \ 325 | (obs_std + 1e-8) 326 | next_obs = ( 327 | batch['next_observations'] - obs_mean) / (obs_std + 1e-8) 328 | act = (batch['actions'] - act_mean) / \ 329 | (act_std + 1e-8) 330 | _inputs = np.hstack( 331 | [obs, act]) 332 | _targets = next_obs 333 | _inputss.append(_inputs) 334 | _targetss.append(_targets) 335 | 336 | old_acc = 0. 337 | for _inputs, _targets in zip(_inputss, _targetss): 338 | _out = self.bnn.pred_fn(_inputs) 339 | old_acc += np.mean(np.square(_out - _targets)) 340 | old_acc /= len(_inputss) 341 | 342 | for _inputs, _targets in zip(_inputss, _targetss): 343 | self.bnn.train_fn(_inputs, _targets) 344 | 345 | new_acc = 0. 
346 | for _inputs, _targets in zip(_inputss, _targetss): 347 | _out = self.bnn.pred_fn(_inputs) 348 | new_acc += np.mean(np.square(_out - _targets)) 349 | new_acc /= len(_inputss) 350 | 351 | logger.record_tabular( 352 | 'BNN_DynModelSqLossBefore', old_acc) 353 | logger.record_tabular( 354 | 'BNN_DynModelSqLossAfter', new_acc) 355 | # ---------------- 356 | 357 | self.env.log_diagnostics(paths) 358 | self.policy.log_diagnostics(paths) 359 | self.baseline.log_diagnostics(paths) 360 | self.optimize_policy(itr, samples_data) 361 | logger.log("saving snapshot...") 362 | params = self.get_itr_snapshot(itr, samples_data) 363 | paths = samples_data["paths"] 364 | if self.store_paths: 365 | params["paths"] = paths 366 | episode_rewards.extend(sum(p["rewards"]) for p in paths) 367 | episode_lengths.extend(len(p["rewards"]) for p in paths) 368 | params["episode_rewards"] = np.array(episode_rewards) 369 | params["episode_lengths"] = np.array(episode_lengths) 370 | params["algo"] = self 371 | logger.save_itr_params(itr, params) 372 | logger.log("saved") 373 | logger.dump_tabular(with_prefix=False) 374 | logger.pop_prefix() 375 | if self.plot: 376 | self.update_plot() 377 | if self.pause_for_plot: 378 | raw_input("Plotting evaluation run: Press Enter to " 379 | "continue...") 380 | 381 | self.shutdown_worker() 382 | 383 | def init_opt(self): 384 | """ 385 | Initialize the optimization procedure. If using theano / cgt, this may 386 | include declaring all the variables and compiling functions 387 | """ 388 | raise NotImplementedError 389 | 390 | def get_itr_snapshot(self, itr, samples_data): 391 | """ 392 | Returns all the data that should be saved in the snapshot for this 393 | iteration. 394 | """ 395 | raise NotImplementedError 396 | 397 | def optimize_policy(self, itr, samples_data): 398 | raise NotImplementedError 399 | 400 | def update_plot(self): 401 | if self.plot: 402 | plotter.update_plot(self.policy, self.max_path_length) 403 | 404 | def obtain_samples(self, itr): 405 | cur_params = self.policy.get_param_values() 406 | cur_dynamics_params = self.bnn.get_param_values() 407 | 408 | reward_mean = None 409 | reward_std = None 410 | if self.normalize_reward: 411 | # Compute running mean/std. 412 | reward_mean = np.mean(np.asarray(self._reward_mean)) 413 | reward_std = np.mean(np.asarray(self._reward_std)) 414 | 415 | # Mean/std obs/act based on replay pool. 416 | obs_mean, obs_std, act_mean, act_std = self.pool.mean_obs_act() 417 | 418 | paths = parallel_sampler.sample_paths( 419 | policy_params=cur_params, 420 | dynamics_params=cur_dynamics_params, 421 | max_samples=self.batch_size, 422 | max_path_length=self.max_path_length, 423 | itr=itr, 424 | normalize_reward=self.normalize_reward, 425 | reward_mean=reward_mean, 426 | reward_std=reward_std, 427 | kl_batch_size=self.kl_batch_size, 428 | n_itr_update=self.n_itr_update, 429 | use_replay_pool=self.use_replay_pool, 430 | obs_mean=obs_mean, 431 | obs_std=obs_std, 432 | act_mean=act_mean, 433 | act_std=act_std, 434 | second_order_update=self.second_order_update 435 | ) 436 | if self.whole_paths: 437 | return paths 438 | else: 439 | paths_truncated = parallel_sampler.truncate_paths( 440 | paths, self.batch_size) 441 | return paths_truncated 442 | 443 | def process_samples(self, itr, paths): 444 | 445 | if self.normalize_reward: 446 | # Update reward mean/std Q. 
447 | rewards = [] 448 | for i in xrange(len(paths)): 449 | rewards.append(paths[i]['rewards']) 450 | rewards_flat = np.hstack(rewards) 451 | self._reward_mean.append(np.mean(rewards_flat)) 452 | self._reward_std.append(np.std(rewards_flat)) 453 | 454 | # Normalize rewards. 455 | reward_mean = np.mean(np.asarray(self._reward_mean)) 456 | reward_std = np.mean(np.asarray(self._reward_std)) 457 | for i in xrange(len(paths)): 458 | paths[i]['rewards'] = ( 459 | paths[i]['rewards'] - reward_mean) / (reward_std + 1e-8) 460 | 461 | if itr > 0: 462 | kls = [] 463 | for i in xrange(len(paths)): 464 | kls.append(paths[i]['KL']) 465 | 466 | kls_flat = np.hstack(kls) 467 | 468 | logger.record_tabular('Expl_MeanKL', np.mean(kls_flat)) 469 | logger.record_tabular('Expl_StdKL', np.std(kls_flat)) 470 | logger.record_tabular('Expl_MinKL', np.min(kls_flat)) 471 | logger.record_tabular('Expl_MaxKL', np.max(kls_flat)) 472 | 473 | # Perform normlization of the intrinsic rewards. 474 | if self.use_kl_ratio: 475 | if self.use_kl_ratio_q: 476 | # Update kl Q 477 | self.kl_previous.append(np.median(np.hstack(kls))) 478 | previous_mean_kl = np.mean(np.asarray(self.kl_previous)) 479 | for i in xrange(len(kls)): 480 | kls[i] = kls[i] / previous_mean_kl 481 | 482 | # Add KL ass intrinsic reward to external reward 483 | for i in xrange(len(paths)): 484 | paths[i]['rewards'] = paths[i]['rewards'] + self.eta * kls[i] 485 | 486 | # Discount eta 487 | self.eta *= self.eta_discount 488 | 489 | else: 490 | logger.record_tabular('Expl_MeanKL', 0.) 491 | logger.record_tabular('Expl_StdKL', 0.) 492 | logger.record_tabular('Expl_MinKL', 0.) 493 | logger.record_tabular('Expl_MaxKL', 0.) 494 | 495 | baselines = [] 496 | returns = [] 497 | for path in paths: 498 | path_baselines = np.append(self.baseline.predict(path), 0) 499 | deltas = path["rewards"] + \ 500 | self.discount * path_baselines[1:] - \ 501 | path_baselines[:-1] 502 | path["advantages"] = special.discount_cumsum( 503 | deltas, self.discount * self.gae_lambda) 504 | path["returns"] = special.discount_cumsum( 505 | path["rewards_orig"], self.discount) 506 | baselines.append(path_baselines[:-1]) 507 | returns.append(path["returns"]) 508 | 509 | if not self.policy.recurrent: 510 | observations = tensor_utils.concat_tensor_list( 511 | [path["observations"] for path in paths]) 512 | actions = tensor_utils.concat_tensor_list( 513 | [path["actions"] for path in paths]) 514 | rewards = tensor_utils.concat_tensor_list( 515 | [path["rewards"] for path in paths]) 516 | advantages = tensor_utils.concat_tensor_list( 517 | [path["advantages"] for path in paths]) 518 | env_infos = tensor_utils.concat_tensor_dict_list( 519 | [path["env_infos"] for path in paths]) 520 | agent_infos = tensor_utils.concat_tensor_dict_list( 521 | [path["agent_infos"] for path in paths]) 522 | 523 | if self.center_adv: 524 | advantages = util.center_advantages(advantages) 525 | 526 | if self.positive_adv: 527 | advantages = util.shift_advantages_to_positive(advantages) 528 | 529 | average_discounted_return = \ 530 | np.mean([path["returns"][0] for path in paths]) 531 | 532 | undiscounted_returns = [ 533 | sum(path["rewards_orig"]) for path in paths] 534 | 535 | ent = np.mean(self.policy.distribution.entropy(agent_infos)) 536 | 537 | ev = special.explained_variance_1d( 538 | np.concatenate(baselines), 539 | np.concatenate(returns) 540 | ) 541 | 542 | samples_data = dict( 543 | observations=observations, 544 | actions=actions, 545 | rewards=rewards, 546 | advantages=advantages, 547 | env_infos=env_infos, 548 
| agent_infos=agent_infos, 549 | paths=paths, 550 | ) 551 | else: 552 | max_path_length = max([len(path["advantages"]) for path in paths]) 553 | 554 | # make all paths the same length (pad extra advantages with 0) 555 | obs = [path["observations"] for path in paths] 556 | obs = np.array( 557 | [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs]) 558 | 559 | if self.center_adv: 560 | raw_adv = np.concatenate( 561 | [path["advantages"] for path in paths]) 562 | adv_mean = np.mean(raw_adv) 563 | adv_std = np.std(raw_adv) + 1e-8 564 | adv = [ 565 | (path["advantages"] - adv_mean) / adv_std for path in paths] 566 | else: 567 | adv = [path["advantages"] for path in paths] 568 | 569 | adv = np.array( 570 | [tensor_utils.pad_tensor(a, max_path_length) for a in adv]) 571 | 572 | actions = [path["actions"] for path in paths] 573 | actions = np.array( 574 | [tensor_utils.pad_tensor(a, max_path_length) for a in actions]) 575 | 576 | rewards = [path["rewards"] for path in paths] 577 | rewards = np.array( 578 | [tensor_utils.pad_tensor(r, max_path_length) for r in rewards]) 579 | 580 | agent_infos = [path["agent_infos"] for path in paths] 581 | agent_infos = tensor_utils.stack_tensor_dict_list( 582 | [tensor_utils.pad_tensor_dict( 583 | p, max_path_length) for p in agent_infos] 584 | ) 585 | 586 | env_infos = [path["env_infos"] for path in paths] 587 | env_infos = tensor_utils.stack_tensor_dict_list( 588 | [tensor_utils.pad_tensor_dict( 589 | p, max_path_length) for p in env_infos] 590 | ) 591 | 592 | valids = [np.ones_like(path["returns"]) for path in paths] 593 | valids = np.array( 594 | [tensor_utils.pad_tensor(v, max_path_length) for v in valids]) 595 | 596 | average_discounted_return = \ 597 | np.mean([path["returns"][0] for path in paths]) 598 | 599 | undiscounted_returns = [sum(path["rewards"]) for path in paths] 600 | 601 | ent = np.mean(self.policy.distribution.entropy(agent_infos)) 602 | 603 | ev = special.explained_variance_1d( 604 | np.concatenate(baselines), 605 | np.concatenate(returns) 606 | ) 607 | 608 | samples_data = dict( 609 | observations=obs, 610 | actions=actions, 611 | advantages=adv, 612 | rewards=rewards, 613 | valids=valids, 614 | agent_infos=agent_infos, 615 | env_infos=env_infos, 616 | paths=paths, 617 | ) 618 | 619 | logger.log("fitting baseline...") 620 | self.baseline.fit(paths) 621 | logger.log("fitted") 622 | 623 | logger.record_tabular('Iteration', itr) 624 | logger.record_tabular('AverageDiscountedReturn', 625 | average_discounted_return) 626 | logger.record_tabular('AverageReturn', np.mean(undiscounted_returns)) 627 | logger.record_tabular('ExplainedVariance', ev) 628 | logger.record_tabular('NumTrajs', len(paths)) 629 | logger.record_tabular('Entropy', ent) 630 | logger.record_tabular('Perplexity', np.exp(ent)) 631 | logger.record_tabular('StdReturn', np.std(undiscounted_returns)) 632 | logger.record_tabular('MaxReturn', np.max(undiscounted_returns)) 633 | logger.record_tabular('MinReturn', np.min(undiscounted_returns)) 634 | 635 | return samples_data 636 | -------------------------------------------------------------------------------- /algos/erwr_expl.py: -------------------------------------------------------------------------------- 1 | from sandbox.vime.algos.vpg_expl import VPG 2 | from rllab.optimizers.lbfgs_optimizer import LbfgsOptimizer 3 | from rllab.core.serializable import Serializable 4 | 5 | 6 | class ERWR(VPG, Serializable): 7 | """ 8 | Episodic Reward Weighted Regression [1]_ 9 | 10 | Notes 11 | ----- 12 | This does not implement the 
original RwR [2]_ that deals with "immediate reward problems" since 13 | it doesn't find solutions that optimize for temporally delayed rewards. 14 | 15 | .. [1] Kober, Jens, and Jan R. Peters. "Policy search for motor primitives in robotics." Advances in neural information processing systems. 2009. 16 | .. [2] Peters, Jan, and Stefan Schaal. "Using reward-weighted regression for reinforcement learning of task space control." Approximate Dynamic Programming and Reinforcement Learning, 2007. ADPRL 2007. IEEE International Symposium on. IEEE, 2007. 17 | """ 18 | 19 | def __init__( 20 | self, 21 | optimizer=None, 22 | optimizer_args=None, 23 | positive_adv=None, 24 | **kwargs): 25 | Serializable.quick_init(self, locals()) 26 | if optimizer is None: 27 | if optimizer_args is None: 28 | optimizer_args = dict() 29 | optimizer = LbfgsOptimizer(**optimizer_args) 30 | super(ERWR, self).__init__( 31 | optimizer=optimizer, 32 | positive_adv=True if positive_adv is None else positive_adv, 33 | **kwargs 34 | ) 35 | -------------------------------------------------------------------------------- /algos/npo_expl.py: -------------------------------------------------------------------------------- 1 | from rllab.misc import ext 2 | from rllab.misc.overrides import overrides 3 | from sandbox.vime.algos.batch_polopt_expl import BatchPolopt 4 | import rllab.misc.logger as logger 5 | import theano 6 | import theano.tensor as TT 7 | from rllab.optimizers.penalty_lbfgs_optimizer import PenaltyLbfgsOptimizer 8 | 9 | 10 | class NPO(BatchPolopt): 11 | """ 12 | Natural Policy Optimization. 13 | """ 14 | 15 | def __init__( 16 | self, 17 | optimizer=None, 18 | optimizer_args=None, 19 | step_size=0.01, 20 | **kwargs): 21 | if optimizer is None: 22 | if optimizer_args is None: 23 | optimizer_args = dict() 24 | optimizer = PenaltyLbfgsOptimizer(**optimizer_args) 25 | self.optimizer = optimizer 26 | self.step_size = step_size 27 | super(NPO, self).__init__(**kwargs) 28 | 29 | @overrides 30 | def init_opt(self): 31 | is_recurrent = int(self.policy.recurrent) 32 | obs_var = self.env.observation_space.new_tensor_variable( 33 | 'obs', 34 | extra_dims=1 + is_recurrent, 35 | ) 36 | action_var = self.env.action_space.new_tensor_variable( 37 | 'action', 38 | extra_dims=1 + is_recurrent, 39 | ) 40 | advantage_var = ext.new_tensor( 41 | 'advantage', 42 | ndim=1 + is_recurrent, 43 | dtype=theano.config.floatX 44 | ) 45 | dist = self.policy.distribution 46 | old_dist_info_vars = { 47 | k: ext.new_tensor( 48 | 'old_%s' % k, 49 | ndim=2 + is_recurrent, 50 | dtype=theano.config.floatX 51 | ) for k in dist.dist_info_keys 52 | } 53 | old_dist_info_vars_list = [old_dist_info_vars[k] 54 | for k in dist.dist_info_keys] 55 | 56 | if is_recurrent: 57 | valid_var = TT.matrix('valid') 58 | else: 59 | valid_var = None 60 | 61 | dist_info_vars = self.policy.dist_info_sym(obs_var, action_var) 62 | kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) 63 | lr = dist.likelihood_ratio_sym( 64 | action_var, old_dist_info_vars, dist_info_vars) 65 | if is_recurrent: 66 | mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var) 67 | surr_loss = - \ 68 | TT.sum(lr * advantage_var * valid_var) / TT.sum(valid_var) 69 | else: 70 | mean_kl = TT.mean(kl) 71 | surr_loss = - TT.mean(lr * advantage_var) 72 | 73 | input_list = [ 74 | obs_var, 75 | action_var, 76 | advantage_var, 77 | ] + old_dist_info_vars_list 78 | if is_recurrent: 79 | input_list.append(valid_var) 80 | 81 | self.optimizer.update_opt( 82 | loss=surr_loss, 83 | target=self.policy, 84 | 
leq_constraint=(mean_kl, self.step_size), 85 | inputs=input_list, 86 | constraint_name="mean_kl" 87 | ) 88 | return dict() 89 | 90 | @overrides 91 | def optimize_policy(self, itr, samples_data): 92 | all_input_values = tuple(ext.extract( 93 | samples_data, 94 | "observations", "actions", "advantages" 95 | )) 96 | agent_infos = samples_data["agent_infos"] 97 | info_list = [agent_infos[k] 98 | for k in self.policy.distribution.dist_info_keys] 99 | all_input_values += tuple(info_list) 100 | if self.policy.recurrent: 101 | all_input_values += (samples_data["valids"],) 102 | loss_before = self.optimizer.loss(all_input_values) 103 | self.optimizer.optimize(all_input_values) 104 | mean_kl = self.optimizer.constraint_val(all_input_values) 105 | loss_after = self.optimizer.loss(all_input_values) 106 | logger.record_tabular('LossAfter', loss_after) 107 | logger.record_tabular('MeanKL', mean_kl) 108 | logger.record_tabular('dLoss', loss_before - loss_after) 109 | return dict() 110 | 111 | @overrides 112 | def get_itr_snapshot(self, itr, samples_data): 113 | return dict( 114 | itr=itr, 115 | policy=self.policy, 116 | baseline=self.baseline, 117 | env=self.env, 118 | ) 119 | -------------------------------------------------------------------------------- /algos/trpo_expl.py: -------------------------------------------------------------------------------- 1 | from sandbox.vime.algos.npo_expl import NPO 2 | from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 3 | from rllab.core.serializable import Serializable 4 | 5 | 6 | class TRPO(NPO, Serializable): 7 | """ 8 | Trust Region Policy Optimization 9 | """ 10 | 11 | def __init__( 12 | self, 13 | optimizer=None, 14 | optimizer_args=None, 15 | **kwargs): 16 | Serializable.quick_init(self, locals()) 17 | if optimizer is None: 18 | if optimizer_args is None: 19 | optimizer_args = dict() 20 | optimizer = ConjugateGradientOptimizer(**optimizer_args) 21 | super(TRPO, self).__init__(optimizer=optimizer, **kwargs) 22 | -------------------------------------------------------------------------------- /algos/vpg_expl.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as TT 2 | import theano 3 | from rllab.misc import logger 4 | from rllab.misc.overrides import overrides 5 | from rllab.misc import ext 6 | from sandbox.vime.algos.batch_polopt_expl import BatchPolopt 7 | from rllab.optimizers.first_order_optimizer import FirstOrderOptimizer 8 | from rllab.core.serializable import Serializable 9 | 10 | 11 | class VPG(BatchPolopt, Serializable): 12 | """ 13 | Vanilla Policy Gradient. 
14 | """ 15 | 16 | def __init__( 17 | self, 18 | env, 19 | policy, 20 | baseline, 21 | optimizer=None, 22 | optimizer_args=None, 23 | **kwargs): 24 | Serializable.quick_init(self, locals()) 25 | if optimizer is None: 26 | default_args = dict( 27 | batch_size=None, 28 | max_epochs=1, 29 | ) 30 | if optimizer_args is None: 31 | optimizer_args = default_args 32 | else: 33 | optimizer_args = dict(default_args, **optimizer_args) 34 | optimizer = FirstOrderOptimizer(**optimizer_args) 35 | self.optimizer = optimizer 36 | self.opt_info = None 37 | super(VPG, self).__init__(env=env, policy=policy, baseline=baseline, **kwargs) 38 | 39 | @overrides 40 | def init_opt(self): 41 | is_recurrent = int(self.policy.recurrent) 42 | 43 | obs_var = self.env.observation_space.new_tensor_variable( 44 | 'obs', 45 | extra_dims=1 + is_recurrent, 46 | ) 47 | action_var = self.env.action_space.new_tensor_variable( 48 | 'action', 49 | extra_dims=1 + is_recurrent, 50 | ) 51 | advantage_var = ext.new_tensor( 52 | 'advantage', 53 | ndim=1 + is_recurrent, 54 | dtype=theano.config.floatX 55 | ) 56 | dist = self.policy.distribution 57 | old_dist_info_vars = { 58 | k: ext.new_tensor( 59 | 'old_%s' % k, 60 | ndim=2 + is_recurrent, 61 | dtype=theano.config.floatX 62 | ) for k in dist.dist_info_keys 63 | } 64 | old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys] 65 | 66 | if is_recurrent: 67 | valid_var = TT.matrix('valid') 68 | else: 69 | valid_var = None 70 | 71 | dist_info_vars = self.policy.dist_info_sym(obs_var, action_var) 72 | logli = dist.log_likelihood_sym(action_var, dist_info_vars) 73 | kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) 74 | 75 | # formulate as a minimization problem 76 | # The gradient of the surrogate objective is the policy gradient 77 | if is_recurrent: 78 | surr_obj = - TT.sum(logli * advantage_var * valid_var) / TT.sum(valid_var) 79 | mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var) 80 | max_kl = TT.max(kl * valid_var) 81 | else: 82 | surr_obj = - TT.mean(logli * advantage_var) 83 | mean_kl = TT.mean(kl) 84 | max_kl = TT.max(kl) 85 | 86 | input_list = [obs_var, action_var, advantage_var] 87 | if is_recurrent: 88 | input_list.append(valid_var) 89 | 90 | self.optimizer.update_opt(surr_obj, target=self.policy, inputs=input_list) 91 | 92 | f_kl = ext.compile_function( 93 | inputs=input_list + old_dist_info_vars_list, 94 | outputs=[mean_kl, max_kl], 95 | ) 96 | self.opt_info = dict( 97 | f_kl=f_kl, 98 | ) 99 | 100 | @overrides 101 | def optimize_policy(self, itr, samples_data): 102 | logger.log("optimizing policy") 103 | inputs = ext.extract( 104 | samples_data, 105 | "observations", "actions", "advantages" 106 | ) 107 | if self.policy.recurrent: 108 | inputs += (samples_data["valids"],) 109 | agent_infos = samples_data["agent_infos"] 110 | dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] 111 | loss_before = self.optimizer.loss(inputs) 112 | self.optimizer.optimize(inputs) 113 | loss_after = self.optimizer.loss(inputs) 114 | logger.record_tabular("LossBefore", loss_before) 115 | logger.record_tabular("LossAfter", loss_after) 116 | 117 | mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list)) 118 | logger.record_tabular('MeanKL', mean_kl) 119 | logger.record_tabular('MaxKL', max_kl) 120 | 121 | @overrides 122 | def get_itr_snapshot(self, itr, samples_data): 123 | return dict( 124 | itr=itr, 125 | policy=self.policy, 126 | baseline=self.baseline, 127 | env=self.env, 128 | ) 129 | 
-------------------------------------------------------------------------------- /dynamics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/vime/ad6ca976ef07e8c1fd4c0a353716bbd0b52539fb/dynamics/__init__.py -------------------------------------------------------------------------------- /dynamics/bnn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | import theano.tensor as T 4 | import lasagne 5 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 6 | from rllab.core.lasagne_powered import LasagnePowered 7 | from rllab.core.serializable import Serializable 8 | from rllab.misc import ext 9 | from collections import OrderedDict 10 | import theano 11 | 12 | # ---------------- 13 | BNN_LAYER_TAG = 'BNNLayer' 14 | USE_REPARAMETRIZATION_TRICK = True 15 | # ---------------- 16 | 17 | 18 | class BNNLayer(lasagne.layers.Layer): 19 | """Probabilistic layer that uses Gaussian weights. 20 | 21 | Each weight has two parameters: mean and standard deviation (std). 22 | """ 23 | 24 | def __init__(self, 25 | incoming, 26 | num_units, 27 | nonlinearity=lasagne.nonlinearities.rectify, 28 | prior_sd=None, 29 | **kwargs): 30 | super(BNNLayer, self).__init__(incoming, **kwargs) 31 | 32 | self._srng = RandomStreams() 33 | 34 | # Set vars. 35 | self.nonlinearity = nonlinearity 36 | self.num_inputs = int(np.prod(self.input_shape[1:])) 37 | self.num_units = num_units 38 | self.prior_sd = prior_sd 39 | 40 | prior_rho = self.std_to_log(self.prior_sd) 41 | 42 | self.W = np.random.normal(0., prior_sd, 43 | (self.num_inputs, self.num_units)) # @UndefinedVariable 44 | self.b = np.zeros( 45 | (self.num_units,), 46 | dtype=theano.config.floatX) # @UndefinedVariable 47 | 48 | # Here we set the priors. 49 | # ----------------------- 50 | self.mu = self.add_param( 51 | lasagne.init.Normal(1., 0.), 52 | (self.num_inputs, self.num_units), 53 | name='mu' 54 | ) 55 | self.rho = self.add_param( 56 | lasagne.init.Constant(prior_rho), 57 | (self.num_inputs, self.num_units), 58 | name='rho' 59 | ) 60 | # Bias priors. 61 | self.b_mu = self.add_param( 62 | lasagne.init.Normal(1., 0.), 63 | (self.num_units,), 64 | name="b_mu", 65 | regularizable=False 66 | ) 67 | self.b_rho = self.add_param( 68 | lasagne.init.Constant(prior_rho), 69 | (self.num_units,), 70 | name="b_rho", 71 | regularizable=False 72 | ) 73 | # ----------------------- 74 | 75 | # Backup params for KL calculations. 76 | self.mu_old = self.add_param( 77 | np.zeros((self.num_inputs, self.num_units)), 78 | (self.num_inputs, self.num_units), 79 | name='mu_old', 80 | trainable=False, 81 | oldparam=True 82 | ) 83 | self.rho_old = self.add_param( 84 | np.ones((self.num_inputs, self.num_units)), 85 | (self.num_inputs, self.num_units), 86 | name='rho_old', 87 | trainable=False, 88 | oldparam=True 89 | ) 90 | # Bias priors. 
91 | self.b_mu_old = self.add_param( 92 | np.zeros((self.num_units,)), 93 | (self.num_units,), 94 | name="b_mu_old", 95 | regularizable=False, 96 | trainable=False, 97 | oldparam=True 98 | ) 99 | self.b_rho_old = self.add_param( 100 | np.ones((self.num_units,)), 101 | (self.num_units,), 102 | name="b_rho_old", 103 | regularizable=False, 104 | trainable=False, 105 | oldparam=True 106 | ) 107 | 108 | def log_to_std(self, rho): 109 | """Transformation for allowing rho in \mathbb{R}, rather than \mathbb{R}_+ 110 | 111 | This makes sure that we don't get negative stds. However, a downside might be 112 | that we have little gradient on close to 0 std (= -inf using this transformation). 113 | """ 114 | return T.log(1 + T.exp(rho)) 115 | 116 | def std_to_log(self, sigma): 117 | """Reverse log_to_std transformation.""" 118 | return np.log(np.exp(sigma) - 1) 119 | 120 | def get_W(self): 121 | # Here we generate random epsilon values from a normal distribution 122 | epsilon = self._srng.normal(size=(self.num_inputs, self.num_units), avg=0., std=1., 123 | dtype=theano.config.floatX) # @UndefinedVariable 124 | # Here we calculate weights based on shifting and rescaling according 125 | # to mean and variance (paper step 2) 126 | W = self.mu + self.log_to_std(self.rho) * epsilon 127 | self.W = W 128 | return W 129 | 130 | def get_b(self): 131 | # Here we generate random epsilon values from a normal distribution 132 | epsilon = self._srng.normal(size=(self.num_units,), avg=0., std=1., 133 | dtype=theano.config.floatX) # @UndefinedVariable 134 | b = self.b_mu + self.log_to_std(self.b_rho) * epsilon 135 | self.b = b 136 | return b 137 | 138 | def get_output_for_reparametrization(self, input, **kwargs): 139 | """Implementation of the local reparametrization trick. 140 | 141 | This essentially leads to a speedup compared to the naive implementation case. 142 | Furthermore, it leads to gradients with less variance. 143 | 144 | References 145 | ---------- 146 | Kingma et al., "Variational Dropout and the Local Reparametrization Trick", 2015 147 | """ 148 | if input.ndim > 2: 149 | # if the input has more than two dimensions, flatten it into a 150 | # batch of feature vectors. 
151 | input = input.flatten(2) 152 | 153 | gamma = T.dot(input, self.mu) + self.b_mu.dimshuffle('x', 0) 154 | delta = T.dot(T.square(input), T.square(self.log_to_std( 155 | self.rho))) + T.square(self.log_to_std(self.b_rho)).dimshuffle('x', 0) 156 | epsilon = self._srng.normal(size=(self.num_units,), avg=0., std=1., 157 | dtype=theano.config.floatX) # @UndefinedVariable 158 | 159 | activation = gamma + T.sqrt(delta) * epsilon 160 | 161 | return self.nonlinearity(activation) 162 | 163 | def save_old_params(self): 164 | """Save old parameter values for KL calculation.""" 165 | self.mu_old.set_value(self.mu.get_value()) 166 | self.rho_old.set_value(self.rho.get_value()) 167 | self.b_mu_old.set_value(self.b_mu.get_value()) 168 | self.b_rho_old.set_value(self.b_rho.get_value()) 169 | 170 | def reset_to_old_params(self): 171 | """Reset to old parameter values for KL calculation.""" 172 | self.mu.set_value(self.mu_old.get_value()) 173 | self.rho.set_value(self.rho_old.get_value()) 174 | self.b_mu.set_value(self.b_mu_old.get_value()) 175 | self.b_rho.set_value(self.b_rho_old.get_value()) 176 | 177 | def kl_div_p_q(self, p_mean, p_std, q_mean, q_std): 178 | """KL divergence D_{KL}[p(x)||q(x)] for a fully factorized Gaussian""" 179 | numerator = T.square(p_mean - q_mean) + \ 180 | T.square(p_std) - T.square(q_std) 181 | denominator = 2 * T.square(q_std) + 1e-8 182 | return T.sum( 183 | numerator / denominator + T.log(q_std) - T.log(p_std)) 184 | 185 | def kl_div_new_old(self): 186 | kl_div = self.kl_div_p_q( 187 | self.mu, self.log_to_std(self.rho), self.mu_old, self.log_to_std(self.rho_old)) 188 | kl_div += self.kl_div_p_q(self.b_mu, self.log_to_std(self.b_rho), 189 | self.b_mu_old, self.log_to_std(self.b_rho_old)) 190 | return kl_div 191 | 192 | def kl_div_old_new(self): 193 | kl_div = self.kl_div_p_q( 194 | self.mu_old, self.log_to_std(self.rho_old), self.mu, self.log_to_std(self.rho)) 195 | kl_div += self.kl_div_p_q(self.b_mu_old, 196 | self.log_to_std(self.b_rho_old), self.b_mu, self.log_to_std(self.b_rho)) 197 | return kl_div 198 | 199 | def kl_div_new_prior(self): 200 | kl_div = self.kl_div_p_q( 201 | self.mu, self.log_to_std(self.rho), 0., self.prior_sd) 202 | kl_div += self.kl_div_p_q(self.b_mu, 203 | self.log_to_std(self.b_rho), 0., self.prior_sd) 204 | return kl_div 205 | 206 | def kl_div_old_prior(self): 207 | kl_div = self.kl_div_p_q( 208 | self.mu_old, self.log_to_std(self.rho_old), 0., self.prior_sd) 209 | kl_div += self.kl_div_p_q(self.b_mu_old, 210 | self.log_to_std(self.b_rho_old), 0., self.prior_sd) 211 | return kl_div 212 | 213 | def kl_div_prior_new(self): 214 | kl_div = self.kl_div_p_q( 215 | 0., self.prior_sd, self.mu, self.log_to_std(self.rho)) 216 | kl_div += self.kl_div_p_q(0., self.prior_sd, 217 | self.b_mu, self.log_to_std(self.b_rho)) 218 | return kl_div 219 | 220 | def get_output_for(self, input, **kwargs): 221 | if USE_REPARAMETRIZATION_TRICK: 222 | return self.get_output_for_reparametrization(input, **kwargs) 223 | else: 224 | return self.get_output_for_default(input, **kwargs) 225 | 226 | def get_output_for_default(self, input, **kwargs): 227 | if input.ndim > 2: 228 | # if the input has more than two dimensions, flatten it into a 229 | # batch of feature vectors. 
230 | input = input.flatten(2) 231 | 232 | activation = T.dot(input, self.get_W()) + \ 233 | self.get_b().dimshuffle('x', 0) 234 | 235 | return self.nonlinearity(activation) 236 | 237 | def get_output_shape_for(self, input_shape): 238 | return (input_shape[0], self.num_units) 239 | 240 | 241 | class BNN(LasagnePowered, Serializable): 242 | """Bayesian neural network (BNN) based on Blundell2016.""" 243 | 244 | def __init__(self, n_in, 245 | n_hidden, 246 | n_out, 247 | layers_type, 248 | n_batches, 249 | trans_func=lasagne.nonlinearities.rectify, 250 | out_func=lasagne.nonlinearities.linear, 251 | batch_size=100, 252 | n_samples=10, 253 | prior_sd=0.5, 254 | use_reverse_kl_reg=False, 255 | reverse_kl_reg_factor=0.1, 256 | likelihood_sd=5.0, 257 | second_order_update=False, 258 | learning_rate=0.0001, 259 | compression=False, 260 | information_gain=True, 261 | ): 262 | 263 | Serializable.quick_init(self, locals()) 264 | assert len(layers_type) == len(n_hidden) + 1 265 | 266 | self.n_in = n_in 267 | self.n_hidden = n_hidden 268 | self.n_out = n_out 269 | self.batch_size = batch_size 270 | self.transf = trans_func 271 | self.outf = out_func 272 | self.n_samples = n_samples 273 | self.prior_sd = prior_sd 274 | self.layers_type = layers_type 275 | self.n_batches = n_batches 276 | self.use_reverse_kl_reg = use_reverse_kl_reg 277 | self.reverse_kl_reg_factor = reverse_kl_reg_factor 278 | self.likelihood_sd = likelihood_sd 279 | self.second_order_update = second_order_update 280 | self.learning_rate = learning_rate 281 | self.compression = compression 282 | self.information_gain = information_gain 283 | 284 | assert self.information_gain or self.compression 285 | 286 | # Build network architecture. 287 | self.build_network() 288 | 289 | # Build model might depend on this. 290 | LasagnePowered.__init__(self, [self.network]) 291 | 292 | # Compile theano functions. 293 | self.build_model() 294 | 295 | def save_old_params(self): 296 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 297 | lasagne.layers.get_all_layers(self.network)[1:]) 298 | for layer in layers: 299 | layer.save_old_params() 300 | 301 | def reset_to_old_params(self): 302 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 303 | lasagne.layers.get_all_layers(self.network)[1:]) 304 | for layer in layers: 305 | layer.reset_to_old_params() 306 | 307 | def compression_improvement(self): 308 | """KL divergence KL[old_param||new_param]""" 309 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 310 | lasagne.layers.get_all_layers(self.network)[1:]) 311 | return sum(l.kl_div_old_new() for l in layers) 312 | 313 | def inf_gain(self): 314 | """KL divergence KL[new_param||old_param]""" 315 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 316 | lasagne.layers.get_all_layers(self.network)[1:]) 317 | return sum(l.kl_div_new_old() for l in layers) 318 | 319 | def surprise(self): 320 | surpr = 0. 
321 | if self.compression: 322 | surpr += self.compression_improvement() 323 | if self.information_gain: 324 | surpr += self.inf_gain() 325 | return surpr 326 | 327 | def kl_div(self): 328 | """KL divergence KL[new_param||old_param]""" 329 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 330 | lasagne.layers.get_all_layers(self.network)[1:]) 331 | return sum(l.kl_div_new_old() for l in layers) 332 | 333 | def log_p_w_q_w_kl(self): 334 | """KL divergence KL[q_\phi(w)||p(w)]""" 335 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 336 | lasagne.layers.get_all_layers(self.network)[1:]) 337 | return sum(l.kl_div_new_prior() for l in layers) 338 | 339 | def reverse_log_p_w_q_w_kl(self): 340 | """KL divergence KL[p(w)||q_\phi(w)]""" 341 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 342 | lasagne.layers.get_all_layers(self.network)[1:]) 343 | return sum(l.kl_div_prior_new() for l in layers) 344 | 345 | def _log_prob_normal(self, input, mu=0., sigma=1.): 346 | log_normal = - \ 347 | T.log(sigma) - T.log(T.sqrt(2 * np.pi)) - \ 348 | T.square(input - mu) / (2 * T.square(sigma)) 349 | return T.sum(log_normal) 350 | 351 | def pred_sym(self, input): 352 | return lasagne.layers.get_output(self.network, input) 353 | 354 | def loss(self, input, target): 355 | 356 | # MC samples. 357 | _log_p_D_given_w = [] 358 | for _ in xrange(self.n_samples): 359 | # Make prediction. 360 | prediction = self.pred_sym(input) 361 | # Calculate model likelihood log(P(D|w)). 362 | _log_p_D_given_w.append(self._log_prob_normal( 363 | target, prediction, self.likelihood_sd)) 364 | log_p_D_given_w = sum(_log_p_D_given_w) 365 | # Calculate variational posterior log(q(w)) and prior log(p(w)). 366 | kl = self.log_p_w_q_w_kl() 367 | if self.use_reverse_kl_reg: 368 | kl += self.reverse_kl_reg_factor * \ 369 | self.reverse_log_p_w_q_w_kl() 370 | 371 | # Calculate loss function. 372 | return kl / self.n_batches - log_p_D_given_w / self.n_samples 373 | 374 | def loss_last_sample(self, input, target): 375 | """The difference with the original loss is that we only update based on the latest sample. 376 | This means that instead of using the prior p(w), we use the previous approximated posterior 377 | q(w) for the KL term in the objective function: KL[q(w)|p(w)] becomems KL[q'(w)|q(w)]. 378 | """ 379 | 380 | # MC samples. 381 | _log_p_D_given_w = [] 382 | for _ in xrange(self.n_samples): 383 | # Make prediction. 384 | prediction = self.pred_sym(input) 385 | # Calculate model likelihood log(P(sample|w)). 386 | _log_p_D_given_w.append(self._log_prob_normal( 387 | target, prediction, self.likelihood_sd)) 388 | log_p_D_given_w = sum(_log_p_D_given_w) 389 | # Calculate loss function. 390 | # self.kl_div() should be zero when taking second order step 391 | return self.kl_div() - log_p_D_given_w / self.n_samples 392 | 393 | def build_network(self): 394 | 395 | # Input layer 396 | network = lasagne.layers.InputLayer(shape=(1, self.n_in)) 397 | 398 | # Hidden layers 399 | for i in xrange(len(self.n_hidden)): 400 | # Probabilistic layer (1) or deterministic layer (0). 401 | if self.layers_type[i] == 1: 402 | network = BNNLayer( 403 | network, self.n_hidden[i], nonlinearity=self.transf, prior_sd=self.prior_sd, name=BNN_LAYER_TAG) 404 | else: 405 | network = lasagne.layers.DenseLayer( 406 | network, self.n_hidden[i], nonlinearity=self.transf) 407 | 408 | # Output layer 409 | if self.layers_type[len(self.n_hidden)] == 1: 410 | # Probabilistic layer (1) or deterministic layer (0). 
411 | network = BNNLayer( 412 | network, self.n_out, nonlinearity=self.outf, prior_sd=self.prior_sd, name=BNN_LAYER_TAG) 413 | else: 414 | network = lasagne.layers.DenseLayer( 415 | network, self.n_out, nonlinearity=self.outf) 416 | 417 | self.network = network 418 | 419 | def build_model(self): 420 | 421 | # Prepare Theano variables for inputs and targets 422 | # Same input for classification as regression. 423 | input_var = T.matrix('inputs', 424 | dtype=theano.config.floatX) # @UndefinedVariable 425 | target_var = T.matrix('targets', 426 | dtype=theano.config.floatX) # @UndefinedVariable 427 | 428 | # Loss function. 429 | loss = self.loss(input_var, target_var) 430 | loss_only_last_sample = self.loss_last_sample(input_var, target_var) 431 | 432 | # Create update methods. 433 | params = lasagne.layers.get_all_params(self.network, trainable=True) 434 | updates = lasagne.updates.adam( 435 | loss, params, learning_rate=self.learning_rate) 436 | 437 | # Train/val fn. 438 | self.pred_fn = ext.compile_function( 439 | [input_var], self.pred_sym(input_var), log_name='pred_fn') 440 | self.train_fn = ext.compile_function( 441 | [input_var, target_var], loss, updates=updates, log_name='train_fn') 442 | 443 | if self.second_order_update: 444 | 445 | oldparams = lasagne.layers.get_all_params( 446 | self.network, oldparam=True) 447 | step_size = T.scalar('step_size', 448 | dtype=theano.config.floatX) # @UndefinedVariable 449 | 450 | def second_order_update(loss_or_grads, params, oldparams, step_size): 451 | """Second-order update method for optimizing loss_last_sample, so basically, 452 | KL term (new params || old params) + NLL of latest sample. The Hessian is 453 | evaluated at the origin and provides curvature information to make a more 454 | informed step in the correct descent direction.""" 455 | grads = T.grad(loss_or_grads, params) 456 | updates = OrderedDict() 457 | for i in xrange(len(params)): 458 | param = params[i] 459 | grad = grads[i] 460 | if param.name == 'mu' or param.name == 'b_mu': 461 | oldparam_rho = oldparams[i + 1] 462 | invH = T.square(T.log(1 + T.exp(oldparam_rho))) 463 | else: 464 | oldparam_rho = oldparams[i] 465 | p = param 466 | 467 | H = 2. * (T.exp(2 * p)) / \ 468 | (1 + T.exp(p))**2 / (T.log(1 + T.exp(p))**2) 469 | invH = 1. / H 470 | updates[param] = param - step_size * invH * grad 471 | 472 | return updates 473 | 474 | def fast_kl_div(loss, params, oldparams, step_size): 475 | 476 | grads = T.grad(loss, params) 477 | 478 | kl_component = [] 479 | for i in xrange(len(params)): 480 | param = params[i] 481 | grad = grads[i] 482 | 483 | if param.name == 'mu' or param.name == 'b_mu': 484 | oldparam_rho = oldparams[i + 1] 485 | invH = T.square(T.log(1 + T.exp(oldparam_rho))) 486 | else: 487 | oldparam_rho = oldparams[i] 488 | p = param 489 | 490 | H = 2. * (T.exp(2 * p)) / \ 491 | (1 + T.exp(p))**2 / (T.log(1 + T.exp(p))**2) 492 | invH = 1. 
/ H 493 | 494 | kl_component.append( 495 | T.sum(T.square(step_size) * T.square(grad) * invH)) 496 | 497 | return sum(kl_component) 498 | 499 | compute_fast_kl_div = fast_kl_div( 500 | loss_only_last_sample, params, oldparams, step_size) 501 | 502 | self.train_update_fn = ext.compile_function( 503 | [input_var, target_var, step_size], compute_fast_kl_div, log_name='f_compute_fast_kl_div') 504 | 505 | # updates_kl = second_order_update( 506 | # loss_only_last_sample, params, oldparams, step_size) 507 | # 508 | # self.train_update_fn = ext.compile_function( 509 | # [input_var, target_var, step_size], loss_only_last_sample, updates=updates_kl, log_name='train_update_fn') 510 | else: 511 | self.train_update_fn = ext.compile_function( 512 | [input_var, target_var], loss_only_last_sample, updates=updates, log_name='train_update_fn') 513 | 514 | # called kl div closed form but should be called surprise 515 | self.f_kl_div_closed_form = ext.compile_function( 516 | [], self.surprise(), log_name='kl_div_fn') 517 | 518 | if __name__ == '__main__': 519 | pass 520 | -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/vime/ad6ca976ef07e8c1fd4c0a353716bbd0b52539fb/envs/__init__.py -------------------------------------------------------------------------------- /envs/cartpole_swingup_env_x.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pygame 3 | from rllab.envs.box2d.parser import find_body 4 | 5 | from rllab.core.serializable import Serializable 6 | from rllab.envs.box2d.box2d_env import Box2DEnv 7 | from rllab.misc import autoargs 8 | from rllab.misc.overrides import overrides 9 | 10 | 11 | # Tornio, Matti, and Tapani Raiko. "Variational Bayesian approach for 12 | # nonlinear identification and control." Proc. of the IFAC Workshop on 13 | # Nonlinear Model Predictive Control for Fast Systems, NMPC FS06. 2006. 14 | class CartpoleSwingupEnvX(Box2DEnv, Serializable): 15 | 16 | @autoargs.inherit(Box2DEnv.__init__) 17 | def __init__(self, *args, **kwargs): 18 | super(CartpoleSwingupEnvX, self).__init__( 19 | self.model_path("cartpole.xml.mako"), 20 | *args, **kwargs 21 | ) 22 | self.max_cart_pos = 3 23 | self.max_reward_cart_pos = 3 24 | self.cart = find_body(self.world, "cart") 25 | self.pole = find_body(self.world, "pole") 26 | Serializable.__init__(self, *args, **kwargs) 27 | 28 | @overrides 29 | def reset(self): 30 | self._set_state(self.initial_state) 31 | self._invalidate_state_caches() 32 | bounds = np.array([ 33 | [-1, -2, np.pi - 1, -3], 34 | [1, 2, np.pi + 1, 3], 35 | ]) 36 | low, high = bounds 37 | xpos, xvel, apos, avel = np.random.uniform(low, high) 38 | self.cart.position = (xpos, self.cart.position[1]) 39 | self.cart.linearVelocity = (xvel, self.cart.linearVelocity[1]) 40 | self.pole.angle = apos 41 | self.pole.angularVelocity = avel 42 | return self.get_current_obs() 43 | 44 | @overrides 45 | def compute_reward(self, action): 46 | yield 47 | if self.is_current_done(): 48 | yield 0 49 | else: 50 | if abs(self.cart.position[0]) > self.max_reward_cart_pos: 51 | yield 0 52 | else: 53 | cs = np.cos(self.pole.angle) 54 | if cs > 0.8: 55 | rew = 1.0 56 | else: 57 | rew = 0. 
58 | yield rew 59 | 60 | @overrides 61 | def is_current_done(self): 62 | return abs(self.cart.position[0]) > self.max_cart_pos 63 | 64 | @overrides 65 | def action_from_keys(self, keys): 66 | if keys[pygame.K_LEFT]: 67 | return np.asarray([-10]) 68 | elif keys[pygame.K_RIGHT]: 69 | return np.asarray([+10]) 70 | else: 71 | return np.asarray([0]) 72 | -------------------------------------------------------------------------------- /envs/double_pendulum_env_x.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rllab.envs.box2d.parser import find_body 3 | 4 | from rllab.core.serializable import Serializable 5 | from rllab.envs.box2d.box2d_env import Box2DEnv 6 | from rllab.misc import autoargs 7 | from rllab.misc.overrides import overrides 8 | 9 | 10 | # http://mlg.eng.cam.ac.uk/pilco/ 11 | class DoublePendulumEnvX(Box2DEnv, Serializable): 12 | 13 | @autoargs.inherit(Box2DEnv.__init__) 14 | def __init__(self, *args, **kwargs): 15 | # make sure mdp-level step is 100ms long 16 | kwargs["frame_skip"] = kwargs.get("frame_skip", 2) 17 | if kwargs.get("template_args", {}).get("noise", False): 18 | self.link_len = (np.random.rand()-0.5) + 1 19 | else: 20 | self.link_len = 1 21 | kwargs["template_args"] = kwargs.get("template_args", {}) 22 | kwargs["template_args"]["link_len"] = self.link_len 23 | super(DoublePendulumEnvX, self).__init__( 24 | self.model_path("double_pendulum.xml.mako"), 25 | *args, **kwargs 26 | ) 27 | self.link1 = find_body(self.world, "link1") 28 | self.link2 = find_body(self.world, "link2") 29 | Serializable.__init__(self, *args, **kwargs) 30 | 31 | @overrides 32 | def reset(self): 33 | self._set_state(self.initial_state) 34 | self._invalidate_state_caches() 35 | stds = np.array([0.1, 0.1, 0.01, 0.01]) 36 | pos1, pos2, v1, v2 = np.random.randn(*stds.shape) * stds 37 | self.link1.angle = pos1 38 | self.link2.angle = pos2 39 | self.link1.angularVelocity = v1 40 | self.link2.angularVelocity = v2 41 | return self.get_current_obs() 42 | 43 | def get_tip_pos(self): 44 | cur_center_pos = self.link2.position 45 | cur_angle = self.link2.angle 46 | cur_pos = ( 47 | cur_center_pos[0] - self.link_len*np.sin(cur_angle), 48 | cur_center_pos[1] - self.link_len*np.cos(cur_angle) 49 | ) 50 | return cur_pos 51 | 52 | @overrides 53 | def compute_reward(self, action): 54 | yield 55 | tgt_pos = np.asarray([0, self.link_len * 2]) 56 | cur_pos = self.get_tip_pos() 57 | dist = np.linalg.norm(cur_pos - tgt_pos) 58 | if dist < 1: 59 | rew = 1. 60 | else: 61 | rew = 0. 
62 | yield rew 63 | 64 | def is_current_done(self): 65 | return False 66 | 67 | -------------------------------------------------------------------------------- /envs/half_cheetah_env_x.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.envs.base import Step 5 | from rllab.envs.mujoco.mujoco_env import MujocoEnv 6 | from rllab.misc import logger 7 | from rllab.misc.overrides import overrides 8 | 9 | 10 | def smooth_abs(x, param): 11 | return np.sqrt(np.square(x) + np.square(param)) - param 12 | 13 | 14 | class HalfCheetahEnvX(MujocoEnv, Serializable): 15 | 16 | FILE = 'half_cheetah.xml' 17 | 18 | def __init__(self, *args, **kwargs): 19 | super(HalfCheetahEnvX, self).__init__(*args, **kwargs) 20 | Serializable.__init__(self, *args, **kwargs) 21 | 22 | def get_current_obs(self): 23 | return np.concatenate([ 24 | self.model.data.qpos.flatten()[1:], 25 | self.model.data.qvel.flat, 26 | self.get_body_com("torso").flat, 27 | ]) 28 | 29 | def get_body_xmat(self, body_name): 30 | idx = self.model.body_names.index(body_name) 31 | return self.model.data.xmat[idx].reshape((3, 3)) 32 | 33 | def get_body_com(self, body_name): 34 | idx = self.model.body_names.index(body_name) 35 | return self.model.data.com_subtree[idx] 36 | 37 | def step(self, action): 38 | self.forward_dynamics(action) 39 | next_obs = self.get_current_obs() 40 | action = np.clip(action, *self.action_bounds) 41 | ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action)) 42 | run_cost = -1 * self.get_body_comvel("torso")[0] 43 | cost = ctrl_cost + run_cost 44 | reward = -cost 45 | done = False 46 | if self.get_body_com("torso")[0] <= 5.0: 47 | reward = 0. 48 | else: 49 | reward = 1.0 50 | return Step(next_obs, reward, done) 51 | 52 | @overrides 53 | def log_diagnostics(self, paths): 54 | progs = [ 55 | path["observations"][-1][-3] - path["observations"][0][-3] 56 | for path in paths 57 | ] 58 | logger.record_tabular('AverageForwardProgress', np.mean(progs)) 59 | logger.record_tabular('MaxForwardProgress', np.max(progs)) 60 | logger.record_tabular('MinForwardProgress', np.min(progs)) 61 | logger.record_tabular('StdForwardProgress', np.std(progs)) 62 | -------------------------------------------------------------------------------- /envs/mountain_car_env_x.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pygame 3 | from rllab.envs.box2d.parser import find_body 4 | 5 | from rllab.core.serializable import Serializable 6 | from rllab.envs.box2d.box2d_env import Box2DEnv 7 | from rllab.misc import autoargs 8 | from rllab.misc.overrides import overrides 9 | 10 | 11 | class MountainCarEnvX(Box2DEnv, Serializable): 12 | 13 | @autoargs.inherit(Box2DEnv.__init__) 14 | @autoargs.arg("height_bonus_coeff", type=float, 15 | help="Height bonus added to each step's reward") 16 | @autoargs.arg("goal_cart_pos", type=float, 17 | help="Goal horizontal position") 18 | def __init__(self, 19 | height_bonus=1., 20 | goal_cart_pos=0.6, 21 | *args, **kwargs): 22 | super(MountainCarEnvX, self).__init__( 23 | self.model_path("mountain_car.xml.mako"), 24 | *args, **kwargs 25 | ) 26 | self.max_cart_pos = 2 27 | self.goal_cart_pos = goal_cart_pos 28 | self.height_bonus = height_bonus 29 | self.cart = find_body(self.world, "cart") 30 | Serializable.quick_init(self, locals()) 31 | 32 | @overrides 33 | def compute_reward(self, action): 34 | yield 35 | yield self.is_current_done() 36 
| 37 | @overrides 38 | def is_current_done(self): 39 | return self.cart.position[0] >= self.goal_cart_pos \ 40 | or abs(self.cart.position[0]) >= self.max_cart_pos 41 | 42 | @overrides 43 | def reset(self): 44 | self._set_state(self.initial_state) 45 | self._invalidate_state_caches() 46 | bounds = np.array([ 47 | [-1], 48 | [1], 49 | ]) 50 | low, high = bounds 51 | xvel = np.random.uniform(low, high) 52 | self.cart.linearVelocity = (xvel, self.cart.linearVelocity[1]) 53 | return self.get_current_obs() 54 | 55 | @overrides 56 | def action_from_keys(self, keys): 57 | if keys[pygame.K_LEFT]: 58 | return np.asarray([-1]) 59 | elif keys[pygame.K_RIGHT]: 60 | return np.asarray([+1]) 61 | else: 62 | return np.asarray([0]) 63 | 64 | -------------------------------------------------------------------------------- /experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/vime/ad6ca976ef07e8c1fd4c0a353716bbd0b52539fb/experiments/__init__.py -------------------------------------------------------------------------------- /experiments/run_experiment_lite.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | 5 | sys.path.append(".") 6 | 7 | from rllab.misc.ext import is_iterable, set_seed 8 | from rllab.misc.instrument import concretize 9 | from rllab import config 10 | import rllab.misc.logger as logger 11 | import argparse 12 | import os.path as osp 13 | import datetime 14 | import dateutil.tz 15 | import ast 16 | import uuid 17 | import cPickle as pickle 18 | import base64 19 | 20 | 21 | def run_experiment(argv): 22 | 23 | default_log_dir = config.LOG_DIR 24 | now = datetime.datetime.now(dateutil.tz.tzlocal()) 25 | 26 | # avoid name clashes when running distributed jobs 27 | rand_id = str(uuid.uuid4())[:5] 28 | timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z') 29 | 30 | default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id) 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--n_parallel', type=int, default=1, 33 | help='Number of parallel workers to perform rollouts.') 34 | parser.add_argument( 35 | '--exp_name', type=str, default=default_exp_name, help='Name of the experiment.') 36 | parser.add_argument('--log_dir', type=str, default=default_log_dir, 37 | help='Path to save the log and iteration snapshot.') 38 | parser.add_argument('--snapshot_mode', type=str, default='all', 39 | help='Mode to save the snapshot. 
Can be either "all" ' 40 | '(all iterations will be saved), "last" (only ' 41 | 'the last iteration will be saved), or "none" ' 42 | '(do not save snapshots)') 43 | parser.add_argument('--tabular_log_file', type=str, default='progress.csv', 44 | help='Name of the tabular log file (in csv).') 45 | parser.add_argument('--text_log_file', type=str, default='debug.log', 46 | help='Name of the text log file (in pure text).') 47 | parser.add_argument('--params_log_file', type=str, default='params.json', 48 | help='Name of the parameter log file (in json).') 49 | parser.add_argument('--plot', type=ast.literal_eval, default=False, 50 | help='Whether to plot the iteration results') 51 | parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False, 52 | help='Whether to only print the tabular log information (in a horizontal format)') 53 | parser.add_argument('--seed', type=int, 54 | help='Random seed for numpy') 55 | parser.add_argument('--args_data', type=str, 56 | help='Pickled data for stub objects') 57 | 58 | args = parser.parse_args(argv[1:]) 59 | 60 | from sandbox.vime.sampler import parallel_sampler_expl as parallel_sampler 61 | parallel_sampler.initialize(n_parallel=args.n_parallel) 62 | 63 | if args.seed is not None: 64 | set_seed(args.seed) 65 | parallel_sampler.set_seed(args.seed) 66 | 67 | if args.plot: 68 | from rllab.plotter import plotter 69 | plotter.init_worker() 70 | 71 | # read from stdin 72 | data = pickle.loads(base64.b64decode(args.args_data)) 73 | 74 | log_dir = args.log_dir 75 | # exp_dir = osp.join(log_dir, args.exp_name) 76 | tabular_log_file = osp.join(log_dir, args.tabular_log_file) 77 | text_log_file = osp.join(log_dir, args.text_log_file) 78 | params_log_file = osp.join(log_dir, args.params_log_file) 79 | 80 | logger.log_parameters_lite(params_log_file, args) 81 | logger.add_text_output(text_log_file) 82 | logger.add_tabular_output(tabular_log_file) 83 | prev_snapshot_dir = logger.get_snapshot_dir() 84 | prev_mode = logger.get_snapshot_mode() 85 | logger.set_snapshot_dir(log_dir) 86 | logger.set_snapshot_mode(args.snapshot_mode) 87 | logger.set_log_tabular_only(args.log_tabular_only) 88 | logger.push_prefix("[%s] " % args.exp_name) 89 | 90 | maybe_iter = concretize(data) 91 | if is_iterable(maybe_iter): 92 | for _ in maybe_iter: 93 | pass 94 | 95 | logger.set_snapshot_mode(prev_mode) 96 | logger.set_snapshot_dir(prev_snapshot_dir) 97 | logger.remove_tabular_output(tabular_log_file) 98 | logger.remove_text_output(text_log_file) 99 | logger.pop_prefix() 100 | 101 | 102 | if __name__ == "__main__": 103 | run_experiment(sys.argv) 104 | -------------------------------------------------------------------------------- /experiments/run_trpo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.mujoco.gather.swimmer_gather_env import SwimmerGatherEnv 4 | os.environ["THEANO_FLAGS"] = "device=cpu" 5 | 6 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | from rllab.envs.normalized_env import NormalizedEnv 8 | 9 | from rllab.algos.trpo import TRPO 10 | from rllab.misc.instrument import stub, run_experiment_lite 11 | import itertools 12 | 13 | stub(globals()) 14 | 15 | # Param ranges 16 | seeds = range(2) 17 | # SwimmerGather hierarchical task 18 | mdp_classes = [SwimmerGatherEnv] 19 | mdps = [NormalizedEnv(env=mdp_class()) 20 | for mdp_class in mdp_classes] 21 | param_cart_product = itertools.product( 22 
| mdps, seeds 23 | ) 24 | 25 | for mdp, seed in param_cart_product: 26 | 27 | policy = GaussianMLPPolicy( 28 | env_spec=mdp.spec, 29 | hidden_sizes=(64, 32), 30 | ) 31 | 32 | baseline = LinearFeatureBaseline( 33 | mdp.spec, 34 | ) 35 | 36 | batch_size = 50000 37 | algo = TRPO( 38 | env=mdp, 39 | policy=policy, 40 | baseline=baseline, 41 | batch_size=batch_size, 42 | whole_paths=True, 43 | max_path_length=500, 44 | n_itr=10000, 45 | step_size=0.01, 46 | subsample_factor=1.0, 47 | ) 48 | 49 | run_experiment_lite( 50 | algo.train(), 51 | exp_prefix="trpo", 52 | n_parallel=4, 53 | snapshot_mode="last", 54 | seed=seed, 55 | mode="local" 56 | ) 57 | -------------------------------------------------------------------------------- /experiments/run_trpo_expl.py: -------------------------------------------------------------------------------- 1 | import os 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.mujoco.gather.swimmer_gather_env import SwimmerGatherEnv 4 | os.environ["THEANO_FLAGS"] = "device=cpu" 5 | 6 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | from rllab.envs.normalized_env import NormalizedEnv 8 | 9 | from sandbox.vime.algos.trpo_expl import TRPO 10 | from rllab.misc.instrument import stub, run_experiment_lite 11 | import itertools 12 | 13 | stub(globals()) 14 | 15 | # Param ranges 16 | seeds = range(2) 17 | etas = [0.0001] 18 | # SwimmerGather hierarchical task 19 | mdp_classes = [SwimmerGatherEnv] 20 | mdps = [NormalizedEnv(env=mdp_class()) 21 | for mdp_class in mdp_classes] 22 | 23 | param_cart_product = itertools.product( 24 | mdps, etas, seeds 25 | ) 26 | 27 | for mdp, eta, seed in param_cart_product: 28 | 29 | policy = GaussianMLPPolicy( 30 | env_spec=mdp.spec, 31 | hidden_sizes=(64, 32), 32 | ) 33 | 34 | baseline = LinearFeatureBaseline( 35 | mdp.spec, 36 | ) 37 | 38 | batch_size = 50000 39 | algo = TRPO( 40 | env=mdp, 41 | policy=policy, 42 | baseline=baseline, 43 | batch_size=batch_size, 44 | whole_paths=True, 45 | max_path_length=500, 46 | n_itr=10000, 47 | step_size=0.01, 48 | eta=eta, 49 | snn_n_samples=10, 50 | subsample_factor=1.0, 51 | use_replay_pool=True, 52 | use_kl_ratio=True, 53 | use_kl_ratio_q=True, 54 | n_itr_update=1, 55 | kl_batch_size=1, 56 | normalize_reward=False, 57 | replay_pool_size=1000000, 58 | n_updates_per_sample=5000, 59 | second_order_update=True, 60 | unn_n_hidden=[32], 61 | unn_layers_type=[1, 1], 62 | unn_learning_rate=0.0001 63 | ) 64 | 65 | run_experiment_lite( 66 | algo.train(), 67 | exp_prefix="trpo-expl", 68 | n_parallel=4, 69 | snapshot_mode="last", 70 | seed=seed, 71 | mode="local", 72 | script="sandbox/vime/experiments/run_experiment_lite.py", 73 | ) 74 | -------------------------------------------------------------------------------- /sampler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/vime/ad6ca976ef07e8c1fd4c0a353716bbd0b52539fb/sampler/__init__.py -------------------------------------------------------------------------------- /sampler/parallel_sampler_expl.py: -------------------------------------------------------------------------------- 1 | from rllab.sampler.utils import rollout 2 | from rllab.sampler.stateful_pool import singleton_pool 3 | from rllab.misc import ext 4 | from rllab.misc import logger 5 | from rllab.misc import tensor_utils 6 | import numpy as np 7 | 8 | 9 | def _worker_init(G, id): 10 | import os 11 | os.environ['THEANO_FLAGS'] = 'device=cpu' 12 | 
G.worker_id = id 13 | 14 | 15 | def initialize(n_parallel): 16 | singleton_pool.initialize(n_parallel) 17 | singleton_pool.run_each( 18 | _worker_init, [(id,) for id in xrange(singleton_pool.n_parallel)]) 19 | 20 | 21 | def _worker_populate_task(G, env, policy, dynamics): 22 | G.env = env 23 | G.policy = policy 24 | G.dynamics = dynamics 25 | 26 | 27 | def populate_task(env, policy, dynamics): 28 | logger.log("Populating workers...") 29 | singleton_pool.run_each( 30 | _worker_populate_task, 31 | [(env, policy, dynamics)] * singleton_pool.n_parallel 32 | ) 33 | logger.log("Populated") 34 | 35 | 36 | def _worker_set_seed(_, seed): 37 | ext.set_seed(seed) 38 | 39 | 40 | def set_seed(seed): 41 | singleton_pool.run_each( 42 | _worker_set_seed, 43 | [(seed + i,) for i in xrange(singleton_pool.n_parallel)] 44 | ) 45 | 46 | 47 | def _worker_set_policy_params(G, params): 48 | G.policy.set_param_values(params) 49 | 50 | 51 | def _worker_set_dynamics_params(G, params): 52 | G.dynamics.set_param_values(params) 53 | 54 | 55 | def _worker_collect_one_path(G, max_path_length, itr, normalize_reward, 56 | reward_mean, reward_std, kl_batch_size, n_itr_update, use_replay_pool, 57 | obs_mean, obs_std, act_mean, act_std, second_order_update): 58 | # Path rollout. 59 | path = rollout(G.env, G.policy, max_path_length) 60 | 61 | # Computing intrinsic rewards. 62 | # ---------------------------- 63 | # Save original reward. 64 | path['rewards_orig'] = np.array(path['rewards']) 65 | 66 | if itr > 0: 67 | # Iterate over all paths and compute intrinsic reward by updating the 68 | # model on each observation, calculating the KL divergence of the new 69 | # params to the old ones, and undoing this operation. 70 | obs = (path['observations'] - obs_mean) / (obs_std + 1e-8) 71 | act = (path['actions'] - act_mean) / (act_std + 1e-8) 72 | rew = path['rewards'] 73 | # inputs = (o,a), target = o' 74 | obs_nxt = np.vstack([obs[1:]]) 75 | _inputs = np.hstack([obs[:-1], act[:-1]]) 76 | _targets = obs_nxt 77 | # KL vector assumes same shape as reward. 78 | kl = np.zeros(rew.shape) 79 | for j in xrange(int(np.ceil(obs.shape[0] / float(kl_batch_size)))): 80 | 81 | # Save old params for every update. 82 | G.dynamics.save_old_params() 83 | 84 | start = j * kl_batch_size 85 | end = np.minimum( 86 | (j + 1) * kl_batch_size, obs.shape[0] - 1) 87 | 88 | if second_order_update: 89 | # We do a line search over the best step sizes using 90 | # step_size * invH * grad 91 | # best_loss_value = np.inf 92 | for step_size in [0.01]: 93 | G.dynamics.save_old_params() 94 | loss_value = G.dynamics.train_update_fn( 95 | _inputs[start:end], _targets[start:end], step_size) 96 | kl_div = np.clip(loss_value, 0, 1000) 97 | # If using replay pool, undo updates. 98 | if use_replay_pool: 99 | G.dynamics.reset_to_old_params() 100 | else: 101 | # Update model weights based on current minibatch. 102 | for _ in xrange(n_itr_update): 103 | G.dynamics.train_update_fn( 104 | _inputs[start:end], _targets[start:end]) 105 | # Calculate current minibatch KL. 106 | kl_div = np.clip( 107 | float(G.dynamics.f_kl_div_closed_form()), 0, 1000) 108 | 109 | for k in xrange(start, end): 110 | kl[k] = kl_div 111 | # If using replay pool, undo updates. 112 | if use_replay_pool: 113 | G.dynamics.reset_to_old_params() 114 | 115 | # Last element in KL vector needs to be replaced by second last one 116 | # because the actual last observation has no next observation. 
117 | kl[-1] = kl[-2] 118 | 119 | # Stuff it in path 120 | path['KL'] = kl 121 | # ---------------------------- 122 | 123 | return path, len(path["rewards"]) 124 | 125 | 126 | def sample_paths( 127 | policy_params, 128 | dynamics_params, 129 | max_samples, 130 | max_path_length=np.inf, 131 | itr=None, 132 | normalize_reward=None, 133 | reward_mean=None, 134 | reward_std=None, 135 | kl_batch_size=None, 136 | n_itr_update=None, 137 | use_replay_pool=None, 138 | obs_mean=None, 139 | obs_std=None, 140 | act_mean=None, 141 | act_std=None, 142 | second_order_update=None 143 | ): 144 | """ 145 | :param policy_params: parameters for the policy. This will be updated on each worker process 146 | :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples 147 | might be greater since all trajectories will be rolled out either until termination or until max_path_length is 148 | reached 149 | :param max_path_length: horizon / maximum length of a single trajectory 150 | :return: a list of collected paths 151 | """ 152 | singleton_pool.run_each( 153 | _worker_set_policy_params, 154 | [(policy_params,)] * singleton_pool.n_parallel 155 | ) 156 | 157 | # Set dynamics params. 158 | # -------------------- 159 | singleton_pool.run_each( 160 | _worker_set_dynamics_params, 161 | [(dynamics_params,)] * singleton_pool.n_parallel 162 | ) 163 | # -------------------- 164 | return singleton_pool.run_collect( 165 | _worker_collect_one_path, 166 | threshold=max_samples, 167 | args=(max_path_length, itr, normalize_reward, reward_mean, 168 | reward_std, kl_batch_size, n_itr_update, use_replay_pool, obs_mean, obs_std, act_mean, act_std, second_order_update), 169 | show_prog_bar=True 170 | ) 171 | 172 | 173 | def truncate_paths(paths, max_samples): 174 | """ 175 | Truncate the list of paths so that the total number of samples is exactly equal to max_samples. This is done by 176 | removing extra paths at the end of the list, and make the last path shorter if necessary 177 | :param paths: a list of paths 178 | :param max_samples: the absolute maximum number of samples 179 | :return: a list of paths, truncated so that the number of samples adds up to max-samples 180 | """ 181 | # chop samples collected by extra paths 182 | # make a copy 183 | paths = list(paths) 184 | total_n_samples = sum(len(path["rewards"]) for path in paths) 185 | while len(paths) > 0 and total_n_samples - len(paths[-1]["rewards"]) >= max_samples: 186 | total_n_samples -= len(paths.pop(-1)["rewards"]) 187 | if len(paths) > 0: 188 | last_path = paths.pop(-1) 189 | truncated_last_path = dict() 190 | truncated_len = len( 191 | last_path["rewards"]) - (total_n_samples - max_samples) 192 | for k, v in last_path.iteritems(): 193 | if k in ["observations", "actions", "rewards"]: 194 | truncated_last_path[k] = tensor_utils.truncate_tensor_list( 195 | v, truncated_len) 196 | elif k in ["env_infos", "agent_infos"]: 197 | truncated_last_path[k] = tensor_utils.truncate_tensor_dict( 198 | v, truncated_len) 199 | else: 200 | raise NotImplementedError 201 | paths.append(truncated_last_path) 202 | return paths 203 | --------------------------------------------------------------------------------
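The sampler above stores each step's information gain in `path['KL']` and preserves the untouched external reward in `path['rewards_orig']`; the two are combined into the reward actually used for the policy update inside the training loop (`algos/batch_polopt_expl.py`). The sketch below is illustrative only: the helper name `add_intrinsic_reward`, the `kl_history` argument, and the exact normalization are assumptions, chosen to mirror the `eta`, `use_kl_ratio`, and `use_kl_ratio_q` settings passed in `run_trpo_expl.py`.

```python
# Illustrative sketch, not repository code: fold the per-step surprise stored in
# path['KL'] into the external reward before the policy update. The helper name,
# the kl_history argument, and the normalization details are assumptions.
from collections import deque

import numpy as np


def add_intrinsic_reward(paths, kl_history, eta=1e-4, use_kl_ratio=True):
    """Augment each path's rewards with an eta-weighted information-gain bonus."""
    if not all('KL' in p for p in paths):
        # On the very first iteration the sampler has not attached 'KL' yet
        # (it only does so for itr > 0); leave the rewards untouched then.
        return paths
    if use_kl_ratio:
        # Normalize by a running average of previous batches' median KL so the
        # bonus keeps a comparable scale as the dynamics model converges.
        kl_history.append(np.median(np.concatenate([p['KL'] for p in paths])))
        kl_scale = np.mean(kl_history)
    else:
        kl_scale = 1.0
    for p in paths:
        p['rewards'] = p['rewards_orig'] + eta * p['KL'] / (kl_scale + 1e-8)
    return paths


# Hypothetical usage inside a training loop:
# kl_history = deque(maxlen=10)
# paths = add_intrinsic_reward(paths, kl_history, eta=1e-4)
```

Normalizing by previous batches' median KL, rather than using the raw KL directly, is what keeps a single `eta` usable throughout training, since the raw surprise shrinks as the Bayesian dynamics model becomes more certain.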