├── .gitignore
├── README.md
├── __init__.py
├── algos
│   ├── __init__.py
│   ├── batch_polopt_expl.py
│   ├── erwr_expl.py
│   ├── npo_expl.py
│   ├── trpo_expl.py
│   └── vpg_expl.py
├── dynamics
│   ├── __init__.py
│   └── bnn.py
├── envs
│   ├── __init__.py
│   ├── cartpole_swingup_env_x.py
│   ├── double_pendulum_env_x.py
│   ├── half_cheetah_env_x.py
│   └── mountain_car_env_x.py
├── experiments
│   ├── __init__.py
│   ├── run_experiment_lite.py
│   ├── run_trpo.py
│   └── run_trpo_expl.py
└── sampler
    ├── __init__.py
    └── parallel_sampler_expl.py

/.gitignore:
--------------------------------------------------------------------------------
1 | data
2 | *.pyc
3 | *-checkpoint.ipynb
4 | .DS_Store
5 | *.h5
6 | *.log
7 | *.npz
8 | secrets.py
9 | *.avi
10 | *.mp4
11 | build
12 | build_linux
13 | .idea
14 | .sublime-project
15 | run_experiment.sh
16 | scratch-notebooks
17 | launch_scripts
18 | *.sh.e*
19 | *.sh.o*
20 | MUJOCO_LOG.TXT
21 | vendor/mujoco
22 | .project
23 | .pydevproject
24 | *.pdf
25 | .env
26 | snippets
27 | private
28 | lua
29 | iterate.dat
30 | .env
31 | src/
32 | .settings
33 | .pods
34 | docs/_build
35 | *.swp
36 | *.dat
37 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | **Status:** Archive (code is provided as-is, no updates expected)
2 |
3 | # How to run VIME
4 |
5 | Variational Information Maximizing Exploration (VIME) as presented in Curiosity-driven Exploration in Deep Reinforcement Learning via Bayesian Neural Networks by *R. Houthooft, X. Chen, Y. Duan, J. Schulman, F. De Turck, P. Abbeel* (http://arxiv.org/abs/1605.09674).
6 |
7 | To reproduce the results, you should first have [rllab](https://github.com/rllab/rllab) and Mujoco v1.31 configured. Then, run the following commands in the root folder of `rllab`:
8 |
9 | ```bash
10 | git submodule add -f git@github.com:openai/vime.git sandbox/vime
11 | touch sandbox/__init__.py
12 | ```
13 |
14 | Then you can do the following:
15 | - Execute TRPO+VIME on the hierarchical SwimmerGather environment via `python sandbox/vime/experiments/run_trpo_expl.py`.
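For readers skimming the code that follows, the core of VIME is a reward-shaping step implemented in `algos/batch_polopt_expl.py` (see `process_samples`): each step's external reward is augmented with `eta` times the KL divergence between the BNN dynamics model's parameters after and before updating on that transition, optionally normalized by an average of recent KL values (the `use_kl_ratio_q` option). The snippet below is a minimal illustrative sketch of that shaping, not code from this repository; `vime_shaped_rewards` is a hypothetical helper name.

```python
import numpy as np

def vime_shaped_rewards(rewards, kls, eta, kl_history=None):
    """Hypothetical helper mirroring the reward shaping in batch_polopt_expl.py.

    rewards, kls -- per-step external rewards and BNN KL divergences for one path
    eta          -- weight of the intrinsic (information-gain) reward
    kl_history   -- optional collection of recent median KL values used for
                    normalization, as with the `use_kl_ratio_q` option
    """
    rewards = np.asarray(rewards, dtype=np.float64)
    kls = np.asarray(kls, dtype=np.float64)
    if kl_history:
        # Divide by a running average of past KLs to counter exploding KL values.
        kls = kls / np.mean(kl_history)
    return rewards + eta * kls
```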
16 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/vime/ad6ca976ef07e8c1fd4c0a353716bbd0b52539fb/__init__.py -------------------------------------------------------------------------------- /algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/vime/ad6ca976ef07e8c1fd4c0a353716bbd0b52539fb/algos/__init__.py -------------------------------------------------------------------------------- /algos/batch_polopt_expl.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.algos.base import RLAlgorithm 4 | from sandbox.vime.sampler import parallel_sampler_expl as parallel_sampler 5 | from rllab.misc import special 6 | from rllab.misc import tensor_utils 7 | from rllab.algos import util 8 | import rllab.misc.logger as logger 9 | import rllab.plotter as plotter 10 | 11 | # exploration imports 12 | # ------------------- 13 | import theano 14 | import lasagne 15 | from collections import deque 16 | import time 17 | from sandbox.vime.dynamics import bnn 18 | # ------------------- 19 | 20 | 21 | class SimpleReplayPool(object): 22 | """Replay pool""" 23 | 24 | def __init__( 25 | self, max_pool_size, observation_shape, action_dim, 26 | observation_dtype=theano.config.floatX, # @UndefinedVariable 27 | action_dtype=theano.config.floatX): # @UndefinedVariable 28 | self._observation_shape = observation_shape 29 | self._action_dim = action_dim 30 | self._observation_dtype = observation_dtype 31 | self._action_dtype = action_dtype 32 | self._max_pool_size = max_pool_size 33 | 34 | self._observations = np.zeros( 35 | (max_pool_size,) + observation_shape, 36 | dtype=observation_dtype 37 | ) 38 | self._actions = np.zeros( 39 | (max_pool_size, action_dim), 40 | dtype=action_dtype 41 | ) 42 | self._rewards = np.zeros(max_pool_size, dtype='float32') 43 | self._terminals = np.zeros(max_pool_size, dtype='uint8') 44 | self._bottom = 0 45 | self._top = 0 46 | self._size = 0 47 | 48 | def add_sample(self, observation, action, reward, terminal): 49 | self._observations[self._top] = observation 50 | self._actions[self._top] = action 51 | self._rewards[self._top] = reward 52 | self._terminals[self._top] = terminal 53 | self._top = (self._top + 1) % self._max_pool_size 54 | if self._size >= self._max_pool_size: 55 | self._bottom = (self._bottom + 1) % self._max_pool_size 56 | else: 57 | self._size = self._size + 1 58 | 59 | def random_batch(self, batch_size): 60 | assert self._size > batch_size 61 | indices = np.zeros(batch_size, dtype='uint64') 62 | transition_indices = np.zeros(batch_size, dtype='uint64') 63 | count = 0 64 | while count < batch_size: 65 | index = np.random.randint( 66 | self._bottom, self._bottom + self._size) % self._max_pool_size 67 | # make sure that the transition is valid: if we are at the end of the pool, we need to discard 68 | # this sample 69 | if index == self._size - 1 and self._size <= self._max_pool_size: 70 | continue 71 | transition_index = (index + 1) % self._max_pool_size 72 | indices[count] = index 73 | transition_indices[count] = transition_index 74 | count += 1 75 | return dict( 76 | observations=self._observations[indices], 77 | actions=self._actions[indices], 78 | rewards=self._rewards[indices], 79 | terminals=self._terminals[indices], 80 | 
next_observations=self._observations[transition_indices] 81 | ) 82 | 83 | def mean_obs_act(self): 84 | if self._size >= self._max_pool_size: 85 | obs = self._observations 86 | act = self._actions 87 | else: 88 | obs = self._observations[:self._top + 1] 89 | act = self._actions[:self._top + 1] 90 | obs_mean = np.mean(obs, axis=0) 91 | obs_std = np.std(obs, axis=0) 92 | act_mean = np.mean(act, axis=0) 93 | act_std = np.std(act, axis=0) 94 | return obs_mean, obs_std, act_mean, act_std 95 | 96 | @property 97 | def size(self): 98 | return self._size 99 | 100 | 101 | class BatchPolopt(RLAlgorithm): 102 | """ 103 | Base class for batch sampling-based policy optimization methods. 104 | This includes various policy gradient methods like vpg, npg, ppo, trpo, etc. 105 | """ 106 | 107 | def __init__( 108 | self, 109 | env, 110 | policy, 111 | baseline, 112 | n_itr=500, 113 | start_itr=0, 114 | batch_size=5000, 115 | max_path_length=500, 116 | discount=0.99, 117 | gae_lambda=1, 118 | plot=False, 119 | pause_for_plot=False, 120 | whole_paths=True, 121 | center_adv=True, 122 | positive_adv=False, 123 | record_states=False, 124 | store_paths=False, 125 | algorithm_parallelized=False, 126 | # exploration params 127 | eta=1., 128 | snn_n_samples=10, 129 | prior_sd=0.5, 130 | use_kl_ratio=False, 131 | kl_q_len=10, 132 | use_reverse_kl_reg=False, 133 | reverse_kl_reg_factor=1e-3, 134 | use_replay_pool=True, 135 | replay_pool_size=100000, 136 | min_pool_size=500, 137 | n_updates_per_sample=500, 138 | pool_batch_size=10, 139 | eta_discount=1.0, 140 | n_itr_update=5, 141 | reward_alpha=0.001, 142 | kl_alpha=0.001, 143 | normalize_reward=False, 144 | kl_batch_size=1, 145 | use_kl_ratio_q=False, 146 | unn_n_hidden=[32], 147 | unn_layers_type=[1, 1], 148 | unn_learning_rate=0.001, 149 | second_order_update=False, 150 | compression=False, 151 | information_gain=True, 152 | **kwargs 153 | ): 154 | """ 155 | :param env: Environment 156 | :param policy: Policy 157 | :param baseline: Baseline 158 | :param n_itr: Number of iterations. 159 | :param start_itr: Starting iteration. 160 | :param batch_size: Number of samples per iteration. 161 | :param max_path_length: Maximum length of a single rollout. 162 | :param discount: Discount. 163 | :param gae_lambda: Lambda used for generalized advantage estimation. 164 | :param plot: Plot evaluation run after each iteration. 165 | :param pause_for_plot: Whether to pause before contiuing when plotting. 166 | :param whole_paths: Make sure that the samples contain whole trajectories, even if the actual batch size is 167 | slightly larger than the specified batch_size. 168 | :param center_adv: Whether to rescale the advantages so that they have mean 0 and standard deviation 1. 169 | :param positive_adv: Whether to shift the advantages so that they are always positive. When used in 170 | conjunction with center_adv the advantages will be standardized before shifting. 171 | :param store_paths: Whether to save all paths data to the snapshot. 
172 | :return: 173 | """ 174 | self.env = env 175 | self.policy = policy 176 | self.baseline = baseline 177 | self.n_itr = n_itr 178 | self.start_itr = start_itr 179 | self.batch_size = batch_size 180 | self.max_path_length = max_path_length 181 | self.discount = discount 182 | self.gae_lambda = gae_lambda 183 | self.plot = plot 184 | self.pause_for_plot = pause_for_plot 185 | self.whole_paths = whole_paths 186 | self.center_adv = center_adv 187 | self.positive_adv = positive_adv 188 | self.store_paths = store_paths 189 | 190 | # Set exploration params 191 | # ---------------------- 192 | self.eta = eta 193 | self.snn_n_samples = snn_n_samples 194 | self.prior_sd = prior_sd 195 | self.use_kl_ratio = use_kl_ratio 196 | self.kl_q_len = kl_q_len 197 | self.use_reverse_kl_reg = use_reverse_kl_reg 198 | self.reverse_kl_reg_factor = reverse_kl_reg_factor 199 | self.use_replay_pool = use_replay_pool 200 | self.replay_pool_size = replay_pool_size 201 | self.min_pool_size = min_pool_size 202 | self.n_updates_per_sample = n_updates_per_sample 203 | self.pool_batch_size = pool_batch_size 204 | self.eta_discount = eta_discount 205 | self.n_itr_update = n_itr_update 206 | self.reward_alpha = reward_alpha 207 | self.kl_alpha = kl_alpha 208 | self.normalize_reward = normalize_reward 209 | self.kl_batch_size = kl_batch_size 210 | self.use_kl_ratio_q = use_kl_ratio_q 211 | self.unn_n_hidden = unn_n_hidden 212 | self.unn_layers_type = unn_layers_type 213 | self.unn_learning_rate = unn_learning_rate 214 | self.second_order_update = second_order_update 215 | self.compression = compression 216 | self.information_gain = information_gain 217 | # ---------------------- 218 | 219 | if self.second_order_update: 220 | assert self.kl_batch_size == 1 221 | assert self.n_itr_update == 1 222 | 223 | # Params to keep track of moving average (both intrinsic and external 224 | # reward) mean/var. 225 | if self.normalize_reward: 226 | self._reward_mean = deque(maxlen=self.kl_q_len) 227 | self._reward_std = deque(maxlen=self.kl_q_len) 228 | if self.use_kl_ratio: 229 | self._kl_mean = deque(maxlen=self.kl_q_len) 230 | self._kl_std = deque(maxlen=self.kl_q_len) 231 | 232 | if self.use_kl_ratio_q: 233 | # Add Queue here to keep track of N last kl values, compute average 234 | # over them and divide current kl values by it. This counters the 235 | # exploding kl value problem. 236 | self.kl_previous = deque(maxlen=self.kl_q_len) 237 | 238 | def start_worker(self): 239 | parallel_sampler.populate_task(self.env, self.policy, self.bnn) 240 | if self.plot: 241 | plotter.init_plot(self.env, self.policy) 242 | 243 | def shutdown_worker(self): 244 | pass 245 | 246 | def train(self): 247 | 248 | # Bayesian neural network (BNN) initialization. 249 | # ------------------------------------------------ 250 | batch_size = 1 # Redundant 251 | n_batches = 5 # Hardcode or annealing scheme \pi_i. 252 | 253 | # MDP observation and action dimensions. 
254 | obs_dim = np.prod(self.env.observation_space.shape) 255 | act_dim = np.prod(self.env.action_space.shape) 256 | 257 | logger.log("Building BNN model (eta={}) ...".format(self.eta)) 258 | start_time = time.time() 259 | 260 | self.bnn = bnn.BNN( 261 | n_in=(obs_dim + act_dim), 262 | n_hidden=self.unn_n_hidden, 263 | n_out=obs_dim, 264 | n_batches=n_batches, 265 | layers_type=self.unn_layers_type, 266 | trans_func=lasagne.nonlinearities.rectify, 267 | out_func=lasagne.nonlinearities.linear, 268 | batch_size=batch_size, 269 | n_samples=self.snn_n_samples, 270 | prior_sd=self.prior_sd, 271 | use_reverse_kl_reg=self.use_reverse_kl_reg, 272 | reverse_kl_reg_factor=self.reverse_kl_reg_factor, 273 | # stochastic_output=self.stochastic_output, 274 | second_order_update=self.second_order_update, 275 | learning_rate=self.unn_learning_rate, 276 | compression=self.compression, 277 | information_gain=self.information_gain 278 | ) 279 | 280 | logger.log( 281 | "Model built ({:.1f} sec).".format((time.time() - start_time))) 282 | 283 | if self.use_replay_pool: 284 | self.pool = SimpleReplayPool( 285 | max_pool_size=self.replay_pool_size, 286 | observation_shape=self.env.observation_space.shape, 287 | action_dim=act_dim 288 | ) 289 | # ------------------------------------------------ 290 | 291 | self.start_worker() 292 | self.init_opt() 293 | episode_rewards = [] 294 | episode_lengths = [] 295 | for itr in xrange(self.start_itr, self.n_itr): 296 | logger.push_prefix('itr #%d | ' % itr) 297 | 298 | paths = self.obtain_samples(itr) 299 | samples_data = self.process_samples(itr, paths) 300 | 301 | # Exploration code 302 | # ---------------- 303 | if self.use_replay_pool: 304 | # Fill replay pool. 305 | logger.log("Fitting dynamics model using replay pool ...") 306 | for path in samples_data['paths']: 307 | path_len = len(path['rewards']) 308 | for i in xrange(path_len): 309 | obs = path['observations'][i] 310 | act = path['actions'][i] 311 | rew = path['rewards'][i] 312 | term = (i == path_len - 1) 313 | self.pool.add_sample(obs, act, rew, term) 314 | 315 | # Now we train the dynamics model using the replay self.pool; only 316 | # if self.pool is large enough. 317 | if self.pool.size >= self.min_pool_size: 318 | obs_mean, obs_std, act_mean, act_std = self.pool.mean_obs_act() 319 | _inputss = [] 320 | _targetss = [] 321 | for _ in xrange(self.n_updates_per_sample): 322 | batch = self.pool.random_batch( 323 | self.pool_batch_size) 324 | obs = (batch['observations'] - obs_mean) / \ 325 | (obs_std + 1e-8) 326 | next_obs = ( 327 | batch['next_observations'] - obs_mean) / (obs_std + 1e-8) 328 | act = (batch['actions'] - act_mean) / \ 329 | (act_std + 1e-8) 330 | _inputs = np.hstack( 331 | [obs, act]) 332 | _targets = next_obs 333 | _inputss.append(_inputs) 334 | _targetss.append(_targets) 335 | 336 | old_acc = 0. 337 | for _inputs, _targets in zip(_inputss, _targetss): 338 | _out = self.bnn.pred_fn(_inputs) 339 | old_acc += np.mean(np.square(_out - _targets)) 340 | old_acc /= len(_inputss) 341 | 342 | for _inputs, _targets in zip(_inputss, _targetss): 343 | self.bnn.train_fn(_inputs, _targets) 344 | 345 | new_acc = 0. 
346 | for _inputs, _targets in zip(_inputss, _targetss): 347 | _out = self.bnn.pred_fn(_inputs) 348 | new_acc += np.mean(np.square(_out - _targets)) 349 | new_acc /= len(_inputss) 350 | 351 | logger.record_tabular( 352 | 'BNN_DynModelSqLossBefore', old_acc) 353 | logger.record_tabular( 354 | 'BNN_DynModelSqLossAfter', new_acc) 355 | # ---------------- 356 | 357 | self.env.log_diagnostics(paths) 358 | self.policy.log_diagnostics(paths) 359 | self.baseline.log_diagnostics(paths) 360 | self.optimize_policy(itr, samples_data) 361 | logger.log("saving snapshot...") 362 | params = self.get_itr_snapshot(itr, samples_data) 363 | paths = samples_data["paths"] 364 | if self.store_paths: 365 | params["paths"] = paths 366 | episode_rewards.extend(sum(p["rewards"]) for p in paths) 367 | episode_lengths.extend(len(p["rewards"]) for p in paths) 368 | params["episode_rewards"] = np.array(episode_rewards) 369 | params["episode_lengths"] = np.array(episode_lengths) 370 | params["algo"] = self 371 | logger.save_itr_params(itr, params) 372 | logger.log("saved") 373 | logger.dump_tabular(with_prefix=False) 374 | logger.pop_prefix() 375 | if self.plot: 376 | self.update_plot() 377 | if self.pause_for_plot: 378 | raw_input("Plotting evaluation run: Press Enter to " 379 | "continue...") 380 | 381 | self.shutdown_worker() 382 | 383 | def init_opt(self): 384 | """ 385 | Initialize the optimization procedure. If using theano / cgt, this may 386 | include declaring all the variables and compiling functions 387 | """ 388 | raise NotImplementedError 389 | 390 | def get_itr_snapshot(self, itr, samples_data): 391 | """ 392 | Returns all the data that should be saved in the snapshot for this 393 | iteration. 394 | """ 395 | raise NotImplementedError 396 | 397 | def optimize_policy(self, itr, samples_data): 398 | raise NotImplementedError 399 | 400 | def update_plot(self): 401 | if self.plot: 402 | plotter.update_plot(self.policy, self.max_path_length) 403 | 404 | def obtain_samples(self, itr): 405 | cur_params = self.policy.get_param_values() 406 | cur_dynamics_params = self.bnn.get_param_values() 407 | 408 | reward_mean = None 409 | reward_std = None 410 | if self.normalize_reward: 411 | # Compute running mean/std. 412 | reward_mean = np.mean(np.asarray(self._reward_mean)) 413 | reward_std = np.mean(np.asarray(self._reward_std)) 414 | 415 | # Mean/std obs/act based on replay pool. 416 | obs_mean, obs_std, act_mean, act_std = self.pool.mean_obs_act() 417 | 418 | paths = parallel_sampler.sample_paths( 419 | policy_params=cur_params, 420 | dynamics_params=cur_dynamics_params, 421 | max_samples=self.batch_size, 422 | max_path_length=self.max_path_length, 423 | itr=itr, 424 | normalize_reward=self.normalize_reward, 425 | reward_mean=reward_mean, 426 | reward_std=reward_std, 427 | kl_batch_size=self.kl_batch_size, 428 | n_itr_update=self.n_itr_update, 429 | use_replay_pool=self.use_replay_pool, 430 | obs_mean=obs_mean, 431 | obs_std=obs_std, 432 | act_mean=act_mean, 433 | act_std=act_std, 434 | second_order_update=self.second_order_update 435 | ) 436 | if self.whole_paths: 437 | return paths 438 | else: 439 | paths_truncated = parallel_sampler.truncate_paths( 440 | paths, self.batch_size) 441 | return paths_truncated 442 | 443 | def process_samples(self, itr, paths): 444 | 445 | if self.normalize_reward: 446 | # Update reward mean/std Q. 
447 | rewards = [] 448 | for i in xrange(len(paths)): 449 | rewards.append(paths[i]['rewards']) 450 | rewards_flat = np.hstack(rewards) 451 | self._reward_mean.append(np.mean(rewards_flat)) 452 | self._reward_std.append(np.std(rewards_flat)) 453 | 454 | # Normalize rewards. 455 | reward_mean = np.mean(np.asarray(self._reward_mean)) 456 | reward_std = np.mean(np.asarray(self._reward_std)) 457 | for i in xrange(len(paths)): 458 | paths[i]['rewards'] = ( 459 | paths[i]['rewards'] - reward_mean) / (reward_std + 1e-8) 460 | 461 | if itr > 0: 462 | kls = [] 463 | for i in xrange(len(paths)): 464 | kls.append(paths[i]['KL']) 465 | 466 | kls_flat = np.hstack(kls) 467 | 468 | logger.record_tabular('Expl_MeanKL', np.mean(kls_flat)) 469 | logger.record_tabular('Expl_StdKL', np.std(kls_flat)) 470 | logger.record_tabular('Expl_MinKL', np.min(kls_flat)) 471 | logger.record_tabular('Expl_MaxKL', np.max(kls_flat)) 472 | 473 | # Perform normlization of the intrinsic rewards. 474 | if self.use_kl_ratio: 475 | if self.use_kl_ratio_q: 476 | # Update kl Q 477 | self.kl_previous.append(np.median(np.hstack(kls))) 478 | previous_mean_kl = np.mean(np.asarray(self.kl_previous)) 479 | for i in xrange(len(kls)): 480 | kls[i] = kls[i] / previous_mean_kl 481 | 482 | # Add KL ass intrinsic reward to external reward 483 | for i in xrange(len(paths)): 484 | paths[i]['rewards'] = paths[i]['rewards'] + self.eta * kls[i] 485 | 486 | # Discount eta 487 | self.eta *= self.eta_discount 488 | 489 | else: 490 | logger.record_tabular('Expl_MeanKL', 0.) 491 | logger.record_tabular('Expl_StdKL', 0.) 492 | logger.record_tabular('Expl_MinKL', 0.) 493 | logger.record_tabular('Expl_MaxKL', 0.) 494 | 495 | baselines = [] 496 | returns = [] 497 | for path in paths: 498 | path_baselines = np.append(self.baseline.predict(path), 0) 499 | deltas = path["rewards"] + \ 500 | self.discount * path_baselines[1:] - \ 501 | path_baselines[:-1] 502 | path["advantages"] = special.discount_cumsum( 503 | deltas, self.discount * self.gae_lambda) 504 | path["returns"] = special.discount_cumsum( 505 | path["rewards_orig"], self.discount) 506 | baselines.append(path_baselines[:-1]) 507 | returns.append(path["returns"]) 508 | 509 | if not self.policy.recurrent: 510 | observations = tensor_utils.concat_tensor_list( 511 | [path["observations"] for path in paths]) 512 | actions = tensor_utils.concat_tensor_list( 513 | [path["actions"] for path in paths]) 514 | rewards = tensor_utils.concat_tensor_list( 515 | [path["rewards"] for path in paths]) 516 | advantages = tensor_utils.concat_tensor_list( 517 | [path["advantages"] for path in paths]) 518 | env_infos = tensor_utils.concat_tensor_dict_list( 519 | [path["env_infos"] for path in paths]) 520 | agent_infos = tensor_utils.concat_tensor_dict_list( 521 | [path["agent_infos"] for path in paths]) 522 | 523 | if self.center_adv: 524 | advantages = util.center_advantages(advantages) 525 | 526 | if self.positive_adv: 527 | advantages = util.shift_advantages_to_positive(advantages) 528 | 529 | average_discounted_return = \ 530 | np.mean([path["returns"][0] for path in paths]) 531 | 532 | undiscounted_returns = [ 533 | sum(path["rewards_orig"]) for path in paths] 534 | 535 | ent = np.mean(self.policy.distribution.entropy(agent_infos)) 536 | 537 | ev = special.explained_variance_1d( 538 | np.concatenate(baselines), 539 | np.concatenate(returns) 540 | ) 541 | 542 | samples_data = dict( 543 | observations=observations, 544 | actions=actions, 545 | rewards=rewards, 546 | advantages=advantages, 547 | env_infos=env_infos, 548 
| agent_infos=agent_infos, 549 | paths=paths, 550 | ) 551 | else: 552 | max_path_length = max([len(path["advantages"]) for path in paths]) 553 | 554 | # make all paths the same length (pad extra advantages with 0) 555 | obs = [path["observations"] for path in paths] 556 | obs = np.array( 557 | [tensor_utils.pad_tensor(ob, max_path_length) for ob in obs]) 558 | 559 | if self.center_adv: 560 | raw_adv = np.concatenate( 561 | [path["advantages"] for path in paths]) 562 | adv_mean = np.mean(raw_adv) 563 | adv_std = np.std(raw_adv) + 1e-8 564 | adv = [ 565 | (path["advantages"] - adv_mean) / adv_std for path in paths] 566 | else: 567 | adv = [path["advantages"] for path in paths] 568 | 569 | adv = np.array( 570 | [tensor_utils.pad_tensor(a, max_path_length) for a in adv]) 571 | 572 | actions = [path["actions"] for path in paths] 573 | actions = np.array( 574 | [tensor_utils.pad_tensor(a, max_path_length) for a in actions]) 575 | 576 | rewards = [path["rewards"] for path in paths] 577 | rewards = np.array( 578 | [tensor_utils.pad_tensor(r, max_path_length) for r in rewards]) 579 | 580 | agent_infos = [path["agent_infos"] for path in paths] 581 | agent_infos = tensor_utils.stack_tensor_dict_list( 582 | [tensor_utils.pad_tensor_dict( 583 | p, max_path_length) for p in agent_infos] 584 | ) 585 | 586 | env_infos = [path["env_infos"] for path in paths] 587 | env_infos = tensor_utils.stack_tensor_dict_list( 588 | [tensor_utils.pad_tensor_dict( 589 | p, max_path_length) for p in env_infos] 590 | ) 591 | 592 | valids = [np.ones_like(path["returns"]) for path in paths] 593 | valids = np.array( 594 | [tensor_utils.pad_tensor(v, max_path_length) for v in valids]) 595 | 596 | average_discounted_return = \ 597 | np.mean([path["returns"][0] for path in paths]) 598 | 599 | undiscounted_returns = [sum(path["rewards"]) for path in paths] 600 | 601 | ent = np.mean(self.policy.distribution.entropy(agent_infos)) 602 | 603 | ev = special.explained_variance_1d( 604 | np.concatenate(baselines), 605 | np.concatenate(returns) 606 | ) 607 | 608 | samples_data = dict( 609 | observations=obs, 610 | actions=actions, 611 | advantages=adv, 612 | rewards=rewards, 613 | valids=valids, 614 | agent_infos=agent_infos, 615 | env_infos=env_infos, 616 | paths=paths, 617 | ) 618 | 619 | logger.log("fitting baseline...") 620 | self.baseline.fit(paths) 621 | logger.log("fitted") 622 | 623 | logger.record_tabular('Iteration', itr) 624 | logger.record_tabular('AverageDiscountedReturn', 625 | average_discounted_return) 626 | logger.record_tabular('AverageReturn', np.mean(undiscounted_returns)) 627 | logger.record_tabular('ExplainedVariance', ev) 628 | logger.record_tabular('NumTrajs', len(paths)) 629 | logger.record_tabular('Entropy', ent) 630 | logger.record_tabular('Perplexity', np.exp(ent)) 631 | logger.record_tabular('StdReturn', np.std(undiscounted_returns)) 632 | logger.record_tabular('MaxReturn', np.max(undiscounted_returns)) 633 | logger.record_tabular('MinReturn', np.min(undiscounted_returns)) 634 | 635 | return samples_data 636 | -------------------------------------------------------------------------------- /algos/erwr_expl.py: -------------------------------------------------------------------------------- 1 | from sandbox.vime.algos.vpg_expl import VPG 2 | from rllab.optimizers.lbfgs_optimizer import LbfgsOptimizer 3 | from rllab.core.serializable import Serializable 4 | 5 | 6 | class ERWR(VPG, Serializable): 7 | """ 8 | Episodic Reward Weighted Regression [1]_ 9 | 10 | Notes 11 | ----- 12 | This does not implement the 
original RwR [2]_ that deals with "immediate reward problems" since 13 | it doesn't find solutions that optimize for temporally delayed rewards. 14 | 15 | .. [1] Kober, Jens, and Jan R. Peters. "Policy search for motor primitives in robotics." Advances in neural information processing systems. 2009. 16 | .. [2] Peters, Jan, and Stefan Schaal. "Using reward-weighted regression for reinforcement learning of task space control." Approximate Dynamic Programming and Reinforcement Learning, 2007. ADPRL 2007. IEEE International Symposium on. IEEE, 2007. 17 | """ 18 | 19 | def __init__( 20 | self, 21 | optimizer=None, 22 | optimizer_args=None, 23 | positive_adv=None, 24 | **kwargs): 25 | Serializable.quick_init(self, locals()) 26 | if optimizer is None: 27 | if optimizer_args is None: 28 | optimizer_args = dict() 29 | optimizer = LbfgsOptimizer(**optimizer_args) 30 | super(ERWR, self).__init__( 31 | optimizer=optimizer, 32 | positive_adv=True if positive_adv is None else positive_adv, 33 | **kwargs 34 | ) 35 | -------------------------------------------------------------------------------- /algos/npo_expl.py: -------------------------------------------------------------------------------- 1 | from rllab.misc import ext 2 | from rllab.misc.overrides import overrides 3 | from sandbox.vime.algos.batch_polopt_expl import BatchPolopt 4 | import rllab.misc.logger as logger 5 | import theano 6 | import theano.tensor as TT 7 | from rllab.optimizers.penalty_lbfgs_optimizer import PenaltyLbfgsOptimizer 8 | 9 | 10 | class NPO(BatchPolopt): 11 | """ 12 | Natural Policy Optimization. 13 | """ 14 | 15 | def __init__( 16 | self, 17 | optimizer=None, 18 | optimizer_args=None, 19 | step_size=0.01, 20 | **kwargs): 21 | if optimizer is None: 22 | if optimizer_args is None: 23 | optimizer_args = dict() 24 | optimizer = PenaltyLbfgsOptimizer(**optimizer_args) 25 | self.optimizer = optimizer 26 | self.step_size = step_size 27 | super(NPO, self).__init__(**kwargs) 28 | 29 | @overrides 30 | def init_opt(self): 31 | is_recurrent = int(self.policy.recurrent) 32 | obs_var = self.env.observation_space.new_tensor_variable( 33 | 'obs', 34 | extra_dims=1 + is_recurrent, 35 | ) 36 | action_var = self.env.action_space.new_tensor_variable( 37 | 'action', 38 | extra_dims=1 + is_recurrent, 39 | ) 40 | advantage_var = ext.new_tensor( 41 | 'advantage', 42 | ndim=1 + is_recurrent, 43 | dtype=theano.config.floatX 44 | ) 45 | dist = self.policy.distribution 46 | old_dist_info_vars = { 47 | k: ext.new_tensor( 48 | 'old_%s' % k, 49 | ndim=2 + is_recurrent, 50 | dtype=theano.config.floatX 51 | ) for k in dist.dist_info_keys 52 | } 53 | old_dist_info_vars_list = [old_dist_info_vars[k] 54 | for k in dist.dist_info_keys] 55 | 56 | if is_recurrent: 57 | valid_var = TT.matrix('valid') 58 | else: 59 | valid_var = None 60 | 61 | dist_info_vars = self.policy.dist_info_sym(obs_var, action_var) 62 | kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) 63 | lr = dist.likelihood_ratio_sym( 64 | action_var, old_dist_info_vars, dist_info_vars) 65 | if is_recurrent: 66 | mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var) 67 | surr_loss = - \ 68 | TT.sum(lr * advantage_var * valid_var) / TT.sum(valid_var) 69 | else: 70 | mean_kl = TT.mean(kl) 71 | surr_loss = - TT.mean(lr * advantage_var) 72 | 73 | input_list = [ 74 | obs_var, 75 | action_var, 76 | advantage_var, 77 | ] + old_dist_info_vars_list 78 | if is_recurrent: 79 | input_list.append(valid_var) 80 | 81 | self.optimizer.update_opt( 82 | loss=surr_loss, 83 | target=self.policy, 84 | 
leq_constraint=(mean_kl, self.step_size), 85 | inputs=input_list, 86 | constraint_name="mean_kl" 87 | ) 88 | return dict() 89 | 90 | @overrides 91 | def optimize_policy(self, itr, samples_data): 92 | all_input_values = tuple(ext.extract( 93 | samples_data, 94 | "observations", "actions", "advantages" 95 | )) 96 | agent_infos = samples_data["agent_infos"] 97 | info_list = [agent_infos[k] 98 | for k in self.policy.distribution.dist_info_keys] 99 | all_input_values += tuple(info_list) 100 | if self.policy.recurrent: 101 | all_input_values += (samples_data["valids"],) 102 | loss_before = self.optimizer.loss(all_input_values) 103 | self.optimizer.optimize(all_input_values) 104 | mean_kl = self.optimizer.constraint_val(all_input_values) 105 | loss_after = self.optimizer.loss(all_input_values) 106 | logger.record_tabular('LossAfter', loss_after) 107 | logger.record_tabular('MeanKL', mean_kl) 108 | logger.record_tabular('dLoss', loss_before - loss_after) 109 | return dict() 110 | 111 | @overrides 112 | def get_itr_snapshot(self, itr, samples_data): 113 | return dict( 114 | itr=itr, 115 | policy=self.policy, 116 | baseline=self.baseline, 117 | env=self.env, 118 | ) 119 | -------------------------------------------------------------------------------- /algos/trpo_expl.py: -------------------------------------------------------------------------------- 1 | from sandbox.vime.algos.npo_expl import NPO 2 | from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 3 | from rllab.core.serializable import Serializable 4 | 5 | 6 | class TRPO(NPO, Serializable): 7 | """ 8 | Trust Region Policy Optimization 9 | """ 10 | 11 | def __init__( 12 | self, 13 | optimizer=None, 14 | optimizer_args=None, 15 | **kwargs): 16 | Serializable.quick_init(self, locals()) 17 | if optimizer is None: 18 | if optimizer_args is None: 19 | optimizer_args = dict() 20 | optimizer = ConjugateGradientOptimizer(**optimizer_args) 21 | super(TRPO, self).__init__(optimizer=optimizer, **kwargs) 22 | -------------------------------------------------------------------------------- /algos/vpg_expl.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as TT 2 | import theano 3 | from rllab.misc import logger 4 | from rllab.misc.overrides import overrides 5 | from rllab.misc import ext 6 | from sandbox.vime.algos.batch_polopt_expl import BatchPolopt 7 | from rllab.optimizers.first_order_optimizer import FirstOrderOptimizer 8 | from rllab.core.serializable import Serializable 9 | 10 | 11 | class VPG(BatchPolopt, Serializable): 12 | """ 13 | Vanilla Policy Gradient. 
14 | """ 15 | 16 | def __init__( 17 | self, 18 | env, 19 | policy, 20 | baseline, 21 | optimizer=None, 22 | optimizer_args=None, 23 | **kwargs): 24 | Serializable.quick_init(self, locals()) 25 | if optimizer is None: 26 | default_args = dict( 27 | batch_size=None, 28 | max_epochs=1, 29 | ) 30 | if optimizer_args is None: 31 | optimizer_args = default_args 32 | else: 33 | optimizer_args = dict(default_args, **optimizer_args) 34 | optimizer = FirstOrderOptimizer(**optimizer_args) 35 | self.optimizer = optimizer 36 | self.opt_info = None 37 | super(VPG, self).__init__(env=env, policy=policy, baseline=baseline, **kwargs) 38 | 39 | @overrides 40 | def init_opt(self): 41 | is_recurrent = int(self.policy.recurrent) 42 | 43 | obs_var = self.env.observation_space.new_tensor_variable( 44 | 'obs', 45 | extra_dims=1 + is_recurrent, 46 | ) 47 | action_var = self.env.action_space.new_tensor_variable( 48 | 'action', 49 | extra_dims=1 + is_recurrent, 50 | ) 51 | advantage_var = ext.new_tensor( 52 | 'advantage', 53 | ndim=1 + is_recurrent, 54 | dtype=theano.config.floatX 55 | ) 56 | dist = self.policy.distribution 57 | old_dist_info_vars = { 58 | k: ext.new_tensor( 59 | 'old_%s' % k, 60 | ndim=2 + is_recurrent, 61 | dtype=theano.config.floatX 62 | ) for k in dist.dist_info_keys 63 | } 64 | old_dist_info_vars_list = [old_dist_info_vars[k] for k in dist.dist_info_keys] 65 | 66 | if is_recurrent: 67 | valid_var = TT.matrix('valid') 68 | else: 69 | valid_var = None 70 | 71 | dist_info_vars = self.policy.dist_info_sym(obs_var, action_var) 72 | logli = dist.log_likelihood_sym(action_var, dist_info_vars) 73 | kl = dist.kl_sym(old_dist_info_vars, dist_info_vars) 74 | 75 | # formulate as a minimization problem 76 | # The gradient of the surrogate objective is the policy gradient 77 | if is_recurrent: 78 | surr_obj = - TT.sum(logli * advantage_var * valid_var) / TT.sum(valid_var) 79 | mean_kl = TT.sum(kl * valid_var) / TT.sum(valid_var) 80 | max_kl = TT.max(kl * valid_var) 81 | else: 82 | surr_obj = - TT.mean(logli * advantage_var) 83 | mean_kl = TT.mean(kl) 84 | max_kl = TT.max(kl) 85 | 86 | input_list = [obs_var, action_var, advantage_var] 87 | if is_recurrent: 88 | input_list.append(valid_var) 89 | 90 | self.optimizer.update_opt(surr_obj, target=self.policy, inputs=input_list) 91 | 92 | f_kl = ext.compile_function( 93 | inputs=input_list + old_dist_info_vars_list, 94 | outputs=[mean_kl, max_kl], 95 | ) 96 | self.opt_info = dict( 97 | f_kl=f_kl, 98 | ) 99 | 100 | @overrides 101 | def optimize_policy(self, itr, samples_data): 102 | logger.log("optimizing policy") 103 | inputs = ext.extract( 104 | samples_data, 105 | "observations", "actions", "advantages" 106 | ) 107 | if self.policy.recurrent: 108 | inputs += (samples_data["valids"],) 109 | agent_infos = samples_data["agent_infos"] 110 | dist_info_list = [agent_infos[k] for k in self.policy.distribution.dist_info_keys] 111 | loss_before = self.optimizer.loss(inputs) 112 | self.optimizer.optimize(inputs) 113 | loss_after = self.optimizer.loss(inputs) 114 | logger.record_tabular("LossBefore", loss_before) 115 | logger.record_tabular("LossAfter", loss_after) 116 | 117 | mean_kl, max_kl = self.opt_info['f_kl'](*(list(inputs) + dist_info_list)) 118 | logger.record_tabular('MeanKL', mean_kl) 119 | logger.record_tabular('MaxKL', max_kl) 120 | 121 | @overrides 122 | def get_itr_snapshot(self, itr, samples_data): 123 | return dict( 124 | itr=itr, 125 | policy=self.policy, 126 | baseline=self.baseline, 127 | env=self.env, 128 | ) 129 | 
-------------------------------------------------------------------------------- /dynamics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/vime/ad6ca976ef07e8c1fd4c0a353716bbd0b52539fb/dynamics/__init__.py -------------------------------------------------------------------------------- /dynamics/bnn.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import numpy as np 3 | import theano.tensor as T 4 | import lasagne 5 | from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams 6 | from rllab.core.lasagne_powered import LasagnePowered 7 | from rllab.core.serializable import Serializable 8 | from rllab.misc import ext 9 | from collections import OrderedDict 10 | import theano 11 | 12 | # ---------------- 13 | BNN_LAYER_TAG = 'BNNLayer' 14 | USE_REPARAMETRIZATION_TRICK = True 15 | # ---------------- 16 | 17 | 18 | class BNNLayer(lasagne.layers.Layer): 19 | """Probabilistic layer that uses Gaussian weights. 20 | 21 | Each weight has two parameters: mean and standard deviation (std). 22 | """ 23 | 24 | def __init__(self, 25 | incoming, 26 | num_units, 27 | nonlinearity=lasagne.nonlinearities.rectify, 28 | prior_sd=None, 29 | **kwargs): 30 | super(BNNLayer, self).__init__(incoming, **kwargs) 31 | 32 | self._srng = RandomStreams() 33 | 34 | # Set vars. 35 | self.nonlinearity = nonlinearity 36 | self.num_inputs = int(np.prod(self.input_shape[1:])) 37 | self.num_units = num_units 38 | self.prior_sd = prior_sd 39 | 40 | prior_rho = self.std_to_log(self.prior_sd) 41 | 42 | self.W = np.random.normal(0., prior_sd, 43 | (self.num_inputs, self.num_units)) # @UndefinedVariable 44 | self.b = np.zeros( 45 | (self.num_units,), 46 | dtype=theano.config.floatX) # @UndefinedVariable 47 | 48 | # Here we set the priors. 49 | # ----------------------- 50 | self.mu = self.add_param( 51 | lasagne.init.Normal(1., 0.), 52 | (self.num_inputs, self.num_units), 53 | name='mu' 54 | ) 55 | self.rho = self.add_param( 56 | lasagne.init.Constant(prior_rho), 57 | (self.num_inputs, self.num_units), 58 | name='rho' 59 | ) 60 | # Bias priors. 61 | self.b_mu = self.add_param( 62 | lasagne.init.Normal(1., 0.), 63 | (self.num_units,), 64 | name="b_mu", 65 | regularizable=False 66 | ) 67 | self.b_rho = self.add_param( 68 | lasagne.init.Constant(prior_rho), 69 | (self.num_units,), 70 | name="b_rho", 71 | regularizable=False 72 | ) 73 | # ----------------------- 74 | 75 | # Backup params for KL calculations. 76 | self.mu_old = self.add_param( 77 | np.zeros((self.num_inputs, self.num_units)), 78 | (self.num_inputs, self.num_units), 79 | name='mu_old', 80 | trainable=False, 81 | oldparam=True 82 | ) 83 | self.rho_old = self.add_param( 84 | np.ones((self.num_inputs, self.num_units)), 85 | (self.num_inputs, self.num_units), 86 | name='rho_old', 87 | trainable=False, 88 | oldparam=True 89 | ) 90 | # Bias priors. 
91 | self.b_mu_old = self.add_param( 92 | np.zeros((self.num_units,)), 93 | (self.num_units,), 94 | name="b_mu_old", 95 | regularizable=False, 96 | trainable=False, 97 | oldparam=True 98 | ) 99 | self.b_rho_old = self.add_param( 100 | np.ones((self.num_units,)), 101 | (self.num_units,), 102 | name="b_rho_old", 103 | regularizable=False, 104 | trainable=False, 105 | oldparam=True 106 | ) 107 | 108 | def log_to_std(self, rho): 109 | """Transformation for allowing rho in \mathbb{R}, rather than \mathbb{R}_+ 110 | 111 | This makes sure that we don't get negative stds. However, a downside might be 112 | that we have little gradient on close to 0 std (= -inf using this transformation). 113 | """ 114 | return T.log(1 + T.exp(rho)) 115 | 116 | def std_to_log(self, sigma): 117 | """Reverse log_to_std transformation.""" 118 | return np.log(np.exp(sigma) - 1) 119 | 120 | def get_W(self): 121 | # Here we generate random epsilon values from a normal distribution 122 | epsilon = self._srng.normal(size=(self.num_inputs, self.num_units), avg=0., std=1., 123 | dtype=theano.config.floatX) # @UndefinedVariable 124 | # Here we calculate weights based on shifting and rescaling according 125 | # to mean and variance (paper step 2) 126 | W = self.mu + self.log_to_std(self.rho) * epsilon 127 | self.W = W 128 | return W 129 | 130 | def get_b(self): 131 | # Here we generate random epsilon values from a normal distribution 132 | epsilon = self._srng.normal(size=(self.num_units,), avg=0., std=1., 133 | dtype=theano.config.floatX) # @UndefinedVariable 134 | b = self.b_mu + self.log_to_std(self.b_rho) * epsilon 135 | self.b = b 136 | return b 137 | 138 | def get_output_for_reparametrization(self, input, **kwargs): 139 | """Implementation of the local reparametrization trick. 140 | 141 | This essentially leads to a speedup compared to the naive implementation case. 142 | Furthermore, it leads to gradients with less variance. 143 | 144 | References 145 | ---------- 146 | Kingma et al., "Variational Dropout and the Local Reparametrization Trick", 2015 147 | """ 148 | if input.ndim > 2: 149 | # if the input has more than two dimensions, flatten it into a 150 | # batch of feature vectors. 
151 | input = input.flatten(2) 152 | 153 | gamma = T.dot(input, self.mu) + self.b_mu.dimshuffle('x', 0) 154 | delta = T.dot(T.square(input), T.square(self.log_to_std( 155 | self.rho))) + T.square(self.log_to_std(self.b_rho)).dimshuffle('x', 0) 156 | epsilon = self._srng.normal(size=(self.num_units,), avg=0., std=1., 157 | dtype=theano.config.floatX) # @UndefinedVariable 158 | 159 | activation = gamma + T.sqrt(delta) * epsilon 160 | 161 | return self.nonlinearity(activation) 162 | 163 | def save_old_params(self): 164 | """Save old parameter values for KL calculation.""" 165 | self.mu_old.set_value(self.mu.get_value()) 166 | self.rho_old.set_value(self.rho.get_value()) 167 | self.b_mu_old.set_value(self.b_mu.get_value()) 168 | self.b_rho_old.set_value(self.b_rho.get_value()) 169 | 170 | def reset_to_old_params(self): 171 | """Reset to old parameter values for KL calculation.""" 172 | self.mu.set_value(self.mu_old.get_value()) 173 | self.rho.set_value(self.rho_old.get_value()) 174 | self.b_mu.set_value(self.b_mu_old.get_value()) 175 | self.b_rho.set_value(self.b_rho_old.get_value()) 176 | 177 | def kl_div_p_q(self, p_mean, p_std, q_mean, q_std): 178 | """KL divergence D_{KL}[p(x)||q(x)] for a fully factorized Gaussian""" 179 | numerator = T.square(p_mean - q_mean) + \ 180 | T.square(p_std) - T.square(q_std) 181 | denominator = 2 * T.square(q_std) + 1e-8 182 | return T.sum( 183 | numerator / denominator + T.log(q_std) - T.log(p_std)) 184 | 185 | def kl_div_new_old(self): 186 | kl_div = self.kl_div_p_q( 187 | self.mu, self.log_to_std(self.rho), self.mu_old, self.log_to_std(self.rho_old)) 188 | kl_div += self.kl_div_p_q(self.b_mu, self.log_to_std(self.b_rho), 189 | self.b_mu_old, self.log_to_std(self.b_rho_old)) 190 | return kl_div 191 | 192 | def kl_div_old_new(self): 193 | kl_div = self.kl_div_p_q( 194 | self.mu_old, self.log_to_std(self.rho_old), self.mu, self.log_to_std(self.rho)) 195 | kl_div += self.kl_div_p_q(self.b_mu_old, 196 | self.log_to_std(self.b_rho_old), self.b_mu, self.log_to_std(self.b_rho)) 197 | return kl_div 198 | 199 | def kl_div_new_prior(self): 200 | kl_div = self.kl_div_p_q( 201 | self.mu, self.log_to_std(self.rho), 0., self.prior_sd) 202 | kl_div += self.kl_div_p_q(self.b_mu, 203 | self.log_to_std(self.b_rho), 0., self.prior_sd) 204 | return kl_div 205 | 206 | def kl_div_old_prior(self): 207 | kl_div = self.kl_div_p_q( 208 | self.mu_old, self.log_to_std(self.rho_old), 0., self.prior_sd) 209 | kl_div += self.kl_div_p_q(self.b_mu_old, 210 | self.log_to_std(self.b_rho_old), 0., self.prior_sd) 211 | return kl_div 212 | 213 | def kl_div_prior_new(self): 214 | kl_div = self.kl_div_p_q( 215 | 0., self.prior_sd, self.mu, self.log_to_std(self.rho)) 216 | kl_div += self.kl_div_p_q(0., self.prior_sd, 217 | self.b_mu, self.log_to_std(self.b_rho)) 218 | return kl_div 219 | 220 | def get_output_for(self, input, **kwargs): 221 | if USE_REPARAMETRIZATION_TRICK: 222 | return self.get_output_for_reparametrization(input, **kwargs) 223 | else: 224 | return self.get_output_for_default(input, **kwargs) 225 | 226 | def get_output_for_default(self, input, **kwargs): 227 | if input.ndim > 2: 228 | # if the input has more than two dimensions, flatten it into a 229 | # batch of feature vectors. 
230 | input = input.flatten(2) 231 | 232 | activation = T.dot(input, self.get_W()) + \ 233 | self.get_b().dimshuffle('x', 0) 234 | 235 | return self.nonlinearity(activation) 236 | 237 | def get_output_shape_for(self, input_shape): 238 | return (input_shape[0], self.num_units) 239 | 240 | 241 | class BNN(LasagnePowered, Serializable): 242 | """Bayesian neural network (BNN) based on Blundell2016.""" 243 | 244 | def __init__(self, n_in, 245 | n_hidden, 246 | n_out, 247 | layers_type, 248 | n_batches, 249 | trans_func=lasagne.nonlinearities.rectify, 250 | out_func=lasagne.nonlinearities.linear, 251 | batch_size=100, 252 | n_samples=10, 253 | prior_sd=0.5, 254 | use_reverse_kl_reg=False, 255 | reverse_kl_reg_factor=0.1, 256 | likelihood_sd=5.0, 257 | second_order_update=False, 258 | learning_rate=0.0001, 259 | compression=False, 260 | information_gain=True, 261 | ): 262 | 263 | Serializable.quick_init(self, locals()) 264 | assert len(layers_type) == len(n_hidden) + 1 265 | 266 | self.n_in = n_in 267 | self.n_hidden = n_hidden 268 | self.n_out = n_out 269 | self.batch_size = batch_size 270 | self.transf = trans_func 271 | self.outf = out_func 272 | self.n_samples = n_samples 273 | self.prior_sd = prior_sd 274 | self.layers_type = layers_type 275 | self.n_batches = n_batches 276 | self.use_reverse_kl_reg = use_reverse_kl_reg 277 | self.reverse_kl_reg_factor = reverse_kl_reg_factor 278 | self.likelihood_sd = likelihood_sd 279 | self.second_order_update = second_order_update 280 | self.learning_rate = learning_rate 281 | self.compression = compression 282 | self.information_gain = information_gain 283 | 284 | assert self.information_gain or self.compression 285 | 286 | # Build network architecture. 287 | self.build_network() 288 | 289 | # Build model might depend on this. 290 | LasagnePowered.__init__(self, [self.network]) 291 | 292 | # Compile theano functions. 293 | self.build_model() 294 | 295 | def save_old_params(self): 296 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 297 | lasagne.layers.get_all_layers(self.network)[1:]) 298 | for layer in layers: 299 | layer.save_old_params() 300 | 301 | def reset_to_old_params(self): 302 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 303 | lasagne.layers.get_all_layers(self.network)[1:]) 304 | for layer in layers: 305 | layer.reset_to_old_params() 306 | 307 | def compression_improvement(self): 308 | """KL divergence KL[old_param||new_param]""" 309 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 310 | lasagne.layers.get_all_layers(self.network)[1:]) 311 | return sum(l.kl_div_old_new() for l in layers) 312 | 313 | def inf_gain(self): 314 | """KL divergence KL[new_param||old_param]""" 315 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 316 | lasagne.layers.get_all_layers(self.network)[1:]) 317 | return sum(l.kl_div_new_old() for l in layers) 318 | 319 | def surprise(self): 320 | surpr = 0. 
321 | if self.compression: 322 | surpr += self.compression_improvement() 323 | if self.information_gain: 324 | surpr += self.inf_gain() 325 | return surpr 326 | 327 | def kl_div(self): 328 | """KL divergence KL[new_param||old_param]""" 329 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 330 | lasagne.layers.get_all_layers(self.network)[1:]) 331 | return sum(l.kl_div_new_old() for l in layers) 332 | 333 | def log_p_w_q_w_kl(self): 334 | """KL divergence KL[q_\phi(w)||p(w)]""" 335 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 336 | lasagne.layers.get_all_layers(self.network)[1:]) 337 | return sum(l.kl_div_new_prior() for l in layers) 338 | 339 | def reverse_log_p_w_q_w_kl(self): 340 | """KL divergence KL[p(w)||q_\phi(w)]""" 341 | layers = filter(lambda l: l.name == BNN_LAYER_TAG, 342 | lasagne.layers.get_all_layers(self.network)[1:]) 343 | return sum(l.kl_div_prior_new() for l in layers) 344 | 345 | def _log_prob_normal(self, input, mu=0., sigma=1.): 346 | log_normal = - \ 347 | T.log(sigma) - T.log(T.sqrt(2 * np.pi)) - \ 348 | T.square(input - mu) / (2 * T.square(sigma)) 349 | return T.sum(log_normal) 350 | 351 | def pred_sym(self, input): 352 | return lasagne.layers.get_output(self.network, input) 353 | 354 | def loss(self, input, target): 355 | 356 | # MC samples. 357 | _log_p_D_given_w = [] 358 | for _ in xrange(self.n_samples): 359 | # Make prediction. 360 | prediction = self.pred_sym(input) 361 | # Calculate model likelihood log(P(D|w)). 362 | _log_p_D_given_w.append(self._log_prob_normal( 363 | target, prediction, self.likelihood_sd)) 364 | log_p_D_given_w = sum(_log_p_D_given_w) 365 | # Calculate variational posterior log(q(w)) and prior log(p(w)). 366 | kl = self.log_p_w_q_w_kl() 367 | if self.use_reverse_kl_reg: 368 | kl += self.reverse_kl_reg_factor * \ 369 | self.reverse_log_p_w_q_w_kl() 370 | 371 | # Calculate loss function. 372 | return kl / self.n_batches - log_p_D_given_w / self.n_samples 373 | 374 | def loss_last_sample(self, input, target): 375 | """The difference with the original loss is that we only update based on the latest sample. 376 | This means that instead of using the prior p(w), we use the previous approximated posterior 377 | q(w) for the KL term in the objective function: KL[q(w)|p(w)] becomems KL[q'(w)|q(w)]. 378 | """ 379 | 380 | # MC samples. 381 | _log_p_D_given_w = [] 382 | for _ in xrange(self.n_samples): 383 | # Make prediction. 384 | prediction = self.pred_sym(input) 385 | # Calculate model likelihood log(P(sample|w)). 386 | _log_p_D_given_w.append(self._log_prob_normal( 387 | target, prediction, self.likelihood_sd)) 388 | log_p_D_given_w = sum(_log_p_D_given_w) 389 | # Calculate loss function. 390 | # self.kl_div() should be zero when taking second order step 391 | return self.kl_div() - log_p_D_given_w / self.n_samples 392 | 393 | def build_network(self): 394 | 395 | # Input layer 396 | network = lasagne.layers.InputLayer(shape=(1, self.n_in)) 397 | 398 | # Hidden layers 399 | for i in xrange(len(self.n_hidden)): 400 | # Probabilistic layer (1) or deterministic layer (0). 401 | if self.layers_type[i] == 1: 402 | network = BNNLayer( 403 | network, self.n_hidden[i], nonlinearity=self.transf, prior_sd=self.prior_sd, name=BNN_LAYER_TAG) 404 | else: 405 | network = lasagne.layers.DenseLayer( 406 | network, self.n_hidden[i], nonlinearity=self.transf) 407 | 408 | # Output layer 409 | if self.layers_type[len(self.n_hidden)] == 1: 410 | # Probabilistic layer (1) or deterministic layer (0). 
411 | network = BNNLayer( 412 | network, self.n_out, nonlinearity=self.outf, prior_sd=self.prior_sd, name=BNN_LAYER_TAG) 413 | else: 414 | network = lasagne.layers.DenseLayer( 415 | network, self.n_out, nonlinearity=self.outf) 416 | 417 | self.network = network 418 | 419 | def build_model(self): 420 | 421 | # Prepare Theano variables for inputs and targets 422 | # Same input for classification as regression. 423 | input_var = T.matrix('inputs', 424 | dtype=theano.config.floatX) # @UndefinedVariable 425 | target_var = T.matrix('targets', 426 | dtype=theano.config.floatX) # @UndefinedVariable 427 | 428 | # Loss function. 429 | loss = self.loss(input_var, target_var) 430 | loss_only_last_sample = self.loss_last_sample(input_var, target_var) 431 | 432 | # Create update methods. 433 | params = lasagne.layers.get_all_params(self.network, trainable=True) 434 | updates = lasagne.updates.adam( 435 | loss, params, learning_rate=self.learning_rate) 436 | 437 | # Train/val fn. 438 | self.pred_fn = ext.compile_function( 439 | [input_var], self.pred_sym(input_var), log_name='pred_fn') 440 | self.train_fn = ext.compile_function( 441 | [input_var, target_var], loss, updates=updates, log_name='train_fn') 442 | 443 | if self.second_order_update: 444 | 445 | oldparams = lasagne.layers.get_all_params( 446 | self.network, oldparam=True) 447 | step_size = T.scalar('step_size', 448 | dtype=theano.config.floatX) # @UndefinedVariable 449 | 450 | def second_order_update(loss_or_grads, params, oldparams, step_size): 451 | """Second-order update method for optimizing loss_last_sample, so basically, 452 | KL term (new params || old params) + NLL of latest sample. The Hessian is 453 | evaluated at the origin and provides curvature information to make a more 454 | informed step in the correct descent direction.""" 455 | grads = T.grad(loss_or_grads, params) 456 | updates = OrderedDict() 457 | for i in xrange(len(params)): 458 | param = params[i] 459 | grad = grads[i] 460 | if param.name == 'mu' or param.name == 'b_mu': 461 | oldparam_rho = oldparams[i + 1] 462 | invH = T.square(T.log(1 + T.exp(oldparam_rho))) 463 | else: 464 | oldparam_rho = oldparams[i] 465 | p = param 466 | 467 | H = 2. * (T.exp(2 * p)) / \ 468 | (1 + T.exp(p))**2 / (T.log(1 + T.exp(p))**2) 469 | invH = 1. / H 470 | updates[param] = param - step_size * invH * grad 471 | 472 | return updates 473 | 474 | def fast_kl_div(loss, params, oldparams, step_size): 475 | 476 | grads = T.grad(loss, params) 477 | 478 | kl_component = [] 479 | for i in xrange(len(params)): 480 | param = params[i] 481 | grad = grads[i] 482 | 483 | if param.name == 'mu' or param.name == 'b_mu': 484 | oldparam_rho = oldparams[i + 1] 485 | invH = T.square(T.log(1 + T.exp(oldparam_rho))) 486 | else: 487 | oldparam_rho = oldparams[i] 488 | p = param 489 | 490 | H = 2. * (T.exp(2 * p)) / \ 491 | (1 + T.exp(p))**2 / (T.log(1 + T.exp(p))**2) 492 | invH = 1. 
/ H 493 | 494 | kl_component.append( 495 | T.sum(T.square(step_size) * T.square(grad) * invH)) 496 | 497 | return sum(kl_component) 498 | 499 | compute_fast_kl_div = fast_kl_div( 500 | loss_only_last_sample, params, oldparams, step_size) 501 | 502 | self.train_update_fn = ext.compile_function( 503 | [input_var, target_var, step_size], compute_fast_kl_div, log_name='f_compute_fast_kl_div') 504 | 505 | # updates_kl = second_order_update( 506 | # loss_only_last_sample, params, oldparams, step_size) 507 | # 508 | # self.train_update_fn = ext.compile_function( 509 | # [input_var, target_var, step_size], loss_only_last_sample, updates=updates_kl, log_name='train_update_fn') 510 | else: 511 | self.train_update_fn = ext.compile_function( 512 | [input_var, target_var], loss_only_last_sample, updates=updates, log_name='train_update_fn') 513 | 514 | # called kl div closed form but should be called surprise 515 | self.f_kl_div_closed_form = ext.compile_function( 516 | [], self.surprise(), log_name='kl_div_fn') 517 | 518 | if __name__ == '__main__': 519 | pass 520 | -------------------------------------------------------------------------------- /envs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/vime/ad6ca976ef07e8c1fd4c0a353716bbd0b52539fb/envs/__init__.py -------------------------------------------------------------------------------- /envs/cartpole_swingup_env_x.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pygame 3 | from rllab.envs.box2d.parser import find_body 4 | 5 | from rllab.core.serializable import Serializable 6 | from rllab.envs.box2d.box2d_env import Box2DEnv 7 | from rllab.misc import autoargs 8 | from rllab.misc.overrides import overrides 9 | 10 | 11 | # Tornio, Matti, and Tapani Raiko. "Variational Bayesian approach for 12 | # nonlinear identification and control." Proc. of the IFAC Workshop on 13 | # Nonlinear Model Predictive Control for Fast Systems, NMPC FS06. 2006. 14 | class CartpoleSwingupEnvX(Box2DEnv, Serializable): 15 | 16 | @autoargs.inherit(Box2DEnv.__init__) 17 | def __init__(self, *args, **kwargs): 18 | super(CartpoleSwingupEnvX, self).__init__( 19 | self.model_path("cartpole.xml.mako"), 20 | *args, **kwargs 21 | ) 22 | self.max_cart_pos = 3 23 | self.max_reward_cart_pos = 3 24 | self.cart = find_body(self.world, "cart") 25 | self.pole = find_body(self.world, "pole") 26 | Serializable.__init__(self, *args, **kwargs) 27 | 28 | @overrides 29 | def reset(self): 30 | self._set_state(self.initial_state) 31 | self._invalidate_state_caches() 32 | bounds = np.array([ 33 | [-1, -2, np.pi - 1, -3], 34 | [1, 2, np.pi + 1, 3], 35 | ]) 36 | low, high = bounds 37 | xpos, xvel, apos, avel = np.random.uniform(low, high) 38 | self.cart.position = (xpos, self.cart.position[1]) 39 | self.cart.linearVelocity = (xvel, self.cart.linearVelocity[1]) 40 | self.pole.angle = apos 41 | self.pole.angularVelocity = avel 42 | return self.get_current_obs() 43 | 44 | @overrides 45 | def compute_reward(self, action): 46 | yield 47 | if self.is_current_done(): 48 | yield 0 49 | else: 50 | if abs(self.cart.position[0]) > self.max_reward_cart_pos: 51 | yield 0 52 | else: 53 | cs = np.cos(self.pole.angle) 54 | if cs > 0.8: 55 | rew = 1.0 56 | else: 57 | rew = 0. 
58 | yield rew 59 | 60 | @overrides 61 | def is_current_done(self): 62 | return abs(self.cart.position[0]) > self.max_cart_pos 63 | 64 | @overrides 65 | def action_from_keys(self, keys): 66 | if keys[pygame.K_LEFT]: 67 | return np.asarray([-10]) 68 | elif keys[pygame.K_RIGHT]: 69 | return np.asarray([+10]) 70 | else: 71 | return np.asarray([0]) 72 | -------------------------------------------------------------------------------- /envs/double_pendulum_env_x.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rllab.envs.box2d.parser import find_body 3 | 4 | from rllab.core.serializable import Serializable 5 | from rllab.envs.box2d.box2d_env import Box2DEnv 6 | from rllab.misc import autoargs 7 | from rllab.misc.overrides import overrides 8 | 9 | 10 | # http://mlg.eng.cam.ac.uk/pilco/ 11 | class DoublePendulumEnvX(Box2DEnv, Serializable): 12 | 13 | @autoargs.inherit(Box2DEnv.__init__) 14 | def __init__(self, *args, **kwargs): 15 | # make sure mdp-level step is 100ms long 16 | kwargs["frame_skip"] = kwargs.get("frame_skip", 2) 17 | if kwargs.get("template_args", {}).get("noise", False): 18 | self.link_len = (np.random.rand()-0.5) + 1 19 | else: 20 | self.link_len = 1 21 | kwargs["template_args"] = kwargs.get("template_args", {}) 22 | kwargs["template_args"]["link_len"] = self.link_len 23 | super(DoublePendulumEnvX, self).__init__( 24 | self.model_path("double_pendulum.xml.mako"), 25 | *args, **kwargs 26 | ) 27 | self.link1 = find_body(self.world, "link1") 28 | self.link2 = find_body(self.world, "link2") 29 | Serializable.__init__(self, *args, **kwargs) 30 | 31 | @overrides 32 | def reset(self): 33 | self._set_state(self.initial_state) 34 | self._invalidate_state_caches() 35 | stds = np.array([0.1, 0.1, 0.01, 0.01]) 36 | pos1, pos2, v1, v2 = np.random.randn(*stds.shape) * stds 37 | self.link1.angle = pos1 38 | self.link2.angle = pos2 39 | self.link1.angularVelocity = v1 40 | self.link2.angularVelocity = v2 41 | return self.get_current_obs() 42 | 43 | def get_tip_pos(self): 44 | cur_center_pos = self.link2.position 45 | cur_angle = self.link2.angle 46 | cur_pos = ( 47 | cur_center_pos[0] - self.link_len*np.sin(cur_angle), 48 | cur_center_pos[1] - self.link_len*np.cos(cur_angle) 49 | ) 50 | return cur_pos 51 | 52 | @overrides 53 | def compute_reward(self, action): 54 | yield 55 | tgt_pos = np.asarray([0, self.link_len * 2]) 56 | cur_pos = self.get_tip_pos() 57 | dist = np.linalg.norm(cur_pos - tgt_pos) 58 | if dist < 1: 59 | rew = 1. 60 | else: 61 | rew = 0. 
62 | yield rew 63 | 64 | def is_current_done(self): 65 | return False 66 | 67 | -------------------------------------------------------------------------------- /envs/half_cheetah_env_x.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.envs.base import Step 5 | from rllab.envs.mujoco.mujoco_env import MujocoEnv 6 | from rllab.misc import logger 7 | from rllab.misc.overrides import overrides 8 | 9 | 10 | def smooth_abs(x, param): 11 | return np.sqrt(np.square(x) + np.square(param)) - param 12 | 13 | 14 | class HalfCheetahEnvX(MujocoEnv, Serializable): 15 | 16 | FILE = 'half_cheetah.xml' 17 | 18 | def __init__(self, *args, **kwargs): 19 | super(HalfCheetahEnvX, self).__init__(*args, **kwargs) 20 | Serializable.__init__(self, *args, **kwargs) 21 | 22 | def get_current_obs(self): 23 | return np.concatenate([ 24 | self.model.data.qpos.flatten()[1:], 25 | self.model.data.qvel.flat, 26 | self.get_body_com("torso").flat, 27 | ]) 28 | 29 | def get_body_xmat(self, body_name): 30 | idx = self.model.body_names.index(body_name) 31 | return self.model.data.xmat[idx].reshape((3, 3)) 32 | 33 | def get_body_com(self, body_name): 34 | idx = self.model.body_names.index(body_name) 35 | return self.model.data.com_subtree[idx] 36 | 37 | def step(self, action): 38 | self.forward_dynamics(action) 39 | next_obs = self.get_current_obs() 40 | action = np.clip(action, *self.action_bounds) 41 | ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action)) 42 | run_cost = -1 * self.get_body_comvel("torso")[0] 43 | cost = ctrl_cost + run_cost 44 | reward = -cost 45 | done = False 46 | if self.get_body_com("torso")[0] <= 5.0: 47 | reward = 0. 48 | else: 49 | reward = 1.0 50 | return Step(next_obs, reward, done) 51 | 52 | @overrides 53 | def log_diagnostics(self, paths): 54 | progs = [ 55 | path["observations"][-1][-3] - path["observations"][0][-3] 56 | for path in paths 57 | ] 58 | logger.record_tabular('AverageForwardProgress', np.mean(progs)) 59 | logger.record_tabular('MaxForwardProgress', np.max(progs)) 60 | logger.record_tabular('MinForwardProgress', np.min(progs)) 61 | logger.record_tabular('StdForwardProgress', np.std(progs)) 62 | -------------------------------------------------------------------------------- /envs/mountain_car_env_x.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pygame 3 | from rllab.envs.box2d.parser import find_body 4 | 5 | from rllab.core.serializable import Serializable 6 | from rllab.envs.box2d.box2d_env import Box2DEnv 7 | from rllab.misc import autoargs 8 | from rllab.misc.overrides import overrides 9 | 10 | 11 | class MountainCarEnvX(Box2DEnv, Serializable): 12 | 13 | @autoargs.inherit(Box2DEnv.__init__) 14 | @autoargs.arg("height_bonus_coeff", type=float, 15 | help="Height bonus added to each step's reward") 16 | @autoargs.arg("goal_cart_pos", type=float, 17 | help="Goal horizontal position") 18 | def __init__(self, 19 | height_bonus=1., 20 | goal_cart_pos=0.6, 21 | *args, **kwargs): 22 | super(MountainCarEnvX, self).__init__( 23 | self.model_path("mountain_car.xml.mako"), 24 | *args, **kwargs 25 | ) 26 | self.max_cart_pos = 2 27 | self.goal_cart_pos = goal_cart_pos 28 | self.height_bonus = height_bonus 29 | self.cart = find_body(self.world, "cart") 30 | Serializable.quick_init(self, locals()) 31 | 32 | @overrides 33 | def compute_reward(self, action): 34 | yield 35 | yield self.is_current_done() 36 
| 37 | @overrides 38 | def is_current_done(self): 39 | return self.cart.position[0] >= self.goal_cart_pos \ 40 | or abs(self.cart.position[0]) >= self.max_cart_pos 41 | 42 | @overrides 43 | def reset(self): 44 | self._set_state(self.initial_state) 45 | self._invalidate_state_caches() 46 | bounds = np.array([ 47 | [-1], 48 | [1], 49 | ]) 50 | low, high = bounds 51 | xvel = np.random.uniform(low, high) 52 | self.cart.linearVelocity = (xvel, self.cart.linearVelocity[1]) 53 | return self.get_current_obs() 54 | 55 | @overrides 56 | def action_from_keys(self, keys): 57 | if keys[pygame.K_LEFT]: 58 | return np.asarray([-1]) 59 | elif keys[pygame.K_RIGHT]: 60 | return np.asarray([+1]) 61 | else: 62 | return np.asarray([0]) 63 | 64 | -------------------------------------------------------------------------------- /experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/vime/ad6ca976ef07e8c1fd4c0a353716bbd0b52539fb/experiments/__init__.py -------------------------------------------------------------------------------- /experiments/run_experiment_lite.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import sys 4 | 5 | sys.path.append(".") 6 | 7 | from rllab.misc.ext import is_iterable, set_seed 8 | from rllab.misc.instrument import concretize 9 | from rllab import config 10 | import rllab.misc.logger as logger 11 | import argparse 12 | import os.path as osp 13 | import datetime 14 | import dateutil.tz 15 | import ast 16 | import uuid 17 | import cPickle as pickle 18 | import base64 19 | 20 | 21 | def run_experiment(argv): 22 | 23 | default_log_dir = config.LOG_DIR 24 | now = datetime.datetime.now(dateutil.tz.tzlocal()) 25 | 26 | # avoid name clashes when running distributed jobs 27 | rand_id = str(uuid.uuid4())[:5] 28 | timestamp = now.strftime('%Y_%m_%d_%H_%M_%S_%f_%Z') 29 | 30 | default_exp_name = 'experiment_%s_%s' % (timestamp, rand_id) 31 | parser = argparse.ArgumentParser() 32 | parser.add_argument('--n_parallel', type=int, default=1, 33 | help='Number of parallel workers to perform rollouts.') 34 | parser.add_argument( 35 | '--exp_name', type=str, default=default_exp_name, help='Name of the experiment.') 36 | parser.add_argument('--log_dir', type=str, default=default_log_dir, 37 | help='Path to save the log and iteration snapshot.') 38 | parser.add_argument('--snapshot_mode', type=str, default='all', 39 | help='Mode to save the snapshot. 
Can be either "all" ' 40 | '(all iterations will be saved), "last" (only ' 41 | 'the last iteration will be saved), or "none" ' 42 | '(do not save snapshots)') 43 | parser.add_argument('--tabular_log_file', type=str, default='progress.csv', 44 | help='Name of the tabular log file (in csv).') 45 | parser.add_argument('--text_log_file', type=str, default='debug.log', 46 | help='Name of the text log file (in pure text).') 47 | parser.add_argument('--params_log_file', type=str, default='params.json', 48 | help='Name of the parameter log file (in json).') 49 | parser.add_argument('--plot', type=ast.literal_eval, default=False, 50 | help='Whether to plot the iteration results') 51 | parser.add_argument('--log_tabular_only', type=ast.literal_eval, default=False, 52 | help='Whether to only print the tabular log information (in a horizontal format)') 53 | parser.add_argument('--seed', type=int, 54 | help='Random seed for numpy') 55 | parser.add_argument('--args_data', type=str, 56 | help='Pickled data for stub objects') 57 | 58 | args = parser.parse_args(argv[1:]) 59 | 60 | from sandbox.vime.sampler import parallel_sampler_expl as parallel_sampler 61 | parallel_sampler.initialize(n_parallel=args.n_parallel) 62 | 63 | if args.seed is not None: 64 | set_seed(args.seed) 65 | parallel_sampler.set_seed(args.seed) 66 | 67 | if args.plot: 68 | from rllab.plotter import plotter 69 | plotter.init_worker() 70 | 71 | # read from stdin 72 | data = pickle.loads(base64.b64decode(args.args_data)) 73 | 74 | log_dir = args.log_dir 75 | # exp_dir = osp.join(log_dir, args.exp_name) 76 | tabular_log_file = osp.join(log_dir, args.tabular_log_file) 77 | text_log_file = osp.join(log_dir, args.text_log_file) 78 | params_log_file = osp.join(log_dir, args.params_log_file) 79 | 80 | logger.log_parameters_lite(params_log_file, args) 81 | logger.add_text_output(text_log_file) 82 | logger.add_tabular_output(tabular_log_file) 83 | prev_snapshot_dir = logger.get_snapshot_dir() 84 | prev_mode = logger.get_snapshot_mode() 85 | logger.set_snapshot_dir(log_dir) 86 | logger.set_snapshot_mode(args.snapshot_mode) 87 | logger.set_log_tabular_only(args.log_tabular_only) 88 | logger.push_prefix("[%s] " % args.exp_name) 89 | 90 | maybe_iter = concretize(data) 91 | if is_iterable(maybe_iter): 92 | for _ in maybe_iter: 93 | pass 94 | 95 | logger.set_snapshot_mode(prev_mode) 96 | logger.set_snapshot_dir(prev_snapshot_dir) 97 | logger.remove_tabular_output(tabular_log_file) 98 | logger.remove_text_output(text_log_file) 99 | logger.pop_prefix() 100 | 101 | 102 | if __name__ == "__main__": 103 | run_experiment(sys.argv) 104 | -------------------------------------------------------------------------------- /experiments/run_trpo.py: -------------------------------------------------------------------------------- 1 | import os 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.mujoco.gather.swimmer_gather_env import SwimmerGatherEnv 4 | os.environ["THEANO_FLAGS"] = "device=cpu" 5 | 6 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | from rllab.envs.normalized_env import NormalizedEnv 8 | 9 | from rllab.algos.trpo import TRPO 10 | from rllab.misc.instrument import stub, run_experiment_lite 11 | import itertools 12 | 13 | stub(globals()) 14 | 15 | # Param ranges 16 | seeds = range(2) 17 | # SwimmerGather hierarchical task 18 | mdp_classes = [SwimmerGatherEnv] 19 | mdps = [NormalizedEnv(env=mdp_class()) 20 | for mdp_class in mdp_classes] 21 | param_cart_product = itertools.product( 22 
| mdps, seeds 23 | ) 24 | 25 | for mdp, seed in param_cart_product: 26 | 27 | policy = GaussianMLPPolicy( 28 | env_spec=mdp.spec, 29 | hidden_sizes=(64, 32), 30 | ) 31 | 32 | baseline = LinearFeatureBaseline( 33 | mdp.spec, 34 | ) 35 | 36 | batch_size = 50000 37 | algo = TRPO( 38 | env=mdp, 39 | policy=policy, 40 | baseline=baseline, 41 | batch_size=batch_size, 42 | whole_paths=True, 43 | max_path_length=500, 44 | n_itr=10000, 45 | step_size=0.01, 46 | subsample_factor=1.0, 47 | ) 48 | 49 | run_experiment_lite( 50 | algo.train(), 51 | exp_prefix="trpo", 52 | n_parallel=4, 53 | snapshot_mode="last", 54 | seed=seed, 55 | mode="local" 56 | ) 57 | -------------------------------------------------------------------------------- /experiments/run_trpo_expl.py: -------------------------------------------------------------------------------- 1 | import os 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.mujoco.gather.swimmer_gather_env import SwimmerGatherEnv 4 | os.environ["THEANO_FLAGS"] = "device=cpu" 5 | 6 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | from rllab.envs.normalized_env import NormalizedEnv 8 | 9 | from sandbox.vime.algos.trpo_expl import TRPO 10 | from rllab.misc.instrument import stub, run_experiment_lite 11 | import itertools 12 | 13 | stub(globals()) 14 | 15 | # Param ranges 16 | seeds = range(2) 17 | etas = [0.0001] 18 | # SwimmerGather hierarchical task 19 | mdp_classes = [SwimmerGatherEnv] 20 | mdps = [NormalizedEnv(env=mdp_class()) 21 | for mdp_class in mdp_classes] 22 | 23 | param_cart_product = itertools.product( 24 | mdps, etas, seeds 25 | ) 26 | 27 | for mdp, eta, seed in param_cart_product: 28 | 29 | policy = GaussianMLPPolicy( 30 | env_spec=mdp.spec, 31 | hidden_sizes=(64, 32), 32 | ) 33 | 34 | baseline = LinearFeatureBaseline( 35 | mdp.spec, 36 | ) 37 | 38 | batch_size = 50000 39 | algo = TRPO( 40 | env=mdp, 41 | policy=policy, 42 | baseline=baseline, 43 | batch_size=batch_size, 44 | whole_paths=True, 45 | max_path_length=500, 46 | n_itr=10000, 47 | step_size=0.01, 48 | eta=eta, 49 | snn_n_samples=10, 50 | subsample_factor=1.0, 51 | use_replay_pool=True, 52 | use_kl_ratio=True, 53 | use_kl_ratio_q=True, 54 | n_itr_update=1, 55 | kl_batch_size=1, 56 | normalize_reward=False, 57 | replay_pool_size=1000000, 58 | n_updates_per_sample=5000, 59 | second_order_update=True, 60 | unn_n_hidden=[32], 61 | unn_layers_type=[1, 1], 62 | unn_learning_rate=0.0001 63 | ) 64 | 65 | run_experiment_lite( 66 | algo.train(), 67 | exp_prefix="trpo-expl", 68 | n_parallel=4, 69 | snapshot_mode="last", 70 | seed=seed, 71 | mode="local", 72 | script="sandbox/vime/experiments/run_experiment_lite.py", 73 | ) 74 | -------------------------------------------------------------------------------- /sampler/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/openai/vime/ad6ca976ef07e8c1fd4c0a353716bbd0b52539fb/sampler/__init__.py -------------------------------------------------------------------------------- /sampler/parallel_sampler_expl.py: -------------------------------------------------------------------------------- 1 | from rllab.sampler.utils import rollout 2 | from rllab.sampler.stateful_pool import singleton_pool 3 | from rllab.misc import ext 4 | from rllab.misc import logger 5 | from rllab.misc import tensor_utils 6 | import numpy as np 7 | 8 | 9 | def _worker_init(G, id): 10 | import os 11 | os.environ['THEANO_FLAGS'] = 'device=cpu' 12 | 
G.worker_id = id 13 | 14 | 15 | def initialize(n_parallel): 16 | singleton_pool.initialize(n_parallel) 17 | singleton_pool.run_each( 18 | _worker_init, [(id,) for id in xrange(singleton_pool.n_parallel)]) 19 | 20 | 21 | def _worker_populate_task(G, env, policy, dynamics): 22 | G.env = env 23 | G.policy = policy 24 | G.dynamics = dynamics 25 | 26 | 27 | def populate_task(env, policy, dynamics): 28 | logger.log("Populating workers...") 29 | singleton_pool.run_each( 30 | _worker_populate_task, 31 | [(env, policy, dynamics)] * singleton_pool.n_parallel 32 | ) 33 | logger.log("Populated") 34 | 35 | 36 | def _worker_set_seed(_, seed): 37 | ext.set_seed(seed) 38 | 39 | 40 | def set_seed(seed): 41 | singleton_pool.run_each( 42 | _worker_set_seed, 43 | [(seed + i,) for i in xrange(singleton_pool.n_parallel)] 44 | ) 45 | 46 | 47 | def _worker_set_policy_params(G, params): 48 | G.policy.set_param_values(params) 49 | 50 | 51 | def _worker_set_dynamics_params(G, params): 52 | G.dynamics.set_param_values(params) 53 | 54 | 55 | def _worker_collect_one_path(G, max_path_length, itr, normalize_reward, 56 | reward_mean, reward_std, kl_batch_size, n_itr_update, use_replay_pool, 57 | obs_mean, obs_std, act_mean, act_std, second_order_update): 58 | # Path rollout. 59 | path = rollout(G.env, G.policy, max_path_length) 60 | 61 | # Computing intrinsic rewards. 62 | # ---------------------------- 63 | # Save original reward. 64 | path['rewards_orig'] = np.array(path['rewards']) 65 | 66 | if itr > 0: 67 | # Iterate over all paths and compute intrinsic reward by updating the 68 | # model on each observation, calculating the KL divergence of the new 69 | # params to the old ones, and undoing this operation. 70 | obs = (path['observations'] - obs_mean) / (obs_std + 1e-8) 71 | act = (path['actions'] - act_mean) / (act_std + 1e-8) 72 | rew = path['rewards'] 73 | # inputs = (o,a), target = o' 74 | obs_nxt = np.vstack([obs[1:]]) 75 | _inputs = np.hstack([obs[:-1], act[:-1]]) 76 | _targets = obs_nxt 77 | # KL vector assumes same shape as reward. 78 | kl = np.zeros(rew.shape) 79 | for j in xrange(int(np.ceil(obs.shape[0] / float(kl_batch_size)))): 80 | 81 | # Save old params for every update. 82 | G.dynamics.save_old_params() 83 | 84 | start = j * kl_batch_size 85 | end = np.minimum( 86 | (j + 1) * kl_batch_size, obs.shape[0] - 1) 87 | 88 | if second_order_update: 89 | # We do a line search over the best step sizes using 90 | # step_size * invH * grad 91 | # best_loss_value = np.inf 92 | for step_size in [0.01]: 93 | G.dynamics.save_old_params() 94 | loss_value = G.dynamics.train_update_fn( 95 | _inputs[start:end], _targets[start:end], step_size) 96 | kl_div = np.clip(loss_value, 0, 1000) 97 | # If using replay pool, undo updates. 98 | if use_replay_pool: 99 | G.dynamics.reset_to_old_params() 100 | else: 101 | # Update model weights based on current minibatch. 102 | for _ in xrange(n_itr_update): 103 | G.dynamics.train_update_fn( 104 | _inputs[start:end], _targets[start:end]) 105 | # Calculate current minibatch KL. 106 | kl_div = np.clip( 107 | float(G.dynamics.f_kl_div_closed_form()), 0, 1000) 108 | 109 | for k in xrange(start, end): 110 | kl[k] = kl_div 111 | # If using replay pool, undo updates. 112 | if use_replay_pool: 113 | G.dynamics.reset_to_old_params() 114 | 115 | # Last element in KL vector needs to be replaced by second last one 116 | # because the actual last observation has no next observation. 
117 | kl[-1] = kl[-2] 118 | 119 | # Stuff it in path 120 | path['KL'] = kl 121 | # ---------------------------- 122 | 123 | return path, len(path["rewards"]) 124 | 125 | 126 | def sample_paths( 127 | policy_params, 128 | dynamics_params, 129 | max_samples, 130 | max_path_length=np.inf, 131 | itr=None, 132 | normalize_reward=None, 133 | reward_mean=None, 134 | reward_std=None, 135 | kl_batch_size=None, 136 | n_itr_update=None, 137 | use_replay_pool=None, 138 | obs_mean=None, 139 | obs_std=None, 140 | act_mean=None, 141 | act_std=None, 142 | second_order_update=None 143 | ): 144 | """ 145 | :param policy_params: parameters for the policy. This will be updated on each worker process 146 | :param max_samples: desired maximum number of samples to be collected. The actual number of collected samples 147 | might be greater since all trajectories will be rolled out either until termination or until max_path_length is 148 | reached 149 | :param max_path_length: horizon / maximum length of a single trajectory 150 | :return: a list of collected paths 151 | """ 152 | singleton_pool.run_each( 153 | _worker_set_policy_params, 154 | [(policy_params,)] * singleton_pool.n_parallel 155 | ) 156 | 157 | # Set dynamics params. 158 | # -------------------- 159 | singleton_pool.run_each( 160 | _worker_set_dynamics_params, 161 | [(dynamics_params,)] * singleton_pool.n_parallel 162 | ) 163 | # -------------------- 164 | return singleton_pool.run_collect( 165 | _worker_collect_one_path, 166 | threshold=max_samples, 167 | args=(max_path_length, itr, normalize_reward, reward_mean, 168 | reward_std, kl_batch_size, n_itr_update, use_replay_pool, obs_mean, obs_std, act_mean, act_std, second_order_update), 169 | show_prog_bar=True 170 | ) 171 | 172 | 173 | def truncate_paths(paths, max_samples): 174 | """ 175 | Truncate the list of paths so that the total number of samples is exactly equal to max_samples. This is done by 176 | removing extra paths at the end of the list, and make the last path shorter if necessary 177 | :param paths: a list of paths 178 | :param max_samples: the absolute maximum number of samples 179 | :return: a list of paths, truncated so that the number of samples adds up to max-samples 180 | """ 181 | # chop samples collected by extra paths 182 | # make a copy 183 | paths = list(paths) 184 | total_n_samples = sum(len(path["rewards"]) for path in paths) 185 | while len(paths) > 0 and total_n_samples - len(paths[-1]["rewards"]) >= max_samples: 186 | total_n_samples -= len(paths.pop(-1)["rewards"]) 187 | if len(paths) > 0: 188 | last_path = paths.pop(-1) 189 | truncated_last_path = dict() 190 | truncated_len = len( 191 | last_path["rewards"]) - (total_n_samples - max_samples) 192 | for k, v in last_path.iteritems(): 193 | if k in ["observations", "actions", "rewards"]: 194 | truncated_last_path[k] = tensor_utils.truncate_tensor_list( 195 | v, truncated_len) 196 | elif k in ["env_infos", "agent_infos"]: 197 | truncated_last_path[k] = tensor_utils.truncate_tensor_dict( 198 | v, truncated_len) 199 | else: 200 | raise NotImplementedError 201 | paths.append(truncated_last_path) 202 | return paths 203 | --------------------------------------------------------------------------------
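The sampler above stores each step's information gain in `path['KL']` and preserves the untouched external reward in `path['rewards_orig']`; the two are combined into the reward actually used for the policy update inside the training loop (`algos/batch_polopt_expl.py`). The sketch below is illustrative only: the helper name `add_intrinsic_reward`, the `kl_history` argument, and the exact normalization are assumptions, chosen to mirror the `eta`, `use_kl_ratio`, and `use_kl_ratio_q` settings passed in `run_trpo_expl.py`.

```python
# Illustrative sketch, not repository code: fold the per-step surprise stored in
# path['KL'] into the external reward before the policy update. The helper name,
# the kl_history argument, and the normalization details are assumptions.
from collections import deque

import numpy as np


def add_intrinsic_reward(paths, kl_history, eta=1e-4, use_kl_ratio=True):
    """Augment each path's rewards with an eta-weighted information-gain bonus."""
    if not all('KL' in p for p in paths):
        # On the very first iteration the sampler has not attached 'KL' yet
        # (it only does so for itr > 0); leave the rewards untouched then.
        return paths
    if use_kl_ratio:
        # Normalize by a running average of previous batches' median KL so the
        # bonus keeps a comparable scale as the dynamics model converges.
        kl_history.append(np.median(np.concatenate([p['KL'] for p in paths])))
        kl_scale = np.mean(kl_history)
    else:
        kl_scale = 1.0
    for p in paths:
        p['rewards'] = p['rewards_orig'] + eta * p['KL'] / (kl_scale + 1e-8)
    return paths


# Hypothetical usage inside a training loop:
# kl_history = deque(maxlen=10)
# paths = add_intrinsic_reward(paths, kl_history, eta=1e-4)
```

Normalizing by previous batches' median KL, rather than using the raw KL directly, is what keeps a single `eta` usable throughout training, since the raw surprise shrinks as the Bayesian dynamics model becomes more certain.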