├── rllab
├── __init__.py
├── misc
│ ├── meta.py
│ ├── __init__.py
│ ├── mako_utils.py
│ └── resolve.py
├── algos
│ ├── __init__.py
│ ├── base.py
│ ├── nop.py
│ ├── trpo.py
│ ├── ppo.py
│ ├── tnpg.py
│ └── erwr.py
├── core
│ ├── __init__.py
│ ├── lasagne_powered.py
│ └── serializable.py
├── envs
│ ├── __init__.py
│ ├── box2d
│ │ ├── __init__.py
│ │ ├── parser
│ │ │ └── __init__.py
│ │ ├── models
│ │ │ ├── mountain_car.xml.mako
│ │ │ ├── double_pendulum.xml.mako
│ │ │ ├── cartpole.xml.mako
│ │ │ ├── car_parking.xml
│ │ │ └── car_parking.xml.rb
│ │ ├── cartpole_env.py
│ │ ├── mountain_car_env.py
│ │ ├── double_pendulum_env.py
│ │ └── cartpole_swingup_env.py
│ ├── mujoco
│ │ ├── __init__.py
│ │ ├── gather
│ │ │ ├── __init__.py
│ │ │ ├── point_gather_env.py
│ │ │ ├── swimmer_gather_env.py
│ │ │ └── ant_gather_env.py
│ │ ├── maze
│ │ │ ├── __init__.py
│ │ │ ├── point_maze_env.py
│ │ │ ├── swimmer_maze_env.py
│ │ │ └── ant_maze_env.py
│ │ ├── humanoid_env.py
│ │ ├── point_env.py
│ │ ├── ant_env.py
│ │ ├── half_cheetah_env.py
│ │ ├── walker2d_env.py
│ │ ├── inverted_double_pendulum_env.py
│ │ ├── swimmer_env.py
│ │ └── swimmer_randgoal_env.py
│ ├── env_spec.py
│ ├── identification_env.py
│ ├── proxy_env.py
│ └── sliding_mem_env.py
├── baselines
│ ├── __init__.py
│ ├── zero_baseline.py
│ ├── base.py
│ ├── linear_feature_baseline.py
│ ├── gaussian_conv_baseline.py
│ └── gaussian_mlp_baseline.py
├── optimizers
│ ├── __init__.py
│ └── minibatch_dataset.py
├── policies
│ ├── __init__.py
│ ├── uniform_control_policy.py
│ └── base.py
├── sampler
│ ├── __init__.py
│ └── utils.py
├── distributions
│ ├── __init__.py
│ ├── recurrent_diagonal_gaussian.py
│ ├── delta.py
│ ├── base.py
│ └── bernoulli.py
├── q_functions
│ ├── __init__.py
│ └── base.py
├── exploration_strategies
│ ├── __init__.py
│ ├── base.py
│ ├── gaussian_strategy.py
│ └── ou_strategy.py
├── plotter
│ ├── __init__.py
│ └── plotter.py
├── mujoco_py
│ ├── .rvmrc
│ ├── Gemfile
│ ├── mjconstants.py
│ ├── mjextra.py
│ ├── __init__.py
│ ├── gen_binding.sh
│ └── Gemfile.lock
├── regressors
│ ├── __init__.py
│ └── product_regressor.py
├── viskit
│ └── __init__.py
├── spaces
│ ├── __init__.py
│ ├── base.py
│ ├── discrete.py
│ ├── box.py
│ └── product.py
├── config_personal_template.py
└── config.py
├── tests
├── __init__.py
├── envs
│ ├── __init__.py
│ └── test_maze_env.py
├── algos
│ ├── __init__.py
│ └── test_trpo.py
├── regression_tests
│ ├── __init__.py
│ └── test_issue_3.py
├── test_networks.py
├── test_stateful_pool.py
├── test_serializable.py
├── test_baselines.py
├── test_sampler.py
├── test_spaces.py
└── test_instrument.py
├── contrib
├── __init__.py
└── alexbeloi
│ ├── __init__.py
│ └── examples
│ ├── __init__.py
│ ├── vpgis_cartpole.py
│ └── trpois_cartpole.py
├── examples
├── __init__.py
├── nop_cartpole.py
├── trpo_cartpole.py
├── point_env.py
├── trpo_cartpole_recurrent.py
├── trpo_cartpole_stub.py
├── vpg_point.py
├── point_env_rand2goal.py
├── ddpg_cartpole_stub.py
├── old
│ └── sens_vpg_point.py
├── point_env_randgoal_oracle.py
├── point_env_randgoal.py
├── icml
│ └── trpo_point.py
├── vpg_swimmer.py
├── trpo_gym.py
├── cluster_demo.py
├── trpo_point.py
├── cluster_gym_mujoco_demo.py
└── trpo_swimmer.py
├── sandbox
├── __init__.py
└── rocky
│ ├── __init__.py
│ └── tf
│ ├── __init__.py
│ ├── algos
│ ├── npg.py
│ ├── __init__.py
│ ├── trpo.py
│ └── sensitive_trpo.py
│ ├── core
│ ├── __init__.py
│ └── layers_powered.py
│ ├── envs
│ ├── __init__.py
│ └── vec_env_executor.py
│ ├── misc
│ └── __init__.py
│ ├── launchers
│ ├── __init__.py
│ ├── vpg_cartpole.py
│ ├── trpo_cartpole_recurrent.py
│ └── trpo_cartpole.py
│ ├── policies
│ ├── __init__.py
│ └── uniform_control_policy.py
│ ├── samplers
│ └── __init__.py
│ ├── distributions
│ ├── __init__.py
│ ├── recurrent_diagonal_gaussian.py
│ ├── base.py
│ └── bernoulli.py
│ ├── optimizers
│ └── __init__.py
│ ├── regressors
│ └── __init__.py
│ └── spaces
│ ├── __init__.py
│ ├── box.py
│ ├── discrete.py
│ └── product.py
├── scripts
├── __init__.py
├── setup_linux.sh
├── submit_gym.py
├── setup_osx.sh
├── sync_s3.py
├── setup_mujoco.sh
├── sim_policy.py
└── resume_training.py
├── icml
├── ant_results.png
├── paths_viz.png
├── point_results.png
├── antdirec_results.png
├── cheetah_results.png
├── maml_paths_viz.png
├── pretrain_paths_viz.png
├── ant_results_logscale.png
├── cheetahdirec_results.png
├── icml_ant_results_maml.pkl
├── icml_ant_results_oracle.pkl
├── icml_ant_results_random.pkl
├── icml_ant_results_maml_bak.pkl
├── icml_ant_results_pretrain.pkl
├── icml_antdirec_results_maml.pkl
├── icml_cheetah_results_maml.pkl
├── icml_antdirec_results_oracle.pkl
├── icml_antdirec_results_random.pkl
├── icml_cheetah_results_oracle.pkl
├── icml_cheetah_results_random.pkl
├── icml_antdirec_results_pretrain.pkl
├── icml_cheetah_results_pretrain.pkl
├── icml_cheetahdirec_results_maml.pkl
├── icml_cheetah_results_pretrain_bak.pkl
├── icml_cheetahdirec_results_oracle.pkl
├── icml_cheetahdirec_results_random.pkl
├── icml_antdirec_results_maml_batch20.pkl
├── icml_cheetahdirec_results_pretrain.pkl
├── icml_point_results_oracle.pkl
├── make_paths_plot.py
├── make_point_plots.py
├── make_antdirec_plots.py
├── make_cheetahdirec_plots.py
├── make_cheetah_plots.py
└── make_ant_plots.py
├── docs
├── user
│ ├── cluster_1.png
│ ├── cluster_2.png
│ ├── cluster_3.png
│ └── installation.rst
└── index.rst
├── setup.py
├── docker
├── tester_Dockerfile
├── Dockerfile
└── gpu_Dockerfile
├── circle.yml
├── vendor
└── mujoco_models
│ ├── red_ball.xml
│ ├── green_ball.xml
│ ├── point.xml
│ └── swimmer.xml
├── .gitignore
├── environment.yml
└── LICENSE
/rllab/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/rllab/misc/meta.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/contrib/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/rllab/algos/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/rllab/core/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/rllab/envs/__init__.py:
--------------------------------------------------------------------------------
1 | 
-------------------------------------------------------------------------------- /rllab/misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/envs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/baselines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/envs/box2d/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/policies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/sampler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sandbox/rocky/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/algos/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /contrib/alexbeloi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/q_functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /contrib/alexbeloi/examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/gather/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /rllab/envs/mujoco/maze/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/npg.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/misc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /rllab/exploration_strategies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/regression_tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/regressors/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /rllab/plotter/__init__.py: -------------------------------------------------------------------------------- 1 | from .plotter import * 2 | -------------------------------------------------------------------------------- /rllab/mujoco_py/.rvmrc: -------------------------------------------------------------------------------- 1 | rvm use 2.1.0@mjpy --create 2 | -------------------------------------------------------------------------------- /rllab/regressors/__init__.py: -------------------------------------------------------------------------------- 1 
| __author__ = 'dementrock' 2 | -------------------------------------------------------------------------------- /rllab/viskit/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dementrock' 2 | -------------------------------------------------------------------------------- /icml/ant_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/ant_results.png -------------------------------------------------------------------------------- /icml/paths_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/paths_viz.png -------------------------------------------------------------------------------- /icml/point_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/point_results.png -------------------------------------------------------------------------------- /docs/user/cluster_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/docs/user/cluster_1.png -------------------------------------------------------------------------------- /docs/user/cluster_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/docs/user/cluster_2.png -------------------------------------------------------------------------------- /docs/user/cluster_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/docs/user/cluster_3.png -------------------------------------------------------------------------------- /icml/antdirec_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/antdirec_results.png -------------------------------------------------------------------------------- /icml/cheetah_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/cheetah_results.png -------------------------------------------------------------------------------- /icml/maml_paths_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/maml_paths_viz.png -------------------------------------------------------------------------------- /icml/pretrain_paths_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/pretrain_paths_viz.png -------------------------------------------------------------------------------- /rllab/envs/box2d/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .xml_box2d import world_from_xml, find_body, find_joint 2 | -------------------------------------------------------------------------------- /rllab/mujoco_py/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'pry' 4 | gem 'activesupport' 5 | -------------------------------------------------------------------------------- /icml/ant_results_logscale.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/ant_results_logscale.png -------------------------------------------------------------------------------- /icml/cheetahdirec_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/cheetahdirec_results.png -------------------------------------------------------------------------------- /icml/icml_ant_results_maml.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_ant_results_maml.pkl -------------------------------------------------------------------------------- /icml/icml_ant_results_oracle.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_ant_results_oracle.pkl -------------------------------------------------------------------------------- /icml/icml_ant_results_random.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_ant_results_random.pkl -------------------------------------------------------------------------------- /icml/icml_ant_results_maml_bak.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_ant_results_maml_bak.pkl -------------------------------------------------------------------------------- /icml/icml_ant_results_pretrain.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_ant_results_pretrain.pkl -------------------------------------------------------------------------------- /icml/icml_antdirec_results_maml.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_antdirec_results_maml.pkl -------------------------------------------------------------------------------- /icml/icml_cheetah_results_maml.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetah_results_maml.pkl -------------------------------------------------------------------------------- /icml/icml_antdirec_results_oracle.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_antdirec_results_oracle.pkl -------------------------------------------------------------------------------- /icml/icml_antdirec_results_random.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_antdirec_results_random.pkl -------------------------------------------------------------------------------- /icml/icml_cheetah_results_oracle.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetah_results_oracle.pkl -------------------------------------------------------------------------------- /icml/icml_cheetah_results_random.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetah_results_random.pkl -------------------------------------------------------------------------------- /icml/icml_antdirec_results_pretrain.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_antdirec_results_pretrain.pkl -------------------------------------------------------------------------------- /icml/icml_cheetah_results_pretrain.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetah_results_pretrain.pkl -------------------------------------------------------------------------------- /icml/icml_cheetahdirec_results_maml.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetahdirec_results_maml.pkl -------------------------------------------------------------------------------- /icml/icml_cheetah_results_pretrain_bak.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetah_results_pretrain_bak.pkl -------------------------------------------------------------------------------- /icml/icml_cheetahdirec_results_oracle.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetahdirec_results_oracle.pkl -------------------------------------------------------------------------------- /icml/icml_cheetahdirec_results_random.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetahdirec_results_random.pkl -------------------------------------------------------------------------------- /icml/icml_antdirec_results_maml_batch20.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_antdirec_results_maml_batch20.pkl -------------------------------------------------------------------------------- /icml/icml_cheetahdirec_results_pretrain.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetahdirec_results_pretrain.pkl -------------------------------------------------------------------------------- /rllab/q_functions/base.py: -------------------------------------------------------------------------------- 1 | from rllab.core.parameterized import Parameterized 2 | 3 | 4 | class QFunction(Parameterized): 5 | pass 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='rllab', 6 | version='0.1.0', 7 | packages=['rllab'], 8 | ) 9 | -------------------------------------------------------------------------------- /rllab/mujoco_py/mjconstants.py: -------------------------------------------------------------------------------- 1 | MOUSE_ROTATE_V = 1 2 | MOUSE_ROTATE_H = 2 3 | MOUSE_MOVE_V = 3 4 | MOUSE_MOVE_H = 4 5 | MOUSE_ZOOM = 5 6 | 7 | mjOBJ_BODY = 1 8 | -------------------------------------------------------------------------------- /rllab/spaces/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .product import Product 2 | from .discrete import Discrete 3 | from .box import Box 4 | 5 | __all__ = ["Product", "Discrete", "Box"] -------------------------------------------------------------------------------- /rllab/algos/base.py: -------------------------------------------------------------------------------- 1 | class Algorithm(object): 2 | pass 3 | 4 | 5 | class RLAlgorithm(Algorithm): 6 | 7 | def train(self): 8 | raise NotImplementedError 9 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/__init__.py: -------------------------------------------------------------------------------- 1 | from .product import Product 2 | from .discrete import Discrete 3 | from .box import Box 4 | 5 | __all__ = ["Product", "Discrete", "Box"] 6 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/humanoid_env.py: -------------------------------------------------------------------------------- 1 | from .simple_humanoid_env import SimpleHumanoidEnv 2 | 3 | 4 | # Taken from Wojciech's code 5 | class HumanoidEnv(SimpleHumanoidEnv): 6 | 7 | FILE = 'humanoid.xml' 8 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/recurrent_diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from sandbox.rocky.tf.distributions.diagonal_gaussian import DiagonalGaussian 5 | 6 | RecurrentDiagonalGaussian = DiagonalGaussian 7 | -------------------------------------------------------------------------------- /rllab/exploration_strategies/base.py: -------------------------------------------------------------------------------- 1 | class ExplorationStrategy(object): 2 | def get_action(self, t, observation, policy, **kwargs): 3 | raise NotImplementedError 4 | 5 | def reset(self): 6 | pass 7 | -------------------------------------------------------------------------------- /docker/tester_Dockerfile: -------------------------------------------------------------------------------- 1 | FROM neocxi/rllab_exp_gpu_tf:py3 2 | 3 | RUN bash -c 'source activate rllab3 && conda install -y nomkl && conda uninstall -y scipy && conda install -y scipy' 4 | 5 | ADD . 
/root/code/rllab 6 | WORKDIR /root/code/rllab 7 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/gather/point_gather_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.mujoco.gather.gather_env import GatherEnv 2 | from rllab.envs.mujoco.point_env import PointEnv 3 | 4 | 5 | class PointGatherEnv(GatherEnv): 6 | 7 | MODEL_CLASS = PointEnv 8 | ORI_IND = 2 9 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/gather/swimmer_gather_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.mujoco.gather.gather_env import GatherEnv 2 | from rllab.envs.mujoco.swimmer_env import SwimmerEnv 3 | 4 | 5 | class SwimmerGatherEnv(GatherEnv): 6 | 7 | MODEL_CLASS = SwimmerEnv 8 | ORI_IND = 2 9 | -------------------------------------------------------------------------------- /rllab/distributions/recurrent_diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as TT 2 | import numpy as np 3 | from rllab.distributions.base import Distribution 4 | from rllab.distributions.diagonal_gaussian import DiagonalGaussian 5 | 6 | RecurrentDiagonalGaussian = DiagonalGaussian 7 | -------------------------------------------------------------------------------- /rllab/mujoco_py/mjextra.py: -------------------------------------------------------------------------------- 1 | def append_objects(cur, extra): 2 | for i in range(cur.ngeom, cur.ngeom + extra.ngeom): 3 | cur.geoms[i] = extra.geoms[i - cur.ngeom] 4 | cur.ngeom = cur.ngeom + extra.ngeom 5 | if cur.ngeom > cur.maxgeom: 6 | raise ValueError("buffer limit exceeded!") 7 | -------------------------------------------------------------------------------- /rllab/mujoco_py/__init__.py: -------------------------------------------------------------------------------- 1 | from .mjviewer import MjViewer 2 | from .mjcore import MjModel 3 | from .mjcore import register_license 4 | import os 5 | from .mjconstants import * 6 | 7 | register_license(os.path.join(os.path.dirname(__file__), 8 | '../../vendor/mujoco/mjkey.txt')) 9 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/box.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.spaces.box import Box as TheanoBox 5 | import tensorflow as tf 6 | 7 | 8 | class Box(TheanoBox): 9 | def new_tensor_variable(self, name, extra_dims): 10 | return tf.placeholder(tf.float32, shape=[None] * extra_dims + [self.flat_dim], name=name) 11 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/maze/point_maze_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.mujoco.maze.maze_env import MazeEnv 2 | from rllab.envs.mujoco.point_env import PointEnv 3 | 4 | 5 | class PointMazeEnv(MazeEnv): 6 | 7 | MODEL_CLASS = PointEnv 8 | ORI_IND = 2 9 | 10 | MAZE_HEIGHT = 2 11 | MAZE_SIZE_SCALING = 3.0 12 | 13 | MANUAL_COLLISION = True 14 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/maze/swimmer_maze_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.mujoco.maze.maze_env import MazeEnv 2 | from rllab.envs.mujoco.swimmer_env import SwimmerEnv 3 | 4 | 5 | 
class SwimmerMazeEnv(MazeEnv): 6 | 7 | MODEL_CLASS = SwimmerEnv 8 | ORI_IND = 2 9 | 10 | MAZE_HEIGHT = 0.5 11 | MAZE_SIZE_SCALING = 4 12 | MAZE_MAKE_CONTACTS = True 13 | 14 | -------------------------------------------------------------------------------- /circle.yml: -------------------------------------------------------------------------------- 1 | machine: 2 | services: 3 | - docker 4 | 5 | dependencies: 6 | cache_directories: 7 | - "~/docker" 8 | override: 9 | - docker info 10 | - if [[ -e ~/docker/image.tar ]]; then docker load -i ~/docker/image.tar; fi 11 | - docker build -t tester -f docker/tester_Dockerfile . 12 | - mkdir -p ~/docker; docker save tester > ~/docker/image.tar 13 | 14 | test: 15 | override: 16 | - docker run tester /bin/bash -li -c "CIRCLECI=true nose2" 17 | -------------------------------------------------------------------------------- /tests/envs/test_maze_env.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from rllab.envs.mujoco.maze.maze_env_utils import line_intersect, ray_segment_intersect 4 | 5 | 6 | def test_line_intersect(): 7 | assert line_intersect((0, 0), (0, 1), (0, 0), (1, 0))[:2] == (0, 0) 8 | assert line_intersect((0, 0), (0, 1), (0, 0), (0, 1))[2] == 0 9 | assert ray_segment_intersect(ray=((0, 0), 0), segment=((1, -1), (1, 1))) == (1, 0) 10 | assert ray_segment_intersect(ray=((0, 0), math.pi), segment=((1, -1), (1, 1))) is None 11 | -------------------------------------------------------------------------------- /rllab/mujoco_py/gen_binding.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | parent_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P ) 3 | mujoco_path=$parent_path/../../vendor/mujoco 4 | rm /tmp/code_gen_mujoco.h 5 | cat $mujoco_path/mjdata.h >> /tmp/code_gen_mujoco.h && \ 6 | cat $mujoco_path/mjmodel.h >> /tmp/code_gen_mujoco.h && \ 7 | cat $mujoco_path/mjrender.h >> /tmp/code_gen_mujoco.h && \ 8 | cat $mujoco_path/mjvisualize.h >> /tmp/code_gen_mujoco.h && \ 9 | ruby $parent_path/codegen.rb /tmp/code_gen_mujoco.h $mujoco_path/mjxmacro.h > $parent_path/mjtypes.py 10 | -------------------------------------------------------------------------------- /tests/test_networks.py: -------------------------------------------------------------------------------- 1 | def test_gru_network(): 2 | from rllab.core.network import GRUNetwork 3 | import lasagne.layers as L 4 | from rllab.misc import ext 5 | import numpy as np 6 | network = GRUNetwork( 7 | input_shape=(2, 3), 8 | output_dim=5, 9 | hidden_dim=4, 10 | ) 11 | f_output = ext.compile_function( 12 | inputs=[network.input_layer.input_var], 13 | outputs=L.get_output(network.output_layer) 14 | ) 15 | assert f_output(np.zeros((6, 8, 2, 3))).shape == (6, 8, 5) 16 | -------------------------------------------------------------------------------- /vendor/mujoco_models/red_ball.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | *.pyc 3 | *-checkpoint.ipynb 4 | .DS_Store 5 | *.h5 6 | *.log 7 | *.npz 8 | secrets.py 9 | *.avi 10 | *.mp4 11 | build 12 | build_linux 13 | .idea 14 | .sublime-project 15 | run_experiment.sh 16 | scratch-notebooks 17 | launch_scripts 18 | *.sh.e* 19 | *.sh.o* 20 | MUJOCO_LOG.TXT 21 | vendor/mujoco 22 | 
.project 23 | .pydevproject 24 | *.pdf 25 | .env 26 | snippets 27 | private 28 | lua 29 | iterate.dat 30 | .env 31 | src/ 32 | .settings 33 | .pods 34 | docs/_build 35 | blackbox.zip 36 | blackbox 37 | rllab/config_personal.py 38 | *.swp 39 | -------------------------------------------------------------------------------- /vendor/mujoco_models/green_ball.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /rllab/baselines/zero_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rllab.baselines.base import Baseline 3 | from rllab.misc.overrides import overrides 4 | 5 | 6 | class ZeroBaseline(Baseline): 7 | 8 | def __init__(self, env_spec): 9 | pass 10 | 11 | @overrides 12 | def get_param_values(self, **kwargs): 13 | return None 14 | 15 | @overrides 16 | def set_param_values(self, val, **kwargs): 17 | pass 18 | 19 | @overrides 20 | def fit(self, paths, **kwargs): 21 | pass 22 | 23 | @overrides 24 | def predict(self, path): 25 | return np.zeros_like(path["rewards"]) 26 | -------------------------------------------------------------------------------- /rllab/algos/nop.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.batch_polopt import BatchPolopt 2 | from rllab.misc.overrides import overrides 3 | 4 | 5 | class NOP(BatchPolopt): 6 | """ 7 | NOP (no optimization performed) policy search algorithm 8 | """ 9 | 10 | def __init__( 11 | self, 12 | **kwargs): 13 | super(NOP, self).__init__(**kwargs) 14 | 15 | @overrides 16 | def init_opt(self): 17 | pass 18 | 19 | @overrides 20 | def optimize_policy(self, itr, samples_data): 21 | pass 22 | 23 | @overrides 24 | def get_itr_snapshot(self, itr, samples_data): 25 | return dict() 26 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/gather/ant_gather_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.mujoco.gather.gather_env import GatherEnv 2 | from rllab.envs.mujoco.ant_env import AntEnv 3 | from rllab.envs.mujoco.mujoco_env import q_mult, q_inv 4 | import math 5 | 6 | class AntGatherEnv(GatherEnv): 7 | 8 | MODEL_CLASS = AntEnv 9 | ORI_IND = 3 10 | 11 | def get_ori(self): 12 | ori = [0, 1, 0, 0] 13 | rot = self.inner_env.model.data.qpos[self.__class__.ORI_IND:self.__class__.ORI_IND+4] # take the quaternion 14 | ori = q_mult(q_mult(rot,ori),q_inv(rot))[1:3] # project onto x-y plane 15 | ori = math.atan2(ori[1],ori[0]) 16 | return ori 17 | -------------------------------------------------------------------------------- /rllab/misc/mako_utils.py: -------------------------------------------------------------------------------- 1 | 2 | def compute_rect_vertices(fromp, to, radius): 3 | x1, y1 = fromp 4 | x2, y2 = to 5 | if abs(y1 - y2) < 1e-6: 6 | dx = 0 7 | dy = radius 8 | else: 9 | dx = radius * 1.0 / (((x1 - x2) / (y1 - y2)) ** 2 + 1) ** 0.5 10 | # equivalently dx = radius * (y2-y1).to_f / ((x2-x1)**2 + (y2-y1)**2)**0.5 11 | dy = (radius**2 - dx**2) ** 0.5 12 | dy *= -1 if (x1 - x2) * (y1 - y2) > 0 else 1 13 | 14 | return ";".join([",".join(map(str, r)) for r in [ 15 | [x1 + dx, y1 + dy], 16 | [x2 + dx, y2 + dy], 17 | [x2 - dx, y2 - dy], 18 | [x1 - dx, y1 - dy], 19 | ]]) 20 | 21 | -------------------------------------------------------------------------------- 
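
Usage sketch (an editorial illustration, not a file from this repository): compute_rect_vertices in rllab/misc/mako_utils.py offsets a line segment perpendicularly by the given radius on both sides and returns the four rectangle corners as an "x,y;x,y;x,y;x,y" string, apparently the vertex format consumed by the Box2D .xml.mako model templates. A minimal call, with illustrative values only:

    from rllab.misc.mako_utils import compute_rect_vertices

    # Rectangle around the segment (0, 0) -> (1, 0) with half-width 0.05;
    # prints "0,0.05;1,0.05;1,-0.05;0,-0.05".
    print(compute_rect_vertices((0, 0), (1, 0), 0.05))
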
/rllab/mujoco_py/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | activesupport (4.1.8) 5 | i18n (~> 0.6, >= 0.6.9) 6 | json (~> 1.7, >= 1.7.7) 7 | minitest (~> 5.1) 8 | thread_safe (~> 0.1) 9 | tzinfo (~> 1.1) 10 | coderay (1.1.0) 11 | i18n (0.7.0) 12 | json (1.8.1) 13 | method_source (0.8.2) 14 | minitest (5.5.1) 15 | pry (0.10.1) 16 | coderay (~> 1.1.0) 17 | method_source (~> 0.8.1) 18 | slop (~> 3.4) 19 | slop (3.6.0) 20 | thread_safe (0.3.4) 21 | tzinfo (1.2.2) 22 | thread_safe (~> 0.1) 23 | 24 | PLATFORMS 25 | ruby 26 | 27 | DEPENDENCIES 28 | activesupport 29 | pry 30 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/trpo.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from sandbox.rocky.tf.algos.npo import NPO 4 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 5 | 6 | 7 | class TRPO(NPO): 8 | """ 9 | Trust Region Policy Optimization 10 | """ 11 | 12 | def __init__( 13 | self, 14 | optimizer=None, 15 | optimizer_args=None, 16 | **kwargs): 17 | if optimizer is None: 18 | if optimizer_args is None: 19 | optimizer_args = dict() 20 | optimizer = ConjugateGradientOptimizer(**optimizer_args) 21 | super(TRPO, self).__init__(optimizer=optimizer, **kwargs) 22 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/layers_powered.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.core.parameterized import Parameterized 2 | import sandbox.rocky.tf.core.layers as L 3 | import itertools 4 | 5 | 6 | class LayersPowered(Parameterized): 7 | 8 | def __init__(self, output_layers, input_layers=None): 9 | self._output_layers = output_layers 10 | self._input_layers = input_layers 11 | Parameterized.__init__(self) 12 | 13 | def get_params_internal(self, **tags): 14 | layers = L.get_all_layers(self._output_layers, treat_as_input=self._input_layers) 15 | params = itertools.chain.from_iterable(l.get_params(**tags) for l in layers) 16 | return L.unique(params) 17 | 18 | -------------------------------------------------------------------------------- /rllab/algos/trpo.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.npo import NPO 2 | from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 3 | from rllab.core.serializable import Serializable 4 | 5 | 6 | class TRPO(NPO): 7 | """ 8 | Trust Region Policy Optimization 9 | """ 10 | 11 | def __init__( 12 | self, 13 | optimizer=None, 14 | optimizer_args=None, 15 | **kwargs): 16 | if optimizer is None: 17 | if optimizer_args is None: 18 | optimizer_args = dict() 19 | optimizer = ConjugateGradientOptimizer(**optimizer_args) 20 | super(TRPO, self).__init__(optimizer=optimizer, **kwargs) 21 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/maze/ant_maze_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.mujoco.maze.maze_env import MazeEnv 2 | from rllab.envs.mujoco.ant_env import AntEnv 3 | from rllab.envs.mujoco.mujoco_env import q_mult, q_inv 4 | import math 5 | 6 | 7 | class AntMazeEnv(MazeEnv): 8 | 9 | MODEL_CLASS = AntEnv 10 | ORI_IND = 3 11 | 12 | MAZE_HEIGHT = 2 13 | MAZE_SIZE_SCALING = 3.0 14 | 15 | def 
get_ori(self): 16 | ori = [0, 1, 0, 0] 17 | rot = self.wrapped_env.model.data.qpos[self.__class__.ORI_IND:self.__class__.ORI_IND+4] # take the quaternion 18 | ori = q_mult(q_mult(rot,ori),q_inv(rot))[1:3] # project onto x-y plane 19 | ori = math.atan2(ori[1],ori[0]) 20 | return ori 21 | -------------------------------------------------------------------------------- /tests/test_stateful_pool.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | def _worker_collect_once(_): 6 | return 'a', 1 7 | 8 | 9 | def test_stateful_pool(): 10 | from rllab.sampler import stateful_pool 11 | stateful_pool.singleton_pool.initialize(n_parallel=3) 12 | results = stateful_pool.singleton_pool.run_collect(_worker_collect_once, 3, show_prog_bar=False) 13 | assert tuple(results) == ('a', 'a', 'a') 14 | 15 | 16 | def test_stateful_pool_over_capacity(): 17 | from rllab.sampler import stateful_pool 18 | stateful_pool.singleton_pool.initialize(n_parallel=4) 19 | results = stateful_pool.singleton_pool.run_collect(_worker_collect_once, 3, show_prog_bar=False) 20 | assert len(results) >= 3 21 | -------------------------------------------------------------------------------- /rllab/envs/env_spec.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | from rllab.spaces.base import Space 3 | 4 | 5 | class EnvSpec(Serializable): 6 | 7 | def __init__( 8 | self, 9 | observation_space, 10 | action_space): 11 | """ 12 | :type observation_space: Space 13 | :type action_space: Space 14 | """ 15 | Serializable.quick_init(self, locals()) 16 | self._observation_space = observation_space 17 | self._action_space = action_space 18 | 19 | @property 20 | def observation_space(self): 21 | return self._observation_space 22 | 23 | @property 24 | def action_space(self): 25 | return self._action_space 26 | -------------------------------------------------------------------------------- /rllab/algos/ppo.py: -------------------------------------------------------------------------------- 1 | from rllab.optimizers.penalty_lbfgs_optimizer import PenaltyLbfgsOptimizer 2 | from rllab.algos.npo import NPO 3 | from rllab.core.serializable import Serializable 4 | 5 | 6 | class PPO(NPO, Serializable): 7 | """ 8 | Penalized Policy Optimization. 
9 | """ 10 | 11 | def __init__( 12 | self, 13 | optimizer=None, 14 | optimizer_args=None, 15 | **kwargs): 16 | Serializable.quick_init(self, locals()) 17 | if optimizer is None: 18 | if optimizer_args is None: 19 | optimizer_args = dict() 20 | optimizer = PenaltyLbfgsOptimizer(**optimizer_args) 21 | super(PPO, self).__init__(optimizer=optimizer, **kwargs) 22 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/sensitive_trpo.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from sandbox.rocky.tf.algos.sensitive_npo import SensitiveNPO 4 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 5 | 6 | 7 | class SensitiveTRPO(SensitiveNPO): 8 | """ 9 | Trust Region Policy Optimization 10 | """ 11 | 12 | def __init__( 13 | self, 14 | optimizer=None, 15 | optimizer_args=None, 16 | **kwargs): 17 | if optimizer is None: 18 | if optimizer_args is None: 19 | optimizer_args = dict() 20 | optimizer = ConjugateGradientOptimizer(**optimizer_args) 21 | super(SensitiveTRPO, self).__init__(optimizer=optimizer, **kwargs) 22 | -------------------------------------------------------------------------------- /rllab/core/lasagne_powered.py: -------------------------------------------------------------------------------- 1 | from rllab.core.parameterized import Parameterized 2 | from rllab.misc.overrides import overrides 3 | import lasagne.layers as L 4 | 5 | 6 | class LasagnePowered(Parameterized): 7 | def __init__(self, output_layers): 8 | self._output_layers = output_layers 9 | super(LasagnePowered, self).__init__() 10 | 11 | @property 12 | def output_layers(self): 13 | return self._output_layers 14 | 15 | @overrides 16 | def get_params_internal(self, **tags): # this gives ALL the vars (not the params values) 17 | return L.get_all_params( # this lasagne function also returns all var below the passed layers 18 | L.concat(self._output_layers), 19 | **tags 20 | ) 21 | -------------------------------------------------------------------------------- /rllab/envs/box2d/models/mountain_car.xml.mako: -------------------------------------------------------------------------------- 1 | <% 2 | noise = opts.get("noise", False) 3 | track_width = 4 4 | if noise: 5 | import numpy as np 6 | track_width += np.random.uniform(-1, 1) 7 | %> 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /examples/nop_cartpole.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.nop import NOP 2 | from rllab.baselines.zero_baseline import ZeroBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from rllab.policies.uniform_control_policy import UniformControlPolicy 6 | 7 | env = normalize(CartpoleEnv()) 8 | 9 | policy = UniformControlPolicy( 10 | env_spec=env.spec, 11 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
12 | ) 13 | 14 | baseline = ZeroBaseline(env_spec=env.spec) 15 | 16 | algo = NOP( 17 | env=env, 18 | policy=policy, 19 | baseline=baseline, 20 | batch_size=4000, 21 | max_path_length=100, 22 | n_itr=40, 23 | discount=0.99, 24 | step_size=0.01, 25 | ) 26 | algo.train() 27 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/uniform_control_policy.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.policies.base import Policy 2 | from rllab.core.serializable import Serializable 3 | 4 | 5 | class UniformControlPolicy(Policy, Serializable): 6 | def __init__( 7 | self, 8 | env_spec, 9 | ): 10 | Serializable.quick_init(self, locals()) 11 | super(UniformControlPolicy, self).__init__(env_spec=env_spec) 12 | 13 | @property 14 | def vectorized(self): 15 | return True 16 | 17 | def get_action(self, observation): 18 | return self.action_space.sample(), dict() 19 | 20 | def get_actions(self, observations): 21 | return self.action_space.sample_n(len(observations)), dict() 22 | 23 | def get_params_internal(self, **tags): 24 | return [] 25 | -------------------------------------------------------------------------------- /rllab/algos/tnpg.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.npo import NPO 2 | from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 3 | from rllab.misc import ext 4 | 5 | 6 | class TNPG(NPO): 7 | """ 8 | Truncated Natural Policy Gradient. 9 | """ 10 | 11 | def __init__( 12 | self, 13 | optimizer=None, 14 | optimizer_args=None, 15 | **kwargs): 16 | if optimizer is None: 17 | default_args = dict(max_backtracks=1) 18 | if optimizer_args is None: 19 | optimizer_args = default_args 20 | else: 21 | optimizer_args = dict(default_args, **optimizer_args) 22 | optimizer = ConjugateGradientOptimizer(**optimizer_args) 23 | super(TNPG, self).__init__(optimizer=optimizer, **kwargs) 24 | -------------------------------------------------------------------------------- /examples/trpo_cartpole.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.trpo import TRPO 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 6 | 7 | env = normalize(CartpoleEnv()) 8 | 9 | policy = GaussianMLPPolicy( 10 | env_spec=env.spec, 11 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
12 | hidden_sizes=(32, 32) 13 | ) 14 | 15 | baseline = LinearFeatureBaseline(env_spec=env.spec) 16 | 17 | algo = TRPO( 18 | env=env, 19 | policy=policy, 20 | baseline=baseline, 21 | batch_size=4000, 22 | max_path_length=100, 23 | n_itr=40, 24 | discount=0.99, 25 | step_size=0.01, 26 | ) 27 | algo.train() 28 | -------------------------------------------------------------------------------- /tests/test_serializable.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rllab.core.serializable import Serializable 4 | from sandbox.rocky.tf.core.parameterized import Parameterized, suppress_params_loading 5 | 6 | 7 | class Simple(Parameterized, Serializable): 8 | def __init__(self, name): 9 | Serializable.quick_init(self, locals()) 10 | with tf.variable_scope(name): 11 | self.w = tf.get_variable("w", [10, 10]) 12 | 13 | def get_params_internal(self, **tags): 14 | return [self.w] 15 | 16 | 17 | def test_serializable(): 18 | with suppress_params_loading(): 19 | obj = Simple(name="obj") 20 | obj1 = Serializable.clone(obj, name="obj1") 21 | assert obj.w.name.startswith('obj/') 22 | assert obj1.w.name.startswith('obj1/') 23 | 24 | 25 | if __name__ == "__main__": 26 | test_serializable() 27 | -------------------------------------------------------------------------------- /scripts/setup_linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Make sure that conda is available 3 | 4 | hash conda 2>/dev/null || { 5 | echo "Please install anaconda before continuing. You can download it at https://www.continuum.io/downloads. Please use the Python 2.7 installer." 6 | exit 0 7 | } 8 | 9 | echo "Installing system dependencies" 10 | echo "You will probably be asked for your sudo password." 11 | sudo apt-get update 12 | sudo apt-get install -y python-pip python-dev swig cmake build-essential 13 | sudo apt-get build-dep -y python-pygame 14 | sudo apt-get build-dep -y python-scipy 15 | 16 | # Make sure that we're under the directory of the project 17 | cd "$(dirname "$0")/.." 18 | 19 | echo "Creating conda environment..." 20 | conda env create -f environment.yml 21 | conda env update 22 | 23 | echo "Conda environment created! Make sure to run \`source activate rllab3\` whenever you open a new terminal and want to run programs under rllab." 24 | -------------------------------------------------------------------------------- /icml/icml_point_results_oracle.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'task_avg_returns' 3 | p1 4 | (lp2 5 | (lp3 6 | F-1.97272875745 7 | aF-2.34575318154 8 | aF-2.32250937247 9 | aF-2.08331732345 10 | aF-1.48506580113 11 | aF-1.82948806173 12 | aF-2.9593656989 13 | aF-2.95927984391 14 | aF-1.62553040865 15 | aF-2.24929753541 16 | aF-2.60588825185 17 | aF-1.87460507218 18 | aF-2.16624064843 19 | aF-2.92060743576 20 | aF-3.14571198472 21 | aF-2.19234063112 22 | aF-2.53227361567 23 | aF-1.53461938736 24 | aF-2.10279844877 25 | aF-3.24244584946 26 | aF-2.76242677435 27 | aF-2.37053743068 28 | aF-2.17942891458 29 | aF-2.06120806957 30 | aF-2.33617484501 31 | aF-3.09065130611 32 | aF-2.19778531831 33 | aF-2.64369463483 34 | aF-2.08031379918 35 | aF-1.84096218 36 | aF-2.13910248301 37 | aF-1.48007332088 38 | aF-2.59933037373 39 | aF-1.67871774468 40 | aF-2.41717877275 41 | aF-2.17676728609 42 | aF-2.92057994794 43 | aF-2.21554034345 44 | aF-2.39851887364 45 | aF-1.7900390061 46 | aas. 
-------------------------------------------------------------------------------- /rllab/envs/identification_env.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | from rllab.envs.proxy_env import ProxyEnv 3 | from rllab.misc.overrides import overrides 4 | 5 | 6 | class IdentificationEnv(ProxyEnv, Serializable): 7 | 8 | def __init__(self, mdp_cls, mdp_args): 9 | Serializable.quick_init(self, locals()) 10 | self.mdp_cls = mdp_cls 11 | self.mdp_args = dict(mdp_args) 12 | self.mdp_args["template_args"] = dict(noise=True) 13 | mdp = self.gen_mdp() 14 | super(IdentificationEnv, self).__init__(mdp) 15 | 16 | def gen_mdp(self): 17 | return self.mdp_cls(**self.mdp_args) 18 | 19 | @overrides 20 | def reset(self): 21 | if getattr(self, "_mdp", None): 22 | if hasattr(self._wrapped_env, "release"): 23 | self._wrapped_env.release() 24 | self._wrapped_env = self.gen_mdp() 25 | return super(IdentificationEnv, self).reset() 26 | 27 | -------------------------------------------------------------------------------- /rllab/baselines/base.py: -------------------------------------------------------------------------------- 1 | from rllab.misc import autoargs 2 | 3 | 4 | class Baseline(object): 5 | 6 | def __init__(self, env_spec): 7 | self._mdp_spec = env_spec 8 | 9 | @property 10 | def algorithm_parallelized(self): 11 | return False 12 | 13 | def get_param_values(self): 14 | raise NotImplementedError 15 | 16 | def set_param_values(self, val): 17 | raise NotImplementedError 18 | 19 | def fit(self, paths): 20 | raise NotImplementedError 21 | 22 | def predict(self, path): 23 | raise NotImplementedError 24 | 25 | @classmethod 26 | @autoargs.add_args 27 | def add_args(cls, parser): 28 | pass 29 | 30 | @classmethod 31 | @autoargs.new_from_args 32 | def new_from_args(cls, args, mdp): 33 | pass 34 | 35 | def log_diagnostics(self, paths): 36 | """ 37 | Log extra information per iteration based on the collected paths 38 | """ 39 | pass 40 | -------------------------------------------------------------------------------- /tests/test_baselines.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['THEANO_FLAGS'] = 'mode=FAST_COMPILE,optimizer=None' 4 | 5 | from rllab.algos.vpg import VPG 6 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 7 | from rllab.baselines.zero_baseline import ZeroBaseline 8 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 9 | from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline 10 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 11 | from nose2 import tools 12 | 13 | 14 | baselines = [ZeroBaseline, LinearFeatureBaseline, GaussianMLPBaseline] 15 | 16 | 17 | @tools.params(*baselines) 18 | def test_baseline(baseline_cls): 19 | env = CartpoleEnv() 20 | policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(6,)) 21 | baseline = baseline_cls(env_spec=env.spec) 22 | algo = VPG( 23 | env=env, policy=policy, baseline=baseline, 24 | n_itr=1, batch_size=1000, max_path_length=100 25 | ) 26 | algo.train() 27 | -------------------------------------------------------------------------------- /examples/point_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Env 2 | from rllab.spaces import Box 3 | from rllab.envs.base import Step 4 | import numpy as np 5 | 6 | 7 | class PointEnv(Env): 8 | @property 9 | def 
observation_space(self): 10 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 11 | 12 | @property 13 | def action_space(self): 14 | return Box(low=-0.1, high=0.1, shape=(2,)) 15 | 16 | def reset(self, **kwargs): 17 | self._state = np.random.uniform(-1, 1, size=(2,)) 18 | observation = np.copy(self._state) 19 | return observation 20 | 21 | def step(self, action): 22 | self._state = self._state + action 23 | x, y = self._state 24 | reward = - (x ** 2 + y ** 2) ** 0.5 25 | done = abs(x) < 0.01 and abs(y) < 0.01 26 | next_observation = np.copy(self._state) 27 | return Step(observation=next_observation, reward=reward, done=done) 28 | 29 | def render(self): 30 | print('current state:', self._state) 31 | -------------------------------------------------------------------------------- /tests/regression_tests/test_issue_3.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from nose2.tools import such 5 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 6 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | from rllab.algos.trpo import TRPO 8 | from rllab.baselines.zero_baseline import ZeroBaseline 9 | 10 | with such.A("Issue #3") as it: 11 | @it.should("be fixed") 12 | def test_issue_3(): 13 | """ 14 | As reported in https://github.com/rllab/rllab/issues/3, the adaptive_std parameter was not functioning properly 15 | """ 16 | env = CartpoleEnv() 17 | policy = GaussianMLPPolicy( 18 | env_spec=env, 19 | adaptive_std=True 20 | ) 21 | baseline = ZeroBaseline(env_spec=env.spec) 22 | algo = TRPO( 23 | env=env, 24 | policy=policy, 25 | baseline=baseline, 26 | batch_size=100, 27 | n_itr=1 28 | ) 29 | algo.train() 30 | 31 | it.createTests(globals()) 32 | -------------------------------------------------------------------------------- /tests/test_sampler.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | 5 | 6 | def test_truncate_paths(): 7 | from rllab.sampler.parallel_sampler import truncate_paths 8 | 9 | paths = [ 10 | dict( 11 | observations=np.zeros((100, 1)), 12 | actions=np.zeros((100, 1)), 13 | rewards=np.zeros(100), 14 | env_infos=dict(), 15 | agent_infos=dict(lala=np.zeros(100)), 16 | ), 17 | dict( 18 | observations=np.zeros((50, 1)), 19 | actions=np.zeros((50, 1)), 20 | rewards=np.zeros(50), 21 | env_infos=dict(), 22 | agent_infos=dict(lala=np.zeros(50)), 23 | ), 24 | ] 25 | 26 | truncated = truncate_paths(paths, 130) 27 | assert len(truncated) == 2 28 | assert len(truncated[-1]["observations"]) == 30 29 | assert len(truncated[0]["observations"]) == 100 30 | # make sure not to change the original one 31 | assert len(paths) == 2 32 | assert len(paths[-1]["observations"]) == 50 33 | -------------------------------------------------------------------------------- /rllab/distributions/delta.py: -------------------------------------------------------------------------------- 1 | from rllab.distributions.base import Distribution 2 | 3 | class Delta(Distribution): 4 | @property 5 | def dim(self): 6 | return 0 7 | 8 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 9 | return None 10 | 11 | def kl(self, old_dist_info, new_dist_info): 12 | return None 13 | 14 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 15 | raise NotImplementedError 16 | 17 | def entropy(self, dist_info): 18 | raise NotImplementedError 19 | 20 | def log_likelihood_sym(self, x_var, dist_info_vars): 21 | raise NotImplementedError 22 | 23 | def 
likelihood_sym(self, x_var, dist_info_vars): 24 | return TT.exp(self.log_likelihood_sym(x_var, dist_info_vars)) 25 | 26 | def log_likelihood(self, xs, dist_info): 27 | return None 28 | 29 | @property 30 | def dist_info_keys(self): 31 | return None 32 | 33 | def entropy(self, dist_info): 34 | return 0 35 | -------------------------------------------------------------------------------- /scripts/submit_gym.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import argparse 4 | import os 5 | import os.path as osp 6 | import gym 7 | from rllab.viskit.core import load_params 8 | 9 | if __name__ == "__main__": 10 | # rl_gym.api_key = 'g8JOpnNVmcjMShBiFtyji2VWX3P2uCzc' 11 | if 'OPENAI_GYM_API_KEY' not in os.environ: 12 | raise ValueError("OpenAI Gym API key not configured. Please register an account on https://gym.openai.com and" 13 | " set the OPENAI_GYM_API_KEY environment variable, and try the script again.") 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('log_dir', type=str, 17 | help='path to the logging directory') 18 | parser.add_argument('--algorithm_id', type=str, default=None, help='Algorithm ID') 19 | args = parser.parse_args() 20 | snapshot_dir = osp.abspath(osp.join(args.log_dir, "..")) 21 | params_file_path = osp.join(snapshot_dir, "params.json") 22 | params_json = load_params(params_file_path) 23 | gym.upload(args.log_dir, algorithm_id=args.algorithm_id) 24 | -------------------------------------------------------------------------------- /scripts/setup_osx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Make sure that homebrew is available 3 | hash brew 2>/dev/null || { 4 | echo "Please install homebrew before continuing. You can use the following command to install:" 5 | echo "/usr/bin/ruby -e \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)\"" 6 | exit 0 7 | } 8 | 9 | hash conda 2>/dev/null || { 10 | echo "Please install anaconda before continuing. You can download it at https://www.continuum.io/downloads. Please use the Python 2.7 installer." 11 | exit 0 12 | } 13 | 14 | 15 | echo "Installing system dependencies" 16 | echo "You will probably be asked for your sudo password." 17 | 18 | brew install swig sdl sdl_image sdl_mixer sdl_ttf portmidi 19 | 20 | # Make sure that we're under the directory of the project 21 | cd "$(dirname "$0")/.." 22 | echo "Creating conda environment..." 23 | conda env create -f environment.yml 24 | conda env update 25 | 26 | echo "Conda environment created! Make sure to run \`source activate rllab3\` whenever you open a new terminal and want to run programs under rllab." 
27 | -------------------------------------------------------------------------------- /examples/trpo_cartpole_recurrent.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.algos.trpo import TRPO 5 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 6 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 7 | from rllab.envs.normalized_env import normalize 8 | from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy 9 | from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp 10 | from rllab.misc.instrument import stub, run_experiment_lite 11 | 12 | stub(globals()) 13 | 14 | env = normalize(CartpoleEnv()) 15 | 16 | policy = GaussianGRUPolicy( 17 | env_spec=env.spec, 18 | ) 19 | 20 | baseline = LinearFeatureBaseline(env_spec=env.spec) 21 | 22 | algo = TRPO( 23 | env=env, 24 | policy=policy, 25 | baseline=baseline, 26 | batch_size=4000, 27 | max_path_length=100, 28 | n_itr=10, 29 | discount=0.99, 30 | step_size=0.01, 31 | optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) 32 | ) 33 | run_experiment_lite( 34 | algo.train(), 35 | n_parallel=1, 36 | seed=1, 37 | ) 38 | -------------------------------------------------------------------------------- /tests/test_spaces.py: -------------------------------------------------------------------------------- 1 | 2 | from rllab.spaces import Product, Discrete, Box 3 | import numpy as np 4 | 5 | 6 | def test_product_space(): 7 | _ = Product([Discrete(3), Discrete(2)]) 8 | product_space = Product(Discrete(3), Discrete(2)) 9 | sample = product_space.sample() 10 | assert product_space.contains(sample) 11 | 12 | 13 | def test_product_space_unflatten_n(): 14 | space = Product([Discrete(3), Discrete(3)]) 15 | np.testing.assert_array_equal(space.flatten((2, 2)), space.flatten_n([(2, 2)])[0]) 16 | np.testing.assert_array_equal( 17 | space.unflatten(space.flatten((2, 2))), 18 | space.unflatten_n(space.flatten_n([(2, 2)]))[0] 19 | ) 20 | 21 | 22 | def test_box(): 23 | space = Box(low=-1, high=1, shape=(2, 2)) 24 | np.testing.assert_array_equal(space.flatten([[1, 2], [3, 4]]), [1, 2, 3, 4]) 25 | np.testing.assert_array_equal(space.flatten_n([[[1, 2], [3, 4]]]), [[1, 2, 3, 4]]) 26 | np.testing.assert_array_equal(space.unflatten([1, 2, 3, 4]), [[1, 2], [3, 4]]) 27 | np.testing.assert_array_equal(space.unflatten_n([[1, 2, 3, 4]]), [[[1, 2], [3, 4]]]) 28 | -------------------------------------------------------------------------------- /contrib/alexbeloi/examples/vpgis_cartpole.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.vpg import VPG 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 6 | from contrib.alexbeloi.is_sampler import ISSampler 7 | 8 | """ 9 | Example using VPG with ISSampler, iterations alternate between live and 10 | importance sampled iterations. 11 | """ 12 | 13 | env = normalize(CartpoleEnv()) 14 | 15 | policy = GaussianMLPPolicy( 16 | env_spec=env.spec, 17 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
18 | hidden_sizes=(32, 32) 19 | ) 20 | 21 | baseline = LinearFeatureBaseline(env_spec=env.spec) 22 | 23 | algo = VPG( 24 | env=env, 25 | policy=policy, 26 | baseline=baseline, 27 | batch_size=4000, 28 | max_path_length=100, 29 | n_itr=40, 30 | discount=0.99, 31 | step_size=0.01, 32 | sampler_cls=ISSampler, 33 | sampler_args=dict(n_backtrack=1), 34 | ) 35 | algo.train() 36 | -------------------------------------------------------------------------------- /rllab/policies/uniform_control_policy.py: -------------------------------------------------------------------------------- 1 | from rllab.core.parameterized import Parameterized 2 | from rllab.core.serializable import Serializable 3 | from rllab.distributions.delta import Delta 4 | from rllab.policies.base import Policy 5 | from rllab.misc.overrides import overrides 6 | 7 | 8 | class UniformControlPolicy(Policy, Serializable): 9 | def __init__( 10 | self, 11 | env_spec, 12 | ): 13 | Serializable.quick_init(self, locals()) 14 | super(UniformControlPolicy, self).__init__(env_spec=env_spec) 15 | 16 | @overrides 17 | def get_action(self, observation): 18 | return self.action_space.sample(), dict() 19 | 20 | def get_params_internal(self, **tags): 21 | return [] 22 | 23 | def get_actions(self, observations): 24 | return self.action_space.sample_n(len(observations)), dict() 25 | 26 | @property 27 | def vectorized(self): 28 | return True 29 | 30 | def reset(self, dones=None): 31 | pass 32 | 33 | @property 34 | def distribution(self): 35 | # Just a placeholder 36 | return Delta() 37 | -------------------------------------------------------------------------------- /rllab/config_personal_template.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | USE_GPU = False 4 | 5 | DOCKER_IMAGE = "rein/rllab-exp-new" 6 | 7 | KUBE_PREFIX = "template_" 8 | 9 | DOCKER_LOG_DIR = "/tmp/expt" 10 | 11 | AWS_IMAGE_ID = "ami-67c5d00d" 12 | 13 | if USE_GPU: 14 | AWS_INSTANCE_TYPE = "g2.2xlarge" 15 | else: 16 | AWS_INSTANCE_TYPE = "c4.2xlarge" 17 | 18 | AWS_KEY_NAME = "research_virginia" 19 | 20 | AWS_SPOT = True 21 | 22 | AWS_SPOT_PRICE = '10.0' 23 | 24 | AWS_IAM_INSTANCE_PROFILE_NAME = "rllab" 25 | 26 | AWS_SECURITY_GROUPS = ["rllab"] 27 | 28 | AWS_REGION_NAME = "us-west-2" 29 | 30 | AWS_CODE_SYNC_S3_PATH = "e" 31 | 32 | CODE_SYNC_IGNORES = ["*.git/*", "*data/*", "*src/*", 33 | "*.pods/*", "*tests/*", "*examples/*", "docs/*"] 34 | 35 | LOCAL_CODE_DIR = "" 36 | 37 | AWS_S3_PATH = "" 38 | 39 | LABEL = "template" 40 | 41 | DOCKER_CODE_DIR = "/root/code/rllab" 42 | 43 | AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY", "") 44 | 45 | AWS_ACCESS_SECRET = os.environ.get("AWS_ACCESS_SECRET", "") 46 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/base.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | class Distribution(object): 6 | @property 7 | def dim(self): 8 | raise NotImplementedError 9 | 10 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 11 | """ 12 | Compute the symbolic KL divergence of two distributions 13 | """ 14 | raise NotImplementedError 15 | 16 | def kl(self, old_dist_info, new_dist_info): 17 | """ 18 | Compute the KL divergence of two distributions 19 | """ 20 | raise NotImplementedError 21 | 22 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 23 | raise NotImplementedError 24 | 25 | def entropy(self, dist_info): 26 | raise NotImplementedError 
27 | 28 | def log_likelihood_sym(self, x_var, dist_info_vars): 29 | raise NotImplementedError 30 | 31 | def log_likelihood(self, xs, dist_info): 32 | raise NotImplementedError 33 | 34 | @property 35 | def dist_info_specs(self): 36 | raise NotImplementedError 37 | 38 | @property 39 | def dist_info_keys(self): 40 | return [k for k, _ in self.dist_info_specs] 41 | -------------------------------------------------------------------------------- /rllab/distributions/base.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as TT 2 | 3 | class Distribution(object): 4 | 5 | @property 6 | def dim(self): 7 | raise NotImplementedError 8 | 9 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 10 | """ 11 | Compute the symbolic KL divergence of two distributions 12 | """ 13 | raise NotImplementedError 14 | 15 | def kl(self, old_dist_info, new_dist_info): 16 | """ 17 | Compute the KL divergence of two distributions 18 | """ 19 | raise NotImplementedError 20 | 21 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 22 | raise NotImplementedError 23 | 24 | def entropy(self, dist_info): 25 | raise NotImplementedError 26 | 27 | def log_likelihood_sym(self, x_var, dist_info_vars): 28 | raise NotImplementedError 29 | 30 | def likelihood_sym(self, x_var, dist_info_vars): 31 | return TT.exp(self.log_likelihood_sym(x_var, dist_info_vars)) 32 | 33 | def log_likelihood(self, xs, dist_info): 34 | raise NotImplementedError 35 | 36 | @property 37 | def dist_info_keys(self): 38 | raise NotImplementedError 39 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/vpg_cartpole.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from sandbox.rocky.tf.algos.vpg import VPG 5 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 6 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 7 | from rllab.envs.normalized_env import normalize 8 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.misc.instrument import stub, run_experiment_lite 11 | 12 | stub(globals()) 13 | 14 | env = TfEnv(normalize(CartpoleEnv())) 15 | 16 | policy = GaussianMLPPolicy( 17 | name="policy", 18 | env_spec=env.spec, 19 | # The neural network policy should have two hidden layers, each with 32 hidden units. 20 | hidden_sizes=(32, 32) 21 | ) 22 | 23 | baseline = LinearFeatureBaseline(env_spec=env.spec) 24 | 25 | algo = VPG( 26 | env=env, 27 | policy=policy, 28 | baseline=baseline, 29 | batch_size=10000, 30 | max_path_length=100, 31 | n_itr=40, 32 | discount=0.99, 33 | optimizer_args=dict( 34 | tf_optimizer_args=dict( 35 | learning_rate=0.01, 36 | ) 37 | ) 38 | ) 39 | run_experiment_lite( 40 | algo.train(), 41 | n_parallel=2, 42 | seed=1, 43 | ) 44 | -------------------------------------------------------------------------------- /rllab/exploration_strategies/gaussian_strategy.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | from rllab.spaces.box import Box 3 | from rllab.exploration_strategies.base import ExplorationStrategy 4 | import numpy as np 5 | 6 | 7 | class GaussianStrategy(ExplorationStrategy, Serializable): 8 | """ 9 | This strategy adds Gaussian noise to the action taken by the deterministic policy. 
10 | """ 11 | 12 | def __init__(self, env_spec, max_sigma=1.0, min_sigma=0.1, decay_period=1000000): 13 | assert isinstance(env_spec.action_space, Box) 14 | assert len(env_spec.action_space.shape) == 1 15 | Serializable.quick_init(self, locals()) 16 | self._max_sigma = max_sigma 17 | self._min_sigma = min_sigma 18 | self._decay_period = decay_period 19 | self._action_space = env_spec.action_space 20 | 21 | def get_action(self, t, observation, policy, **kwargs): 22 | action, agent_info = policy.get_action(observation) 23 | sigma = self._max_sigma - (self._max_sigma - self._min_sigma) * min(1.0, t * 1.0 / self._decay_period) 24 | return np.clip(action + np.random.normal(size=len(action)) * sigma, self._action_space.low, 25 | self._action_space.high) 26 | -------------------------------------------------------------------------------- /rllab/envs/proxy_env.py: -------------------------------------------------------------------------------- 1 | from .base import Env 2 | 3 | 4 | class ProxyEnv(Env): 5 | def __init__(self, wrapped_env): 6 | self._wrapped_env = wrapped_env 7 | 8 | @property 9 | def wrapped_env(self): 10 | return self._wrapped_env 11 | 12 | def reset(self, *args, **kwargs): 13 | return self._wrapped_env.reset(*args, **kwargs) 14 | 15 | @property 16 | def action_space(self): 17 | return self._wrapped_env.action_space 18 | 19 | @property 20 | def observation_space(self): 21 | return self._wrapped_env.observation_space 22 | 23 | def step(self, action): 24 | return self._wrapped_env.step(action) 25 | 26 | def render(self, *args, **kwargs): 27 | return self._wrapped_env.render(*args, **kwargs) 28 | 29 | def log_diagnostics(self, paths, prefix=''): 30 | self._wrapped_env.log_diagnostics(paths, prefix=prefix) 31 | 32 | @property 33 | def horizon(self): 34 | return self._wrapped_env.horizon 35 | 36 | def terminate(self): 37 | self._wrapped_env.terminate() 38 | 39 | def get_param_values(self): 40 | return self._wrapped_env.get_param_values() 41 | 42 | def set_param_values(self, params): 43 | self._wrapped_env.set_param_values(params) 44 | -------------------------------------------------------------------------------- /contrib/alexbeloi/examples/trpois_cartpole.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.trpo import TRPO 2 | from rllab.algos.tnpg import TNPG 3 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 4 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 5 | from rllab.envs.normalized_env import normalize 6 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | from contrib.alexbeloi.is_sampler import ISSampler 8 | 9 | """ 10 | Example using TRPO with ISSampler, iterations alternate between live and 11 | importance sampled iterations. 12 | """ 13 | 14 | env = normalize(CartpoleEnv()) 15 | 16 | policy = GaussianMLPPolicy( 17 | env_spec=env.spec, 18 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
19 | hidden_sizes=(32, 32) 20 | ) 21 | 22 | baseline = LinearFeatureBaseline(env_spec=env.spec) 23 | 24 | optimizer_args = dict( 25 | # debug_nan=True, 26 | # reg_coeff=0.1, 27 | # cg_iters=2 28 | ) 29 | 30 | algo = TRPO( 31 | env=env, 32 | policy=policy, 33 | baseline=baseline, 34 | batch_size=4000, 35 | max_path_length=100, 36 | n_itr=200, 37 | discount=0.99, 38 | step_size=0.01, 39 | sampler_cls=ISSampler, 40 | sampler_args=dict(n_backtrack=1), 41 | optimizer_args=optimizer_args 42 | ) 43 | algo.train() 44 | -------------------------------------------------------------------------------- /examples/trpo_cartpole_stub.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.trpo import TRPO 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from rllab.misc.instrument import stub, run_experiment_lite 6 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | 8 | stub(globals()) 9 | 10 | env = normalize(CartpoleEnv()) 11 | 12 | policy = GaussianMLPPolicy( 13 | env_spec=env.spec, 14 | # The neural network policy should have two hidden layers, each with 32 hidden units. 15 | hidden_sizes=(32, 32) 16 | ) 17 | 18 | baseline = LinearFeatureBaseline(env_spec=env.spec) 19 | 20 | algo = TRPO( 21 | env=env, 22 | policy=policy, 23 | baseline=baseline, 24 | batch_size=4000, 25 | max_path_length=100, 26 | n_itr=1000, 27 | discount=0.99, 28 | step_size=0.01, 29 | # Uncomment both lines (this and the plot parameter below) to enable plotting 30 | # plot=True, 31 | ) 32 | 33 | run_experiment_lite( 34 | algo.train(), 35 | # Number of parallel workers for sampling 36 | n_parallel=1, 37 | # Only keep the snapshot parameters for the last iteration 38 | snapshot_mode="last", 39 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 40 | # will be used 41 | seed=1, 42 | # plot=True, 43 | ) 44 | -------------------------------------------------------------------------------- /rllab/optimizers/minibatch_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BatchDataset(object): 5 | 6 | def __init__(self, inputs, batch_size, extra_inputs=None): 7 | self._inputs = [ 8 | i for i in inputs 9 | ] 10 | if extra_inputs is None: 11 | extra_inputs = [] 12 | self._extra_inputs = extra_inputs 13 | self._batch_size = batch_size 14 | if batch_size is not None: 15 | self._ids = np.arange(self._inputs[0].shape[0]) 16 | self.update() 17 | 18 | @property 19 | def number_batches(self): 20 | if self._batch_size is None: 21 | return 1 22 | return int(np.ceil(self._inputs[0].shape[0] * 1.0 / self._batch_size)) 23 | 24 | def iterate(self, update=True): 25 | if self._batch_size is None: 26 | yield list(self._inputs) + list(self._extra_inputs) 27 | else: 28 | for itr in range(self.number_batches): 29 | batch_start = itr * self._batch_size 30 | batch_end = (itr + 1) * self._batch_size 31 | batch_ids = self._ids[batch_start:batch_end] 32 | batch = [d[batch_ids] for d in self._inputs] 33 | yield list(batch) + list(self._extra_inputs) 34 | if update: 35 | self.update() 36 | 37 | def update(self): 38 | np.random.shuffle(self._ids) 39 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/trpo_cartpole_recurrent.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.algos.trpo import TRPO 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from sandbox.rocky.tf.policies.gaussian_gru_policy import GaussianGRUPolicy 6 | from sandbox.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy 7 | from sandbox.rocky.tf.envs.base import TfEnv 8 | import sandbox.rocky.tf.core.layers as L 9 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp 10 | from rllab.misc.instrument import stub, run_experiment_lite 11 | 12 | stub(globals()) 13 | 14 | env = TfEnv(normalize(CartpoleEnv())) 15 | 16 | policy = GaussianLSTMPolicy( 17 | name="policy", 18 | env_spec=env.spec, 19 | lstm_layer_cls=L.TfBasicLSTMLayer, 20 | # gru_layer_cls=L.GRULayer, 21 | ) 22 | 23 | baseline = LinearFeatureBaseline(env_spec=env.spec) 24 | 25 | algo = TRPO( 26 | env=env, 27 | policy=policy, 28 | baseline=baseline, 29 | batch_size=4000, 30 | max_path_length=100, 31 | n_itr=10, 32 | discount=0.99, 33 | step_size=0.01, 34 | optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) 35 | ) 36 | run_experiment_lite( 37 | algo.train(), 38 | n_parallel=4, 39 | seed=1, 40 | ) 41 | -------------------------------------------------------------------------------- /examples/vpg_point.py: -------------------------------------------------------------------------------- 1 | #from rllab.algos.vpg import VPG 2 | from sandbox.rocky.tf.algos.vpg import VPG 3 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 4 | from rllab.baselines.zero_baseline import ZeroBaseline 5 | from examples.point_env import PointEnv 6 | from examples.point_env_randgoal import PointEnvRandGoal 7 | from rllab.envs.normalized_env import normalize 8 | 
from rllab.misc.instrument import stub, run_experiment_lite 9 | #from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 10 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 11 | #from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy 12 | from sandbox.rocky.tf.envs.base import TfEnv 13 | 14 | stub(globals()) 15 | 16 | #env = TfEnv(normalize(PointEnv())) 17 | env = TfEnv(normalize(PointEnvRandGoal())) 18 | policy = GaussianMLPPolicy( 19 | name="policy", 20 | env_spec=env.spec, 21 | ) 22 | #baseline = LinearFeatureBaseline(env_spec=env.spec) 23 | baseline = ZeroBaseline(env_spec=env.spec) 24 | algo = VPG( 25 | env=env, 26 | policy=policy, 27 | baseline=baseline, 28 | #batch_size=20, 29 | max_path_length=5, 30 | n_itr=100, 31 | #plot=True, 32 | ) 33 | run_experiment_lite( 34 | algo.train(), 35 | n_parallel=1, 36 | snapshot_mode="last", 37 | seed=1, 38 | exp_prefix='deleteme', 39 | exp_name='deleteme', 40 | #plot=True, 41 | ) 42 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/trpo_cartpole.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from sandbox.rocky.tf.algos.trpo import TRPO 5 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 6 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 7 | from rllab.envs.normalized_env import normalize 8 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 9 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp 10 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 11 | from sandbox.rocky.tf.envs.base import TfEnv 12 | from rllab.misc.instrument import stub, run_experiment_lite 13 | 14 | stub(globals()) 15 | 16 | env = TfEnv(normalize(CartpoleEnv())) 17 | 18 | policy = GaussianMLPPolicy( 19 | name="policy", 20 | env_spec=env.spec, 21 | # The neural network policy should have two hidden layers, each with 32 hidden units. 22 | hidden_sizes=(32, 32) 23 | ) 24 | 25 | baseline = LinearFeatureBaseline(env_spec=env.spec) 26 | 27 | algo = TRPO( 28 | env=env, 29 | policy=policy, 30 | baseline=baseline, 31 | batch_size=4000, 32 | max_path_length=100, 33 | n_itr=40, 34 | discount=0.99, 35 | step_size=0.01, 36 | # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) 37 | 38 | ) 39 | run_experiment_lite( 40 | algo.train(), 41 | n_parallel=4, 42 | seed=1, 43 | ) 44 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. rllab documentation master file, created by 2 | sphinx-quickstart on Mon Feb 15 20:07:12 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to rllab 7 | ================ 8 | 9 | rllab is a framework for developing and evaluating reinforcement learning algorithms. 10 | 11 | rllab is a work in progress, input is welcome. The available documentation is limited for now. 12 | 13 | User Guide 14 | ========== 15 | 16 | The rllab user guide explains how to install rllab, how to run experiments, and how to implement new MDPs and new algorithms. 17 | 18 | .. 
toctree:: 19 | :maxdepth: 2 20 | 21 | user/installation 22 | user/experiments 23 | user/gym_integration 24 | user/implement_env 25 | user/implement_algo_basic 26 | user/implement_algo_advanced 27 | user/cluster 28 | 29 | 30 | Citing rllab 31 | ============ 32 | 33 | If you use rllab for academic research, you are highly encouraged to cite the following paper: 34 | 35 | - Yan Duan, Xi Chen, Rein Houthooft, John Schulman, Pieter Abbeel. "`Benchmarking Deep Reinforcement Learning for Continuous Control <https://arxiv.org/abs/1604.06778>`_. *Proceedings of the 33rd International Conference on Machine Learning (ICML), 2016.* 36 | 37 | 38 | Indices and tables 39 | ================== 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` 43 | * :ref:`search` 44 | 45 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: rllab3 2 | channels: 3 | - https://conda.anaconda.org/kne 4 | - https://conda.binstar.org/tlatorre 5 | - https://conda.anaconda.org/cjs14 6 | - https://conda.anaconda.org/menpo 7 | - jjhelmus 8 | dependencies: 9 | - python==3.5.2 10 | - numpy==1.10.4 11 | - scipy 12 | - path.py 13 | - python-dateutil 14 | - joblib==0.9.4 15 | - mako 16 | - ipywidgets 17 | - numba 18 | - flask 19 | - pybox2d 20 | - pygame 21 | - h5py 22 | - matplotlib 23 | - opencv3=3.1.0 24 | - scikit-learn 25 | - tensorflow 26 | - pip: 27 | - Pillow 28 | - atari-py 29 | - pyprind 30 | - ipdb 31 | - boto3 32 | - PyOpenGL 33 | - nose2 34 | - pyzmq 35 | - msgpack-python 36 | - mujoco_py 37 | - cached_property 38 | - line_profiler 39 | - Cython 40 | - git+https://github.com/Theano/Theano.git@adfe319ce6b781083d8dc3200fb4481b00853791#egg=Theano 41 | - git+https://github.com/neocxi/Lasagne.git@484866cf8b38d878e92d521be445968531646bb8#egg=Lasagne 42 | - git+https://github.com/plotly/plotly.py.git@2594076e29584ede2d09f2aa40a8a195b3f3fc66#egg=plotly 43 | - awscli 44 | - git+https://github.com/openai/gym.git 45 | - pyglet 46 | - git+https://github.com/neocxi/prettytensor.git 47 | - jupyter 48 | - progressbar2 49 | - chainer==1.15.0 50 | -------------------------------------------------------------------------------- /rllab/algos/erwr.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.vpg import VPG 2 | from rllab.optimizers.lbfgs_optimizer import LbfgsOptimizer 3 | from rllab.core.serializable import Serializable 4 | 5 | 6 | class ERWR(VPG, Serializable): 7 | """ 8 | Episodic Reward Weighted Regression [1]_ 9 | 10 | Notes 11 | ----- 12 | This does not implement the original RwR [2]_ that deals with "immediate reward problems" since 13 | it doesn't find solutions that optimize for temporally delayed rewards. 14 | 15 | .. [1] Kober, Jens, and Jan R. Peters. "Policy search for motor primitives in robotics." Advances in neural information processing systems. 2009. 16 | .. [2] Peters, Jan, and Stefan Schaal. "Using reward-weighted regression for reinforcement learning of task space control." Approximate Dynamic Programming and Reinforcement Learning, 2007. ADPRL 2007. IEEE International Symposium on. IEEE, 2007. 
17 | """ 18 | 19 | def __init__( 20 | self, 21 | optimizer=None, 22 | optimizer_args=None, 23 | positive_adv=None, 24 | **kwargs): 25 | Serializable.quick_init(self, locals()) 26 | if optimizer is None: 27 | if optimizer_args is None: 28 | optimizer_args = dict() 29 | optimizer = LbfgsOptimizer(**optimizer_args) 30 | super(ERWR, self).__init__( 31 | optimizer=optimizer, 32 | positive_adv=True if positive_adv is None else positive_adv, 33 | **kwargs 34 | ) 35 | 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 rllab contributors 4 | 5 | rllab uses a shared copyright model: each contributor holds copyright over 6 | their contributions to rllab. The project versioning records all such 7 | contribution and copyright details. 8 | By contributing to the rllab repository through pull-request, comment, 9 | or otherwise, the contributor releases their content to the license and 10 | copyright terms herein. 11 | 12 | Permission is hereby granted, free of charge, to any person obtaining a copy 13 | of this software and associated documentation files (the "Software"), to deal 14 | in the Software without restriction, including without limitation the rights 15 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | copies of the Software, and to permit persons to whom the Software is 17 | furnished to do so, subject to the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be included in all 20 | copies or substantial portions of the Software. 21 | 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 27 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 28 | SOFTWARE. 29 | -------------------------------------------------------------------------------- /rllab/spaces/base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Space(object): 5 | """ 6 | Provides a classification state spaces and action spaces, 7 | so you can write generic code that applies to any Environment. 8 | E.g. to choose a random action. 
9 | """ 10 | 11 | def sample(self, seed=0): 12 | """ 13 | Uniformly sample a random element of this space 14 | """ 15 | raise NotImplementedError 16 | 17 | def contains(self, x): 18 | """ 19 | Return boolean specifying if x is a valid 20 | member of this space 21 | """ 22 | raise NotImplementedError 23 | 24 | def flatten(self, x): 25 | raise NotImplementedError 26 | 27 | def unflatten(self, x): 28 | raise NotImplementedError 29 | 30 | def flatten_n(self, xs): 31 | raise NotImplementedError 32 | 33 | def unflatten_n(self, xs): 34 | raise NotImplementedError 35 | 36 | @property 37 | def flat_dim(self): 38 | """ 39 | The dimension of the flattened vector of the tensor representation 40 | """ 41 | raise NotImplementedError 42 | 43 | def new_tensor_variable(self, name, extra_dims): 44 | """ 45 | Create a Theano tensor variable given the name and extra dimensions prepended 46 | :param name: name of the variable 47 | :param extra_dims: extra dimensions in the front 48 | :return: the created tensor variable 49 | """ 50 | raise NotImplementedError 51 | -------------------------------------------------------------------------------- /examples/point_env_rand2goal.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Env 2 | from rllab.spaces import Box 3 | from rllab.envs.base import Step 4 | import numpy as np 5 | 6 | 7 | class PointEnvRandGoal(Env): 8 | def __init__(self): 9 | # TODO - call super class init? 10 | self._goal = None 11 | 12 | @property 13 | def observation_space(self): 14 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 15 | 16 | @property 17 | def action_space(self): 18 | return Box(low=-0.1, high=0.1, shape=(2,)) 19 | 20 | def reset(self, reset_args=None): 21 | goal = reset_args 22 | if goal is not None: 23 | self._goal = goal 24 | elif self._goal is None: 25 | #else: 26 | # Only set a new goal if this env hasn't had one defined before. 
27 | goals = [np.array([-0.5,0]), np.array([0.5,0])] 28 | self._goal = goals[np.random.randint(2)] 29 | 30 | self._state = (0, 0) 31 | observation = np.copy(self._state) 32 | return observation 33 | 34 | def step(self, action): 35 | self._state = self._state + action 36 | x, y = self._state 37 | x -= self._goal[0] 38 | y -= self._goal[1] 39 | reward = - (x ** 2 + y ** 2) ** 0.5 40 | done = abs(x) < 0.01 and abs(y) < 0.01 41 | next_observation = np.copy(self._state) 42 | return Step(observation=next_observation, reward=reward, done=done) 43 | 44 | def render(self): 45 | print('current state:', self._state) 46 | -------------------------------------------------------------------------------- /scripts/sync_s3.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('.') 3 | from rllab import config 4 | import os 5 | import argparse 6 | import ast 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('folder', type=str, default=None, nargs='?') 11 | parser.add_argument('--dry', action='store_true', default=False) 12 | parser.add_argument('--bare', action='store_true', default=False) 13 | parser.add_argument('--all', action='store_true', default=False) 14 | args = parser.parse_args() 15 | remote_dir = config.AWS_S3_PATH 16 | local_dir = os.path.join(config.LOG_DIR, "s3") 17 | if args.folder: 18 | remote_dir = os.path.join(remote_dir, args.folder) 19 | local_dir = os.path.join(local_dir, args.folder) 20 | if args.bare: 21 | command = (""" 22 | aws s3 sync {remote_dir} {local_dir} --exclude '*' --include '*.csv' --include '*.json' --content-type "UTF-8" 23 | """.format(local_dir=local_dir, remote_dir=remote_dir)) 24 | elif args.all: 25 | command = (""" 26 | aws s3 sync {remote_dir} {local_dir} --content-type "UTF-8" 27 | """.format(local_dir=local_dir, remote_dir=remote_dir)) 28 | else: 29 | command = (""" 30 | aws s3 sync {remote_dir} {local_dir} --exclude '*stdout.log' --exclude '*stdouterr.log' --content-type "UTF-8" 31 | """.format(local_dir=local_dir, remote_dir=remote_dir)) 32 | if args.dry: 33 | print(command) 34 | else: 35 | os.system(command) 36 | -------------------------------------------------------------------------------- /rllab/baselines/linear_feature_baseline.py: -------------------------------------------------------------------------------- 1 | from rllab.baselines.base import Baseline 2 | from rllab.misc.overrides import overrides 3 | import numpy as np 4 | 5 | 6 | class LinearFeatureBaseline(Baseline): 7 | def __init__(self, env_spec, reg_coeff=1e-5): 8 | self._coeffs = None 9 | self._reg_coeff = reg_coeff 10 | 11 | @overrides 12 | def get_param_values(self, **tags): 13 | return self._coeffs 14 | 15 | @overrides 16 | def set_param_values(self, val, **tags): 17 | self._coeffs = val 18 | 19 | def _features(self, path): 20 | o = np.clip(path["observations"], -10, 10) 21 | l = len(path["rewards"]) 22 | al = np.arange(l).reshape(-1, 1) / 100.0 23 | return np.concatenate([o, o ** 2, al, al ** 2, al ** 3, np.ones((l, 1))], axis=1) 24 | 25 | @overrides 26 | def fit(self, paths, **kwargs): 27 | featmat = np.concatenate([self._features(path) for path in paths]) 28 | returns = np.concatenate([path["returns"] for path in paths]) 29 | reg_coeff = self._reg_coeff 30 | for _ in range(5): 31 | self._coeffs = np.linalg.lstsq( 32 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 33 | featmat.T.dot(returns) 34 | )[0] 35 | if not np.any(np.isnan(self._coeffs)): 36 | 
break 37 | reg_coeff *= 10 38 | 39 | @overrides 40 | def predict(self, path): 41 | if self._coeffs is None: 42 | return np.zeros(len(path["rewards"])) 43 | return self._features(path).dot(self._coeffs) 44 | -------------------------------------------------------------------------------- /rllab/core/serializable.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | 4 | class Serializable(object): 5 | 6 | def __init__(self, *args, **kwargs): 7 | self.__args = args 8 | self.__kwargs = kwargs 9 | 10 | def quick_init(self, locals_): 11 | if getattr(self, "_serializable_initialized", False): 12 | return 13 | spec = inspect.getargspec(self.__init__) 14 | # Exclude the first "self" parameter 15 | in_order_args = [locals_[arg] for arg in spec.args][1:] 16 | if spec.varargs: 17 | varargs = locals_[spec.varargs] 18 | else: 19 | varargs = tuple() 20 | if spec.keywords: 21 | kwargs = locals_[spec.keywords] 22 | else: 23 | kwargs = dict() 24 | self.__args = tuple(in_order_args) + varargs 25 | self.__kwargs = kwargs 26 | setattr(self, "_serializable_initialized", True) 27 | 28 | def __getstate__(self): 29 | return {"__args": self.__args, "__kwargs": self.__kwargs} 30 | 31 | def __setstate__(self, d): 32 | # convert all __args to keyword-based arguments 33 | in_order_args = inspect.getargspec(self.__init__).args[1:] 34 | out = type(self)(**dict(zip(in_order_args, d["__args"]), **d["__kwargs"])) 35 | self.__dict__.update(out.__dict__) 36 | 37 | @classmethod 38 | def clone(cls, obj, **kwargs): 39 | assert isinstance(obj, Serializable) 40 | d = obj.__getstate__() 41 | d["__kwargs"] = dict(d["__kwargs"], **kwargs) 42 | out = type(obj).__new__(type(obj)) 43 | out.__setstate__(d) 44 | return out 45 | -------------------------------------------------------------------------------- /examples/ddpg_cartpole_stub.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | stub(globals()) 10 | 11 | env = normalize(CartpoleEnv()) 12 | 13 | policy = DeterministicMLPPolicy( 14 | env_spec=env.spec, 15 | # The neural network policy should have two hidden layers, each with 32 hidden units. 16 | hidden_sizes=(32, 32) 17 | ) 18 | 19 | es = OUStrategy(env_spec=env.spec) 20 | 21 | qf = ContinuousMLPQFunction(env_spec=env.spec) 22 | 23 | algo = DDPG( 24 | env=env, 25 | policy=policy, 26 | es=es, 27 | qf=qf, 28 | batch_size=32, 29 | max_path_length=100, 30 | epoch_length=1000, 31 | min_pool_size=10000, 32 | n_epochs=1000, 33 | discount=0.99, 34 | scale_reward=0.01, 35 | qf_learning_rate=1e-3, 36 | policy_learning_rate=1e-4, 37 | # Uncomment both lines (this and the plot parameter below) to enable plotting 38 | # plot=True, 39 | ) 40 | 41 | run_experiment_lite( 42 | algo.train(), 43 | # Number of parallel workers for sampling 44 | n_parallel=1, 45 | # Only keep the snapshot parameters for the last iteration 46 | snapshot_mode="last", 47 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 48 | # will be used 49 | seed=1, 50 | # plot=True, 51 | ) 52 | -------------------------------------------------------------------------------- /examples/old/sens_vpg_point.py: -------------------------------------------------------------------------------- 1 | #from rllab.algos.vpg import VPG 2 | from sandbox.rocky.tf.algos.sensitive_vpg import SensitiveVPG 3 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 4 | from rllab.baselines.zero_baseline import ZeroBaseline 5 | from examples.point_env import PointEnv 6 | from examples.point_env_randgoal import PointEnvRandGoal 7 | from rllab.envs.normalized_env import normalize 8 | from rllab.misc.instrument import stub, run_experiment_lite 9 | #from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 10 | from sandbox.rocky.tf.policies.sens_minimal_gauss_mlp_policy import SensitiveGaussianMLPPolicy 11 | from sandbox.rocky.tf.envs.base import TfEnv 12 | 13 | import tensorflow as tf 14 | 15 | stub(globals()) 16 | 17 | #env = TfEnv(normalize(PointEnv())) 18 | env = TfEnv(normalize(PointEnvRandGoal())) 19 | policy = SensitiveGaussianMLPPolicy( 20 | name="policy", 21 | env_spec=env.spec, 22 | grad_step_size=1.0, 23 | hidden_nonlinearity=tf.nn.relu, 24 | ) 25 | baseline = LinearFeatureBaseline(env_spec=env.spec) 26 | #baseline = ZeroBaseline(env_spec=env.spec) 27 | algo = SensitiveVPG( 28 | env=env, 29 | policy=policy, 30 | baseline=baseline, 31 | batch_size=20, # use 100 trajs for grad update 32 | max_path_length=5, 33 | meta_batch_size=100, 34 | n_itr=100, 35 | use_sensitive=False, 36 | optimizer_args={'learning_rate': 1e-3} 37 | #plot=True, 38 | ) 39 | run_experiment_lite( 40 | algo.train(), 41 | n_parallel=1, 42 | snapshot_mode="last", 43 | seed=1, 44 | exp_prefix='sensitive1dT5_2017_01_18', 45 | exp_name='nosensitive_linbaseline', 46 | #plot=True, 47 | ) 48 | -------------------------------------------------------------------------------- /rllab/envs/box2d/models/double_pendulum.xml.mako: -------------------------------------------------------------------------------- 1 | <% 2 | from rllab.misc.mako_utils import compute_rect_vertices 3 | link_len = opts['link_len'] 4 | link_width = 0.1 5 | %> 6 | 7 | 8 | 9 | 10 | 16 | 17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /rllab/baselines/gaussian_conv_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.misc.overrides import overrides 5 | from rllab.core.parameterized import Parameterized 6 | from rllab.baselines.base import Baseline 7 | from rllab.regressors.gaussian_conv_regressor import GaussianConvRegressor 8 | 9 | 10 | class GaussianConvBaseline(Baseline, Parameterized, Serializable): 11 | 12 | def __init__( 13 | self, 14 | env_spec, 15 | subsample_factor=1., 16 | regressor_args=None, 17 | ): 18 | Serializable.quick_init(self, locals()) 19 | super(GaussianConvBaseline, self).__init__(env_spec) 20 | if regressor_args is None: 21 | regressor_args = dict() 22 | 23 | self._regressor = GaussianConvRegressor( 24 | input_shape=env_spec.observation_space.shape, 25 | output_dim=1, 26 | name="vf", 27 | **regressor_args 28 | ) 29 | 30 | @overrides 31 | def fit(self, paths): 32 | observations = np.concatenate([p["observations"] for p in paths]) 33 | returns = 
np.concatenate([p["returns"] for p in paths]) 34 | self._regressor.fit(observations, returns.reshape((-1, 1))) 35 | 36 | @overrides 37 | def predict(self, path): 38 | return self._regressor.predict(path["observations"]).flatten() 39 | 40 | @overrides 41 | def get_param_values(self, **tags): 42 | return self._regressor.get_param_values(**tags) 43 | 44 | @overrides 45 | def set_param_values(self, flattened_params, **tags): 46 | self._regressor.set_param_values(flattened_params, **tags) 47 | -------------------------------------------------------------------------------- /examples/point_env_randgoal_oracle.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Env 2 | from rllab.spaces import Box 3 | from rllab.envs.base import Step 4 | import numpy as np 5 | 6 | 7 | class PointEnvRandGoalOracle(Env): 8 | def __init__(self, goal=None): 9 | # TODO - call super class init? 10 | self._goal = goal 11 | if goal is None: 12 | self.set_at_init = False 13 | else: 14 | self.set_at_init = True 15 | 16 | @property 17 | def observation_space(self): 18 | return Box(low=-np.inf, high=np.inf, shape=(4,)) 19 | 20 | @property 21 | def action_space(self): 22 | return Box(low=-0.1, high=0.1, shape=(2,)) 23 | 24 | def sample_goals(self, num_goals): 25 | return np.random.uniform(-0.5, 0.5, size=(num_goals, 2, )) 26 | 27 | def reset(self, reset_args=None): 28 | goal = reset_args 29 | if goal is not None: 30 | self._goal = goal 31 | elif not self.set_at_init: 32 | self._goal = np.random.uniform(-0.5, 0.5, size=(2,)) 33 | 34 | self._state = (0, 0) 35 | observation = np.copy(self._state) 36 | return np.r_[observation, np.copy(self._goal)] 37 | 38 | def step(self, action): 39 | self._state = self._state + action 40 | x, y = self._state 41 | x -= self._goal[0] 42 | y -= self._goal[1] 43 | reward = - (x ** 2 + y ** 2) ** 0.5 44 | done = abs(x) < 0.01 and abs(y) < 0.01 45 | next_observation = np.r_[np.copy(self._state), np.copy(self._goal)] 46 | return Step(observation=next_observation, reward=reward, done=done) 47 | 48 | def render(self): 49 | print('current state:', self._state) 50 | -------------------------------------------------------------------------------- /rllab/envs/sliding_mem_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.envs.base import Step 5 | from rllab.envs.proxy_env import ProxyEnv 6 | from rllab.misc import autoargs 7 | from rllab.misc.overrides import overrides 8 | from rllab.spaces import Box 9 | 10 | 11 | class SlidingMemEnv(ProxyEnv, Serializable): 12 | 13 | def __init__( 14 | self, 15 | env, 16 | n_steps=4, 17 | axis=0, 18 | ): 19 | super().__init__(env) 20 | Serializable.quick_init(self, locals()) 21 | self.n_steps = n_steps 22 | self.axis = axis 23 | self.buffer = None 24 | 25 | def reset_buffer(self, new_): 26 | assert self.axis == 0 27 | self.buffer = np.zeros(self.observation_space.shape, dtype=np.float32) 28 | self.buffer[0:] = new_ 29 | 30 | def add_to_buffer(self, new_): 31 | assert self.axis == 0 32 | self.buffer[1:] = self.buffer[:-1] 33 | self.buffer[:1] = new_ 34 | 35 | @property 36 | def observation_space(self): 37 | origin = self._wrapped_env.observation_space 38 | return Box( 39 | *[ 40 | np.repeat(b, self.n_steps, axis=self.axis) 41 | for b in origin.bounds 42 | ] 43 | ) 44 | 45 | @overrides 46 | def reset(self): 47 | obs = self._wrapped_env.reset() 48 | self.reset_buffer(obs) 49 | 
return self.buffer 50 | 51 | @overrides 52 | def step(self, action): 53 | next_obs, reward, done, info = self._wrapped_env.step(action) 54 | self.add_to_buffer(next_obs) 55 | return Step(self.buffer, reward, done, **info) 56 | 57 | -------------------------------------------------------------------------------- /rllab/baselines/gaussian_mlp_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.core.parameterized import Parameterized 5 | from rllab.baselines.base import Baseline 6 | from rllab.misc.overrides import overrides 7 | from rllab.regressors.gaussian_mlp_regressor import GaussianMLPRegressor 8 | 9 | 10 | class GaussianMLPBaseline(Baseline, Parameterized, Serializable): 11 | 12 | def __init__( 13 | self, 14 | env_spec, 15 | subsample_factor=1., 16 | num_seq_inputs=1, 17 | regressor_args=None, 18 | ): 19 | Serializable.quick_init(self, locals()) 20 | super(GaussianMLPBaseline, self).__init__(env_spec) 21 | if regressor_args is None: 22 | regressor_args = dict() 23 | 24 | self._regressor = GaussianMLPRegressor( 25 | input_shape=(env_spec.observation_space.flat_dim * num_seq_inputs,), 26 | output_dim=1, 27 | name="vf", 28 | **regressor_args 29 | ) 30 | 31 | @overrides 32 | def fit(self, paths, log=True): 33 | observations = np.concatenate([p["observations"] for p in paths]) 34 | returns = np.concatenate([p["returns"] for p in paths]) 35 | self._regressor.fit(observations, returns.reshape((-1, 1)), log=log) 36 | 37 | @overrides 38 | def predict(self, path): 39 | return self._regressor.predict(path["observations"]).flatten() 40 | 41 | @overrides 42 | def get_param_values(self, **tags): 43 | return self._regressor.get_param_values(**tags) 44 | 45 | @overrides 46 | def set_param_values(self, flattened_params, **tags): 47 | self._regressor.set_param_values(flattened_params, **tags) 48 | -------------------------------------------------------------------------------- /scripts/setup_mujoco.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$(uname)" == "Darwin" ]; then 4 | mujoco_file="libmujoco131.dylib" 5 | glfw_file="libglfw.3.dylib" 6 | zip_file="mjpro131_osx.zip" 7 | mktemp_cmd="mktemp -d /tmp/mujoco" 8 | elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then 9 | mujoco_file="libmujoco131.so" 10 | glfw_file="libglfw.so.3" 11 | zip_file="mjpro131_linux.zip" 12 | mktemp_cmd="mktemp -d" 13 | fi 14 | 15 | if [ ! -f vendor/mujoco/$mujoco_file ]; then 16 | read -e -p "Please enter the path to the mujoco zip file [$zip_file]:" path 17 | path=${path:-$zip_file} 18 | eval path=\"$path\" 19 | if [ ! -f $path ]; then 20 | echo "No file found at $path" 21 | exit 0 22 | fi 23 | rm -r /tmp/mujoco 24 | dir=`$mktemp_cmd` 25 | unzip $path -d $dir 26 | if [ ! -f $dir/mjpro131/bin/$mujoco_file ]; then 27 | echo "mjpro/$mujoco_file not found. Make sure you have the correct file (most likely named $zip_file)" 28 | exit 0 29 | fi 30 | if [ ! -f $dir/mjpro131/bin/$glfw_file ]; then 31 | echo "mjpro/$glfw_file not found. Make sure you have the correct file (most likely named $zip_file)" 32 | exit 0 33 | fi 34 | 35 | mkdir -p vendor/mujoco 36 | cp $dir/mjpro131/bin/$mujoco_file vendor/mujoco/ 37 | cp $dir/mjpro131/bin/$glfw_file vendor/mujoco/ 38 | fi 39 | 40 | if [ ! 
-f vendor/mujoco/mjkey.txt ]; then 41 | read -e -p "Please enter the path to the mujoco license file [mjkey.txt]:" path 42 | path=${path:-mjkey.txt} 43 | eval path=$path 44 | if [ ! -f $path ]; then 45 | echo "No file found at $path" 46 | exit 0 47 | fi 48 | cp $path vendor/mujoco/mjkey.txt 49 | fi 50 | 51 | echo "Mujoco has been set up!" 52 | -------------------------------------------------------------------------------- /scripts/sim_policy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import joblib 4 | import tensorflow as tf 5 | 6 | from rllab.misc.console import query_yes_no 7 | from rllab.sampler.utils import rollout 8 | 9 | if __name__ == "__main__": 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('file', type=str, 13 | help='path to the snapshot file') 14 | parser.add_argument('--max_path_length', type=int, default=1000, 15 | help='Max length of rollout') 16 | parser.add_argument('--speedup', type=float, default=1, 17 | help='Speedup') 18 | parser.add_argument('--video_filename', type=str, 19 | help='path to the out video file') 20 | parser.add_argument('--prompt', type=bool, default=False, 21 | help='Whether or not to prompt for more sim') 22 | args = parser.parse_args() 23 | 24 | max_tries = 10 25 | tri = 0 26 | while True: 27 | tri += 1 28 | with tf.Session() as sess: 29 | data = joblib.load(args.file) 30 | policy = data['policy'] 31 | env = data['env'] 32 | while True: 33 | path = rollout(env, policy, max_path_length=args.max_path_length, 34 | animated=True, speedup=args.speedup, video_filename=args.video_filename) 35 | if args.prompt: 36 | if not query_yes_no('Continue simulation?'): 37 | break 38 | else: 39 | break 40 | #import pdb; pdb.set_trace() 41 | if len(path['rewards']) < args.max_path_length and tri >= max_tries: 42 | tf.reset_default_graph() 43 | continue 44 | break 45 | -------------------------------------------------------------------------------- /rllab/envs/box2d/models/cartpole.xml.mako: -------------------------------------------------------------------------------- 1 | <% 2 | from rllab.misc.mako_utils import compute_rect_vertices 3 | cart_width = 4.0 / (12 ** 0.5) 4 | cart_height = 3.0 / (12 ** 0.5) 5 | 6 | pole_width = 0.1 7 | pole_height = 1.0 8 | noise = opts.get("noise", False) 9 | if noise: 10 | import numpy as np 11 | pole_height += (np.random.rand()-0.5) * pole_height * 1 12 | 13 | cart_friction = 0.0005 14 | pole_friction = 0.000002 15 | %> 16 | 17 | 18 | 19 | 20 | 26 | 27 | 28 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /examples/point_env_randgoal.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Env 2 | from rllab.spaces import Box 3 | from rllab.envs.base import Step 4 | import numpy as np 5 | 6 | 7 | class PointEnvRandGoal(Env): 8 | def __init__(self, goal=None): # Can set goal to test adaptation. 
9 | self._goal = goal 10 | 11 | @property 12 | def observation_space(self): 13 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 14 | 15 | @property 16 | def action_space(self): 17 | return Box(low=-0.1, high=0.1, shape=(2,)) 18 | 19 | def sample_goals(self, num_goals): 20 | return np.random.uniform(-0.5, 0.5, size=(num_goals, 2, )) 21 | 22 | def reset(self, reset_args=None): 23 | goal = reset_args 24 | if goal is not None: 25 | self._goal = goal 26 | elif self._goal is None: 27 | # Only set a new goal if this env hasn't had one defined before. 28 | self._goal = np.random.uniform(-0.5, 0.5, size=(2,)) 29 | #goals = [np.array([-0.5,0]), np.array([0.5,0])] 30 | #goals = np.array([[-0.5,0], [0.5,0],[0.2,0.2],[-0.2,-0.2],[0.5,0.5],[0,0.5],[0,-0.5],[-0.5,-0.5],[0.5,-0.5],[-0.5,0.5]]) 31 | #self._goal = goals[np.random.randint(10)] 32 | 33 | self._state = (0, 0) 34 | observation = np.copy(self._state) 35 | return observation 36 | 37 | def step(self, action): 38 | self._state = self._state + action 39 | x, y = self._state 40 | x -= self._goal[0] 41 | y -= self._goal[1] 42 | reward = - (x ** 2 + y ** 2) ** 0.5 43 | done = abs(x) < 0.01 and abs(y) < 0.01 44 | next_observation = np.copy(self._state) 45 | return Step(observation=next_observation, reward=reward, done=done, goal=self._goal) 46 | 47 | def render(self): 48 | print('current state:', self._state) 49 | -------------------------------------------------------------------------------- /tests/algos/test_trpo.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from rllab.envs.base import Env, Step 4 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 5 | from rllab.baselines.zero_baseline import ZeroBaseline 6 | from rllab.algos.trpo import TRPO 7 | from rllab.spaces.box import Box 8 | import lasagne.nonlinearities 9 | import numpy as np 10 | import theano.tensor as TT 11 | 12 | 13 | class DummyEnv(Env): 14 | @property 15 | def observation_space(self): 16 | return Box(low=-np.inf, high=np.inf, shape=(1,)) 17 | 18 | @property 19 | def action_space(self): 20 | return Box(low=-5.0, high=5.0, shape=(1,)) 21 | 22 | def reset(self): 23 | return np.zeros(1) 24 | 25 | def step(self, action): 26 | return Step(observation=np.zeros(1), reward=np.random.normal(), done=True) 27 | 28 | 29 | def naive_relu(x): 30 | return TT.max(x, 0) 31 | 32 | 33 | def test_trpo_relu_nan(): 34 | env = DummyEnv() 35 | policy = GaussianMLPPolicy( 36 | env_spec=env.spec, 37 | hidden_nonlinearity=naive_relu, 38 | hidden_sizes=(1,)) 39 | baseline = ZeroBaseline(env_spec=env.spec) 40 | algo = TRPO( 41 | env=env, policy=policy, baseline=baseline, n_itr=1, batch_size=1000, max_path_length=100, 42 | step_size=0.001 43 | ) 44 | algo.train() 45 | assert not np.isnan(np.sum(policy.get_param_values())) 46 | 47 | 48 | def test_trpo_deterministic_nan(): 49 | env = DummyEnv() 50 | policy = GaussianMLPPolicy( 51 | env_spec=env.spec, 52 | hidden_sizes=(1,)) 53 | policy._l_log_std.param.set_value([np.float32(np.log(1e-8))]) 54 | baseline = ZeroBaseline(env_spec=env.spec) 55 | algo = TRPO( 56 | env=env, policy=policy, baseline=baseline, n_itr=10, batch_size=1000, max_path_length=100, 57 | step_size=0.01 58 | ) 59 | algo.train() 60 | assert not np.isnan(np.sum(policy.get_param_values())) 61 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/vec_env_executor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 
as pickle 3 | from sandbox.rocky.tf.misc import tensor_utils 4 | 5 | 6 | class VecEnvExecutor(object): 7 | def __init__(self, envs, max_path_length): 8 | self.envs = envs 9 | self._action_space = envs[0].action_space 10 | self._observation_space = envs[0].observation_space 11 | self.ts = np.zeros(len(self.envs), dtype='int') 12 | self.max_path_length = max_path_length 13 | 14 | def step(self, action_n, reset_args=None): 15 | if reset_args is None: 16 | reset_args = [None]*len(self.envs) 17 | all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)] 18 | obs, rewards, dones, env_infos = list(map(list, list(zip(*all_results)))) 19 | dones = np.asarray(dones) 20 | rewards = np.asarray(rewards) 21 | self.ts += 1 22 | if self.max_path_length is not None: 23 | dones[self.ts >= self.max_path_length] = True 24 | for (i, done) in enumerate(dones): 25 | if done: 26 | obs[i] = self.envs[i].reset(reset_args=reset_args[i]) 27 | self.ts[i] = 0 28 | return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos) 29 | 30 | def reset(self, reset_args=None): 31 | if reset_args is not None: 32 | results = [env.reset(reset_args=arg) for env, arg in zip(self.envs, reset_args)] 33 | else: 34 | results = [env.reset() for env in self.envs] 35 | self.ts[:] = 0 36 | return results 37 | 38 | @property 39 | def num_envs(self): 40 | return len(self.envs) 41 | 42 | @property 43 | def action_space(self): 44 | return self._action_space 45 | 46 | @property 47 | def observation_space(self): 48 | return self._observation_space 49 | 50 | def terminate(self): 51 | pass 52 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/discrete.py: -------------------------------------------------------------------------------- 1 | from rllab.spaces.base import Space 2 | import numpy as np 3 | from rllab.misc import special 4 | from rllab.misc import ext 5 | import tensorflow as tf 6 | 7 | 8 | class Discrete(Space): 9 | """ 10 | {0,1,...,n-1} 11 | """ 12 | 13 | def __init__(self, n): 14 | self._n = n 15 | 16 | @property 17 | def n(self): 18 | return self._n 19 | 20 | def sample(self): 21 | return np.random.randint(self.n) 22 | 23 | def sample_n(self, n): 24 | return np.random.randint(low=0, high=self.n, size=n) 25 | 26 | def contains(self, x): 27 | x = np.asarray(x) 28 | return x.shape == () and x.dtype.kind == 'i' and x >= 0 and x < self.n 29 | 30 | def __repr__(self): 31 | return "Discrete(%d)" % self.n 32 | 33 | def __eq__(self, other): 34 | return self.n == other.n 35 | 36 | def flatten(self, x): 37 | return special.to_onehot(x, self.n) 38 | 39 | def unflatten(self, x): 40 | return special.from_onehot(x) 41 | 42 | def flatten_n(self, x): 43 | return special.to_onehot_n(x, self.n) 44 | 45 | def unflatten_n(self, x): 46 | return special.from_onehot_n(x) 47 | 48 | @property 49 | def default_value(self): 50 | return 0 51 | 52 | @property 53 | def flat_dim(self): 54 | return self.n 55 | 56 | def weighted_sample(self, weights): 57 | return special.weighted_sample(weights, range(self.n)) 58 | 59 | def new_tensor_variable(self, name, extra_dims): 60 | # needed for safe conversion to float32 61 | return tf.placeholder(dtype=tf.uint8, shape=[None] * extra_dims + [self.flat_dim], name=name) 62 | 63 | def __eq__(self, other): 64 | if not isinstance(other, Discrete): 65 | return False 66 | return self.n == other.n 67 | 68 | def __hash__(self): 69 | return hash(self.n) 70 | -------------------------------------------------------------------------------- 
/vendor/mujoco_models/point.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 32 | -------------------------------------------------------------------------------- /examples/icml/trpo_point.py: -------------------------------------------------------------------------------- 1 | 2 | from sandbox.rocky.tf.algos.trpo import TRPO 3 | from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy 4 | from sandbox.rocky.tf.envs.base import TfEnv 5 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 6 | from examples.point_env_randgoal import PointEnvRandGoal 7 | from examples.point_env_randgoal_oracle import PointEnvRandGoalOracle 8 | #from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv 9 | from rllab.envs.mujoco.walker2d_env import Walker2DEnv 10 | from rllab.envs.normalized_env import normalize 11 | from rllab.misc.instrument import stub, run_experiment_lite 12 | 13 | stub(globals()) 14 | 15 | import tensorflow as tf 16 | 17 | #env = normalize(PointEnvRandGoal()) 18 | env = normalize(PointEnvRandGoalOracle()) 19 | #env = normalize(HalfCheetahEnv()) 20 | #env = normalize(Walker2DEnv()) 21 | env = TfEnv(env) 22 | policy = GaussianMLPPolicy( 23 | name='policy', 24 | env_spec=env.spec, 25 | # The neural network policy should have two hidden layers, each with 32 hidden units. 26 | #hidden_sizes=(32, 32) 27 | #hidden_nonlinearity=tf.nn.relu, 28 | hidden_sizes=(100, 100) 29 | ) 30 | 31 | baseline = LinearFeatureBaseline(env_spec=env.spec) 32 | 33 | algo = TRPO( 34 | env=env, 35 | policy=policy, 36 | baseline=baseline, 37 | batch_size=500, # was 4k 38 | max_path_length=5, 39 | n_itr=100, 40 | discount=0.99, 41 | step_size=0.01, 42 | #plot=True, 43 | ) 44 | #algo.train() 45 | 46 | run_experiment_lite( 47 | algo.train(), 48 | # Number of parallel workers for sampling 49 | n_parallel=4, 50 | # Only keep the snapshot parameters for the last iteration 51 | snapshot_mode="last", 52 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 53 | # will be used 54 | seed=1, 55 | exp_prefix='vpg_sensitive_point', 56 | exp_name='oracleenv', 57 | #plot=True, 58 | ) 59 | -------------------------------------------------------------------------------- /examples/vpg_swimmer.py: -------------------------------------------------------------------------------- 1 | #from rllab.algos.vpg import VPG 2 | from sandbox.rocky.tf.algos.vpg import VPG 3 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 4 | from rllab.baselines.zero_baseline import ZeroBaseline 5 | from rllab.envs.mujoco.swimmer_env import SwimmerEnv 6 | from rllab.envs.mujoco.swimmer_randgoal_oracle_env import SwimmerRandGoalOracleEnv 7 | from rllab.envs.mujoco.swimmer_randgoal_env import SwimmerRandGoalEnv 8 | from rllab.envs.normalized_env import normalize 9 | from rllab.misc.instrument import stub, run_experiment_lite 10 | #from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 11 | #from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 12 | from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy 13 | from sandbox.rocky.tf.envs.base import TfEnv 14 | 15 | stub(globals()) 16 | oracle = False 17 | random = True 18 | 19 | if oracle: 20 | env = TfEnv(normalize(SwimmerRandGoalOracleEnv())) 21 | batch_size = 200 22 | elif random: 23 | env = TfEnv(normalize(SwimmerRandGoalEnv())) 24 | batch_size = 200 25 | else: 26 | env = TfEnv(normalize(SwimmerEnv())) 27 | batch_size = 20 28 | policy = GaussianMLPPolicy( 29 | name="policy", 30 | env_spec=env.spec, 31 | hidden_sizes=(100,100), 32 | ) 33 | baseline = LinearFeatureBaseline(env_spec=env.spec) 34 | #baseline = ZeroBaseline(env_spec=env.spec) 35 | algo = VPG( 36 | env=env, 37 | policy=policy, 38 | baseline=baseline, 39 | batch_size=500*batch_size, 40 | max_path_length=500, 41 | n_itr=500, 42 | #plot=True, 43 | optimizer_args={'tf_optimizer_args':{'learning_rate': 1e-3}}, 44 | ) 45 | run_experiment_lite( 46 | algo.train(), 47 | n_parallel=1, # try increasing this to make it faster??? 
(Maybe need to modify code for this) 48 | snapshot_mode="last", 49 | seed=1, 50 | exp_prefix='vpgswimmer', 51 | #exp_name='basic', 52 | exp_name='randomenv', 53 | #plot=True, 54 | ) 55 | -------------------------------------------------------------------------------- /icml/make_paths_plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pickle 4 | 5 | prefixes = ['maml', 'pretrain'] 6 | 7 | n_itr = 4 8 | goal = [-0.29554775, 0.37811744] 9 | 10 | 11 | plt.clf() 12 | plt.hold(True) 13 | itr_line_styles = [':', '-.', '--', '-'] 14 | maml_colors = ['dodgerblue', None, None, 'darkblue'] 15 | pretrain_colors = ['limegreen',None, None, 'darkgreen'] 16 | 17 | plt.figure(figsize=(9.0,4.5)) 18 | ind = 0 19 | #for itr in range(n_itr): 20 | for itr in [0,3]: 21 | with open('maml_paths_itr'+str(itr)+'.pkl', 'rb') as f: 22 | paths = pickle.load(f) 23 | points = paths[ind]['observations'] 24 | plt.plot(points[:,0], points[:,1], itr_line_styles[itr], color=maml_colors[itr], linewidth=2) 25 | plt.plot(goal[0], goal[1], 'r*', markersize=28, markeredgewidth=0) 26 | plt.title('MAML', fontsize=25) 27 | plt.legend(['pre-update', '3 steps', 'goal position'], fontsize=23, loc='upper right') #, 'pretrain preupdate', 'pretrain 3 steps']) 28 | plt.xlim([-0.5, 0.3]) 29 | plt.ylim([-0.2, 0.6]) 30 | plt.tight_layout() 31 | ax = plt.gca() 32 | plt.setp(ax.get_xticklabels(), fontsize=14) 33 | plt.setp(ax.get_yticklabels(), fontsize=14) 34 | plt.savefig('maml_paths_viz.png') 35 | 36 | plt.clf() 37 | #for itr in n_itr: 38 | for itr in [0,3]: 39 | with open('pretrain_paths_itr'+str(itr)+'.pkl', 'rb') as f: 40 | paths = pickle.load(f) 41 | points = paths[ind]['observations'] 42 | plt.plot(points[:,0], points[:,1], itr_line_styles[itr], color=pretrain_colors[itr], linewidth=2) 43 | plt.plot(goal[0], goal[1], 'r*', markersize=28, markeredgewidth=0) 44 | plt.title('pretrained', fontsize=25) 45 | plt.legend(['pre-update', '3 steps', 'goal position'], fontsize=23, loc='lower left') #, 'pretrain preupdate', 'pretrain 3 steps']) 46 | 47 | plt.xlim([-0.5, 0.3]) 48 | plt.ylim([-0.2, 0.6]) 49 | plt.tight_layout() 50 | ax = plt.gca() 51 | plt.setp(ax.get_xticklabels(), fontsize=14) 52 | plt.setp(ax.get_yticklabels(), fontsize=14) 53 | plt.savefig('pretrain_paths_viz.png') 54 | -------------------------------------------------------------------------------- /examples/trpo_gym.py: -------------------------------------------------------------------------------- 1 | use_tf = True 2 | 3 | 4 | if use_tf: 5 | from sandbox.rocky.tf.algos.trpo import TRPO 6 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | from sandbox.rocky.tf.envs.base import TfEnv 8 | else: 9 | from rllab.algos.trpo import TRPO 10 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 11 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 12 | from rllab.envs.gym_env import GymEnv 13 | from rllab.envs.normalized_env import normalize 14 | from rllab.misc.instrument import stub, run_experiment_lite 15 | 16 | stub(globals()) 17 | 18 | #env = normalize(GymEnv("Pendulum-v0")) 19 | env = normalize(GymEnv("Walker2d-v1")) 20 | 21 | if use_tf: 22 | env = TfEnv(env) 23 | policy = GaussianMLPPolicy( 24 | name='policy', 25 | env_spec=env.spec, 26 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
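        # Note: the TF policy above also takes a `name` argument for its variable scope;
        # the Theano policy constructed in the else-branch below does not.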
27 | hidden_sizes=(32, 32) 28 | ) 29 | else: 30 | policy = GaussianMLPPolicy( 31 | env_spec=env.spec, 32 | # The neural network policy should have two hidden layers, each with 32 hidden units. 33 | hidden_sizes=(32, 32) 34 | ) 35 | 36 | baseline = LinearFeatureBaseline(env_spec=env.spec) 37 | 38 | algo = TRPO( 39 | env=env, 40 | policy=policy, 41 | baseline=baseline, 42 | batch_size=4000, 43 | max_path_length=env.horizon, 44 | n_itr=10000, 45 | discount=0.99, 46 | step_size=0.01, 47 | force_batch_sampler=True, # for TF 48 | # Uncomment both lines (this and the plot parameter below) to enable plotting 49 | plot=True, 50 | ) 51 | 52 | run_experiment_lite( 53 | algo.train(), 54 | # Number of parallel workers for sampling 55 | n_parallel=1, 56 | # Only keep the snapshot parameters for the last iteration 57 | snapshot_mode="last", 58 | # Specifies the seed for the experiment. If this is not provided, a random seed 59 | # will be used 60 | seed=1, 61 | plot=True, 62 | ) 63 | -------------------------------------------------------------------------------- /examples/cluster_demo.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.trpo import TRPO 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from rllab.misc.instrument import stub, run_experiment_lite 6 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | import sys 8 | 9 | stub(globals()) 10 | 11 | from rllab.misc.instrument import VariantGenerator, variant 12 | 13 | class VG(VariantGenerator): 14 | 15 | @variant 16 | def step_size(self): 17 | return [0.01, 0.05, 0.1] 18 | 19 | @variant 20 | def seed(self): 21 | return [1, 11, 21, 31, 41] 22 | 23 | variants = VG().variants() 24 | 25 | for v in variants: 26 | 27 | env = normalize(CartpoleEnv()) 28 | 29 | policy = GaussianMLPPolicy( 30 | env_spec=env.spec, 31 | # The neural network policy should have two hidden layers, each with 32 hidden units. 32 | hidden_sizes=(32, 32) 33 | ) 34 | 35 | baseline = LinearFeatureBaseline(env_spec=env.spec) 36 | 37 | algo = TRPO( 38 | env=env, 39 | policy=policy, 40 | baseline=baseline, 41 | batch_size=4000, 42 | max_path_length=100, 43 | n_itr=40, 44 | discount=0.99, 45 | step_size=v["step_size"], 46 | # Uncomment both lines (this and the plot parameter below) to enable plotting 47 | # plot=True, 48 | ) 49 | 50 | run_experiment_lite( 51 | algo.train(), 52 | exp_prefix="first_exp", 53 | # Number of parallel workers for sampling 54 | n_parallel=1, 55 | # Only keep the snapshot parameters for the last iteration 56 | snapshot_mode="last", 57 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 58 | # will be used 59 | seed=v["seed"], 60 | # mode="local", 61 | mode="ec2", 62 | variant=v, 63 | # plot=True, 64 | # terminate_machine=False, 65 | ) 66 | -------------------------------------------------------------------------------- /rllab/plotter/plotter.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import sys 3 | if sys.version_info[0] == 2: 4 | from Queue import Empty 5 | else: 6 | from queue import Empty 7 | from multiprocessing import Process, Queue 8 | from rllab.sampler.utils import rollout 9 | import numpy as np 10 | 11 | __all__ = [ 12 | 'init_worker', 13 | 'init_plot', 14 | 'update_plot' 15 | ] 16 | 17 | process = None 18 | queue = None 19 | 20 | 21 | def _worker_start(): 22 | env = None 23 | policy = None 24 | max_length = None 25 | try: 26 | while True: 27 | msgs = {} 28 | # Only fetch the last message of each type 29 | while True: 30 | try: 31 | msg = queue.get_nowait() 32 | msgs[msg[0]] = msg[1:] 33 | except Empty: 34 | break 35 | if 'stop' in msgs: 36 | break 37 | elif 'update' in msgs: 38 | env, policy = msgs['update'] 39 | # env.start_viewer() 40 | elif 'demo' in msgs: 41 | param_values, max_length = msgs['demo'] 42 | policy.set_param_values(param_values) 43 | rollout(env, policy, max_path_length=max_length, animated=True, speedup=5) 44 | else: 45 | if max_length: 46 | rollout(env, policy, max_path_length=max_length, animated=True, speedup=5) 47 | except KeyboardInterrupt: 48 | pass 49 | 50 | 51 | def _shutdown_worker(): 52 | if process: 53 | queue.put(['stop']) 54 | queue.close() 55 | process.join() 56 | 57 | 58 | def init_worker(): 59 | global process, queue 60 | queue = Queue() 61 | process = Process(target=_worker_start) 62 | process.start() 63 | atexit.register(_shutdown_worker) 64 | 65 | 66 | def init_plot(env, policy): 67 | queue.put(['update', env, policy]) 68 | 69 | 70 | def update_plot(policy, max_length=np.inf): 71 | queue.put(['demo', policy.get_param_values(), max_length]) 72 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/point_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Step 2 | from .mujoco_env import MujocoEnv 3 | from rllab.core.serializable import Serializable 4 | from rllab.misc.overrides import overrides 5 | import numpy as np 6 | import math 7 | from rllab.mujoco_py import glfw 8 | 9 | 10 | class PointEnv(MujocoEnv, Serializable): 11 | 12 | """ 13 | Use Left, Right, Up, Down, A (steer left), D (steer right) 14 | """ 15 | 16 | FILE = 'point.xml' 17 | 18 | def __init__(self, *args, **kwargs): 19 | super(PointEnv, self).__init__(*args, **kwargs) 20 | Serializable.quick_init(self, locals()) 21 | 22 | def step(self, action): 23 | qpos = np.copy(self.model.data.qpos) 24 | qpos[2, 0] += action[1] 25 | ori = qpos[2, 0] 26 | # compute increment in each direction 27 | dx = math.cos(ori) * action[0] 28 | dy = math.sin(ori) * action[0] 29 | # ensure that the robot is within reasonable range 30 | qpos[0, 0] = np.clip(qpos[0, 0] + dx, -7, 7) 31 | qpos[1, 0] = np.clip(qpos[1, 0] + dy, -7, 7) 32 | self.model.data.qpos = qpos 33 | self.model.forward() 34 | next_obs = self.get_current_obs() 35 | return Step(next_obs, 0, False) 36 | 37 | def get_xy(self): 38 | qpos = self.model.data.qpos 39 | return qpos[0, 0], qpos[1, 0] 40 | 41 | def set_xy(self, xy): 42 | qpos = np.copy(self.model.data.qpos) 43 | qpos[0, 0] = xy[0] 44 | qpos[1, 0] = 
xy[1] 45 | self.model.data.qpos = qpos 46 | self.model.forward() 47 | 48 | @overrides 49 | def action_from_key(self, key): 50 | lb, ub = self.action_bounds 51 | if key == glfw.KEY_LEFT: 52 | return np.array([0, ub[0]*0.3]) 53 | elif key == glfw.KEY_RIGHT: 54 | return np.array([0, lb[0]*0.3]) 55 | elif key == glfw.KEY_UP: 56 | return np.array([ub[1], 0]) 57 | elif key == glfw.KEY_DOWN: 58 | return np.array([lb[1], 0]) 59 | else: 60 | return np.array([0, 0]) 61 | 62 | -------------------------------------------------------------------------------- /examples/trpo_point.py: -------------------------------------------------------------------------------- 1 | 2 | from sandbox.rocky.tf.algos.trpo import TRPO 3 | from sandbox.rocky.tf.algos.vpg import VPG 4 | from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy 5 | from sandbox.rocky.tf.envs.base import TfEnv 6 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 7 | from examples.point_env_randgoal import PointEnvRandGoal 8 | from examples.point_env_randgoal_oracle import PointEnvRandGoalOracle 9 | #from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv 10 | from rllab.envs.mujoco.walker2d_env import Walker2DEnv 11 | from rllab.envs.normalized_env import normalize 12 | from rllab.misc.instrument import stub, run_experiment_lite 13 | 14 | stub(globals()) 15 | 16 | import tensorflow as tf 17 | 18 | #env = normalize(PointEnvRandGoal()) 19 | env = normalize(PointEnvRandGoalOracle()) 20 | 21 | #env = normalize(HalfCheetahEnv()) 22 | #env = normalize(Walker2DEnv()) 23 | env = TfEnv(env) 24 | policy = GaussianMLPPolicy( 25 | name='policy', 26 | env_spec=env.spec, 27 | # The neural network policy should have two hidden layers, each with 32 hidden units. 28 | #hidden_sizes=(32, 32) 29 | hidden_nonlinearity=tf.nn.relu, 30 | hidden_sizes=(100, 100) 31 | ) 32 | 33 | baseline = LinearFeatureBaseline(env_spec=env.spec) 34 | 35 | algo = TRPO( 36 | #algo = VPG( 37 | env=env, 38 | policy=policy, 39 | baseline=baseline, 40 | batch_size=1000, # was 4k # 500 for path lenght of 5, 1000 for path length of 100 41 | max_path_length=100, 42 | n_itr=100, 43 | discount=0.99, 44 | step_size=0.01, 45 | #plot=True, 46 | ) 47 | #algo.train() 48 | 49 | run_experiment_lite( 50 | algo.train(), 51 | # Number of parallel workers for sampling 52 | n_parallel=4, 53 | # Only keep the snapshot parameters for the last iteration 54 | snapshot_mode="last", 55 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 56 | # will be used 57 | seed=1, 58 | exp_prefix='vpg_sensitive_point100', 59 | exp_name='oracleenv2', 60 | #plot=True, 61 | ) 62 | -------------------------------------------------------------------------------- /rllab/distributions/bernoulli.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from .base import Distribution 4 | import theano.tensor as TT 5 | import numpy as np 6 | 7 | TINY = 1e-8 8 | 9 | 10 | class Bernoulli(Distribution): 11 | def __init__(self, dim): 12 | self._dim = dim 13 | 14 | @property 15 | def dim(self): 16 | return self._dim 17 | 18 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 19 | old_p = old_dist_info_vars["p"] 20 | new_p = new_dist_info_vars["p"] 21 | kl = old_p * (TT.log(old_p + TINY) - TT.log(new_p + TINY)) + \ 22 | (1 - old_p) * (TT.log(1 - old_p + TINY) - TT.log(1 - new_p + TINY)) 23 | return TT.sum(kl, axis=-1) 24 | 25 | def kl(self, old_dist_info, new_dist_info): 26 | old_p = old_dist_info["p"] 27 | new_p = new_dist_info["p"] 28 | kl = old_p * (np.log(old_p + TINY) - np.log(new_p + TINY)) + \ 29 | (1 - old_p) * (np.log(1 - old_p + TINY) - np.log(1 - new_p + TINY)) 30 | return np.sum(kl, axis=-1) 31 | 32 | def sample(self, dist_info): 33 | p = np.asarray(dist_info["p"]) 34 | return np.cast['int'](np.random.uniform(low=0., high=1., size=p.shape) < p) 35 | 36 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 37 | old_p = old_dist_info_vars["p"] 38 | new_p = new_dist_info_vars["p"] 39 | return TT.prod(x_var * new_p / (old_p + TINY) + (1 - x_var) * (1 - new_p) / (1 - old_p + TINY), 40 | axis=-1) 41 | 42 | def log_likelihood_sym(self, x_var, dist_info_vars): 43 | p = dist_info_vars["p"] 44 | return TT.sum(x_var * TT.log(p + TINY) + (1 - x_var) * TT.log(1 - p + TINY), axis=-1) 45 | 46 | def log_likelihood(self, xs, dist_info): 47 | p = dist_info["p"] 48 | return np.sum(xs * np.log(p + TINY) + (1 - xs) * np.log(1 - p + TINY), axis=-1) 49 | 50 | def entropy(self, dist_info): 51 | p = dist_info["p"] 52 | return np.sum(- p * np.log(p + TINY) - (1 - p) * np.log(1 - p + TINY), axis=-1) 53 | 54 | @property 55 | def dist_info_keys(self): 56 | return ["p"] 57 | -------------------------------------------------------------------------------- /rllab/envs/box2d/cartpole_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rllab.envs.box2d.parser import find_body 3 | 4 | from rllab.core.serializable import Serializable 5 | from rllab.envs.box2d.box2d_env import Box2DEnv 6 | from rllab.misc import autoargs 7 | from rllab.misc.overrides import overrides 8 | 9 | 10 | class CartpoleEnv(Box2DEnv, Serializable): 11 | 12 | @autoargs.inherit(Box2DEnv.__init__) 13 | def __init__(self, *args, **kwargs): 14 | self.max_pole_angle = .2 15 | self.max_cart_pos = 2.4 16 | self.max_cart_speed = 4. 17 | self.max_pole_speed = 4. 
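        # reset_range scales the state bounds used in reset(): the initial cart position/velocity
        # and pole angle/velocity are sampled uniformly from
        # +/- reset_range * (max_cart_pos, max_cart_speed, max_pole_angle, max_pole_speed).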
18 | self.reset_range = 0.05 19 | super(CartpoleEnv, self).__init__( 20 | self.model_path("cartpole.xml.mako"), 21 | *args, **kwargs 22 | ) 23 | self.cart = find_body(self.world, "cart") 24 | self.pole = find_body(self.world, "pole") 25 | Serializable.__init__(self, *args, **kwargs) 26 | 27 | @overrides 28 | def reset(self): 29 | self._set_state(self.initial_state) 30 | self._invalidate_state_caches() 31 | bounds = np.array([ 32 | self.max_cart_pos, 33 | self.max_cart_speed, 34 | self.max_pole_angle, 35 | self.max_pole_speed 36 | ]) 37 | low, high = -self.reset_range*bounds, self.reset_range*bounds 38 | xpos, xvel, apos, avel = np.random.uniform(low, high) 39 | self.cart.position = (xpos, self.cart.position[1]) 40 | self.cart.linearVelocity = (xvel, self.cart.linearVelocity[1]) 41 | self.pole.angle = apos 42 | self.pole.angularVelocity = avel 43 | return self.get_current_obs() 44 | 45 | @overrides 46 | def compute_reward(self, action): 47 | yield 48 | notdone = 1 - int(self.is_current_done()) 49 | ucost = 1e-5*(action**2).sum() 50 | xcost = 1 - np.cos(self.pole.angle) 51 | yield notdone * 10 - notdone * xcost - notdone * ucost 52 | 53 | @overrides 54 | def is_current_done(self): 55 | return abs(self.cart.position[0]) > self.max_cart_pos or \ 56 | abs(self.pole.angle) > self.max_pole_angle 57 | 58 | -------------------------------------------------------------------------------- /rllab/envs/box2d/models/car_parking.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /scripts/resume_training.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.sampler.utils import rollout 5 | from rllab.algos.batch_polopt import BatchPolopt 6 | import argparse 7 | import joblib 8 | import uuid 9 | import os 10 | import random 11 | import numpy as np 12 | import json 13 | import subprocess 14 | from rllab.misc import logger 15 | from rllab.misc.instrument import to_local_command 16 | 17 | filename = str(uuid.uuid4()) 18 | 19 | if __name__ == "__main__": 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('file', type=str, 23 | help='path to the snapshot file') 24 | parser.add_argument('--log_dir', type=str, default=None, 25 | help='path to the new log directory') 26 | # Look for params.json file 27 | args = parser.parse_args() 28 | parent_dir = os.path.dirname(os.path.realpath(args.file)) 29 | json_file_path = os.path.join(parent_dir, "params.json") 30 | logger.log("Looking for params.json at %s..." % json_file_path) 31 | try: 32 | with open(json_file_path, "r") as f: 33 | params = json.load(f) 34 | # exclude certain parameters 35 | excluded = ['json_args'] 36 | for k in excluded: 37 | if k in params: 38 | del params[k] 39 | for k, v in list(params.items()): 40 | if v is None: 41 | del params[k] 42 | if args.log_dir is not None: 43 | params['log_dir'] = args.log_dir 44 | params['resume_from'] = args.file 45 | command = to_local_command(params, script='scripts/run_experiment_lite.py') 46 | print(command) 47 | try: 48 | subprocess.call(command, shell=True, env=os.environ) 49 | except Exception as e: 50 | print(e) 51 | if isinstance(e, KeyboardInterrupt): 52 | raise 53 | except IOError as e: 54 | logger.log("Failed to find json file. 
Continuing in non-stub mode...") 55 | data = joblib.load(args.file) 56 | assert 'algo' in data 57 | algo = data['algo'] 58 | assert isinstance(algo, BatchPolopt) 59 | algo.train() 60 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/ant_env.py: -------------------------------------------------------------------------------- 1 | from .mujoco_env import MujocoEnv 2 | from rllab.core.serializable import Serializable 3 | import numpy as np 4 | 5 | from rllab.envs.base import Step 6 | from rllab.misc.overrides import overrides 7 | from rllab.misc import logger 8 | 9 | 10 | class AntEnv(MujocoEnv, Serializable): 11 | 12 | FILE = 'ant.xml' 13 | 14 | def __init__(self, *args, **kwargs): 15 | super(AntEnv, self).__init__(*args, **kwargs) 16 | Serializable.__init__(self, *args, **kwargs) 17 | 18 | def get_current_obs(self): 19 | return np.concatenate([ 20 | self.model.data.qpos.flat, 21 | self.model.data.qvel.flat, 22 | np.clip(self.model.data.cfrc_ext, -1, 1).flat, 23 | self.get_body_xmat("torso").flat, 24 | self.get_body_com("torso"), 25 | ]).reshape(-1) 26 | 27 | def step(self, action): 28 | self.forward_dynamics(action) 29 | comvel = self.get_body_comvel("torso") 30 | forward_reward = comvel[0] 31 | lb, ub = self.action_bounds 32 | scaling = (ub - lb) * 0.5 33 | ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) 34 | contact_cost = 0.5 * 1e-3 * np.sum( 35 | np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), 36 | survive_reward = 0.05 37 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward 38 | state = self._state 39 | notdone = np.isfinite(state).all() \ 40 | and state[2] >= 0.2 and state[2] <= 1.0 41 | done = not notdone 42 | ob = self.get_current_obs() 43 | return Step(ob, float(reward), done) 44 | 45 | @overrides 46 | def log_diagnostics(self, paths): 47 | progs = [ 48 | path["observations"][-1][-3] - path["observations"][0][-3] 49 | for path in paths 50 | ] 51 | logger.record_tabular('AverageForwardProgress', np.mean(progs)) 52 | logger.record_tabular('MaxForwardProgress', np.max(progs)) 53 | logger.record_tabular('MinForwardProgress', np.min(progs)) 54 | logger.record_tabular('StdForwardProgress', np.std(progs)) 55 | 56 | -------------------------------------------------------------------------------- /icml/make_point_plots.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | #names = ['maml','sens0','random','oracle'] 8 | 9 | prefix = 'icml_point_results_' 10 | oracle_pkl = prefix+'oracle.pkl' 11 | 12 | maml_pkl = prefix+'maml.pkl' 13 | pretrain_pkl = prefix+'sens0.pkl' 14 | random_pkl = prefix+'random.pkl' 15 | 16 | key = 'task_avg_returns' 17 | 18 | n_itr = 4 19 | 20 | with open(oracle_pkl, 'rb') as f: 21 | oracle_data = np.array(pickle.load(f)[key])[0] 22 | 23 | 24 | oracle_data = np.reshape(oracle_data, [-1, 1]) 25 | oracle_data = np.tile(oracle_data[:,0:1], [1,n_itr]) 26 | 27 | fig = plt.figure() 28 | plt.clf() 29 | 30 | with open(maml_pkl, 'rb') as maml_f: 31 | maml_data = np.array(pickle.load(maml_f)[key]).T[:,:n_itr] 32 | 33 | with open(pretrain_pkl, 'rb') as f: 34 | pretrain_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 35 | 36 | with open(random_pkl, 'rb') as f: 37 | random_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 38 | 39 | 40 | sns.tsplot(time=range(n_itr), data=maml_data[:,:n_itr], color='g', linestyle='-', marker='o', 
condition='MAML (ours)') 41 | sns.tsplot(time=range(n_itr), data=pretrain_data[:,:n_itr], color='b', linestyle='--', marker='s', condition='pretrained') 42 | sns.tsplot(time=range(n_itr), data=random_data[:,:n_itr], color='k', linestyle=':', marker='^', condition='random') 43 | sns.tsplot(time=range(n_itr), data=oracle_data[:,:n_itr], color='r', linestyle='-.', marker='v', condition='oracle') 44 | ax = fig.gca() 45 | ax.set(yscale='symlog') 46 | 47 | plt.ylim([-100,-2.0]) 48 | 49 | plt.xlabel('number of gradient steps', fontsize=27) 50 | plt.ylabel('average return (log scale)', fontsize=27) 51 | lgd=plt.legend(['MAML (ours)', 'pretrained', 'random', 'oracle'], loc=(0.01, 0.51), fontsize=22) #, bbox_to_anchor=(1, 0.5), fontsize=18) 52 | plt.title('point robot, 2d navigation', fontsize=27) 53 | plt.tight_layout() 54 | 55 | ax = plt.gca() 56 | plt.setp(ax.get_xticklabels(), fontsize=18) 57 | plt.setp(ax.get_yticklabels(), fontsize=18) 58 | plt.xticks(np.arange(0,4,1.0)) 59 | plt.savefig('point_results.png', bbox_extra_artists=(lgd,), bbox_inches='tight') 60 | -------------------------------------------------------------------------------- /rllab/spaces/discrete.py: -------------------------------------------------------------------------------- 1 | from .base import Space 2 | import numpy as np 3 | from rllab.misc import special 4 | from rllab.misc import ext 5 | 6 | 7 | class Discrete(Space): 8 | """ 9 | {0,1,...,n-1} 10 | """ 11 | 12 | def __init__(self, n): 13 | self._n = n 14 | 15 | @property 16 | def n(self): 17 | return self._n 18 | 19 | def sample(self): 20 | return np.random.randint(self.n) 21 | 22 | def contains(self, x): 23 | x = np.asarray(x) 24 | return x.shape == () and x.dtype.kind == 'i' and x >= 0 and x < self.n 25 | 26 | def __repr__(self): 27 | return "Discrete(%d)" % self.n 28 | 29 | def __eq__(self, other): 30 | return self.n == other.n 31 | 32 | def flatten(self, x): 33 | return special.to_onehot(x, self.n) 34 | 35 | def unflatten(self, x): 36 | return special.from_onehot(x) 37 | 38 | def flatten_n(self, x): 39 | return special.to_onehot_n(x, self.n) 40 | 41 | def unflatten_n(self, x): 42 | return special.from_onehot_n(x) 43 | 44 | @property 45 | def flat_dim(self): 46 | return self.n 47 | 48 | def weighted_sample(self, weights): 49 | return special.weighted_sample(weights, range(self.n)) 50 | 51 | @property 52 | def default_value(self): 53 | return 0 54 | 55 | def new_tensor_variable(self, name, extra_dims): 56 | if self.n <= 2 ** 8: 57 | return ext.new_tensor( 58 | name=name, 59 | ndim=extra_dims+1, 60 | dtype='uint8' 61 | ) 62 | elif self.n <= 2 ** 16: 63 | return ext.new_tensor( 64 | name=name, 65 | ndim=extra_dims+1, 66 | dtype='uint16' 67 | ) 68 | else: 69 | return ext.new_tensor( 70 | name=name, 71 | ndim=extra_dims+1, 72 | dtype='uint32' 73 | ) 74 | 75 | def __eq__(self, other): 76 | if not isinstance(other, Discrete): 77 | return False 78 | return self.n == other.n 79 | 80 | def __hash__(self): 81 | return hash(self.n) -------------------------------------------------------------------------------- /icml/make_antdirec_plots.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | #names = ['maml','sens0','random','oracle'] 8 | 9 | prefix = 'icml_antdirec_results_' 10 | oracle_pkl = prefix+'oracle.pkl' 11 | 12 | maml_pkl = prefix+'maml.pkl' 13 | pretrain_pkl = prefix+'pretrain.pkl' 14 | random_pkl = 
prefix+'random.pkl' 15 | 16 | key = 'task_avg_returns' 17 | 18 | n_itr = 4 19 | 20 | with open(oracle_pkl, 'rb') as f: 21 | oracle_data = np.array(pickle.load(f)[key])[0] 22 | 23 | 24 | oracle_data = np.reshape(oracle_data, [-1, 1]) 25 | oracle_data = np.tile(oracle_data[:,0:1], [1,n_itr]) 26 | 27 | fig = plt.figure() 28 | plt.clf() 29 | 30 | with open(maml_pkl, 'rb') as maml_f: 31 | maml_data = np.array(pickle.load(maml_f)[key]).T[:,:n_itr] 32 | 33 | with open(pretrain_pkl, 'rb') as f: 34 | pretrain_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 35 | 36 | with open(random_pkl, 'rb') as f: 37 | random_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 38 | 39 | 40 | sns.tsplot(time=range(n_itr), data=maml_data[:,:n_itr], color='g', linestyle='-', marker='o', condition='MAML (ours)', legend=False) 41 | sns.tsplot(time=range(n_itr), data=pretrain_data[:,:n_itr], color='b', linestyle='--', marker='s', condition='pretrained', legend=False) 42 | sns.tsplot(time=range(n_itr), data=random_data[:,:n_itr], color='k', linestyle=':', marker='^', condition='random', legend=False) 43 | sns.tsplot(time=range(n_itr), data=oracle_data[:,:n_itr], color='r', linestyle='-.', marker='v', condition='oracle', legend=False) 44 | ax = fig.gca() 45 | #ax.set(yscale='symlog') 46 | 47 | #plt.ylim([-100,-2.0]) 48 | 49 | plt.xlabel('number of gradient steps', fontsize=26) 50 | plt.ylabel('average return', fontsize=26) 51 | #lgd=plt.legend(['MAML (ours)', 'pretrained', 'random', 'oracle'], loc=0, bbox_to_anchor=(1, 0.5), fontsize=20) 52 | plt.title('ant, forward/backward', fontsize=26) 53 | #plt.ylim([-0.04, 3.5]) 54 | plt.tight_layout() 55 | 56 | ax = plt.gca() 57 | plt.setp(ax.get_xticklabels(), fontsize=18) 58 | plt.setp(ax.get_yticklabels(), fontsize=18) 59 | plt.xticks(np.arange(0,4,1.0)) 60 | plt.savefig('antdirec_results.png', bbox_inches='tight') 61 | -------------------------------------------------------------------------------- /rllab/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import os 3 | 4 | PROJECT_PATH = osp.abspath(osp.join(osp.dirname(__file__), '..')) 5 | 6 | LOG_DIR = PROJECT_PATH + "/data" 7 | 8 | USE_TF = False 9 | 10 | DOCKER_IMAGE = "DOCKER_IMAGE" 11 | 12 | DOCKERFILE_PATH = "/path/to/Dockerfile" 13 | 14 | KUBE_PREFIX = "rllab_" 15 | 16 | DOCKER_LOG_DIR = "/tmp/expt" 17 | 18 | POD_DIR = PROJECT_PATH + "/.pods" 19 | 20 | AWS_S3_PATH = None 21 | 22 | AWS_IMAGE_ID = None 23 | 24 | AWS_INSTANCE_TYPE = "m4.xlarge" 25 | 26 | AWS_KEY_NAME = "AWS_KEY_NAME" 27 | 28 | AWS_SPOT = True 29 | 30 | AWS_SPOT_PRICE = '1.0' 31 | 32 | AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY", None) 33 | 34 | AWS_ACCESS_SECRET = os.environ.get("AWS_ACCESS_SECRET", None) 35 | 36 | AWS_IAM_INSTANCE_PROFILE_NAME = "rllab" 37 | 38 | AWS_SECURITY_GROUPS = ["rllab"] 39 | 40 | AWS_SECURITY_GROUP_IDS = [] 41 | 42 | AWS_NETWORK_INTERFACES = [] 43 | 44 | AWS_EXTRA_CONFIGS = dict() 45 | 46 | AWS_REGION_NAME = "us-east-1" 47 | 48 | CODE_SYNC_IGNORES = ["*.git/*", "*data/*", "*.pod/*"] 49 | 50 | DOCKER_CODE_DIR = "/root/code/rllab" 51 | 52 | AWS_CODE_SYNC_S3_PATH = "s3://to/be/overriden/in/personal" 53 | 54 | # whether to use fast code sync 55 | FAST_CODE_SYNC = True 56 | 57 | FAST_CODE_SYNC_IGNORES = [".git", "data", ".pods"] 58 | 59 | KUBE_DEFAULT_RESOURCES = { 60 | "requests": { 61 | "cpu": 0.8, 62 | } 63 | } 64 | 65 | KUBE_DEFAULT_NODE_SELECTOR = { 66 | "aws/type": "m4.xlarge", 67 | } 68 | 69 | MUJOCO_KEY_PATH = osp.expanduser("~/.mujoco") 70 | 71 | ENV = 
{} 72 | 73 | EBS_OPTIMIZED = True 74 | 75 | if osp.exists(osp.join(osp.dirname(__file__), "config_personal.py")): 76 | from .config_personal import * 77 | else: 78 | print("Creating your personal config from template...") 79 | from shutil import copy 80 | copy(osp.join(PROJECT_PATH, "rllab/config_personal_template.py"), osp.join(PROJECT_PATH, "rllab/config_personal.py")) 81 | from .config_personal import * 82 | print("Personal config created, but you should probably edit it before further experiments " \ 83 | "are run") 84 | if 'CIRCLECI' not in os.environ: 85 | print("Exiting.") 86 | import sys; sys.exit(0) 87 | 88 | LABEL = "" 89 | -------------------------------------------------------------------------------- /icml/make_cheetahdirec_plots.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | #names = ['maml','sens0','random','oracle'] 8 | 9 | prefix = 'icml_cheetahdirec_results_' 10 | oracle_pkl = prefix+'oracle.pkl' 11 | 12 | maml_pkl = prefix+'maml.pkl' 13 | pretrain_pkl = prefix+'pretrain.pkl' 14 | random_pkl = prefix+'random.pkl' 15 | 16 | key = 'task_avg_returns' 17 | 18 | n_itr = 4 19 | 20 | with open(oracle_pkl, 'rb') as f: 21 | oracle_data = np.array(pickle.load(f)[key])[0] 22 | 23 | 24 | oracle_data = np.reshape(oracle_data, [-1, 1]) 25 | oracle_data = np.tile(oracle_data[:,0:1], [1,n_itr]) 26 | 27 | fig = plt.figure() 28 | plt.clf() 29 | 30 | with open(maml_pkl, 'rb') as maml_f: 31 | maml_data = np.array(pickle.load(maml_f)[key]).T[:,:n_itr] 32 | 33 | with open(pretrain_pkl, 'rb') as f: 34 | pretrain_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 35 | 36 | with open(random_pkl, 'rb') as f: 37 | random_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 38 | 39 | 40 | sns.tsplot(time=range(n_itr), data=maml_data[:,:n_itr], color='g', linestyle='-', marker='o', condition='MAML (ours)', legend=False) 41 | sns.tsplot(time=range(n_itr), data=pretrain_data[:,:n_itr], color='b', linestyle='--', marker='s', condition='pretrained', legend=False) 42 | sns.tsplot(time=range(n_itr), data=random_data[:,:n_itr], color='k', linestyle=':', marker='^', condition='random', legend=False) 43 | sns.tsplot(time=range(n_itr), data=oracle_data[:,:n_itr], color='r', linestyle='-.', marker='v', condition='oracle', legend=False) 44 | ax = fig.gca() 45 | #ax.set(yscale='symlog') 46 | 47 | #plt.ylim([-100,-2.0]) 48 | 49 | plt.xlabel('number of gradient steps', fontsize=26) 50 | plt.ylabel('average return', fontsize=26) 51 | #lgd=plt.legend(['MAML (ours)', 'pretrained', 'random', 'oracle'], loc=0, bbox_to_anchor=(1, 0.5), fontsize=20) 52 | plt.title('half-cheetah, forward/backward', fontsize=26) 53 | #plt.ylim([-0.04, 3.5]) 54 | plt.tight_layout() 55 | 56 | ax = plt.gca() 57 | plt.setp(ax.get_xticklabels(), fontsize=18) 58 | plt.setp(ax.get_yticklabels(), fontsize=18) 59 | plt.xticks(np.arange(0,4,1.0)) 60 | plt.savefig('cheetahdirec_results.png', bbox_inches='tight') 61 | -------------------------------------------------------------------------------- /rllab/envs/box2d/mountain_car_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pygame 3 | from rllab.envs.box2d.parser import find_body 4 | 5 | from rllab.core.serializable import Serializable 6 | from rllab.envs.box2d.box2d_env import Box2DEnv 7 | from rllab.misc import autoargs 8 | from rllab.misc.overrides import overrides 9 | 
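# Box2D mountain-car task: each step yields a reward of -1 plus a height bonus
# (height_bonus * cart height), and the episode ends once the cart reaches
# goal_cart_pos or leaves the +/- max_cart_pos range (see compute_reward and
# is_current_done below).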
10 | 11 | class MountainCarEnv(Box2DEnv, Serializable): 12 | 13 | @autoargs.inherit(Box2DEnv.__init__) 14 | @autoargs.arg("height_bonus_coeff", type=float, 15 | help="Height bonus added to each step's reward") 16 | @autoargs.arg("goal_cart_pos", type=float, 17 | help="Goal horizontal position") 18 | def __init__(self, 19 | height_bonus=1., 20 | goal_cart_pos=0.6, 21 | *args, **kwargs): 22 | super(MountainCarEnv, self).__init__( 23 | self.model_path("mountain_car.xml.mako"), 24 | *args, **kwargs 25 | ) 26 | self.max_cart_pos = 2 27 | self.goal_cart_pos = goal_cart_pos 28 | self.height_bonus = height_bonus 29 | self.cart = find_body(self.world, "cart") 30 | Serializable.quick_init(self, locals()) 31 | 32 | @overrides 33 | def compute_reward(self, action): 34 | yield 35 | yield (-1 + self.height_bonus * self.cart.position[1]) 36 | 37 | @overrides 38 | def is_current_done(self): 39 | return self.cart.position[0] >= self.goal_cart_pos \ 40 | or abs(self.cart.position[0]) >= self.max_cart_pos 41 | 42 | @overrides 43 | def reset(self): 44 | self._set_state(self.initial_state) 45 | self._invalidate_state_caches() 46 | bounds = np.array([ 47 | [-1], 48 | [1], 49 | ]) 50 | low, high = bounds 51 | xvel = np.random.uniform(low, high) 52 | self.cart.linearVelocity = (xvel, self.cart.linearVelocity[1]) 53 | return self.get_current_obs() 54 | 55 | @overrides 56 | def action_from_keys(self, keys): 57 | if keys[pygame.K_LEFT]: 58 | return np.asarray([-1]) 59 | elif keys[pygame.K_RIGHT]: 60 | return np.asarray([+1]) 61 | else: 62 | return np.asarray([0]) 63 | 64 | -------------------------------------------------------------------------------- /rllab/envs/box2d/models/car_parking.xml.rb: -------------------------------------------------------------------------------- 1 | car_height = 1.0 2 | car_width = 0.6 3 | car_mass = 1 4 | car_density = car_mass / car_height / car_width 5 | 6 | wheel_height = 0.3 7 | wheel_width = 0.1 8 | wheel_mass = 0.1 9 | wheel_density = wheel_mass / wheel_height / wheel_width 10 | wheel_max_deg = 30 11 | 12 | phantom_group = -1 13 | common = { group: phantom_group } 14 | 15 | box2d { 16 | world(timestep: 0.05, gravity: [0, 0]) { 17 | body(name: :goal, type: :static, position: [0, 0]) { 18 | fixture(common.merge(shape: :circle, radius: 1)) 19 | } 20 | 21 | car_pos = [3, 4] 22 | body(name: :car, type: :dynamic, position: car_pos) { 23 | rect( 24 | box: [car_width / 2, car_height / 2], 25 | density: car_density, 26 | group: phantom_group, 27 | ) 28 | } 29 | [:left_front_wheel, :right_front_wheel, :left_rear_wheel, :right_rear_wheel].each do |wheel| 30 | x_pos = car_width / 2 31 | x_pos *= wheel =~ /left/ ? -1 : 1 32 | y_pos = wheel =~ /front/ ? 0.2 : -0.3 33 | body(name: wheel, type: :dynamic, position: [car_pos[0] + x_pos, car_pos[1] + y_pos]) { 34 | rect( 35 | box: [wheel_width / 2, wheel_height / 2], 36 | density: wheel_density, 37 | group: phantom_group, 38 | ) 39 | } 40 | # limit = wheel =~ /front/ ? 
[-wheel_max_deg, wheel_max_deg] : [0, 0] 41 | limit = [0, 0] 42 | joint( 43 | type: :revolute, 44 | name: "#{wheel}_joint", 45 | bodyA: :car, 46 | bodyB: wheel, 47 | localAnchorA: [x_pos, y_pos], 48 | localAnchorB: [0, 0], 49 | limit: limit, 50 | ) 51 | end 52 | control( 53 | type: :force, 54 | bodies: [:left_front_wheel, :right_front_wheel], 55 | anchor: [0, 0], 56 | direction: [0, 1], 57 | ctrllimit: [-10.N, 10.N], 58 | ) 59 | state body: :car, type: :xvel 60 | state body: :car, type: :yvel 61 | state body: :car, type: :dist, to: :goal 62 | state body: :car, type: :angle, to: :goal, transform: :cos 63 | state body: :car, type: :angle, to: :goal, transform: :sin 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /rllab/sampler/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rllab.misc import tensor_utils 3 | import time 4 | 5 | 6 | def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1, save_video=True, video_filename='sim_out.mp4', reset_arg=None): 7 | observations = [] 8 | actions = [] 9 | rewards = [] 10 | agent_infos = [] 11 | env_infos = [] 12 | images = [] 13 | o = env.reset(reset_args=reset_arg) 14 | agent.reset() 15 | path_length = 0 16 | if animated: 17 | env.render() 18 | while path_length < max_path_length: 19 | a, agent_info = agent.get_action(o) 20 | next_o, r, d, env_info = env.step(a) 21 | observations.append(env.observation_space.flatten(o)) 22 | rewards.append(r) 23 | actions.append(env.action_space.flatten(a)) 24 | agent_infos.append(agent_info) 25 | env_infos.append(env_info) 26 | path_length += 1 27 | if d: # and not animated: # TODO testing 28 | break 29 | o = next_o 30 | if animated: 31 | env.render() 32 | timestep = 0.05 33 | time.sleep(timestep / speedup) 34 | if save_video: 35 | from PIL import Image 36 | image = env.wrapped_env.wrapped_env.get_viewer().get_image() 37 | pil_image = Image.frombytes('RGB', (image[1], image[2]), image[0]) 38 | images.append(np.flipud(np.array(pil_image))) 39 | 40 | if animated: 41 | if save_video and len(images) >= max_path_length: 42 | import moviepy.editor as mpy 43 | clip = mpy.ImageSequenceClip(images, fps=20*speedup) 44 | if video_filename[-3:] == 'gif': 45 | clip.write_gif(video_filename, fps=20*speedup) 46 | else: 47 | clip.write_videofile(video_filename, fps=20*speedup) 48 | #return 49 | 50 | return dict( 51 | observations=tensor_utils.stack_tensor_list(observations), 52 | actions=tensor_utils.stack_tensor_list(actions), 53 | rewards=tensor_utils.stack_tensor_list(rewards), 54 | agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos), 55 | env_infos=tensor_utils.stack_tensor_dict_list(env_infos), 56 | ) 57 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/half_cheetah_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.envs.base import Step 5 | from rllab.envs.mujoco.mujoco_env import MujocoEnv 6 | from rllab.misc import logger 7 | from rllab.misc.overrides import overrides 8 | 9 | 10 | def smooth_abs(x, param): 11 | return np.sqrt(np.square(x) + np.square(param)) - param 12 | 13 | 14 | class HalfCheetahEnv(MujocoEnv, Serializable): 15 | 16 | FILE = 'half_cheetah.xml' 17 | 18 | def __init__(self, *args, **kwargs): 19 | super(HalfCheetahEnv, self).__init__(*args, **kwargs) 20 | 
Serializable.__init__(self, *args, **kwargs) 21 | 22 | def get_current_obs(self): 23 | return np.concatenate([ 24 | self.model.data.qpos.flatten()[1:], 25 | self.model.data.qvel.flat, 26 | self.get_body_com("torso").flat, 27 | ]) 28 | 29 | def get_body_xmat(self, body_name): 30 | idx = self.model.body_names.index(body_name) 31 | return self.model.data.xmat[idx].reshape((3, 3)) 32 | 33 | def get_body_com(self, body_name): 34 | idx = self.model.body_names.index(body_name) 35 | return self.model.data.com_subtree[idx] 36 | 37 | def step(self, action): 38 | self.forward_dynamics(action) 39 | next_obs = self.get_current_obs() 40 | action = np.clip(action, *self.action_bounds) 41 | ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action)) 42 | #run_cost = -1 * self.get_body_comvel("torso")[0] 43 | run_cost = 1.*np.abs(self.get_body_comvel("torso")[0] - 0.1) 44 | cost = ctrl_cost + run_cost 45 | reward = -cost 46 | done = False 47 | return Step(next_obs, reward, done) 48 | 49 | @overrides 50 | def log_diagnostics(self, paths, prefix=''): 51 | progs = [ 52 | path["observations"][-1][-3] - path["observations"][0][-3] 53 | for path in paths 54 | ] 55 | logger.record_tabular(prefix+'AverageForwardProgress', np.mean(progs)) 56 | logger.record_tabular(prefix+'MaxForwardProgress', np.max(progs)) 57 | logger.record_tabular(prefix+'MinForwardProgress', np.min(progs)) 58 | logger.record_tabular(prefix+'StdForwardProgress', np.std(progs)) 59 | -------------------------------------------------------------------------------- /rllab/regressors/product_regressor.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import numpy as np 5 | from rllab.core.serializable import Serializable 6 | 7 | 8 | class ProductRegressor(Serializable): 9 | """ 10 | A class for performing MLE regression by fitting a product distribution to the outputs. A separate regressor will 11 | be trained for each individual input distribution. 
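    Outputs are handled column-wise: ys is split according to each regressor's
    output_dim (see _split_ys) and predictions are concatenated back along axis 1.

    Hypothetical usage sketch (r1 and r2 stand in for regressors with output_dim
    2 and 3; they are not defined in this module):

        reg = ProductRegressor([r1, r2])
        reg.fit(xs, ys)          # ys has 5 columns: 0-1 go to r1, 2-4 go to r2
        y_hat = reg.predict(xs)  # shape (N, 5), per-regressor predictions concatenated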
12 | """ 13 | 14 | def __init__(self, regressors): 15 | """ 16 | :param regressors: List of individual regressors 17 | """ 18 | Serializable.quick_init(self, locals()) 19 | self.regressors = regressors 20 | self.output_dims = [x.output_dim for x in regressors] 21 | 22 | def _split_ys(self, ys): 23 | ys = np.asarray(ys) 24 | split_ids = np.cumsum(self.output_dims)[:-1] 25 | return np.split(ys, split_ids, axis=1) 26 | 27 | def fit(self, xs, ys): 28 | for regressor, split_ys in zip(self.regressors, self._split_ys(ys)): 29 | regressor.fit(xs, split_ys) 30 | 31 | def predict(self, xs): 32 | return np.concatenate([ 33 | regressor.predict(xs) for regressor in self.regressors 34 | ], axis=1) 35 | 36 | def sample_predict(self, xs): 37 | return np.concatenate([ 38 | regressor.sample_predict(xs) for regressor in self.regressors 39 | ], axis=1) 40 | 41 | def predict_log_likelihood(self, xs, ys): 42 | return np.sum([ 43 | regressor.predict_log_likelihood(xs, split_ys) 44 | for regressor, split_ys in zip(self.regressors, self._split_ys(ys)) 45 | ], axis=0) 46 | 47 | def get_param_values(self, **tags): 48 | return np.concatenate( 49 | [regressor.get_param_values(**tags) for regressor in self.regressors] 50 | ) 51 | 52 | def set_param_values(self, flattened_params, **tags): 53 | param_dims = [ 54 | np.prod(regressor.get_param_shapes(**tags)) 55 | for regressor in self.regressors 56 | ] 57 | split_ids = np.cumsum(param_dims)[:-1] 58 | for regressor, split_param_values in zip(self.regressors, np.split(flattened_params, split_ids)): 59 | regressor.set_param_values(split_param_values) 60 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/walker2d_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.envs.base import Step 5 | from rllab.envs.mujoco.mujoco_env import MujocoEnv 6 | from rllab.misc import autoargs 7 | from rllab.misc import logger 8 | from rllab.misc.overrides import overrides 9 | 10 | 11 | def smooth_abs(x, param): 12 | return np.sqrt(np.square(x) + np.square(param)) - param 13 | 14 | 15 | class Walker2DEnv(MujocoEnv, Serializable): 16 | 17 | FILE = 'walker2d.xml' 18 | 19 | @autoargs.arg('ctrl_cost_coeff', type=float, 20 | help='cost coefficient for controls') 21 | def __init__( 22 | self, 23 | ctrl_cost_coeff=1e-2, 24 | *args, **kwargs): 25 | self.ctrl_cost_coeff = ctrl_cost_coeff 26 | super(Walker2DEnv, self).__init__(*args, **kwargs) 27 | Serializable.quick_init(self, locals()) 28 | 29 | def get_current_obs(self): 30 | return np.concatenate([ 31 | self.model.data.qpos.flat, 32 | self.model.data.qvel.flat, 33 | self.get_body_com("torso").flat, 34 | ]) 35 | 36 | def step(self, action): 37 | self.forward_dynamics(action) 38 | next_obs = self.get_current_obs() 39 | action = np.clip(action, *self.action_bounds) 40 | lb, ub = self.action_bounds 41 | scaling = (ub - lb) * 0.5 42 | ctrl_cost = 0.5 * self.ctrl_cost_coeff * \ 43 | np.sum(np.square(action / scaling)) 44 | forward_reward = self.get_body_comvel("torso")[0] 45 | reward = forward_reward - ctrl_cost 46 | qpos = self.model.data.qpos 47 | done = not (qpos[0] > 0.8 and qpos[0] < 2.0 48 | and qpos[2] > -1.0 and qpos[2] < 1.0) 49 | return Step(next_obs, reward, done) 50 | 51 | @overrides 52 | def log_diagnostics(self, paths): 53 | progs = [ 54 | path["observations"][-1][-3] - path["observations"][0][-3] 55 | for path in paths 56 | ] 57 | 
logger.record_tabular('AverageForwardProgress', np.mean(progs)) 58 | logger.record_tabular('MaxForwardProgress', np.max(progs)) 59 | logger.record_tabular('MinForwardProgress', np.min(progs)) 60 | logger.record_tabular('StdForwardProgress', np.std(progs)) 61 | 62 | -------------------------------------------------------------------------------- /tests/test_instrument.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.misc import instrument 5 | from nose2.tools import such 6 | 7 | 8 | class TestClass(object): 9 | @property 10 | def arr(self): 11 | return [1, 2, 3] 12 | 13 | @property 14 | def compound_arr(self): 15 | return [dict(a=1)] 16 | 17 | 18 | with such.A("instrument") as it: 19 | @it.should 20 | def test_concretize(): 21 | it.assertEqual(instrument.concretize([5]), [5]) 22 | it.assertEqual(instrument.concretize((5,)), (5,)) 23 | fake_globals = dict(TestClass=TestClass) 24 | instrument.stub(fake_globals) 25 | modified = fake_globals["TestClass"] 26 | it.assertIsInstance(modified, instrument.StubClass) 27 | it.assertIsInstance(modified(), instrument.StubObject) 28 | it.assertEqual(instrument.concretize((5,)), (5,)) 29 | it.assertIsInstance(instrument.concretize(modified()), TestClass) 30 | 31 | 32 | @it.should 33 | def test_chained_call(): 34 | fake_globals = dict(TestClass=TestClass) 35 | instrument.stub(fake_globals) 36 | modified = fake_globals["TestClass"] 37 | it.assertIsInstance(modified().arr[0], instrument.StubMethodCall) 38 | it.assertIsInstance(modified().compound_arr[0]["a"], instrument.StubMethodCall) 39 | it.assertEqual(instrument.concretize(modified().arr[0]), 1) 40 | 41 | 42 | @it.should 43 | def test_variant_generator(): 44 | 45 | vg = instrument.VariantGenerator() 46 | vg.add("key1", [1, 2, 3]) 47 | vg.add("key2", [True, False]) 48 | vg.add("key3", lambda key2: [1] if key2 else [1, 2]) 49 | it.assertEqual(len(vg.variants()), 9) 50 | 51 | class VG(instrument.VariantGenerator): 52 | 53 | @instrument.variant 54 | def key1(self): 55 | return [1, 2, 3] 56 | 57 | @instrument.variant 58 | def key2(self): 59 | yield True 60 | yield False 61 | 62 | @instrument.variant 63 | def key3(self, key2): 64 | if key2: 65 | yield 1 66 | else: 67 | yield 1 68 | yield 2 69 | 70 | it.assertEqual(len(VG().variants()), 9) 71 | 72 | it.createTests(globals()) 73 | -------------------------------------------------------------------------------- /icml/make_cheetah_plots.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | #names = ['maml','sens0','random','oracle'] 8 | 9 | prefix = 'icml_cheetah_results_' 10 | oracle_pkl = prefix+'oracle.pkl' 11 | 12 | maml_pkl = prefix+'maml.pkl' 13 | pretrain_pkl = prefix+'pretrain.pkl' 14 | random_pkl = prefix+'random.pkl' 15 | 16 | key = 'task_avg_returns' 17 | 18 | n_itr = 4 19 | 20 | with open(oracle_pkl, 'rb') as f: 21 | oracle_data = np.array(pickle.load(f)[key])[0] 22 | 23 | 24 | oracle_data = np.reshape(oracle_data, [-1, 1]) 25 | oracle_data = np.tile(oracle_data[:,0:1], [1,n_itr]) 26 | 27 | fig = plt.figure() 28 | plt.clf() 29 | 30 | with open(maml_pkl, 'rb') as maml_f: 31 | maml_data = np.array(pickle.load(maml_f)[key]).T[:,:n_itr] 32 | 33 | with open(pretrain_pkl, 'rb') as f: 34 | pretrain_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 35 | 36 | with open(random_pkl, 'rb') as f: 37 | random_data = 
np.array(pickle.load(f)[key]).T[:,:n_itr] 38 | 39 | legend=False 40 | sns.tsplot(time=range(n_itr), data=maml_data[:,:n_itr], color='g', linestyle='-', marker='o', condition='MAML (ours)', legend=legend) 41 | sns.tsplot(time=range(n_itr), data=pretrain_data[:,:n_itr], color='b', linestyle='--', marker='s', condition='pretrained', legend=legend) 42 | #sns.tsplot(time=range(n_itr), data=random_data[:,:n_itr], color='k', linestyle=':', marker='^', condition='random', legend=legend) 43 | sns.tsplot(time=range(n_itr), data=oracle_data[:,:n_itr], color='r', linestyle='-.', marker='v', condition='oracle', legend=legend) 44 | ax = fig.gca() 45 | 46 | plt.xlabel('number of gradient steps', fontsize=26) 47 | plt.ylabel('average return', fontsize=26) 48 | if legend: 49 | lgd=plt.legend(['MAML (ours)', 'pretrained', 'random', 'oracle'], loc=0, bbox_to_anchor=(1, 0.5), fontsize=20) 50 | plt.title('half-cheetah, goal velocity', fontsize=26) 51 | #plt.ylim([-0.04, 3.5]) 52 | plt.tight_layout() 53 | 54 | ax = plt.gca() 55 | plt.setp(ax.get_xticklabels(), fontsize=18) 56 | plt.setp(ax.get_yticklabels(), fontsize=18) 57 | plt.xticks(np.arange(0,4,1.0)) 58 | 59 | if legend: 60 | plt.savefig('cheetah_results.png', bbox_extra_artists=(lgd,), transparent=True, bbox_inches='tight') 61 | else: 62 | plt.savefig('cheetah_results.png', bbox_inches='tight') 63 | 64 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/inverted_double_pendulum_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.envs.base import Step 5 | from rllab.envs.mujoco.mujoco_env import MujocoEnv 6 | from rllab.misc import autoargs 7 | from rllab.misc.overrides import overrides 8 | 9 | 10 | class InvertedDoublePendulumEnv(MujocoEnv, Serializable): 11 | FILE = 'inverted_double_pendulum.xml.mako' 12 | 13 | @autoargs.arg("random_start", type=bool, 14 | help="Randomized starting position by adjusting the angles" 15 | "When this is false, the double pendulum started out" 16 | "in balanced position") 17 | def __init__( 18 | self, 19 | *args, **kwargs): 20 | self.random_start = kwargs.get("random_start", True) 21 | super(InvertedDoublePendulumEnv, self).__init__(*args, **kwargs) 22 | Serializable.quick_init(self, locals()) 23 | 24 | @overrides 25 | def get_current_obs(self): 26 | return np.concatenate([ 27 | self.model.data.qpos[:1], # cart x pos 28 | np.sin(self.model.data.qpos[1:]), # link angles 29 | np.cos(self.model.data.qpos[1:]), 30 | np.clip(self.model.data.qvel, -10, 10), 31 | np.clip(self.model.data.qfrc_constraint, -10, 10) 32 | ]).reshape(-1) 33 | 34 | @overrides 35 | def step(self, action): 36 | self.forward_dynamics(action) 37 | next_obs = self.get_current_obs() 38 | x, _, y = self.model.data.site_xpos[0] 39 | dist_penalty = 0.01 * x ** 2 + (y - 2) ** 2 40 | v1, v2 = self.model.data.qvel[1:3] 41 | vel_penalty = 1e-3 * v1 ** 2 + 5e-3 * v2 ** 2 42 | alive_bonus = 10 43 | r = float(alive_bonus - dist_penalty - vel_penalty) 44 | done = y <= 1 45 | return Step(next_obs, r, done) 46 | 47 | @overrides 48 | def reset_mujoco(self, init_state=None): 49 | assert init_state is None 50 | qpos = np.copy(self.init_qpos) 51 | if self.random_start: 52 | qpos[1] = (np.random.rand() - 0.5) * 40 / 180. 
* np.pi 53 | self.model.data.qpos = qpos 54 | self.model.data.qvel = self.init_qvel 55 | self.model.data.qacc = self.init_qacc 56 | self.model.data.ctrl = self.init_ctrl 57 | -------------------------------------------------------------------------------- /rllab/misc/resolve.py: -------------------------------------------------------------------------------- 1 | from pydoc import locate 2 | import types 3 | from rllab.misc.ext import iscanr 4 | 5 | 6 | def classesinmodule(module): 7 | md = module.__dict__ 8 | return [ 9 | md[c] for c in md if ( 10 | isinstance(md[c], type) and md[c].__module__ == module.__name__ 11 | ) 12 | ] 13 | 14 | 15 | def locate_with_hint(class_path, prefix_hints=[]): 16 | module_or_class = locate(class_path) 17 | if module_or_class is None: 18 | # for hint in iscanr(lambda x, y: x + "." + y, prefix_hints): 19 | # module_or_class = locate(hint + "." + class_path) 20 | # if module_or_class: 21 | # break 22 | hint = ".".join(prefix_hints) 23 | module_or_class = locate(hint + "." + class_path) 24 | return module_or_class 25 | 26 | 27 | def load_class(class_path, superclass=None, prefix_hints=[]): 28 | module_or_class = locate_with_hint(class_path, prefix_hints) 29 | if module_or_class is None: 30 | raise ValueError("Cannot find module or class under path %s" % class_path) 31 | if type(module_or_class) == types.ModuleType: 32 | classes = classesinmodule(module_or_class) 33 | classes = [x for x in classes if superclass is None or issubclass(x, superclass)] 34 | if len(classes) == 0: 35 | if superclass: 36 | raise ValueError('Could not find any subclasses of %s defined in module %s' % (str(superclass), class_path)) 37 | else: 38 | raise ValueError('Could not find any classes defined in module %s' % (class_path)) 39 | elif len(classes) > 1: 40 | if superclass: 41 | raise ValueError('Multiple subclasses of %s are defined in the module %s' % (str(superclass), class_path)) 42 | else: 43 | raise ValueError('Multiple classes are defined in the module %s' % (class_path)) 44 | else: 45 | return classes[0] 46 | elif isinstance(module_or_class, type): 47 | if superclass is None or issubclass(module_or_class, superclass): 48 | return module_or_class 49 | else: 50 | raise ValueError('The class %s is not a subclass of %s' % (str(module_or_class), str(superclass))) 51 | else: 52 | raise ValueError('Unsupported object: %s' % str(module_or_class)) 53 | -------------------------------------------------------------------------------- /icml/make_ant_plots.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | #names = ['maml','sens0','random','oracle'] 8 | 9 | prefix = 'icml_ant_results_' 10 | oracle_pkl = prefix+'oracle.pkl' 11 | 12 | maml_pkl = prefix+'maml.pkl' 13 | pretrain_pkl = prefix+'pretrain.pkl' 14 | random_pkl = prefix+'random.pkl' 15 | 16 | key = 'task_avg_returns' 17 | 18 | n_itr = 4 19 | 20 | with open(oracle_pkl, 'rb') as f: 21 | oracle_data = np.array(pickle.load(f)[key])[0] 22 | 23 | 24 | oracle_data = np.reshape(oracle_data, [-1, 1]) 25 | oracle_data = np.tile(oracle_data[:,0:1], [1,n_itr]) 26 | 27 | fig = plt.figure() 28 | plt.clf() 29 | 30 | with open(maml_pkl, 'rb') as maml_f: 31 | maml_data = np.array(pickle.load(maml_f)[key]).T[:,:n_itr] 32 | 33 | with open(pretrain_pkl, 'rb') as f: 34 | pretrain_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 35 | 36 | with open(random_pkl, 'rb') as f: 37 | random_data =
np.array(pickle.load(f)[key]).T[:,:n_itr] 38 | 39 | 40 | legend=False 41 | sns.tsplot(time=range(n_itr), data=maml_data[:,:n_itr], color='g', linestyle='-', marker='o', condition='MAML (ours)', legend=legend) 42 | sns.tsplot(time=range(n_itr), data=pretrain_data[:,:n_itr], color='b', linestyle='--', marker='s', condition='pretrained', legend=legend) 43 | #sns.tsplot(time=range(n_itr), data=random_data[:,:n_itr], color='k', linestyle=':', marker='^', condition='random', legend=legend) 44 | sns.tsplot(time=range(n_itr), data=oracle_data[:,:n_itr], color='r', linestyle='-.', marker='v', condition='oracle', legend=legend) 45 | ax = fig.gca() 46 | 47 | plt.xlabel('number of gradient steps', fontsize=26) 48 | plt.ylabel('average return', fontsize=26) 49 | if legend: 50 | lgd=plt.legend(['MAML (ours)', 'pretrained', 'random', 'oracle'], loc=0, bbox_to_anchor=(1, 0.5), fontsize=20) 51 | plt.title('ant, goal velocity', fontsize=26) 52 | #plt.ylim([-0.04, 3.5]) 53 | plt.tight_layout() 54 | #ax.set(yscale='symlog') 55 | 56 | ax = plt.gca() 57 | plt.setp(ax.get_xticklabels(), fontsize=18) 58 | plt.setp(ax.get_yticklabels(), fontsize=18) 59 | plt.xticks(np.arange(0,4,1.0)) 60 | 61 | if legend: 62 | plt.savefig('ant_results.png', bbox_extra_artists=(lgd,), transparent=True, bbox_inches='tight') 63 | else: 64 | plt.savefig('ant_results.png', bbox_inches='tight') 65 | 66 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/bernoulli.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from .base import Distribution 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | TINY = 1e-8 8 | 9 | 10 | class Bernoulli(Distribution): 11 | def __init__(self, dim): 12 | self._dim = dim 13 | 14 | @property 15 | def dim(self): 16 | return self._dim 17 | 18 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 19 | old_p = old_dist_info_vars["p"] 20 | new_p = new_dist_info_vars["p"] 21 | kl = old_p * (tf.log(old_p + TINY) - tf.log(new_p + TINY)) + \ 22 | (1 - old_p) * (tf.log(1 - old_p + TINY) - tf.log(1 - new_p + TINY)) 23 | ndims = kl.get_shape().ndims 24 | return tf.reduce_sum(kl, reduction_indices=ndims - 1) 25 | 26 | def kl(self, old_dist_info, new_dist_info): 27 | old_p = old_dist_info["p"] 28 | new_p = new_dist_info["p"] 29 | kl = old_p * (np.log(old_p + TINY) - np.log(new_p + TINY)) + \ 30 | (1 - old_p) * (np.log(1 - old_p + TINY) - np.log(1 - new_p + TINY)) 31 | return np.sum(kl, axis=-1) 32 | 33 | def sample(self, dist_info): 34 | p = np.asarray(dist_info["p"]) 35 | return np.cast['int'](np.random.uniform(low=0., high=1., size=p.shape) < p) 36 | 37 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 38 | old_p = old_dist_info_vars["p"] 39 | new_p = new_dist_info_vars["p"] 40 | ndims = old_p.get_shape().ndims 41 | return tf.reduce_prod(x_var * new_p / (old_p + TINY) + (1 - x_var) * (1 - new_p) / (1 - old_p + TINY), 42 | reduction_indices=ndims - 1) 43 | 44 | def log_likelihood_sym(self, x_var, dist_info_vars): 45 | p = dist_info_vars["p"] 46 | ndims = p.get_shape().ndims 47 | return tf.reduce_sum(x_var * tf.log(p + TINY) + (1 - x_var) * tf.log(1 - p + TINY), reduction_indices=ndims - 1) 48 | 49 | def log_likelihood(self, xs, dist_info): 50 | p = dist_info["p"] 51 | return np.sum(xs * np.log(p + TINY) + (1 - xs) * np.log(1 - p + TINY), axis=-1) 52 | 53 | def entropy(self, dist_info): 54 | p = dist_info["p"] 55 | return np.sum(- p * np.log(p + TINY) - (1 - p) * np.log(1 
- p + TINY), axis=-1) 56 | 57 | @property 58 | def dist_info_keys(self): 59 | return ["p"] 60 | -------------------------------------------------------------------------------- /rllab/envs/box2d/double_pendulum_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rllab.envs.box2d.parser import find_body 3 | 4 | from rllab.core.serializable import Serializable 5 | from rllab.envs.box2d.box2d_env import Box2DEnv 6 | from rllab.misc import autoargs 7 | from rllab.misc.overrides import overrides 8 | 9 | 10 | # http://mlg.eng.cam.ac.uk/pilco/ 11 | class DoublePendulumEnv(Box2DEnv, Serializable): 12 | 13 | @autoargs.inherit(Box2DEnv.__init__) 14 | def __init__(self, *args, **kwargs): 15 | # make sure mdp-level step is 100ms long 16 | kwargs["frame_skip"] = kwargs.get("frame_skip", 2) 17 | if kwargs.get("template_args", {}).get("noise", False): 18 | self.link_len = (np.random.rand()-0.5) + 1 19 | else: 20 | self.link_len = 1 21 | kwargs["template_args"] = kwargs.get("template_args", {}) 22 | kwargs["template_args"]["link_len"] = self.link_len 23 | super(DoublePendulumEnv, self).__init__( 24 | self.model_path("double_pendulum.xml.mako"), 25 | *args, **kwargs 26 | ) 27 | self.link1 = find_body(self.world, "link1") 28 | self.link2 = find_body(self.world, "link2") 29 | Serializable.__init__(self, *args, **kwargs) 30 | 31 | @overrides 32 | def reset(self): 33 | self._set_state(self.initial_state) 34 | self._invalidate_state_caches() 35 | stds = np.array([0.1, 0.1, 0.01, 0.01]) 36 | pos1, pos2, v1, v2 = np.random.randn(*stds.shape) * stds 37 | self.link1.angle = pos1 38 | self.link2.angle = pos2 39 | self.link1.angularVelocity = v1 40 | self.link2.angularVelocity = v2 41 | return self.get_current_obs() 42 | 43 | def get_tip_pos(self): 44 | cur_center_pos = self.link2.position 45 | cur_angle = self.link2.angle 46 | cur_pos = ( 47 | cur_center_pos[0] - self.link_len*np.sin(cur_angle), 48 | cur_center_pos[1] - self.link_len*np.cos(cur_angle) 49 | ) 50 | return cur_pos 51 | 52 | @overrides 53 | def compute_reward(self, action): 54 | yield 55 | tgt_pos = np.asarray([0, self.link_len * 2]) 56 | cur_pos = self.get_tip_pos() 57 | dist = np.linalg.norm(cur_pos - tgt_pos) 58 | yield -dist 59 | 60 | def is_current_done(self): 61 | return False 62 | 63 | -------------------------------------------------------------------------------- /docs/user/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | 4 | ============ 5 | Installation 6 | ============ 7 | 8 | Preparation 9 | =========== 10 | 11 | You need to edit your :code:`PYTHONPATH` to include the rllab directory: 12 | 13 | .. code-block:: bash 14 | 15 | export PYTHONPATH=path_to_rllab:$PYTHONPATH 16 | 17 | Express Install 18 | =============== 19 | 20 | The fastest way to set up dependencies for rllab is via running the setup script. 21 | 22 | - On Linux, run the following: 23 | 24 | .. code-block:: bash 25 | 26 | ./scripts/setup_linux.sh 27 | 28 | - On Mac OS X, run the following: 29 | 30 | .. code-block:: bash 31 | 32 | ./scripts/setup_osx.sh 33 | 34 | The script sets up a conda environment, which is similar to :code:`virtualenv`. To start using it, run the following: 35 | 36 | .. code-block:: bash 37 | 38 | source activate rllab3 39 | 40 | 41 | Optionally, if you would like to run experiments that depends on the Mujoco environment, you can set it up by running the following command: 42 | 43 | .. 
code-block:: bash 44 | 45 | ./scripts/setup_mujoco.sh 46 | 47 | and follow the instructions. You need to have the zip file for Mujoco v1.31 and the license file ready. 48 | 49 | 50 | 51 | Manual Install 52 | ============== 53 | 54 | Anaconda 55 | ------------ 56 | 57 | :code:`rllab` assumes that you are using the Anaconda Python distribution. You can download it from `https://www.continuum.io/downloads`. Make sure to download the installer for Python 2.7. 58 | 59 | 60 | System dependencies for pygame 61 | ------------------------------ 62 | 63 | A few environments in rllab are implemented using Box2D, which uses pygame for visualization. 64 | It requires a few system dependencies to be installed first. 65 | 66 | On Linux, run the following: 67 | 68 | .. code-block:: bash 69 | 70 | sudo apt-get install swig 71 | sudo apt-get build-dep python-pygame 72 | 73 | On Mac OS X, run the following: 74 | 75 | .. code-block:: bash 76 | 77 | brew install swig sdl sdl_image sdl_mixer sdl_ttf portmidi 78 | 79 | System dependencies for scipy 80 | ----------------------------- 81 | 82 | This step is only needed under Linux: 83 | 84 | .. code-block:: bash 85 | 86 | sudo apt-get build-dep python-scipy 87 | 88 | Install Python modules 89 | ---------------------- 90 | 91 | .. code-block:: bash 92 | 93 | conda env create -f environment.yml 94 | -------------------------------------------------------------------------------- /rllab/spaces/box.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | from .base import Space 3 | import numpy as np 4 | from rllab.misc import ext 5 | import theano 6 | 7 | 8 | class Box(Space): 9 | """ 10 | A box in R^n. 11 | I.e., each coordinate is bounded. 12 | """ 13 | 14 | def __init__(self, low, high, shape=None): 15 | """ 16 | Two kinds of valid input: 17 | Box(-1.0, 1.0, (3,4)) # low and high are scalars, and shape is provided 18 | Box(np.array([-1.0,-2.0]), np.array([2.0,4.0])) # low and high are arrays of the same shape 19 | """ 20 | if shape is None: 21 | assert low.shape == high.shape 22 | self.low = low 23 | self.high = high 24 | else: 25 | assert np.isscalar(low) and np.isscalar(high) 26 | self.low = low + np.zeros(shape) 27 | self.high = high + np.zeros(shape) 28 | 29 | def sample(self): 30 | return np.random.uniform(low=self.low, high=self.high, size=self.low.shape) 31 | 32 | def contains(self, x): 33 | return x.shape == self.shape and (x >= self.low).all() and (x <= self.high).all() 34 | 35 | @property 36 | def shape(self): 37 | return self.low.shape 38 | 39 | @property 40 | def flat_dim(self): 41 | return np.prod(self.low.shape) 42 | 43 | @property 44 | def bounds(self): 45 | return self.low, self.high 46 | 47 | def flatten(self, x): 48 | return np.asarray(x).flatten() 49 | 50 | def unflatten(self, x): 51 | return np.asarray(x).reshape(self.shape) 52 | 53 | def flatten_n(self, xs): 54 | xs = np.asarray(xs) 55 | return xs.reshape((xs.shape[0], -1)) 56 | 57 | def unflatten_n(self, xs): 58 | xs = np.asarray(xs) 59 | return xs.reshape((xs.shape[0],) + self.shape) 60 | 61 | def __repr__(self): 62 | return "Box" + str(self.shape) 63 | 64 | def __eq__(self, other): 65 | return isinstance(other, Box) and np.allclose(self.low, other.low) and \ 66 | np.allclose(self.high, other.high) 67 | 68 | def __hash__(self): 69 | return hash((self.low.tobytes(), self.high.tobytes())) 70 | 71 | def new_tensor_variable(self, name, extra_dims): 72 | return ext.new_tensor( 73 | name=name, 74 | ndim=extra_dims+1, 75 |
dtype=theano.config.floatX 76 | ) 77 | 78 | -------------------------------------------------------------------------------- /rllab/exploration_strategies/ou_strategy.py: -------------------------------------------------------------------------------- 1 | from rllab.misc.overrides import overrides 2 | from rllab.misc.ext import AttrDict 3 | from rllab.core.serializable import Serializable 4 | from rllab.spaces.box import Box 5 | from rllab.exploration_strategies.base import ExplorationStrategy 6 | import numpy as np 7 | import numpy.random as nr 8 | 9 | 10 | class OUStrategy(ExplorationStrategy, Serializable): 11 | """ 12 | This strategy implements the Ornstein-Uhlenbeck process, which adds 13 | time-correlated noise to the actions taken by the deterministic policy. 14 | The OU process satisfies the following stochastic differential equation: 15 | dxt = theta*(mu - xt)*dt + sigma*dWt 16 | where Wt denotes the Wiener process 17 | """ 18 | 19 | def __init__(self, env_spec, mu=0, theta=0.15, sigma=0.3, **kwargs): 20 | assert isinstance(env_spec.action_space, Box) 21 | assert len(env_spec.action_space.shape) == 1 22 | Serializable.quick_init(self, locals()) 23 | self.mu = mu 24 | self.theta = theta 25 | self.sigma = sigma 26 | self.action_space = env_spec.action_space 27 | self.state = np.ones(self.action_space.flat_dim) * self.mu 28 | self.reset() 29 | 30 | def __getstate__(self): 31 | d = Serializable.__getstate__(self) 32 | d["state"] = self.state 33 | return d 34 | 35 | def __setstate__(self, d): 36 | Serializable.__setstate__(self, d) 37 | self.state = d["state"] 38 | 39 | @overrides 40 | def reset(self): 41 | self.state = np.ones(self.action_space.flat_dim) * self.mu 42 | 43 | def evolve_state(self): 44 | x = self.state 45 | dx = self.theta * (self.mu - x) + self.sigma * nr.randn(len(x)) 46 | self.state = x + dx 47 | return self.state 48 | 49 | @overrides 50 | def get_action(self, t, observation, policy, **kwargs): 51 | action, _ = policy.get_action(observation) 52 | ou_state = self.evolve_state() 53 | return np.clip(action + ou_state, self.action_space.low, self.action_space.high) 54 | 55 | 56 | if __name__ == "__main__": 57 | ou = OUStrategy(env_spec=AttrDict(action_space=Box(low=-1, high=1, shape=(1,))), mu=0, theta=0.15, sigma=0.3) 58 | states = [] 59 | for i in range(1000): 60 | states.append(ou.evolve_state()[0]) 61 | import matplotlib.pyplot as plt 62 | 63 | plt.plot(states) 64 | plt.show() 65 | -------------------------------------------------------------------------------- /rllab/policies/base.py: -------------------------------------------------------------------------------- 1 | from rllab.core.parameterized import Parameterized 2 | 3 | 4 | class Policy(Parameterized): 5 | def __init__(self, env_spec): 6 | Parameterized.__init__(self) 7 | self._env_spec = env_spec 8 | 9 | # Should be implemented by all policies 10 | 11 | def get_action(self, observation): 12 | raise NotImplementedError 13 | 14 | def reset(self): 15 | pass 16 | 17 | @property 18 | def observation_space(self): 19 | return self._env_spec.observation_space 20 | 21 | @property 22 | def action_space(self): 23 | return self._env_spec.action_space 24 | 25 | @property 26 | def recurrent(self): 27 | """ 28 | Indicates whether the policy is recurrent. 
29 | :return: 30 | """ 31 | return False 32 | 33 | def log_diagnostics(self, paths): 34 | """ 35 | Log extra information per iteration based on the collected paths 36 | """ 37 | pass 38 | 39 | @property 40 | def state_info_keys(self): 41 | """ 42 | Return keys for the information related to the policy's state when taking an action. 43 | :return: 44 | """ 45 | return list() 46 | 47 | def terminate(self): 48 | """ 49 | Clean up operation 50 | """ 51 | pass 52 | 53 | 54 | class StochasticPolicy(Policy): 55 | 56 | @property 57 | def distribution(self): 58 | """ 59 | :rtype Distribution 60 | """ 61 | raise NotImplementedError 62 | 63 | def dist_info_sym(self, obs_var, state_info_vars): 64 | """ 65 | Return the symbolic distribution information about the actions. 66 | :param obs_var: symbolic variable for observations 67 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at 68 | the time it received the observation 69 | :return: 70 | """ 71 | raise NotImplementedError 72 | 73 | def dist_info(self, obs, state_infos): 74 | """ 75 | Return the distribution information about the actions. 76 | :param obs_var: observation values 77 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at 78 | the time it received the observation 79 | :return: 80 | """ 81 | raise NotImplementedError 82 | -------------------------------------------------------------------------------- /examples/cluster_gym_mujoco_demo.py: -------------------------------------------------------------------------------- 1 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 2 | from rllab.envs.normalized_env import normalize 3 | from sandbox.rocky.tf.envs.base import TfEnv 4 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 5 | from sandbox.rocky.tf.algos.trpo import TRPO 6 | from rllab.misc.instrument import stub, run_experiment_lite 7 | #from rllab.envs.gym_env import GymEnv 8 | #from rllab.envs.mujoco.swimmer_randgoal_env import SwimmerRandGoalEnv 9 | from rllab.envs.mujoco.swimmer_randgoal_oracle_env import SwimmerRandGoalOracleEnv 10 | import sys 11 | 12 | stub(globals()) 13 | 14 | from rllab.misc.instrument import VariantGenerator, variant 15 | 16 | 17 | class VG(VariantGenerator): 18 | 19 | @variant 20 | def step_size(self): 21 | return [0.005,0.01,0.02] #, 0.05, 0.1] 22 | 23 | @variant 24 | def seed(self): 25 | return [2,3] #, 11, 21, 31, 41] 26 | 27 | variants = VG().variants() 28 | 29 | for v in variants: 30 | 31 | env = TfEnv(normalize(SwimmerRandGoalOracleEnv())) 32 | #env = TfEnv(normalize(GymEnv('HalfCheetah-v1', record_video=False, record_log=False))) 33 | 34 | policy = GaussianMLPPolicy( 35 | env_spec=env.spec, 36 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
37 | hidden_sizes=(100, 100), 38 | name="policy" 39 | ) 40 | 41 | baseline = LinearFeatureBaseline(env_spec=env.spec) 42 | 43 | algo = TRPO( 44 | env=env, 45 | policy=policy, 46 | baseline=baseline, 47 | batch_size=10000, 48 | max_path_length=500, 49 | n_itr=500, 50 | discount=0.99, 51 | step_size=v["step_size"], 52 | # Uncomment both lines (this and the plot parameter below) to enable plotting 53 | # plot=True, 54 | ) 55 | 56 | run_experiment_lite( 57 | algo.train(), 58 | exp_prefix="trpo_swimmer_baselines", 59 | # Number of parallel workers for sampling 60 | n_parallel=1, 61 | # Only keep the snapshot parameters for the last iteration 62 | snapshot_mode="last", 63 | # Specifies the seed for the experiment. If this is not provided, a random seed 64 | # will be used 65 | seed=v["seed"], 66 | # mode="local", 67 | mode="ec2", 68 | variant=v, 69 | # plot=True, 70 | # terminate_machine=False, 71 | ) 72 | -------------------------------------------------------------------------------- /rllab/envs/box2d/cartpole_swingup_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pygame 3 | from rllab.envs.box2d.parser import find_body 4 | 5 | from rllab.core.serializable import Serializable 6 | from rllab.envs.box2d.box2d_env import Box2DEnv 7 | from rllab.misc import autoargs 8 | from rllab.misc.overrides import overrides 9 | 10 | 11 | # Tornio, Matti, and Tapani Raiko. "Variational Bayesian approach for 12 | # nonlinear identification and control." Proc. of the IFAC Workshop on 13 | # Nonlinear Model Predictive Control for Fast Systems, NMPC FS06. 2006. 14 | class CartpoleSwingupEnv(Box2DEnv, Serializable): 15 | 16 | @autoargs.inherit(Box2DEnv.__init__) 17 | def __init__(self, *args, **kwargs): 18 | super(CartpoleSwingupEnv, self).__init__( 19 | self.model_path("cartpole.xml.mako"), 20 | *args, **kwargs 21 | ) 22 | self.max_cart_pos = 3 23 | self.max_reward_cart_pos = 3 24 | self.cart = find_body(self.world, "cart") 25 | self.pole = find_body(self.world, "pole") 26 | Serializable.__init__(self, *args, **kwargs) 27 | 28 | @overrides 29 | def reset(self): 30 | self._set_state(self.initial_state) 31 | self._invalidate_state_caches() 32 | bounds = np.array([ 33 | [-1, -2, np.pi-1, -3], 34 | [1, 2, np.pi+1, 3], 35 | ]) 36 | low, high = bounds 37 | xpos, xvel, apos, avel = np.random.uniform(low, high) 38 | self.cart.position = (xpos, self.cart.position[1]) 39 | self.cart.linearVelocity = (xvel, self.cart.linearVelocity[1]) 40 | self.pole.angle = apos 41 | self.pole.angularVelocity = avel 42 | return self.get_current_obs() 43 | 44 | @overrides 45 | def compute_reward(self, action): 46 | yield 47 | if self.is_current_done(): 48 | yield -100 49 | else: 50 | if abs(self.cart.position[0]) > self.max_reward_cart_pos: 51 | yield -1 52 | else: 53 | yield np.cos(self.pole.angle) 54 | 55 | @overrides 56 | def is_current_done(self): 57 | return abs(self.cart.position[0]) > self.max_cart_pos 58 | 59 | @overrides 60 | def action_from_keys(self, keys): 61 | if keys[pygame.K_LEFT]: 62 | return np.asarray([-10]) 63 | elif keys[pygame.K_RIGHT]: 64 | return np.asarray([+10]) 65 | else: 66 | return np.asarray([0]) 67 | 68 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/swimmer_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Step 2 | from rllab.misc.overrides import overrides 3 | from .mujoco_env import MujocoEnv 4 | import numpy as 
np 5 | from rllab.core.serializable import Serializable 6 | from rllab.misc import logger 7 | from rllab.misc import autoargs 8 | 9 | 10 | class SwimmerEnv(MujocoEnv, Serializable): 11 | 12 | FILE = 'swimmer.xml' 13 | 14 | @autoargs.arg('ctrl_cost_coeff', type=float, 15 | help='cost coefficient for controls') 16 | def __init__( 17 | self, 18 | ctrl_cost_coeff=1e-2, 19 | *args, **kwargs): 20 | self.ctrl_cost_coeff = ctrl_cost_coeff 21 | super(SwimmerEnv, self).__init__(*args, **kwargs) 22 | Serializable.quick_init(self, locals()) 23 | 24 | def get_current_obs(self): 25 | return np.concatenate([ 26 | self.model.data.qpos.flat, 27 | self.model.data.qvel.flat, 28 | self.get_body_com("torso").flat, 29 | ]).reshape(-1) 30 | 31 | def step(self, action): 32 | self.forward_dynamics(action) 33 | next_obs = self.get_current_obs() 34 | lb, ub = self.action_bounds 35 | scaling = (ub - lb) * 0.5 36 | ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum( 37 | np.square(action / scaling)) 38 | forward_reward = self.get_body_comvel("torso")[0] 39 | #forward_reward = -1.5*np.abs(self.get_body_comvel("torso")[0] - 0.15) 40 | # max achievable vel is around 0.20 for vpg. 41 | reward = forward_reward - ctrl_cost 42 | done = False 43 | return Step(next_obs, reward, done) 44 | 45 | @overrides 46 | def log_diagnostics(self, paths, prefix=''): 47 | progs = [ 48 | path["observations"][-1][-3] - path["observations"][0][-3] 49 | for path in paths 50 | ] 51 | #if np.mean(progs) > 4.5: 52 | # import pdb; pdb.set_trace() 53 | #path = paths[0] 54 | #t = -10 55 | #lb, ub = self.action_bounds 56 | #scaling = (ub - lb) * 0.5 57 | #rew = path['rewards'][t] 58 | #act = path['actions'][t] 59 | #ctrl_cost = 0.5*self.ctrl_cost_coeff*np.sum(np.square(act/scaling)) 60 | 61 | logger.record_tabular('AverageForwardProgress', np.mean(progs)) 62 | logger.record_tabular('MaxForwardProgress', np.max(progs)) 63 | logger.record_tabular('MinForwardProgress', np.min(progs)) 64 | logger.record_tabular('StdForwardProgress', np.std(progs)) 65 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | # ========== Anaconda ========== 4 | # https://github.com/ContinuumIO/docker-images/blob/master/anaconda/Dockerfile 5 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 6 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 7 | git mercurial subversion 8 | RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ 9 | wget --no-check-certificate --quiet https://repo.continuum.io/archive/Anaconda2-2.5.0-Linux-x86_64.sh && \ 10 | /bin/bash /Anaconda2-2.5.0-Linux-x86_64.sh -b -p /opt/conda && \ 11 | rm /Anaconda2-2.5.0-Linux-x86_64.sh 12 | 13 | RUN apt-get install -y curl grep sed dpkg && \ 14 | TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && \ 15 | curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \ 16 | dpkg -i tini.deb && \ 17 | rm tini.deb && \ 18 | apt-get clean 19 | 20 | ENV PATH /opt/conda/bin:$PATH 21 | # http://bugs.python.org/issue19846 22 | # > At the moment, setting "LANG=C" on a Linux system *fundamentally breaks Python 3*, and that's not OK. 
23 | ENV LANG C.UTF-8 24 | ENTRYPOINT [ "/usr/bin/tini", "--" ] 25 | 26 | # ========== Special Deps ========== 27 | RUN apt-get -y install git make cmake unzip 28 | RUN pip install awscli 29 | # ALE requires zlib 30 | RUN apt-get -y install zlib1g-dev 31 | # MUJOCO requires graphics stuff (Why?) 32 | RUN apt-get -y build-dep glfw 33 | RUN apt-get -y install libxrandr2 libxinerama-dev libxi6 libxcursor-dev 34 | # copied from requirements.txt 35 | #RUN pip install imageio tabulate nose 36 | RUN apt-get install -y vim ack-grep 37 | RUN pip install --upgrade pip 38 | # usual pip install pygame will fail 39 | RUN apt-get build-dep -y python-pygame 40 | RUN pip install Pillow 41 | 42 | # ========== OpenAI Gym ========== 43 | RUN apt-get -y install libgtk2.0-0 44 | RUN pip install gym 45 | #RUN apt-get -y install ffmpeg 46 | RUN apt-get -y install libav-tools 47 | CMD alias ffmpeg="avconv" 48 | 49 | # ========== Add codebase stub ========== 50 | CMD mkdir /root/code 51 | ADD environment.yml /root/code/environment.yml 52 | RUN conda env create -f /root/code/environment.yml 53 | 54 | ENV PYTHONPATH /root/code/rllab:$PYTHONPATH 55 | ENV PATH /opt/conda/envs/rllab3/bin:$PATH 56 | RUN echo "source activate rllab3" >> /root/.bashrc 57 | ENV BASH_ENV /root/.bashrc 58 | WORKDIR /root/code 59 | 60 | # gpu theanno 61 | ENV THEANO_FLAGS mode=FAST_RUN,device=gpu,floatX=float32 62 | 63 | 64 | -------------------------------------------------------------------------------- /rllab/spaces/product.py: -------------------------------------------------------------------------------- 1 | from rllab.spaces.base import Space 2 | import numpy as np 3 | from rllab.misc import ext 4 | 5 | 6 | class Product(Space): 7 | 8 | def __init__(self, *components): 9 | if isinstance(components[0], (list, tuple)): 10 | assert len(components) == 1 11 | components = components[0] 12 | self._components = tuple(components) 13 | dtypes = [c.new_tensor_variable("tmp", extra_dims=0).dtype for c in components] 14 | if len(dtypes) > 0 and hasattr(dtypes[0], "as_numpy_dtype"): 15 | dtypes = [d.as_numpy_dtype for d in dtypes] 16 | self._common_dtype = np.core.numerictypes.find_common_type([], dtypes) 17 | 18 | def sample(self): 19 | return tuple(x.sample() for x in self._components) 20 | 21 | @property 22 | def components(self): 23 | return self._components 24 | 25 | def contains(self, x): 26 | return isinstance(x, tuple) and all(c.contains(xi) for c, xi in zip(self._components, x)) 27 | 28 | def new_tensor_variable(self, name, extra_dims): 29 | return ext.new_tensor( 30 | name=name, 31 | ndim=extra_dims+1, 32 | dtype=self._common_dtype, 33 | ) 34 | 35 | @property 36 | def flat_dim(self): 37 | return np.sum([c.flat_dim for c in self._components]) 38 | 39 | def flatten(self, x): 40 | return np.concatenate([c.flatten(xi) for c, xi in zip(self._components, x)]) 41 | 42 | def flatten_n(self, xs): 43 | xs_regrouped = [[x[i] for x in xs] for i in range(len(xs[0]))] 44 | flat_regrouped = [c.flatten_n(xi) for c, xi in zip(self.components, xs_regrouped)] 45 | return np.concatenate(flat_regrouped, axis=-1) 46 | 47 | def unflatten(self, x): 48 | dims = [c.flat_dim for c in self._components] 49 | flat_xs = np.split(x, np.cumsum(dims)[:-1]) 50 | return tuple(c.unflatten(xi) for c, xi in zip(self._components, flat_xs)) 51 | 52 | def unflatten_n(self, xs): 53 | dims = [c.flat_dim for c in self._components] 54 | flat_xs = np.split(xs, np.cumsum(dims)[:-1], axis=-1) 55 | unflat_xs = [c.unflatten_n(xi) for c, xi in zip(self.components, flat_xs)] 56 | 
unflat_xs_grouped = list(zip(*unflat_xs)) 57 | return unflat_xs_grouped 58 | 59 | def __eq__(self, other): 60 | if not isinstance(other, Product): 61 | return False 62 | return tuple(self.components) == tuple(other.components) 63 | 64 | def __hash__(self): 65 | return hash(tuple(self.components)) 66 | -------------------------------------------------------------------------------- /docker/gpu_Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:7.5-cudnn4-devel-ubuntu14.04 2 | 3 | # ========== Anaconda ========== 4 | # https://github.com/ContinuumIO/docker-images/blob/master/anaconda/Dockerfile 5 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 6 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 7 | git mercurial subversion 8 | RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ 9 | wget --no-check-certificate --quiet https://repo.continuum.io/archive/Anaconda2-2.5.0-Linux-x86_64.sh && \ 10 | /bin/bash /Anaconda2-2.5.0-Linux-x86_64.sh -b -p /opt/conda && \ 11 | rm /Anaconda2-2.5.0-Linux-x86_64.sh 12 | 13 | RUN apt-get install -y curl grep sed dpkg && \ 14 | TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && \ 15 | curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \ 16 | dpkg -i tini.deb && \ 17 | rm tini.deb && \ 18 | apt-get clean 19 | 20 | ENV PATH /opt/conda/bin:$PATH 21 | # http://bugs.python.org/issue19846 22 | # > At the moment, setting "LANG=C" on a Linux system *fundamentally breaks Python 3*, and that's not OK. 23 | ENV LANG C.UTF-8 24 | ENTRYPOINT [ "/usr/bin/tini", "--" ] 25 | 26 | # ========== Special Deps ========== 27 | RUN apt-get -y install git make cmake unzip 28 | RUN pip install awscli 29 | # ALE requires zlib 30 | RUN apt-get -y install zlib1g-dev 31 | # MUJOCO requires graphics stuff (Why?) 
32 | RUN apt-get -y build-dep glfw 33 | RUN apt-get -y install libxrandr2 libxinerama-dev libxi6 libxcursor-dev 34 | # copied from requirements.txt 35 | #RUN pip install imageio tabulate nose 36 | RUN apt-get install -y vim ack-grep 37 | RUN pip install --upgrade pip 38 | # usual pip install pygame will fail 39 | RUN apt-get build-dep -y python-pygame 40 | RUN pip install Pillow 41 | 42 | # ========== OpenAI Gym ========== 43 | RUN apt-get -y install libgtk2.0-0 44 | RUN pip install gym 45 | #RUN apt-get -y install ffmpeg 46 | RUN apt-get -y install libav-tools 47 | CMD alias ffmpeg="avconv" 48 | 49 | # ========== Add codebase stub ========== 50 | CMD mkdir /root/code 51 | ADD environment.yml /root/code/environment.yml 52 | RUN conda env create -f /root/code/environment.yml 53 | 54 | ENV PYTHONPATH /root/code/rllab:$PYTHONPATH 55 | ENV PATH /opt/conda/envs/rllab3/bin:$PATH 56 | RUN echo "source activate rllab3" >> /root/.bashrc 57 | ENV BASH_ENV /root/.bashrc 58 | WORKDIR /root/code 59 | 60 | # gpu theanno 61 | ENV THEANO_FLAGS mode=FAST_RUN,device=gpu,floatX=float32 62 | -------------------------------------------------------------------------------- /vendor/mujoco_models/swimmer.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/product.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.spaces.base import Space 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | 9 | class Product(Space): 10 | def __init__(self, *components): 11 | if isinstance(components[0], (list, tuple)): 12 | assert len(components) == 1 13 | components = components[0] 14 | self._components = tuple(components) 15 | dtypes = [c.new_tensor_variable("tmp", extra_dims=0).dtype for c in components] 16 | if len(dtypes) > 0 and hasattr(dtypes[0], "as_numpy_dtype"): 17 | dtypes = [d.as_numpy_dtype for d in dtypes] 18 | self._common_dtype = np.core.numerictypes.find_common_type([], dtypes) 19 | 20 | def sample(self): 21 | return tuple(x.sample() for x in self._components) 22 | 23 | @property 24 | def components(self): 25 | return self._components 26 | 27 | def contains(self, x): 28 | return isinstance(x, tuple) and all(c.contains(xi) for c, xi in zip(self._components, x)) 29 | 30 | def new_tensor_variable(self, name, extra_dims): 31 | return tf.placeholder( 32 | dtype=self._common_dtype, 33 | shape=[None] * extra_dims + [self.flat_dim], 34 | name=name, 35 | ) 36 | 37 | @property 38 | def flat_dim(self): 39 | return np.sum([c.flat_dim for c in self._components]) 40 | 41 | def flatten(self, x): 42 | return np.concatenate([c.flatten(xi) for c, xi in zip(self._components, x)]) 43 | 44 | def flatten_n(self, xs): 45 | xs_regrouped = [[x[i] for x in xs] for i in range(len(xs[0]))] 46 | flat_regrouped = [c.flatten_n(xi) for c, xi in zip(self.components, xs_regrouped)] 47 | return np.concatenate(flat_regrouped, axis=-1) 48 | 49 | def unflatten(self, x): 50 | dims = [c.flat_dim for c in self._components] 51 | flat_xs = np.split(x, np.cumsum(dims)[:-1]) 52 | return tuple(c.unflatten(xi) for c, xi in zip(self._components, flat_xs)) 53 | 54 | def unflatten_n(self, xs): 55 | dims = [c.flat_dim for c in self._components] 56 | flat_xs = 
np.split(xs, np.cumsum(dims)[:-1], axis=-1) 57 | unflat_xs = [c.unflatten_n(xi) for c, xi in zip(self.components, flat_xs)] 58 | unflat_xs_grouped = list(zip(*unflat_xs)) 59 | return unflat_xs_grouped 60 | 61 | def __eq__(self, other): 62 | if not isinstance(other, Product): 63 | return False 64 | return tuple(self.components) == tuple(other.components) 65 | 66 | def __hash__(self): 67 | return hash(tuple(self.components)) 68 | -------------------------------------------------------------------------------- /examples/trpo_swimmer.py: -------------------------------------------------------------------------------- 1 | use_tf = True 2 | 3 | if use_tf: 4 | from sandbox.rocky.tf.algos.trpo import TRPO 5 | # from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 6 | from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy 7 | from sandbox.rocky.tf.envs.base import TfEnv 8 | else: 9 | from rllab.algos.trpo import TRPO 10 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 11 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 12 | from rllab.envs.mujoco.swimmer_env import SwimmerEnv 13 | from rllab.envs.mujoco.swimmer_randgoal_oracle_env import SwimmerRandGoalOracleEnv 14 | from rllab.envs.mujoco.swimmer_randgoal_env import SwimmerRandGoalEnv 15 | #from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv 16 | from rllab.envs.mujoco.walker2d_env import Walker2DEnv 17 | from rllab.envs.normalized_env import normalize 18 | from rllab.misc.instrument import stub, run_experiment_lite 19 | 20 | stub(globals()) 21 | 22 | 23 | #env = normalize(SwimmerEnv()) 24 | env = normalize(SwimmerRandGoalOracleEnv()) 25 | #env = normalize(SwimmerRandGoalEnv()) 26 | 27 | max_path_length = 100 28 | #env = normalize(HalfCheetahEnv()) 29 | #env = normalize(Walker2DEnv()) 30 | if use_tf: 31 | env = TfEnv(env) 32 | policy = GaussianMLPPolicy( 33 | name='policy', 34 | env_spec=env.spec, 35 | # The neural network policy should have two hidden layers, each with 32 hidden units. 36 | #hidden_sizes=(32, 32) 37 | hidden_sizes=(100, 100) 38 | ) 39 | else: 40 | policy = GaussianMLPPolicy( 41 | env_spec=env.spec, 42 | # The neural network policy should have two hidden layers, each with 32 hidden units. 43 | hidden_sizes=(100, 100) 44 | ) 45 | 46 | baseline = LinearFeatureBaseline(env_spec=env.spec) 47 | 48 | algo = TRPO( 49 | env=env, 50 | policy=policy, 51 | baseline=baseline, 52 | batch_size=max_path_length*10, # was 4k 53 | max_path_length=max_path_length, 54 | n_itr=500, 55 | discount=0.99, 56 | step_size=0.01, 57 | #plot=True, 58 | ) 59 | #algo.train() 60 | 61 | 62 | run_experiment_lite( 63 | algo.train(), 64 | # Number of parallel workers for sampling 65 | n_parallel=4, 66 | # Only keep the snapshot parameters for the last iteration 67 | snapshot_mode="last", 68 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 69 | # will be used 70 | seed=1, 71 | exp_prefix='trpo_sensitive_swimmer' + str(max_path_length), 72 | exp_name='oracleenv', 73 | #plot=True, 74 | ) 75 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/swimmer_randgoal_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Step 2 | from rllab.misc.overrides import overrides 3 | from .mujoco_env import MujocoEnv 4 | import numpy as np 5 | from rllab.core.serializable import Serializable 6 | from rllab.misc import logger 7 | from rllab.misc import autoargs 8 | 9 | 10 | class SwimmerRandGoalEnv(MujocoEnv, Serializable): 11 | 12 | FILE = 'swimmer.xml' 13 | 14 | @autoargs.arg('ctrl_cost_coeff', type=float, 15 | help='cost coefficient for controls') 16 | def __init__( 17 | self, 18 | ctrl_cost_coeff=1e-2, 19 | *args, **kwargs): 20 | self.ctrl_cost_coeff = ctrl_cost_coeff 21 | self._goal_vel = None 22 | super(SwimmerRandGoalEnv, self).__init__(*args, **kwargs) 23 | Serializable.quick_init(self, locals()) 24 | 25 | def get_current_obs(self): 26 | return np.concatenate([ 27 | self.model.data.qpos.flat, 28 | self.model.data.qvel.flat, 29 | self.get_body_com("torso").flat, 30 | ]).reshape(-1) 31 | 32 | @overrides 33 | def reset(self, init_state=None, reset_args=None, **kwargs): 34 | goal_vel = reset_args 35 | if goal_vel is not None: 36 | self._goal_vel = goal_vel 37 | elif self._goal_vel is None: 38 | self._goal_vel = np.random.uniform(0.1, 0.2) 39 | self.reset_mujoco(init_state) 40 | self.model.forward() 41 | self.current_com = self.model.data.com_subtree[0] 42 | self.dcom = np.zeros_like(self.current_com) 43 | return self.get_current_obs() 44 | 45 | def step(self, action): 46 | self.forward_dynamics(action) 47 | next_obs = self.get_current_obs() 48 | lb, ub = self.action_bounds 49 | scaling = (ub - lb) * 0.5 50 | ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum( 51 | np.square(action / scaling)) 52 | forward_reward = -1.5*np.abs(self.get_body_comvel("torso")[0] - self._goal_vel) 53 | reward = forward_reward - ctrl_cost 54 | done = False 55 | return Step(next_obs, reward, done) 56 | 57 | @overrides 58 | def log_diagnostics(self, paths, prefix=''): 59 | progs = [ 60 | path["observations"][-1][-3] - path["observations"][0][-3] 61 | for path in paths 62 | ] 63 | logger.record_tabular(prefix+'AverageForwardProgress', np.mean(progs)) 64 | logger.record_tabular(prefix+'MaxForwardProgress', np.max(progs)) 65 | logger.record_tabular(prefix+'MinForwardProgress', np.min(progs)) 66 | logger.record_tabular(prefix+'StdForwardProgress', np.std(progs)) 67 | --------------------------------------------------------------------------------
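A quick numeric sanity check of the Bernoulli KL divergence used in sandbox/rocky/tf/distributions/bernoulli.py above (the NumPy kl() and the symbolic kl_sym() compute the same expression). This is a minimal standalone sketch in plain NumPy; the helper name bernoulli_kl is illustrative and not part of the codebase.

import numpy as np

TINY = 1e-8  # same smoothing constant as the Bernoulli class

def bernoulli_kl(old_p, new_p):
    # KL(old || new) for factorized Bernoullis, summed over the last axis,
    # mirroring Bernoulli.kl() shown above.
    kl = old_p * (np.log(old_p + TINY) - np.log(new_p + TINY)) + \
        (1 - old_p) * (np.log(1 - old_p + TINY) - np.log(1 - new_p + TINY))
    return np.sum(kl, axis=-1)

old_p = np.array([[0.2, 0.7, 0.5]])
new_p = np.array([[0.25, 0.6, 0.5]])

print(bernoulli_kl(old_p, old_p)[0])  # exactly 0 for identical distributions
print(bernoulli_kl(old_p, new_p)[0])  # small positive value (about 0.03)
assert bernoulli_kl(old_p, new_p)[0] >= 0.0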
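Along the same lines, a short round-trip check of the flatten / unflatten logic shared by rllab/spaces/product.py and rllab/spaces/box.py above. This sketch assumes a working rllab install on the PYTHONPATH (per docs/user/installation.rst), since constructing these spaces imports Theano; the variable names are illustrative only.

import numpy as np
from rllab.spaces.box import Box
from rllab.spaces.product import Product

# A product of two bounded boxes: flat_dim should be 2 + 3 = 5.
space = Product(Box(-1.0, 1.0, shape=(2,)), Box(0.0, 1.0, shape=(3,)))
assert space.flat_dim == 5

x = space.sample()        # tuple of arrays with shapes (2,) and (3,)
flat = space.flatten(x)   # concatenation of the component flattenings
assert flat.shape == (5,)

x_back = space.unflatten(flat)   # splits at the cumulative flat dims
assert all(np.allclose(a, b) for a, b in zip(x, x_back))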