├── rllab
├── __init__.py
├── misc
│ ├── meta.py
│ ├── __init__.py
│ ├── mako_utils.py
│ └── resolve.py
├── algos
│ ├── __init__.py
│ ├── base.py
│ ├── nop.py
│ ├── trpo.py
│ ├── ppo.py
│ ├── tnpg.py
│ └── erwr.py
├── core
│ ├── __init__.py
│ ├── lasagne_powered.py
│ └── serializable.py
├── envs
│ ├── __init__.py
│ ├── box2d
│ │ ├── __init__.py
│ │ ├── parser
│ │ │ └── __init__.py
│ │ ├── models
│ │ │ ├── mountain_car.xml.mako
│ │ │ ├── double_pendulum.xml.mako
│ │ │ ├── cartpole.xml.mako
│ │ │ ├── car_parking.xml
│ │ │ └── car_parking.xml.rb
│ │ ├── cartpole_env.py
│ │ ├── mountain_car_env.py
│ │ ├── double_pendulum_env.py
│ │ └── cartpole_swingup_env.py
│ ├── mujoco
│ │ ├── __init__.py
│ │ ├── gather
│ │ │ ├── __init__.py
│ │ │ ├── point_gather_env.py
│ │ │ ├── swimmer_gather_env.py
│ │ │ └── ant_gather_env.py
│ │ ├── maze
│ │ │ ├── __init__.py
│ │ │ ├── point_maze_env.py
│ │ │ ├── swimmer_maze_env.py
│ │ │ └── ant_maze_env.py
│ │ ├── humanoid_env.py
│ │ ├── point_env.py
│ │ ├── ant_env.py
│ │ ├── half_cheetah_env.py
│ │ ├── walker2d_env.py
│ │ ├── inverted_double_pendulum_env.py
│ │ ├── swimmer_env.py
│ │ └── swimmer_randgoal_env.py
│ ├── env_spec.py
│ ├── identification_env.py
│ ├── proxy_env.py
│ └── sliding_mem_env.py
├── baselines
│ ├── __init__.py
│ ├── zero_baseline.py
│ ├── base.py
│ ├── linear_feature_baseline.py
│ ├── gaussian_conv_baseline.py
│ └── gaussian_mlp_baseline.py
├── optimizers
│ ├── __init__.py
│ └── minibatch_dataset.py
├── policies
│ ├── __init__.py
│ ├── uniform_control_policy.py
│ └── base.py
├── sampler
│ ├── __init__.py
│ └── utils.py
├── distributions
│ ├── __init__.py
│ ├── recurrent_diagonal_gaussian.py
│ ├── delta.py
│ ├── base.py
│ └── bernoulli.py
├── q_functions
│ ├── __init__.py
│ └── base.py
├── exploration_strategies
│ ├── __init__.py
│ ├── base.py
│ ├── gaussian_strategy.py
│ └── ou_strategy.py
├── plotter
│ ├── __init__.py
│ └── plotter.py
├── mujoco_py
│ ├── .rvmrc
│ ├── Gemfile
│ ├── mjconstants.py
│ ├── mjextra.py
│ ├── __init__.py
│ ├── gen_binding.sh
│ └── Gemfile.lock
├── regressors
│ ├── __init__.py
│ └── product_regressor.py
├── viskit
│ └── __init__.py
├── spaces
│ ├── __init__.py
│ ├── base.py
│ ├── discrete.py
│ ├── box.py
│ └── product.py
├── config_personal_template.py
└── config.py
├── tests
├── __init__.py
├── envs
│ ├── __init__.py
│ └── test_maze_env.py
├── algos
│ ├── __init__.py
│ └── test_trpo.py
├── regression_tests
│ ├── __init__.py
│ └── test_issue_3.py
├── test_networks.py
├── test_stateful_pool.py
├── test_serializable.py
├── test_baselines.py
├── test_sampler.py
├── test_spaces.py
└── test_instrument.py
├── contrib
├── __init__.py
└── alexbeloi
│ ├── __init__.py
│ └── examples
│ ├── __init__.py
│ ├── vpgis_cartpole.py
│ └── trpois_cartpole.py
├── examples
├── __init__.py
├── nop_cartpole.py
├── trpo_cartpole.py
├── point_env.py
├── trpo_cartpole_recurrent.py
├── trpo_cartpole_stub.py
├── vpg_point.py
├── point_env_rand2goal.py
├── ddpg_cartpole_stub.py
├── old
│ └── sens_vpg_point.py
├── point_env_randgoal_oracle.py
├── point_env_randgoal.py
├── icml
│ └── trpo_point.py
├── vpg_swimmer.py
├── trpo_gym.py
├── cluster_demo.py
├── trpo_point.py
├── cluster_gym_mujoco_demo.py
└── trpo_swimmer.py
├── sandbox
├── __init__.py
└── rocky
│ ├── __init__.py
│ └── tf
│ ├── __init__.py
│ ├── algos
│ ├── npg.py
│ ├── __init__.py
│ ├── trpo.py
│ └── sensitive_trpo.py
│ ├── core
│ ├── __init__.py
│ └── layers_powered.py
│ ├── envs
│ ├── __init__.py
│ └── vec_env_executor.py
│ ├── misc
│ └── __init__.py
│ ├── launchers
│ ├── __init__.py
│ ├── vpg_cartpole.py
│ ├── trpo_cartpole_recurrent.py
│ └── trpo_cartpole.py
│ ├── policies
│ ├── __init__.py
│ └── uniform_control_policy.py
│ ├── samplers
│ └── __init__.py
│ ├── distributions
│ ├── __init__.py
│ ├── recurrent_diagonal_gaussian.py
│ ├── base.py
│ └── bernoulli.py
│ ├── optimizers
│ └── __init__.py
│ ├── regressors
│ └── __init__.py
│ └── spaces
│ ├── __init__.py
│ ├── box.py
│ ├── discrete.py
│ └── product.py
├── scripts
├── __init__.py
├── setup_linux.sh
├── submit_gym.py
├── setup_osx.sh
├── sync_s3.py
├── setup_mujoco.sh
├── sim_policy.py
└── resume_training.py
├── icml
├── ant_results.png
├── paths_viz.png
├── point_results.png
├── antdirec_results.png
├── cheetah_results.png
├── maml_paths_viz.png
├── pretrain_paths_viz.png
├── ant_results_logscale.png
├── cheetahdirec_results.png
├── icml_ant_results_maml.pkl
├── icml_ant_results_oracle.pkl
├── icml_ant_results_random.pkl
├── icml_ant_results_maml_bak.pkl
├── icml_ant_results_pretrain.pkl
├── icml_antdirec_results_maml.pkl
├── icml_cheetah_results_maml.pkl
├── icml_antdirec_results_oracle.pkl
├── icml_antdirec_results_random.pkl
├── icml_cheetah_results_oracle.pkl
├── icml_cheetah_results_random.pkl
├── icml_antdirec_results_pretrain.pkl
├── icml_cheetah_results_pretrain.pkl
├── icml_cheetahdirec_results_maml.pkl
├── icml_cheetah_results_pretrain_bak.pkl
├── icml_cheetahdirec_results_oracle.pkl
├── icml_cheetahdirec_results_random.pkl
├── icml_antdirec_results_maml_batch20.pkl
├── icml_cheetahdirec_results_pretrain.pkl
├── icml_point_results_oracle.pkl
├── make_paths_plot.py
├── make_point_plots.py
├── make_antdirec_plots.py
├── make_cheetahdirec_plots.py
├── make_cheetah_plots.py
└── make_ant_plots.py
├── docs
├── user
│ ├── cluster_1.png
│ ├── cluster_2.png
│ ├── cluster_3.png
│ └── installation.rst
└── index.rst
├── setup.py
├── docker
├── tester_Dockerfile
├── Dockerfile
└── gpu_Dockerfile
├── circle.yml
├── vendor
└── mujoco_models
│ ├── red_ball.xml
│ ├── green_ball.xml
│ ├── point.xml
│ └── swimmer.xml
├── .gitignore
├── environment.yml
└── LICENSE
/rllab/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/rllab/misc/meta.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/contrib/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/rllab/algos/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/rllab/core/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/rllab/envs/__init__.py:
--------------------------------------------------------------------------------
1 | 
-------------------------------------------------------------------------------- /rllab/misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sandbox/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/envs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/baselines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/envs/box2d/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/policies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/sampler/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sandbox/rocky/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/algos/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /contrib/alexbeloi/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/q_functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /contrib/alexbeloi/examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/gather/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /rllab/envs/mujoco/maze/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/npg.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/misc/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /rllab/exploration_strategies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /tests/regression_tests/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/optimizers/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/regressors/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /rllab/plotter/__init__.py: -------------------------------------------------------------------------------- 1 | from .plotter import * 2 | -------------------------------------------------------------------------------- /rllab/mujoco_py/.rvmrc: -------------------------------------------------------------------------------- 1 | rvm use 2.1.0@mjpy --create 2 | -------------------------------------------------------------------------------- /rllab/regressors/__init__.py: -------------------------------------------------------------------------------- 1 
| __author__ = 'dementrock' 2 | -------------------------------------------------------------------------------- /rllab/viskit/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'dementrock' 2 | -------------------------------------------------------------------------------- /icml/ant_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/ant_results.png -------------------------------------------------------------------------------- /icml/paths_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/paths_viz.png -------------------------------------------------------------------------------- /icml/point_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/point_results.png -------------------------------------------------------------------------------- /docs/user/cluster_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/docs/user/cluster_1.png -------------------------------------------------------------------------------- /docs/user/cluster_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/docs/user/cluster_2.png -------------------------------------------------------------------------------- /docs/user/cluster_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/docs/user/cluster_3.png -------------------------------------------------------------------------------- /icml/antdirec_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/antdirec_results.png -------------------------------------------------------------------------------- /icml/cheetah_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/cheetah_results.png -------------------------------------------------------------------------------- /icml/maml_paths_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/maml_paths_viz.png -------------------------------------------------------------------------------- /icml/pretrain_paths_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/pretrain_paths_viz.png -------------------------------------------------------------------------------- /rllab/envs/box2d/parser/__init__.py: -------------------------------------------------------------------------------- 1 | from .xml_box2d import world_from_xml, find_body, find_joint 2 | -------------------------------------------------------------------------------- /rllab/mujoco_py/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | gem 'pry' 4 | gem 'activesupport' 5 | -------------------------------------------------------------------------------- /icml/ant_results_logscale.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/ant_results_logscale.png -------------------------------------------------------------------------------- /icml/cheetahdirec_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/cheetahdirec_results.png -------------------------------------------------------------------------------- /icml/icml_ant_results_maml.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_ant_results_maml.pkl -------------------------------------------------------------------------------- /icml/icml_ant_results_oracle.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_ant_results_oracle.pkl -------------------------------------------------------------------------------- /icml/icml_ant_results_random.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_ant_results_random.pkl -------------------------------------------------------------------------------- /icml/icml_ant_results_maml_bak.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_ant_results_maml_bak.pkl -------------------------------------------------------------------------------- /icml/icml_ant_results_pretrain.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_ant_results_pretrain.pkl -------------------------------------------------------------------------------- /icml/icml_antdirec_results_maml.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_antdirec_results_maml.pkl -------------------------------------------------------------------------------- /icml/icml_cheetah_results_maml.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetah_results_maml.pkl -------------------------------------------------------------------------------- /icml/icml_antdirec_results_oracle.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_antdirec_results_oracle.pkl -------------------------------------------------------------------------------- /icml/icml_antdirec_results_random.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_antdirec_results_random.pkl -------------------------------------------------------------------------------- /icml/icml_cheetah_results_oracle.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetah_results_oracle.pkl -------------------------------------------------------------------------------- /icml/icml_cheetah_results_random.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetah_results_random.pkl -------------------------------------------------------------------------------- /icml/icml_antdirec_results_pretrain.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_antdirec_results_pretrain.pkl -------------------------------------------------------------------------------- /icml/icml_cheetah_results_pretrain.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetah_results_pretrain.pkl -------------------------------------------------------------------------------- /icml/icml_cheetahdirec_results_maml.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetahdirec_results_maml.pkl -------------------------------------------------------------------------------- /icml/icml_cheetah_results_pretrain_bak.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetah_results_pretrain_bak.pkl -------------------------------------------------------------------------------- /icml/icml_cheetahdirec_results_oracle.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetahdirec_results_oracle.pkl -------------------------------------------------------------------------------- /icml/icml_cheetahdirec_results_random.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetahdirec_results_random.pkl -------------------------------------------------------------------------------- /icml/icml_antdirec_results_maml_batch20.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_antdirec_results_maml_batch20.pkl -------------------------------------------------------------------------------- /icml/icml_cheetahdirec_results_pretrain.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/cbfinn/rllab/HEAD/icml/icml_cheetahdirec_results_pretrain.pkl -------------------------------------------------------------------------------- /rllab/q_functions/base.py: -------------------------------------------------------------------------------- 1 | from rllab.core.parameterized import Parameterized 2 | 3 | 4 | class QFunction(Parameterized): 5 | pass 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # setup.py 2 | from setuptools import setup 3 | 4 | setup( 5 | name='rllab', 6 | version='0.1.0', 7 | packages=['rllab'], 8 | ) 9 | -------------------------------------------------------------------------------- /rllab/mujoco_py/mjconstants.py: -------------------------------------------------------------------------------- 1 | MOUSE_ROTATE_V = 1 2 | MOUSE_ROTATE_H = 2 3 | MOUSE_MOVE_V = 3 4 | MOUSE_MOVE_H = 4 5 | MOUSE_ZOOM = 5 6 | 7 | mjOBJ_BODY = 1 8 | -------------------------------------------------------------------------------- /rllab/spaces/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .product import Product 2 | from .discrete import Discrete 3 | from .box import Box 4 | 5 | __all__ = ["Product", "Discrete", "Box"] -------------------------------------------------------------------------------- /rllab/algos/base.py: -------------------------------------------------------------------------------- 1 | class Algorithm(object): 2 | pass 3 | 4 | 5 | class RLAlgorithm(Algorithm): 6 | 7 | def train(self): 8 | raise NotImplementedError 9 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/__init__.py: -------------------------------------------------------------------------------- 1 | from .product import Product 2 | from .discrete import Discrete 3 | from .box import Box 4 | 5 | __all__ = ["Product", "Discrete", "Box"] 6 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/humanoid_env.py: -------------------------------------------------------------------------------- 1 | from .simple_humanoid_env import SimpleHumanoidEnv 2 | 3 | 4 | # Taken from Wojciech's code 5 | class HumanoidEnv(SimpleHumanoidEnv): 6 | 7 | FILE = 'humanoid.xml' 8 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/recurrent_diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from sandbox.rocky.tf.distributions.diagonal_gaussian import DiagonalGaussian 5 | 6 | RecurrentDiagonalGaussian = DiagonalGaussian 7 | -------------------------------------------------------------------------------- /rllab/exploration_strategies/base.py: -------------------------------------------------------------------------------- 1 | class ExplorationStrategy(object): 2 | def get_action(self, t, observation, policy, **kwargs): 3 | raise NotImplementedError 4 | 5 | def reset(self): 6 | pass 7 | -------------------------------------------------------------------------------- /docker/tester_Dockerfile: -------------------------------------------------------------------------------- 1 | FROM neocxi/rllab_exp_gpu_tf:py3 2 | 3 | RUN bash -c 'source activate rllab3 && conda install -y nomkl && conda uninstall -y scipy && conda install -y scipy' 4 | 5 | ADD . 
/root/code/rllab 6 | WORKDIR /root/code/rllab 7 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/gather/point_gather_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.mujoco.gather.gather_env import GatherEnv 2 | from rllab.envs.mujoco.point_env import PointEnv 3 | 4 | 5 | class PointGatherEnv(GatherEnv): 6 | 7 | MODEL_CLASS = PointEnv 8 | ORI_IND = 2 9 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/gather/swimmer_gather_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.mujoco.gather.gather_env import GatherEnv 2 | from rllab.envs.mujoco.swimmer_env import SwimmerEnv 3 | 4 | 5 | class SwimmerGatherEnv(GatherEnv): 6 | 7 | MODEL_CLASS = SwimmerEnv 8 | ORI_IND = 2 9 | -------------------------------------------------------------------------------- /rllab/distributions/recurrent_diagonal_gaussian.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as TT 2 | import numpy as np 3 | from rllab.distributions.base import Distribution 4 | from rllab.distributions.diagonal_gaussian import DiagonalGaussian 5 | 6 | RecurrentDiagonalGaussian = DiagonalGaussian 7 | -------------------------------------------------------------------------------- /rllab/mujoco_py/mjextra.py: -------------------------------------------------------------------------------- 1 | def append_objects(cur, extra): 2 | for i in range(cur.ngeom, cur.ngeom + extra.ngeom): 3 | cur.geoms[i] = extra.geoms[i - cur.ngeom] 4 | cur.ngeom = cur.ngeom + extra.ngeom 5 | if cur.ngeom > cur.maxgeom: 6 | raise ValueError("buffer limit exceeded!") 7 | -------------------------------------------------------------------------------- /rllab/mujoco_py/__init__.py: -------------------------------------------------------------------------------- 1 | from .mjviewer import MjViewer 2 | from .mjcore import MjModel 3 | from .mjcore import register_license 4 | import os 5 | from .mjconstants import * 6 | 7 | register_license(os.path.join(os.path.dirname(__file__), 8 | '../../vendor/mujoco/mjkey.txt')) 9 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/box.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.spaces.box import Box as TheanoBox 5 | import tensorflow as tf 6 | 7 | 8 | class Box(TheanoBox): 9 | def new_tensor_variable(self, name, extra_dims): 10 | return tf.placeholder(tf.float32, shape=[None] * extra_dims + [self.flat_dim], name=name) 11 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/maze/point_maze_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.mujoco.maze.maze_env import MazeEnv 2 | from rllab.envs.mujoco.point_env import PointEnv 3 | 4 | 5 | class PointMazeEnv(MazeEnv): 6 | 7 | MODEL_CLASS = PointEnv 8 | ORI_IND = 2 9 | 10 | MAZE_HEIGHT = 2 11 | MAZE_SIZE_SCALING = 3.0 12 | 13 | MANUAL_COLLISION = True 14 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/maze/swimmer_maze_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.mujoco.maze.maze_env import MazeEnv 2 | from rllab.envs.mujoco.swimmer_env import SwimmerEnv 3 | 4 | 5 | 
class SwimmerMazeEnv(MazeEnv): 6 | 7 | MODEL_CLASS = SwimmerEnv 8 | ORI_IND = 2 9 | 10 | MAZE_HEIGHT = 0.5 11 | MAZE_SIZE_SCALING = 4 12 | MAZE_MAKE_CONTACTS = True 13 | 14 | -------------------------------------------------------------------------------- /circle.yml: -------------------------------------------------------------------------------- 1 | machine: 2 | services: 3 | - docker 4 | 5 | dependencies: 6 | cache_directories: 7 | - "~/docker" 8 | override: 9 | - docker info 10 | - if [[ -e ~/docker/image.tar ]]; then docker load -i ~/docker/image.tar; fi 11 | - docker build -t tester -f docker/tester_Dockerfile . 12 | - mkdir -p ~/docker; docker save tester > ~/docker/image.tar 13 | 14 | test: 15 | override: 16 | - docker run tester /bin/bash -li -c "CIRCLECI=true nose2" 17 | -------------------------------------------------------------------------------- /tests/envs/test_maze_env.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | from rllab.envs.mujoco.maze.maze_env_utils import line_intersect, ray_segment_intersect 4 | 5 | 6 | def test_line_intersect(): 7 | assert line_intersect((0, 0), (0, 1), (0, 0), (1, 0))[:2] == (0, 0) 8 | assert line_intersect((0, 0), (0, 1), (0, 0), (0, 1))[2] == 0 9 | assert ray_segment_intersect(ray=((0, 0), 0), segment=((1, -1), (1, 1))) == (1, 0) 10 | assert ray_segment_intersect(ray=((0, 0), math.pi), segment=((1, -1), (1, 1))) is None 11 | -------------------------------------------------------------------------------- /rllab/mujoco_py/gen_binding.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | parent_path=$( cd "$(dirname "${BASH_SOURCE}")" ; pwd -P ) 3 | mujoco_path=$parent_path/../../vendor/mujoco 4 | rm /tmp/code_gen_mujoco.h 5 | cat $mujoco_path/mjdata.h >> /tmp/code_gen_mujoco.h && \ 6 | cat $mujoco_path/mjmodel.h >> /tmp/code_gen_mujoco.h && \ 7 | cat $mujoco_path/mjrender.h >> /tmp/code_gen_mujoco.h && \ 8 | cat $mujoco_path/mjvisualize.h >> /tmp/code_gen_mujoco.h && \ 9 | ruby $parent_path/codegen.rb /tmp/code_gen_mujoco.h $mujoco_path/mjxmacro.h > $parent_path/mjtypes.py 10 | -------------------------------------------------------------------------------- /tests/test_networks.py: -------------------------------------------------------------------------------- 1 | def test_gru_network(): 2 | from rllab.core.network import GRUNetwork 3 | import lasagne.layers as L 4 | from rllab.misc import ext 5 | import numpy as np 6 | network = GRUNetwork( 7 | input_shape=(2, 3), 8 | output_dim=5, 9 | hidden_dim=4, 10 | ) 11 | f_output = ext.compile_function( 12 | inputs=[network.input_layer.input_var], 13 | outputs=L.get_output(network.output_layer) 14 | ) 15 | assert f_output(np.zeros((6, 8, 2, 3))).shape == (6, 8, 5) 16 | -------------------------------------------------------------------------------- /vendor/mujoco_models/red_ball.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data 2 | *.pyc 3 | *-checkpoint.ipynb 4 | .DS_Store 5 | *.h5 6 | *.log 7 | *.npz 8 | secrets.py 9 | *.avi 10 | *.mp4 11 | build 12 | build_linux 13 | .idea 14 | .sublime-project 15 | run_experiment.sh 16 | scratch-notebooks 17 | launch_scripts 18 | *.sh.e* 19 | *.sh.o* 20 | MUJOCO_LOG.TXT 21 | vendor/mujoco 22 | 
.project 23 | .pydevproject 24 | *.pdf 25 | .env 26 | snippets 27 | private 28 | lua 29 | iterate.dat 30 | .env 31 | src/ 32 | .settings 33 | .pods 34 | docs/_build 35 | blackbox.zip 36 | blackbox 37 | rllab/config_personal.py 38 | *.swp 39 | -------------------------------------------------------------------------------- /vendor/mujoco_models/green_ball.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /rllab/baselines/zero_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rllab.baselines.base import Baseline 3 | from rllab.misc.overrides import overrides 4 | 5 | 6 | class ZeroBaseline(Baseline): 7 | 8 | def __init__(self, env_spec): 9 | pass 10 | 11 | @overrides 12 | def get_param_values(self, **kwargs): 13 | return None 14 | 15 | @overrides 16 | def set_param_values(self, val, **kwargs): 17 | pass 18 | 19 | @overrides 20 | def fit(self, paths, **kwargs): 21 | pass 22 | 23 | @overrides 24 | def predict(self, path): 25 | return np.zeros_like(path["rewards"]) 26 | -------------------------------------------------------------------------------- /rllab/algos/nop.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.batch_polopt import BatchPolopt 2 | from rllab.misc.overrides import overrides 3 | 4 | 5 | class NOP(BatchPolopt): 6 | """ 7 | NOP (no optimization performed) policy search algorithm 8 | """ 9 | 10 | def __init__( 11 | self, 12 | **kwargs): 13 | super(NOP, self).__init__(**kwargs) 14 | 15 | @overrides 16 | def init_opt(self): 17 | pass 18 | 19 | @overrides 20 | def optimize_policy(self, itr, samples_data): 21 | pass 22 | 23 | @overrides 24 | def get_itr_snapshot(self, itr, samples_data): 25 | return dict() 26 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/gather/ant_gather_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.mujoco.gather.gather_env import GatherEnv 2 | from rllab.envs.mujoco.ant_env import AntEnv 3 | from rllab.envs.mujoco.mujoco_env import q_mult, q_inv 4 | import math 5 | 6 | class AntGatherEnv(GatherEnv): 7 | 8 | MODEL_CLASS = AntEnv 9 | ORI_IND = 3 10 | 11 | def get_ori(self): 12 | ori = [0, 1, 0, 0] 13 | rot = self.inner_env.model.data.qpos[self.__class__.ORI_IND:self.__class__.ORI_IND+4] # take the quaternion 14 | ori = q_mult(q_mult(rot,ori),q_inv(rot))[1:3] # project onto x-y plane 15 | ori = math.atan2(ori[1],ori[0]) 16 | return ori 17 | -------------------------------------------------------------------------------- /rllab/misc/mako_utils.py: -------------------------------------------------------------------------------- 1 | 2 | def compute_rect_vertices(fromp, to, radius): 3 | x1, y1 = fromp 4 | x2, y2 = to 5 | if abs(y1 - y2) < 1e-6: 6 | dx = 0 7 | dy = radius 8 | else: 9 | dx = radius * 1.0 / (((x1 - x2) / (y1 - y2)) ** 2 + 1) ** 0.5 10 | # equivalently dx = radius * (y2-y1).to_f / ((x2-x1)**2 + (y2-y1)**2)**0.5 11 | dy = (radius**2 - dx**2) ** 0.5 12 | dy *= -1 if (x1 - x2) * (y1 - y2) > 0 else 1 13 | 14 | return ";".join([",".join(map(str, r)) for r in [ 15 | [x1 + dx, y1 + dy], 16 | [x2 + dx, y2 + dy], 17 | [x2 - dx, y2 - dy], 18 | [x1 - dx, y1 - dy], 19 | ]]) 20 | 21 | -------------------------------------------------------------------------------- 
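
Usage sketch (an editorial illustration, not a file from this repository): compute_rect_vertices in rllab/misc/mako_utils.py offsets a line segment perpendicularly by the given radius on both sides and returns the four rectangle corners as an "x,y;x,y;x,y;x,y" string, apparently the vertex format consumed by the Box2D .xml.mako model templates. A minimal call, with illustrative values only:

    from rllab.misc.mako_utils import compute_rect_vertices

    # Rectangle around the segment (0, 0) -> (1, 0) with half-width 0.05;
    # prints "0,0.05;1,0.05;1,-0.05;0,-0.05".
    print(compute_rect_vertices((0, 0), (1, 0), 0.05))
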
/rllab/mujoco_py/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | activesupport (4.1.8) 5 | i18n (~> 0.6, >= 0.6.9) 6 | json (~> 1.7, >= 1.7.7) 7 | minitest (~> 5.1) 8 | thread_safe (~> 0.1) 9 | tzinfo (~> 1.1) 10 | coderay (1.1.0) 11 | i18n (0.7.0) 12 | json (1.8.1) 13 | method_source (0.8.2) 14 | minitest (5.5.1) 15 | pry (0.10.1) 16 | coderay (~> 1.1.0) 17 | method_source (~> 0.8.1) 18 | slop (~> 3.4) 19 | slop (3.6.0) 20 | thread_safe (0.3.4) 21 | tzinfo (1.2.2) 22 | thread_safe (~> 0.1) 23 | 24 | PLATFORMS 25 | ruby 26 | 27 | DEPENDENCIES 28 | activesupport 29 | pry 30 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/trpo.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from sandbox.rocky.tf.algos.npo import NPO 4 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 5 | 6 | 7 | class TRPO(NPO): 8 | """ 9 | Trust Region Policy Optimization 10 | """ 11 | 12 | def __init__( 13 | self, 14 | optimizer=None, 15 | optimizer_args=None, 16 | **kwargs): 17 | if optimizer is None: 18 | if optimizer_args is None: 19 | optimizer_args = dict() 20 | optimizer = ConjugateGradientOptimizer(**optimizer_args) 21 | super(TRPO, self).__init__(optimizer=optimizer, **kwargs) 22 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/core/layers_powered.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.core.parameterized import Parameterized 2 | import sandbox.rocky.tf.core.layers as L 3 | import itertools 4 | 5 | 6 | class LayersPowered(Parameterized): 7 | 8 | def __init__(self, output_layers, input_layers=None): 9 | self._output_layers = output_layers 10 | self._input_layers = input_layers 11 | Parameterized.__init__(self) 12 | 13 | def get_params_internal(self, **tags): 14 | layers = L.get_all_layers(self._output_layers, treat_as_input=self._input_layers) 15 | params = itertools.chain.from_iterable(l.get_params(**tags) for l in layers) 16 | return L.unique(params) 17 | 18 | -------------------------------------------------------------------------------- /rllab/algos/trpo.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.npo import NPO 2 | from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 3 | from rllab.core.serializable import Serializable 4 | 5 | 6 | class TRPO(NPO): 7 | """ 8 | Trust Region Policy Optimization 9 | """ 10 | 11 | def __init__( 12 | self, 13 | optimizer=None, 14 | optimizer_args=None, 15 | **kwargs): 16 | if optimizer is None: 17 | if optimizer_args is None: 18 | optimizer_args = dict() 19 | optimizer = ConjugateGradientOptimizer(**optimizer_args) 20 | super(TRPO, self).__init__(optimizer=optimizer, **kwargs) 21 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/maze/ant_maze_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.mujoco.maze.maze_env import MazeEnv 2 | from rllab.envs.mujoco.ant_env import AntEnv 3 | from rllab.envs.mujoco.mujoco_env import q_mult, q_inv 4 | import math 5 | 6 | 7 | class AntMazeEnv(MazeEnv): 8 | 9 | MODEL_CLASS = AntEnv 10 | ORI_IND = 3 11 | 12 | MAZE_HEIGHT = 2 13 | MAZE_SIZE_SCALING = 3.0 14 | 15 | def 
get_ori(self): 16 | ori = [0, 1, 0, 0] 17 | rot = self.wrapped_env.model.data.qpos[self.__class__.ORI_IND:self.__class__.ORI_IND+4] # take the quaternion 18 | ori = q_mult(q_mult(rot,ori),q_inv(rot))[1:3] # project onto x-y plane 19 | ori = math.atan2(ori[1],ori[0]) 20 | return ori 21 | -------------------------------------------------------------------------------- /tests/test_stateful_pool.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | def _worker_collect_once(_): 6 | return 'a', 1 7 | 8 | 9 | def test_stateful_pool(): 10 | from rllab.sampler import stateful_pool 11 | stateful_pool.singleton_pool.initialize(n_parallel=3) 12 | results = stateful_pool.singleton_pool.run_collect(_worker_collect_once, 3, show_prog_bar=False) 13 | assert tuple(results) == ('a', 'a', 'a') 14 | 15 | 16 | def test_stateful_pool_over_capacity(): 17 | from rllab.sampler import stateful_pool 18 | stateful_pool.singleton_pool.initialize(n_parallel=4) 19 | results = stateful_pool.singleton_pool.run_collect(_worker_collect_once, 3, show_prog_bar=False) 20 | assert len(results) >= 3 21 | -------------------------------------------------------------------------------- /rllab/envs/env_spec.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | from rllab.spaces.base import Space 3 | 4 | 5 | class EnvSpec(Serializable): 6 | 7 | def __init__( 8 | self, 9 | observation_space, 10 | action_space): 11 | """ 12 | :type observation_space: Space 13 | :type action_space: Space 14 | """ 15 | Serializable.quick_init(self, locals()) 16 | self._observation_space = observation_space 17 | self._action_space = action_space 18 | 19 | @property 20 | def observation_space(self): 21 | return self._observation_space 22 | 23 | @property 24 | def action_space(self): 25 | return self._action_space 26 | -------------------------------------------------------------------------------- /rllab/algos/ppo.py: -------------------------------------------------------------------------------- 1 | from rllab.optimizers.penalty_lbfgs_optimizer import PenaltyLbfgsOptimizer 2 | from rllab.algos.npo import NPO 3 | from rllab.core.serializable import Serializable 4 | 5 | 6 | class PPO(NPO, Serializable): 7 | """ 8 | Penalized Policy Optimization. 
9 | """ 10 | 11 | def __init__( 12 | self, 13 | optimizer=None, 14 | optimizer_args=None, 15 | **kwargs): 16 | Serializable.quick_init(self, locals()) 17 | if optimizer is None: 18 | if optimizer_args is None: 19 | optimizer_args = dict() 20 | optimizer = PenaltyLbfgsOptimizer(**optimizer_args) 21 | super(PPO, self).__init__(optimizer=optimizer, **kwargs) 22 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/algos/sensitive_trpo.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from sandbox.rocky.tf.algos.sensitive_npo import SensitiveNPO 4 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 5 | 6 | 7 | class SensitiveTRPO(SensitiveNPO): 8 | """ 9 | Trust Region Policy Optimization 10 | """ 11 | 12 | def __init__( 13 | self, 14 | optimizer=None, 15 | optimizer_args=None, 16 | **kwargs): 17 | if optimizer is None: 18 | if optimizer_args is None: 19 | optimizer_args = dict() 20 | optimizer = ConjugateGradientOptimizer(**optimizer_args) 21 | super(SensitiveTRPO, self).__init__(optimizer=optimizer, **kwargs) 22 | -------------------------------------------------------------------------------- /rllab/core/lasagne_powered.py: -------------------------------------------------------------------------------- 1 | from rllab.core.parameterized import Parameterized 2 | from rllab.misc.overrides import overrides 3 | import lasagne.layers as L 4 | 5 | 6 | class LasagnePowered(Parameterized): 7 | def __init__(self, output_layers): 8 | self._output_layers = output_layers 9 | super(LasagnePowered, self).__init__() 10 | 11 | @property 12 | def output_layers(self): 13 | return self._output_layers 14 | 15 | @overrides 16 | def get_params_internal(self, **tags): # this gives ALL the vars (not the params values) 17 | return L.get_all_params( # this lasagne function also returns all var below the passed layers 18 | L.concat(self._output_layers), 19 | **tags 20 | ) 21 | -------------------------------------------------------------------------------- /rllab/envs/box2d/models/mountain_car.xml.mako: -------------------------------------------------------------------------------- 1 | <% 2 | noise = opts.get("noise", False) 3 | track_width = 4 4 | if noise: 5 | import numpy as np 6 | track_width += np.random.uniform(-1, 1) 7 | %> 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /examples/nop_cartpole.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.nop import NOP 2 | from rllab.baselines.zero_baseline import ZeroBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from rllab.policies.uniform_control_policy import UniformControlPolicy 6 | 7 | env = normalize(CartpoleEnv()) 8 | 9 | policy = UniformControlPolicy( 10 | env_spec=env.spec, 11 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
12 | ) 13 | 14 | baseline = ZeroBaseline(env_spec=env.spec) 15 | 16 | algo = NOP( 17 | env=env, 18 | policy=policy, 19 | baseline=baseline, 20 | batch_size=4000, 21 | max_path_length=100, 22 | n_itr=40, 23 | discount=0.99, 24 | step_size=0.01, 25 | ) 26 | algo.train() 27 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/policies/uniform_control_policy.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.policies.base import Policy 2 | from rllab.core.serializable import Serializable 3 | 4 | 5 | class UniformControlPolicy(Policy, Serializable): 6 | def __init__( 7 | self, 8 | env_spec, 9 | ): 10 | Serializable.quick_init(self, locals()) 11 | super(UniformControlPolicy, self).__init__(env_spec=env_spec) 12 | 13 | @property 14 | def vectorized(self): 15 | return True 16 | 17 | def get_action(self, observation): 18 | return self.action_space.sample(), dict() 19 | 20 | def get_actions(self, observations): 21 | return self.action_space.sample_n(len(observations)), dict() 22 | 23 | def get_params_internal(self, **tags): 24 | return [] 25 | -------------------------------------------------------------------------------- /rllab/algos/tnpg.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.npo import NPO 2 | from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 3 | from rllab.misc import ext 4 | 5 | 6 | class TNPG(NPO): 7 | """ 8 | Truncated Natural Policy Gradient. 9 | """ 10 | 11 | def __init__( 12 | self, 13 | optimizer=None, 14 | optimizer_args=None, 15 | **kwargs): 16 | if optimizer is None: 17 | default_args = dict(max_backtracks=1) 18 | if optimizer_args is None: 19 | optimizer_args = default_args 20 | else: 21 | optimizer_args = dict(default_args, **optimizer_args) 22 | optimizer = ConjugateGradientOptimizer(**optimizer_args) 23 | super(TNPG, self).__init__(optimizer=optimizer, **kwargs) 24 | -------------------------------------------------------------------------------- /examples/trpo_cartpole.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.trpo import TRPO 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 6 | 7 | env = normalize(CartpoleEnv()) 8 | 9 | policy = GaussianMLPPolicy( 10 | env_spec=env.spec, 11 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
12 | hidden_sizes=(32, 32) 13 | ) 14 | 15 | baseline = LinearFeatureBaseline(env_spec=env.spec) 16 | 17 | algo = TRPO( 18 | env=env, 19 | policy=policy, 20 | baseline=baseline, 21 | batch_size=4000, 22 | max_path_length=100, 23 | n_itr=40, 24 | discount=0.99, 25 | step_size=0.01, 26 | ) 27 | algo.train() 28 | -------------------------------------------------------------------------------- /tests/test_serializable.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from rllab.core.serializable import Serializable 4 | from sandbox.rocky.tf.core.parameterized import Parameterized, suppress_params_loading 5 | 6 | 7 | class Simple(Parameterized, Serializable): 8 | def __init__(self, name): 9 | Serializable.quick_init(self, locals()) 10 | with tf.variable_scope(name): 11 | self.w = tf.get_variable("w", [10, 10]) 12 | 13 | def get_params_internal(self, **tags): 14 | return [self.w] 15 | 16 | 17 | def test_serializable(): 18 | with suppress_params_loading(): 19 | obj = Simple(name="obj") 20 | obj1 = Serializable.clone(obj, name="obj1") 21 | assert obj.w.name.startswith('obj/') 22 | assert obj1.w.name.startswith('obj1/') 23 | 24 | 25 | if __name__ == "__main__": 26 | test_serializable() 27 | -------------------------------------------------------------------------------- /scripts/setup_linux.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Make sure that conda is available 3 | 4 | hash conda 2>/dev/null || { 5 | echo "Please install anaconda before continuing. You can download it at https://www.continuum.io/downloads. Please use the Python 2.7 installer." 6 | exit 0 7 | } 8 | 9 | echo "Installing system dependencies" 10 | echo "You will probably be asked for your sudo password." 11 | sudo apt-get update 12 | sudo apt-get install -y python-pip python-dev swig cmake build-essential 13 | sudo apt-get build-dep -y python-pygame 14 | sudo apt-get build-dep -y python-scipy 15 | 16 | # Make sure that we're under the directory of the project 17 | cd "$(dirname "$0")/.." 18 | 19 | echo "Creating conda environment..." 20 | conda env create -f environment.yml 21 | conda env update 22 | 23 | echo "Conda environment created! Make sure to run \`source activate rllab3\` whenever you open a new terminal and want to run programs under rllab." 24 | -------------------------------------------------------------------------------- /icml/icml_point_results_oracle.pkl: -------------------------------------------------------------------------------- 1 | (dp0 2 | S'task_avg_returns' 3 | p1 4 | (lp2 5 | (lp3 6 | F-1.97272875745 7 | aF-2.34575318154 8 | aF-2.32250937247 9 | aF-2.08331732345 10 | aF-1.48506580113 11 | aF-1.82948806173 12 | aF-2.9593656989 13 | aF-2.95927984391 14 | aF-1.62553040865 15 | aF-2.24929753541 16 | aF-2.60588825185 17 | aF-1.87460507218 18 | aF-2.16624064843 19 | aF-2.92060743576 20 | aF-3.14571198472 21 | aF-2.19234063112 22 | aF-2.53227361567 23 | aF-1.53461938736 24 | aF-2.10279844877 25 | aF-3.24244584946 26 | aF-2.76242677435 27 | aF-2.37053743068 28 | aF-2.17942891458 29 | aF-2.06120806957 30 | aF-2.33617484501 31 | aF-3.09065130611 32 | aF-2.19778531831 33 | aF-2.64369463483 34 | aF-2.08031379918 35 | aF-1.84096218 36 | aF-2.13910248301 37 | aF-1.48007332088 38 | aF-2.59933037373 39 | aF-1.67871774468 40 | aF-2.41717877275 41 | aF-2.17676728609 42 | aF-2.92057994794 43 | aF-2.21554034345 44 | aF-2.39851887364 45 | aF-1.7900390061 46 | aas. 
-------------------------------------------------------------------------------- /rllab/envs/identification_env.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | from rllab.envs.proxy_env import ProxyEnv 3 | from rllab.misc.overrides import overrides 4 | 5 | 6 | class IdentificationEnv(ProxyEnv, Serializable): 7 | 8 | def __init__(self, mdp_cls, mdp_args): 9 | Serializable.quick_init(self, locals()) 10 | self.mdp_cls = mdp_cls 11 | self.mdp_args = dict(mdp_args) 12 | self.mdp_args["template_args"] = dict(noise=True) 13 | mdp = self.gen_mdp() 14 | super(IdentificationEnv, self).__init__(mdp) 15 | 16 | def gen_mdp(self): 17 | return self.mdp_cls(**self.mdp_args) 18 | 19 | @overrides 20 | def reset(self): 21 | if getattr(self, "_mdp", None): 22 | if hasattr(self._wrapped_env, "release"): 23 | self._wrapped_env.release() 24 | self._wrapped_env = self.gen_mdp() 25 | return super(IdentificationEnv, self).reset() 26 | 27 | -------------------------------------------------------------------------------- /rllab/baselines/base.py: -------------------------------------------------------------------------------- 1 | from rllab.misc import autoargs 2 | 3 | 4 | class Baseline(object): 5 | 6 | def __init__(self, env_spec): 7 | self._mdp_spec = env_spec 8 | 9 | @property 10 | def algorithm_parallelized(self): 11 | return False 12 | 13 | def get_param_values(self): 14 | raise NotImplementedError 15 | 16 | def set_param_values(self, val): 17 | raise NotImplementedError 18 | 19 | def fit(self, paths): 20 | raise NotImplementedError 21 | 22 | def predict(self, path): 23 | raise NotImplementedError 24 | 25 | @classmethod 26 | @autoargs.add_args 27 | def add_args(cls, parser): 28 | pass 29 | 30 | @classmethod 31 | @autoargs.new_from_args 32 | def new_from_args(cls, args, mdp): 33 | pass 34 | 35 | def log_diagnostics(self, paths): 36 | """ 37 | Log extra information per iteration based on the collected paths 38 | """ 39 | pass 40 | -------------------------------------------------------------------------------- /tests/test_baselines.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | os.environ['THEANO_FLAGS'] = 'mode=FAST_COMPILE,optimizer=None' 4 | 5 | from rllab.algos.vpg import VPG 6 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 7 | from rllab.baselines.zero_baseline import ZeroBaseline 8 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 9 | from rllab.baselines.gaussian_mlp_baseline import GaussianMLPBaseline 10 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 11 | from nose2 import tools 12 | 13 | 14 | baselines = [ZeroBaseline, LinearFeatureBaseline, GaussianMLPBaseline] 15 | 16 | 17 | @tools.params(*baselines) 18 | def test_baseline(baseline_cls): 19 | env = CartpoleEnv() 20 | policy = GaussianMLPPolicy(env_spec=env.spec, hidden_sizes=(6,)) 21 | baseline = baseline_cls(env_spec=env.spec) 22 | algo = VPG( 23 | env=env, policy=policy, baseline=baseline, 24 | n_itr=1, batch_size=1000, max_path_length=100 25 | ) 26 | algo.train() 27 | -------------------------------------------------------------------------------- /examples/point_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Env 2 | from rllab.spaces import Box 3 | from rllab.envs.base import Step 4 | import numpy as np 5 | 6 | 7 | class PointEnv(Env): 8 | @property 9 | def 
observation_space(self): 10 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 11 | 12 | @property 13 | def action_space(self): 14 | return Box(low=-0.1, high=0.1, shape=(2,)) 15 | 16 | def reset(self, **kwargs): 17 | self._state = np.random.uniform(-1, 1, size=(2,)) 18 | observation = np.copy(self._state) 19 | return observation 20 | 21 | def step(self, action): 22 | self._state = self._state + action 23 | x, y = self._state 24 | reward = - (x ** 2 + y ** 2) ** 0.5 25 | done = abs(x) < 0.01 and abs(y) < 0.01 26 | next_observation = np.copy(self._state) 27 | return Step(observation=next_observation, reward=reward, done=done) 28 | 29 | def render(self): 30 | print('current state:', self._state) 31 | -------------------------------------------------------------------------------- /tests/regression_tests/test_issue_3.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from nose2.tools import such 5 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 6 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | from rllab.algos.trpo import TRPO 8 | from rllab.baselines.zero_baseline import ZeroBaseline 9 | 10 | with such.A("Issue #3") as it: 11 | @it.should("be fixed") 12 | def test_issue_3(): 13 | """ 14 | As reported in https://github.com/rllab/rllab/issues/3, the adaptive_std parameter was not functioning properly 15 | """ 16 | env = CartpoleEnv() 17 | policy = GaussianMLPPolicy( 18 | env_spec=env, 19 | adaptive_std=True 20 | ) 21 | baseline = ZeroBaseline(env_spec=env.spec) 22 | algo = TRPO( 23 | env=env, 24 | policy=policy, 25 | baseline=baseline, 26 | batch_size=100, 27 | n_itr=1 28 | ) 29 | algo.train() 30 | 31 | it.createTests(globals()) 32 | -------------------------------------------------------------------------------- /tests/test_sampler.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import numpy as np 4 | 5 | 6 | def test_truncate_paths(): 7 | from rllab.sampler.parallel_sampler import truncate_paths 8 | 9 | paths = [ 10 | dict( 11 | observations=np.zeros((100, 1)), 12 | actions=np.zeros((100, 1)), 13 | rewards=np.zeros(100), 14 | env_infos=dict(), 15 | agent_infos=dict(lala=np.zeros(100)), 16 | ), 17 | dict( 18 | observations=np.zeros((50, 1)), 19 | actions=np.zeros((50, 1)), 20 | rewards=np.zeros(50), 21 | env_infos=dict(), 22 | agent_infos=dict(lala=np.zeros(50)), 23 | ), 24 | ] 25 | 26 | truncated = truncate_paths(paths, 130) 27 | assert len(truncated) == 2 28 | assert len(truncated[-1]["observations"]) == 30 29 | assert len(truncated[0]["observations"]) == 100 30 | # make sure not to change the original one 31 | assert len(paths) == 2 32 | assert len(paths[-1]["observations"]) == 50 33 | -------------------------------------------------------------------------------- /rllab/distributions/delta.py: -------------------------------------------------------------------------------- 1 | from rllab.distributions.base import Distribution 2 | 3 | class Delta(Distribution): 4 | @property 5 | def dim(self): 6 | return 0 7 | 8 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 9 | return None 10 | 11 | def kl(self, old_dist_info, new_dist_info): 12 | return None 13 | 14 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 15 | raise NotImplementedError 16 | 17 | def entropy(self, dist_info): 18 | raise NotImplementedError 19 | 20 | def log_likelihood_sym(self, x_var, dist_info_vars): 21 | raise NotImplementedError 22 | 23 | def 
likelihood_sym(self, x_var, dist_info_vars): 24 | return TT.exp(self.log_likelihood_sym(x_var, dist_info_vars)) 25 | 26 | def log_likelihood(self, xs, dist_info): 27 | return None 28 | 29 | @property 30 | def dist_info_keys(self): 31 | return None 32 | 33 | def entropy(self, dist_info): 34 | return 0 35 | -------------------------------------------------------------------------------- /scripts/submit_gym.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import argparse 4 | import os 5 | import os.path as osp 6 | import gym 7 | from rllab.viskit.core import load_params 8 | 9 | if __name__ == "__main__": 10 | # rl_gym.api_key = 'g8JOpnNVmcjMShBiFtyji2VWX3P2uCzc' 11 | if 'OPENAI_GYM_API_KEY' not in os.environ: 12 | raise ValueError("OpenAI Gym API key not configured. Please register an account on https://gym.openai.com and" 13 | " set the OPENAI_GYM_API_KEY environment variable, and try the script again.") 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('log_dir', type=str, 17 | help='path to the logging directory') 18 | parser.add_argument('--algorithm_id', type=str, default=None, help='Algorithm ID') 19 | args = parser.parse_args() 20 | snapshot_dir = osp.abspath(osp.join(args.log_dir, "..")) 21 | params_file_path = osp.join(snapshot_dir, "params.json") 22 | params_json = load_params(params_file_path) 23 | gym.upload(args.log_dir, algorithm_id=args.algorithm_id) 24 | -------------------------------------------------------------------------------- /scripts/setup_osx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Make sure that homebrew is available 3 | hash brew 2>/dev/null || { 4 | echo "Please install homebrew before continuing. You can use the following command to install:" 5 | echo "/usr/bin/ruby -e \"\$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)\"" 6 | exit 0 7 | } 8 | 9 | hash conda 2>/dev/null || { 10 | echo "Please install anaconda before continuing. You can download it at https://www.continuum.io/downloads. Please use the Python 2.7 installer." 11 | exit 0 12 | } 13 | 14 | 15 | echo "Installing system dependencies" 16 | echo "You will probably be asked for your sudo password." 17 | 18 | brew install swig sdl sdl_image sdl_mixer sdl_ttf portmidi 19 | 20 | # Make sure that we're under the directory of the project 21 | cd "$(dirname "$0")/.." 22 | echo "Creating conda environment..." 23 | conda env create -f environment.yml 24 | conda env update 25 | 26 | echo "Conda environment created! Make sure to run \`source activate rllab3\` whenever you open a new terminal and want to run programs under rllab." 
27 | -------------------------------------------------------------------------------- /examples/trpo_cartpole_recurrent.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.algos.trpo import TRPO 5 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 6 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 7 | from rllab.envs.normalized_env import normalize 8 | from rllab.policies.gaussian_gru_policy import GaussianGRUPolicy 9 | from rllab.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp 10 | from rllab.misc.instrument import stub, run_experiment_lite 11 | 12 | stub(globals()) 13 | 14 | env = normalize(CartpoleEnv()) 15 | 16 | policy = GaussianGRUPolicy( 17 | env_spec=env.spec, 18 | ) 19 | 20 | baseline = LinearFeatureBaseline(env_spec=env.spec) 21 | 22 | algo = TRPO( 23 | env=env, 24 | policy=policy, 25 | baseline=baseline, 26 | batch_size=4000, 27 | max_path_length=100, 28 | n_itr=10, 29 | discount=0.99, 30 | step_size=0.01, 31 | optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) 32 | ) 33 | run_experiment_lite( 34 | algo.train(), 35 | n_parallel=1, 36 | seed=1, 37 | ) 38 | -------------------------------------------------------------------------------- /tests/test_spaces.py: -------------------------------------------------------------------------------- 1 | 2 | from rllab.spaces import Product, Discrete, Box 3 | import numpy as np 4 | 5 | 6 | def test_product_space(): 7 | _ = Product([Discrete(3), Discrete(2)]) 8 | product_space = Product(Discrete(3), Discrete(2)) 9 | sample = product_space.sample() 10 | assert product_space.contains(sample) 11 | 12 | 13 | def test_product_space_unflatten_n(): 14 | space = Product([Discrete(3), Discrete(3)]) 15 | np.testing.assert_array_equal(space.flatten((2, 2)), space.flatten_n([(2, 2)])[0]) 16 | np.testing.assert_array_equal( 17 | space.unflatten(space.flatten((2, 2))), 18 | space.unflatten_n(space.flatten_n([(2, 2)]))[0] 19 | ) 20 | 21 | 22 | def test_box(): 23 | space = Box(low=-1, high=1, shape=(2, 2)) 24 | np.testing.assert_array_equal(space.flatten([[1, 2], [3, 4]]), [1, 2, 3, 4]) 25 | np.testing.assert_array_equal(space.flatten_n([[[1, 2], [3, 4]]]), [[1, 2, 3, 4]]) 26 | np.testing.assert_array_equal(space.unflatten([1, 2, 3, 4]), [[1, 2], [3, 4]]) 27 | np.testing.assert_array_equal(space.unflatten_n([[1, 2, 3, 4]]), [[[1, 2], [3, 4]]]) 28 | -------------------------------------------------------------------------------- /contrib/alexbeloi/examples/vpgis_cartpole.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.vpg import VPG 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 6 | from contrib.alexbeloi.is_sampler import ISSampler 7 | 8 | """ 9 | Example using VPG with ISSampler, iterations alternate between live and 10 | importance sampled iterations. 11 | """ 12 | 13 | env = normalize(CartpoleEnv()) 14 | 15 | policy = GaussianMLPPolicy( 16 | env_spec=env.spec, 17 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
18 | hidden_sizes=(32, 32) 19 | ) 20 | 21 | baseline = LinearFeatureBaseline(env_spec=env.spec) 22 | 23 | algo = VPG( 24 | env=env, 25 | policy=policy, 26 | baseline=baseline, 27 | batch_size=4000, 28 | max_path_length=100, 29 | n_itr=40, 30 | discount=0.99, 31 | step_size=0.01, 32 | sampler_cls=ISSampler, 33 | sampler_args=dict(n_backtrack=1), 34 | ) 35 | algo.train() 36 | -------------------------------------------------------------------------------- /rllab/policies/uniform_control_policy.py: -------------------------------------------------------------------------------- 1 | from rllab.core.parameterized import Parameterized 2 | from rllab.core.serializable import Serializable 3 | from rllab.distributions.delta import Delta 4 | from rllab.policies.base import Policy 5 | from rllab.misc.overrides import overrides 6 | 7 | 8 | class UniformControlPolicy(Policy, Serializable): 9 | def __init__( 10 | self, 11 | env_spec, 12 | ): 13 | Serializable.quick_init(self, locals()) 14 | super(UniformControlPolicy, self).__init__(env_spec=env_spec) 15 | 16 | @overrides 17 | def get_action(self, observation): 18 | return self.action_space.sample(), dict() 19 | 20 | def get_params_internal(self, **tags): 21 | return [] 22 | 23 | def get_actions(self, observations): 24 | return self.action_space.sample_n(len(observations)), dict() 25 | 26 | @property 27 | def vectorized(self): 28 | return True 29 | 30 | def reset(self, dones=None): 31 | pass 32 | 33 | @property 34 | def distribution(self): 35 | # Just a placeholder 36 | return Delta() 37 | -------------------------------------------------------------------------------- /rllab/config_personal_template.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | USE_GPU = False 4 | 5 | DOCKER_IMAGE = "rein/rllab-exp-new" 6 | 7 | KUBE_PREFIX = "template_" 8 | 9 | DOCKER_LOG_DIR = "/tmp/expt" 10 | 11 | AWS_IMAGE_ID = "ami-67c5d00d" 12 | 13 | if USE_GPU: 14 | AWS_INSTANCE_TYPE = "g2.2xlarge" 15 | else: 16 | AWS_INSTANCE_TYPE = "c4.2xlarge" 17 | 18 | AWS_KEY_NAME = "research_virginia" 19 | 20 | AWS_SPOT = True 21 | 22 | AWS_SPOT_PRICE = '10.0' 23 | 24 | AWS_IAM_INSTANCE_PROFILE_NAME = "rllab" 25 | 26 | AWS_SECURITY_GROUPS = ["rllab"] 27 | 28 | AWS_REGION_NAME = "us-west-2" 29 | 30 | AWS_CODE_SYNC_S3_PATH = "e" 31 | 32 | CODE_SYNC_IGNORES = ["*.git/*", "*data/*", "*src/*", 33 | "*.pods/*", "*tests/*", "*examples/*", "docs/*"] 34 | 35 | LOCAL_CODE_DIR = "" 36 | 37 | AWS_S3_PATH = "" 38 | 39 | LABEL = "template" 40 | 41 | DOCKER_CODE_DIR = "/root/code/rllab" 42 | 43 | AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY", "") 44 | 45 | AWS_ACCESS_SECRET = os.environ.get("AWS_ACCESS_SECRET", "") 46 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/base.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | class Distribution(object): 6 | @property 7 | def dim(self): 8 | raise NotImplementedError 9 | 10 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 11 | """ 12 | Compute the symbolic KL divergence of two distributions 13 | """ 14 | raise NotImplementedError 15 | 16 | def kl(self, old_dist_info, new_dist_info): 17 | """ 18 | Compute the KL divergence of two distributions 19 | """ 20 | raise NotImplementedError 21 | 22 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 23 | raise NotImplementedError 24 | 25 | def entropy(self, dist_info): 26 | raise NotImplementedError 
27 | 28 | def log_likelihood_sym(self, x_var, dist_info_vars): 29 | raise NotImplementedError 30 | 31 | def log_likelihood(self, xs, dist_info): 32 | raise NotImplementedError 33 | 34 | @property 35 | def dist_info_specs(self): 36 | raise NotImplementedError 37 | 38 | @property 39 | def dist_info_keys(self): 40 | return [k for k, _ in self.dist_info_specs] 41 | -------------------------------------------------------------------------------- /rllab/distributions/base.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as TT 2 | 3 | class Distribution(object): 4 | 5 | @property 6 | def dim(self): 7 | raise NotImplementedError 8 | 9 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 10 | """ 11 | Compute the symbolic KL divergence of two distributions 12 | """ 13 | raise NotImplementedError 14 | 15 | def kl(self, old_dist_info, new_dist_info): 16 | """ 17 | Compute the KL divergence of two distributions 18 | """ 19 | raise NotImplementedError 20 | 21 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 22 | raise NotImplementedError 23 | 24 | def entropy(self, dist_info): 25 | raise NotImplementedError 26 | 27 | def log_likelihood_sym(self, x_var, dist_info_vars): 28 | raise NotImplementedError 29 | 30 | def likelihood_sym(self, x_var, dist_info_vars): 31 | return TT.exp(self.log_likelihood_sym(x_var, dist_info_vars)) 32 | 33 | def log_likelihood(self, xs, dist_info): 34 | raise NotImplementedError 35 | 36 | @property 37 | def dist_info_keys(self): 38 | raise NotImplementedError 39 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/vpg_cartpole.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from sandbox.rocky.tf.algos.vpg import VPG 5 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 6 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 7 | from rllab.envs.normalized_env import normalize 8 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 9 | from sandbox.rocky.tf.envs.base import TfEnv 10 | from rllab.misc.instrument import stub, run_experiment_lite 11 | 12 | stub(globals()) 13 | 14 | env = TfEnv(normalize(CartpoleEnv())) 15 | 16 | policy = GaussianMLPPolicy( 17 | name="policy", 18 | env_spec=env.spec, 19 | # The neural network policy should have two hidden layers, each with 32 hidden units. 20 | hidden_sizes=(32, 32) 21 | ) 22 | 23 | baseline = LinearFeatureBaseline(env_spec=env.spec) 24 | 25 | algo = VPG( 26 | env=env, 27 | policy=policy, 28 | baseline=baseline, 29 | batch_size=10000, 30 | max_path_length=100, 31 | n_itr=40, 32 | discount=0.99, 33 | optimizer_args=dict( 34 | tf_optimizer_args=dict( 35 | learning_rate=0.01, 36 | ) 37 | ) 38 | ) 39 | run_experiment_lite( 40 | algo.train(), 41 | n_parallel=2, 42 | seed=1, 43 | ) 44 | -------------------------------------------------------------------------------- /rllab/exploration_strategies/gaussian_strategy.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | from rllab.spaces.box import Box 3 | from rllab.exploration_strategies.base import ExplorationStrategy 4 | import numpy as np 5 | 6 | 7 | class GaussianStrategy(ExplorationStrategy, Serializable): 8 | """ 9 | This strategy adds Gaussian noise to the action taken by the deterministic policy. 
10 | """ 11 | 12 | def __init__(self, env_spec, max_sigma=1.0, min_sigma=0.1, decay_period=1000000): 13 | assert isinstance(env_spec.action_space, Box) 14 | assert len(env_spec.action_space.shape) == 1 15 | Serializable.quick_init(self, locals()) 16 | self._max_sigma = max_sigma 17 | self._min_sigma = min_sigma 18 | self._decay_period = decay_period 19 | self._action_space = env_spec.action_space 20 | 21 | def get_action(self, t, observation, policy, **kwargs): 22 | action, agent_info = policy.get_action(observation) 23 | sigma = self._max_sigma - (self._max_sigma - self._min_sigma) * min(1.0, t * 1.0 / self._decay_period) 24 | return np.clip(action + np.random.normal(size=len(action)) * sigma, self._action_space.low, 25 | self._action_space.high) 26 | -------------------------------------------------------------------------------- /rllab/envs/proxy_env.py: -------------------------------------------------------------------------------- 1 | from .base import Env 2 | 3 | 4 | class ProxyEnv(Env): 5 | def __init__(self, wrapped_env): 6 | self._wrapped_env = wrapped_env 7 | 8 | @property 9 | def wrapped_env(self): 10 | return self._wrapped_env 11 | 12 | def reset(self, *args, **kwargs): 13 | return self._wrapped_env.reset(*args, **kwargs) 14 | 15 | @property 16 | def action_space(self): 17 | return self._wrapped_env.action_space 18 | 19 | @property 20 | def observation_space(self): 21 | return self._wrapped_env.observation_space 22 | 23 | def step(self, action): 24 | return self._wrapped_env.step(action) 25 | 26 | def render(self, *args, **kwargs): 27 | return self._wrapped_env.render(*args, **kwargs) 28 | 29 | def log_diagnostics(self, paths, prefix=''): 30 | self._wrapped_env.log_diagnostics(paths, prefix=prefix) 31 | 32 | @property 33 | def horizon(self): 34 | return self._wrapped_env.horizon 35 | 36 | def terminate(self): 37 | self._wrapped_env.terminate() 38 | 39 | def get_param_values(self): 40 | return self._wrapped_env.get_param_values() 41 | 42 | def set_param_values(self, params): 43 | self._wrapped_env.set_param_values(params) 44 | -------------------------------------------------------------------------------- /contrib/alexbeloi/examples/trpois_cartpole.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.trpo import TRPO 2 | from rllab.algos.tnpg import TNPG 3 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 4 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 5 | from rllab.envs.normalized_env import normalize 6 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | from contrib.alexbeloi.is_sampler import ISSampler 8 | 9 | """ 10 | Example using TRPO with ISSampler, iterations alternate between live and 11 | importance sampled iterations. 12 | """ 13 | 14 | env = normalize(CartpoleEnv()) 15 | 16 | policy = GaussianMLPPolicy( 17 | env_spec=env.spec, 18 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
19 | hidden_sizes=(32, 32) 20 | ) 21 | 22 | baseline = LinearFeatureBaseline(env_spec=env.spec) 23 | 24 | optimizer_args = dict( 25 | # debug_nan=True, 26 | # reg_coeff=0.1, 27 | # cg_iters=2 28 | ) 29 | 30 | algo = TRPO( 31 | env=env, 32 | policy=policy, 33 | baseline=baseline, 34 | batch_size=4000, 35 | max_path_length=100, 36 | n_itr=200, 37 | discount=0.99, 38 | step_size=0.01, 39 | sampler_cls=ISSampler, 40 | sampler_args=dict(n_backtrack=1), 41 | optimizer_args=optimizer_args 42 | ) 43 | algo.train() 44 | -------------------------------------------------------------------------------- /examples/trpo_cartpole_stub.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.trpo import TRPO 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from rllab.misc.instrument import stub, run_experiment_lite 6 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | 8 | stub(globals()) 9 | 10 | env = normalize(CartpoleEnv()) 11 | 12 | policy = GaussianMLPPolicy( 13 | env_spec=env.spec, 14 | # The neural network policy should have two hidden layers, each with 32 hidden units. 15 | hidden_sizes=(32, 32) 16 | ) 17 | 18 | baseline = LinearFeatureBaseline(env_spec=env.spec) 19 | 20 | algo = TRPO( 21 | env=env, 22 | policy=policy, 23 | baseline=baseline, 24 | batch_size=4000, 25 | max_path_length=100, 26 | n_itr=1000, 27 | discount=0.99, 28 | step_size=0.01, 29 | # Uncomment both lines (this and the plot parameter below) to enable plotting 30 | # plot=True, 31 | ) 32 | 33 | run_experiment_lite( 34 | algo.train(), 35 | # Number of parallel workers for sampling 36 | n_parallel=1, 37 | # Only keep the snapshot parameters for the last iteration 38 | snapshot_mode="last", 39 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 40 | # will be used 41 | seed=1, 42 | # plot=True, 43 | ) 44 | -------------------------------------------------------------------------------- /rllab/optimizers/minibatch_dataset.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class BatchDataset(object): 5 | 6 | def __init__(self, inputs, batch_size, extra_inputs=None): 7 | self._inputs = [ 8 | i for i in inputs 9 | ] 10 | if extra_inputs is None: 11 | extra_inputs = [] 12 | self._extra_inputs = extra_inputs 13 | self._batch_size = batch_size 14 | if batch_size is not None: 15 | self._ids = np.arange(self._inputs[0].shape[0]) 16 | self.update() 17 | 18 | @property 19 | def number_batches(self): 20 | if self._batch_size is None: 21 | return 1 22 | return int(np.ceil(self._inputs[0].shape[0] * 1.0 / self._batch_size)) 23 | 24 | def iterate(self, update=True): 25 | if self._batch_size is None: 26 | yield list(self._inputs) + list(self._extra_inputs) 27 | else: 28 | for itr in range(self.number_batches): 29 | batch_start = itr * self._batch_size 30 | batch_end = (itr + 1) * self._batch_size 31 | batch_ids = self._ids[batch_start:batch_end] 32 | batch = [d[batch_ids] for d in self._inputs] 33 | yield list(batch) + list(self._extra_inputs) 34 | if update: 35 | self.update() 36 | 37 | def update(self): 38 | np.random.shuffle(self._ids) 39 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/trpo_cartpole_recurrent.py: -------------------------------------------------------------------------------- 1 | from sandbox.rocky.tf.algos.trpo import TRPO 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from sandbox.rocky.tf.policies.gaussian_gru_policy import GaussianGRUPolicy 6 | from sandbox.rocky.tf.policies.gaussian_lstm_policy import GaussianLSTMPolicy 7 | from sandbox.rocky.tf.envs.base import TfEnv 8 | import sandbox.rocky.tf.core.layers as L 9 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer, FiniteDifferenceHvp 10 | from rllab.misc.instrument import stub, run_experiment_lite 11 | 12 | stub(globals()) 13 | 14 | env = TfEnv(normalize(CartpoleEnv())) 15 | 16 | policy = GaussianLSTMPolicy( 17 | name="policy", 18 | env_spec=env.spec, 19 | lstm_layer_cls=L.TfBasicLSTMLayer, 20 | # gru_layer_cls=L.GRULayer, 21 | ) 22 | 23 | baseline = LinearFeatureBaseline(env_spec=env.spec) 24 | 25 | algo = TRPO( 26 | env=env, 27 | policy=policy, 28 | baseline=baseline, 29 | batch_size=4000, 30 | max_path_length=100, 31 | n_itr=10, 32 | discount=0.99, 33 | step_size=0.01, 34 | optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) 35 | ) 36 | run_experiment_lite( 37 | algo.train(), 38 | n_parallel=4, 39 | seed=1, 40 | ) 41 | -------------------------------------------------------------------------------- /examples/vpg_point.py: -------------------------------------------------------------------------------- 1 | #from rllab.algos.vpg import VPG 2 | from sandbox.rocky.tf.algos.vpg import VPG 3 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 4 | from rllab.baselines.zero_baseline import ZeroBaseline 5 | from examples.point_env import PointEnv 6 | from examples.point_env_randgoal import PointEnvRandGoal 7 | from rllab.envs.normalized_env import normalize 8 | 
from rllab.misc.instrument import stub, run_experiment_lite 9 | #from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 10 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 11 | #from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy 12 | from sandbox.rocky.tf.envs.base import TfEnv 13 | 14 | stub(globals()) 15 | 16 | #env = TfEnv(normalize(PointEnv())) 17 | env = TfEnv(normalize(PointEnvRandGoal())) 18 | policy = GaussianMLPPolicy( 19 | name="policy", 20 | env_spec=env.spec, 21 | ) 22 | #baseline = LinearFeatureBaseline(env_spec=env.spec) 23 | baseline = ZeroBaseline(env_spec=env.spec) 24 | algo = VPG( 25 | env=env, 26 | policy=policy, 27 | baseline=baseline, 28 | #batch_size=20, 29 | max_path_length=5, 30 | n_itr=100, 31 | #plot=True, 32 | ) 33 | run_experiment_lite( 34 | algo.train(), 35 | n_parallel=1, 36 | snapshot_mode="last", 37 | seed=1, 38 | exp_prefix='deleteme', 39 | exp_name='deleteme', 40 | #plot=True, 41 | ) 42 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/launchers/trpo_cartpole.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from sandbox.rocky.tf.algos.trpo import TRPO 5 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 6 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 7 | from rllab.envs.normalized_env import normalize 8 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import ConjugateGradientOptimizer 9 | from sandbox.rocky.tf.optimizers.conjugate_gradient_optimizer import FiniteDifferenceHvp 10 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 11 | from sandbox.rocky.tf.envs.base import TfEnv 12 | from rllab.misc.instrument import stub, run_experiment_lite 13 | 14 | stub(globals()) 15 | 16 | env = TfEnv(normalize(CartpoleEnv())) 17 | 18 | policy = GaussianMLPPolicy( 19 | name="policy", 20 | env_spec=env.spec, 21 | # The neural network policy should have two hidden layers, each with 32 hidden units. 22 | hidden_sizes=(32, 32) 23 | ) 24 | 25 | baseline = LinearFeatureBaseline(env_spec=env.spec) 26 | 27 | algo = TRPO( 28 | env=env, 29 | policy=policy, 30 | baseline=baseline, 31 | batch_size=4000, 32 | max_path_length=100, 33 | n_itr=40, 34 | discount=0.99, 35 | step_size=0.01, 36 | # optimizer=ConjugateGradientOptimizer(hvp_approach=FiniteDifferenceHvp(base_eps=1e-5)) 37 | 38 | ) 39 | run_experiment_lite( 40 | algo.train(), 41 | n_parallel=4, 42 | seed=1, 43 | ) 44 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. rllab documentation master file, created by 2 | sphinx-quickstart on Mon Feb 15 20:07:12 2016. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to rllab 7 | ================ 8 | 9 | rllab is a framework for developing and evaluating reinforcement learning algorithms. 10 | 11 | rllab is a work in progress, input is welcome. The available documentation is limited for now. 12 | 13 | User Guide 14 | ========== 15 | 16 | The rllab user guide explains how to install rllab, how to run experiments, and how to implement new MDPs and new algorithms. 17 | 18 | .. 
toctree:: 19 | :maxdepth: 2 20 | 21 | user/installation 22 | user/experiments 23 | user/gym_integration 24 | user/implement_env 25 | user/implement_algo_basic 26 | user/implement_algo_advanced 27 | user/cluster 28 | 29 | 30 | Citing rllab 31 | ============ 32 | 33 | If you use rllab for academic research, you are highly encouraged to cite the following paper: 34 | 35 | - Yan Duan, Xi Chen, Rein Houthooft, John Schulman, Pieter Abbeel. "`Benchmarking Deep Reinforcement Learning for Continuous Control <https://arxiv.org/abs/1604.06778>`_. *Proceedings of the 33rd International Conference on Machine Learning (ICML), 2016.* 36 | 37 | 38 | Indices and tables 39 | ================== 40 | 41 | * :ref:`genindex` 42 | * :ref:`modindex` 43 | * :ref:`search` 44 | 45 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: rllab3 2 | channels: 3 | - https://conda.anaconda.org/kne 4 | - https://conda.binstar.org/tlatorre 5 | - https://conda.anaconda.org/cjs14 6 | - https://conda.anaconda.org/menpo 7 | - jjhelmus 8 | dependencies: 9 | - python==3.5.2 10 | - numpy==1.10.4 11 | - scipy 12 | - path.py 13 | - python-dateutil 14 | - joblib==0.9.4 15 | - mako 16 | - ipywidgets 17 | - numba 18 | - flask 19 | - pybox2d 20 | - pygame 21 | - h5py 22 | - matplotlib 23 | - opencv3=3.1.0 24 | - scikit-learn 25 | - tensorflow 26 | - pip: 27 | - Pillow 28 | - atari-py 29 | - pyprind 30 | - ipdb 31 | - boto3 32 | - PyOpenGL 33 | - nose2 34 | - pyzmq 35 | - msgpack-python 36 | - mujoco_py 37 | - cached_property 38 | - line_profiler 39 | - Cython 40 | - git+https://github.com/Theano/Theano.git@adfe319ce6b781083d8dc3200fb4481b00853791#egg=Theano 41 | - git+https://github.com/neocxi/Lasagne.git@484866cf8b38d878e92d521be445968531646bb8#egg=Lasagne 42 | - git+https://github.com/plotly/plotly.py.git@2594076e29584ede2d09f2aa40a8a195b3f3fc66#egg=plotly 43 | - awscli 44 | - git+https://github.com/openai/gym.git 45 | - pyglet 46 | - git+https://github.com/neocxi/prettytensor.git 47 | - jupyter 48 | - progressbar2 49 | - chainer==1.15.0 50 | -------------------------------------------------------------------------------- /rllab/algos/erwr.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.vpg import VPG 2 | from rllab.optimizers.lbfgs_optimizer import LbfgsOptimizer 3 | from rllab.core.serializable import Serializable 4 | 5 | 6 | class ERWR(VPG, Serializable): 7 | """ 8 | Episodic Reward Weighted Regression [1]_ 9 | 10 | Notes 11 | ----- 12 | This does not implement the original RwR [2]_ that deals with "immediate reward problems" since 13 | it doesn't find solutions that optimize for temporally delayed rewards. 14 | 15 | .. [1] Kober, Jens, and Jan R. Peters. "Policy search for motor primitives in robotics." Advances in neural information processing systems. 2009. 16 | .. [2] Peters, Jan, and Stefan Schaal. "Using reward-weighted regression for reinforcement learning of task space control." Approximate Dynamic Programming and Reinforcement Learning, 2007. ADPRL 2007. IEEE International Symposium on. IEEE, 2007. 
17 | """ 18 | 19 | def __init__( 20 | self, 21 | optimizer=None, 22 | optimizer_args=None, 23 | positive_adv=None, 24 | **kwargs): 25 | Serializable.quick_init(self, locals()) 26 | if optimizer is None: 27 | if optimizer_args is None: 28 | optimizer_args = dict() 29 | optimizer = LbfgsOptimizer(**optimizer_args) 30 | super(ERWR, self).__init__( 31 | optimizer=optimizer, 32 | positive_adv=True if positive_adv is None else positive_adv, 33 | **kwargs 34 | ) 35 | 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 rllab contributors 4 | 5 | rllab uses a shared copyright model: each contributor holds copyright over 6 | their contributions to rllab. The project versioning records all such 7 | contribution and copyright details. 8 | By contributing to the rllab repository through pull-request, comment, 9 | or otherwise, the contributor releases their content to the license and 10 | copyright terms herein. 11 | 12 | Permission is hereby granted, free of charge, to any person obtaining a copy 13 | of this software and associated documentation files (the "Software"), to deal 14 | in the Software without restriction, including without limitation the rights 15 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | copies of the Software, and to permit persons to whom the Software is 17 | furnished to do so, subject to the following conditions: 18 | 19 | The above copyright notice and this permission notice shall be included in all 20 | copies or substantial portions of the Software. 21 | 22 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 27 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 28 | SOFTWARE. 29 | -------------------------------------------------------------------------------- /rllab/spaces/base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Space(object): 5 | """ 6 | Provides a classification state spaces and action spaces, 7 | so you can write generic code that applies to any Environment. 8 | E.g. to choose a random action. 
9 | """ 10 | 11 | def sample(self, seed=0): 12 | """ 13 | Uniformly sample a random element of this space 14 | """ 15 | raise NotImplementedError 16 | 17 | def contains(self, x): 18 | """ 19 | Return boolean specifying if x is a valid 20 | member of this space 21 | """ 22 | raise NotImplementedError 23 | 24 | def flatten(self, x): 25 | raise NotImplementedError 26 | 27 | def unflatten(self, x): 28 | raise NotImplementedError 29 | 30 | def flatten_n(self, xs): 31 | raise NotImplementedError 32 | 33 | def unflatten_n(self, xs): 34 | raise NotImplementedError 35 | 36 | @property 37 | def flat_dim(self): 38 | """ 39 | The dimension of the flattened vector of the tensor representation 40 | """ 41 | raise NotImplementedError 42 | 43 | def new_tensor_variable(self, name, extra_dims): 44 | """ 45 | Create a Theano tensor variable given the name and extra dimensions prepended 46 | :param name: name of the variable 47 | :param extra_dims: extra dimensions in the front 48 | :return: the created tensor variable 49 | """ 50 | raise NotImplementedError 51 | -------------------------------------------------------------------------------- /examples/point_env_rand2goal.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Env 2 | from rllab.spaces import Box 3 | from rllab.envs.base import Step 4 | import numpy as np 5 | 6 | 7 | class PointEnvRandGoal(Env): 8 | def __init__(self): 9 | # TODO - call super class init? 10 | self._goal = None 11 | 12 | @property 13 | def observation_space(self): 14 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 15 | 16 | @property 17 | def action_space(self): 18 | return Box(low=-0.1, high=0.1, shape=(2,)) 19 | 20 | def reset(self, reset_args=None): 21 | goal = reset_args 22 | if goal is not None: 23 | self._goal = goal 24 | elif self._goal is None: 25 | #else: 26 | # Only set a new goal if this env hasn't had one defined before. 
27 | goals = [np.array([-0.5,0]), np.array([0.5,0])] 28 | self._goal = goals[np.random.randint(2)] 29 | 30 | self._state = (0, 0) 31 | observation = np.copy(self._state) 32 | return observation 33 | 34 | def step(self, action): 35 | self._state = self._state + action 36 | x, y = self._state 37 | x -= self._goal[0] 38 | y -= self._goal[1] 39 | reward = - (x ** 2 + y ** 2) ** 0.5 40 | done = abs(x) < 0.01 and abs(y) < 0.01 41 | next_observation = np.copy(self._state) 42 | return Step(observation=next_observation, reward=reward, done=done) 43 | 44 | def render(self): 45 | print('current state:', self._state) 46 | -------------------------------------------------------------------------------- /scripts/sync_s3.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('.') 3 | from rllab import config 4 | import os 5 | import argparse 6 | import ast 7 | 8 | if __name__ == "__main__": 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('folder', type=str, default=None, nargs='?') 11 | parser.add_argument('--dry', action='store_true', default=False) 12 | parser.add_argument('--bare', action='store_true', default=False) 13 | parser.add_argument('--all', action='store_true', default=False) 14 | args = parser.parse_args() 15 | remote_dir = config.AWS_S3_PATH 16 | local_dir = os.path.join(config.LOG_DIR, "s3") 17 | if args.folder: 18 | remote_dir = os.path.join(remote_dir, args.folder) 19 | local_dir = os.path.join(local_dir, args.folder) 20 | if args.bare: 21 | command = (""" 22 | aws s3 sync {remote_dir} {local_dir} --exclude '*' --include '*.csv' --include '*.json' --content-type "UTF-8" 23 | """.format(local_dir=local_dir, remote_dir=remote_dir)) 24 | elif args.all: 25 | command = (""" 26 | aws s3 sync {remote_dir} {local_dir} --content-type "UTF-8" 27 | """.format(local_dir=local_dir, remote_dir=remote_dir)) 28 | else: 29 | command = (""" 30 | aws s3 sync {remote_dir} {local_dir} --exclude '*stdout.log' --exclude '*stdouterr.log' --content-type "UTF-8" 31 | """.format(local_dir=local_dir, remote_dir=remote_dir)) 32 | if args.dry: 33 | print(command) 34 | else: 35 | os.system(command) 36 | -------------------------------------------------------------------------------- /rllab/baselines/linear_feature_baseline.py: -------------------------------------------------------------------------------- 1 | from rllab.baselines.base import Baseline 2 | from rllab.misc.overrides import overrides 3 | import numpy as np 4 | 5 | 6 | class LinearFeatureBaseline(Baseline): 7 | def __init__(self, env_spec, reg_coeff=1e-5): 8 | self._coeffs = None 9 | self._reg_coeff = reg_coeff 10 | 11 | @overrides 12 | def get_param_values(self, **tags): 13 | return self._coeffs 14 | 15 | @overrides 16 | def set_param_values(self, val, **tags): 17 | self._coeffs = val 18 | 19 | def _features(self, path): 20 | o = np.clip(path["observations"], -10, 10) 21 | l = len(path["rewards"]) 22 | al = np.arange(l).reshape(-1, 1) / 100.0 23 | return np.concatenate([o, o ** 2, al, al ** 2, al ** 3, np.ones((l, 1))], axis=1) 24 | 25 | @overrides 26 | def fit(self, paths, **kwargs): 27 | featmat = np.concatenate([self._features(path) for path in paths]) 28 | returns = np.concatenate([path["returns"] for path in paths]) 29 | reg_coeff = self._reg_coeff 30 | for _ in range(5): 31 | self._coeffs = np.linalg.lstsq( 32 | featmat.T.dot(featmat) + reg_coeff * np.identity(featmat.shape[1]), 33 | featmat.T.dot(returns) 34 | )[0] 35 | if not np.any(np.isnan(self._coeffs)): 36 | 
break 37 | reg_coeff *= 10 38 | 39 | @overrides 40 | def predict(self, path): 41 | if self._coeffs is None: 42 | return np.zeros(len(path["rewards"])) 43 | return self._features(path).dot(self._coeffs) 44 | -------------------------------------------------------------------------------- /rllab/core/serializable.py: -------------------------------------------------------------------------------- 1 | import inspect 2 | 3 | 4 | class Serializable(object): 5 | 6 | def __init__(self, *args, **kwargs): 7 | self.__args = args 8 | self.__kwargs = kwargs 9 | 10 | def quick_init(self, locals_): 11 | if getattr(self, "_serializable_initialized", False): 12 | return 13 | spec = inspect.getargspec(self.__init__) 14 | # Exclude the first "self" parameter 15 | in_order_args = [locals_[arg] for arg in spec.args][1:] 16 | if spec.varargs: 17 | varargs = locals_[spec.varargs] 18 | else: 19 | varargs = tuple() 20 | if spec.keywords: 21 | kwargs = locals_[spec.keywords] 22 | else: 23 | kwargs = dict() 24 | self.__args = tuple(in_order_args) + varargs 25 | self.__kwargs = kwargs 26 | setattr(self, "_serializable_initialized", True) 27 | 28 | def __getstate__(self): 29 | return {"__args": self.__args, "__kwargs": self.__kwargs} 30 | 31 | def __setstate__(self, d): 32 | # convert all __args to keyword-based arguments 33 | in_order_args = inspect.getargspec(self.__init__).args[1:] 34 | out = type(self)(**dict(zip(in_order_args, d["__args"]), **d["__kwargs"])) 35 | self.__dict__.update(out.__dict__) 36 | 37 | @classmethod 38 | def clone(cls, obj, **kwargs): 39 | assert isinstance(obj, Serializable) 40 | d = obj.__getstate__() 41 | d["__kwargs"] = dict(d["__kwargs"], **kwargs) 42 | out = type(obj).__new__(type(obj)) 43 | out.__setstate__(d) 44 | return out 45 | -------------------------------------------------------------------------------- /examples/ddpg_cartpole_stub.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.ddpg import DDPG 2 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 3 | from rllab.envs.normalized_env import normalize 4 | from rllab.misc.instrument import stub, run_experiment_lite 5 | from rllab.exploration_strategies.ou_strategy import OUStrategy 6 | from rllab.policies.deterministic_mlp_policy import DeterministicMLPPolicy 7 | from rllab.q_functions.continuous_mlp_q_function import ContinuousMLPQFunction 8 | 9 | stub(globals()) 10 | 11 | env = normalize(CartpoleEnv()) 12 | 13 | policy = DeterministicMLPPolicy( 14 | env_spec=env.spec, 15 | # The neural network policy should have two hidden layers, each with 32 hidden units. 16 | hidden_sizes=(32, 32) 17 | ) 18 | 19 | es = OUStrategy(env_spec=env.spec) 20 | 21 | qf = ContinuousMLPQFunction(env_spec=env.spec) 22 | 23 | algo = DDPG( 24 | env=env, 25 | policy=policy, 26 | es=es, 27 | qf=qf, 28 | batch_size=32, 29 | max_path_length=100, 30 | epoch_length=1000, 31 | min_pool_size=10000, 32 | n_epochs=1000, 33 | discount=0.99, 34 | scale_reward=0.01, 35 | qf_learning_rate=1e-3, 36 | policy_learning_rate=1e-4, 37 | # Uncomment both lines (this and the plot parameter below) to enable plotting 38 | # plot=True, 39 | ) 40 | 41 | run_experiment_lite( 42 | algo.train(), 43 | # Number of parallel workers for sampling 44 | n_parallel=1, 45 | # Only keep the snapshot parameters for the last iteration 46 | snapshot_mode="last", 47 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 48 | # will be used 49 | seed=1, 50 | # plot=True, 51 | ) 52 | -------------------------------------------------------------------------------- /examples/old/sens_vpg_point.py: -------------------------------------------------------------------------------- 1 | #from rllab.algos.vpg import VPG 2 | from sandbox.rocky.tf.algos.sensitive_vpg import SensitiveVPG 3 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 4 | from rllab.baselines.zero_baseline import ZeroBaseline 5 | from examples.point_env import PointEnv 6 | from examples.point_env_randgoal import PointEnvRandGoal 7 | from rllab.envs.normalized_env import normalize 8 | from rllab.misc.instrument import stub, run_experiment_lite 9 | #from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 10 | from sandbox.rocky.tf.policies.sens_minimal_gauss_mlp_policy import SensitiveGaussianMLPPolicy 11 | from sandbox.rocky.tf.envs.base import TfEnv 12 | 13 | import tensorflow as tf 14 | 15 | stub(globals()) 16 | 17 | #env = TfEnv(normalize(PointEnv())) 18 | env = TfEnv(normalize(PointEnvRandGoal())) 19 | policy = SensitiveGaussianMLPPolicy( 20 | name="policy", 21 | env_spec=env.spec, 22 | grad_step_size=1.0, 23 | hidden_nonlinearity=tf.nn.relu, 24 | ) 25 | baseline = LinearFeatureBaseline(env_spec=env.spec) 26 | #baseline = ZeroBaseline(env_spec=env.spec) 27 | algo = SensitiveVPG( 28 | env=env, 29 | policy=policy, 30 | baseline=baseline, 31 | batch_size=20, # use 100 trajs for grad update 32 | max_path_length=5, 33 | meta_batch_size=100, 34 | n_itr=100, 35 | use_sensitive=False, 36 | optimizer_args={'learning_rate': 1e-3} 37 | #plot=True, 38 | ) 39 | run_experiment_lite( 40 | algo.train(), 41 | n_parallel=1, 42 | snapshot_mode="last", 43 | seed=1, 44 | exp_prefix='sensitive1dT5_2017_01_18', 45 | exp_name='nosensitive_linbaseline', 46 | #plot=True, 47 | ) 48 | -------------------------------------------------------------------------------- /rllab/envs/box2d/models/double_pendulum.xml.mako: -------------------------------------------------------------------------------- 1 | <% 2 | from rllab.misc.mako_utils import compute_rect_vertices 3 | link_len = opts['link_len'] 4 | link_width = 0.1 5 | %> 6 | 7 | 8 | 9 | 10 | 16 | 17 | 18 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /rllab/baselines/gaussian_conv_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.misc.overrides import overrides 5 | from rllab.core.parameterized import Parameterized 6 | from rllab.baselines.base import Baseline 7 | from rllab.regressors.gaussian_conv_regressor import GaussianConvRegressor 8 | 9 | 10 | class GaussianConvBaseline(Baseline, Parameterized, Serializable): 11 | 12 | def __init__( 13 | self, 14 | env_spec, 15 | subsample_factor=1., 16 | regressor_args=None, 17 | ): 18 | Serializable.quick_init(self, locals()) 19 | super(GaussianConvBaseline, self).__init__(env_spec) 20 | if regressor_args is None: 21 | regressor_args = dict() 22 | 23 | self._regressor = GaussianConvRegressor( 24 | input_shape=env_spec.observation_space.shape, 25 | output_dim=1, 26 | name="vf", 27 | **regressor_args 28 | ) 29 | 30 | @overrides 31 | def fit(self, paths): 32 | observations = np.concatenate([p["observations"] for p in paths]) 33 | returns = 
np.concatenate([p["returns"] for p in paths]) 34 | self._regressor.fit(observations, returns.reshape((-1, 1))) 35 | 36 | @overrides 37 | def predict(self, path): 38 | return self._regressor.predict(path["observations"]).flatten() 39 | 40 | @overrides 41 | def get_param_values(self, **tags): 42 | return self._regressor.get_param_values(**tags) 43 | 44 | @overrides 45 | def set_param_values(self, flattened_params, **tags): 46 | self._regressor.set_param_values(flattened_params, **tags) 47 | -------------------------------------------------------------------------------- /examples/point_env_randgoal_oracle.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Env 2 | from rllab.spaces import Box 3 | from rllab.envs.base import Step 4 | import numpy as np 5 | 6 | 7 | class PointEnvRandGoalOracle(Env): 8 | def __init__(self, goal=None): 9 | # TODO - call super class init? 10 | self._goal = goal 11 | if goal is None: 12 | self.set_at_init = False 13 | else: 14 | self.set_at_init = True 15 | 16 | @property 17 | def observation_space(self): 18 | return Box(low=-np.inf, high=np.inf, shape=(4,)) 19 | 20 | @property 21 | def action_space(self): 22 | return Box(low=-0.1, high=0.1, shape=(2,)) 23 | 24 | def sample_goals(self, num_goals): 25 | return np.random.uniform(-0.5, 0.5, size=(num_goals, 2, )) 26 | 27 | def reset(self, reset_args=None): 28 | goal = reset_args 29 | if goal is not None: 30 | self._goal = goal 31 | elif not self.set_at_init: 32 | self._goal = np.random.uniform(-0.5, 0.5, size=(2,)) 33 | 34 | self._state = (0, 0) 35 | observation = np.copy(self._state) 36 | return np.r_[observation, np.copy(self._goal)] 37 | 38 | def step(self, action): 39 | self._state = self._state + action 40 | x, y = self._state 41 | x -= self._goal[0] 42 | y -= self._goal[1] 43 | reward = - (x ** 2 + y ** 2) ** 0.5 44 | done = abs(x) < 0.01 and abs(y) < 0.01 45 | next_observation = np.r_[np.copy(self._state), np.copy(self._goal)] 46 | return Step(observation=next_observation, reward=reward, done=done) 47 | 48 | def render(self): 49 | print('current state:', self._state) 50 | -------------------------------------------------------------------------------- /rllab/envs/sliding_mem_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.envs.base import Step 5 | from rllab.envs.proxy_env import ProxyEnv 6 | from rllab.misc import autoargs 7 | from rllab.misc.overrides import overrides 8 | from rllab.spaces import Box 9 | 10 | 11 | class SlidingMemEnv(ProxyEnv, Serializable): 12 | 13 | def __init__( 14 | self, 15 | env, 16 | n_steps=4, 17 | axis=0, 18 | ): 19 | super().__init__(env) 20 | Serializable.quick_init(self, locals()) 21 | self.n_steps = n_steps 22 | self.axis = axis 23 | self.buffer = None 24 | 25 | def reset_buffer(self, new_): 26 | assert self.axis == 0 27 | self.buffer = np.zeros(self.observation_space.shape, dtype=np.float32) 28 | self.buffer[0:] = new_ 29 | 30 | def add_to_buffer(self, new_): 31 | assert self.axis == 0 32 | self.buffer[1:] = self.buffer[:-1] 33 | self.buffer[:1] = new_ 34 | 35 | @property 36 | def observation_space(self): 37 | origin = self._wrapped_env.observation_space 38 | return Box( 39 | *[ 40 | np.repeat(b, self.n_steps, axis=self.axis) 41 | for b in origin.bounds 42 | ] 43 | ) 44 | 45 | @overrides 46 | def reset(self): 47 | obs = self._wrapped_env.reset() 48 | self.reset_buffer(obs) 49 | 
return self.buffer 50 | 51 | @overrides 52 | def step(self, action): 53 | next_obs, reward, done, info = self._wrapped_env.step(action) 54 | self.add_to_buffer(next_obs) 55 | return Step(self.buffer, reward, done, **info) 56 | 57 | -------------------------------------------------------------------------------- /rllab/baselines/gaussian_mlp_baseline.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.core.parameterized import Parameterized 5 | from rllab.baselines.base import Baseline 6 | from rllab.misc.overrides import overrides 7 | from rllab.regressors.gaussian_mlp_regressor import GaussianMLPRegressor 8 | 9 | 10 | class GaussianMLPBaseline(Baseline, Parameterized, Serializable): 11 | 12 | def __init__( 13 | self, 14 | env_spec, 15 | subsample_factor=1., 16 | num_seq_inputs=1, 17 | regressor_args=None, 18 | ): 19 | Serializable.quick_init(self, locals()) 20 | super(GaussianMLPBaseline, self).__init__(env_spec) 21 | if regressor_args is None: 22 | regressor_args = dict() 23 | 24 | self._regressor = GaussianMLPRegressor( 25 | input_shape=(env_spec.observation_space.flat_dim * num_seq_inputs,), 26 | output_dim=1, 27 | name="vf", 28 | **regressor_args 29 | ) 30 | 31 | @overrides 32 | def fit(self, paths, log=True): 33 | observations = np.concatenate([p["observations"] for p in paths]) 34 | returns = np.concatenate([p["returns"] for p in paths]) 35 | self._regressor.fit(observations, returns.reshape((-1, 1)), log=log) 36 | 37 | @overrides 38 | def predict(self, path): 39 | return self._regressor.predict(path["observations"]).flatten() 40 | 41 | @overrides 42 | def get_param_values(self, **tags): 43 | return self._regressor.get_param_values(**tags) 44 | 45 | @overrides 46 | def set_param_values(self, flattened_params, **tags): 47 | self._regressor.set_param_values(flattened_params, **tags) 48 | -------------------------------------------------------------------------------- /scripts/setup_mujoco.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$(uname)" == "Darwin" ]; then 4 | mujoco_file="libmujoco131.dylib" 5 | glfw_file="libglfw.3.dylib" 6 | zip_file="mjpro131_osx.zip" 7 | mktemp_cmd="mktemp -d /tmp/mujoco" 8 | elif [ "$(expr substr $(uname -s) 1 5)" == "Linux" ]; then 9 | mujoco_file="libmujoco131.so" 10 | glfw_file="libglfw.so.3" 11 | zip_file="mjpro131_linux.zip" 12 | mktemp_cmd="mktemp -d" 13 | fi 14 | 15 | if [ ! -f vendor/mujoco/$mujoco_file ]; then 16 | read -e -p "Please enter the path to the mujoco zip file [$zip_file]:" path 17 | path=${path:-$zip_file} 18 | eval path=\"$path\" 19 | if [ ! -f $path ]; then 20 | echo "No file found at $path" 21 | exit 0 22 | fi 23 | rm -r /tmp/mujoco 24 | dir=`$mktemp_cmd` 25 | unzip $path -d $dir 26 | if [ ! -f $dir/mjpro131/bin/$mujoco_file ]; then 27 | echo "mjpro/$mujoco_file not found. Make sure you have the correct file (most likely named $zip_file)" 28 | exit 0 29 | fi 30 | if [ ! -f $dir/mjpro131/bin/$glfw_file ]; then 31 | echo "mjpro/$glfw_file not found. Make sure you have the correct file (most likely named $zip_file)" 32 | exit 0 33 | fi 34 | 35 | mkdir -p vendor/mujoco 36 | cp $dir/mjpro131/bin/$mujoco_file vendor/mujoco/ 37 | cp $dir/mjpro131/bin/$glfw_file vendor/mujoco/ 38 | fi 39 | 40 | if [ ! 
-f vendor/mujoco/mjkey.txt ]; then 41 | read -e -p "Please enter the path to the mujoco license file [mjkey.txt]:" path 42 | path=${path:-mjkey.txt} 43 | eval path=$path 44 | if [ ! -f $path ]; then 45 | echo "No file found at $path" 46 | exit 0 47 | fi 48 | cp $path vendor/mujoco/mjkey.txt 49 | fi 50 | 51 | echo "Mujoco has been set up!" 52 | -------------------------------------------------------------------------------- /scripts/sim_policy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import joblib 4 | import tensorflow as tf 5 | 6 | from rllab.misc.console import query_yes_no 7 | from rllab.sampler.utils import rollout 8 | 9 | if __name__ == "__main__": 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('file', type=str, 13 | help='path to the snapshot file') 14 | parser.add_argument('--max_path_length', type=int, default=1000, 15 | help='Max length of rollout') 16 | parser.add_argument('--speedup', type=float, default=1, 17 | help='Speedup') 18 | parser.add_argument('--video_filename', type=str, 19 | help='path to the out video file') 20 | parser.add_argument('--prompt', type=bool, default=False, 21 | help='Whether or not to prompt for more sim') 22 | args = parser.parse_args() 23 | 24 | max_tries = 10 25 | tri = 0 26 | while True: 27 | tri += 1 28 | with tf.Session() as sess: 29 | data = joblib.load(args.file) 30 | policy = data['policy'] 31 | env = data['env'] 32 | while True: 33 | path = rollout(env, policy, max_path_length=args.max_path_length, 34 | animated=True, speedup=args.speedup, video_filename=args.video_filename) 35 | if args.prompt: 36 | if not query_yes_no('Continue simulation?'): 37 | break 38 | else: 39 | break 40 | #import pdb; pdb.set_trace() 41 | if len(path['rewards']) < args.max_path_length and tri >= max_tries: 42 | tf.reset_default_graph() 43 | continue 44 | break 45 | -------------------------------------------------------------------------------- /rllab/envs/box2d/models/cartpole.xml.mako: -------------------------------------------------------------------------------- 1 | <% 2 | from rllab.misc.mako_utils import compute_rect_vertices 3 | cart_width = 4.0 / (12 ** 0.5) 4 | cart_height = 3.0 / (12 ** 0.5) 5 | 6 | pole_width = 0.1 7 | pole_height = 1.0 8 | noise = opts.get("noise", False) 9 | if noise: 10 | import numpy as np 11 | pole_height += (np.random.rand()-0.5) * pole_height * 1 12 | 13 | cart_friction = 0.0005 14 | pole_friction = 0.000002 15 | %> 16 | 17 | 18 | 19 | 20 | 26 | 27 | 28 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /examples/point_env_randgoal.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Env 2 | from rllab.spaces import Box 3 | from rllab.envs.base import Step 4 | import numpy as np 5 | 6 | 7 | class PointEnvRandGoal(Env): 8 | def __init__(self, goal=None): # Can set goal to test adaptation. 
9 | self._goal = goal 10 | 11 | @property 12 | def observation_space(self): 13 | return Box(low=-np.inf, high=np.inf, shape=(2,)) 14 | 15 | @property 16 | def action_space(self): 17 | return Box(low=-0.1, high=0.1, shape=(2,)) 18 | 19 | def sample_goals(self, num_goals): 20 | return np.random.uniform(-0.5, 0.5, size=(num_goals, 2, )) 21 | 22 | def reset(self, reset_args=None): 23 | goal = reset_args 24 | if goal is not None: 25 | self._goal = goal 26 | elif self._goal is None: 27 | # Only set a new goal if this env hasn't had one defined before. 28 | self._goal = np.random.uniform(-0.5, 0.5, size=(2,)) 29 | #goals = [np.array([-0.5,0]), np.array([0.5,0])] 30 | #goals = np.array([[-0.5,0], [0.5,0],[0.2,0.2],[-0.2,-0.2],[0.5,0.5],[0,0.5],[0,-0.5],[-0.5,-0.5],[0.5,-0.5],[-0.5,0.5]]) 31 | #self._goal = goals[np.random.randint(10)] 32 | 33 | self._state = (0, 0) 34 | observation = np.copy(self._state) 35 | return observation 36 | 37 | def step(self, action): 38 | self._state = self._state + action 39 | x, y = self._state 40 | x -= self._goal[0] 41 | y -= self._goal[1] 42 | reward = - (x ** 2 + y ** 2) ** 0.5 43 | done = abs(x) < 0.01 and abs(y) < 0.01 44 | next_observation = np.copy(self._state) 45 | return Step(observation=next_observation, reward=reward, done=done, goal=self._goal) 46 | 47 | def render(self): 48 | print('current state:', self._state) 49 | -------------------------------------------------------------------------------- /tests/algos/test_trpo.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from rllab.envs.base import Env, Step 4 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 5 | from rllab.baselines.zero_baseline import ZeroBaseline 6 | from rllab.algos.trpo import TRPO 7 | from rllab.spaces.box import Box 8 | import lasagne.nonlinearities 9 | import numpy as np 10 | import theano.tensor as TT 11 | 12 | 13 | class DummyEnv(Env): 14 | @property 15 | def observation_space(self): 16 | return Box(low=-np.inf, high=np.inf, shape=(1,)) 17 | 18 | @property 19 | def action_space(self): 20 | return Box(low=-5.0, high=5.0, shape=(1,)) 21 | 22 | def reset(self): 23 | return np.zeros(1) 24 | 25 | def step(self, action): 26 | return Step(observation=np.zeros(1), reward=np.random.normal(), done=True) 27 | 28 | 29 | def naive_relu(x): 30 | return TT.max(x, 0) 31 | 32 | 33 | def test_trpo_relu_nan(): 34 | env = DummyEnv() 35 | policy = GaussianMLPPolicy( 36 | env_spec=env.spec, 37 | hidden_nonlinearity=naive_relu, 38 | hidden_sizes=(1,)) 39 | baseline = ZeroBaseline(env_spec=env.spec) 40 | algo = TRPO( 41 | env=env, policy=policy, baseline=baseline, n_itr=1, batch_size=1000, max_path_length=100, 42 | step_size=0.001 43 | ) 44 | algo.train() 45 | assert not np.isnan(np.sum(policy.get_param_values())) 46 | 47 | 48 | def test_trpo_deterministic_nan(): 49 | env = DummyEnv() 50 | policy = GaussianMLPPolicy( 51 | env_spec=env.spec, 52 | hidden_sizes=(1,)) 53 | policy._l_log_std.param.set_value([np.float32(np.log(1e-8))]) 54 | baseline = ZeroBaseline(env_spec=env.spec) 55 | algo = TRPO( 56 | env=env, policy=policy, baseline=baseline, n_itr=10, batch_size=1000, max_path_length=100, 57 | step_size=0.01 58 | ) 59 | algo.train() 60 | assert not np.isnan(np.sum(policy.get_param_values())) 61 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/envs/vec_env_executor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pickle 
as pickle 3 | from sandbox.rocky.tf.misc import tensor_utils 4 | 5 | 6 | class VecEnvExecutor(object): 7 | def __init__(self, envs, max_path_length): 8 | self.envs = envs 9 | self._action_space = envs[0].action_space 10 | self._observation_space = envs[0].observation_space 11 | self.ts = np.zeros(len(self.envs), dtype='int') 12 | self.max_path_length = max_path_length 13 | 14 | def step(self, action_n, reset_args=None): 15 | if reset_args is None: 16 | reset_args = [None]*len(self.envs) 17 | all_results = [env.step(a) for (a, env) in zip(action_n, self.envs)] 18 | obs, rewards, dones, env_infos = list(map(list, list(zip(*all_results)))) 19 | dones = np.asarray(dones) 20 | rewards = np.asarray(rewards) 21 | self.ts += 1 22 | if self.max_path_length is not None: 23 | dones[self.ts >= self.max_path_length] = True 24 | for (i, done) in enumerate(dones): 25 | if done: 26 | obs[i] = self.envs[i].reset(reset_args=reset_args[i]) 27 | self.ts[i] = 0 28 | return obs, rewards, dones, tensor_utils.stack_tensor_dict_list(env_infos) 29 | 30 | def reset(self, reset_args=None): 31 | if reset_args is not None: 32 | results = [env.reset(reset_args=arg) for env, arg in zip(self.envs, reset_args)] 33 | else: 34 | results = [env.reset() for env in self.envs] 35 | self.ts[:] = 0 36 | return results 37 | 38 | @property 39 | def num_envs(self): 40 | return len(self.envs) 41 | 42 | @property 43 | def action_space(self): 44 | return self._action_space 45 | 46 | @property 47 | def observation_space(self): 48 | return self._observation_space 49 | 50 | def terminate(self): 51 | pass 52 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/discrete.py: -------------------------------------------------------------------------------- 1 | from rllab.spaces.base import Space 2 | import numpy as np 3 | from rllab.misc import special 4 | from rllab.misc import ext 5 | import tensorflow as tf 6 | 7 | 8 | class Discrete(Space): 9 | """ 10 | {0,1,...,n-1} 11 | """ 12 | 13 | def __init__(self, n): 14 | self._n = n 15 | 16 | @property 17 | def n(self): 18 | return self._n 19 | 20 | def sample(self): 21 | return np.random.randint(self.n) 22 | 23 | def sample_n(self, n): 24 | return np.random.randint(low=0, high=self.n, size=n) 25 | 26 | def contains(self, x): 27 | x = np.asarray(x) 28 | return x.shape == () and x.dtype.kind == 'i' and x >= 0 and x < self.n 29 | 30 | def __repr__(self): 31 | return "Discrete(%d)" % self.n 32 | 33 | def __eq__(self, other): 34 | return self.n == other.n 35 | 36 | def flatten(self, x): 37 | return special.to_onehot(x, self.n) 38 | 39 | def unflatten(self, x): 40 | return special.from_onehot(x) 41 | 42 | def flatten_n(self, x): 43 | return special.to_onehot_n(x, self.n) 44 | 45 | def unflatten_n(self, x): 46 | return special.from_onehot_n(x) 47 | 48 | @property 49 | def default_value(self): 50 | return 0 51 | 52 | @property 53 | def flat_dim(self): 54 | return self.n 55 | 56 | def weighted_sample(self, weights): 57 | return special.weighted_sample(weights, range(self.n)) 58 | 59 | def new_tensor_variable(self, name, extra_dims): 60 | # needed for safe conversion to float32 61 | return tf.placeholder(dtype=tf.uint8, shape=[None] * extra_dims + [self.flat_dim], name=name) 62 | 63 | def __eq__(self, other): 64 | if not isinstance(other, Discrete): 65 | return False 66 | return self.n == other.n 67 | 68 | def __hash__(self): 69 | return hash(self.n) 70 | -------------------------------------------------------------------------------- 
/vendor/mujoco_models/point.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 32 | -------------------------------------------------------------------------------- /examples/icml/trpo_point.py: -------------------------------------------------------------------------------- 1 | 2 | from sandbox.rocky.tf.algos.trpo import TRPO 3 | from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy 4 | from sandbox.rocky.tf.envs.base import TfEnv 5 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 6 | from examples.point_env_randgoal import PointEnvRandGoal 7 | from examples.point_env_randgoal_oracle import PointEnvRandGoalOracle 8 | #from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv 9 | from rllab.envs.mujoco.walker2d_env import Walker2DEnv 10 | from rllab.envs.normalized_env import normalize 11 | from rllab.misc.instrument import stub, run_experiment_lite 12 | 13 | stub(globals()) 14 | 15 | import tensorflow as tf 16 | 17 | #env = normalize(PointEnvRandGoal()) 18 | env = normalize(PointEnvRandGoalOracle()) 19 | #env = normalize(HalfCheetahEnv()) 20 | #env = normalize(Walker2DEnv()) 21 | env = TfEnv(env) 22 | policy = GaussianMLPPolicy( 23 | name='policy', 24 | env_spec=env.spec, 25 | # The neural network policy should have two hidden layers, each with 32 hidden units. 26 | #hidden_sizes=(32, 32) 27 | #hidden_nonlinearity=tf.nn.relu, 28 | hidden_sizes=(100, 100) 29 | ) 30 | 31 | baseline = LinearFeatureBaseline(env_spec=env.spec) 32 | 33 | algo = TRPO( 34 | env=env, 35 | policy=policy, 36 | baseline=baseline, 37 | batch_size=500, # was 4k 38 | max_path_length=5, 39 | n_itr=100, 40 | discount=0.99, 41 | step_size=0.01, 42 | #plot=True, 43 | ) 44 | #algo.train() 45 | 46 | run_experiment_lite( 47 | algo.train(), 48 | # Number of parallel workers for sampling 49 | n_parallel=4, 50 | # Only keep the snapshot parameters for the last iteration 51 | snapshot_mode="last", 52 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 53 | # will be used 54 | seed=1, 55 | exp_prefix='vpg_sensitive_point', 56 | exp_name='oracleenv', 57 | #plot=True, 58 | ) 59 | -------------------------------------------------------------------------------- /examples/vpg_swimmer.py: -------------------------------------------------------------------------------- 1 | #from rllab.algos.vpg import VPG 2 | from sandbox.rocky.tf.algos.vpg import VPG 3 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 4 | from rllab.baselines.zero_baseline import ZeroBaseline 5 | from rllab.envs.mujoco.swimmer_env import SwimmerEnv 6 | from rllab.envs.mujoco.swimmer_randgoal_oracle_env import SwimmerRandGoalOracleEnv 7 | from rllab.envs.mujoco.swimmer_randgoal_env import SwimmerRandGoalEnv 8 | from rllab.envs.normalized_env import normalize 9 | from rllab.misc.instrument import stub, run_experiment_lite 10 | #from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 11 | #from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 12 | from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy 13 | from sandbox.rocky.tf.envs.base import TfEnv 14 | 15 | stub(globals()) 16 | oracle = False 17 | random = True 18 | 19 | if oracle: 20 | env = TfEnv(normalize(SwimmerRandGoalOracleEnv())) 21 | batch_size = 200 22 | elif random: 23 | env = TfEnv(normalize(SwimmerRandGoalEnv())) 24 | batch_size = 200 25 | else: 26 | env = TfEnv(normalize(SwimmerEnv())) 27 | batch_size = 20 28 | policy = GaussianMLPPolicy( 29 | name="policy", 30 | env_spec=env.spec, 31 | hidden_sizes=(100,100), 32 | ) 33 | baseline = LinearFeatureBaseline(env_spec=env.spec) 34 | #baseline = ZeroBaseline(env_spec=env.spec) 35 | algo = VPG( 36 | env=env, 37 | policy=policy, 38 | baseline=baseline, 39 | batch_size=500*batch_size, 40 | max_path_length=500, 41 | n_itr=500, 42 | #plot=True, 43 | optimizer_args={'tf_optimizer_args':{'learning_rate': 1e-3}}, 44 | ) 45 | run_experiment_lite( 46 | algo.train(), 47 | n_parallel=1, # try increasing this to make it faster??? 
(Maybe need to modify code for this) 48 | snapshot_mode="last", 49 | seed=1, 50 | exp_prefix='vpgswimmer', 51 | #exp_name='basic', 52 | exp_name='randomenv', 53 | #plot=True, 54 | ) 55 | -------------------------------------------------------------------------------- /icml/make_paths_plot.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | import numpy as np 3 | import pickle 4 | 5 | prefixes = ['maml', 'pretrain'] 6 | 7 | n_itr = 4 8 | goal = [-0.29554775, 0.37811744] 9 | 10 | 11 | plt.clf() 12 | plt.hold(True) 13 | itr_line_styles = [':', '-.', '--', '-'] 14 | maml_colors = ['dodgerblue', None, None, 'darkblue'] 15 | pretrain_colors = ['limegreen',None, None, 'darkgreen'] 16 | 17 | plt.figure(figsize=(9.0,4.5)) 18 | ind = 0 19 | #for itr in range(n_itr): 20 | for itr in [0,3]: 21 | with open('maml_paths_itr'+str(itr)+'.pkl', 'rb') as f: 22 | paths = pickle.load(f) 23 | points = paths[ind]['observations'] 24 | plt.plot(points[:,0], points[:,1], itr_line_styles[itr], color=maml_colors[itr], linewidth=2) 25 | plt.plot(goal[0], goal[1], 'r*', markersize=28, markeredgewidth=0) 26 | plt.title('MAML', fontsize=25) 27 | plt.legend(['pre-update', '3 steps', 'goal position'], fontsize=23, loc='upper right') #, 'pretrain preupdate', 'pretrain 3 steps']) 28 | plt.xlim([-0.5, 0.3]) 29 | plt.ylim([-0.2, 0.6]) 30 | plt.tight_layout() 31 | ax = plt.gca() 32 | plt.setp(ax.get_xticklabels(), fontsize=14) 33 | plt.setp(ax.get_yticklabels(), fontsize=14) 34 | plt.savefig('maml_paths_viz.png') 35 | 36 | plt.clf() 37 | #for itr in n_itr: 38 | for itr in [0,3]: 39 | with open('pretrain_paths_itr'+str(itr)+'.pkl', 'rb') as f: 40 | paths = pickle.load(f) 41 | points = paths[ind]['observations'] 42 | plt.plot(points[:,0], points[:,1], itr_line_styles[itr], color=pretrain_colors[itr], linewidth=2) 43 | plt.plot(goal[0], goal[1], 'r*', markersize=28, markeredgewidth=0) 44 | plt.title('pretrained', fontsize=25) 45 | plt.legend(['pre-update', '3 steps', 'goal position'], fontsize=23, loc='lower left') #, 'pretrain preupdate', 'pretrain 3 steps']) 46 | 47 | plt.xlim([-0.5, 0.3]) 48 | plt.ylim([-0.2, 0.6]) 49 | plt.tight_layout() 50 | ax = plt.gca() 51 | plt.setp(ax.get_xticklabels(), fontsize=14) 52 | plt.setp(ax.get_yticklabels(), fontsize=14) 53 | plt.savefig('pretrain_paths_viz.png') 54 | -------------------------------------------------------------------------------- /examples/trpo_gym.py: -------------------------------------------------------------------------------- 1 | use_tf = True 2 | 3 | 4 | if use_tf: 5 | from sandbox.rocky.tf.algos.trpo import TRPO 6 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | from sandbox.rocky.tf.envs.base import TfEnv 8 | else: 9 | from rllab.algos.trpo import TRPO 10 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 11 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 12 | from rllab.envs.gym_env import GymEnv 13 | from rllab.envs.normalized_env import normalize 14 | from rllab.misc.instrument import stub, run_experiment_lite 15 | 16 | stub(globals()) 17 | 18 | #env = normalize(GymEnv("Pendulum-v0")) 19 | env = normalize(GymEnv("Walker2d-v1")) 20 | 21 | if use_tf: 22 | env = TfEnv(env) 23 | policy = GaussianMLPPolicy( 24 | name='policy', 25 | env_spec=env.spec, 26 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
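        # Note: the TF policy above also takes a `name` argument for its variable scope;
        # the Theano policy constructed in the else-branch below does not.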
27 | hidden_sizes=(32, 32) 28 | ) 29 | else: 30 | policy = GaussianMLPPolicy( 31 | env_spec=env.spec, 32 | # The neural network policy should have two hidden layers, each with 32 hidden units. 33 | hidden_sizes=(32, 32) 34 | ) 35 | 36 | baseline = LinearFeatureBaseline(env_spec=env.spec) 37 | 38 | algo = TRPO( 39 | env=env, 40 | policy=policy, 41 | baseline=baseline, 42 | batch_size=4000, 43 | max_path_length=env.horizon, 44 | n_itr=10000, 45 | discount=0.99, 46 | step_size=0.01, 47 | force_batch_sampler=True, # for TF 48 | # Uncomment both lines (this and the plot parameter below) to enable plotting 49 | plot=True, 50 | ) 51 | 52 | run_experiment_lite( 53 | algo.train(), 54 | # Number of parallel workers for sampling 55 | n_parallel=1, 56 | # Only keep the snapshot parameters for the last iteration 57 | snapshot_mode="last", 58 | # Specifies the seed for the experiment. If this is not provided, a random seed 59 | # will be used 60 | seed=1, 61 | plot=True, 62 | ) 63 | -------------------------------------------------------------------------------- /examples/cluster_demo.py: -------------------------------------------------------------------------------- 1 | from rllab.algos.trpo import TRPO 2 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 3 | from rllab.envs.box2d.cartpole_env import CartpoleEnv 4 | from rllab.envs.normalized_env import normalize 5 | from rllab.misc.instrument import stub, run_experiment_lite 6 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 7 | import sys 8 | 9 | stub(globals()) 10 | 11 | from rllab.misc.instrument import VariantGenerator, variant 12 | 13 | class VG(VariantGenerator): 14 | 15 | @variant 16 | def step_size(self): 17 | return [0.01, 0.05, 0.1] 18 | 19 | @variant 20 | def seed(self): 21 | return [1, 11, 21, 31, 41] 22 | 23 | variants = VG().variants() 24 | 25 | for v in variants: 26 | 27 | env = normalize(CartpoleEnv()) 28 | 29 | policy = GaussianMLPPolicy( 30 | env_spec=env.spec, 31 | # The neural network policy should have two hidden layers, each with 32 hidden units. 32 | hidden_sizes=(32, 32) 33 | ) 34 | 35 | baseline = LinearFeatureBaseline(env_spec=env.spec) 36 | 37 | algo = TRPO( 38 | env=env, 39 | policy=policy, 40 | baseline=baseline, 41 | batch_size=4000, 42 | max_path_length=100, 43 | n_itr=40, 44 | discount=0.99, 45 | step_size=v["step_size"], 46 | # Uncomment both lines (this and the plot parameter below) to enable plotting 47 | # plot=True, 48 | ) 49 | 50 | run_experiment_lite( 51 | algo.train(), 52 | exp_prefix="first_exp", 53 | # Number of parallel workers for sampling 54 | n_parallel=1, 55 | # Only keep the snapshot parameters for the last iteration 56 | snapshot_mode="last", 57 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 58 | # will be used 59 | seed=v["seed"], 60 | # mode="local", 61 | mode="ec2", 62 | variant=v, 63 | # plot=True, 64 | # terminate_machine=False, 65 | ) 66 | -------------------------------------------------------------------------------- /rllab/plotter/plotter.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import sys 3 | if sys.version_info[0] == 2: 4 | from Queue import Empty 5 | else: 6 | from queue import Empty 7 | from multiprocessing import Process, Queue 8 | from rllab.sampler.utils import rollout 9 | import numpy as np 10 | 11 | __all__ = [ 12 | 'init_worker', 13 | 'init_plot', 14 | 'update_plot' 15 | ] 16 | 17 | process = None 18 | queue = None 19 | 20 | 21 | def _worker_start(): 22 | env = None 23 | policy = None 24 | max_length = None 25 | try: 26 | while True: 27 | msgs = {} 28 | # Only fetch the last message of each type 29 | while True: 30 | try: 31 | msg = queue.get_nowait() 32 | msgs[msg[0]] = msg[1:] 33 | except Empty: 34 | break 35 | if 'stop' in msgs: 36 | break 37 | elif 'update' in msgs: 38 | env, policy = msgs['update'] 39 | # env.start_viewer() 40 | elif 'demo' in msgs: 41 | param_values, max_length = msgs['demo'] 42 | policy.set_param_values(param_values) 43 | rollout(env, policy, max_path_length=max_length, animated=True, speedup=5) 44 | else: 45 | if max_length: 46 | rollout(env, policy, max_path_length=max_length, animated=True, speedup=5) 47 | except KeyboardInterrupt: 48 | pass 49 | 50 | 51 | def _shutdown_worker(): 52 | if process: 53 | queue.put(['stop']) 54 | queue.close() 55 | process.join() 56 | 57 | 58 | def init_worker(): 59 | global process, queue 60 | queue = Queue() 61 | process = Process(target=_worker_start) 62 | process.start() 63 | atexit.register(_shutdown_worker) 64 | 65 | 66 | def init_plot(env, policy): 67 | queue.put(['update', env, policy]) 68 | 69 | 70 | def update_plot(policy, max_length=np.inf): 71 | queue.put(['demo', policy.get_param_values(), max_length]) 72 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/point_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Step 2 | from .mujoco_env import MujocoEnv 3 | from rllab.core.serializable import Serializable 4 | from rllab.misc.overrides import overrides 5 | import numpy as np 6 | import math 7 | from rllab.mujoco_py import glfw 8 | 9 | 10 | class PointEnv(MujocoEnv, Serializable): 11 | 12 | """ 13 | Use Left, Right, Up, Down, A (steer left), D (steer right) 14 | """ 15 | 16 | FILE = 'point.xml' 17 | 18 | def __init__(self, *args, **kwargs): 19 | super(PointEnv, self).__init__(*args, **kwargs) 20 | Serializable.quick_init(self, locals()) 21 | 22 | def step(self, action): 23 | qpos = np.copy(self.model.data.qpos) 24 | qpos[2, 0] += action[1] 25 | ori = qpos[2, 0] 26 | # compute increment in each direction 27 | dx = math.cos(ori) * action[0] 28 | dy = math.sin(ori) * action[0] 29 | # ensure that the robot is within reasonable range 30 | qpos[0, 0] = np.clip(qpos[0, 0] + dx, -7, 7) 31 | qpos[1, 0] = np.clip(qpos[1, 0] + dy, -7, 7) 32 | self.model.data.qpos = qpos 33 | self.model.forward() 34 | next_obs = self.get_current_obs() 35 | return Step(next_obs, 0, False) 36 | 37 | def get_xy(self): 38 | qpos = self.model.data.qpos 39 | return qpos[0, 0], qpos[1, 0] 40 | 41 | def set_xy(self, xy): 42 | qpos = np.copy(self.model.data.qpos) 43 | qpos[0, 0] = xy[0] 44 | qpos[1, 0] = 
xy[1] 45 | self.model.data.qpos = qpos 46 | self.model.forward() 47 | 48 | @overrides 49 | def action_from_key(self, key): 50 | lb, ub = self.action_bounds 51 | if key == glfw.KEY_LEFT: 52 | return np.array([0, ub[0]*0.3]) 53 | elif key == glfw.KEY_RIGHT: 54 | return np.array([0, lb[0]*0.3]) 55 | elif key == glfw.KEY_UP: 56 | return np.array([ub[1], 0]) 57 | elif key == glfw.KEY_DOWN: 58 | return np.array([lb[1], 0]) 59 | else: 60 | return np.array([0, 0]) 61 | 62 | -------------------------------------------------------------------------------- /examples/trpo_point.py: -------------------------------------------------------------------------------- 1 | 2 | from sandbox.rocky.tf.algos.trpo import TRPO 3 | from sandbox.rocky.tf.algos.vpg import VPG 4 | from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy 5 | from sandbox.rocky.tf.envs.base import TfEnv 6 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 7 | from examples.point_env_randgoal import PointEnvRandGoal 8 | from examples.point_env_randgoal_oracle import PointEnvRandGoalOracle 9 | #from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv 10 | from rllab.envs.mujoco.walker2d_env import Walker2DEnv 11 | from rllab.envs.normalized_env import normalize 12 | from rllab.misc.instrument import stub, run_experiment_lite 13 | 14 | stub(globals()) 15 | 16 | import tensorflow as tf 17 | 18 | #env = normalize(PointEnvRandGoal()) 19 | env = normalize(PointEnvRandGoalOracle()) 20 | 21 | #env = normalize(HalfCheetahEnv()) 22 | #env = normalize(Walker2DEnv()) 23 | env = TfEnv(env) 24 | policy = GaussianMLPPolicy( 25 | name='policy', 26 | env_spec=env.spec, 27 | # The neural network policy should have two hidden layers, each with 32 hidden units. 28 | #hidden_sizes=(32, 32) 29 | hidden_nonlinearity=tf.nn.relu, 30 | hidden_sizes=(100, 100) 31 | ) 32 | 33 | baseline = LinearFeatureBaseline(env_spec=env.spec) 34 | 35 | algo = TRPO( 36 | #algo = VPG( 37 | env=env, 38 | policy=policy, 39 | baseline=baseline, 40 | batch_size=1000, # was 4k # 500 for path lenght of 5, 1000 for path length of 100 41 | max_path_length=100, 42 | n_itr=100, 43 | discount=0.99, 44 | step_size=0.01, 45 | #plot=True, 46 | ) 47 | #algo.train() 48 | 49 | run_experiment_lite( 50 | algo.train(), 51 | # Number of parallel workers for sampling 52 | n_parallel=4, 53 | # Only keep the snapshot parameters for the last iteration 54 | snapshot_mode="last", 55 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 56 | # will be used 57 | seed=1, 58 | exp_prefix='vpg_sensitive_point100', 59 | exp_name='oracleenv2', 60 | #plot=True, 61 | ) 62 | -------------------------------------------------------------------------------- /rllab/distributions/bernoulli.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from .base import Distribution 4 | import theano.tensor as TT 5 | import numpy as np 6 | 7 | TINY = 1e-8 8 | 9 | 10 | class Bernoulli(Distribution): 11 | def __init__(self, dim): 12 | self._dim = dim 13 | 14 | @property 15 | def dim(self): 16 | return self._dim 17 | 18 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 19 | old_p = old_dist_info_vars["p"] 20 | new_p = new_dist_info_vars["p"] 21 | kl = old_p * (TT.log(old_p + TINY) - TT.log(new_p + TINY)) + \ 22 | (1 - old_p) * (TT.log(1 - old_p + TINY) - TT.log(1 - new_p + TINY)) 23 | return TT.sum(kl, axis=-1) 24 | 25 | def kl(self, old_dist_info, new_dist_info): 26 | old_p = old_dist_info["p"] 27 | new_p = new_dist_info["p"] 28 | kl = old_p * (np.log(old_p + TINY) - np.log(new_p + TINY)) + \ 29 | (1 - old_p) * (np.log(1 - old_p + TINY) - np.log(1 - new_p + TINY)) 30 | return np.sum(kl, axis=-1) 31 | 32 | def sample(self, dist_info): 33 | p = np.asarray(dist_info["p"]) 34 | return np.cast['int'](np.random.uniform(low=0., high=1., size=p.shape) < p) 35 | 36 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 37 | old_p = old_dist_info_vars["p"] 38 | new_p = new_dist_info_vars["p"] 39 | return TT.prod(x_var * new_p / (old_p + TINY) + (1 - x_var) * (1 - new_p) / (1 - old_p + TINY), 40 | axis=-1) 41 | 42 | def log_likelihood_sym(self, x_var, dist_info_vars): 43 | p = dist_info_vars["p"] 44 | return TT.sum(x_var * TT.log(p + TINY) + (1 - x_var) * TT.log(1 - p + TINY), axis=-1) 45 | 46 | def log_likelihood(self, xs, dist_info): 47 | p = dist_info["p"] 48 | return np.sum(xs * np.log(p + TINY) + (1 - xs) * np.log(1 - p + TINY), axis=-1) 49 | 50 | def entropy(self, dist_info): 51 | p = dist_info["p"] 52 | return np.sum(- p * np.log(p + TINY) - (1 - p) * np.log(1 - p + TINY), axis=-1) 53 | 54 | @property 55 | def dist_info_keys(self): 56 | return ["p"] 57 | -------------------------------------------------------------------------------- /rllab/envs/box2d/cartpole_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rllab.envs.box2d.parser import find_body 3 | 4 | from rllab.core.serializable import Serializable 5 | from rllab.envs.box2d.box2d_env import Box2DEnv 6 | from rllab.misc import autoargs 7 | from rllab.misc.overrides import overrides 8 | 9 | 10 | class CartpoleEnv(Box2DEnv, Serializable): 11 | 12 | @autoargs.inherit(Box2DEnv.__init__) 13 | def __init__(self, *args, **kwargs): 14 | self.max_pole_angle = .2 15 | self.max_cart_pos = 2.4 16 | self.max_cart_speed = 4. 17 | self.max_pole_speed = 4. 
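        # reset_range scales the state bounds used in reset(): the initial cart position/velocity
        # and pole angle/velocity are sampled uniformly from
        # +/- reset_range * (max_cart_pos, max_cart_speed, max_pole_angle, max_pole_speed).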
18 | self.reset_range = 0.05 19 | super(CartpoleEnv, self).__init__( 20 | self.model_path("cartpole.xml.mako"), 21 | *args, **kwargs 22 | ) 23 | self.cart = find_body(self.world, "cart") 24 | self.pole = find_body(self.world, "pole") 25 | Serializable.__init__(self, *args, **kwargs) 26 | 27 | @overrides 28 | def reset(self): 29 | self._set_state(self.initial_state) 30 | self._invalidate_state_caches() 31 | bounds = np.array([ 32 | self.max_cart_pos, 33 | self.max_cart_speed, 34 | self.max_pole_angle, 35 | self.max_pole_speed 36 | ]) 37 | low, high = -self.reset_range*bounds, self.reset_range*bounds 38 | xpos, xvel, apos, avel = np.random.uniform(low, high) 39 | self.cart.position = (xpos, self.cart.position[1]) 40 | self.cart.linearVelocity = (xvel, self.cart.linearVelocity[1]) 41 | self.pole.angle = apos 42 | self.pole.angularVelocity = avel 43 | return self.get_current_obs() 44 | 45 | @overrides 46 | def compute_reward(self, action): 47 | yield 48 | notdone = 1 - int(self.is_current_done()) 49 | ucost = 1e-5*(action**2).sum() 50 | xcost = 1 - np.cos(self.pole.angle) 51 | yield notdone * 10 - notdone * xcost - notdone * ucost 52 | 53 | @overrides 54 | def is_current_done(self): 55 | return abs(self.cart.position[0]) > self.max_cart_pos or \ 56 | abs(self.pole.angle) > self.max_pole_angle 57 | 58 | -------------------------------------------------------------------------------- /rllab/envs/box2d/models/car_parking.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | -------------------------------------------------------------------------------- /scripts/resume_training.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.sampler.utils import rollout 5 | from rllab.algos.batch_polopt import BatchPolopt 6 | import argparse 7 | import joblib 8 | import uuid 9 | import os 10 | import random 11 | import numpy as np 12 | import json 13 | import subprocess 14 | from rllab.misc import logger 15 | from rllab.misc.instrument import to_local_command 16 | 17 | filename = str(uuid.uuid4()) 18 | 19 | if __name__ == "__main__": 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('file', type=str, 23 | help='path to the snapshot file') 24 | parser.add_argument('--log_dir', type=str, default=None, 25 | help='path to the new log directory') 26 | # Look for params.json file 27 | args = parser.parse_args() 28 | parent_dir = os.path.dirname(os.path.realpath(args.file)) 29 | json_file_path = os.path.join(parent_dir, "params.json") 30 | logger.log("Looking for params.json at %s..." % json_file_path) 31 | try: 32 | with open(json_file_path, "r") as f: 33 | params = json.load(f) 34 | # exclude certain parameters 35 | excluded = ['json_args'] 36 | for k in excluded: 37 | if k in params: 38 | del params[k] 39 | for k, v in list(params.items()): 40 | if v is None: 41 | del params[k] 42 | if args.log_dir is not None: 43 | params['log_dir'] = args.log_dir 44 | params['resume_from'] = args.file 45 | command = to_local_command(params, script='scripts/run_experiment_lite.py') 46 | print(command) 47 | try: 48 | subprocess.call(command, shell=True, env=os.environ) 49 | except Exception as e: 50 | print(e) 51 | if isinstance(e, KeyboardInterrupt): 52 | raise 53 | except IOError as e: 54 | logger.log("Failed to find json file. 
Continuing in non-stub mode...") 55 | data = joblib.load(args.file) 56 | assert 'algo' in data 57 | algo = data['algo'] 58 | assert isinstance(algo, BatchPolopt) 59 | algo.train() 60 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/ant_env.py: -------------------------------------------------------------------------------- 1 | from .mujoco_env import MujocoEnv 2 | from rllab.core.serializable import Serializable 3 | import numpy as np 4 | 5 | from rllab.envs.base import Step 6 | from rllab.misc.overrides import overrides 7 | from rllab.misc import logger 8 | 9 | 10 | class AntEnv(MujocoEnv, Serializable): 11 | 12 | FILE = 'ant.xml' 13 | 14 | def __init__(self, *args, **kwargs): 15 | super(AntEnv, self).__init__(*args, **kwargs) 16 | Serializable.__init__(self, *args, **kwargs) 17 | 18 | def get_current_obs(self): 19 | return np.concatenate([ 20 | self.model.data.qpos.flat, 21 | self.model.data.qvel.flat, 22 | np.clip(self.model.data.cfrc_ext, -1, 1).flat, 23 | self.get_body_xmat("torso").flat, 24 | self.get_body_com("torso"), 25 | ]).reshape(-1) 26 | 27 | def step(self, action): 28 | self.forward_dynamics(action) 29 | comvel = self.get_body_comvel("torso") 30 | forward_reward = comvel[0] 31 | lb, ub = self.action_bounds 32 | scaling = (ub - lb) * 0.5 33 | ctrl_cost = 0.5 * 1e-2 * np.sum(np.square(action / scaling)) 34 | contact_cost = 0.5 * 1e-3 * np.sum( 35 | np.square(np.clip(self.model.data.cfrc_ext, -1, 1))), 36 | survive_reward = 0.05 37 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward 38 | state = self._state 39 | notdone = np.isfinite(state).all() \ 40 | and state[2] >= 0.2 and state[2] <= 1.0 41 | done = not notdone 42 | ob = self.get_current_obs() 43 | return Step(ob, float(reward), done) 44 | 45 | @overrides 46 | def log_diagnostics(self, paths): 47 | progs = [ 48 | path["observations"][-1][-3] - path["observations"][0][-3] 49 | for path in paths 50 | ] 51 | logger.record_tabular('AverageForwardProgress', np.mean(progs)) 52 | logger.record_tabular('MaxForwardProgress', np.max(progs)) 53 | logger.record_tabular('MinForwardProgress', np.min(progs)) 54 | logger.record_tabular('StdForwardProgress', np.std(progs)) 55 | 56 | -------------------------------------------------------------------------------- /icml/make_point_plots.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | #names = ['maml','sens0','random','oracle'] 8 | 9 | prefix = 'icml_point_results_' 10 | oracle_pkl = prefix+'oracle.pkl' 11 | 12 | maml_pkl = prefix+'maml.pkl' 13 | pretrain_pkl = prefix+'sens0.pkl' 14 | random_pkl = prefix+'random.pkl' 15 | 16 | key = 'task_avg_returns' 17 | 18 | n_itr = 4 19 | 20 | with open(oracle_pkl, 'rb') as f: 21 | oracle_data = np.array(pickle.load(f)[key])[0] 22 | 23 | 24 | oracle_data = np.reshape(oracle_data, [-1, 1]) 25 | oracle_data = np.tile(oracle_data[:,0:1], [1,n_itr]) 26 | 27 | fig = plt.figure() 28 | plt.clf() 29 | 30 | with open(maml_pkl, 'rb') as maml_f: 31 | maml_data = np.array(pickle.load(maml_f)[key]).T[:,:n_itr] 32 | 33 | with open(pretrain_pkl, 'rb') as f: 34 | pretrain_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 35 | 36 | with open(random_pkl, 'rb') as f: 37 | random_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 38 | 39 | 40 | sns.tsplot(time=range(n_itr), data=maml_data[:,:n_itr], color='g', linestyle='-', marker='o', 
condition='MAML (ours)') 41 | sns.tsplot(time=range(n_itr), data=pretrain_data[:,:n_itr], color='b', linestyle='--', marker='s', condition='pretrained') 42 | sns.tsplot(time=range(n_itr), data=random_data[:,:n_itr], color='k', linestyle=':', marker='^', condition='random') 43 | sns.tsplot(time=range(n_itr), data=oracle_data[:,:n_itr], color='r', linestyle='-.', marker='v', condition='oracle') 44 | ax = fig.gca() 45 | ax.set(yscale='symlog') 46 | 47 | plt.ylim([-100,-2.0]) 48 | 49 | plt.xlabel('number of gradient steps', fontsize=27) 50 | plt.ylabel('average return (log scale)', fontsize=27) 51 | lgd=plt.legend(['MAML (ours)', 'pretrained', 'random', 'oracle'], loc=(0.01, 0.51), fontsize=22) #, bbox_to_anchor=(1, 0.5), fontsize=18) 52 | plt.title('point robot, 2d navigation', fontsize=27) 53 | plt.tight_layout() 54 | 55 | ax = plt.gca() 56 | plt.setp(ax.get_xticklabels(), fontsize=18) 57 | plt.setp(ax.get_yticklabels(), fontsize=18) 58 | plt.xticks(np.arange(0,4,1.0)) 59 | plt.savefig('point_results.png', bbox_extra_artists=(lgd,), bbox_inches='tight') 60 | -------------------------------------------------------------------------------- /rllab/spaces/discrete.py: -------------------------------------------------------------------------------- 1 | from .base import Space 2 | import numpy as np 3 | from rllab.misc import special 4 | from rllab.misc import ext 5 | 6 | 7 | class Discrete(Space): 8 | """ 9 | {0,1,...,n-1} 10 | """ 11 | 12 | def __init__(self, n): 13 | self._n = n 14 | 15 | @property 16 | def n(self): 17 | return self._n 18 | 19 | def sample(self): 20 | return np.random.randint(self.n) 21 | 22 | def contains(self, x): 23 | x = np.asarray(x) 24 | return x.shape == () and x.dtype.kind == 'i' and x >= 0 and x < self.n 25 | 26 | def __repr__(self): 27 | return "Discrete(%d)" % self.n 28 | 29 | def __eq__(self, other): 30 | return self.n == other.n 31 | 32 | def flatten(self, x): 33 | return special.to_onehot(x, self.n) 34 | 35 | def unflatten(self, x): 36 | return special.from_onehot(x) 37 | 38 | def flatten_n(self, x): 39 | return special.to_onehot_n(x, self.n) 40 | 41 | def unflatten_n(self, x): 42 | return special.from_onehot_n(x) 43 | 44 | @property 45 | def flat_dim(self): 46 | return self.n 47 | 48 | def weighted_sample(self, weights): 49 | return special.weighted_sample(weights, range(self.n)) 50 | 51 | @property 52 | def default_value(self): 53 | return 0 54 | 55 | def new_tensor_variable(self, name, extra_dims): 56 | if self.n <= 2 ** 8: 57 | return ext.new_tensor( 58 | name=name, 59 | ndim=extra_dims+1, 60 | dtype='uint8' 61 | ) 62 | elif self.n <= 2 ** 16: 63 | return ext.new_tensor( 64 | name=name, 65 | ndim=extra_dims+1, 66 | dtype='uint16' 67 | ) 68 | else: 69 | return ext.new_tensor( 70 | name=name, 71 | ndim=extra_dims+1, 72 | dtype='uint32' 73 | ) 74 | 75 | def __eq__(self, other): 76 | if not isinstance(other, Discrete): 77 | return False 78 | return self.n == other.n 79 | 80 | def __hash__(self): 81 | return hash(self.n) -------------------------------------------------------------------------------- /icml/make_antdirec_plots.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | #names = ['maml','sens0','random','oracle'] 8 | 9 | prefix = 'icml_antdirec_results_' 10 | oracle_pkl = prefix+'oracle.pkl' 11 | 12 | maml_pkl = prefix+'maml.pkl' 13 | pretrain_pkl = prefix+'pretrain.pkl' 14 | random_pkl = 
prefix+'random.pkl' 15 | 16 | key = 'task_avg_returns' 17 | 18 | n_itr = 4 19 | 20 | with open(oracle_pkl, 'rb') as f: 21 | oracle_data = np.array(pickle.load(f)[key])[0] 22 | 23 | 24 | oracle_data = np.reshape(oracle_data, [-1, 1]) 25 | oracle_data = np.tile(oracle_data[:,0:1], [1,n_itr]) 26 | 27 | fig = plt.figure() 28 | plt.clf() 29 | 30 | with open(maml_pkl, 'rb') as maml_f: 31 | maml_data = np.array(pickle.load(maml_f)[key]).T[:,:n_itr] 32 | 33 | with open(pretrain_pkl, 'rb') as f: 34 | pretrain_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 35 | 36 | with open(random_pkl, 'rb') as f: 37 | random_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 38 | 39 | 40 | sns.tsplot(time=range(n_itr), data=maml_data[:,:n_itr], color='g', linestyle='-', marker='o', condition='MAML (ours)', legend=False) 41 | sns.tsplot(time=range(n_itr), data=pretrain_data[:,:n_itr], color='b', linestyle='--', marker='s', condition='pretrained', legend=False) 42 | sns.tsplot(time=range(n_itr), data=random_data[:,:n_itr], color='k', linestyle=':', marker='^', condition='random', legend=False) 43 | sns.tsplot(time=range(n_itr), data=oracle_data[:,:n_itr], color='r', linestyle='-.', marker='v', condition='oracle', legend=False) 44 | ax = fig.gca() 45 | #ax.set(yscale='symlog') 46 | 47 | #plt.ylim([-100,-2.0]) 48 | 49 | plt.xlabel('number of gradient steps', fontsize=26) 50 | plt.ylabel('average return', fontsize=26) 51 | #lgd=plt.legend(['MAML (ours)', 'pretrained', 'random', 'oracle'], loc=0, bbox_to_anchor=(1, 0.5), fontsize=20) 52 | plt.title('ant, forward/backward', fontsize=26) 53 | #plt.ylim([-0.04, 3.5]) 54 | plt.tight_layout() 55 | 56 | ax = plt.gca() 57 | plt.setp(ax.get_xticklabels(), fontsize=18) 58 | plt.setp(ax.get_yticklabels(), fontsize=18) 59 | plt.xticks(np.arange(0,4,1.0)) 60 | plt.savefig('antdirec_results.png', bbox_inches='tight') 61 | -------------------------------------------------------------------------------- /rllab/config.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import os 3 | 4 | PROJECT_PATH = osp.abspath(osp.join(osp.dirname(__file__), '..')) 5 | 6 | LOG_DIR = PROJECT_PATH + "/data" 7 | 8 | USE_TF = False 9 | 10 | DOCKER_IMAGE = "DOCKER_IMAGE" 11 | 12 | DOCKERFILE_PATH = "/path/to/Dockerfile" 13 | 14 | KUBE_PREFIX = "rllab_" 15 | 16 | DOCKER_LOG_DIR = "/tmp/expt" 17 | 18 | POD_DIR = PROJECT_PATH + "/.pods" 19 | 20 | AWS_S3_PATH = None 21 | 22 | AWS_IMAGE_ID = None 23 | 24 | AWS_INSTANCE_TYPE = "m4.xlarge" 25 | 26 | AWS_KEY_NAME = "AWS_KEY_NAME" 27 | 28 | AWS_SPOT = True 29 | 30 | AWS_SPOT_PRICE = '1.0' 31 | 32 | AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY", None) 33 | 34 | AWS_ACCESS_SECRET = os.environ.get("AWS_ACCESS_SECRET", None) 35 | 36 | AWS_IAM_INSTANCE_PROFILE_NAME = "rllab" 37 | 38 | AWS_SECURITY_GROUPS = ["rllab"] 39 | 40 | AWS_SECURITY_GROUP_IDS = [] 41 | 42 | AWS_NETWORK_INTERFACES = [] 43 | 44 | AWS_EXTRA_CONFIGS = dict() 45 | 46 | AWS_REGION_NAME = "us-east-1" 47 | 48 | CODE_SYNC_IGNORES = ["*.git/*", "*data/*", "*.pod/*"] 49 | 50 | DOCKER_CODE_DIR = "/root/code/rllab" 51 | 52 | AWS_CODE_SYNC_S3_PATH = "s3://to/be/overriden/in/personal" 53 | 54 | # whether to use fast code sync 55 | FAST_CODE_SYNC = True 56 | 57 | FAST_CODE_SYNC_IGNORES = [".git", "data", ".pods"] 58 | 59 | KUBE_DEFAULT_RESOURCES = { 60 | "requests": { 61 | "cpu": 0.8, 62 | } 63 | } 64 | 65 | KUBE_DEFAULT_NODE_SELECTOR = { 66 | "aws/type": "m4.xlarge", 67 | } 68 | 69 | MUJOCO_KEY_PATH = osp.expanduser("~/.mujoco") 70 | 71 | ENV = 
{} 72 | 73 | EBS_OPTIMIZED = True 74 | 75 | if osp.exists(osp.join(osp.dirname(__file__), "config_personal.py")): 76 | from .config_personal import * 77 | else: 78 | print("Creating your personal config from template...") 79 | from shutil import copy 80 | copy(osp.join(PROJECT_PATH, "rllab/config_personal_template.py"), osp.join(PROJECT_PATH, "rllab/config_personal.py")) 81 | from .config_personal import * 82 | print("Personal config created, but you should probably edit it before further experiments " \ 83 | "are run") 84 | if 'CIRCLECI' not in os.environ: 85 | print("Exiting.") 86 | import sys; sys.exit(0) 87 | 88 | LABEL = "" 89 | -------------------------------------------------------------------------------- /icml/make_cheetahdirec_plots.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | #names = ['maml','sens0','random','oracle'] 8 | 9 | prefix = 'icml_cheetahdirec_results_' 10 | oracle_pkl = prefix+'oracle.pkl' 11 | 12 | maml_pkl = prefix+'maml.pkl' 13 | pretrain_pkl = prefix+'pretrain.pkl' 14 | random_pkl = prefix+'random.pkl' 15 | 16 | key = 'task_avg_returns' 17 | 18 | n_itr = 4 19 | 20 | with open(oracle_pkl, 'rb') as f: 21 | oracle_data = np.array(pickle.load(f)[key])[0] 22 | 23 | 24 | oracle_data = np.reshape(oracle_data, [-1, 1]) 25 | oracle_data = np.tile(oracle_data[:,0:1], [1,n_itr]) 26 | 27 | fig = plt.figure() 28 | plt.clf() 29 | 30 | with open(maml_pkl, 'rb') as maml_f: 31 | maml_data = np.array(pickle.load(maml_f)[key]).T[:,:n_itr] 32 | 33 | with open(pretrain_pkl, 'rb') as f: 34 | pretrain_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 35 | 36 | with open(random_pkl, 'rb') as f: 37 | random_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 38 | 39 | 40 | sns.tsplot(time=range(n_itr), data=maml_data[:,:n_itr], color='g', linestyle='-', marker='o', condition='MAML (ours)', legend=False) 41 | sns.tsplot(time=range(n_itr), data=pretrain_data[:,:n_itr], color='b', linestyle='--', marker='s', condition='pretrained', legend=False) 42 | sns.tsplot(time=range(n_itr), data=random_data[:,:n_itr], color='k', linestyle=':', marker='^', condition='random', legend=False) 43 | sns.tsplot(time=range(n_itr), data=oracle_data[:,:n_itr], color='r', linestyle='-.', marker='v', condition='oracle', legend=False) 44 | ax = fig.gca() 45 | #ax.set(yscale='symlog') 46 | 47 | #plt.ylim([-100,-2.0]) 48 | 49 | plt.xlabel('number of gradient steps', fontsize=26) 50 | plt.ylabel('average return', fontsize=26) 51 | #lgd=plt.legend(['MAML (ours)', 'pretrained', 'random', 'oracle'], loc=0, bbox_to_anchor=(1, 0.5), fontsize=20) 52 | plt.title('half-cheetah, forward/backward', fontsize=26) 53 | #plt.ylim([-0.04, 3.5]) 54 | plt.tight_layout() 55 | 56 | ax = plt.gca() 57 | plt.setp(ax.get_xticklabels(), fontsize=18) 58 | plt.setp(ax.get_yticklabels(), fontsize=18) 59 | plt.xticks(np.arange(0,4,1.0)) 60 | plt.savefig('cheetahdirec_results.png', bbox_inches='tight') 61 | -------------------------------------------------------------------------------- /rllab/envs/box2d/mountain_car_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pygame 3 | from rllab.envs.box2d.parser import find_body 4 | 5 | from rllab.core.serializable import Serializable 6 | from rllab.envs.box2d.box2d_env import Box2DEnv 7 | from rllab.misc import autoargs 8 | from rllab.misc.overrides import overrides 9 | 
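# Box2D mountain-car task: each step yields a reward of -1 plus a height bonus
# (height_bonus * cart height), and the episode ends once the cart reaches
# goal_cart_pos or leaves the +/- max_cart_pos range (see compute_reward and
# is_current_done below).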
10 | 11 | class MountainCarEnv(Box2DEnv, Serializable): 12 | 13 | @autoargs.inherit(Box2DEnv.__init__) 14 | @autoargs.arg("height_bonus_coeff", type=float, 15 | help="Height bonus added to each step's reward") 16 | @autoargs.arg("goal_cart_pos", type=float, 17 | help="Goal horizontal position") 18 | def __init__(self, 19 | height_bonus=1., 20 | goal_cart_pos=0.6, 21 | *args, **kwargs): 22 | super(MountainCarEnv, self).__init__( 23 | self.model_path("mountain_car.xml.mako"), 24 | *args, **kwargs 25 | ) 26 | self.max_cart_pos = 2 27 | self.goal_cart_pos = goal_cart_pos 28 | self.height_bonus = height_bonus 29 | self.cart = find_body(self.world, "cart") 30 | Serializable.quick_init(self, locals()) 31 | 32 | @overrides 33 | def compute_reward(self, action): 34 | yield 35 | yield (-1 + self.height_bonus * self.cart.position[1]) 36 | 37 | @overrides 38 | def is_current_done(self): 39 | return self.cart.position[0] >= self.goal_cart_pos \ 40 | or abs(self.cart.position[0]) >= self.max_cart_pos 41 | 42 | @overrides 43 | def reset(self): 44 | self._set_state(self.initial_state) 45 | self._invalidate_state_caches() 46 | bounds = np.array([ 47 | [-1], 48 | [1], 49 | ]) 50 | low, high = bounds 51 | xvel = np.random.uniform(low, high) 52 | self.cart.linearVelocity = (xvel, self.cart.linearVelocity[1]) 53 | return self.get_current_obs() 54 | 55 | @overrides 56 | def action_from_keys(self, keys): 57 | if keys[pygame.K_LEFT]: 58 | return np.asarray([-1]) 59 | elif keys[pygame.K_RIGHT]: 60 | return np.asarray([+1]) 61 | else: 62 | return np.asarray([0]) 63 | 64 | -------------------------------------------------------------------------------- /rllab/envs/box2d/models/car_parking.xml.rb: -------------------------------------------------------------------------------- 1 | car_height = 1.0 2 | car_width = 0.6 3 | car_mass = 1 4 | car_density = car_mass / car_height / car_width 5 | 6 | wheel_height = 0.3 7 | wheel_width = 0.1 8 | wheel_mass = 0.1 9 | wheel_density = wheel_mass / wheel_height / wheel_width 10 | wheel_max_deg = 30 11 | 12 | phantom_group = -1 13 | common = { group: phantom_group } 14 | 15 | box2d { 16 | world(timestep: 0.05, gravity: [0, 0]) { 17 | body(name: :goal, type: :static, position: [0, 0]) { 18 | fixture(common.merge(shape: :circle, radius: 1)) 19 | } 20 | 21 | car_pos = [3, 4] 22 | body(name: :car, type: :dynamic, position: car_pos) { 23 | rect( 24 | box: [car_width / 2, car_height / 2], 25 | density: car_density, 26 | group: phantom_group, 27 | ) 28 | } 29 | [:left_front_wheel, :right_front_wheel, :left_rear_wheel, :right_rear_wheel].each do |wheel| 30 | x_pos = car_width / 2 31 | x_pos *= wheel =~ /left/ ? -1 : 1 32 | y_pos = wheel =~ /front/ ? 0.2 : -0.3 33 | body(name: wheel, type: :dynamic, position: [car_pos[0] + x_pos, car_pos[1] + y_pos]) { 34 | rect( 35 | box: [wheel_width / 2, wheel_height / 2], 36 | density: wheel_density, 37 | group: phantom_group, 38 | ) 39 | } 40 | # limit = wheel =~ /front/ ? 
[-wheel_max_deg, wheel_max_deg] : [0, 0] 41 | limit = [0, 0] 42 | joint( 43 | type: :revolute, 44 | name: "#{wheel}_joint", 45 | bodyA: :car, 46 | bodyB: wheel, 47 | localAnchorA: [x_pos, y_pos], 48 | localAnchorB: [0, 0], 49 | limit: limit, 50 | ) 51 | end 52 | control( 53 | type: :force, 54 | bodies: [:left_front_wheel, :right_front_wheel], 55 | anchor: [0, 0], 56 | direction: [0, 1], 57 | ctrllimit: [-10.N, 10.N], 58 | ) 59 | state body: :car, type: :xvel 60 | state body: :car, type: :yvel 61 | state body: :car, type: :dist, to: :goal 62 | state body: :car, type: :angle, to: :goal, transform: :cos 63 | state body: :car, type: :angle, to: :goal, transform: :sin 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /rllab/sampler/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rllab.misc import tensor_utils 3 | import time 4 | 5 | 6 | def rollout(env, agent, max_path_length=np.inf, animated=False, speedup=1, save_video=True, video_filename='sim_out.mp4', reset_arg=None): 7 | observations = [] 8 | actions = [] 9 | rewards = [] 10 | agent_infos = [] 11 | env_infos = [] 12 | images = [] 13 | o = env.reset(reset_args=reset_arg) 14 | agent.reset() 15 | path_length = 0 16 | if animated: 17 | env.render() 18 | while path_length < max_path_length: 19 | a, agent_info = agent.get_action(o) 20 | next_o, r, d, env_info = env.step(a) 21 | observations.append(env.observation_space.flatten(o)) 22 | rewards.append(r) 23 | actions.append(env.action_space.flatten(a)) 24 | agent_infos.append(agent_info) 25 | env_infos.append(env_info) 26 | path_length += 1 27 | if d: # and not animated: # TODO testing 28 | break 29 | o = next_o 30 | if animated: 31 | env.render() 32 | timestep = 0.05 33 | time.sleep(timestep / speedup) 34 | if save_video: 35 | from PIL import Image 36 | image = env.wrapped_env.wrapped_env.get_viewer().get_image() 37 | pil_image = Image.frombytes('RGB', (image[1], image[2]), image[0]) 38 | images.append(np.flipud(np.array(pil_image))) 39 | 40 | if animated: 41 | if save_video and len(images) >= max_path_length: 42 | import moviepy.editor as mpy 43 | clip = mpy.ImageSequenceClip(images, fps=20*speedup) 44 | if video_filename[-3:] == 'gif': 45 | clip.write_gif(video_filename, fps=20*speedup) 46 | else: 47 | clip.write_videofile(video_filename, fps=20*speedup) 48 | #return 49 | 50 | return dict( 51 | observations=tensor_utils.stack_tensor_list(observations), 52 | actions=tensor_utils.stack_tensor_list(actions), 53 | rewards=tensor_utils.stack_tensor_list(rewards), 54 | agent_infos=tensor_utils.stack_tensor_dict_list(agent_infos), 55 | env_infos=tensor_utils.stack_tensor_dict_list(env_infos), 56 | ) 57 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/half_cheetah_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.envs.base import Step 5 | from rllab.envs.mujoco.mujoco_env import MujocoEnv 6 | from rllab.misc import logger 7 | from rllab.misc.overrides import overrides 8 | 9 | 10 | def smooth_abs(x, param): 11 | return np.sqrt(np.square(x) + np.square(param)) - param 12 | 13 | 14 | class HalfCheetahEnv(MujocoEnv, Serializable): 15 | 16 | FILE = 'half_cheetah.xml' 17 | 18 | def __init__(self, *args, **kwargs): 19 | super(HalfCheetahEnv, self).__init__(*args, **kwargs) 20 | 
Serializable.__init__(self, *args, **kwargs) 21 | 22 | def get_current_obs(self): 23 | return np.concatenate([ 24 | self.model.data.qpos.flatten()[1:], 25 | self.model.data.qvel.flat, 26 | self.get_body_com("torso").flat, 27 | ]) 28 | 29 | def get_body_xmat(self, body_name): 30 | idx = self.model.body_names.index(body_name) 31 | return self.model.data.xmat[idx].reshape((3, 3)) 32 | 33 | def get_body_com(self, body_name): 34 | idx = self.model.body_names.index(body_name) 35 | return self.model.data.com_subtree[idx] 36 | 37 | def step(self, action): 38 | self.forward_dynamics(action) 39 | next_obs = self.get_current_obs() 40 | action = np.clip(action, *self.action_bounds) 41 | ctrl_cost = 1e-1 * 0.5 * np.sum(np.square(action)) 42 | #run_cost = -1 * self.get_body_comvel("torso")[0] 43 | run_cost = 1.*np.abs(self.get_body_comvel("torso")[0] - 0.1) 44 | cost = ctrl_cost + run_cost 45 | reward = -cost 46 | done = False 47 | return Step(next_obs, reward, done) 48 | 49 | @overrides 50 | def log_diagnostics(self, paths, prefix=''): 51 | progs = [ 52 | path["observations"][-1][-3] - path["observations"][0][-3] 53 | for path in paths 54 | ] 55 | logger.record_tabular(prefix+'AverageForwardProgress', np.mean(progs)) 56 | logger.record_tabular(prefix+'MaxForwardProgress', np.max(progs)) 57 | logger.record_tabular(prefix+'MinForwardProgress', np.min(progs)) 58 | logger.record_tabular(prefix+'StdForwardProgress', np.std(progs)) 59 | -------------------------------------------------------------------------------- /rllab/regressors/product_regressor.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import numpy as np 5 | from rllab.core.serializable import Serializable 6 | 7 | 8 | class ProductRegressor(Serializable): 9 | """ 10 | A class for performing MLE regression by fitting a product distribution to the outputs. A separate regressor will 11 | be trained for each individual input distribution. 
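    Outputs are handled column-wise: ys is split according to each regressor's
    output_dim (see _split_ys) and predictions are concatenated back along axis 1.

    Hypothetical usage sketch (r1 and r2 stand in for regressors with output_dim
    2 and 3; they are not defined in this module):

        reg = ProductRegressor([r1, r2])
        reg.fit(xs, ys)          # ys has 5 columns: 0-1 go to r1, 2-4 go to r2
        y_hat = reg.predict(xs)  # shape (N, 5), per-regressor predictions concatenated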
12 | """ 13 | 14 | def __init__(self, regressors): 15 | """ 16 | :param regressors: List of individual regressors 17 | """ 18 | Serializable.quick_init(self, locals()) 19 | self.regressors = regressors 20 | self.output_dims = [x.output_dim for x in regressors] 21 | 22 | def _split_ys(self, ys): 23 | ys = np.asarray(ys) 24 | split_ids = np.cumsum(self.output_dims)[:-1] 25 | return np.split(ys, split_ids, axis=1) 26 | 27 | def fit(self, xs, ys): 28 | for regressor, split_ys in zip(self.regressors, self._split_ys(ys)): 29 | regressor.fit(xs, split_ys) 30 | 31 | def predict(self, xs): 32 | return np.concatenate([ 33 | regressor.predict(xs) for regressor in self.regressors 34 | ], axis=1) 35 | 36 | def sample_predict(self, xs): 37 | return np.concatenate([ 38 | regressor.sample_predict(xs) for regressor in self.regressors 39 | ], axis=1) 40 | 41 | def predict_log_likelihood(self, xs, ys): 42 | return np.sum([ 43 | regressor.predict_log_likelihood(xs, split_ys) 44 | for regressor, split_ys in zip(self.regressors, self._split_ys(ys)) 45 | ], axis=0) 46 | 47 | def get_param_values(self, **tags): 48 | return np.concatenate( 49 | [regressor.get_param_values(**tags) for regressor in self.regressors] 50 | ) 51 | 52 | def set_param_values(self, flattened_params, **tags): 53 | param_dims = [ 54 | np.prod(regressor.get_param_shapes(**tags)) 55 | for regressor in self.regressors 56 | ] 57 | split_ids = np.cumsum(param_dims)[:-1] 58 | for regressor, split_param_values in zip(self.regressors, np.split(flattened_params, split_ids)): 59 | regressor.set_param_values(split_param_values) 60 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/walker2d_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.envs.base import Step 5 | from rllab.envs.mujoco.mujoco_env import MujocoEnv 6 | from rllab.misc import autoargs 7 | from rllab.misc import logger 8 | from rllab.misc.overrides import overrides 9 | 10 | 11 | def smooth_abs(x, param): 12 | return np.sqrt(np.square(x) + np.square(param)) - param 13 | 14 | 15 | class Walker2DEnv(MujocoEnv, Serializable): 16 | 17 | FILE = 'walker2d.xml' 18 | 19 | @autoargs.arg('ctrl_cost_coeff', type=float, 20 | help='cost coefficient for controls') 21 | def __init__( 22 | self, 23 | ctrl_cost_coeff=1e-2, 24 | *args, **kwargs): 25 | self.ctrl_cost_coeff = ctrl_cost_coeff 26 | super(Walker2DEnv, self).__init__(*args, **kwargs) 27 | Serializable.quick_init(self, locals()) 28 | 29 | def get_current_obs(self): 30 | return np.concatenate([ 31 | self.model.data.qpos.flat, 32 | self.model.data.qvel.flat, 33 | self.get_body_com("torso").flat, 34 | ]) 35 | 36 | def step(self, action): 37 | self.forward_dynamics(action) 38 | next_obs = self.get_current_obs() 39 | action = np.clip(action, *self.action_bounds) 40 | lb, ub = self.action_bounds 41 | scaling = (ub - lb) * 0.5 42 | ctrl_cost = 0.5 * self.ctrl_cost_coeff * \ 43 | np.sum(np.square(action / scaling)) 44 | forward_reward = self.get_body_comvel("torso")[0] 45 | reward = forward_reward - ctrl_cost 46 | qpos = self.model.data.qpos 47 | done = not (qpos[0] > 0.8 and qpos[0] < 2.0 48 | and qpos[2] > -1.0 and qpos[2] < 1.0) 49 | return Step(next_obs, reward, done) 50 | 51 | @overrides 52 | def log_diagnostics(self, paths): 53 | progs = [ 54 | path["observations"][-1][-3] - path["observations"][0][-3] 55 | for path in paths 56 | ] 57 | 
logger.record_tabular('AverageForwardProgress', np.mean(progs)) 58 | logger.record_tabular('MaxForwardProgress', np.max(progs)) 59 | logger.record_tabular('MinForwardProgress', np.min(progs)) 60 | logger.record_tabular('StdForwardProgress', np.std(progs)) 61 | 62 | -------------------------------------------------------------------------------- /tests/test_instrument.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.misc import instrument 5 | from nose2.tools import such 6 | 7 | 8 | class TestClass(object): 9 | @property 10 | def arr(self): 11 | return [1, 2, 3] 12 | 13 | @property 14 | def compound_arr(self): 15 | return [dict(a=1)] 16 | 17 | 18 | with such.A("instrument") as it: 19 | @it.should 20 | def test_concretize(): 21 | it.assertEqual(instrument.concretize([5]), [5]) 22 | it.assertEqual(instrument.concretize((5,)), (5,)) 23 | fake_globals = dict(TestClass=TestClass) 24 | instrument.stub(fake_globals) 25 | modified = fake_globals["TestClass"] 26 | it.assertIsInstance(modified, instrument.StubClass) 27 | it.assertIsInstance(modified(), instrument.StubObject) 28 | it.assertEqual(instrument.concretize((5,)), (5,)) 29 | it.assertIsInstance(instrument.concretize(modified()), TestClass) 30 | 31 | 32 | @it.should 33 | def test_chained_call(): 34 | fake_globals = dict(TestClass=TestClass) 35 | instrument.stub(fake_globals) 36 | modified = fake_globals["TestClass"] 37 | it.assertIsInstance(modified().arr[0], instrument.StubMethodCall) 38 | it.assertIsInstance(modified().compound_arr[0]["a"], instrument.StubMethodCall) 39 | it.assertEqual(instrument.concretize(modified().arr[0]), 1) 40 | 41 | 42 | @it.should 43 | def test_variant_generator(): 44 | 45 | vg = instrument.VariantGenerator() 46 | vg.add("key1", [1, 2, 3]) 47 | vg.add("key2", [True, False]) 48 | vg.add("key3", lambda key2: [1] if key2 else [1, 2]) 49 | it.assertEqual(len(vg.variants()), 9) 50 | 51 | class VG(instrument.VariantGenerator): 52 | 53 | @instrument.variant 54 | def key1(self): 55 | return [1, 2, 3] 56 | 57 | @instrument.variant 58 | def key2(self): 59 | yield True 60 | yield False 61 | 62 | @instrument.variant 63 | def key3(self, key2): 64 | if key2: 65 | yield 1 66 | else: 67 | yield 1 68 | yield 2 69 | 70 | it.assertEqual(len(VG().variants()), 9) 71 | 72 | it.createTests(globals()) 73 | -------------------------------------------------------------------------------- /icml/make_cheetah_plots.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | #names = ['maml','sens0','random','oracle'] 8 | 9 | prefix = 'icml_cheetah_results_' 10 | oracle_pkl = prefix+'oracle.pkl' 11 | 12 | maml_pkl = prefix+'maml.pkl' 13 | pretrain_pkl = prefix+'pretrain.pkl' 14 | random_pkl = prefix+'random.pkl' 15 | 16 | key = 'task_avg_returns' 17 | 18 | n_itr = 4 19 | 20 | with open(oracle_pkl, 'rb') as f: 21 | oracle_data = np.array(pickle.load(f)[key])[0] 22 | 23 | 24 | oracle_data = np.reshape(oracle_data, [-1, 1]) 25 | oracle_data = np.tile(oracle_data[:,0:1], [1,n_itr]) 26 | 27 | fig = plt.figure() 28 | plt.clf() 29 | 30 | with open(maml_pkl, 'rb') as maml_f: 31 | maml_data = np.array(pickle.load(maml_f)[key]).T[:,:n_itr] 32 | 33 | with open(pretrain_pkl, 'rb') as f: 34 | pretrain_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 35 | 36 | with open(random_pkl, 'rb') as f: 37 | random_data = 
np.array(pickle.load(f)[key]).T[:,:n_itr] 38 | 39 | legend=False 40 | sns.tsplot(time=range(n_itr), data=maml_data[:,:n_itr], color='g', linestyle='-', marker='o', condition='MAML (ours)', legend=legend) 41 | sns.tsplot(time=range(n_itr), data=pretrain_data[:,:n_itr], color='b', linestyle='--', marker='s', condition='pretrained', legend=legend) 42 | #sns.tsplot(time=range(n_itr), data=random_data[:,:n_itr], color='k', linestyle=':', marker='^', condition='random', legend=legend) 43 | sns.tsplot(time=range(n_itr), data=oracle_data[:,:n_itr], color='r', linestyle='-.', marker='v', condition='oracle', legend=legend) 44 | ax = fig.gca() 45 | 46 | plt.xlabel('number of gradient steps', fontsize=26) 47 | plt.ylabel('average return', fontsize=26) 48 | if legend: 49 | lgd=plt.legend(['MAML (ours)', 'pretrained', 'random', 'oracle'], loc=0, bbox_to_anchor=(1, 0.5), fontsize=20) 50 | plt.title('half-cheetah, goal velocity', fontsize=26) 51 | #plt.ylim([-0.04, 3.5]) 52 | plt.tight_layout() 53 | 54 | ax = plt.gca() 55 | plt.setp(ax.get_xticklabels(), fontsize=18) 56 | plt.setp(ax.get_yticklabels(), fontsize=18) 57 | plt.xticks(np.arange(0,4,1.0)) 58 | 59 | if legend: 60 | plt.savefig('cheetah_results.png', bbox_extra_artists=(lgd,), transparent=True, bbox_inches='tight') 61 | else: 62 | plt.savefig('cheetah_results.png', bbox_inches='tight') 63 | 64 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/inverted_double_pendulum_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from rllab.core.serializable import Serializable 4 | from rllab.envs.base import Step 5 | from rllab.envs.mujoco.mujoco_env import MujocoEnv 6 | from rllab.misc import autoargs 7 | from rllab.misc.overrides import overrides 8 | 9 | 10 | class InvertedDoublePendulumEnv(MujocoEnv, Serializable): 11 | FILE = 'inverted_double_pendulum.xml.mako' 12 | 13 | @autoargs.arg("random_start", type=bool, 14 | help="Randomized starting position by adjusting the angles" 15 | "When this is false, the double pendulum started out" 16 | "in balanced position") 17 | def __init__( 18 | self, 19 | *args, **kwargs): 20 | self.random_start = kwargs.get("random_start", True) 21 | super(InvertedDoublePendulumEnv, self).__init__(*args, **kwargs) 22 | Serializable.quick_init(self, locals()) 23 | 24 | @overrides 25 | def get_current_obs(self): 26 | return np.concatenate([ 27 | self.model.data.qpos[:1], # cart x pos 28 | np.sin(self.model.data.qpos[1:]), # link angles 29 | np.cos(self.model.data.qpos[1:]), 30 | np.clip(self.model.data.qvel, -10, 10), 31 | np.clip(self.model.data.qfrc_constraint, -10, 10) 32 | ]).reshape(-1) 33 | 34 | @overrides 35 | def step(self, action): 36 | self.forward_dynamics(action) 37 | next_obs = self.get_current_obs() 38 | x, _, y = self.model.data.site_xpos[0] 39 | dist_penalty = 0.01 * x ** 2 + (y - 2) ** 2 40 | v1, v2 = self.model.data.qvel[1:3] 41 | vel_penalty = 1e-3 * v1 ** 2 + 5e-3 * v2 ** 2 42 | alive_bonus = 10 43 | r = float(alive_bonus - dist_penalty - vel_penalty) 44 | done = y <= 1 45 | return Step(next_obs, r, done) 46 | 47 | @overrides 48 | def reset_mujoco(self, init_state=None): 49 | assert init_state is None 50 | qpos = np.copy(self.init_qpos) 51 | if self.random_start: 52 | qpos[1] = (np.random.rand() - 0.5) * 40 / 180. 
* np.pi 53 | self.model.data.qpos = qpos 54 | self.model.data.qvel = self.init_qvel 55 | self.model.data.qacc = self.init_qacc 56 | self.model.data.ctrl = self.init_ctrl 57 | -------------------------------------------------------------------------------- /rllab/misc/resolve.py: -------------------------------------------------------------------------------- 1 | from pydoc import locate 2 | import types 3 | from rllab.misc.ext import iscanr 4 | 5 | 6 | def classesinmodule(module): 7 | md = module.__dict__ 8 | return [ 9 | md[c] for c in md if ( 10 | isinstance(md[c], type) and md[c].__module__ == module.__name__ 11 | ) 12 | ] 13 | 14 | 15 | def locate_with_hint(class_path, prefix_hints=[]): 16 | module_or_class = locate(class_path) 17 | if module_or_class is None: 18 | # for hint in iscanr(lambda x, y: x + "." + y, prefix_hints): 19 | # module_or_class = locate(hint + "." + class_path) 20 | # if module_or_class: 21 | # break 22 | hint = ".".join(prefix_hints) 23 | module_or_class = locate(hint + "." + class_path) 24 | return module_or_class 25 | 26 | 27 | def load_class(class_path, superclass=None, prefix_hints=[]): 28 | module_or_class = locate_with_hint(class_path, prefix_hints) 29 | if module_or_class is None: 30 | raise ValueError("Cannot find module or class under path %s" % class_path) 31 | if type(module_or_class) == types.ModuleType: 32 | classes = classesinmodule(module_or_class) 33 | classes = [x for x in classes if superclass is None or issubclass(x, superclass)] 34 | if len(classes) == 0: 35 | if superclass: 36 | raise ValueError('Could not find any subclasses of %s defined in module %s' % (str(superclass), class_path)) 37 | else: 38 | raise ValueError('Could not find any classes defined in module %s' % (class_path)) 39 | elif len(classes) > 1: 40 | if superclass: 41 | raise ValueError('Multiple subclasses of %s are defined in the module %s' % (str(superclass), class_path)) 42 | else: 43 | raise ValueError('Multiple classes are defined in the module %s' % (class_path)) 44 | else: 45 | return classes[0] 46 | elif isinstance(module_or_class, type): 47 | if superclass is None or issubclass(module_or_class, superclass): 48 | return module_or_class 49 | else: 50 | raise ValueError('The class %s is not a subclass of %s' % (str(module_or_class), str(superclass))) 51 | else: 52 | raise ValueError('Unsupported object: %s' % str(module_or_class)) 53 | -------------------------------------------------------------------------------- /icml/make_ant_plots.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import glob 3 | import numpy as np 4 | import matplotlib.pyplot as plt 5 | import seaborn as sns 6 | 7 | #names = ['maml','sens0','random','oracle'] 8 | 9 | prefix = 'icml_ant_results_' 10 | oracle_pkl = prefix+'oracle.pkl' 11 | 12 | maml_pkl = prefix+'maml.pkl' 13 | pretrain_pkl = prefix+'pretrain.pkl' 14 | random_pkl = prefix+'random.pkl' 15 | 16 | key = 'task_avg_returns' 17 | 18 | n_itr = 4 19 | 20 | with open(oracle_pkl, 'rb') as f: 21 | oracle_data = np.array(pickle.load(f)[key])[0] 22 | 23 | 24 | oracle_data = np.reshape(oracle_data, [-1, 1]) 25 | oracle_data = np.tile(oracle_data[:,0:1], [1,n_itr]) 26 | 27 | fig = plt.figure() 28 | plt.clf() 29 | 30 | with open(maml_pkl, 'rb') as maml_f: 31 | maml_data = np.array(pickle.load(maml_f)[key]).T[:,:n_itr] 32 | 33 | with open(pretrain_pkl, 'rb') as f: 34 | pretrain_data = np.array(pickle.load(f)[key]).T[:,:n_itr] 35 | 36 | with open(random_pkl, 'rb') as f: 37 | random_data =
np.array(pickle.load(f)[key]).T[:,:n_itr] 38 | 39 | 40 | legend=False 41 | sns.tsplot(time=range(n_itr), data=maml_data[:,:n_itr], color='g', linestyle='-', marker='o', condition='MAML (ours)', legend=legend) 42 | sns.tsplot(time=range(n_itr), data=pretrain_data[:,:n_itr], color='b', linestyle='--', marker='s', condition='pretrained', legend=legend) 43 | #sns.tsplot(time=range(n_itr), data=random_data[:,:n_itr], color='k', linestyle=':', marker='^', condition='random', legend=legend) 44 | sns.tsplot(time=range(n_itr), data=oracle_data[:,:n_itr], color='r', linestyle='-.', marker='v', condition='oracle', legend=legend) 45 | ax = fig.gca() 46 | 47 | plt.xlabel('number of gradient steps', fontsize=26) 48 | plt.ylabel('average return', fontsize=26) 49 | if legend: 50 | lgd=plt.legend(['MAML (ours)', 'pretrained', 'random', 'oracle'], loc=0, bbox_to_anchor=(1, 0.5), fontsize=20) 51 | plt.title('ant, goal velocity', fontsize=26) 52 | #plt.ylim([-0.04, 3.5]) 53 | plt.tight_layout() 54 | #ax.set(yscale='symlog') 55 | 56 | ax = plt.gca() 57 | plt.setp(ax.get_xticklabels(), fontsize=18) 58 | plt.setp(ax.get_yticklabels(), fontsize=18) 59 | plt.xticks(np.arange(0,4,1.0)) 60 | 61 | if legend: 62 | plt.savefig('ant_results.png', bbox_extra_artists=(lgd,), transparent=True, bbox_inches='tight') 63 | else: 64 | plt.savefig('ant_results.png', bbox_inches='tight') 65 | 66 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/distributions/bernoulli.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | from .base import Distribution 4 | import tensorflow as tf 5 | import numpy as np 6 | 7 | TINY = 1e-8 8 | 9 | 10 | class Bernoulli(Distribution): 11 | def __init__(self, dim): 12 | self._dim = dim 13 | 14 | @property 15 | def dim(self): 16 | return self._dim 17 | 18 | def kl_sym(self, old_dist_info_vars, new_dist_info_vars): 19 | old_p = old_dist_info_vars["p"] 20 | new_p = new_dist_info_vars["p"] 21 | kl = old_p * (tf.log(old_p + TINY) - tf.log(new_p + TINY)) + \ 22 | (1 - old_p) * (tf.log(1 - old_p + TINY) - tf.log(1 - new_p + TINY)) 23 | ndims = kl.get_shape().ndims 24 | return tf.reduce_sum(kl, reduction_indices=ndims - 1) 25 | 26 | def kl(self, old_dist_info, new_dist_info): 27 | old_p = old_dist_info["p"] 28 | new_p = new_dist_info["p"] 29 | kl = old_p * (np.log(old_p + TINY) - np.log(new_p + TINY)) + \ 30 | (1 - old_p) * (np.log(1 - old_p + TINY) - np.log(1 - new_p + TINY)) 31 | return np.sum(kl, axis=-1) 32 | 33 | def sample(self, dist_info): 34 | p = np.asarray(dist_info["p"]) 35 | return np.cast['int'](np.random.uniform(low=0., high=1., size=p.shape) < p) 36 | 37 | def likelihood_ratio_sym(self, x_var, old_dist_info_vars, new_dist_info_vars): 38 | old_p = old_dist_info_vars["p"] 39 | new_p = new_dist_info_vars["p"] 40 | ndims = old_p.get_shape().ndims 41 | return tf.reduce_prod(x_var * new_p / (old_p + TINY) + (1 - x_var) * (1 - new_p) / (1 - old_p + TINY), 42 | reduction_indices=ndims - 1) 43 | 44 | def log_likelihood_sym(self, x_var, dist_info_vars): 45 | p = dist_info_vars["p"] 46 | ndims = p.get_shape().ndims 47 | return tf.reduce_sum(x_var * tf.log(p + TINY) + (1 - x_var) * tf.log(1 - p + TINY), reduction_indices=ndims - 1) 48 | 49 | def log_likelihood(self, xs, dist_info): 50 | p = dist_info["p"] 51 | return np.sum(xs * np.log(p + TINY) + (1 - xs) * np.log(1 - p + TINY), axis=-1) 52 | 53 | def entropy(self, dist_info): 54 | p = dist_info["p"] 55 | return np.sum(- p * np.log(p + TINY) - (1 - p) * np.log(1 
- p + TINY), axis=-1) 56 | 57 | @property 58 | def dist_info_keys(self): 59 | return ["p"] 60 | -------------------------------------------------------------------------------- /rllab/envs/box2d/double_pendulum_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from rllab.envs.box2d.parser import find_body 3 | 4 | from rllab.core.serializable import Serializable 5 | from rllab.envs.box2d.box2d_env import Box2DEnv 6 | from rllab.misc import autoargs 7 | from rllab.misc.overrides import overrides 8 | 9 | 10 | # http://mlg.eng.cam.ac.uk/pilco/ 11 | class DoublePendulumEnv(Box2DEnv, Serializable): 12 | 13 | @autoargs.inherit(Box2DEnv.__init__) 14 | def __init__(self, *args, **kwargs): 15 | # make sure mdp-level step is 100ms long 16 | kwargs["frame_skip"] = kwargs.get("frame_skip", 2) 17 | if kwargs.get("template_args", {}).get("noise", False): 18 | self.link_len = (np.random.rand()-0.5) + 1 19 | else: 20 | self.link_len = 1 21 | kwargs["template_args"] = kwargs.get("template_args", {}) 22 | kwargs["template_args"]["link_len"] = self.link_len 23 | super(DoublePendulumEnv, self).__init__( 24 | self.model_path("double_pendulum.xml.mako"), 25 | *args, **kwargs 26 | ) 27 | self.link1 = find_body(self.world, "link1") 28 | self.link2 = find_body(self.world, "link2") 29 | Serializable.__init__(self, *args, **kwargs) 30 | 31 | @overrides 32 | def reset(self): 33 | self._set_state(self.initial_state) 34 | self._invalidate_state_caches() 35 | stds = np.array([0.1, 0.1, 0.01, 0.01]) 36 | pos1, pos2, v1, v2 = np.random.randn(*stds.shape) * stds 37 | self.link1.angle = pos1 38 | self.link2.angle = pos2 39 | self.link1.angularVelocity = v1 40 | self.link2.angularVelocity = v2 41 | return self.get_current_obs() 42 | 43 | def get_tip_pos(self): 44 | cur_center_pos = self.link2.position 45 | cur_angle = self.link2.angle 46 | cur_pos = ( 47 | cur_center_pos[0] - self.link_len*np.sin(cur_angle), 48 | cur_center_pos[1] - self.link_len*np.cos(cur_angle) 49 | ) 50 | return cur_pos 51 | 52 | @overrides 53 | def compute_reward(self, action): 54 | yield 55 | tgt_pos = np.asarray([0, self.link_len * 2]) 56 | cur_pos = self.get_tip_pos() 57 | dist = np.linalg.norm(cur_pos - tgt_pos) 58 | yield -dist 59 | 60 | def is_current_done(self): 61 | return False 62 | 63 | -------------------------------------------------------------------------------- /docs/user/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | 4 | ============ 5 | Installation 6 | ============ 7 | 8 | Preparation 9 | =========== 10 | 11 | You need to edit your :code:`PYTHONPATH` to include the rllab directory: 12 | 13 | .. code-block:: bash 14 | 15 | export PYTHONPATH=path_to_rllab:$PYTHONPATH 16 | 17 | Express Install 18 | =============== 19 | 20 | The fastest way to set up dependencies for rllab is via running the setup script. 21 | 22 | - On Linux, run the following: 23 | 24 | .. code-block:: bash 25 | 26 | ./scripts/setup_linux.sh 27 | 28 | - On Mac OS X, run the following: 29 | 30 | .. code-block:: bash 31 | 32 | ./scripts/setup_osx.sh 33 | 34 | The script sets up a conda environment, which is similar to :code:`virtualenv`. To start using it, run the following: 35 | 36 | .. code-block:: bash 37 | 38 | source activate rllab3 39 | 40 | 41 | Optionally, if you would like to run experiments that depends on the Mujoco environment, you can set it up by running the following command: 42 | 43 | .. 
code-block:: bash 44 | 45 | ./scripts/setup_mujoco.sh 46 | 47 | and follow the instructions. You need to have the zip file for Mujoco v1.31 and the license file ready. 48 | 49 | 50 | 51 | Manual Install 52 | ============== 53 | 54 | Anaconda 55 | ------------ 56 | 57 | :code:`rllab` assumes that you are using the Anaconda Python distribution. You can download it from `https://www.continuum.io/downloads`. Make sure to download the installer for Python 2.7. 58 | 59 | 60 | System dependencies for pygame 61 | ------------------------------ 62 | 63 | A few environments in rllab are implemented using Box2D, which uses pygame for visualization. 64 | It requires a few system dependencies to be installed first. 65 | 66 | On Linux, run the following: 67 | 68 | .. code-block:: bash 69 | 70 | sudo apt-get install swig 71 | sudo apt-get build-dep python-pygame 72 | 73 | On Mac OS X, run the following: 74 | 75 | .. code-block:: bash 76 | 77 | brew install swig sdl sdl_image sdl_mixer sdl_ttf portmidi 78 | 79 | System dependencies for scipy 80 | ----------------------------- 81 | 82 | This step is only needed under Linux: 83 | 84 | .. code-block:: bash 85 | 86 | sudo apt-get build-dep python-scipy 87 | 88 | Install Python modules 89 | ---------------------- 90 | 91 | .. code-block:: bash 92 | 93 | conda env create -f environment.yml 94 | -------------------------------------------------------------------------------- /rllab/spaces/box.py: -------------------------------------------------------------------------------- 1 | from rllab.core.serializable import Serializable 2 | from .base import Space 3 | import numpy as np 4 | from rllab.misc import ext 5 | import theano 6 | 7 | 8 | class Box(Space): 9 | """ 10 | A box in R^n. 11 | I.e., each coordinate is bounded. 12 | """ 13 | 14 | def __init__(self, low, high, shape=None): 15 | """ 16 | Two kinds of valid input: 17 | Box(-1.0, 1.0, (3,4)) # low and high are scalars, and shape is provided 18 | Box(np.array([-1.0,-2.0]), np.array([2.0,4.0])) # low and high are arrays of the same shape 19 | """ 20 | if shape is None: 21 | assert low.shape == high.shape 22 | self.low = low 23 | self.high = high 24 | else: 25 | assert np.isscalar(low) and np.isscalar(high) 26 | self.low = low + np.zeros(shape) 27 | self.high = high + np.zeros(shape) 28 | 29 | def sample(self): 30 | return np.random.uniform(low=self.low, high=self.high, size=self.low.shape) 31 | 32 | def contains(self, x): 33 | return x.shape == self.shape and (x >= self.low).all() and (x <= self.high).all() 34 | 35 | @property 36 | def shape(self): 37 | return self.low.shape 38 | 39 | @property 40 | def flat_dim(self): 41 | return np.prod(self.low.shape) 42 | 43 | @property 44 | def bounds(self): 45 | return self.low, self.high 46 | 47 | def flatten(self, x): 48 | return np.asarray(x).flatten() 49 | 50 | def unflatten(self, x): 51 | return np.asarray(x).reshape(self.shape) 52 | 53 | def flatten_n(self, xs): 54 | xs = np.asarray(xs) 55 | return xs.reshape((xs.shape[0], -1)) 56 | 57 | def unflatten_n(self, xs): 58 | xs = np.asarray(xs) 59 | return xs.reshape((xs.shape[0],) + self.shape) 60 | 61 | def __repr__(self): 62 | return "Box" + str(self.shape) 63 | 64 | def __eq__(self, other): 65 | return isinstance(other, Box) and np.allclose(self.low, other.low) and \ 66 | np.allclose(self.high, other.high) 67 | 68 | def __hash__(self): 69 | return hash((self.low.tobytes(), self.high.tobytes())) 70 | 71 | def new_tensor_variable(self, name, extra_dims): 72 | return ext.new_tensor( 73 | name=name, 74 | ndim=extra_dims+1, 75 |
dtype=theano.config.floatX 76 | ) 77 | 78 | -------------------------------------------------------------------------------- /rllab/exploration_strategies/ou_strategy.py: -------------------------------------------------------------------------------- 1 | from rllab.misc.overrides import overrides 2 | from rllab.misc.ext import AttrDict 3 | from rllab.core.serializable import Serializable 4 | from rllab.spaces.box import Box 5 | from rllab.exploration_strategies.base import ExplorationStrategy 6 | import numpy as np 7 | import numpy.random as nr 8 | 9 | 10 | class OUStrategy(ExplorationStrategy, Serializable): 11 | """ 12 | This strategy implements the Ornstein-Uhlenbeck process, which adds 13 | time-correlated noise to the actions taken by the deterministic policy. 14 | The OU process satisfies the following stochastic differential equation: 15 | dxt = theta*(mu - xt)*dt + sigma*dWt 16 | where Wt denotes the Wiener process 17 | """ 18 | 19 | def __init__(self, env_spec, mu=0, theta=0.15, sigma=0.3, **kwargs): 20 | assert isinstance(env_spec.action_space, Box) 21 | assert len(env_spec.action_space.shape) == 1 22 | Serializable.quick_init(self, locals()) 23 | self.mu = mu 24 | self.theta = theta 25 | self.sigma = sigma 26 | self.action_space = env_spec.action_space 27 | self.state = np.ones(self.action_space.flat_dim) * self.mu 28 | self.reset() 29 | 30 | def __getstate__(self): 31 | d = Serializable.__getstate__(self) 32 | d["state"] = self.state 33 | return d 34 | 35 | def __setstate__(self, d): 36 | Serializable.__setstate__(self, d) 37 | self.state = d["state"] 38 | 39 | @overrides 40 | def reset(self): 41 | self.state = np.ones(self.action_space.flat_dim) * self.mu 42 | 43 | def evolve_state(self): 44 | x = self.state 45 | dx = self.theta * (self.mu - x) + self.sigma * nr.randn(len(x)) 46 | self.state = x + dx 47 | return self.state 48 | 49 | @overrides 50 | def get_action(self, t, observation, policy, **kwargs): 51 | action, _ = policy.get_action(observation) 52 | ou_state = self.evolve_state() 53 | return np.clip(action + ou_state, self.action_space.low, self.action_space.high) 54 | 55 | 56 | if __name__ == "__main__": 57 | ou = OUStrategy(env_spec=AttrDict(action_space=Box(low=-1, high=1, shape=(1,))), mu=0, theta=0.15, sigma=0.3) 58 | states = [] 59 | for i in range(1000): 60 | states.append(ou.evolve_state()[0]) 61 | import matplotlib.pyplot as plt 62 | 63 | plt.plot(states) 64 | plt.show() 65 | -------------------------------------------------------------------------------- /rllab/policies/base.py: -------------------------------------------------------------------------------- 1 | from rllab.core.parameterized import Parameterized 2 | 3 | 4 | class Policy(Parameterized): 5 | def __init__(self, env_spec): 6 | Parameterized.__init__(self) 7 | self._env_spec = env_spec 8 | 9 | # Should be implemented by all policies 10 | 11 | def get_action(self, observation): 12 | raise NotImplementedError 13 | 14 | def reset(self): 15 | pass 16 | 17 | @property 18 | def observation_space(self): 19 | return self._env_spec.observation_space 20 | 21 | @property 22 | def action_space(self): 23 | return self._env_spec.action_space 24 | 25 | @property 26 | def recurrent(self): 27 | """ 28 | Indicates whether the policy is recurrent. 
29 | :return: 30 | """ 31 | return False 32 | 33 | def log_diagnostics(self, paths): 34 | """ 35 | Log extra information per iteration based on the collected paths 36 | """ 37 | pass 38 | 39 | @property 40 | def state_info_keys(self): 41 | """ 42 | Return keys for the information related to the policy's state when taking an action. 43 | :return: 44 | """ 45 | return list() 46 | 47 | def terminate(self): 48 | """ 49 | Clean up operation 50 | """ 51 | pass 52 | 53 | 54 | class StochasticPolicy(Policy): 55 | 56 | @property 57 | def distribution(self): 58 | """ 59 | :rtype Distribution 60 | """ 61 | raise NotImplementedError 62 | 63 | def dist_info_sym(self, obs_var, state_info_vars): 64 | """ 65 | Return the symbolic distribution information about the actions. 66 | :param obs_var: symbolic variable for observations 67 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at 68 | the time it received the observation 69 | :return: 70 | """ 71 | raise NotImplementedError 72 | 73 | def dist_info(self, obs, state_infos): 74 | """ 75 | Return the distribution information about the actions. 76 | :param obs_var: observation values 77 | :param state_info_vars: a dictionary whose values should contain information about the state of the policy at 78 | the time it received the observation 79 | :return: 80 | """ 81 | raise NotImplementedError 82 | -------------------------------------------------------------------------------- /examples/cluster_gym_mujoco_demo.py: -------------------------------------------------------------------------------- 1 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 2 | from rllab.envs.normalized_env import normalize 3 | from sandbox.rocky.tf.envs.base import TfEnv 4 | from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 5 | from sandbox.rocky.tf.algos.trpo import TRPO 6 | from rllab.misc.instrument import stub, run_experiment_lite 7 | #from rllab.envs.gym_env import GymEnv 8 | #from rllab.envs.mujoco.swimmer_randgoal_env import SwimmerRandGoalEnv 9 | from rllab.envs.mujoco.swimmer_randgoal_oracle_env import SwimmerRandGoalOracleEnv 10 | import sys 11 | 12 | stub(globals()) 13 | 14 | from rllab.misc.instrument import VariantGenerator, variant 15 | 16 | 17 | class VG(VariantGenerator): 18 | 19 | @variant 20 | def step_size(self): 21 | return [0.005,0.01,0.02] #, 0.05, 0.1] 22 | 23 | @variant 24 | def seed(self): 25 | return [2,3] #, 11, 21, 31, 41] 26 | 27 | variants = VG().variants() 28 | 29 | for v in variants: 30 | 31 | env = TfEnv(normalize(SwimmerRandGoalOracleEnv())) 32 | #env = TfEnv(normalize(GymEnv('HalfCheetah-v1', record_video=False, record_log=False))) 33 | 34 | policy = GaussianMLPPolicy( 35 | env_spec=env.spec, 36 | # The neural network policy should have two hidden layers, each with 32 hidden units. 
37 | hidden_sizes=(100, 100), 38 | name="policy" 39 | ) 40 | 41 | baseline = LinearFeatureBaseline(env_spec=env.spec) 42 | 43 | algo = TRPO( 44 | env=env, 45 | policy=policy, 46 | baseline=baseline, 47 | batch_size=10000, 48 | max_path_length=500, 49 | n_itr=500, 50 | discount=0.99, 51 | step_size=v["step_size"], 52 | # Uncomment both lines (this and the plot parameter below) to enable plotting 53 | # plot=True, 54 | ) 55 | 56 | run_experiment_lite( 57 | algo.train(), 58 | exp_prefix="trpo_swimmer_baselines", 59 | # Number of parallel workers for sampling 60 | n_parallel=1, 61 | # Only keep the snapshot parameters for the last iteration 62 | snapshot_mode="last", 63 | # Specifies the seed for the experiment. If this is not provided, a random seed 64 | # will be used 65 | seed=v["seed"], 66 | # mode="local", 67 | mode="ec2", 68 | variant=v, 69 | # plot=True, 70 | # terminate_machine=False, 71 | ) 72 | -------------------------------------------------------------------------------- /rllab/envs/box2d/cartpole_swingup_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pygame 3 | from rllab.envs.box2d.parser import find_body 4 | 5 | from rllab.core.serializable import Serializable 6 | from rllab.envs.box2d.box2d_env import Box2DEnv 7 | from rllab.misc import autoargs 8 | from rllab.misc.overrides import overrides 9 | 10 | 11 | # Tornio, Matti, and Tapani Raiko. "Variational Bayesian approach for 12 | # nonlinear identification and control." Proc. of the IFAC Workshop on 13 | # Nonlinear Model Predictive Control for Fast Systems, NMPC FS06. 2006. 14 | class CartpoleSwingupEnv(Box2DEnv, Serializable): 15 | 16 | @autoargs.inherit(Box2DEnv.__init__) 17 | def __init__(self, *args, **kwargs): 18 | super(CartpoleSwingupEnv, self).__init__( 19 | self.model_path("cartpole.xml.mako"), 20 | *args, **kwargs 21 | ) 22 | self.max_cart_pos = 3 23 | self.max_reward_cart_pos = 3 24 | self.cart = find_body(self.world, "cart") 25 | self.pole = find_body(self.world, "pole") 26 | Serializable.__init__(self, *args, **kwargs) 27 | 28 | @overrides 29 | def reset(self): 30 | self._set_state(self.initial_state) 31 | self._invalidate_state_caches() 32 | bounds = np.array([ 33 | [-1, -2, np.pi-1, -3], 34 | [1, 2, np.pi+1, 3], 35 | ]) 36 | low, high = bounds 37 | xpos, xvel, apos, avel = np.random.uniform(low, high) 38 | self.cart.position = (xpos, self.cart.position[1]) 39 | self.cart.linearVelocity = (xvel, self.cart.linearVelocity[1]) 40 | self.pole.angle = apos 41 | self.pole.angularVelocity = avel 42 | return self.get_current_obs() 43 | 44 | @overrides 45 | def compute_reward(self, action): 46 | yield 47 | if self.is_current_done(): 48 | yield -100 49 | else: 50 | if abs(self.cart.position[0]) > self.max_reward_cart_pos: 51 | yield -1 52 | else: 53 | yield np.cos(self.pole.angle) 54 | 55 | @overrides 56 | def is_current_done(self): 57 | return abs(self.cart.position[0]) > self.max_cart_pos 58 | 59 | @overrides 60 | def action_from_keys(self, keys): 61 | if keys[pygame.K_LEFT]: 62 | return np.asarray([-10]) 63 | elif keys[pygame.K_RIGHT]: 64 | return np.asarray([+10]) 65 | else: 66 | return np.asarray([0]) 67 | 68 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/swimmer_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Step 2 | from rllab.misc.overrides import overrides 3 | from .mujoco_env import MujocoEnv 4 | import numpy as 
np 5 | from rllab.core.serializable import Serializable 6 | from rllab.misc import logger 7 | from rllab.misc import autoargs 8 | 9 | 10 | class SwimmerEnv(MujocoEnv, Serializable): 11 | 12 | FILE = 'swimmer.xml' 13 | 14 | @autoargs.arg('ctrl_cost_coeff', type=float, 15 | help='cost coefficient for controls') 16 | def __init__( 17 | self, 18 | ctrl_cost_coeff=1e-2, 19 | *args, **kwargs): 20 | self.ctrl_cost_coeff = ctrl_cost_coeff 21 | super(SwimmerEnv, self).__init__(*args, **kwargs) 22 | Serializable.quick_init(self, locals()) 23 | 24 | def get_current_obs(self): 25 | return np.concatenate([ 26 | self.model.data.qpos.flat, 27 | self.model.data.qvel.flat, 28 | self.get_body_com("torso").flat, 29 | ]).reshape(-1) 30 | 31 | def step(self, action): 32 | self.forward_dynamics(action) 33 | next_obs = self.get_current_obs() 34 | lb, ub = self.action_bounds 35 | scaling = (ub - lb) * 0.5 36 | ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum( 37 | np.square(action / scaling)) 38 | forward_reward = self.get_body_comvel("torso")[0] 39 | #forward_reward = -1.5*np.abs(self.get_body_comvel("torso")[0] - 0.15) 40 | # max achievable vel is around 0.20 for vpg. 41 | reward = forward_reward - ctrl_cost 42 | done = False 43 | return Step(next_obs, reward, done) 44 | 45 | @overrides 46 | def log_diagnostics(self, paths, prefix=''): 47 | progs = [ 48 | path["observations"][-1][-3] - path["observations"][0][-3] 49 | for path in paths 50 | ] 51 | #if np.mean(progs) > 4.5: 52 | # import pdb; pdb.set_trace() 53 | #path = paths[0] 54 | #t = -10 55 | #lb, ub = self.action_bounds 56 | #scaling = (ub - lb) * 0.5 57 | #rew = path['rewards'][t] 58 | #act = path['actions'][t] 59 | #ctrl_cost = 0.5*self.ctrl_cost_coeff*np.sum(np.square(act/scaling)) 60 | 61 | logger.record_tabular('AverageForwardProgress', np.mean(progs)) 62 | logger.record_tabular('MaxForwardProgress', np.max(progs)) 63 | logger.record_tabular('MinForwardProgress', np.min(progs)) 64 | logger.record_tabular('StdForwardProgress', np.std(progs)) 65 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM ubuntu:16.04 2 | 3 | # ========== Anaconda ========== 4 | # https://github.com/ContinuumIO/docker-images/blob/master/anaconda/Dockerfile 5 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 6 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 7 | git mercurial subversion 8 | RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ 9 | wget --no-check-certificate --quiet https://repo.continuum.io/archive/Anaconda2-2.5.0-Linux-x86_64.sh && \ 10 | /bin/bash /Anaconda2-2.5.0-Linux-x86_64.sh -b -p /opt/conda && \ 11 | rm /Anaconda2-2.5.0-Linux-x86_64.sh 12 | 13 | RUN apt-get install -y curl grep sed dpkg && \ 14 | TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && \ 15 | curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \ 16 | dpkg -i tini.deb && \ 17 | rm tini.deb && \ 18 | apt-get clean 19 | 20 | ENV PATH /opt/conda/bin:$PATH 21 | # http://bugs.python.org/issue19846 22 | # > At the moment, setting "LANG=C" on a Linux system *fundamentally breaks Python 3*, and that's not OK. 
23 | ENV LANG C.UTF-8 24 | ENTRYPOINT [ "/usr/bin/tini", "--" ] 25 | 26 | # ========== Special Deps ========== 27 | RUN apt-get -y install git make cmake unzip 28 | RUN pip install awscli 29 | # ALE requires zlib 30 | RUN apt-get -y install zlib1g-dev 31 | # MUJOCO requires graphics stuff (Why?) 32 | RUN apt-get -y build-dep glfw 33 | RUN apt-get -y install libxrandr2 libxinerama-dev libxi6 libxcursor-dev 34 | # copied from requirements.txt 35 | #RUN pip install imageio tabulate nose 36 | RUN apt-get install -y vim ack-grep 37 | RUN pip install --upgrade pip 38 | # usual pip install pygame will fail 39 | RUN apt-get build-dep -y python-pygame 40 | RUN pip install Pillow 41 | 42 | # ========== OpenAI Gym ========== 43 | RUN apt-get -y install libgtk2.0-0 44 | RUN pip install gym 45 | #RUN apt-get -y install ffmpeg 46 | RUN apt-get -y install libav-tools 47 | CMD alias ffmpeg="avconv" 48 | 49 | # ========== Add codebase stub ========== 50 | CMD mkdir /root/code 51 | ADD environment.yml /root/code/environment.yml 52 | RUN conda env create -f /root/code/environment.yml 53 | 54 | ENV PYTHONPATH /root/code/rllab:$PYTHONPATH 55 | ENV PATH /opt/conda/envs/rllab3/bin:$PATH 56 | RUN echo "source activate rllab3" >> /root/.bashrc 57 | ENV BASH_ENV /root/.bashrc 58 | WORKDIR /root/code 59 | 60 | # gpu theanno 61 | ENV THEANO_FLAGS mode=FAST_RUN,device=gpu,floatX=float32 62 | 63 | 64 | -------------------------------------------------------------------------------- /rllab/spaces/product.py: -------------------------------------------------------------------------------- 1 | from rllab.spaces.base import Space 2 | import numpy as np 3 | from rllab.misc import ext 4 | 5 | 6 | class Product(Space): 7 | 8 | def __init__(self, *components): 9 | if isinstance(components[0], (list, tuple)): 10 | assert len(components) == 1 11 | components = components[0] 12 | self._components = tuple(components) 13 | dtypes = [c.new_tensor_variable("tmp", extra_dims=0).dtype for c in components] 14 | if len(dtypes) > 0 and hasattr(dtypes[0], "as_numpy_dtype"): 15 | dtypes = [d.as_numpy_dtype for d in dtypes] 16 | self._common_dtype = np.core.numerictypes.find_common_type([], dtypes) 17 | 18 | def sample(self): 19 | return tuple(x.sample() for x in self._components) 20 | 21 | @property 22 | def components(self): 23 | return self._components 24 | 25 | def contains(self, x): 26 | return isinstance(x, tuple) and all(c.contains(xi) for c, xi in zip(self._components, x)) 27 | 28 | def new_tensor_variable(self, name, extra_dims): 29 | return ext.new_tensor( 30 | name=name, 31 | ndim=extra_dims+1, 32 | dtype=self._common_dtype, 33 | ) 34 | 35 | @property 36 | def flat_dim(self): 37 | return np.sum([c.flat_dim for c in self._components]) 38 | 39 | def flatten(self, x): 40 | return np.concatenate([c.flatten(xi) for c, xi in zip(self._components, x)]) 41 | 42 | def flatten_n(self, xs): 43 | xs_regrouped = [[x[i] for x in xs] for i in range(len(xs[0]))] 44 | flat_regrouped = [c.flatten_n(xi) for c, xi in zip(self.components, xs_regrouped)] 45 | return np.concatenate(flat_regrouped, axis=-1) 46 | 47 | def unflatten(self, x): 48 | dims = [c.flat_dim for c in self._components] 49 | flat_xs = np.split(x, np.cumsum(dims)[:-1]) 50 | return tuple(c.unflatten(xi) for c, xi in zip(self._components, flat_xs)) 51 | 52 | def unflatten_n(self, xs): 53 | dims = [c.flat_dim for c in self._components] 54 | flat_xs = np.split(xs, np.cumsum(dims)[:-1], axis=-1) 55 | unflat_xs = [c.unflatten_n(xi) for c, xi in zip(self.components, flat_xs)] 56 | 
unflat_xs_grouped = list(zip(*unflat_xs)) 57 | return unflat_xs_grouped 58 | 59 | def __eq__(self, other): 60 | if not isinstance(other, Product): 61 | return False 62 | return tuple(self.components) == tuple(other.components) 63 | 64 | def __hash__(self): 65 | return hash(tuple(self.components)) 66 | -------------------------------------------------------------------------------- /docker/gpu_Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:7.5-cudnn4-devel-ubuntu14.04 2 | 3 | # ========== Anaconda ========== 4 | # https://github.com/ContinuumIO/docker-images/blob/master/anaconda/Dockerfile 5 | RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ 6 | libglib2.0-0 libxext6 libsm6 libxrender1 \ 7 | git mercurial subversion 8 | RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ 9 | wget --no-check-certificate --quiet https://repo.continuum.io/archive/Anaconda2-2.5.0-Linux-x86_64.sh && \ 10 | /bin/bash /Anaconda2-2.5.0-Linux-x86_64.sh -b -p /opt/conda && \ 11 | rm /Anaconda2-2.5.0-Linux-x86_64.sh 12 | 13 | RUN apt-get install -y curl grep sed dpkg && \ 14 | TINI_VERSION=`curl https://github.com/krallin/tini/releases/latest | grep -o "/v.*\"" | sed 's:^..\(.*\).$:\1:'` && \ 15 | curl -L "https://github.com/krallin/tini/releases/download/v${TINI_VERSION}/tini_${TINI_VERSION}.deb" > tini.deb && \ 16 | dpkg -i tini.deb && \ 17 | rm tini.deb && \ 18 | apt-get clean 19 | 20 | ENV PATH /opt/conda/bin:$PATH 21 | # http://bugs.python.org/issue19846 22 | # > At the moment, setting "LANG=C" on a Linux system *fundamentally breaks Python 3*, and that's not OK. 23 | ENV LANG C.UTF-8 24 | ENTRYPOINT [ "/usr/bin/tini", "--" ] 25 | 26 | # ========== Special Deps ========== 27 | RUN apt-get -y install git make cmake unzip 28 | RUN pip install awscli 29 | # ALE requires zlib 30 | RUN apt-get -y install zlib1g-dev 31 | # MUJOCO requires graphics stuff (Why?) 
32 | RUN apt-get -y build-dep glfw 33 | RUN apt-get -y install libxrandr2 libxinerama-dev libxi6 libxcursor-dev 34 | # copied from requirements.txt 35 | #RUN pip install imageio tabulate nose 36 | RUN apt-get install -y vim ack-grep 37 | RUN pip install --upgrade pip 38 | # usual pip install pygame will fail 39 | RUN apt-get build-dep -y python-pygame 40 | RUN pip install Pillow 41 | 42 | # ========== OpenAI Gym ========== 43 | RUN apt-get -y install libgtk2.0-0 44 | RUN pip install gym 45 | #RUN apt-get -y install ffmpeg 46 | RUN apt-get -y install libav-tools 47 | CMD alias ffmpeg="avconv" 48 | 49 | # ========== Add codebase stub ========== 50 | CMD mkdir /root/code 51 | ADD environment.yml /root/code/environment.yml 52 | RUN conda env create -f /root/code/environment.yml 53 | 54 | ENV PYTHONPATH /root/code/rllab:$PYTHONPATH 55 | ENV PATH /opt/conda/envs/rllab3/bin:$PATH 56 | RUN echo "source activate rllab3" >> /root/.bashrc 57 | ENV BASH_ENV /root/.bashrc 58 | WORKDIR /root/code 59 | 60 | # gpu theanno 61 | ENV THEANO_FLAGS mode=FAST_RUN,device=gpu,floatX=float32 62 | -------------------------------------------------------------------------------- /vendor/mujoco_models/swimmer.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | -------------------------------------------------------------------------------- /sandbox/rocky/tf/spaces/product.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | from rllab.spaces.base import Space 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | 9 | class Product(Space): 10 | def __init__(self, *components): 11 | if isinstance(components[0], (list, tuple)): 12 | assert len(components) == 1 13 | components = components[0] 14 | self._components = tuple(components) 15 | dtypes = [c.new_tensor_variable("tmp", extra_dims=0).dtype for c in components] 16 | if len(dtypes) > 0 and hasattr(dtypes[0], "as_numpy_dtype"): 17 | dtypes = [d.as_numpy_dtype for d in dtypes] 18 | self._common_dtype = np.core.numerictypes.find_common_type([], dtypes) 19 | 20 | def sample(self): 21 | return tuple(x.sample() for x in self._components) 22 | 23 | @property 24 | def components(self): 25 | return self._components 26 | 27 | def contains(self, x): 28 | return isinstance(x, tuple) and all(c.contains(xi) for c, xi in zip(self._components, x)) 29 | 30 | def new_tensor_variable(self, name, extra_dims): 31 | return tf.placeholder( 32 | dtype=self._common_dtype, 33 | shape=[None] * extra_dims + [self.flat_dim], 34 | name=name, 35 | ) 36 | 37 | @property 38 | def flat_dim(self): 39 | return np.sum([c.flat_dim for c in self._components]) 40 | 41 | def flatten(self, x): 42 | return np.concatenate([c.flatten(xi) for c, xi in zip(self._components, x)]) 43 | 44 | def flatten_n(self, xs): 45 | xs_regrouped = [[x[i] for x in xs] for i in range(len(xs[0]))] 46 | flat_regrouped = [c.flatten_n(xi) for c, xi in zip(self.components, xs_regrouped)] 47 | return np.concatenate(flat_regrouped, axis=-1) 48 | 49 | def unflatten(self, x): 50 | dims = [c.flat_dim for c in self._components] 51 | flat_xs = np.split(x, np.cumsum(dims)[:-1]) 52 | return tuple(c.unflatten(xi) for c, xi in zip(self._components, flat_xs)) 53 | 54 | def unflatten_n(self, xs): 55 | dims = [c.flat_dim for c in self._components] 56 | flat_xs = 
np.split(xs, np.cumsum(dims)[:-1], axis=-1) 57 | unflat_xs = [c.unflatten_n(xi) for c, xi in zip(self.components, flat_xs)] 58 | unflat_xs_grouped = list(zip(*unflat_xs)) 59 | return unflat_xs_grouped 60 | 61 | def __eq__(self, other): 62 | if not isinstance(other, Product): 63 | return False 64 | return tuple(self.components) == tuple(other.components) 65 | 66 | def __hash__(self): 67 | return hash(tuple(self.components)) 68 | -------------------------------------------------------------------------------- /examples/trpo_swimmer.py: -------------------------------------------------------------------------------- 1 | use_tf = True 2 | 3 | if use_tf: 4 | from sandbox.rocky.tf.algos.trpo import TRPO 5 | # from sandbox.rocky.tf.policies.gaussian_mlp_policy import GaussianMLPPolicy 6 | from sandbox.rocky.tf.policies.minimal_gauss_mlp_policy import GaussianMLPPolicy 7 | from sandbox.rocky.tf.envs.base import TfEnv 8 | else: 9 | from rllab.algos.trpo import TRPO 10 | from rllab.policies.gaussian_mlp_policy import GaussianMLPPolicy 11 | from rllab.baselines.linear_feature_baseline import LinearFeatureBaseline 12 | from rllab.envs.mujoco.swimmer_env import SwimmerEnv 13 | from rllab.envs.mujoco.swimmer_randgoal_oracle_env import SwimmerRandGoalOracleEnv 14 | from rllab.envs.mujoco.swimmer_randgoal_env import SwimmerRandGoalEnv 15 | #from rllab.envs.mujoco.half_cheetah_env import HalfCheetahEnv 16 | from rllab.envs.mujoco.walker2d_env import Walker2DEnv 17 | from rllab.envs.normalized_env import normalize 18 | from rllab.misc.instrument import stub, run_experiment_lite 19 | 20 | stub(globals()) 21 | 22 | 23 | #env = normalize(SwimmerEnv()) 24 | env = normalize(SwimmerRandGoalOracleEnv()) 25 | #env = normalize(SwimmerRandGoalEnv()) 26 | 27 | max_path_length = 100 28 | #env = normalize(HalfCheetahEnv()) 29 | #env = normalize(Walker2DEnv()) 30 | if use_tf: 31 | env = TfEnv(env) 32 | policy = GaussianMLPPolicy( 33 | name='policy', 34 | env_spec=env.spec, 35 | # The neural network policy should have two hidden layers, each with 32 hidden units. 36 | #hidden_sizes=(32, 32) 37 | hidden_sizes=(100, 100) 38 | ) 39 | else: 40 | policy = GaussianMLPPolicy( 41 | env_spec=env.spec, 42 | # The neural network policy should have two hidden layers, each with 32 hidden units. 43 | hidden_sizes=(100, 100) 44 | ) 45 | 46 | baseline = LinearFeatureBaseline(env_spec=env.spec) 47 | 48 | algo = TRPO( 49 | env=env, 50 | policy=policy, 51 | baseline=baseline, 52 | batch_size=max_path_length*10, # was 4k 53 | max_path_length=max_path_length, 54 | n_itr=500, 55 | discount=0.99, 56 | step_size=0.01, 57 | #plot=True, 58 | ) 59 | #algo.train() 60 | 61 | 62 | run_experiment_lite( 63 | algo.train(), 64 | # Number of parallel workers for sampling 65 | n_parallel=4, 66 | # Only keep the snapshot parameters for the last iteration 67 | snapshot_mode="last", 68 | # Specifies the seed for the experiment. 
If this is not provided, a random seed 69 | # will be used 70 | seed=1, 71 | exp_prefix='trpo_sensitive_swimmer' + str(max_path_length), 72 | exp_name='oracleenv', 73 | #plot=True, 74 | ) 75 | -------------------------------------------------------------------------------- /rllab/envs/mujoco/swimmer_randgoal_env.py: -------------------------------------------------------------------------------- 1 | from rllab.envs.base import Step 2 | from rllab.misc.overrides import overrides 3 | from .mujoco_env import MujocoEnv 4 | import numpy as np 5 | from rllab.core.serializable import Serializable 6 | from rllab.misc import logger 7 | from rllab.misc import autoargs 8 | 9 | 10 | class SwimmerRandGoalEnv(MujocoEnv, Serializable): 11 | 12 | FILE = 'swimmer.xml' 13 | 14 | @autoargs.arg('ctrl_cost_coeff', type=float, 15 | help='cost coefficient for controls') 16 | def __init__( 17 | self, 18 | ctrl_cost_coeff=1e-2, 19 | *args, **kwargs): 20 | self.ctrl_cost_coeff = ctrl_cost_coeff 21 | self._goal_vel = None 22 | super(SwimmerRandGoalEnv, self).__init__(*args, **kwargs) 23 | Serializable.quick_init(self, locals()) 24 | 25 | def get_current_obs(self): 26 | return np.concatenate([ 27 | self.model.data.qpos.flat, 28 | self.model.data.qvel.flat, 29 | self.get_body_com("torso").flat, 30 | ]).reshape(-1) 31 | 32 | @overrides 33 | def reset(self, init_state=None, reset_args=None, **kwargs): 34 | goal_vel = reset_args 35 | if goal_vel is not None: 36 | self._goal_vel = goal_vel 37 | elif self._goal_vel is None: 38 | self._goal_vel = np.random.uniform(0.1, 0.2) 39 | self.reset_mujoco(init_state) 40 | self.model.forward() 41 | self.current_com = self.model.data.com_subtree[0] 42 | self.dcom = np.zeros_like(self.current_com) 43 | return self.get_current_obs() 44 | 45 | def step(self, action): 46 | self.forward_dynamics(action) 47 | next_obs = self.get_current_obs() 48 | lb, ub = self.action_bounds 49 | scaling = (ub - lb) * 0.5 50 | ctrl_cost = 0.5 * self.ctrl_cost_coeff * np.sum( 51 | np.square(action / scaling)) 52 | forward_reward = -1.5*np.abs(self.get_body_comvel("torso")[0] - self._goal_vel) 53 | reward = forward_reward - ctrl_cost 54 | done = False 55 | return Step(next_obs, reward, done) 56 | 57 | @overrides 58 | def log_diagnostics(self, paths, prefix=''): 59 | progs = [ 60 | path["observations"][-1][-3] - path["observations"][0][-3] 61 | for path in paths 62 | ] 63 | logger.record_tabular(prefix+'AverageForwardProgress', np.mean(progs)) 64 | logger.record_tabular(prefix+'MaxForwardProgress', np.max(progs)) 65 | logger.record_tabular(prefix+'MinForwardProgress', np.min(progs)) 66 | logger.record_tabular(prefix+'StdForwardProgress', np.std(progs)) 67 | --------------------------------------------------------------------------------
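A quick numeric sanity check of the Bernoulli KL divergence used in sandbox/rocky/tf/distributions/bernoulli.py above (the NumPy kl() and the symbolic kl_sym() compute the same expression). This is a minimal standalone sketch in plain NumPy; the helper name bernoulli_kl is illustrative and not part of the codebase.

import numpy as np

TINY = 1e-8  # same smoothing constant as the Bernoulli class

def bernoulli_kl(old_p, new_p):
    # KL(old || new) for factorized Bernoullis, summed over the last axis,
    # mirroring Bernoulli.kl() shown above.
    kl = old_p * (np.log(old_p + TINY) - np.log(new_p + TINY)) + \
        (1 - old_p) * (np.log(1 - old_p + TINY) - np.log(1 - new_p + TINY))
    return np.sum(kl, axis=-1)

old_p = np.array([[0.2, 0.7, 0.5]])
new_p = np.array([[0.25, 0.6, 0.5]])

print(bernoulli_kl(old_p, old_p)[0])  # exactly 0 for identical distributions
print(bernoulli_kl(old_p, new_p)[0])  # small positive value (about 0.03)
assert bernoulli_kl(old_p, new_p)[0] >= 0.0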
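Along the same lines, a short round-trip check of the flatten / unflatten logic shared by rllab/spaces/product.py and rllab/spaces/box.py above. This sketch assumes a working rllab install on the PYTHONPATH (per docs/user/installation.rst), since constructing these spaces imports Theano; the variable names are illustrative only.

import numpy as np
from rllab.spaces.box import Box
from rllab.spaces.product import Product

# A product of two bounded boxes: flat_dim should be 2 + 3 = 5.
space = Product(Box(-1.0, 1.0, shape=(2,)), Box(0.0, 1.0, shape=(3,)))
assert space.flat_dim == 5

x = space.sample()        # tuple of arrays with shapes (2,) and (3,)
flat = space.flatten(x)   # concatenation of the component flattenings
assert flat.shape == (5,)

x_back = space.unflatten(flat)   # splits at the cumulative flat dims
assert all(np.allclose(a, b) for a, b in zip(x, x_back))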