├── maple ├── __init__.py ├── dataset │ └── __init__.py ├── models │ ├── __init__.py │ ├── constructor.py │ └── utils.py ├── policy │ ├── __init__.py │ ├── static │ │ ├── pendulum.py │ │ ├── point2denv.py │ │ ├── point2dwallenv.py │ │ ├── halfcheetahveljump.py │ │ ├── humanoid.py │ │ ├── ant.py │ │ ├── halfcheetahvel.py │ │ ├── __init__.py │ │ ├── hopper.py │ │ ├── walker2d.py │ │ ├── halfcheetahjump.py │ │ ├── halfcheetah.py │ │ └── antangle.py │ └── fake_env.py ├── global_config.py ├── utils │ ├── __init__.py │ ├── filesystem.py │ ├── visualization.py │ └── logging.py └── env │ ├── __init__.py │ ├── halfcheetah_vel.py │ ├── halfcheetah_jump.py │ ├── humanoid.py │ ├── ant.py │ ├── ant_angle.py │ └── assert │ └── halfcheetah.xml ├── examples ├── __init__.py └── config │ ├── neorl │ ├── base.py │ ├── hopper_low.py │ ├── hopper_medium.py │ ├── walker2d_low.py │ ├── halfcheetah_low.py │ ├── walker2d_medium.py │ ├── halfcheetah_medium.py │ └── base_maple.py │ ├── d4rl │ ├── hopper_medium.py │ ├── hopper_random.py │ ├── walker2d_medium.py │ ├── walker2d_random.py │ ├── halfcheetah_medium.py │ ├── halfcheetah_random.py │ ├── hopper_mixed.py │ ├── hopper_medium_expert.py │ ├── walker2d_mixed.py │ ├── halfcheetah_mixed.py │ ├── walker2d_medium_expert.py │ ├── halfcheetah_medium_expert.py │ └── base_maple.py │ └── __init__.py ├── rla_scripts ├── __init__.py ├── config.py ├── view_expt.py ├── archive_expt.py ├── delete_expt.py └── start_pretty_plotter.py ├── run_scripts ├── __init__.py ├── utils.py ├── main.py └── base.py ├── softlearning ├── __init__.py ├── misc │ ├── __init__.py │ ├── plotter.py │ ├── kernel.py │ └── utils.py ├── models │ ├── __init__.py │ ├── utils.py │ └── feedforward.py ├── policies │ ├── __init__.py │ ├── utils.py │ ├── uniform_policy.py │ └── base_policy.py ├── scripts │ ├── __init__.py │ └── console_scripts.py ├── environments │ ├── __init__.py │ ├── gym │ │ ├── mujoco │ │ │ ├── __init__.py │ │ │ └── image_pusher_2d.py │ │ ├── robotics │ │ │ └── __init__.py │ │ ├── wrappers │ │ │ ├── __init__.py │ │ │ └── normalize_action.py │ │ ├── __init__.py │ │ └── multi_goal.py │ ├── adapters │ │ ├── __init__.py │ │ └── gym_adapter.py │ ├── dm_control │ │ └── __init__.py │ ├── helpers.py │ └── utils.py ├── preprocessors │ ├── __init__.py │ ├── utils.py │ └── convnet.py ├── value_functions │ ├── __init__.py │ ├── vanilla.py │ ├── utils.py │ └── value_function.py ├── algorithms │ ├── __init__.py │ └── utils.py ├── distributions │ ├── __init__.py │ └── squash_bijector.py ├── utils │ ├── numpy.py │ └── keras.py ├── replay_pools │ ├── __init__.py │ ├── extra_policy_info_replay_pool.py │ ├── replay_pool.py │ ├── utils.py │ ├── union_pool.py │ ├── trajectory_replay_pool.py │ └── flexible_replay_pool.py └── samplers │ ├── __init__.py │ ├── dummy_sampler.py │ ├── base_sampler.py │ ├── extra_policy_info_sampler.py │ ├── utils.py │ ├── explore_sampler.py │ ├── remote_sampler.py │ └── simple_sampler.py ├── resources ├── poster.png ├── plot_demo.png └── neorl-maple.png ├── log └── v2_examples.config.d4rl.walker2d_medium_expert │ └── 2022 │ └── 01 │ ├── 04 │ └── 09-30-38-583144 10.83.150.23 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=8&penalty_clip=20 │ │ ├── tb │ │ └── events │ │ │ └── events.out.tfevents.1641259841.ml-gpu-ser108.nmg01 │ │ └── warn.txt │ └── 01 │ ├── 15-38-56-149640 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=88&penalty_clip=20 │ ├── tb │ │ └── events │ │ │ └── 
events.out.tfevents.1641022739.ml-gpu-ser119.nmg01 │ └── warn.txt │ └── 15-42-56-985568 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=888&penalty_clip=20 │ ├── tb │ └── events │ │ └── events.out.tfevents.1641022980.ml-gpu-ser119.nmg01 │ └── warn.txt ├── rla_config_mopo.yaml ├── setup.py ├── LICENSE ├── .gitignore └── README.md /maple/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rla_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /run_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maple/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maple/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maple/policy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/config/neorl/base.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/policies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/environments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/preprocessors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rla_scripts/config.py: -------------------------------------------------------------------------------- 1 | DATA_ROOT = '../' 2 | -------------------------------------------------------------------------------- 
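Note on rla_scripts/config.py above: the single DATA_ROOT constant is the only configuration the RLA maintenance scripts need; view_expt.py, archive_expt.py and delete_expt.py (shown later in this listing) all pass it as proj_root to the corresponding RLA log tool. A minimal usage sketch, assuming the RLA package is installed and the script is run from inside rla_scripts/; the task_table_name value is copied from the log directory in the tree above, and the regex value is a hypothetical placeholder:

    from RLA.easy_log.log_tools import ViewLogTool
    from config import DATA_ROOT  # rla_scripts/config.py, i.e. '../'

    # Browse the recorded logs of one task table; the regex narrows the runs to view.
    viewer = ViewLogTool(proj_root=DATA_ROOT,
                         task_table_name='v2_examples.config.d4rl.walker2d_medium_expert',
                         regex='2022/01/04/09-30*')  # hypothetical run pattern
    viewer.view_log()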
/softlearning/value_functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/environments/gym/mujoco/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/environments/gym/robotics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maple/global_config.py: -------------------------------------------------------------------------------- 1 | STATE_CLIP_BOUND = 100 2 | MAX_PENALTY = 20 -------------------------------------------------------------------------------- /maple/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # from .filesystem import * 2 | # from .launcher import * -------------------------------------------------------------------------------- /softlearning/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | from .sql import SQL 2 | from .sac import SAC 3 | -------------------------------------------------------------------------------- /softlearning/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | from .real_nvp_flow import ConditionalRealNVPFlow 2 | -------------------------------------------------------------------------------- /resources/poster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xionghuichen/MAPLE/HEAD/resources/poster.png -------------------------------------------------------------------------------- /resources/plot_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xionghuichen/MAPLE/HEAD/resources/plot_demo.png -------------------------------------------------------------------------------- /resources/neorl-maple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xionghuichen/MAPLE/HEAD/resources/neorl-maple.png -------------------------------------------------------------------------------- /softlearning/environments/gym/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalize_action import NormalizeActionWrapper 2 | -------------------------------------------------------------------------------- /maple/utils/filesystem.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def mkdir(path): 4 | if not os.path.exists(path): 5 | os.mkdir(path) -------------------------------------------------------------------------------- /softlearning/environments/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | """Module that provides adapters between SoftlearningEnv and other universes""" 2 | -------------------------------------------------------------------------------- /softlearning/utils/numpy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def softmax(x): 5 | max_x = np.max(x) 6 | exp_x = np.exp(x - max_x) 7 | return exp_x / 
np.sum(exp_x) 8 | -------------------------------------------------------------------------------- /softlearning/replay_pools/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_replay_pool import SimpleReplayPool 2 | from .extra_policy_info_replay_pool import ExtraPolicyInfoReplayPool 3 | from .union_pool import UnionPool 4 | from .trajectory_replay_pool import TrajectoryReplayPool 5 | -------------------------------------------------------------------------------- /softlearning/environments/dm_control/__init__.py: -------------------------------------------------------------------------------- 1 | """Custom DeepMind Control Suite environments. 2 | 3 | Every class inside this module should extend a dm_control.suite.Task class. The 4 | # file structure should be similar to dm_control's file structure. 5 | """ 6 | -------------------------------------------------------------------------------- /maple/policy/static/pendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | done = np.zeros((len(obs), 1)) 10 | return done 11 | -------------------------------------------------------------------------------- /softlearning/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_sampler import BaseSampler 2 | from .dummy_sampler import DummySampler 3 | from .simple_sampler import SimpleSampler 4 | # from .remote_sampler import RemoteSampler 5 | from .extra_policy_info_sampler import ExtraPolicyInfoSampler 6 | from .utils import rollout, rollouts 7 | -------------------------------------------------------------------------------- /maple/policy/static/point2denv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | done = np.array([False]).repeat(len(obs)) 10 | done = done[:,None] 11 | return done 12 | -------------------------------------------------------------------------------- /maple/policy/static/point2dwallenv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | done = np.array([False]).repeat(len(obs)) 10 | done = done[:,None] 11 | return done 12 | -------------------------------------------------------------------------------- /maple/policy/static/halfcheetahveljump.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | done = np.array([False]).repeat(len(obs)) 10 | done = done[:,None] 11 | return done 12 | -------------------------------------------------------------------------------- /maple/policy/static/humanoid.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | class StaticFns: 5 | 6 | @staticmethod 7 
| def termination_fn(obs, act, next_obs): 8 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 9 | 10 | z = next_obs[:,0] 11 | done = (z < 1.0) + (z > 2.0) 12 | 13 | done = done[:,None] 14 | return done -------------------------------------------------------------------------------- /softlearning/environments/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def random_point_in_circle(angle_range=(0, 2*np.pi), radius=(0, 25)): 5 | angle = np.random.uniform(*angle_range) 6 | radius = radius if np.isscalar(radius) else np.random.uniform(*radius) 7 | x, y = np.cos(angle) * radius, np.sin(angle) * radius 8 | point = np.array([x, y]) 9 | return point 10 | -------------------------------------------------------------------------------- /softlearning/samplers/dummy_sampler.py: -------------------------------------------------------------------------------- 1 | from .base_sampler import BaseSampler 2 | 3 | 4 | class DummySampler(BaseSampler): 5 | def __init__(self, batch_size, max_path_length): 6 | super(DummySampler, self).__init__( 7 | max_path_length=max_path_length, 8 | min_pool_size=0, 9 | batch_size=batch_size) 10 | 11 | def sample(self): 12 | pass 13 | -------------------------------------------------------------------------------- /examples/config/d4rl/hopper_medium.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'hopper', 6 | 'task': 'medium-v0', 7 | 'exp_name': 'hopper_medium' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/hopper-medium-v0', 11 | 'pool_load_max_size': 10**6, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 5.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/hopper_random.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'hopper', 6 | 'task': 'random-v0', 7 | 'exp_name': 'hopper_random' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/hopper-random-v0', 11 | 'pool_load_max_size': 10**6, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 1.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/walker2d_medium.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'walker2d', 6 | 'task': 'medium-v0', 7 | 'exp_name': 'walker2d_medium' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/walker2d-medium-v0', 11 | 'pool_load_max_size': 10**6, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 5.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/walker2d_random.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'walker2d', 6 | 'task': 'random-v0', 7 | 'exp_name': 'walker2d_random' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/walker2d-random-v0', 11 | 'pool_load_max_size': 10**6, 12 | 'rollout_length': 1, 13 | 
'penalty_coeff': 1.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/halfcheetah_medium.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'halfcheetah', 6 | 'task': 'medium-v0', 7 | 'exp_name': 'halfcheetah_medium' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/halfcheetah-medium-v0', 11 | 'pool_load_max_size': 10**6, 12 | 'rollout_length': 1, 13 | 'penalty_coeff': 1.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/halfcheetah_random.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'halfcheetah', 6 | 'task': 'random-v0', 7 | 'exp_name': 'halfcheetah_random' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/halfcheetah-random-v0', 11 | 'pool_load_max_size': 10**6, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 0.5 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/hopper_mixed.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'hopper', 6 | 'task': 'medium-replay-v0', 7 | 'exp_name': 'hopper_medium_replay' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/hopper-medium-replay-v0', 11 | 'pool_load_max_size': 200920, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 1.0, 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/hopper_medium_expert.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'hopper', 6 | 'task': 'medium-expert-v0', 7 | 'exp_name': 'hopper_medium_expert' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/hopper-medium-expert-v0', 11 | 'pool_load_max_size': 2 * 10**6, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 1.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/walker2d_mixed.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'walker2d', 6 | 'task': 'medium-replay-v0', 7 | 'exp_name': 'walker2d_medium_replay' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/walker2d-medium-replay-v0', 11 | 'pool_load_max_size': 100930, 12 | 'rollout_length': 1, 13 | 'penalty_coeff': 1.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/halfcheetah_mixed.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'halfcheetah', 6 | 'task': 'medium-replay-v0', 7 | 'exp_name': 'halfcheetah_medium_replay' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/halfcheetah-medium-replay-v0', 11 | 
'pool_load_max_size': 101000, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 1.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/walker2d_medium_expert.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'walker2d', 6 | 'task': 'medium-expert-v0', 7 | 'exp_name': 'walker2d_medium_expert' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/walker2d-medium-expert-v0', 11 | 'pool_load_max_size': 2 * 10**6, 12 | 'rollout_length': 1, 13 | 'penalty_coeff': 2.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/halfcheetah_medium_expert.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'halfcheetah', 6 | 'task': 'medium-expert-v0', 7 | 'exp_name': 'halfcheetah_medium_expert' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/halfcheetah-medium-expert-v0', 11 | 'pool_load_max_size': 2 * 10**6, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 5.0 14 | }) -------------------------------------------------------------------------------- /maple/policy/static/ant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | x = next_obs[:, 0] 10 | not_done = np.isfinite(next_obs).all(axis=-1) \ 11 | * (x >= 0.2) \ 12 | * (x <= 1.0) 13 | 14 | done = ~not_done 15 | done = done[:,None] 16 | return done 17 | -------------------------------------------------------------------------------- /examples/config/neorl/hopper_low.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'Hopper', 6 | 'task': 'v3', 7 | 'exp_name': 'hopper_neo_low' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'neorl/neorl_data/Hopper-v3-low-1000-train-noise.npz', 11 | 'pool_load_max_size': 101000, 12 | 'rollout_length': 10, 13 | 'penalty_coeff': 0.25, 14 | 'use_neorl': True 15 | }) -------------------------------------------------------------------------------- /examples/config/neorl/hopper_medium.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'Hopper', 6 | 'task': 'v3', 7 | 'exp_name': 'hopper_neo_medium' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'neorl/neorl_data/Hopper-v3-medium-1000-train-noise.npz', 11 | 'pool_load_max_size': 101000, 12 | 'rollout_length': 10, 13 | 'penalty_coeff': 0.25, 14 | 'use_neorl': True 15 | }) -------------------------------------------------------------------------------- /examples/config/neorl/walker2d_low.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'Walker2d', 6 | 'task': 'v3', 7 | 'exp_name': 'walker2d_neo_low' 8 | }) 9 | 
params['kwargs'].update({ 10 | 'pool_load_path': 'neorl/neorl_data/Walker2d-v3-low-1000-train-noise.npz', 11 | 'pool_load_max_size': 101000, 12 | 'rollout_length': 10, 13 | 'penalty_coeff': 0.25, 14 | 'use_neorl': True 15 | }) -------------------------------------------------------------------------------- /examples/config/neorl/halfcheetah_low.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'HalfCheetah', 6 | 'task': 'v3', 7 | 'exp_name': 'halfcheetah_low' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'neorl/neorl_data/HalfCheetah-v3-low-1000-train-noise.npz', 11 | 'pool_load_max_size': 101000, 12 | 'rollout_length': 10, 13 | 'penalty_coeff': 0.25, 14 | 'use_neorl': True 15 | }) -------------------------------------------------------------------------------- /examples/config/neorl/walker2d_medium.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'Walker2d', 6 | 'task': 'v3', 7 | 'exp_name': 'walker2d_neo_medium' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'neorl/neorl_data/Walker2d-v3-medium-1000-train-noise.npz', 11 | 'pool_load_max_size': 101000, 12 | 'rollout_length': 10, 13 | 'penalty_coeff': 0.25, 14 | 'use_neorl': True 15 | }) -------------------------------------------------------------------------------- /examples/config/neorl/halfcheetah_medium.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'HalfCheetah', 6 | 'task': 'v3', 7 | 'exp_name': 'halfcheetah_neo_medium' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'neorl/neorl_data/HalfCheetah-v3-medium-1000-train-noise.npz', 11 | 'pool_load_max_size': 101000, 12 | 'rollout_length': 10, 13 | 'penalty_coeff': 0.25, 14 | 'use_neorl': True 15 | }) -------------------------------------------------------------------------------- /log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/04/09-30-38-583144 10.83.150.23 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=8&penalty_clip=20/tb/events/events.out.tfevents.1641259841.ml-gpu-ser108.nmg01: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xionghuichen/MAPLE/HEAD/log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/04/09-30-38-583144 10.83.150.23 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=8&penalty_clip=20/tb/events/events.out.tfevents.1641259841.ml-gpu-ser108.nmg01 -------------------------------------------------------------------------------- /log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/01/15-38-56-149640 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=88&penalty_clip=20/tb/events/events.out.tfevents.1641022739.ml-gpu-ser119.nmg01: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xionghuichen/MAPLE/HEAD/log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/01/15-38-56-149640 10.83.150.44 
&info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=88&penalty_clip=20/tb/events/events.out.tfevents.1641022739.ml-gpu-ser119.nmg01 -------------------------------------------------------------------------------- /log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/01/15-42-56-985568 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=888&penalty_clip=20/tb/events/events.out.tfevents.1641022980.ml-gpu-ser119.nmg01: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xionghuichen/MAPLE/HEAD/log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/01/15-42-56-985568 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=888&penalty_clip=20/tb/events/events.out.tfevents.1641022980.ml-gpu-ser119.nmg01 -------------------------------------------------------------------------------- /maple/env/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # 3 | from .ant import AntEnv 4 | from .humanoid import HumanoidEnv 5 | from .halfcheetah_jump import HalfCheetahEnv as HalfCheetahJumpEnv 6 | from .halfcheetah_vel import HalfCheetahEnv as HalfCheetahVelEnv 7 | from .ant_angle import AntEnv as AngAngleEnv 8 | # import halfcheetah_vel 9 | # 10 | env_overwrite = {'Ant': AntEnv,'AntAngle': AngAngleEnv, 11 | 'Humanoid': HumanoidEnv, 'HalfCheetahVel':HalfCheetahVelEnv, 12 | 'HalfCheetahJump': HalfCheetahJumpEnv} 13 | # 14 | # sys.modules[__name__] = env_overwrite -------------------------------------------------------------------------------- /rla_scripts/view_expt.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to view data of experiments. 
3 | """ 4 | 5 | from RLA.easy_log.log_tools import ViewLogTool 6 | import argparse 7 | from config import * 8 | 9 | def argsparser(): 10 | parser = argparse.ArgumentParser("View Log") 11 | parser.add_argument('--task_table_name', type=str) 12 | parser.add_argument('--regex', type=str) 13 | args = parser.parse_args() 14 | return args 15 | 16 | if __name__=='__main__': 17 | args = argsparser() 18 | dlt = ViewLogTool(proj_root=DATA_ROOT, task_table_name=args.task_table_name, regex=args.regex) 19 | dlt.view_log() -------------------------------------------------------------------------------- /softlearning/models/utils.py: -------------------------------------------------------------------------------- 1 | def build_metric_learner_from_variant(variant, env, evaluation_data): 2 | sampler_params = variant['sampler_params'] 3 | metric_learner_params = variant['metric_learner_params'] 4 | metric_learner_params.update({ 5 | 'observation_shape': env.observation_space.shape, 6 | 'max_distance': sampler_params['kwargs']['max_path_length'], 7 | 'evaluation_data': evaluation_data 8 | }) 9 | 10 | metric_learner = MetricLearner(**metric_learner_params) 11 | return metric_learner 12 | 13 | 14 | def get_model_from_variant(variant, env, *args, **kwargs): 15 | pass 16 | -------------------------------------------------------------------------------- /maple/policy/static/halfcheetahvel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | done = np.array([False]).repeat(len(obs)) 10 | done = done[:,None] 11 | return done 12 | 13 | @staticmethod 14 | def recompute_reward_fn(obs, act, next_obs, rew): 15 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 16 | new_rew = -(rew + 0.1 * np.sum(np.square(act))) - 0.1 * np.sum(np.square(act)) 17 | return new_rew 18 | -------------------------------------------------------------------------------- /softlearning/replay_pools/extra_policy_info_replay_pool.py: -------------------------------------------------------------------------------- 1 | from .simple_replay_pool import SimpleReplayPool 2 | 3 | 4 | class ExtraPolicyInfoReplayPool(SimpleReplayPool): 5 | def __init__(self, *args, **kwargs): 6 | super(ExtraPolicyInfoReplayPool, self).__init__(*args, **kwargs) 7 | 8 | fields = { 9 | 'raw_actions': { 10 | 'shape': self._action_space.shape, 11 | 'dtype': 'float32' 12 | }, 13 | 'log_pis': { 14 | 'shape': (1, ), 15 | 'dtype': 'float32' 16 | } 17 | } 18 | 19 | self.add_fields(fields) 20 | -------------------------------------------------------------------------------- /softlearning/distributions/squash_bijector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import tensorflow_probability as tfp 4 | 5 | 6 | class SquashBijector(tfp.bijectors.Bijector): 7 | def __init__(self, validate_args=False, name="tanh"): 8 | super(SquashBijector, self).__init__( 9 | forward_min_event_ndims=0, 10 | validate_args=validate_args, 11 | name=name) 12 | 13 | def _forward(self, x): 14 | return tf.nn.tanh(x) 15 | 16 | def _inverse(self, y): 17 | return tf.atanh(y) 18 | 19 | def _forward_log_det_jacobian(self, x): 20 | return 2. * (np.log(2.) - x - tf.nn.softplus(-2. 
* x)) 21 | -------------------------------------------------------------------------------- /rla_scripts/archive_expt.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to archive benchmarking experiments. 3 | 4 | It is convenient to merge the archived experiments and the current task into tensorboard by: 5 | 6 | tensorboard --logdir ./log/your_task/,./log/archived/ 7 | 8 | """ 9 | 10 | from RLA.easy_log.log_tools import ArchiveLogTool 11 | import argparse 12 | from config import * 13 | 14 | def argsparser(): 15 | parser = argparse.ArgumentParser("Archive Log") 16 | # reduce setting 17 | parser.add_argument('--task_table_name', type=str) 18 | parser.add_argument('--regex', type=str) 19 | 20 | args = parser.parse_args() 21 | return args 22 | 23 | if __name__=='__main__': 24 | args = argsparser() 25 | dlt = ArchiveLogTool(proj_root=DATA_ROOT, task_table_name=args.task_table_name, regex=args.regex) 26 | dlt.archive_log() -------------------------------------------------------------------------------- /softlearning/environments/gym/wrappers/normalize_action.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | import numpy as np 4 | 5 | 6 | __all__ = ['NormalizeActionWrapper'] 7 | 8 | 9 | class NormalizeActionWrapper(gym.ActionWrapper): 10 | """Rescale the action space of the environment.""" 11 | 12 | def action(self, action): 13 | if not isinstance(self.env.action_space, spaces.Box): 14 | return action 15 | 16 | # rescale the action 17 | low, high = self.env.action_space.low, self.env.action_space.high 18 | scaled_action = low + (action + 1.0) * (high - low) / 2.0 19 | scaled_action = np.clip(scaled_action, low, high) 20 | 21 | return scaled_action 22 | 23 | def reverse_action(self, action): 24 | raise NotImplementedError 25 | 26 | 27 | normalize = NormalizeActionWrapper 28 | -------------------------------------------------------------------------------- /maple/policy/static/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import importlib 4 | import pdb 5 | 6 | 7 | def import_fns(path, file, fns_name='StaticFns'): 8 | full_path = os.path.join("maple/policy/static", file) 9 | import_path = full_path.replace('/', '.') 10 | module = importlib.import_module(import_path) 11 | fns = getattr(module, fns_name) 12 | return fns 13 | def get_base_path(): 14 | return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 15 | cwd = os.path.join(get_base_path(), 'policy/static') 16 | files = os.listdir(cwd) 17 | ## remove __init__.py 18 | files = filter(lambda x: '__' not in x and x[0] != '.', files) 19 | ## env.py --> env 20 | files = map(lambda x: x.replace('.py', ''), files) 21 | 22 | ## {env: StaticFns, ... } 23 | static_fns = {file: import_fns(cwd, file) for file in files} 24 | 25 | sys.modules[__name__] = static_fns 26 | 27 | -------------------------------------------------------------------------------- /maple/policy/static/hopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from maple.global_config import * 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | if obs.shape[-1] == 12: # In neorl, another dim is inserted to observation space. 
9 | next_obs = next_obs[:, 1:] 10 | height = next_obs[:, 0] 11 | angle = next_obs[:, 1] 12 | # not_done = np.logical_and(np.all(next_obs > -100, axis=-1), 13 | # np.all(next_obs < 100, axis=-1)) * \ 14 | not_done = np.isfinite(next_obs).all(axis=-1) \ 15 | * (np.abs(next_obs) < STATE_CLIP_BOUND).all(axis=-1) \ 16 | * (height > .7) \ 17 | * (np.abs(angle) < .2) 18 | 19 | done = ~not_done 20 | done = done[:,None] 21 | return done 22 | -------------------------------------------------------------------------------- /maple/policy/static/walker2d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from maple.global_config import * 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | if obs.shape[-1] == 18: # In neorl, an extra dim is inserted into the observation space. 9 | # not_done = np.array([True]).repeat(len(obs)) 10 | next_obs = next_obs[:, 1:] 11 | height = next_obs[:, 0] 12 | angle = next_obs[:, 1] 13 | not_done = np.logical_and(np.all(next_obs > -1 * STATE_CLIP_BOUND, axis=-1), np.all(next_obs < STATE_CLIP_BOUND, axis=-1)) \ 14 | * (height > 0.8) \ 15 | * (height < 2.0) \ 16 | * (angle > -1.0) \ 17 | * (angle < 1.0) 18 | done = ~not_done 19 | done = done[:,None] 20 | return done 21 | -------------------------------------------------------------------------------- /rla_config_mopo.yaml: -------------------------------------------------------------------------------- 1 | PROJECT_TYPE: 2 | # lib: back up the project in YOUR_PROJECT_ROOT/build/lib. 3 | # It suits the situation where you run the code by building a package (e.g., "python setup.py install"). 4 | # source: back up the project in YOUR_PROJECT_ROOT/{backup_code_dir}. 5 | # It suits the situation where you run your code directly. 6 | # The backup ignores files that satisfy the rules in IGNORE_RULE (the default value is: YOUR_PROJECT_ROOT/.gitignore) 7 | # and all log files in easy_log. 
8 | backup_code_by: 'source' 9 | 10 | 11 | BACKUP_CONFIG: 12 | lib_dir: './build/lib/' 13 | backup_code_dir: 14 | - './maple' 15 | - './run_scripts' 16 | 17 | LOG_USED: 18 | - 'stdout' 19 | - 'log' 20 | - 'csv' 21 | - 'tensorboard' 22 | 23 | DL_FRAMEWORK: 'tensorflow' 24 | SEND_LOG_FILE: False 25 | 26 | REMOTE_SETTING: 27 | ftp_server: '' 28 | username: '' 29 | password: '' 30 | remote_log_root: '' 31 | -------------------------------------------------------------------------------- /examples/config/d4rl/base_maple.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | base_params = { 3 | 'type': 'MAPLE', 4 | 'universe': 'gym', 5 | 'kwargs': { 6 | 'epoch_length': 1000, 7 | 'train_every_n_steps': 1, 8 | 'n_train_repeat': 1, 9 | 'eval_render_mode': None, 10 | 'eval_n_episodes': 10, 11 | 'eval_deterministic': True, 12 | 13 | 'discount': 0.99, 14 | 'tau': 5e-3, 15 | 'reward_scale': 1.0, 16 | 17 | 'model_train_freq': 1000, 18 | 'model_retain_epochs': 5, 19 | 'rollout_batch_size': 50e3, 20 | 'deterministic': False, 21 | 'num_networks': 7, 22 | 'num_elites': 5, 23 | 'real_ratio': 0.05, 24 | 'target_entropy': -3, 25 | 'max_model_t': None 26 | } 27 | } 28 | 29 | maple_params = deepcopy(base_params) 30 | maple_params['kwargs'].update({ 31 | 'separate_mean_var': True, 32 | 'penalty_learned_var': True, 33 | }) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from setuptools import find_packages 3 | 4 | setup( 5 | name='MAPLE', 6 | packages=find_packages(), 7 | version='0.0.1', 8 | description='Offline Model-based Adaptable Policy Learning', 9 | long_description=open('./README.md').read(), 10 | author='Xiong-Hui Chen, Fan-Ming Luo', 11 | author_email='chenxh@lamda.nju.edu.cn, luofm@lamda.nju.edu.cn', 12 | entry_points={ 13 | 'console_scripts': ( 14 | 'mopo=softlearning.scripts.console_scripts:main', 15 | 'viskit=mopo.scripts.console_scripts:main' 16 | ) 17 | }, 18 | install_requires=[ 19 | "RLA @ git+https://github.com/polixir/RLAssistant.git@main#egg=RLA", 20 | "serializable @ git+https://github.com/hartikainen/serializable.git@76516385a3a716ed4a2a9ad877e2d5cbcf18d4e6#egg=serializable", 21 | 'gtimer', 22 | 'dotmap', 23 | ], 24 | zip_safe=True, 25 | license='MIT' 26 | ) 27 | -------------------------------------------------------------------------------- /examples/config/neorl/base_maple.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | base_params = { 3 | 'type': 'MAPLE', 4 | 'universe': 'gym', 5 | 6 | 'log_dir': './ray_mopo/', # Specify where to write log files here 7 | 8 | 'kwargs': { 9 | 'epoch_length': 1000, 10 | 'train_every_n_steps': 1, 11 | 'n_train_repeat': 1, 12 | 'eval_render_mode': None, 13 | 'eval_n_episodes': 10, 14 | 'eval_deterministic': True, 15 | 16 | 'discount': 0.99, 17 | 'tau': 5e-3, 18 | 'reward_scale': 1.0, 19 | 20 | 'model_train_freq': 1000, 21 | 'model_retain_epochs': 5, 22 | 'rollout_batch_size': 50e3, 23 | 'deterministic': False, 24 | 'num_networks': 7, 25 | 'num_elites': 5, 26 | 'real_ratio': 0.05, 27 | 'target_entropy': -3, 28 | 'max_model_t': None 29 | } 30 | } 31 | 32 | maple_params = deepcopy(base_params) 33 | maple_params['kwargs'].update({ 34 | 'separate_mean_var': True, 35 | 'penalty_learned_var': True, 36 | }) 
-------------------------------------------------------------------------------- /maple/policy/static/halfcheetahjump.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | # done = np.array([False]).repeat(len(obs)) 10 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) 11 | done = ~not_done 12 | done = done[:,None] 13 | # not_done = np.isfinite(next_obs).all(axis=-1) 14 | # done = done[:,None] 15 | return done 16 | 17 | @staticmethod 18 | def recompute_reward_fn(obs, act, next_obs, rew): 19 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 20 | 21 | # new_rew = -(rew + 0.1 * np.sum(np.square(act))) - 0.1 * np.sum(np.square(act)) 22 | new_rew = np.clip(rew + 0.1 * np.sum(np.square(act), axis=-1), None, 3) \ 23 | - 0.1 * np.sum(np.square(act), axis=-1) + 15 * next_obs[..., 0] 24 | return new_rew 25 | -------------------------------------------------------------------------------- /softlearning/environments/utils.py: -------------------------------------------------------------------------------- 1 | from .adapters.gym_adapter import ( 2 | GYM_ENVIRONMENTS, 3 | GymAdapter, 4 | ) 5 | 6 | from maple.env import env_overwrite 7 | 8 | ENVIRONMENTS = { 9 | 'gym': GYM_ENVIRONMENTS, 10 | } 11 | 12 | ADAPTERS = { 13 | 'gym': GymAdapter, 14 | } 15 | 16 | 17 | def get_environment(universe, domain, task, environment_params): 18 | if domain in env_overwrite: 19 | print('[ environments/utils ] WARNING: Using overwritten {} environment'.format(domain)) 20 | env = env_overwrite[domain]() 21 | env = ADAPTERS[universe](None, None, env=env) 22 | else: 23 | env = ADAPTERS[universe](domain, task, **environment_params) 24 | return env 25 | 26 | 27 | def get_environment_from_params(environment_params): 28 | universe = environment_params['universe'] 29 | task = environment_params['task'] 30 | domain = environment_params['domain'] 31 | environment_kwargs = environment_params.get('kwargs', {}).copy() 32 | 33 | return get_environment(universe, domain, task, environment_kwargs) 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Xiong-Hui Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /maple/policy/static/halfcheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from maple.global_config import * 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | # done = np.array([False]).repeat(len(obs)) 10 | if obs.shape[-1] == 18: # neorl 11 | # not_done = np.array([True]).repeat(len(obs)) 12 | next_obs = next_obs[:, 1:] 13 | not_done = np.logical_and(np.all(next_obs >= -1 * STATE_CLIP_BOUND, axis=-1), np.all(next_obs <= STATE_CLIP_BOUND, axis=-1)) 14 | else: 15 | not_done = np.array([True]).repeat(len(obs)) 16 | not_done = np.logical_and(np.all(next_obs > -1 * STATE_CLIP_BOUND, axis=-1), np.all(next_obs < STATE_CLIP_BOUND, axis=-1)) 17 | done = ~not_done 18 | done = done[:,None] 19 | return done 20 | 21 | @staticmethod 22 | def recompute_reward_fn(obs, act, next_obs, rew): 23 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 24 | new_rew = -(rew + 0.1 * np.sum(np.square(act))) - 0.1 * np.sum(np.square(act)) 25 | return new_rew 26 | -------------------------------------------------------------------------------- /softlearning/utils/keras.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class PicklableKerasModel(tf.keras.Model): 7 | def __getstate__(self): 8 | with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd: 9 | tf.keras.models.save_model(self, fd.name, overwrite=True) 10 | model_str = fd.read() 11 | d = {'model_str': model_str} 12 | 13 | return d 14 | 15 | def __setstate__(self, state): 16 | with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd: 17 | fd.write(state['model_str']) 18 | fd.flush() 19 | 20 | loaded_model = tf.keras.models.load_model( 21 | fd.name, custom_objects={ 22 | self.__class__.__name__: self.__class__}) 23 | 24 | self.__dict__.update(loaded_model.__dict__.copy()) 25 | 26 | @classmethod 27 | def from_config(cls, *args, custom_objects=None, **kwargs): 28 | custom_objects = custom_objects or {} 29 | custom_objects[cls.__name__] = cls 30 | custom_objects['tf'] = tf 31 | return super(PicklableKerasModel, cls).from_config( 32 | *args, custom_objects=custom_objects, **kwargs) 33 | -------------------------------------------------------------------------------- /softlearning/replay_pools/replay_pool.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class ReplayPool(object): 5 | """A class used to save and replay data.""" 6 | 7 | @abc.abstractmethod 8 | def add_sample(self, sample): 9 | """Add a transition tuple.""" 10 | pass 11 | 12 | @abc.abstractmethod 13 | def terminate_episode(self): 14 | """Clean up pool after episode termination.""" 15 | pass 16 | 17 | @property 18 | @abc.abstractmethod 19 | def size(self, **kwargs): 20 | pass 21 | 22 | def add_path(self, path): 23 | """Add a rollout to the replay pool. 24 | 25 | This default implementation naively goes through every step, but you 26 | may want to optimize this. 
27 | 28 | NOTE: You should NOT call "terminate_episode" after calling add_path. 29 | It's assumed that this function handles the episode termination. 30 | 31 | :param path: Dict like one outputted by railrl.samplers.util.rollout 32 | """ 33 | self.add_samples(path) 34 | self.terminate_episode() 35 | 36 | @abc.abstractmethod 37 | def random_batch(self, batch_size): 38 | """Return a random batch of size `batch_size`.""" 39 | pass 40 | -------------------------------------------------------------------------------- /examples/config/__init__.py: -------------------------------------------------------------------------------- 1 | params = { 2 | 'type': 'MAPLE', 3 | 'universe': 'gym', 4 | 'domain': 'Hopper', 5 | 'task': 'v2', 6 | 7 | 'log_dir': '~/ray_mopo/', 8 | 'exp_name': 'defaults', 9 | 10 | 'kwargs': { 11 | 'epoch_length': 1000, 12 | 'train_every_n_steps': 1, 13 | 'n_train_repeat': 2, #20, 14 | 'eval_render_mode': None, 15 | 'eval_n_episodes': 10, 16 | 'eval_deterministic': True, 17 | 18 | 'discount': 0.99, 19 | 'tau': 5e-3, 20 | 'reward_scale': 1.0, 21 | #### 22 | 'model_reset_freq': 1000, 23 | 'model_train_freq': 250, # 250 24 | # 'retain_model_epochs': 2, 25 | 'model_pool_size': 2e6, 26 | 'rollout_batch': 100e3, # 40e3 27 | 'rollout_length': 1, 28 | 'deterministic': False, 29 | 'num_networks': 7, 30 | 'num_elites': 5, 31 | 'real_ratio': 0.05, 32 | 'entropy_mult': 0.5, 33 | # 'target_entropy': -1.5, 34 | 'max_model_t': 1e10, 35 | # 'max_dev': 0.25, 36 | # 'marker': 'early-stop_10rep_stochastic', 37 | 'rollout_length_params': [20, 150, 1, 1], ## epoch, loss, length 38 | # 'marker': 'dump', 39 | } 40 | } -------------------------------------------------------------------------------- /maple/policy/static/antangle.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | x = next_obs[:, 0] 10 | not_done = np.isfinite(next_obs).all(axis=-1) \ 11 | * (x >= 0.2) \ 12 | * (x <= 1.0) 13 | not_done2 = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) 14 | not_done = np.logical_and(not_done2, not_done) 15 | done = ~not_done 16 | done = done[:,None] 17 | return done 18 | 19 | @staticmethod 20 | def recompute_reward_fn(obs, act, next_obs, rew): 21 | survive_reward = 1 22 | ctrl_cost = .5 * np.square(act).sum(axis=-1) 23 | xy_velocity = next_obs[..., 111:] 24 | contact_cost = 0.5 * 1e-3 * np.sum((np.square(next_obs[..., 27:111])), axis=-1) 25 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 26 | new_rew = xy_velocity[..., 0] * np.cos(np.pi/6) + xy_velocity[..., 1] * np.sin(np.pi/6) - ctrl_cost - contact_cost + survive_reward 27 | # new_rew = -(rew + 0.1 * np.sum(np.square(act))) - 0.1 * np.sum(np.square(act)) 28 | return new_rew 29 | 30 | -------------------------------------------------------------------------------- /softlearning/value_functions/vanilla.py: -------------------------------------------------------------------------------- 1 | from softlearning.models.feedforward import feedforward_model 2 | 3 | 4 | def create_feedforward_Q_function(observation_shape, 5 | action_shape, 6 | *args, 7 | observation_preprocessor=None, 8 | name='feedforward_Q', 9 | **kwargs): 10 | input_shapes = (observation_shape, action_shape) 11 | preprocessors = (observation_preprocessor, None) 12 | return feedforward_model( 13 | input_shapes, 14 | 
*args, 15 | output_size=1, 16 | preprocessors=preprocessors, 17 | name=name, 18 | **kwargs) 19 | 20 | 21 | def create_feedforward_V_function(observation_shape, 22 | *args, 23 | observation_preprocessor=None, 24 | name='feedforward_V', 25 | **kwargs): 26 | input_shapes = (observation_shape, ) 27 | preprocessors = (observation_preprocessor, None) 28 | return feedforward_model( 29 | input_shapes, 30 | *args, 31 | output_size=1, 32 | preprocessors=preprocessors, 33 | **kwargs) 34 | -------------------------------------------------------------------------------- /rla_scripts/delete_expt.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to delete useless experiment logs matched by a regex string. 3 | 4 | """ 5 | from RLA.easy_log.log_tools import DeleteLogTool, Filter 6 | import argparse 7 | from config import * 8 | 9 | def argsparser(): 10 | parser = argparse.ArgumentParser("Delete Log") 11 | # reduce setting 12 | parser.add_argument('--task_table_name', type=str, default="") 13 | parser.add_argument('--regex', type=str) 14 | parser.add_argument('--timestep_bound', type=int, default=100) 15 | # Filter.ALL: delete all experiments that satisfy the regex 16 | # Filter.SMALL_TIMESTEP: delete all experiments whose names satisfy the regex 17 | # and whose recorded timesteps are less than args.timestep_bound. 18 | parser.add_argument('--delete_type', type=str, default=Filter.ALL) 19 | 20 | args = parser.parse_args() 21 | return args 22 | 23 | if __name__=='__main__': 24 | args = argsparser() 25 | filter = Filter() 26 | filter.config(type=args.delete_type, timstep_bound=args.timestep_bound) 27 | dlt = DeleteLogTool(proj_root=DATA_ROOT, task_table_name=args.task_table_name, regex=args.regex, 28 | filter=filter) 29 | if args.delete_type == Filter.ALL: 30 | dlt.delete_related_log() 31 | elif args.delete_type == Filter.SMALL_TIMESTEP: 32 | dlt.delete_small_timestep_log() 33 | else: 34 | raise NotImplementedError -------------------------------------------------------------------------------- /softlearning/replay_pools/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from . 
import ( 4 | simple_replay_pool, 5 | extra_policy_info_replay_pool, 6 | union_pool, 7 | trajectory_replay_pool) 8 | 9 | 10 | POOL_CLASSES = { 11 | 'SimpleReplayPool': simple_replay_pool.SimpleReplayPool, 12 | 'TrajectoryReplayPool': trajectory_replay_pool.TrajectoryReplayPool, 13 | 'ExtraPolicyInfoReplayPool': ( 14 | extra_policy_info_replay_pool.ExtraPolicyInfoReplayPool), 15 | 'UnionPool': union_pool.UnionPool, 16 | } 17 | 18 | DEFAULT_REPLAY_POOL = 'SimpleReplayPool' 19 | 20 | 21 | def get_replay_pool_from_variant(variant, env, *args, **kwargs): 22 | replay_pool_params = variant['replay_pool_params'] 23 | if isinstance(replay_pool_params["kwargs"]["max_size"], int): 24 | replay_pool_params["kwargs"]["max_size"] = replay_pool_params["kwargs"]["max_size"] 25 | else: 26 | replay_pool_params["kwargs"]["max_size"] = replay_pool_params["kwargs"]["max_size"](variant) 27 | replay_pool_type = replay_pool_params['type'] 28 | replay_pool_kwargs = deepcopy(replay_pool_params['kwargs']) 29 | print('[ DEBUG ]: replay pool config: ', replay_pool_kwargs) 30 | replay_pool = POOL_CLASSES[replay_pool_type]( 31 | *args, 32 | observation_space=env.observation_space, 33 | action_space=env.action_space, 34 | **replay_pool_kwargs, 35 | **kwargs) 36 | 37 | return replay_pool 38 | -------------------------------------------------------------------------------- /maple/env/halfcheetah_vel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, "half_cheetah.xml", 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, action): 12 | xposbefore = self.sim.data.qpos[0] 13 | self.do_simulation(action, self.frame_skip) 14 | xposafter = self.sim.data.qpos[0] 15 | ob = self._get_obs() 16 | reward_ctrl = -0.1 * np.square(action).sum() 17 | reward_run = np.clip((xposafter - xposbefore) / self.dt, None, 3) 18 | reward = reward_ctrl + reward_run 19 | done = False 20 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 21 | 22 | def _get_obs(self): 23 | return np.concatenate( 24 | [ 25 | self.sim.data.qpos.flat[1:], 26 | self.sim.data.qvel.flat, 27 | ] 28 | ) 29 | 30 | def reset_model(self): 31 | qpos = self.init_qpos + self.np_random.uniform( 32 | low=-0.1, high=0.1, size=self.model.nq 33 | ) 34 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * 0.1 35 | self.set_state(qpos, qvel) 36 | return self._get_obs() 37 | 38 | def viewer_setup(self): 39 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /softlearning/replay_pools/union_pool.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .replay_pool import ReplayPool 4 | 5 | 6 | class UnionPool(ReplayPool): 7 | def __init__(self, pools): 8 | pool_sizes = np.array([b.size for b in pools]) 9 | self._total_size = sum(pool_sizes) 10 | self._normalized_pool_sizes = pool_sizes / self._total_size 11 | 12 | self.pools = pools 13 | 14 | def add_sample(self, *args, **kwargs): 15 | raise NotImplementedError 16 | 17 | def terminate_episode(self): 18 | raise NotImplementedError 19 | 20 | @property 21 | def size(self): 22 | return self._total_size 23 | 24 | def add_path(self, **kwargs): 25 | raise NotImplementedError 26 | 27 | def 
random_batch(self, batch_size): 28 | 29 | # TODO: Hack 30 | partial_batch_sizes = self._normalized_pool_sizes * batch_size 31 | partial_batch_sizes = partial_batch_sizes.astype(int) 32 | partial_batch_sizes[0] = batch_size - sum(partial_batch_sizes[1:]) 33 | 34 | partial_batches = [ 35 | pool.random_batch(partial_batch_size) for pool, 36 | partial_batch_size in zip(self.pools, partial_batch_sizes) 37 | ] 38 | 39 | def all_values(key): 40 | return [partial_batch[key] for partial_batch in partial_batches] 41 | 42 | keys = partial_batches[0].keys() 43 | 44 | return {key: np.concatenate(all_values(key), axis=0) for key in keys} 45 | -------------------------------------------------------------------------------- /softlearning/models/feedforward.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | from softlearning.utils.keras import PicklableKerasModel 5 | 6 | 7 | def feedforward_model(input_shapes, 8 | output_size, 9 | hidden_layer_sizes, 10 | activation='relu', 11 | output_activation='linear', 12 | preprocessors=None, 13 | name='feedforward_model', 14 | *args, 15 | **kwargs): 16 | inputs = [ 17 | tf.keras.layers.Input(shape=input_shape) 18 | for input_shape in input_shapes 19 | ] 20 | 21 | if preprocessors is None: 22 | preprocessors = (None, ) * len(inputs) 23 | 24 | preprocessed_inputs = [ 25 | preprocessor(input_) if preprocessor is not None else input_ 26 | for preprocessor, input_ in zip(preprocessors, inputs) 27 | ] 28 | 29 | concatenated = tf.keras.layers.Lambda( 30 | lambda x: tf.concat(x, axis=-1) 31 | )(preprocessed_inputs) 32 | 33 | out = concatenated 34 | for units in hidden_layer_sizes: 35 | out = tf.keras.layers.Dense( 36 | units, *args, activation=activation, **kwargs 37 | )(out) 38 | 39 | out = tf.keras.layers.Dense( 40 | output_size, *args, activation=output_activation, **kwargs 41 | )(out) 42 | 43 | model = PicklableKerasModel(inputs, out, name=name) 44 | 45 | return model 46 | 47 | -------------------------------------------------------------------------------- /maple/env/halfcheetah_jump.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, "half_cheetah.xml", 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, action): 12 | xposbefore = self.sim.data.qpos[0] 13 | self.do_simulation(action, self.frame_skip) 14 | xposafter = self.sim.data.qpos[0] 15 | 16 | ob = self._get_obs() 17 | reward_ctrl = -0.1 * np.square(action).sum() 18 | reward_run = np.clip((xposafter - xposbefore) / self.dt, None, 3) + 15 * (self.sim.data.qpos[1] - self.init_qpos[1]) 19 | reward = reward_ctrl + reward_run 20 | done = False 21 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 22 | 23 | def _get_obs(self): 24 | return np.concatenate( 25 | [ 26 | self.sim.data.qpos.flat[1:], 27 | self.sim.data.qvel.flat, 28 | ] 29 | ) 30 | 31 | def reset_model(self): 32 | qpos = self.init_qpos + self.np_random.uniform( 33 | low=-0.1, high=0.1, size=self.model.nq 34 | ) 35 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * 0.1 36 | self.set_state(qpos, qvel) 37 | self.z_init = self.init_qpos[1] 38 | return self._get_obs() 39 | 40 | def viewer_setup(self): 41 | self.viewer.cam.distance = self.model.stat.extent * 0.5 
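Editor's note: the customized environments under maple/env, such as halfcheetah_jump.py above, keep the standard MuJoCo Gym interface, so they can be exercised directly. A minimal usage sketch, assuming a working gym + mujoco-py setup; the random-action loop is only illustrative and is not part of the repository:
```
# Hedged sketch: roll a few random steps in the modified HalfCheetah "jump" env.
# Per step() above, the reward is a control cost plus forward velocity clipped
# at 3 plus 15 * (current torso height - initial torso height).
from maple.env.halfcheetah_jump import HalfCheetahEnv

env = HalfCheetahEnv()
obs = env.reset()
for _ in range(5):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    print(reward, info['reward_run'], info['reward_ctrl'])
```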
-------------------------------------------------------------------------------- /softlearning/policies/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from softlearning.preprocessors.utils import get_preprocessor_from_params 4 | 5 | 6 | def get_gaussian_policy(env, Q, **kwargs): 7 | from .gaussian_policy import FeedforwardGaussianPolicy 8 | policy = FeedforwardGaussianPolicy( 9 | input_shapes=(env.active_observation_shape, ), 10 | output_shape=env.action_space.shape, 11 | **kwargs) 12 | 13 | return policy 14 | 15 | 16 | def get_uniform_policy(env, *args, **kwargs): 17 | from .uniform_policy import UniformPolicy 18 | policy = UniformPolicy( 19 | input_shapes=(env.active_observation_shape, ), 20 | output_shape=env.action_space.shape) 21 | 22 | return policy 23 | 24 | 25 | POLICY_FUNCTIONS = { 26 | 'GaussianPolicy': get_gaussian_policy, 27 | 'UniformPolicy': get_uniform_policy, 28 | } 29 | 30 | 31 | def get_policy(policy_type, *args, **kwargs): 32 | return POLICY_FUNCTIONS[policy_type](*args, **kwargs) 33 | 34 | 35 | def get_policy_from_variant(variant, env, Qs, *args, **kwargs): 36 | policy_params = variant['policy_params'] 37 | policy_type = policy_params['type'] 38 | policy_kwargs = deepcopy(policy_params['kwargs']) 39 | 40 | preprocessor_params = policy_kwargs.pop('preprocessor_params', None) 41 | preprocessor = get_preprocessor_from_params(env, preprocessor_params) 42 | 43 | policy = POLICY_FUNCTIONS[policy_type](env, *args, 44 | Q=Qs[0], 45 | preprocessor=preprocessor, 46 | **policy_kwargs, 47 | **kwargs) 48 | 49 | return policy 50 | -------------------------------------------------------------------------------- /log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/01/15-38-56-149640 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=88&penalty_clip=20/warn.txt: -------------------------------------------------------------------------------- 1 | [WARN] 0 : sync: start 2 | [WARN] 20 : sync: start 3 | [WARN] 40 : sync: start 4 | [WARN] 60 : sync: start 5 | [WARN] 80 : sync: start 6 | [WARN] 100 : sync: start 7 | [WARN] 120 : sync: start 8 | [WARN] 140 : sync: start 9 | [WARN] 160 : sync: start 10 | [WARN] 180 : sync: start 11 | [WARN] 200 : sync: start 12 | [WARN] 220 : sync: start 13 | [WARN] 240 : sync: start 14 | [WARN] 260 : sync: start 15 | [WARN] 280 : sync: start 16 | [WARN] 300 : sync: start 17 | [WARN] 320 : sync: start 18 | [WARN] 340 : sync: start 19 | [WARN] 360 : sync: start 20 | [WARN] 380 : sync: start 21 | [WARN] 400 : sync: start 22 | [WARN] 420 : sync: start 23 | [WARN] 440 : sync: start 24 | [WARN] 460 : sync: start 25 | [WARN] 480 : sync: start 26 | [WARN] 500 : sync: start 27 | [WARN] 520 : sync: start 28 | [WARN] 540 : sync: start 29 | [WARN] 560 : sync: start 30 | [WARN] 580 : sync: start 31 | [WARN] 600 : sync: start 32 | [WARN] 620 : sync: start 33 | [WARN] 640 : sync: start 34 | [WARN] 660 : sync: start 35 | [WARN] 680 : sync: start 36 | [WARN] 700 : sync: start 37 | [WARN] 720 : sync: start 38 | [WARN] 740 : sync: start 39 | [WARN] 760 : sync: start 40 | [WARN] 780 : sync: start 41 | [WARN] 800 : sync: start 42 | [WARN] 820 : sync: start 43 | [WARN] 840 : sync: start 44 | [WARN] 860 : sync: start 45 | [WARN] 880 : sync: start 46 | [WARN] 900 : sync: start 47 | [WARN] 920 : sync: start 48 | [WARN] 940 : sync: start 49 | [WARN] 960 : sync: start 50 | [WARN] 980 : sync: start 51 | 
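Editor's note: the policy constructors in softlearning/policies/utils.py above are dispatched through the POLICY_FUNCTIONS table and driven by a `variant` dictionary. A minimal sketch of the dispatch, assuming an adapted environment object that exposes `active_observation_shape` and `action_space` (both `env` and `observation_batch` below are placeholders, not defined in the repository):
```
# Hedged sketch of the policy dispatch in softlearning/policies/utils.py.
from softlearning.policies.utils import get_policy

uniform_policy = get_policy('UniformPolicy', env)
actions = uniform_policy.actions_np([observation_batch])  # uniform samples in [-1, 1]
```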
-------------------------------------------------------------------------------- /log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/04/09-30-38-583144 10.83.150.23 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=8&penalty_clip=20/warn.txt: -------------------------------------------------------------------------------- 1 | [WARN] 0 : sync: start 2 | [WARN] 20 : sync: start 3 | [WARN] 40 : sync: start 4 | [WARN] 60 : sync: start 5 | [WARN] 80 : sync: start 6 | [WARN] 100 : sync: start 7 | [WARN] 120 : sync: start 8 | [WARN] 140 : sync: start 9 | [WARN] 160 : sync: start 10 | [WARN] 180 : sync: start 11 | [WARN] 200 : sync: start 12 | [WARN] 220 : sync: start 13 | [WARN] 240 : sync: start 14 | [WARN] 260 : sync: start 15 | [WARN] 280 : sync: start 16 | [WARN] 300 : sync: start 17 | [WARN] 320 : sync: start 18 | [WARN] 340 : sync: start 19 | [WARN] 360 : sync: start 20 | [WARN] 380 : sync: start 21 | [WARN] 400 : sync: start 22 | [WARN] 420 : sync: start 23 | [WARN] 440 : sync: start 24 | [WARN] 460 : sync: start 25 | [WARN] 480 : sync: start 26 | [WARN] 500 : sync: start 27 | [WARN] 520 : sync: start 28 | [WARN] 540 : sync: start 29 | [WARN] 560 : sync: start 30 | [WARN] 580 : sync: start 31 | [WARN] 600 : sync: start 32 | [WARN] 620 : sync: start 33 | [WARN] 640 : sync: start 34 | [WARN] 660 : sync: start 35 | [WARN] 680 : sync: start 36 | [WARN] 700 : sync: start 37 | [WARN] 720 : sync: start 38 | [WARN] 740 : sync: start 39 | [WARN] 760 : sync: start 40 | [WARN] 780 : sync: start 41 | [WARN] 800 : sync: start 42 | [WARN] 820 : sync: start 43 | [WARN] 840 : sync: start 44 | [WARN] 860 : sync: start 45 | [WARN] 880 : sync: start 46 | [WARN] 900 : sync: start 47 | [WARN] 920 : sync: start 48 | [WARN] 940 : sync: start 49 | [WARN] 960 : sync: start 50 | [WARN] 980 : sync: start 51 | -------------------------------------------------------------------------------- /log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/01/15-42-56-985568 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=888&penalty_clip=20/warn.txt: -------------------------------------------------------------------------------- 1 | [WARN] 0 : sync: start 2 | [WARN] 20 : sync: start 3 | [WARN] 40 : sync: start 4 | [WARN] 60 : sync: start 5 | [WARN] 80 : sync: start 6 | [WARN] 100 : sync: start 7 | [WARN] 120 : sync: start 8 | [WARN] 140 : sync: start 9 | [WARN] 160 : sync: start 10 | [WARN] 180 : sync: start 11 | [WARN] 200 : sync: start 12 | [WARN] 220 : sync: start 13 | [WARN] 240 : sync: start 14 | [WARN] 260 : sync: start 15 | [WARN] 280 : sync: start 16 | [WARN] 300 : sync: start 17 | [WARN] 320 : sync: start 18 | [WARN] 340 : sync: start 19 | [WARN] 360 : sync: start 20 | [WARN] 380 : sync: start 21 | [WARN] 400 : sync: start 22 | [WARN] 420 : sync: start 23 | [WARN] 440 : sync: start 24 | [WARN] 460 : sync: start 25 | [WARN] 480 : sync: start 26 | [WARN] 500 : sync: start 27 | [WARN] 520 : sync: start 28 | [WARN] 540 : sync: start 29 | [WARN] 560 : sync: start 30 | [WARN] 580 : sync: start 31 | [WARN] 600 : sync: start 32 | [WARN] 620 : sync: start 33 | [WARN] 640 : sync: start 34 | [WARN] 660 : sync: start 35 | [WARN] 680 : sync: start 36 | [WARN] 700 : sync: start 37 | [WARN] 720 : sync: start 38 | [WARN] 740 : sync: start 39 | [WARN] 760 : sync: start 40 | [WARN] 780 : sync: start 41 | [WARN] 800 : sync: start 42 | [WARN] 820 : sync: start 43 | [WARN] 840 : sync: start 44 | [WARN] 860 : sync: start 45 | [WARN] 
880 : sync: start 46 | [WARN] 900 : sync: start 47 | [WARN] 920 : sync: start 48 | [WARN] 940 : sync: start 49 | [WARN] 960 : sync: start 50 | [WARN] 980 : sync: start 51 | -------------------------------------------------------------------------------- /softlearning/preprocessors/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | 4 | def get_convnet_preprocessor(observation_shape, 5 | name='convnet_preprocessor', 6 | **kwargs): 7 | from .convnet import convnet_preprocessor 8 | preprocessor = convnet_preprocessor( 9 | input_shapes=(observation_shape, ), name=name, **kwargs) 10 | 11 | return preprocessor 12 | 13 | 14 | def get_feedforward_preprocessor(observation_shape, 15 | name='feedforward_preprocessor', 16 | **kwargs): 17 | from softlearning.models.feedforward import feedforward_model 18 | preprocessor = feedforward_model( 19 | input_shapes=(observation_shape, ), name=name, **kwargs) 20 | 21 | return preprocessor 22 | 23 | 24 | PREPROCESSOR_FUNCTIONS = { 25 | 'convnet_preprocessor': get_convnet_preprocessor, 26 | 'feedforward_preprocessor': get_feedforward_preprocessor, 27 | None: lambda *args, **kwargs: None 28 | } 29 | 30 | 31 | def get_preprocessor_from_params(env, preprocessor_params, *args, **kwargs): 32 | if preprocessor_params is None: 33 | return None 34 | 35 | preprocessor_type = preprocessor_params.get('type', None) 36 | preprocessor_kwargs = deepcopy(preprocessor_params.get('kwargs', {})) 37 | 38 | if preprocessor_type is None: 39 | return None 40 | 41 | preprocessor = PREPROCESSOR_FUNCTIONS[ 42 | preprocessor_type]( 43 | env.active_observation_shape, 44 | *args, 45 | **preprocessor_kwargs, 46 | **kwargs) 47 | 48 | return preprocessor 49 | 50 | 51 | def get_preprocessor_from_variant(variant, env, *args, **kwargs): 52 | preprocessor_params = variant['preprocessor_params'] 53 | return get_preprocessor_from_params( 54 | env, preprocessor_params, *args, **kwargs) 55 | -------------------------------------------------------------------------------- /softlearning/value_functions/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from softlearning.preprocessors.utils import get_preprocessor_from_params 4 | from . import vanilla 5 | 6 | 7 | def create_double_value_function(value_fn, *args, **kwargs): 8 | # TODO(hartikainen): The double Q-function should support the same 9 | # interface as the regular ones. Implement the double min-thing 10 | # as a Keras layer. 
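    # [Editor's note: illustrative comment added to the dump; not part of the original file]
    # The "double min-thing" mentioned above is the clipped double-Q trick: this helper
    # builds two independent Q-functions, and downstream code takes their element-wise
    # minimum when forming targets, e.g. (hypothetical sketch)
    #     min_Q = tf.reduce_min(tf.stack([Q1_values, Q2_values], axis=0), axis=0)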
11 | value_fns = tuple(value_fn(*args, **kwargs) for i in range(2)) 12 | return value_fns 13 | 14 | 15 | VALUE_FUNCTIONS = { 16 | 'feedforward_V_function': ( 17 | vanilla.create_feedforward_V_function), 18 | 'double_feedforward_Q_function': lambda *args, **kwargs: ( 19 | create_double_value_function(vanilla.create_feedforward_Q_function, *args, **kwargs)), 20 | } 21 | 22 | 23 | def get_Q_function_from_variant(variant, env, *args, **kwargs): 24 | Q_params = variant['Q_params'] 25 | Q_type = Q_params['type'] 26 | Q_kwargs = deepcopy(Q_params['kwargs']) 27 | 28 | preprocessor_params = Q_kwargs.pop('preprocessor_params', None) 29 | preprocessor = get_preprocessor_from_params(env, preprocessor_params) 30 | 31 | return VALUE_FUNCTIONS[Q_type]( 32 | observation_shape=env.active_observation_shape, 33 | action_shape=env.action_space.shape, 34 | *args, 35 | observation_preprocessor=preprocessor, 36 | **Q_kwargs, 37 | **kwargs) 38 | 39 | 40 | def get_V_function_from_variant(variant, env, *args, **kwargs): 41 | V_params = variant['V_params'] 42 | V_type = V_params['type'] 43 | V_kwargs = deepcopy(V_params['kwargs']) 44 | 45 | preprocessor_params = V_kwargs.pop('preprocessor_params', None) 46 | preprocessor = get_preprocessor_from_params(env, preprocessor_params) 47 | return VALUE_FUNCTIONS[V_type]( 48 | observation_shape=env.active_observation_shape, 49 | *args, 50 | observation_preprocessor=preprocessor, 51 | **V_kwargs, 52 | **kwargs) 53 | -------------------------------------------------------------------------------- /softlearning/value_functions/value_function.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | from serializable import Serializable 5 | 6 | 7 | class SumQFunction(Serializable): 8 | def __init__(self, 9 | observation_shape, 10 | action_shape, 11 | q_functions): 12 | self._Serializable__initialize(locals()) 13 | 14 | self.q_functions = q_functions 15 | 16 | assert len(observation_shape) == 1, observation_shape 17 | self._Do = observation_shape[0] 18 | assert len(action_shape) == 1, action_shape 19 | self._Da = action_shape[0] 20 | 21 | self._observations_ph = tf.placeholder( 22 | tf.float32, shape=(None, self._Do), name='observations') 23 | self._actions_ph = tf.placeholder( 24 | tf.float32, shape=(None, self._Da), name='actions') 25 | 26 | self._output = self.output_for( 27 | self._observations_ph, self._actions_ph, reuse=True) 28 | 29 | def output_for(self, observations, actions, reuse=False): 30 | outputs = [ 31 | qf.output_for(observations, actions, reuse=reuse) 32 | for qf in self.q_functions 33 | ] 34 | output = tf.add_n(outputs) 35 | return output 36 | 37 | def _eval(self, observations, actions): 38 | feeds = { 39 | self._observations_ph: observations, 40 | self._actions_ph: actions 41 | } 42 | 43 | return tf.keras.backend.get_session().run(self._output, feeds) 44 | 45 | def get_param_values(self): 46 | all_values_list = [qf.get_param_values() for qf in self.q_functions] 47 | 48 | return np.concatenate(all_values_list) 49 | 50 | def set_param_values(self, all_values): 51 | param_sizes = [qf.get_param_values().size for qf in self.q_functions] 52 | split_points = np.cumsum(param_sizes)[:-1] 53 | 54 | all_values_list = np.split(all_values, split_points) 55 | 56 | for values, qf in zip(all_values_list, self.q_functions): 57 | qf.set_param_values(values) 58 | -------------------------------------------------------------------------------- /softlearning/policies/uniform_policy.py: 
-------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import tensorflow as tf 4 | 5 | from .base_policy import BasePolicy 6 | 7 | 8 | class UniformPolicy(BasePolicy): 9 | def __init__(self, input_shapes, output_shape, action_range=(-1.0, 1.0)): 10 | super(UniformPolicy, self).__init__() 11 | self._Serializable__initialize(locals()) 12 | 13 | self.inputs = [ 14 | tf.keras.layers.Input(shape=input_shape) 15 | for input_shape in input_shapes 16 | ] 17 | self._action_range = action_range 18 | 19 | x = tf.keras.layers.Lambda( 20 | lambda x: tf.concat(x, axis=-1) 21 | )(self.inputs) 22 | 23 | actions = tf.keras.layers.Lambda( 24 | lambda x: tf.random.uniform( 25 | (tf.shape(x)[0], output_shape[0]), 26 | *action_range) 27 | )(x) 28 | 29 | self.actions_model = tf.keras.Model(self.inputs, actions) 30 | 31 | self.actions_input = tf.keras.Input(shape=output_shape) 32 | 33 | log_pis = tf.keras.layers.Lambda( 34 | lambda x: tf.tile(tf.log([ 35 | (action_range[1] - action_range[0]) / 2.0 36 | ])[None], (tf.shape(x)[0], 1)) 37 | )(self.actions_input) 38 | 39 | self.log_pis_model = tf.keras.Model( 40 | (*self.inputs, self.actions_input), log_pis) 41 | 42 | def get_weights(self): 43 | return [] 44 | 45 | def set_weights(self, *args, **kwargs): 46 | return 47 | 48 | @property 49 | def trainable_variables(self): 50 | return [] 51 | 52 | def reset(self): 53 | pass 54 | 55 | def actions(self, conditions): 56 | return self.actions_model(conditions) 57 | 58 | def log_pis(self, conditions, actions): 59 | return self.log_pis_model([*conditions, actions]) 60 | 61 | def actions_np(self, conditions): 62 | return self.actions_model.predict(conditions) 63 | 64 | def log_pis_np(self, conditions, actions): 65 | return self.log_pis_model.predict([*conditions, actions]) 66 | 67 | def get_diagnostics(self, conditions): 68 | return OrderedDict({}) 69 | -------------------------------------------------------------------------------- /softlearning/samplers/base_sampler.py: -------------------------------------------------------------------------------- 1 | from collections import deque, OrderedDict 2 | from itertools import islice 3 | 4 | 5 | class BaseSampler(object): 6 | def __init__(self, 7 | max_path_length, 8 | min_pool_size, 9 | batch_size, 10 | store_last_n_paths=10): 11 | self._max_path_length = max_path_length 12 | self._min_pool_size = min_pool_size 13 | self._batch_size = batch_size 14 | self._store_last_n_paths = store_last_n_paths 15 | self._last_n_paths = deque(maxlen=store_last_n_paths) 16 | 17 | self.env = None 18 | self.policy = None 19 | self.pool = None 20 | 21 | def initialize(self, env, policy, pool): 22 | self.env = env 23 | self.policy = policy 24 | self.pool = pool 25 | 26 | def set_policy(self, policy): 27 | self.policy = policy 28 | 29 | def clear_last_n_paths(self): 30 | self._last_n_paths.clear() 31 | 32 | def get_last_n_paths(self, n=None): 33 | if n is None: 34 | n = self._store_last_n_paths 35 | 36 | last_n_paths = tuple(islice(self._last_n_paths, None, n)) 37 | 38 | return last_n_paths 39 | 40 | def sample(self): 41 | raise NotImplementedError 42 | 43 | def batch_ready(self): 44 | enough_samples = self.pool.size >= self._min_pool_size 45 | return enough_samples 46 | 47 | def random_batch(self, batch_size=None, **kwargs): 48 | batch_size = batch_size or self._batch_size 49 | return self.pool.random_batch(batch_size, **kwargs) 50 | 51 | def terminate(self): 52 | self.env.close() 53 | 54 | def get_diagnostics(self): 55 | 
diagnostics = OrderedDict({'pool-size': self.pool.size}) 56 | return diagnostics 57 | 58 | def __getstate__(self): 59 | state = { 60 | key: value for key, value in self.__dict__.items() 61 | if key not in ('env', 'policy', 'pool') 62 | } 63 | 64 | return state 65 | 66 | def __setstate__(self, state): 67 | self.__dict__.update(state) 68 | 69 | self.env = None 70 | self.policy = None 71 | self.pool = None 72 | -------------------------------------------------------------------------------- /maple/env/humanoid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import mujoco_env 3 | from gym import utils 4 | 5 | def mass_center(model, sim): 6 | mass = np.expand_dims(model.body_mass, 1) 7 | xpos = sim.data.xipos 8 | return (np.sum(mass * xpos, 0) / np.sum(mass))[0] 9 | 10 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): 11 | def __init__(self): 12 | mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5) 13 | utils.EzPickle.__init__(self) 14 | 15 | def _get_obs(self): 16 | data = self.sim.data 17 | return np.concatenate([data.qpos.flat[2:], 18 | data.qvel.flat, 19 | # data.cinert.flat, 20 | # data.cvel.flat, 21 | # data.qfrc_actuator.flat, 22 | # data.cfrc_ext.flat 23 | ]) 24 | 25 | def step(self, a): 26 | pos_before = mass_center(self.model, self.sim) 27 | self.do_simulation(a, self.frame_skip) 28 | pos_after = mass_center(self.model, self.sim) 29 | alive_bonus = 5.0 30 | data = self.sim.data 31 | lin_vel_cost = 0.25 * (pos_after - pos_before) / self.model.opt.timestep 32 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() 33 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 34 | quad_impact_cost = min(quad_impact_cost, 10) 35 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 36 | qpos = self.sim.data.qpos 37 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 38 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 39 | 40 | def reset_model(self): 41 | c = 0.01 42 | self.set_state( 43 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 44 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 45 | ) 46 | return self._get_obs() 47 | 48 | def viewer_setup(self): 49 | self.viewer.cam.trackbodyid = 1 50 | self.viewer.cam.distance = self.model.stat.extent * 1.0 51 | self.viewer.cam.lookat[2] = 2.0 52 | self.viewer.cam.elevation = -20 53 | 54 | -------------------------------------------------------------------------------- /softlearning/samplers/extra_policy_info_sampler.py: -------------------------------------------------------------------------------- 1 | """Sampler that stores raw actions and log pis from policy.""" 2 | 3 | 4 | from collections import defaultdict 5 | 6 | import numpy as np 7 | 8 | from .simple_sampler import SimpleSampler 9 | 10 | 11 | class ExtraPolicyInfoSampler(SimpleSampler): 12 | def sample(self): 13 | if self._current_observation is None: 14 | self._current_observation = self.env.reset() 15 | 16 | observations = self.env.convert_to_active_observation( 17 | self._current_observation)[None] 18 | actions = self.policy.actions_np([observations]) 19 | log_pis = self.policy.log_pis_np([observations], actions) 20 | 21 | action = actions[0] 22 | log_pi = log_pis[0] 23 | 24 | next_observation, reward, terminal, info = self.env.step(action) 25 | self._path_length += 1 26 | self._path_return += reward 
27 | self._total_samples += 1 28 | 29 | self._current_path['observations'].append(self._current_observation) 30 | self._current_path['actions'].append(action) 31 | self._current_path['rewards'].append([reward]) 32 | self._current_path['terminals'].append([terminal]) 33 | self._current_path['next_observations'].append(next_observation) 34 | self._current_path['infos'].append(info) 35 | # self._current_path['raw_actions'].append(raw_action) 36 | self._current_path['log_pis'].append(log_pi) 37 | 38 | if terminal or self._path_length >= self._max_path_length: 39 | last_path = { 40 | field_name: np.array(values) 41 | for field_name, values in self._current_path.items() 42 | } 43 | self.pool.add_path(last_path) 44 | self._last_n_paths.appendleft(last_path) 45 | 46 | self.policy.reset() 47 | self._current_observation = self.env.reset() 48 | 49 | self._max_path_return = max(self._max_path_return, 50 | self._path_return) 51 | self._last_path_return = self._path_return 52 | 53 | self._path_length = 0 54 | self._path_return = 0 55 | self._current_path = defaultdict(list) 56 | 57 | self._n_episodes += 1 58 | else: 59 | self._current_observation = next_observation 60 | 61 | return self._current_observation, reward, terminal, info 62 | -------------------------------------------------------------------------------- /rla_scripts/start_pretty_plotter.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to start a server of the pretty plotter. 3 | 4 | """ 5 | import os 6 | 7 | from RLA.easy_log.log_tools import PrettyPlotterTool, Filter 8 | import argparse 9 | from RLA.rla_argparser import boolean_flag 10 | from config import * 11 | 12 | from smart_logger.front_page.page import start_page_server 13 | import smart_logger.common.plot_config as plot_config 14 | import smart_logger.common.page_config as page_config 15 | 16 | 17 | def argsparser(): 18 | parser = argparse.ArgumentParser("Delete Log") 19 | # reduce setting 20 | parser.add_argument('--task_table_name', type=str, default="") 21 | parser.add_argument('--regex', type=str) 22 | parser.add_argument('--timestep_bound', type=int, default=100) 23 | parser.add_argument('--delete_type', type=str, default=Filter.ALL) 24 | parser.add_argument('--workspace_path', '-wks', type=str, default='~/.pretty_plotter_cache', 25 | help="Path to the workspace, used to saving cache data") 26 | parser.add_argument('--user_name', '-u', type=str, default='user', 27 | help="user name") 28 | parser.add_argument('--password', '-pw', type=str, default='123456', 29 | help="password") 30 | parser.add_argument('--port', '-p', type=int, default=7005, help="Server port") 31 | boolean_flag(parser, 'start_server', default=False) 32 | args = parser.parse_args() 33 | return args 34 | 35 | if __name__=='__main__': 36 | args = argsparser() 37 | filter = Filter() 38 | filter.config(type=args.delete_type, timstep_bound=args.timestep_bound) 39 | tool = PrettyPlotterTool(proj_root=DATA_ROOT, task_table_name=args.task_table_name, regex=args.regex) 40 | tool.gen_json(args.regex) 41 | if args.start_server: 42 | plot_config.DATA_PATH = os.path.abspath(DATA_ROOT) 43 | page_config.WORKSPAPCE = os.path.abspath(os.path.expanduser(args.workspace_path)) 44 | 45 | plot_config.DATA_MERGER = [] 46 | plot_config.PLOTTING_XY = [] 47 | plot_config.PLOT_LOG_PATH = f"{plot_config.DATA_PATH}" 48 | plot_config.PLOT_FIGURE_SAVING_PATH = f"{os.path.join(os.path.dirname(plot_config.DATA_PATH), 'figure')}" 49 | 50 | page_config.WEB_RAM_PATH = 
f"{page_config.WORKSPAPCE}/WEB_ROM" 51 | page_config.CONFIG_PATH = f"{page_config.WEB_RAM_PATH}/configs" 52 | page_config.FIGURE_PATH = f"{page_config.WEB_RAM_PATH}/figures" 53 | page_config.PORT = args.port 54 | page_config.USER_NAME = args.user_name 55 | page_config.PASSWD = args.password 56 | start_page_server() 57 | -------------------------------------------------------------------------------- /softlearning/preprocessors/convnet.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from softlearning.models.feedforward import feedforward_model 4 | from softlearning.utils.keras import PicklableKerasModel 5 | 6 | 7 | def convnet_preprocessor( 8 | input_shapes, 9 | image_shape, 10 | output_size, 11 | conv_filters=(32, 32), 12 | conv_kernel_sizes=((5, 5), (5, 5)), 13 | pool_type='MaxPool2D', 14 | pool_sizes=((2, 2), (2, 2)), 15 | pool_strides=(2, 2), 16 | dense_hidden_layer_sizes=(64, 64), 17 | data_format='channels_last', 18 | name="convnet_preprocessor", 19 | make_picklable=True, 20 | *args, 21 | **kwargs): 22 | if data_format == 'channels_last': 23 | H, W, C = image_shape 24 | elif data_format == 'channels_first': 25 | C, H, W = image_shape 26 | 27 | inputs = [ 28 | tf.keras.layers.Input(shape=input_shape) 29 | for input_shape in input_shapes 30 | ] 31 | 32 | concatenated_input = tf.keras.layers.Lambda( 33 | lambda x: tf.concat(x, axis=-1) 34 | )(inputs) 35 | 36 | images_flat, input_raw = tf.keras.layers.Lambda( 37 | lambda x: [x[..., :H * W * C], x[..., H * W * C:]] 38 | )(concatenated_input) 39 | 40 | images = tf.keras.layers.Reshape(image_shape)(images_flat) 41 | 42 | conv_out = images 43 | for filters, kernel_size, pool_size, strides in zip( 44 | conv_filters, conv_kernel_sizes, pool_sizes, pool_strides): 45 | conv_out = tf.keras.layers.Conv2D( 46 | filters=filters, 47 | kernel_size=kernel_size, 48 | padding="SAME", 49 | activation=tf.nn.relu, 50 | *args, 51 | **kwargs 52 | )(conv_out) 53 | conv_out = getattr(tf.keras.layers, pool_type)( 54 | pool_size=pool_size, strides=strides 55 | )(conv_out) 56 | 57 | flattened = tf.keras.layers.Flatten()(conv_out) 58 | concatenated_output = tf.keras.layers.Lambda( 59 | lambda x: tf.concat(x, axis=-1) 60 | )([flattened, input_raw]) 61 | 62 | output = ( 63 | feedforward_model( 64 | input_shapes=(concatenated_output.shape[1:].as_list(), ), 65 | output_size=output_size, 66 | hidden_layer_sizes=dense_hidden_layer_sizes, 67 | activation='relu', 68 | output_activation='linear', 69 | *args, 70 | **kwargs 71 | )([concatenated_output]) 72 | if dense_hidden_layer_sizes 73 | else concatenated_output) 74 | 75 | model = PicklableKerasModel(inputs, output, name=name) 76 | 77 | return model 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | venv/ 108 | env.bak/ 109 | venv.bak/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | .dmypy.json 124 | dmypy.json 125 | 126 | # Pyre type checker 127 | .pyre/ 128 | 129 | .idea 130 | 131 | **.npz 132 | luban_start_* 133 | 134 | code/** 135 | checkpoint/** 136 | models/** 137 | results/** 138 | archive_tester/** 139 | log/** 140 | archived/** 141 | .DS_Store -------------------------------------------------------------------------------- /softlearning/misc/plotter.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | class QFPolicyPlotter: 7 | def __init__(self, Q, policy, obs_lst, default_action, n_samples): 8 | self._Q = Q 9 | self._policy = policy 10 | self._obs_lst = obs_lst 11 | self._default_action = np.array(default_action) 12 | self._n_samples = n_samples 13 | 14 | self._var_inds = np.where(np.isnan(default_action))[0] 15 | assert len(self._var_inds) == 2 16 | 17 | n_plots = len(obs_lst) 18 | 19 | x_size = 5 * n_plots 20 | y_size = 5 21 | 22 | fig = plt.figure(figsize=(x_size, y_size)) 23 | self._ax_lst = [] 24 | for i in range(n_plots): 25 | ax = fig.add_subplot(100 + n_plots * 10 + i + 1) 26 | ax.set_xlim((-1, 1)) 27 | ax.set_ylim((-1, 1)) 28 | ax.grid(True) 29 | self._ax_lst.append(ax) 30 | 31 | self._line_objects = list() 32 | 33 | def draw(self): 34 | # noinspection PyArgumentList 35 | [h.remove() for h in self._line_objects] 36 | self._line_objects = list() 37 | 38 | self._plot_level_curves() 39 | self._plot_action_samples() 40 | 41 | plt.draw() 42 | plt.pause(0.001) 43 | 44 | def _plot_level_curves(self): 45 | # Create mesh grid. 
46 | xs = np.linspace(-1, 1, 50) 47 | ys = np.linspace(-1, 1, 50) 48 | xgrid, ygrid = np.meshgrid(xs, ys) 49 | N = len(xs)*len(ys) 50 | 51 | # Copy default values along the first axis and replace nans with 52 | # the mesh grid points. 53 | actions = np.tile(self._default_action.astype(np.float32), (N, 1)) 54 | actions[:, self._var_inds[0]] = xgrid.ravel() 55 | actions[:, self._var_inds[1]] = ygrid.ravel() 56 | 57 | for ax, obs in zip(self._ax_lst, self._obs_lst): 58 | observations = np.tile( 59 | obs[None].astype(np.float32), (actions.shape[0], 1)) 60 | 61 | Q_np = self._Q.predict((observations, actions)) 62 | Q_np = np.reshape(Q_np, xgrid.shape) 63 | 64 | cs = ax.contour(xgrid, ygrid, Q_np, 20) 65 | self._line_objects += cs.collections 66 | self._line_objects += ax.clabel( 67 | cs, inline=1, fontsize=10, fmt='%.2f') 68 | 69 | def _plot_action_samples(self): 70 | for ax, obs in zip(self._ax_lst, self._obs_lst): 71 | observations = np.ones((self._n_samples, 1)) * obs[None, :] 72 | actions = self._policy.actions_np([observations]) 73 | 74 | x, y = actions[:, 0], actions[:, 1] 75 | self._line_objects += ax.plot(x, y, 'b*') 76 | -------------------------------------------------------------------------------- /maple/env/ant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | # def __init__(self, goal=15.0/180*np.pi): 7 | def __init__(self, goal=30.0/180*np.pi): 8 | self._goal = goal 9 | mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 5) 10 | utils.EzPickle.__init__(self) 11 | 12 | def step(self, a): 13 | # self.render() 14 | xy_position_before = self.get_body_com("torso")[:2].copy() 15 | self.do_simulation(a, self.frame_skip) 16 | xy_position_after = self.get_body_com("torso")[:2].copy() 17 | direct = (np.cos(self._goal), np.sin(self._goal)) 18 | 19 | xy_velocity = (xy_position_after - xy_position_before) / self.dt 20 | x_velocity, y_velocity = xy_velocity 21 | 22 | # xposbefore = self.get_body_com("torso")[0] 23 | # self.do_simulation(a, self.frame_skip) 24 | # xposafter = self.get_body_com("torso")[0] 25 | 26 | # forward_reward = (xposafter - xposbefore)/self.dt 27 | forward_reward = x_velocity 28 | angle_reward = np.dot(np.array(xy_velocity), direct) 29 | ctrl_cost = .5 * np.square(a).sum() 30 | contact_cost = 0.5 * 1e-3 * np.sum( 31 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 32 | survive_reward = 1.0 33 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward 34 | state = self.state_vector() 35 | notdone = np.isfinite(state).all() \ 36 | and state[2] >= 0.2 and state[2] <= 1.0 37 | done = not notdone 38 | ob = self._get_obs(xy_velocity) 39 | return ob, reward, done, dict( 40 | reward_ctrl=-ctrl_cost, 41 | reward_contact=-contact_cost, 42 | reward_survive=survive_reward, 43 | reward_forward=forward_reward, 44 | reward_angle=angle_reward, 45 | x_position=xy_position_after[0], 46 | y_position=xy_position_after[1]) 47 | 48 | def _get_obs(self, xy_velocity): 49 | return np.concatenate([ 50 | # self.get_body_com("torso")[:2].copy(), 51 | self.sim.data.qpos.flat[2:], 52 | self.sim.data.qvel.flat, 53 | np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 54 | np.array(xy_velocity), 55 | ]) 56 | 57 | def reset_model(self): 58 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 59 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 60 | 
self.set_state(qpos, qvel) 61 | return self._get_obs([0,0]) 62 | 63 | def viewer_setup(self): 64 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /maple/env/ant_angle.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | # def __init__(self, goal=15.0/180*np.pi): 7 | def __init__(self, goal=30.0/180*np.pi): 8 | self._goal = goal 9 | mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 5) 10 | utils.EzPickle.__init__(self) 11 | 12 | def step(self, a): 13 | # self.render() 14 | xy_position_before = self.get_body_com("torso")[:2].copy() 15 | self.do_simulation(a, self.frame_skip) 16 | xy_position_after = self.get_body_com("torso")[:2].copy() 17 | direct = (np.cos(self._goal), np.sin(self._goal)) 18 | 19 | xy_velocity = (xy_position_after - xy_position_before) / self.dt 20 | x_velocity, y_velocity = xy_velocity 21 | 22 | # xposbefore = self.get_body_com("torso")[0] 23 | # self.do_simulation(a, self.frame_skip) 24 | # xposafter = self.get_body_com("torso")[0] 25 | 26 | # forward_reward = (xposafter - xposbefore)/self.dt 27 | forward_reward = x_velocity 28 | angle_reward = np.dot(np.array(xy_velocity), direct) 29 | ctrl_cost = .5 * np.square(a).sum() 30 | contact_cost = 0.5 * 1e-3 * np.sum( 31 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 32 | survive_reward = 1.0 33 | reward = x_velocity * np.cos(np.pi/6) + y_velocity * np.sin(np.pi/6) - ctrl_cost - contact_cost + survive_reward 34 | state = self.state_vector() 35 | notdone = np.isfinite(state).all() \ 36 | and state[2] >= 0.2 and state[2] <= 1.0 37 | done = not notdone 38 | ob = self._get_obs(xy_velocity) 39 | return ob, reward, done, dict( 40 | reward_ctrl=-ctrl_cost, 41 | reward_contact=-contact_cost, 42 | reward_survive=survive_reward, 43 | reward_forward=forward_reward, 44 | reward_angle=angle_reward, 45 | x_position=xy_position_after[0], 46 | y_position=xy_position_after[1]) 47 | 48 | def _get_obs(self, xy_velocity): 49 | return np.concatenate([ 50 | # self.get_body_com("torso")[:2].copy(), 51 | self.sim.data.qpos.flat[2:], 52 | self.sim.data.qvel.flat, 53 | np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 54 | np.array(xy_velocity), 55 | ]) 56 | 57 | def reset_model(self): 58 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 59 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 60 | self.set_state(qpos, qvel) 61 | return self._get_obs([0, 0]) 62 | 63 | def viewer_setup(self): 64 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /softlearning/algorithms/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | 4 | def create_SAC_algorithm(variant, *args, **kwargs): 5 | from .sac import SAC 6 | 7 | algorithm = SAC(*args, **kwargs) 8 | 9 | return algorithm 10 | 11 | 12 | def create_SQL_algorithm(variant, *args, **kwargs): 13 | from .sql import SQL 14 | 15 | algorithm = SQL(*args, **kwargs) 16 | 17 | return algorithm 18 | 19 | def create_MVE_algorithm(variant, *args, **kwargs): 20 | from .mve_sac import MVESAC 21 | 22 | algorithm = MVESAC(*args, **kwargs) 23 | 24 | return algorithm 25 | 26 | def create_MOPO_algorithm(variant, *args, **kwargs): 27 | from 
mopo.algorithms.mopo import MOPO 28 | 29 | algorithm = MOPO(*args, **kwargs) 30 | 31 | return algorithm 32 | 33 | 34 | ALGORITHM_CLASSES = { 35 | 'SAC': create_SAC_algorithm, 36 | 'SQL': create_SQL_algorithm, 37 | 'MOPO': create_MOPO_algorithm, 38 | } 39 | 40 | 41 | def get_algorithm_from_variant(variant, *args, **kwargs): 42 | algorithm_params = variant['algorithm_params'] 43 | algorithm_type = algorithm_params['type'] 44 | algorithm_kwargs = deepcopy(algorithm_params['kwargs']) 45 | exp_name = variant['algorithm_params']["exp_name"] 46 | # vae = variant['use_vae'] 47 | retrain_model = variant['retrain_model'] 48 | exp_name = exp_name.replace('_', '-') 49 | if algorithm_kwargs['separate_mean_var']: 50 | exp_name += '_smv' 51 | algorithm_kwargs["model_name"] = exp_name + '_1_{}'.format(variant['model_suffix']) 52 | algorithm_kwargs["tester"] = kwargs['tester'] 53 | if variant['length'] > 0: 54 | algorithm_kwargs['rollout_length'] = variant['length'] 55 | if variant['penalty_coeff'] >= 0: 56 | algorithm_kwargs['penalty_coeff'] = variant['penalty_coeff'] 57 | if variant['elite_num'] > 0: 58 | algorithm_kwargs['num_elites'] = variant['elite_num'] 59 | algorithm_kwargs['fix_env'] = variant['fix_env'] 60 | kwargs = {**kwargs, **algorithm_kwargs.toDict()} 61 | kwargs['vae'] = variant['use_vae'] 62 | kwargs['clip_state'] = not variant["no_clip_state"] 63 | kwargs['res_dyn'] = variant["res_dyn"] 64 | kwargs['norm_input'] = not variant["no_norm_input"] 65 | kwargs['seed'] = variant['run_params']['seed'] 66 | # kwargs['load_task_name'] = variant['load_task_name'] 67 | # kwargs['load_date'] = variant['load_date'] 68 | print("[ DEBUG ]: kwargs to net is {}".format(kwargs)) 69 | # if retrain_model: 70 | # print('[ DEBUG ] retraining model... ') 71 | # print(kwargs) 72 | # kwargs['model_load_dir'] = None 73 | kwargs['retrain'] = retrain_model 74 | kwargs['network_kwargs']['embedding_size'] = variant['emb_size'] 75 | kwargs['n_epochs'] = variant['n_epochs'] 76 | kwargs['source'] = variant['config'].split('.')[-2] 77 | algorithm = ALGORITHM_CLASSES[algorithm_type](variant, *args, **kwargs) 78 | return algorithm 79 | -------------------------------------------------------------------------------- /softlearning/samplers/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | 5 | from softlearning import replay_pools 6 | from . 
import ( 7 | dummy_sampler, 8 | extra_policy_info_sampler, 9 | # remote_sampler, 10 | base_sampler, 11 | simple_sampler) 12 | 13 | 14 | def get_sampler_from_variant(variant, *args, **kwargs): 15 | SAMPLERS = { 16 | 'DummySampler': dummy_sampler.DummySampler, 17 | 'ExtraPolicyInfoSampler': ( 18 | extra_policy_info_sampler.ExtraPolicyInfoSampler), 19 | # 'RemoteSampler': remote_sampler.RemoteSampler, 20 | 'Sampler': base_sampler.BaseSampler, 21 | 'SimpleSampler': simple_sampler.SimpleSampler, 22 | } 23 | 24 | sampler_params = variant['sampler_params'] 25 | sampler_type = sampler_params['type'] 26 | 27 | sampler_args = deepcopy(sampler_params.get('args', ())) 28 | sampler_kwargs = deepcopy(sampler_params.get('kwargs', {})) 29 | 30 | sampler = SAMPLERS[sampler_type]( 31 | *sampler_args, *args, **sampler_kwargs, **kwargs) 32 | 33 | return sampler 34 | 35 | 36 | def rollout(env, 37 | policy, 38 | path_length, 39 | callback=None, 40 | render_mode=None, 41 | break_on_terminal=True): 42 | observation_space = env.observation_space 43 | action_space = env.action_space 44 | 45 | pool = replay_pools.SimpleReplayPool( 46 | observation_space, action_space, max_size=path_length) 47 | sampler = simple_sampler.SimpleSampler( 48 | max_path_length=path_length, 49 | min_pool_size=None, 50 | batch_size=None) 51 | 52 | sampler.initialize(env, policy, pool) 53 | 54 | images = [] 55 | infos = [] 56 | 57 | t = 0 58 | for t in range(path_length): 59 | observation, reward, terminal, info = sampler.sample() 60 | infos.append(info) 61 | 62 | if callback is not None: 63 | callback(observation) 64 | 65 | if render_mode is not None: 66 | if render_mode == 'rgb_array': 67 | image = env.render(mode=render_mode) 68 | # import pdb; pdb.set_trace() 69 | # image = env._env.sim.render(mode='offscreen') 70 | images.append(image) 71 | else: 72 | env.render() 73 | 74 | if terminal: 75 | sampler.reset_policy() 76 | if break_on_terminal: break 77 | 78 | assert pool._size == t + 1 79 | 80 | path = pool.batch_by_indices( 81 | np.arange(pool._size), 82 | observation_keys=getattr(env, 'observation_keys', None)) 83 | path['infos'] = infos 84 | 85 | if render_mode == 'rgb_array': 86 | path['images'] = np.stack(images, axis=0) 87 | 88 | return path 89 | 90 | 91 | def rollouts(n_paths, *args, **kwargs): 92 | paths = [rollout(*args, **kwargs) for i in range(n_paths)] 93 | return paths 94 | -------------------------------------------------------------------------------- /maple/models/constructor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from maple.models.fc import FC 5 | from maple.models.bnn import BNN 6 | 7 | def construct_model(obs_dim=11, act_dim=3, rew_dim=1, hidden_dim=200, num_networks=7, 8 | num_elites=5, session=None, model_type='mlp', separate_mean_var=False, 9 | name=None, load_dir=None, deterministic=False, source=None): 10 | if name is None: 11 | name = 'BNN' 12 | print('[ BNN ] Name {} | Observation dim {} | Action dim: {} | Hidden dim: {}'.format(name, obs_dim, act_dim, hidden_dim)) 13 | params = {'name': name, 'num_networks': num_networks, 'num_elites': num_elites, 14 | 'sess': session, 'separate_mean_var': separate_mean_var, 'deterministic': deterministic, 15 | 'obs_dim': obs_dim, 'source': source} 16 | 17 | if load_dir is not None: 18 | print('Specified load dir', load_dir) 19 | params['model_dir'] = load_dir 20 | 21 | model = BNN(params) 22 | 23 | if not model.model_loaded: 24 | if model_type == 'identity': 25 | return 26 | 
elif model_type == 'linear': 27 | print('[ BNN ] Training linear model') 28 | model.add(FC(obs_dim+rew_dim, input_dim=obs_dim+act_dim, weight_decay=0.000025)) 29 | elif model_type == 'mlp': 30 | print('[ BNN ] Training non-linear model | Obs: {} | Act: {} | Rew: {}'.format(obs_dim, act_dim, rew_dim)) 31 | model.add(FC(hidden_dim, input_dim=obs_dim+act_dim, activation="swish", weight_decay=0.000025)) 32 | model.add(FC(hidden_dim, activation="swish", weight_decay=0.00005)) 33 | model.add(FC(hidden_dim, activation="swish", weight_decay=0.000075)) 34 | model.add(FC(hidden_dim, activation="swish", weight_decay=0.000075)) 35 | model.add(FC(obs_dim+rew_dim, weight_decay=0.0001)) 36 | if separate_mean_var: 37 | model.add(FC(obs_dim+rew_dim, input_dim=hidden_dim, weight_decay=0.0001), var_layer=True) 38 | 39 | if load_dir is not None: 40 | model.model_loaded = True 41 | if source == 'd4rl': 42 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 43 | elif source == 'neorl': 44 | model.finalize(tf.contrib.opt.AdamWOptimizer, {"learning_rate": 0.001, "weight_decay":0.000075}) 45 | print('[ BNN ] Model: {}'.format(model)) 46 | return model 47 | 48 | 49 | def format_samples_for_training(samples): 50 | # terminals = samples["terminals"] 51 | # terminals_idx = np.where(terminals)[0] 52 | obs = samples['observations'] 53 | act = samples['actions'] 54 | next_obs = samples['next_observations'] 55 | # next_obs[terminals_idx] = obs[terminals_idx] 56 | rew = samples['rewards'] 57 | delta_obs = next_obs - obs 58 | inputs = np.concatenate((obs, act), axis=-1) 59 | outputs = np.concatenate((rew, delta_obs), axis=-1) 60 | 61 | return inputs, outputs 62 | 63 | 64 | def reset_model(model): 65 | model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=model.name) 66 | model.sess.run(tf.initialize_vars(model_vars)) 67 | 68 | if __name__ == '__main__': 69 | model = construct_model() 70 | -------------------------------------------------------------------------------- /softlearning/misc/kernel.py: -------------------------------------------------------------------------------- 1 | from distutils.version import LooseVersion 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | 7 | def adaptive_isotropic_gaussian_kernel(xs, ys, h_min=1e-3): 8 | """Gaussian kernel with dynamic bandwidth. 9 | 10 | The bandwidth is adjusted dynamically to match median_distance / log(Kx). 11 | See [2] for more information. 12 | 13 | Args: 14 | xs(`tf.Tensor`): A tensor of shape (N x Kx x D) containing N sets of Kx 15 | particles of dimension D. This is the first kernel argument. 16 | ys(`tf.Tensor`): A tensor of shape (N x Ky x D) containing N sets of Kx 17 | particles of dimension D. This is the second kernel argument. 18 | h_min(`float`): Minimum bandwidth. 19 | 20 | Returns: 21 | `dict`: Returned dictionary has two fields: 22 | 'output': A `tf.Tensor` object of shape (N x Kx x Ky) representing 23 | the kernel matrix for inputs `xs` and `ys`. 24 | 'gradient': A 'tf.Tensor` object of shape (N x Kx x Ky x D) 25 | representing the gradient of the kernel with respect to `xs`. 26 | 27 | Reference: 28 | [2] Qiang Liu,Dilin Wang, "Stein Variational Gradient Descent: A General 29 | Purpose Bayesian Inference Algorithm," Neural Information Processing 30 | Systems (NIPS), 2016. 31 | """ 32 | Kx, D = xs.get_shape().as_list()[-2:] 33 | Ky, D2 = ys.get_shape().as_list()[-2:] 34 | assert D == D2 35 | 36 | leading_shape = tf.shape(xs)[:-2] 37 | 38 | # Compute the pairwise distances of left and right particles. 
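    # [Editor's note: illustrative comment added to the dump; not part of the original file]
    # Shape bookkeeping: xs is (..., Kx, D) and ys is (..., Ky, D); the expand_dims
    # calls below give (..., Kx, 1, D) and (..., 1, Ky, D), so the subtraction
    # broadcasts to (..., Kx, Ky, D) and dist_sq reduces the last axis to (..., Kx, Ky).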
39 | diff = tf.expand_dims(xs, -2) - tf.expand_dims(ys, -3) 40 | # ... x Kx x Ky x D 41 | 42 | if LooseVersion(tf.__version__) <= LooseVersion('1.5.0'): 43 | dist_sq = tf.reduce_sum(diff**2, axis=-1, keep_dims=False) 44 | else: 45 | dist_sq = tf.reduce_sum(diff**2, axis=-1, keepdims=False) 46 | # ... x Kx x Ky 47 | 48 | # Get median. 49 | input_shape = tf.concat((leading_shape, [Kx * Ky]), axis=0) 50 | values, _ = tf.nn.top_k( 51 | input=tf.reshape(dist_sq, input_shape), 52 | k=(Kx * Ky // 2 + 1), # This is exactly true only if Kx*Ky is odd. 53 | sorted=True) # ... x floor(Ks*Kd/2) 54 | 55 | medians_sq = values[..., -1] # ... (shape) (last element is the median) 56 | 57 | h = medians_sq / np.log(Kx) # ... (shape) 58 | h = tf.maximum(h, h_min) 59 | h = tf.stop_gradient(h) # Just in case. 60 | h_expanded_twice = tf.expand_dims(tf.expand_dims(h, -1), -1) 61 | # ... x 1 x 1 62 | 63 | kappa = tf.exp(-dist_sq / h_expanded_twice) # ... x Kx x Ky 64 | 65 | # Construct the gradient 66 | h_expanded_thrice = tf.expand_dims(h_expanded_twice, -1) 67 | # ... x 1 x 1 x 1 68 | kappa_expanded = tf.expand_dims(kappa, -1) # ... x Kx x Ky x 1 69 | 70 | kappa_grad = -2 * diff / h_expanded_thrice * kappa_expanded 71 | # ... x Kx x Ky x D 72 | 73 | return {"output": kappa, "gradient": kappa_grad} 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MAPLE 2 | The Official Code for "[MAPLE: Offline Model-based Adaptable Policy Learning](https://proceedings.neurips.cc/paper/2021/hash/470e7a4f017a5476afb7eeb3f8b96f9b-Abstract.html)". 3 | ![](./resources/poster.png) 4 | After the paper was accepted at NeurIPS'21, we additionally conducted experiments on [NeoRL](https://arxiv.org/abs/2102.00714). The results can be found in the following table. 5 | ![](./resources/neorl-maple.png) 6 | \* In this process, we introduced some of the implementation tricks from [the NeoRL version of MOPO](https://agit.ai/Polixir/OfflineRL/src/branch/master/offlinerl) into MAPLE, which also makes the training process of MAPLE more stable on NeoRL tasks and keeps (or further improves) the performance on D4RL. 7 | 8 | (The PyTorch version of MAPLE can also be found at: https://github.com/polixir/OfflineRL) 9 | 10 | # [optional] Download Resources 11 | 12 | For better reproducibility, we uploaded a backup of the datasets used in our experiments, since we found that the contents of the datasets in [D4RL](https://github.com/rail-berkeley/d4rl) and [NeoRL](https://github.com/polixir/NeoRL) might change. 13 | - D4RL: https://drive.google.com/drive/folders/1kgNg6xLHRTyb_tzDQULezB9XYGNuakCM?usp=sharing 14 | - NeoRL: https://drive.google.com/drive/folders/1gZdVQTY_7FLCFGqszHF9sfKcXT8epoze?usp=sharing 15 | 16 | After downloading, place the D4RL data in ~/.d4rl/datasets and the NeoRL data in {your path to MAPLE}/neorl_data/ 17 | 18 | We have also uploaded the dynamics models used for MAPLE-200 and MAPLE-NeoRL (which have 50 ensemble models) training; they can be found at: https://drive.google.com/drive/folders/1Ex9_RyJsafKaU2Eo5UgD34ZqJnJ25cru?usp=sharing. 19 | You can download the models to {path to MAPLE}/models to skip the dynamics model training process. 20 | 21 | # Installation 22 | 23 | We use [RLAssistant](https://github.com/xionghuichen/RLAssistant) to manage our experiments. You can download and install it via: 24 | ``` 25 | git clone https://github.com/xionghuichen/RLAssistant.git 26 | cd RLAssistant 27 | pip install -e . 
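# (Editor's note, illustrative) optional sanity check after installing; this assumes
# the package is importable as `RLA`, the name used by rla_scripts/delete_expt.py
python -c "from RLA.easy_log.log_tools import DeleteLogTool, Filter"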
28 | ``` 29 | Then you can install MAPLE via: 30 | ``` 31 | git clone https://github.com/xionghuichen/MAPLE.git 32 | cd MAPLE 33 | pip install -e . 34 | ``` 35 | 36 | # Quick Start 37 | 38 | You can train your MAPLE policy directly like this: 39 | ``` 40 | cd run_scripts 41 | # train the MAPLE policy for the hopper_low task in neorl 42 | python main.py --config examples.config.neorl.hopper_low 43 | # or 44 | 45 | # train the MAPLE policy for the walker2d_medium_expert task in d4rl 46 | python main.py --config examples.config.d4rl.walker2d_medium_expert 47 | 48 | # train the MAPLE policy for the walker2d_medium_expert task in d4rl with 200 dynamics models 49 | python main.py --config examples.config.d4rl.walker2d_medium_expert --maple_200 50 | 51 | 52 | # train the MAPLE policy for the walker2d_medium_expert task in d4rl with your custom configs 53 | python main.py --config examples.config.d4rl.walker2d_medium_expert --custom_config --penalty_coeff 1.0 54 | ``` 55 | 56 | The training logs can be found in {your MAPLE path}/log. You can use tensorboard to check them, and you can also use the tools in RLA to visualize them (e.g., ```RLA.easy_plot.plot_func.plot_res_func```). 57 | You can check plot_demo.ipynb for more details. The figure for the simplest setting will look something like this: 58 | 59 | ![](./resources/plot_demo.png) 60 | 61 | There are also some scripts in ``./rla_scripts`` to manage the experimental logs. 62 | -------------------------------------------------------------------------------- /run_scripts/utils.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import argparse 3 | from distutils.util import strtobool 4 | import json 5 | 6 | # from ray.tune import sample_from 7 | 8 | import softlearning.algorithms.utils as alg_utils 9 | import softlearning.environments.utils as env_utils 10 | from softlearning.misc.utils import datetimestamp 11 | 12 | 13 | DEFAULT_UNIVERSE = 'gym' 14 | DEFAULT_DOMAIN = 'HalfCheetah' 15 | DEFAULT_TASK = 'v2' 16 | 17 | class AlgType(object): 18 | MAPLE_NEORL = 'maple_neorl' 19 | MAPLE_D4RL = 'maple_d4rl' 20 | MAPLE_D4RL_200 = 'maple_d4rl_200' 21 | 22 | 23 | TASKS_BY_DOMAIN_BY_UNIVERSE = { 24 | universe: { 25 | domain: tuple(tasks) 26 | for domain, tasks in domains.items() 27 | } 28 | for universe, domains in env_utils.ENVIRONMENTS.items() 29 | } 30 | 31 | AVAILABLE_TASKS = set(sum( 32 | [ 33 | tasks 34 | for universe, domains in TASKS_BY_DOMAIN_BY_UNIVERSE.items() 35 | for domain, tasks in domains.items() 36 | ], 37 | ())) 38 | 39 | DOMAINS_BY_UNIVERSE = { 40 | universe: tuple(domains) 41 | for universe, domains in env_utils.ENVIRONMENTS.items() 42 | } 43 | 44 | AVAILABLE_DOMAINS = set(sum(DOMAINS_BY_UNIVERSE.values(), ())) 45 | 46 | UNIVERSES = tuple(env_utils.ENVIRONMENTS) 47 | 48 | AVAILABLE_ALGORITHMS = set(alg_utils.ALGORITHM_CLASSES.keys()) 49 | 50 | 51 | 52 | 53 | 54 | def get_parser(allow_policy_list=False): 55 | parser = argparse.ArgumentParser() 56 | 57 | # parser.add_argument( 58 | # '--universe', 59 | # type=str, 60 | # choices=UNIVERSES, 61 | # default=DEFAULT_UNIVERSE) 62 | # parser.add_argument( 63 | # '--domain', 64 | # type=str, 65 | # choices=AVAILABLE_DOMAINS, 66 | # default=DEFAULT_DOMAIN) 67 | parser.add_argument( 68 | '--config', 69 | type=str, 70 | default='examples.config.d4rl.halfcheetah_medium_expert' 71 | ) 72 | parser.add_argument( 73 | '--info', type=str, default='default_info') 74 | parser.add_argument('--length', type=int, default=-1) 75 | parser.add_argument('--penalty_clip', type=float, 
default=20) 76 | parser.add_argument('--elite_num', type=int, default=-1) 77 | parser.add_argument( '--seed', type=int, default=88) 78 | parser.add_argument( '--n_epochs', type=int, default=1000) 79 | parser.add_argument( 80 | '--penalty_coeff', type=float, default=-1.0) 81 | parser.add_argument( 82 | '--emb_size', type=int, default=16) 83 | parser.add_argument( 84 | '--model_suffix', type=int, default=-1) 85 | parser.add_argument('--loaded_date', type=str, default='') 86 | parser.add_argument('--loaded_task_name', type=str, default='') 87 | parser.add_argument('--not_inherit_hp', action='store_false') 88 | parser.add_argument('--maple_200', action='store_true') 89 | parser.add_argument('--custom_config', action='store_true') 90 | parser.add_argument('--retrain_model', action='store_true') 91 | 92 | if allow_policy_list: 93 | parser.add_argument( 94 | '--policy', 95 | type=str, 96 | nargs='+', 97 | choices=('gaussian', ), 98 | default='gaussian') 99 | else: 100 | parser.add_argument( 101 | '--policy', 102 | type=str, 103 | choices=('gaussian', ), 104 | default='gaussian') 105 | 106 | 107 | 108 | return parser 109 | 110 | 111 | -------------------------------------------------------------------------------- /softlearning/policies/base_policy.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from collections import OrderedDict 3 | 4 | import numpy as np 5 | from serializable import Serializable 6 | 7 | 8 | class BasePolicy(Serializable): 9 | def __init__(self): 10 | self._deterministic = False 11 | 12 | def reset(self): 13 | """Reset and clean the policy.""" 14 | raise NotImplementedError 15 | 16 | def actions(self, conditions): 17 | """Compute (symbolic) actions given conditions (observations)""" 18 | raise NotImplementedError 19 | 20 | def log_pis(self, conditions, actions): 21 | """Compute (symbolic) log probs for given observations and actions.""" 22 | raise NotImplementedError 23 | 24 | def actions_np(self, conditions): 25 | """Compute (numeric) actions given conditions (observations)""" 26 | raise NotImplementedError 27 | 28 | def log_pis_np(self, conditions, actions): 29 | """Compute (numeric) log probs for given observations and actions.""" 30 | raise NotImplementedError 31 | 32 | @contextmanager 33 | def set_deterministic(self, deterministic=True): 34 | """Context manager for changing the determinism of the policy. 35 | Args: 36 | set_deterministic (`bool`): Value to set the self._is_deterministic 37 | to during the context. The value will be reset back to the 38 | previous value when the context exits. 39 | """ 40 | was_deterministic = self._deterministic 41 | self._deterministic = deterministic 42 | yield 43 | self._deterministic = was_deterministic 44 | 45 | def get_diagnostics(self, conditions): 46 | """Return diagnostic information of the policy. 47 | 48 | Arguments: 49 | conditions: Observations to run the diagnostics for. 50 | Returns: 51 | diagnostics: OrderedDict of diagnostic information. 
52 | """ 53 | diagnostics = OrderedDict({}) 54 | return diagnostics 55 | 56 | def __getstate__(self): 57 | state = Serializable.__getstate__(self) 58 | state['pickled_weights'] = self.get_weights() 59 | 60 | return state 61 | 62 | def __setstate__(self, state): 63 | Serializable.__setstate__(self, state) 64 | self.set_weights(state['pickled_weights']) 65 | 66 | 67 | class LatentSpacePolicy(BasePolicy): 68 | def __init__(self, *args, smoothing_coefficient=None, **kwargs): 69 | super(LatentSpacePolicy, self).__init__(*args, **kwargs) 70 | 71 | assert smoothing_coefficient is None or 0 <= smoothing_coefficient <= 1 72 | self._smoothing_alpha = smoothing_coefficient or 0 73 | self._smoothing_beta = ( 74 | np.sqrt(1.0 - np.power(self._smoothing_alpha, 2.0)) 75 | / (1.0 - self._smoothing_alpha)) 76 | self._reset_smoothing_x() 77 | self._smooth_latents = False 78 | 79 | def _reset_smoothing_x(self): 80 | self._smoothing_x = np.zeros((1, *self._output_shape)) 81 | 82 | def actions_np(self, conditions): 83 | if self._deterministic: 84 | return self.deterministic_actions_model.predict(conditions) 85 | elif self._smoothing_alpha == 0: 86 | return self.actions_model.predict(conditions) 87 | else: 88 | alpha, beta = self._smoothing_alpha, self._smoothing_beta 89 | raw_latents = self.latents_model.predict(conditions) 90 | self._smoothing_x = ( 91 | alpha * self._smoothing_x + (1.0 - alpha) * raw_latents) 92 | latents = beta * self._smoothing_x 93 | 94 | return self.actions_model_for_fixed_latents.predict( 95 | [*conditions, latents]) 96 | 97 | def reset(self): 98 | self._reset_smoothing_x() 99 | -------------------------------------------------------------------------------- /softlearning/samplers/explore_sampler.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import numpy as np 4 | 5 | from .base_sampler import BaseSampler 6 | 7 | 8 | class ExploreSampler(BaseSampler): 9 | def __init__(self, **kwargs): 10 | super(ExploreSampler, self).__init__(**kwargs) 11 | 12 | self._path_length = 0 13 | self._path_return = 0 14 | self._current_path = defaultdict(list) 15 | self._last_path_return = 0 16 | self._max_path_return = -np.inf 17 | self._n_episodes = 0 18 | self._current_observation = None 19 | self._total_samples = 0 20 | 21 | def _process_observations(self, 22 | observation, 23 | action, 24 | reward, 25 | terminal, 26 | next_observation, 27 | info): 28 | processed_observation = { 29 | 'observations': observation, 30 | 'actions': action, 31 | 'rewards': [reward], 32 | 'terminals': [terminal], 33 | 'next_observations': next_observation, 34 | 'infos': info, 35 | } 36 | 37 | return processed_observation 38 | 39 | def sample(self): 40 | if self._current_observation is None: 41 | self._current_observation = self.env.reset() 42 | self._s0 = self.env.unwrapped.state_vector() 43 | 44 | action = self.policy.actions_np([ 45 | self.env.convert_to_active_observation( 46 | self._current_observation)[None] 47 | ])[0] 48 | 49 | next_observation, reward, terminal, info = self.env.step(action) 50 | self._path_length += 1 51 | self._path_return += reward 52 | self._total_samples += 1 53 | 54 | processed_sample = self._process_observations( 55 | observation=self._current_observation, 56 | action=action, 57 | reward=reward, 58 | terminal=terminal, 59 | next_observation=next_observation, 60 | info=info, 61 | ) 62 | 63 | for key, value in processed_sample.items(): 64 | self._current_path[key].append(value) 65 | 66 | if terminal or 
self._path_length >= self._max_path_length: 67 | last_path = { 68 | field_name: np.array(values) 69 | for field_name, values in self._current_path.items() 70 | } 71 | self.pool.add_path(last_path) 72 | self._last_n_paths.appendleft(last_path) 73 | 74 | self._max_path_return = max(self._max_path_return, 75 | self._path_return) 76 | self._last_path_return = self._path_return 77 | 78 | self.policy.reset() 79 | self._current_observation = None 80 | self._path_length = 0 81 | self._path_return = 0 82 | self._current_path = defaultdict(list) 83 | 84 | self._n_episodes += 1 85 | else: 86 | self._current_observation = next_observation 87 | 88 | return next_observation, reward, terminal, info 89 | 90 | def random_batch(self, batch_size=None, **kwargs): 91 | batch_size = batch_size or self._batch_size 92 | observation_keys = getattr(self.env, 'observation_keys', None) 93 | 94 | return self.pool.random_batch( 95 | batch_size, observation_keys=observation_keys, **kwargs) 96 | 97 | def get_diagnostics(self): 98 | diagnostics = super(ExploreSampler, self).get_diagnostics() 99 | diagnostics.update({ 100 | 'max-path-return': self._max_path_return, 101 | 'last-path-return': self._last_path_return, 102 | 'episodes': self._n_episodes, 103 | 'total-samples': self._total_samples, 104 | }) 105 | 106 | return diagnostics 107 | -------------------------------------------------------------------------------- /softlearning/samplers/remote_sampler.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from collections import OrderedDict 3 | 4 | import ray 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | 9 | from .base_sampler import BaseSampler 10 | from .utils import rollout 11 | 12 | 13 | class RemoteSampler(BaseSampler): 14 | def __init__(self, **kwargs): 15 | super(RemoteSampler, self).__init__(**kwargs) 16 | 17 | self._remote_environment = None 18 | self._remote_path = None 19 | self._n_episodes = 0 20 | self._total_samples = 0 21 | self._last_path_return = 0 22 | self._max_path_return = -np.inf 23 | 24 | def _create_remote_environment(self, env, policy): 25 | env_pkl = pickle.dumps(env) 26 | policy_pkl = pickle.dumps(policy) 27 | 28 | if not ray.is_initialized(): 29 | ray.init() 30 | 31 | self._remote_environment = _RemoteEnv.remote(env_pkl, policy_pkl) 32 | 33 | # Block until the env and policy are ready 34 | initialized = ray.get(self._remote_environment.initialized.remote()) 35 | assert initialized, initialized 36 | 37 | def initialize(self, env, policy, pool): 38 | super(RemoteSampler, self).initialize(env, policy, pool) 39 | self._create_remote_environment(env, policy) 40 | 41 | def wait_for_path(self, timeout=1): 42 | if self._remote_path is None: 43 | return [True] 44 | 45 | path_ready, _ = ray.wait([self._remote_path], timeout=timeout) 46 | return path_ready 47 | 48 | def sample(self, timeout=0): 49 | if self._remote_path is None: 50 | policy_params = self.policy.get_weights() 51 | self._remote_path = self._remote_environment.rollout.remote( 52 | policy_params, self._max_path_length) 53 | 54 | path_ready = self.wait_for_path(timeout=timeout) 55 | 56 | if len(path_ready) or not self.batch_ready(): 57 | path = ray.get(self._remote_path) 58 | self._last_n_paths.appendleft(path) 59 | 60 | self.pool.add_path(path) 61 | 62 | self._remote_path = None 63 | self._total_samples += len(path['observations']) 64 | self._last_path_return = np.sum(path['rewards']) 65 | self._max_path_return = max(self._max_path_return, 66 | self._last_path_return) 67 |
self._n_episodes += 1 68 | 69 | def get_diagnostics(self): 70 | diagnostics = OrderedDict({ 71 | 'max-path-return': self._max_path_return, 72 | 'last-path-return': self._last_path_return, 73 | 'pool-size': self.pool.size, 74 | 'episodes': self._n_episodes, 75 | 'total-samples': self._total_samples, 76 | }) 77 | 78 | return diagnostics 79 | 80 | def __getstate__(self): 81 | super_state = super(RemoteSampler, self).__getstate__() 82 | state = { 83 | key: value for key, value in super_state.items() 84 | if key not in ('_remote_environment', '_remote_path') 85 | } 86 | 87 | return state 88 | 89 | def __setstate__(self, state): 90 | super(RemoteSampler, self).__setstate__(state) 91 | self._create_remote_environment(self.env, self.policy) 92 | self._remote_path = None 93 | 94 | 95 | @ray.remote 96 | class _RemoteEnv(object): 97 | def __init__(self, env_pkl, policy_pkl): 98 | self._session = tf.keras.backend.get_session() 99 | self._session.run(tf.global_variables_initializer()) 100 | 101 | self._env = pickle.loads(env_pkl) 102 | self._policy = pickle.loads(policy_pkl) 103 | 104 | if hasattr(self._env, 'initialize'): 105 | self._env.initialize() 106 | 107 | self._initialized = True 108 | 109 | def initialized(self): 110 | return self._initialized 111 | 112 | def rollout(self, policy_weights, path_length): 113 | self._policy.set_weights(policy_weights) 114 | path = rollout(self._env, self._policy, path_length) 115 | 116 | return path 117 | -------------------------------------------------------------------------------- /maple/models/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | def get_required_argument(dotmap, key, message, default=None): 9 | val = dotmap.get(key, default) 10 | if val is default: 11 | raise ValueError(message) 12 | return val 13 | 14 | def spectral_norm(w, iteration=1): 15 | w_shape = w.shape.as_list() 16 | w = tf.reshape(w, [-1, w_shape[-1]]) 17 | 18 | u = tf.get_variable("u", [1, w_shape[-1]], initializer=tf.random_normal_initializer(), trainable=False) 19 | 20 | u_hat = u 21 | v_hat = None 22 | for i in range(iteration): 23 | """ 24 | power iteration 25 | Usually iteration = 1 will be enough 26 | """ 27 | v_ = tf.matmul(u_hat, tf.transpose(w)) 28 | v_hat = tf.nn.l2_normalize(v_) 29 | 30 | u_ = tf.matmul(v_hat, w) 31 | u_hat = tf.nn.l2_normalize(u_) 32 | 33 | u_hat = tf.stop_gradient(u_hat) 34 | v_hat = tf.stop_gradient(v_hat) 35 | 36 | sigma = tf.matmul(tf.matmul(v_hat, w), tf.transpose(u_hat)) 37 | 38 | with tf.control_dependencies([u.assign(u_hat)]): 39 | w_norm = w / sigma 40 | w_norm = tf.reshape(w_norm, w_shape) 41 | 42 | 43 | return w_norm, u 44 | 45 | class TensorStandardScaler: 46 | """Helper class for automatically normalizing inputs into the network. 47 | """ 48 | def __init__(self, x_dim): 49 | """Initializes a scaler. 50 | 51 | Arguments: 52 | x_dim (int): The dimensionality of the inputs into the scaler. 53 | 54 | Returns: None. 
55 | """ 56 | self.fitted = False 57 | with tf.variable_scope("Scaler"): 58 | self.mu = tf.get_variable( 59 | name="scaler_mu", shape=[1, x_dim], initializer=tf.constant_initializer(0.0), 60 | trainable=False 61 | ) 62 | self.sigma = tf.get_variable( 63 | name="scaler_std", shape=[1, x_dim], initializer=tf.constant_initializer(1.0), 64 | trainable=False 65 | ) 66 | 67 | self.cached_mu, self.cached_sigma = np.zeros([0, x_dim]), np.ones([1, x_dim]) 68 | 69 | def fit(self, data): 70 | """Runs two ops, one for assigning the mean of the data to the internal mean, and 71 | another for assigning the standard deviation of the data to the internal standard deviation. 72 | This function must be called within a 'with .as_default()' block. 73 | 74 | Arguments: 75 | data (np.ndarray): A numpy array containing the input 76 | 77 | Returns: None. 78 | """ 79 | mu = np.mean(data, axis=0, keepdims=True) 80 | sigma = np.std(data, axis=0, keepdims=True) 81 | sigma[sigma < 1e-12] = 1.0 82 | 83 | self.mu.load(mu) 84 | self.sigma.load(sigma) 85 | self.fitted = True 86 | self.cache() 87 | 88 | def transform(self, data): 89 | """Transforms the input matrix data using the parameters of this scaler. 90 | 91 | Arguments: 92 | data (np.array): A numpy array containing the points to be transformed. 93 | 94 | Returns: (np.array) The transformed dataset. 95 | """ 96 | return (data - self.mu) / self.sigma 97 | 98 | def inverse_transform(self, data): 99 | """Undoes the transformation performed by this scaler. 100 | 101 | Arguments: 102 | data (np.array): A numpy array containing the points to be transformed. 103 | 104 | Returns: (np.array) The transformed dataset. 105 | """ 106 | return self.sigma * data + self.mu 107 | 108 | def get_vars(self): 109 | """Returns a list of variables managed by this object. 110 | 111 | Returns: (list) The list of variables. 112 | """ 113 | return [self.mu, self.sigma] 114 | 115 | def cache(self): 116 | """Caches current values of this scaler. 117 | 118 | Returns: None. 119 | """ 120 | self.cached_mu = self.mu.eval() 121 | self.cached_sigma = self.sigma.eval() 122 | 123 | def load_cache(self): 124 | """Loads values from the cache 125 | Returns: None. 
126 | """ 127 | self.mu.load(self.cached_mu) 128 | self.sigma.load(self.cached_sigma) 129 | 130 | -------------------------------------------------------------------------------- /softlearning/misc/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import datetime 3 | import os 4 | import random 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | 9 | 10 | PROJECT_PATH = os.path.dirname( 11 | os.path.realpath(os.path.join(__file__, '..', '..'))) 12 | 13 | 14 | DEFAULT_SNAPSHOT_MODE = 'none' 15 | DEFAULT_SNAPSHOT_GAP = 1000 16 | 17 | 18 | def initialize_tf_variables(session, only_uninitialized=True): 19 | variables = tf.global_variables() + tf.local_variables() 20 | 21 | def is_initialized(variable): 22 | try: 23 | session.run(variable) 24 | return True 25 | except tf.errors.FailedPreconditionError: 26 | return False 27 | 28 | return False 29 | 30 | if only_uninitialized: 31 | variables = [ 32 | variable for variable in variables 33 | if not is_initialized(variable) 34 | ] 35 | 36 | session.run(tf.variables_initializer(variables)) 37 | 38 | 39 | def set_seed(seed): 40 | seed %= 4294967294 41 | random.seed(seed) 42 | np.random.seed(seed) 43 | tf.set_random_seed(seed) 44 | print("Using seed {}".format(seed)) 45 | 46 | 47 | def datetimestamp(divider='-', datetime_divider='T'): 48 | now = datetime.datetime.now() 49 | return now.strftime( 50 | '%Y{d}%m{d}%dT%H{d}%M{d}%S' 51 | ''.format(d=divider, dtd=datetime_divider)) 52 | 53 | 54 | def datestamp(divider='-'): 55 | return datetime.date.today().isoformat().replace('-', divider) 56 | 57 | 58 | def timestamp(divider='-'): 59 | now = datetime.datetime.now() 60 | time_now = datetime.datetime.time(now) 61 | return time_now.strftime( 62 | '%H{d}%M{d}%S'.format(d=divider)) 63 | 64 | 65 | def concat_obs_z(obs, z, num_skills): 66 | """Concatenates the observation to a one-hot encoding of Z.""" 67 | assert np.isscalar(z) 68 | z_one_hot = np.zeros(num_skills) 69 | z_one_hot[z] = 1 70 | return np.hstack([obs, z_one_hot]) 71 | 72 | 73 | def split_aug_obs(aug_obs, num_skills): 74 | """Splits an augmented observation into the observation and Z.""" 75 | (obs, z_one_hot) = (aug_obs[:-num_skills], aug_obs[-num_skills:]) 76 | z = np.where(z_one_hot == 1)[0][0] 77 | return (obs, z) 78 | 79 | 80 | def _make_dir(filename): 81 | folder = os.path.dirname(filename) 82 | if not os.path.exists(folder): 83 | os.makedirs(folder) 84 | 85 | 86 | def save_video(video_frames, filename): 87 | import cv2 88 | _make_dir(filename) 89 | 90 | video_frames = np.flip(video_frames, axis=-1) 91 | 92 | # Define the codec and create VideoWriter object 93 | fourcc = cv2.VideoWriter_fourcc(*'MJPG') 94 | fps = 30.0 95 | (height, width, _) = video_frames[0].shape 96 | writer = cv2.VideoWriter(filename, fourcc, fps, (width, height)) 97 | for video_frame in video_frames: 98 | writer.write(video_frame) 99 | writer.release() 100 | 101 | 102 | def deep_update(d, *us): 103 | d = d.copy() 104 | 105 | for u in us: 106 | u = u.copy() 107 | for k, v in u.items(): 108 | d[k] = ( 109 | deep_update(d.get(k, {}), v) 110 | if isinstance(v, collections.Mapping) 111 | else v) 112 | 113 | return d 114 | 115 | 116 | def get_git_rev(): 117 | try: 118 | import git 119 | except ImportError: 120 | print( 121 | "Warning: gitpython not installed." 122 | " Unable to log git rev." 
123 | " Run `pip install gitpython` if you want git revs to be logged.") 124 | return None 125 | 126 | try: 127 | repo = git.Repo(os.getcwd()) 128 | git_rev = repo.active_branch.commit.name_rev 129 | except TypeError: 130 | git_rev = repo.head.object.name_rev 131 | 132 | return git_rev 133 | 134 | 135 | def flatten(unflattened, parent_key='', separator='.'): 136 | items = [] 137 | for k, v in unflattened.items(): 138 | if separator in k: 139 | raise ValueError( 140 | "Found separator ({}) from key ({})".format(separator, k)) 141 | new_key = parent_key + separator + k if parent_key else k 142 | if isinstance(v, collections.MutableMapping) and v: 143 | items.extend(flatten(v, new_key, separator=separator).items()) 144 | else: 145 | items.append((new_key, v)) 146 | 147 | return dict(items) 148 | 149 | 150 | def unflatten(flattened, separator='.'): 151 | result = {} 152 | for key, value in flattened.items(): 153 | parts = key.split(separator) 154 | d = result 155 | for part in parts[:-1]: 156 | if part not in d: 157 | d[part] = {} 158 | d = d[part] 159 | d[parts[-1]] = value 160 | 161 | return result 162 | -------------------------------------------------------------------------------- /softlearning/samplers/simple_sampler.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import numpy as np 4 | 5 | from .base_sampler import BaseSampler 6 | 7 | 8 | class SimpleSampler(BaseSampler): 9 | def __init__(self, **kwargs): 10 | super(SimpleSampler, self).__init__(**kwargs) 11 | 12 | self._path_length = 0 13 | self._path_return = 0 14 | self._current_path = defaultdict(list) 15 | self._last_path_return = 0 16 | self._max_path_return = -np.inf 17 | self._n_episodes = 0 18 | self._current_observation = None 19 | self._total_samples = 0 20 | 21 | def initialize(self, env, policy, pool): 22 | super(SimpleSampler, self).initialize(env, policy, pool) 23 | self.get_action = self.policy[0] 24 | self.make_init_hidden = self.policy[1] 25 | self.hidden = self.make_init_hidden() 26 | 27 | def _process_observations(self, 28 | observation, 29 | action, 30 | last_action, 31 | reward, 32 | terminal, 33 | next_observation, 34 | info): 35 | processed_observation = { 36 | 'observations': observation, 37 | 'actions': action, 38 | 'last_actions': last_action, 39 | 'rewards': [reward], 40 | 'terminals': [terminal], 41 | 'next_observations': next_observation, 42 | 'valid': [1], 43 | 'infos': info, 44 | } 45 | 46 | return processed_observation 47 | 48 | def sample(self): 49 | if self._current_observation is None: 50 | self._current_observation = self.env.reset() 51 | #### EDIT 52 | if hasattr(self.env.unwrapped, "state_vector"): 53 | self._reset_state_vector = self.env.unwrapped.state_vector() 54 | #### 55 | lst_action = self.hidden[1] 56 | action, self.hidden = self.get_action(self.env.convert_to_active_observation( 57 | self._current_observation)[None], self.hidden) 58 | action = action[0] 59 | # print(action.shape) 60 | next_observation, reward, terminal, info = self.env.step(action) 61 | self._path_length += 1 62 | self._path_return += reward 63 | self._total_samples += 1 64 | # print(lst_action.shape, lst_action.squeeze(1).shape, action.shape) 65 | processed_sample = self._process_observations( 66 | observation=self._current_observation, 67 | action=action, 68 | reward=reward, 69 | terminal=terminal, 70 | next_observation=next_observation, 71 | last_action=lst_action.squeeze(1).squeeze(0), 72 | info=info, 73 | ) 74 | 75 | for key, value in 
processed_sample.items(): 76 | self._current_path[key].append(value) 77 | 78 | if terminal or self._path_length >= self._max_path_length: 79 | last_path = { 80 | field_name: np.array(values) 81 | for field_name, values in self._current_path.items() 82 | } 83 | ######## this function is siginificant for replaybuffer 84 | self.pool.add_path(last_path) 85 | self._last_n_paths.appendleft(last_path) 86 | 87 | self._max_path_return = max(self._max_path_return, 88 | self._path_return) 89 | self._last_path_return = self._path_return 90 | 91 | self.reset_policy() 92 | self._current_observation = None 93 | self._path_length = 0 94 | self._path_return = 0 95 | self._current_path = defaultdict(list) 96 | 97 | self._n_episodes += 1 98 | 99 | else: 100 | self._current_observation = next_observation 101 | 102 | return next_observation, reward, terminal, info 103 | 104 | def reset_policy(self): 105 | self.hidden = self.make_init_hidden(1) 106 | 107 | def random_batch(self, batch_size=None, **kwargs): 108 | batch_size = batch_size or self._batch_size 109 | observation_keys = getattr(self.env, 'observation_keys', None) 110 | 111 | return self.pool.random_batch( 112 | batch_size, observation_keys=observation_keys, **kwargs) 113 | 114 | def get_diagnostics(self): 115 | diagnostics = super(SimpleSampler, self).get_diagnostics() 116 | diagnostics.update({ 117 | 'max-path-return': self._max_path_return, 118 | 'last-path-return': self._last_path_return, 119 | 'episodes': self._n_episodes, 120 | 'total-samples': self._total_samples, 121 | }) 122 | 123 | return diagnostics 124 | -------------------------------------------------------------------------------- /maple/utils/visualization.py: -------------------------------------------------------------------------------- 1 | import io 2 | import math 3 | import numpy as np 4 | import cv2 5 | import matplotlib 6 | matplotlib.use('Agg') 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | def plot_trajectories(writer, label, epoch, env_traj, model_traj, means, stds): 11 | state_dim = env_traj[0].size 12 | model_states = [[obs[s] for obs in model_traj] for s in range(state_dim)] 13 | env_states = [[obs[s] for obs in env_traj ] for s in range(state_dim)] 14 | 15 | means = [np.array([mean[s] for mean in means]) for s in range(state_dim)] 16 | stds = [np.array([std[s] for std in stds]) for s in range(state_dim)] 17 | 18 | cols = 1 19 | rows = math.ceil(state_dim / cols) 20 | 21 | plt.clf() 22 | fig, axes = plt.subplots(rows, cols, figsize = (9*cols, 3*rows)) 23 | axes = axes.ravel() 24 | 25 | for i in range(state_dim): 26 | ax = axes[i] 27 | X = range(len(model_states[i])) 28 | 29 | ax.fill_between(X, means[i]+stds[i], means[i]-stds[i], color='r', alpha=0.5) 30 | ax.plot(env_states[i], color='k') 31 | ax.plot(model_states[i], color='b') 32 | ax.plot(means[i], color='r') 33 | 34 | if i == 0: 35 | ax.set_title('reward') 36 | elif i == 1: 37 | ax.set_title('terminal') 38 | else: 39 | ax.set_title('state dim {}'.format(i-2)) 40 | plt.tight_layout() 41 | 42 | buf = io.BytesIO() 43 | plt.savefig(buf, format='png', layout = 'tight') 44 | buf.seek(0) 45 | 46 | img = cv2.imdecode(np.fromstring(buf.getvalue(), dtype=np.uint8), -1) 47 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 48 | img = img.transpose(2,0,1) / 255. 
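# The buffer above holds the matplotlib figure encoded as PNG bytes; cv2.imdecode decodes it into an
# HWC image in OpenCV's BGR channel order, which is then converted to RGB, transposed to channels-first
# (CHW), and scaled to [0, 1] before being logged with writer.add_image below.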
49 | 50 | writer.add_image(label, img, epoch) 51 | 52 | plt.close() 53 | 54 | 55 | ''' 56 | writer video : [ batch x channels x timesteps x height x width ] 57 | ''' 58 | def record_trajectories(writer, label, epoch, env_images, model_images=None): 59 | traj_length = len(env_images) 60 | if model_images is not None: 61 | assert len(env_images) == len(model_images) 62 | images = [np.concatenate((env_img, model_img)) for (env_img, model_img) in zip(env_images, model_images)] 63 | else: 64 | images = env_images 65 | 66 | ## [ traj_length, 2 * H, W, C ] 67 | images = np.array(images) 68 | images = torch.Tensor(images)  # FIXME: requires `import torch`, which is missing from the imports at the top of this file 69 | 70 | ## [ traj_length, C, 2 * H, W ] 71 | images = images.permute(0,3,1,2) 72 | ## [ B, traj_length, C, 2 * H, W ] 73 | images = images.unsqueeze(0) 74 | 75 | images = images / 255. 76 | images = images[:,:,0].unsqueeze(2) 77 | 78 | print('[ Visualization ] Saving to {}'.format(label)) 79 | fps = min(max(traj_length / 5, 2), 30) 80 | writer.add_video('video_' + label, images, epoch, fps = fps) 81 | 82 | 83 | def visualize_policy(real_env, fake_env, policy, writer, timestep, max_steps=100, focus=None, label='model_vis', img_dim=128): 84 | init_obs = real_env.reset() 85 | obs = init_obs.copy() 86 | 87 | observations_r = [obs] 88 | observations_f = [obs] 89 | rewards_r = [0] 90 | rewards_f = [0] 91 | terminals_r = [False] 92 | terminals_f = [False] 93 | means_f = [np.concatenate((np.zeros(2), obs))] 94 | stds_f = [np.concatenate((np.zeros(2), obs*0))] 95 | actions = [] 96 | 97 | i = 0 98 | term_r, term_f = False, False 99 | while not (term_r and term_f) and i <= max_steps: 100 | 101 | act = policy.actions_np(obs[None])[0] 102 | if not term_r: 103 | next_obs_r, rew_r, term_r, info_r = real_env.step(act) 104 | observations_r.append(next_obs_r) 105 | rewards_r.append(rew_r) 106 | terminals_r.append(term_r) 107 | 108 | if not term_f: 109 | next_obs_f, rew_f, term_f, info_f = fake_env.step(obs, act) 110 | observations_f.append(next_obs_f) 111 | rewards_f.append(rew_f) 112 | terminals_f.append(term_f) 113 | means_f.append(info_f['mean']) 114 | stds_f.append(info_f['std']) 115 | 116 | actions.append(act) 117 | 118 | if not term_f: 119 | obs = next_obs_f 120 | else: 121 | obs = next_obs_r 122 | 123 | i += 1 124 | 125 | terminals_r = np.array([terminals_r]).astype(np.uint8).T 126 | terminals_f = np.array([terminals_f]).astype(np.uint8).T 127 | rewards_r = np.array([rewards_r]).T 128 | rewards_f = np.array([rewards_f]).T 129 | 130 | rewards_observations_r = np.concatenate((rewards_r, terminals_r, np.array(observations_r)), -1) 131 | rewards_observations_f = np.concatenate((rewards_f, terminals_f, np.array(observations_f)), -1) 132 | plot_trajectories(writer, label, timestep, rewards_observations_r, rewards_observations_f, means_f, stds_f) 133 | # record_trajectories(writer, label, epoch, images_r)  # FIXME: disabled; `epoch` and `images_r` are not defined in this function (no image frames are collected above) 134 | 135 | -------------------------------------------------------------------------------- /maple/utils/logging.py: -------------------------------------------------------------------------------- 1 | import time 2 | import math 3 | 4 | class Progress: 5 | 6 | def __init__(self, total, name = 'Progress', ncol=3, max_length=20, indent=0, line_width=100, speed_update_freq=100): 7 | self.total = total 8 | self.name = name 9 | self.ncol = ncol 10 | self.max_length = max_length 11 | self.indent = indent 12 | self.line_width = line_width 13 | self._speed_update_freq = speed_update_freq 14 | self._speed = None 15 | 16 | self._step = 0 17 | self._prev_line = '\033[F' 18 | self._clear_line = ' ' *
self.line_width 19 | 20 | self._pbar_size = self.ncol * self.max_length 21 | self._complete_pbar = '#' * self._pbar_size 22 | self._incomplete_pbar = ' ' * self._pbar_size 23 | 24 | self.lines = [''] 25 | self.fraction = '{} / {}'.format(0, self.total) 26 | 27 | self.resume() 28 | 29 | 30 | def update(self, n=1): 31 | self._step += n 32 | if self._step % self._speed_update_freq == 0: 33 | self._time0 = time.time() 34 | self._step0 = self._step 35 | 36 | def resume(self): 37 | self._skip_lines = 1 38 | print('\n', end='') 39 | self._time0 = time.time() 40 | self._step0 = self._step 41 | 42 | def pause(self): 43 | self._clear() 44 | self._skip_lines = 1 45 | 46 | def set_description(self, params=[]): 47 | 48 | ############ 49 | # Position # 50 | ############ 51 | self._clear() 52 | 53 | ########### 54 | # Percent # 55 | ########### 56 | percent, fraction = self._format_percent(self._step, self.total) 57 | self.fraction = fraction 58 | 59 | ######### 60 | # Speed # 61 | ######### 62 | speed = self._format_speed(self._step) 63 | 64 | ########## 65 | # Params # 66 | ########## 67 | num_params = len(params) 68 | nrow = math.ceil(num_params / self.ncol) 69 | params_split = self._chunk(params, self.ncol) 70 | params_string, lines = self._format(params_split) 71 | self.lines = lines 72 | 73 | 74 | description = '{} | {}{}'.format(percent, speed, params_string) 75 | print(description) 76 | self._skip_lines = nrow + 1 77 | 78 | def append_description(self, descr): 79 | self.lines.append(descr) 80 | 81 | def _clear(self): 82 | position = self._prev_line * self._skip_lines 83 | empty = '\n'.join([self._clear_line for _ in range(self._skip_lines)]) 84 | print(position, end='') 85 | print(empty) 86 | print(position, end='') 87 | 88 | def _format_percent(self, n, total): 89 | if total: 90 | percent = n / float(total) 91 | 92 | complete_entries = int(percent * self._pbar_size) 93 | incomplete_entries = self._pbar_size - complete_entries 94 | 95 | pbar = self._complete_pbar[:complete_entries] + self._incomplete_pbar[:incomplete_entries] 96 | fraction = '{} / {}'.format(n, total) 97 | string = '{} [{}] {:3d}%'.format(fraction, pbar, int(percent*100)) 98 | else: 99 | fraction = '{}'.format(n) 100 | string = '{} iterations'.format(n) 101 | return string, fraction 102 | 103 | def _format_speed(self, n): 104 | num_steps = n - self._step0 105 | t = time.time() - self._time0 106 | speed = num_steps / t 107 | string = '{:.1f} Hz'.format(speed) 108 | if num_steps > 0: 109 | self._speed = string 110 | return string 111 | 112 | def _chunk(self, l, n): 113 | return [l[i:i+n] for i in range(0, len(l), n)] 114 | 115 | def _format(self, chunks): 116 | lines = [self._format_chunk(chunk) for chunk in chunks] 117 | lines.insert(0,'') 118 | padding = '\n' + ' '*self.indent 119 | string = padding.join(lines) 120 | return string, lines 121 | 122 | def _format_chunk(self, chunk): 123 | line = ' | '.join([self._format_param(param) for param in chunk]) 124 | return line 125 | 126 | def _format_param(self, param): 127 | k, v = param 128 | return '{} : {}'.format(k, v)[:self.max_length] 129 | 130 | def stamp(self): 131 | if self.lines != ['']: 132 | params = ' | '.join(self.lines) 133 | string = '[ {} ] {}{} | {}'.format(self.name, self.fraction, params, self._speed) 134 | self._clear() 135 | print(string, end='\n') 136 | self._skip_lines = 1 137 | else: 138 | self._clear() 139 | self._skip_lines = 0 140 | 141 | def close(self): 142 | self.pause() 143 | 144 | class Silent: 145 | 146 | def __init__(self, *args, **kwargs): 147 | pass 
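# Silent mirrors the Progress interface while doing nothing: any attribute access falls through to
# __getattr__ below, which returns a no-op callable, so calls like silent.update() or silent.stamp()
# are silently ignored.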
148 | 149 | def __getattr__(self, attr): 150 | return lambda *args: None 151 | 152 | 153 | if __name__ == '__main__': 154 | silent = Silent() 155 | silent.update() 156 | silent.stamp() 157 | 158 | num_steps = 1000 159 | progress = Progress(num_steps) 160 | for i in range(num_steps): 161 | progress.update() 162 | params = [ 163 | ['A', '{:06d}'.format(i)], 164 | ['B', '{:06d}'.format(i)], 165 | ['C', '{:06d}'.format(i)], 166 | ['D', '{:06d}'.format(i)], 167 | ['E', '{:06d}'.format(i)], 168 | ['F', '{:06d}'.format(i)], 169 | ['G', '{:06d}'.format(i)], 170 | ['H', '{:06d}'.format(i)], 171 | ] 172 | progress.set_description(params) 173 | time.sleep(0.01) 174 | progress.close() 175 | -------------------------------------------------------------------------------- /softlearning/environments/gym/mujoco/image_pusher_2d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from softlearning.environments.helpers import random_point_in_circle 4 | from .pusher_2d import Pusher2dEnv 5 | 6 | 7 | class ImagePusher2dEnv(Pusher2dEnv): 8 | def __init__(self, image_shape, *args, **kwargs): 9 | self._Serializable__initialize(locals()) 10 | self.image_shape = image_shape 11 | Pusher2dEnv.__init__(self, *args, **kwargs) 12 | 13 | def _get_obs(self): 14 | width, height = self.image_shape[:2] 15 | image = self.render(mode='rgb_array', width=width, height=height) 16 | image = ((2.0 / 255.0) * image - 1.0) 17 | 18 | return np.concatenate([ 19 | image.reshape(-1), 20 | self.sim.data.qpos.flat[self.JOINT_INDS], 21 | self.sim.data.qvel.flat[self.JOINT_INDS], 22 | ]).reshape(-1) 23 | 24 | def step(self, action): 25 | """Step, computing reward from 'true' observations and not images.""" 26 | 27 | reward_observations = super(ImagePusher2dEnv, self)._get_obs() 28 | reward, info = self.compute_reward(reward_observations, action) 29 | 30 | self.do_simulation(action, self.frame_skip) 31 | 32 | observation = self._get_obs() 33 | done = False 34 | 35 | return observation, reward, done, info 36 | 37 | def viewer_setup(self): 38 | self.viewer.cam.trackbodyid = 0 39 | self.viewer.cam.lookat[:3] = [0, 0, 0] 40 | self.viewer.cam.distance = 3.5 41 | self.viewer.cam.elevation = -90 42 | self.viewer.cam.azimuth = 0 43 | self.viewer.cam.trackbodyid = -1 44 | 45 | 46 | class ImageForkReacher2dEnv(ImagePusher2dEnv): 47 | def __init__(self, 48 | arm_goal_distance_cost_coeff, 49 | arm_object_distance_cost_coeff, 50 | *args, 51 | **kwargs): 52 | self._Serializable__initialize(locals()) 53 | 54 | self._arm_goal_distance_cost_coeff = arm_goal_distance_cost_coeff 55 | self._arm_object_distance_cost_coeff = arm_object_distance_cost_coeff 56 | 57 | super(ImageForkReacher2dEnv, self).__init__(*args, **kwargs) 58 | 59 | def compute_reward(self, observations, actions): 60 | is_batch = True 61 | if observations.ndim == 1: 62 | observations = observations[None] 63 | actions = actions[None] 64 | is_batch = False 65 | else: 66 | raise NotImplementedError('Might be broken.') 67 | 68 | arm_pos = observations[:, -6:-4] 69 | goal_pos = self.get_body_com('goal')[:2][None] 70 | object_pos = observations[:, -3:-1] 71 | 72 | arm_goal_dists = np.linalg.norm(arm_pos - goal_pos, axis=1) 73 | arm_object_dists = np.linalg.norm(arm_pos - object_pos, axis=1) 74 | ctrl_costs = np.sum(actions**2, axis=1) 75 | 76 | costs = ( 77 | + self._arm_goal_distance_cost_coeff * arm_goal_dists 78 | + self._arm_object_distance_cost_coeff * arm_object_dists 79 | + self._ctrl_cost_coeff * ctrl_costs) 80 | 81 | rewards = 
-costs 82 | 83 | if not is_batch: 84 | rewards = rewards.squeeze() 85 | arm_goal_dists = arm_goal_dists.squeeze() 86 | arm_object_dists = arm_object_dists.squeeze() 87 | 88 | return rewards, { 89 | 'arm_goal_distance': arm_goal_dists, 90 | 'arm_object_distance': arm_object_dists, 91 | } 92 | 93 | def reset_model(self): 94 | qpos = np.random.uniform( 95 | low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos.squeeze() 96 | 97 | # qpos[self.JOINT_INDS[0]] = np.random.uniform(-np.pi, np.pi) 98 | # qpos[self.JOINT_INDS[1]] = np.random.uniform( 99 | # -np.pi/2, np.pi/2) + np.pi/4 100 | # qpos[self.JOINT_INDS[2]] = np.random.uniform( 101 | # -np.pi/2, np.pi/2) + np.pi/2 102 | 103 | target_position = np.array(random_point_in_circle( 104 | angle_range=(0, 2*np.pi), radius=(0.6, 1.2))) 105 | target_position[1] += 1.0 106 | 107 | qpos[self.TARGET_INDS] = target_position 108 | # qpos[self.TARGET_INDS] = [1.0, 2.0] 109 | # qpos[self.TARGET_INDS] = self.init_qpos.squeeze()[self.TARGET_INDS] 110 | 111 | puck_position = np.random.uniform([-1.0], [1.0], size=[2]) 112 | puck_position = ( 113 | np.sign(puck_position) 114 | * np.maximum(np.abs(puck_position), 1/2)) 115 | puck_position[np.where(puck_position == 0)] = 1.0 116 | # puck_position[1] += 1.0 117 | # puck_position = np.random.uniform( 118 | # low=[0.3, -1.0], high=[1.0, -0.4]), 119 | 120 | qpos[self.PUCK_INDS] = puck_position 121 | 122 | qvel = self.init_qvel.copy().squeeze() 123 | qvel[self.PUCK_INDS] = 0 124 | qvel[self.TARGET_INDS] = 0 125 | 126 | # TODO: remnants from rllab -> gym conversion 127 | # qacc = np.zeros(self.sim.data.qacc.shape[0]) 128 | # ctrl = np.zeros(self.sim.data.ctrl.shape[0]) 129 | # full_state = np.concatenate((qpos, qvel, qacc, ctrl)) 130 | 131 | # super(Pusher2dEnv, self).reset(full_state) 132 | 133 | self.set_state(qpos, qvel) 134 | 135 | return self._get_obs() 136 | 137 | 138 | class BlindForkReacher2dEnv(ImageForkReacher2dEnv): 139 | def _get_obs(self): 140 | return np.concatenate([ 141 | self.sim.data.qpos.flat[self.JOINT_INDS], 142 | self.sim.data.qvel.flat[self.JOINT_INDS], 143 | ]).reshape(-1) 144 | -------------------------------------------------------------------------------- /softlearning/environments/gym/__init__.py: -------------------------------------------------------------------------------- 1 | """Custom Gym environments. 2 | 3 | Every class inside this module should extend a gym.Env class. The file 4 | structure should be similar to gym.envs file structure, e.g. if you're 5 | implementing a mujoco env, you would implement it under gym.mujoco submodule. 
6 | """ 7 | 8 | import gym 9 | import numpy as np 10 | 11 | 12 | CUSTOM_GYM_ENVIRONMENTS_PATH = __package__ 13 | MUJOCO_ENVIRONMENTS_PATH = '{}.mujoco'.format(CUSTOM_GYM_ENVIRONMENTS_PATH) 14 | 15 | MUJOCO_ENVIRONMENT_SPECS = ( 16 | { 17 | 'id': 'Swimmer-Parameterizable-v3', 18 | 'entry_point': ('gym.envs.mujoco.swimmer_v3:SwimmerEnv'), 19 | }, 20 | { 21 | 'id': 'Hopper-Parameterizable-v3', 22 | 'entry_point': ('gym.envs.mujoco.hopper_v3:HopperEnv'), 23 | }, 24 | { 25 | 'id': 'Walker2d-Parameterizable-v3', 26 | 'entry_point': ('gym.envs.mujoco.walker2d_v3:Walker2dEnv'), 27 | }, 28 | { 29 | 'id': 'HalfCheetah-Parameterizable-v3', 30 | 'entry_point': ('gym.envs.mujoco.half_cheetah_v3:HalfCheetahEnv'), 31 | }, 32 | { 33 | 'id': 'Ant-Parameterizable-v3', 34 | 'entry_point': ('gym.envs.mujoco.ant_v3:AntEnv'), 35 | }, 36 | { 37 | 'id': 'AntAngle-Parameterizable-v3', 38 | 'entry_point': ('gym.envs.mujoco.ant_v3_angle:AntEnv'), 39 | }, 40 | { 41 | 'id': 'Humanoid-Parameterizable-v3', 42 | 'entry_point': ('gym.envs.mujoco.humanoid_v3:HumanoidEnv'), 43 | }, 44 | { 45 | 'id': 'Pusher2d-Default-v0', 46 | 'entry_point': ('{}.pusher_2d:Pusher2dEnv'.format(MUJOCO_ENVIRONMENTS_PATH)), 47 | }, 48 | { 49 | 'id': 'Pusher2d-DefaultReach-v0', 50 | 'entry_point': ('{}.pusher_2d:ForkReacherEnv'.format(MUJOCO_ENVIRONMENTS_PATH)), 51 | }, 52 | { 53 | 'id': 'Pusher2d-ImageDefault-v0', 54 | 'entry_point': ('{}.image_pusher_2d:ImagePusher2dEnv'.format(MUJOCO_ENVIRONMENTS_PATH)), 55 | }, 56 | { 57 | 'id': 'Pusher2d-ImageReach-v0', 58 | 'entry_point': ('{}.image_pusher_2d:ImageForkReacher2dEnv'.format(MUJOCO_ENVIRONMENTS_PATH)), 59 | }, 60 | { 61 | 'id': 'Pusher2d-BlindReach-v0', 62 | 'entry_point': ('{}.image_pusher_2d:BlindForkReacher2dEnv'.format(MUJOCO_ENVIRONMENTS_PATH)), 63 | }, 64 | ) 65 | 66 | GENERAL_ENVIRONMENT_SPECS = ( 67 | { 68 | 'id': 'MultiGoal-Default-v0', 69 | 'entry_point': ('{}.multi_goal:MultiGoalEnv'.format(CUSTOM_GYM_ENVIRONMENTS_PATH)) 70 | }, 71 | ) 72 | 73 | MULTIWORLD_ENVIRONMENT_SPECS = ( 74 | { 75 | 'id': 'Point2DEnv-Default-v0', 76 | 'entry_point': 'multiworld.envs.pygame.point2d:Point2DEnv' 77 | }, 78 | { 79 | 'id': 'Point2DEnv-Wall-v0', 80 | 'entry_point': 'multiworld.envs.pygame.point2d:Point2DWallEnv' 81 | }, 82 | { 83 | 'id': 'Point2DEnv-Offline-v0', 84 | 'entry_point': 'multiworld.envs.pygame.point2d:Point2DEnv', 85 | 'kwargs': { 86 | 'initial_position' : np.array([-4, 0]), 87 | 'fixed_goal' : np.array([4, 0]), 88 | 'randomize_position_on_reset' : False 89 | } 90 | }, 91 | { 92 | 'id': 'Point2DWallEnv-Offline-v0', 93 | 'entry_point': 'multiworld.envs.pygame.point2d:Point2DWallEnv', 94 | 'kwargs': { 95 | 'wall_shape': "big-u", 96 | 'initial_position' : np.array([-4, 0]), 97 | 'fixed_goal' : np.array([4, 0]), 98 | 'randomize_position_on_reset' : False 99 | } 100 | }, 101 | ) 102 | 103 | MUJOCO_ENVIRONMENTS = tuple( 104 | environment_spec['id'] 105 | for environment_spec in MUJOCO_ENVIRONMENT_SPECS) 106 | 107 | 108 | GENERAL_ENVIRONMENTS = tuple( 109 | environment_spec['id'] 110 | for environment_spec in GENERAL_ENVIRONMENT_SPECS) 111 | 112 | 113 | MULTIWORLD_ENVIRONMENTS = tuple( 114 | environment_spec['id'] 115 | for environment_spec in MULTIWORLD_ENVIRONMENT_SPECS) 116 | 117 | GYM_ENVIRONMENTS = ( 118 | *MUJOCO_ENVIRONMENTS, 119 | *GENERAL_ENVIRONMENTS, 120 | *MULTIWORLD_ENVIRONMENTS, 121 | ) 122 | 123 | 124 | def register_mujoco_environments(): 125 | """Register softlearning mujoco environments.""" 126 | for mujoco_environment in MUJOCO_ENVIRONMENT_SPECS: 127 | 
gym.register(**mujoco_environment) 128 | 129 | gym_ids = tuple( 130 | environment_spec['id'] 131 | for environment_spec in MUJOCO_ENVIRONMENT_SPECS) 132 | 133 | return gym_ids 134 | 135 | 136 | def register_general_environments(): 137 | """Register gym environments that don't fall under a specific category.""" 138 | for general_environment in GENERAL_ENVIRONMENT_SPECS: 139 | gym.register(**general_environment) 140 | 141 | gym_ids = tuple( 142 | environment_spec['id'] 143 | for environment_spec in GENERAL_ENVIRONMENT_SPECS) 144 | 145 | return gym_ids 146 | 147 | 148 | def register_multiworld_environments(): 149 | """Register custom environments from multiworld package.""" 150 | for multiworld_environment in MULTIWORLD_ENVIRONMENT_SPECS: 151 | gym.register(**multiworld_environment) 152 | 153 | gym_ids = tuple( 154 | environment_spec['id'] 155 | for environment_spec in MULTIWORLD_ENVIRONMENT_SPECS) 156 | 157 | return gym_ids 158 | 159 | 160 | def register_environments(): 161 | registered_mujoco_environments = register_mujoco_environments() 162 | registered_general_environments = register_general_environments() 163 | registered_multiworld_environments = register_multiworld_environments() 164 | 165 | return ( 166 | *registered_mujoco_environments, 167 | *registered_general_environments, 168 | *registered_multiworld_environments, 169 | ) 170 | -------------------------------------------------------------------------------- /maple/env/assert/halfcheetah.xml: -------------------------------------------------------------------------------- 1 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /softlearning/replay_pools/trajectory_replay_pool.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import gzip 3 | import pickle 4 | from itertools import islice 5 | 6 | import numpy as np 7 | 8 | from softlearning.utils.numpy import softmax 9 | from .replay_pool import ReplayPool 10 | 11 | 12 | def random_int_with_variable_range(mins, maxs): 13 | result = np.floor(np.random.uniform(mins, maxs)).astype(int) 14 | return result 15 | 16 | 17 | class TrajectoryReplayPool(ReplayPool): 18 | def __init__(self, 19 | observation_space, 20 | action_space, 21 | max_size): 22 | super(TrajectoryReplayPool, self).__init__() 23 | 24 | max_size = int(max_size) 25 | self._max_size = max_size 26 | 27 | self._trajectories = deque(maxlen=max_size) 28 | self._trajectory_lengths = deque(maxlen=max_size) 29 | self._num_samples = 0 30 | self._trajectories_since_save = 0 31 | 32 | @property 33 | def num_trajectories(self): 34 | return len(self._trajectories) 35 | 36 | @property 37 | def size(self): 38 | return sum(self._trajectory_lengths) 39 | 40 | @property 41 | def num_samples(self): 42 | return self._num_samples 43 | 44 | def add_paths(self, trajectories): 45 | self._trajectories += trajectories 46 | self._trajectory_lengths += [ 47 | trajectory[next(iter(trajectory.keys()))].shape[0] 48 | for trajectory in trajectories 49 | ] 50 | self._trajectories_since_save += len(trajectories) 51 | 52 | def add_path(self, trajectory): 53 | self.add_paths([trajectory]) 54 | 55 | def add_sample(self, sample): 56 | raise NotImplementedError( 57 | "{} only supports adding full paths at once.".format(self.__class__.__name__)) 58 | 59 | def add_samples(self, samples): 60 | raise NotImplementedError( 61 | "{} only supports adding full paths at once.".format(self.__class__.__name__)) 62 | 63 | def 
batch_by_indices(self, 64 | episode_indices, 65 | step_indices, 66 | field_name_filter=None): 67 | assert len(episode_indices) == len(step_indices) 68 | 69 | batch_size = len(episode_indices) 70 | trajectories = [self._trajectories[i] for i in episode_indices] 71 | 72 | batch = { 73 | field_name: np.empty( 74 | (batch_size, *values.shape[1:]), dtype=values.dtype) 75 | for field_name, values in trajectories[0].items() 76 | } 77 | 78 | for i, episode in enumerate(trajectories): 79 | for field_name, episode_values in episode.items(): 80 | batch[field_name][i] = episode_values[step_indices[i]] 81 | 82 | return batch 83 | 84 | def random_batch(self, batch_size, *args, **kwargs): 85 | num_trajectories = len(self._trajectories) 86 | if num_trajectories < 1: 87 | return {} 88 | 89 | trajectory_lengths = np.array(self._trajectory_lengths) 90 | trajectory_weights = trajectory_lengths / np.sum(trajectory_lengths) 91 | trajectory_probabilities = softmax(trajectory_weights) 92 | 93 | trajectory_indices = np.random.choice( 94 | np.arange(num_trajectories), 95 | size=batch_size, 96 | replace=True, 97 | p=trajectory_probabilities) 98 | first_key = next(iter( 99 | self._trajectories[trajectory_indices[0]].keys())) 100 | trajectory_lengths = np.array([ 101 | self._trajectories[trajectory_index][first_key].shape[0] 102 | for trajectory_index in trajectory_indices 103 | ]) 104 | 105 | step_indices = random_int_with_variable_range( 106 | np.zeros_like(trajectory_lengths, dtype=np.int64), 107 | trajectory_lengths) 108 | 109 | batch = self.batch_by_indices(trajectory_indices, step_indices) 110 | 111 | return batch 112 | 113 | def last_n_batch(self, last_n, field_name_filter=None, **kwargs): 114 | num_trajectories = len(self._trajectories) 115 | if num_trajectories < 1: 116 | return {} 117 | 118 | trajectory_indices = [] 119 | step_indices = [] 120 | 121 | trajectory_lengths = 0 122 | for trajectory_index in range(num_trajectories-1, -1, -1): 123 | trajectory = self._trajectories[trajectory_index] 124 | trajectory_length = trajectory[list(trajectory.keys())[0]].shape[0] 125 | 126 | steps_from_this_episode = min(trajectory_length, last_n - trajectory_lengths) 127 | step_indices += list(range( 128 | trajectory_length-1, 129 | trajectory_length - steps_from_this_episode - 1, 130 | -1)) 131 | trajectory_indices += [trajectory_index] * steps_from_this_episode 132 | 133 | trajectory_lengths += trajectory_length 134 | 135 | if trajectory_lengths >= last_n: 136 | break 137 | 138 | trajectory_indices = trajectory_indices[::-1] 139 | step_indices = step_indices[::-1] 140 | 141 | batch = self.batch_by_indices(trajectory_indices, step_indices) 142 | 143 | return batch 144 | 145 | def save_latest_experience(self, pickle_path): 146 | # deque doesn't support direct slicing, thus need to use islice 147 | num_trajectories = self.num_trajectories 148 | start_index = max(num_trajectories - self._trajectories_since_save, 0) 149 | end_index = num_trajectories 150 | 151 | latest_trajectories = tuple(islice( 152 | self._trajectories, start_index, end_index)) 153 | 154 | with gzip.open(pickle_path, 'wb') as f: 155 | pickle.dump(latest_trajectories, f) 156 | 157 | self._trajectories_since_save = 0 158 | 159 | def load_experience(self, experience_path): 160 | with gzip.open(experience_path, 'rb') as f: 161 | latest_trajectories = pickle.load(f) 162 | 163 | self.add_paths(latest_trajectories) 164 | self._trajectories_since_save = 0 165 | -------------------------------------------------------------------------------- /run_scripts/main.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("../") 3 | import os 4 | from RLA.easy_log.tester import tester 5 | from utils import get_parser 6 | from maple.policy import maple 7 | from copy import deepcopy 8 | 9 | def get_params_from_file(filepath, params_name='params'): 10 | import importlib 11 | from dotmap import DotMap 12 | module = importlib.import_module(filepath) 13 | params = getattr(module, params_name) 14 | params = DotMap(params) 15 | return params 16 | 17 | 18 | def get_variant_spec(command_line_args): 19 | from base import get_variant_spec, get_task_spec 20 | params = get_params_from_file(command_line_args.config) 21 | variant_spec = get_variant_spec(command_line_args, params) 22 | print(variant_spec) 23 | if 'neorl' in command_line_args.config: 24 | variant_spec['environment_params']['training']['kwargs']['use_neorl'] = True 25 | else: 26 | variant_spec['environment_params']['training']['kwargs']['use_neorl'] = False 27 | for k,v in vars(command_line_args).items(): 28 | variant_spec[k] = v 29 | variant_spec['run_params']['seed'] = command_line_args.seed 30 | variant_spec = get_task_spec(variant_spec) 31 | return variant_spec 32 | 33 | 34 | 35 | import tensorflow as tf 36 | 37 | from softlearning.environments.utils import get_environment_from_params 38 | from softlearning.replay_pools.utils import get_replay_pool_from_variant 39 | from softlearning.samplers.utils import get_sampler_from_variant 40 | 41 | from softlearning.misc.utils import set_seed 42 | import copy 43 | import maple.policy.static as static 44 | 45 | 46 | def get_package_path(): 47 | return os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 48 | 49 | def main(): 50 | import sys 51 | example_args = get_parser().parse_args(sys.argv[1:]) 52 | 53 | variant_spec = get_variant_spec(example_args) 54 | # command_line_args = example_args 55 | print('vriant spec: {}'.format(variant_spec)) 56 | 57 | # if command_line_args.video_save_frequency is not None: 58 | # assert 'algorithm_params' in variant_spec 59 | # variant_spec['algorithm_params']['kwargs']['video_save_frequency'] = ( 60 | # command_line_args.video_save_frequency) 61 | 62 | variant = variant_spec 63 | # init 64 | set_seed(variant['run_params']['seed']) 65 | gpu_options = tf.GPUOptions(allow_growth=True) 66 | session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 67 | tester.set_hyper_param(**variant) 68 | tf.keras.backend.set_session(session) 69 | 70 | # build 71 | 72 | variant = copy.deepcopy(variant) 73 | # redundant code for compatibility to the older version. 
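# When --elite_num is not set (<= 0), the ensemble size is taken from --model_suffix: all
# model_suffix networks are used and the number of elites defaults to 5/7 of them
# (e.g., model_suffix=7 gives 5 elites, model_suffix=200 gives 142).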
74 | if variant['elite_num'] <= 0: 75 | variant['algorithm_params']['kwargs']['num_networks'] = int(variant['model_suffix']) 76 | variant['algorithm_params']['kwargs']['num_elites'] = int(int(variant['model_suffix']) / 7 * 5) 77 | 78 | if variant['loaded_task_name'] != '': 79 | from RLA import ExperimentLoader 80 | el = ExperimentLoader() 81 | el.config(task_name=variant['loaded_task_name'], 82 | record_date=variant['loaded_date'], root='../') 83 | args = el.import_hyper_parameters(hp_to_overwrite=['retrain_model']) 84 | tester.hyper_param = vars(args) 85 | tester.hyper_param['retrain_model'] = False 86 | tester.hyper_param['algorithm_params'].kwargs.model_load_dir = variant['algorithm_params'].kwargs.model_load_dir 87 | variant = copy.deepcopy(tester.hyper_param) 88 | else: 89 | el = None 90 | tester.add_record_param(['info', "model_suffix", "penalty_coeff", "length", 91 | 'maple_200', 'run_params.seed', 'penalty_clip']) 92 | tester.configure(task_name="v2_" + variant["config"], 93 | rla_config=os.path.join(get_package_path(), 'rla_config_mopo.yaml'), 94 | log_root=get_package_path()) 95 | tester.log_files_gen() 96 | tester.print_args() 97 | environment_params = variant['environment_params'] 98 | training_environment = (get_environment_from_params(environment_params['training'])) 99 | evaluation_environment = (get_environment_from_params(environment_params['evaluation'](variant)) 100 | if 'evaluation' in environment_params else training_environment) 101 | 102 | replay_pool = (get_replay_pool_from_variant(variant, training_environment)) 103 | sampler = get_sampler_from_variant(variant) 104 | 105 | 106 | #### get termination function 107 | domain = environment_params['training']['domain'] 108 | static_fns = static[domain.lower()] 109 | #### 110 | if variant['elite_num'] <= 0: 111 | variant['algorithm_params']['kwargs']['num_networks'] = int(variant['model_suffix']) 112 | variant['algorithm_params']['kwargs']['num_elites'] = int(int(variant['model_suffix']) / 7 * 5) 113 | # construct MAPLE parameters 114 | algorithm_params = variant['algorithm_params'] 115 | algorithm_kwargs = deepcopy(algorithm_params['kwargs']) 116 | exp_name = variant['algorithm_params']["exp_name"] 117 | retrain_model = variant['retrain_model'] 118 | exp_name = exp_name.replace('_', '-') 119 | if algorithm_kwargs['separate_mean_var']: 120 | exp_name += '_smv' 121 | algorithm_kwargs["model_name"] = exp_name + '_1_{}'.format(variant['model_suffix']) 122 | kwargs = algorithm_kwargs.toDict() 123 | 124 | kwargs['penalty_coeff'] = variant['penalty_coeff'] 125 | kwargs['penalty_clip'] = variant['penalty_clip'] 126 | kwargs['rollout_length'] = variant['length'] 127 | kwargs['seed'] = variant['run_params']['seed'] 128 | kwargs['retrain'] = retrain_model 129 | kwargs['network_kwargs']['embedding_size'] = variant['emb_size'] 130 | kwargs['n_epochs'] = variant['n_epochs'] 131 | kwargs['source'] = variant['config'].split('.')[-2] 132 | kwargs['training_environment'] = training_environment 133 | kwargs['evaluation_environment'] = evaluation_environment 134 | kwargs['pool'] = replay_pool 135 | kwargs['static_fns'] = static_fns 136 | kwargs['sampler'] = sampler # to be removed 137 | trainer = maple.MAPLE(**kwargs) 138 | if el is None: 139 | list(trainer.train()) 140 | else: 141 | trainer.vis(el) 142 | trainer.performance_ns(el) 143 | 144 | if __name__=='__main__': 145 | main() -------------------------------------------------------------------------------- /softlearning/environments/gym/multi_goal.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from gym.utils import EzPickle 5 | from gym import spaces 6 | from gym.envs.mujoco.mujoco_env import MujocoEnv 7 | 8 | 9 | class MultiGoalEnv(MujocoEnv, EzPickle): 10 | """ 11 | Move a 2D point mass to one of the goal positions. Cost is the distance to 12 | the closest goal. 13 | 14 | State: position. 15 | Action: velocity. 16 | """ 17 | def __init__(self, 18 | goal_reward=10, 19 | actuation_cost_coeff=30.0, 20 | distance_cost_coeff=1.0, 21 | init_sigma=0.1): 22 | EzPickle.__init__(**locals()) 23 | 24 | self.dynamics = PointDynamics(dim=2, sigma=0) 25 | self.init_mu = np.zeros(2, dtype=np.float32) 26 | self.init_sigma = init_sigma 27 | self.goal_positions = np.array( 28 | ( 29 | (5, 0), 30 | (-5, 0), 31 | (0, 5), 32 | (0, -5) 33 | ), 34 | dtype=np.float32) 35 | self.goal_threshold = 1.0 36 | self.goal_reward = goal_reward 37 | self.action_cost_coeff = actuation_cost_coeff 38 | self.distance_cost_coeff = distance_cost_coeff 39 | self.xlim = (-7, 7) 40 | self.ylim = (-7, 7) 41 | self.vel_bound = 1. 42 | self.reset() 43 | self.observation = None 44 | 45 | self._ax = None 46 | self._env_lines = [] 47 | self.fixed_plots = None 48 | self.dynamic_plots = [] 49 | 50 | def reset(self): 51 | unclipped_observation = ( 52 | self.init_mu 53 | + self.init_sigma 54 | * np.random.normal(size=self.dynamics.s_dim)) 55 | self.observation = np.clip( 56 | unclipped_observation, 57 | self.observation_space.low, 58 | self.observation_space.high) 59 | return self.observation 60 | 61 | @property 62 | def observation_space(self): 63 | return spaces.Box( 64 | low=np.array((self.xlim[0], self.ylim[0])), 65 | high=np.array((self.xlim[1], self.ylim[1])), 66 | dtype=np.float32, 67 | shape=None) 68 | 69 | @property 70 | def action_space(self): 71 | return spaces.Box( 72 | low=-self.vel_bound, 73 | high=self.vel_bound, 74 | shape=(self.dynamics.a_dim, ), 75 | dtype=np.float32) 76 | 77 | def get_current_obs(self): 78 | return np.copy(self.observation) 79 | 80 | def step(self, action): 81 | action = action.ravel() 82 | 83 | action = np.clip( 84 | action, 85 | self.action_space.low, 86 | self.action_space.high).ravel() 87 | 88 | observation = self.dynamics.forward(self.observation, action) 89 | observation = np.clip( 90 | observation, 91 | self.observation_space.low, 92 | self.observation_space.high) 93 | 94 | reward = self.compute_reward(observation, action) 95 | dist_to_goal = np.amin([ 96 | np.linalg.norm(observation - goal_position) 97 | for goal_position in self.goal_positions 98 | ]) 99 | done = dist_to_goal < self.goal_threshold 100 | if done: 101 | reward += self.goal_reward 102 | 103 | self.observation = np.copy(observation) 104 | 105 | return observation, reward, done, {'pos': observation} 106 | 107 | def _init_plot(self): 108 | fig_env = plt.figure(figsize=(7, 7)) 109 | self._ax = fig_env.add_subplot(111) 110 | self._ax.axis('equal') 111 | 112 | self._env_lines = [] 113 | self._ax.set_xlim((-7, 7)) 114 | self._ax.set_ylim((-7, 7)) 115 | 116 | self._ax.set_title('Multigoal Environment') 117 | self._ax.set_xlabel('x') 118 | self._ax.set_ylabel('y') 119 | 120 | self._plot_position_cost(self._ax) 121 | 122 | def render_rollouts(self, paths=()): 123 | """Render for rendering the past rollouts of the environment.""" 124 | if self._ax is None: 125 | self._init_plot() 126 | 127 | # noinspection PyArgumentList 128 | [line.remove() for line in self._env_lines] 129 | 
self._env_lines = [] 130 | 131 | for path in paths: 132 | positions = np.stack([info['pos'] for info in path['infos']]) 133 | xx = positions[:, 0] 134 | yy = positions[:, 1] 135 | self._env_lines += self._ax.plot(xx, yy, 'b') 136 | 137 | plt.draw() 138 | plt.pause(0.01) 139 | 140 | def render(self, mode='human'): 141 | """Rendering the current state of the environment is a no-op; see render_rollouts.""" 142 | pass 143 | 144 | def compute_reward(self, observation, action): 145 | # penalize the squared norm of the action (control cost) 146 | # noinspection PyTypeChecker 147 | action_cost = np.sum(action ** 2) * self.action_cost_coeff 148 | 149 | # penalize squared distance to the closest goal 150 | cur_position = observation 151 | # noinspection PyTypeChecker 152 | goal_cost = self.distance_cost_coeff * np.amin([ 153 | np.sum((cur_position - goal_position) ** 2) 154 | for goal_position in self.goal_positions 155 | ]) 156 | 157 | # total cost: control cost plus distance-to-goal cost 158 | costs = [action_cost, goal_cost] 159 | reward = -np.sum(costs) 160 | return reward 161 | 162 | def _plot_position_cost(self, ax): 163 | delta = 0.01 164 | x_min, x_max = tuple(1.1 * np.array(self.xlim)) 165 | y_min, y_max = tuple(1.1 * np.array(self.ylim)) 166 | X, Y = np.meshgrid( 167 | np.arange(x_min, x_max, delta), 168 | np.arange(y_min, y_max, delta) 169 | ) 170 | goal_costs = np.amin([ 171 | (X - goal_x) ** 2 + (Y - goal_y) ** 2 172 | for goal_x, goal_y in self.goal_positions 173 | ], axis=0) 174 | costs = goal_costs 175 | 176 | contours = ax.contour(X, Y, costs, 20) 177 | ax.clabel(contours, inline=1, fontsize=10, fmt='%.0f') 178 | ax.set_xlim([x_min, x_max]) 179 | ax.set_ylim([y_min, y_max]) 180 | goal = ax.plot(self.goal_positions[:, 0], 181 | self.goal_positions[:, 1], 'ro') 182 | return [contours, goal] 183 | 184 | 185 | class PointDynamics(object): 186 | """ 187 | State: position. 188 | Action: velocity.
189 | """ 190 | def __init__(self, dim, sigma): 191 | self.dim = dim 192 | self.sigma = sigma 193 | self.s_dim = dim 194 | self.a_dim = dim 195 | 196 | def forward(self, state, action): 197 | mu_next = state + action 198 | state_next = mu_next + self.sigma * \ 199 | np.random.normal(size=self.s_dim) 200 | return state_next 201 | -------------------------------------------------------------------------------- /softlearning/environments/adapters/gym_adapter.py: -------------------------------------------------------------------------------- 1 | """Implements a GymAdapter that converts Gym envs into SoftlearningEnv.""" 2 | 3 | import numpy as np 4 | import copy 5 | import gym 6 | from gym import spaces, wrappers 7 | 8 | from .softlearning_env import SoftlearningEnv 9 | from softlearning.environments.gym import register_environments 10 | from softlearning.environments.gym.wrappers import NormalizeActionWrapper 11 | from collections import defaultdict, OrderedDict 12 | 13 | 14 | def parse_domain_task(gym_id): 15 | domain_task_parts = gym_id.split('-') 16 | domain = '-'.join(domain_task_parts[:1]) 17 | task = '-'.join(domain_task_parts[1:]) 18 | 19 | return domain, task 20 | 21 | 22 | CUSTOM_GYM_ENVIRONMENT_IDS = register_environments() 23 | CUSTOM_GYM_ENVIRONMENTS = defaultdict(list) 24 | 25 | for gym_id in CUSTOM_GYM_ENVIRONMENT_IDS: 26 | domain, task = parse_domain_task(gym_id) 27 | CUSTOM_GYM_ENVIRONMENTS[domain].append(task) 28 | 29 | CUSTOM_GYM_ENVIRONMENTS = dict(CUSTOM_GYM_ENVIRONMENTS) 30 | 31 | GYM_ENVIRONMENT_IDS = tuple(gym.envs.registry.env_specs.keys()) 32 | GYM_ENVIRONMENTS = defaultdict(list) 33 | 34 | 35 | for gym_id in GYM_ENVIRONMENT_IDS: 36 | domain, task = parse_domain_task(gym_id) 37 | GYM_ENVIRONMENTS[domain].append(task) 38 | 39 | GYM_ENVIRONMENTS = dict(GYM_ENVIRONMENTS) 40 | 41 | DEFAULT_OBSERVATION_KEY = 'observations' 42 | 43 | 44 | class GymAdapter(SoftlearningEnv): 45 | """Adapter that implements the SoftlearningEnv for Gym envs.""" 46 | 47 | def __init__(self, 48 | domain, 49 | task, 50 | *args, 51 | env=None, 52 | normalize=True, 53 | observation_keys=None, 54 | unwrap_time_limit=True, 55 | use_neorl=False, 56 | **kwargs): 57 | assert not args, ( 58 | "Gym environments don't support args. Use kwargs instead.") 59 | 60 | self.normalize = normalize 61 | self.observation_keys = observation_keys 62 | self.unwrap_time_limit = unwrap_time_limit 63 | 64 | super(GymAdapter, self).__init__(domain, task, *args, **kwargs) 65 | 66 | if env is None: 67 | assert (domain is not None and task is not None), (domain, task) 68 | env_id = "{}-{}".format(domain, task) 69 | if use_neorl: 70 | import neorl 71 | env = neorl.make(env_id) 72 | else: 73 | env = gym.envs.make(env_id, **kwargs) 74 | else: 75 | assert domain is None and task is None, (domain, task) 76 | 77 | if isinstance(env, wrappers.TimeLimit) and unwrap_time_limit: 78 | # Remove the TimeLimit wrapper that sets 'done = True' when 79 | # the time limit specified for each environment has been passed and 80 | # therefore the environment is not Markovian (terminal condition 81 | # depends on time rather than state). 
82 | env = env.env 83 | 84 | if normalize: 85 | env = NormalizeActionWrapper(env) 86 | 87 | self._env = env 88 | 89 | if isinstance(self._env.observation_space, spaces.Dict): 90 | dict_observation_space = self._env.observation_space 91 | self.observation_keys = ( 92 | observation_keys or (*self._env.observation_space.spaces.keys(), )) 93 | elif isinstance(self._env.observation_space, spaces.Box): 94 | dict_observation_space = spaces.Dict(OrderedDict(( 95 | (DEFAULT_OBSERVATION_KEY, self._env.observation_space), 96 | ))) 97 | self.observation_keys = (DEFAULT_OBSERVATION_KEY, ) 98 | 99 | self._observation_space = type(dict_observation_space)([ 100 | (name, copy.deepcopy(space)) 101 | for name, space in dict_observation_space.spaces.items() 102 | if name in self.observation_keys 103 | ]) 104 | 105 | @property 106 | def observation_space(self): 107 | observation_space = self._observation_space 108 | return observation_space 109 | 110 | @property 111 | def active_observation_shape(self): 112 | """Shape for the active observation based on observation_keys.""" 113 | # if not isinstance(self._env.observation_space, spaces.Dict): 114 | # return super(GymAdapter, self).active_observation_shape 115 | if not isinstance(self.observation_space, spaces.Dict): 116 | return super(GymAdapter, self).active_observation_shape 117 | 118 | observation_keys = ( 119 | self.observation_keys 120 | or list(self.observation_space.spaces.keys())) 121 | 122 | active_size = sum( 123 | np.prod(self.observation_space.spaces[key].shape) 124 | for key in observation_keys) 125 | 126 | active_observation_shape = (active_size, ) 127 | 128 | return active_observation_shape 129 | 130 | def convert_to_active_observation(self, observation): 131 | # if not isinstance(self._env.observation_space, spaces.Dict): 132 | # return observation 133 | if not isinstance(self.observation_space, spaces.Dict): 134 | return observation 135 | 136 | observation_keys = ( 137 | self.observation_keys 138 | or list(self.observation_space.spaces.keys())) 139 | 140 | observation = np.concatenate([ 141 | observation[key] for key in observation_keys 142 | ], axis=-1) 143 | 144 | return observation 145 | 146 | @property 147 | def action_space(self, *args, **kwargs): 148 | action_space = self._env.action_space 149 | if len(action_space.shape) > 1: 150 | raise NotImplementedError( 151 | "Action space ({}) is not flat, make sure to check the" 152 | " implementation.".format(action_space)) 153 | return action_space 154 | 155 | def step(self, action, *args, **kwargs): 156 | observation, reward, terminal, info = self._env.step( 157 | action, *args, **kwargs) 158 | 159 | if not isinstance(self._env.observation_space, spaces.Dict): 160 | observation = {DEFAULT_OBSERVATION_KEY: observation} 161 | 162 | observation = self._filter_observation(observation) 163 | return observation, reward, terminal, info 164 | 165 | def reset(self, *args, **kwargs): 166 | observation = self._env.reset() 167 | 168 | if not isinstance(self._env.observation_space, spaces.Dict): 169 | observation = {DEFAULT_OBSERVATION_KEY: observation} 170 | 171 | observation = self._filter_observation(observation) 172 | return observation 173 | 174 | def render(self, *args, **kwargs): 175 | return self._env.render(*args, **kwargs) 176 | 177 | def close(self, *args, **kwargs): 178 | return self._env.close(*args, **kwargs) 179 | 180 | def seed(self, *args, **kwargs): 181 | return self._env.seed(*args, **kwargs) 182 | 183 | @property 184 | def unwrapped(self): 185 | return self._env.unwrapped 186 | 187 | def
get_param_values(self, *args, **kwargs): 188 | raise NotImplementedError 189 | 190 | def set_param_values(self, *args, **kwargs): 191 | raise NotImplementedError 192 | -------------------------------------------------------------------------------- /softlearning/scripts/console_scripts.py: -------------------------------------------------------------------------------- 1 | """A command line interface that exposes softlearning examples to the user. 2 | 3 | This package exposes the functions in the examples.instrument module to the user 4 | through a cli, which allows seamless runs of examples in different modes (e.g. 5 | locally, in google compute engine, or ec2). 6 | 7 | 8 | There are two types of cli commands in this file (each has a corresponding 9 | function in examples.instrument): 10 | 1. run_example_* methods, which run the experiments by invoking the 11 | `tune.run_experiments` function. 12 | 2. launch_example_* methods, which are helper functions that submit an 13 | example to be run in the cloud. In practice, these launch a cluster, 14 | and then run the `run_example_cluster` method with the provided 15 | arguments and options. 16 | """ 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import logging 23 | 24 | import click 25 | 26 | from examples.instrument import ( 27 | run_example_dry, 28 | run_example_local, 29 | run_example_debug, 30 | run_example_cluster, 31 | launch_example_cluster, 32 | launch_example_gce, 33 | launch_example_ec2) 34 | 35 | 36 | logging.basicConfig(level=logging.INFO) 37 | logger = logging.getLogger(__name__) 38 | logger.setLevel(logging.INFO) 39 | 40 | 41 | def add_options(options): 42 | def decorator(f): 43 | for option in options[::-1]: 44 | click.decorators._param_memo(f, option) 45 | return f 46 | return decorator 47 | 48 | 49 | @click.group() 50 | def cli(): 51 | pass 52 | 53 | 54 | @cli.command( 55 | name='run_example_dry', 56 | context_settings={'ignore_unknown_options': True}) 57 | @click.argument("example_module_name", required=True, type=str) 58 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 59 | def run_example_dry_cmd(example_module_name, example_argv): 60 | """Print the variant spec and related information of an example.""" 61 | return run_example_dry(example_module_name, example_argv) 62 | 63 | 64 | @cli.command( 65 | name='run_local', 66 | context_settings={'ignore_unknown_options': True}) 67 | @click.argument("example_module_name", required=True, type=str) 68 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 69 | def run_example_local_cmd(example_module_name, example_argv): 70 | """Run example locally, potentially parallelizing across cpus/gpus.""" 71 | return run_example_local(example_module_name, example_argv) 72 | 73 | 74 | @cli.command( 75 | name='run_example_debug', 76 | context_settings={'ignore_unknown_options': True}) 77 | @click.argument("example_module_name", required=True, type=str) 78 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 79 | def run_example_debug_cmd(example_module_name, example_argv): 80 | """The debug mode limits tune trial runs to enable use of the debugger.""" 81 | return run_example_debug(example_module_name, example_argv) 82 | 83 | 84 | @cli.command( 85 | name='run_example_cluster', 86 | context_settings={'ignore_unknown_options': True}) 87 | @click.argument("example_module_name", required=True, type=str) 88 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 89
| def run_example_cluster_cmd(example_module_name, example_argv): 90 | """Run example in cluster mode. 91 | 92 | This function is very similar to the local mode, except that it 93 | correctly sets the redis address to make ray/tune work on a cluster. 94 | """ 95 | run_example_cluster(example_module_name, example_argv) 96 | 97 | 98 | @cli.command( 99 | name='launch_example_cluster', 100 | context_settings={ 101 | 'allow_extra_args': True, 102 | 'ignore_unknown_options': True 103 | }) 104 | @click.argument("example_module_name", required=True, type=str) 105 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 106 | @click.option( 107 | "--config_file", 108 | required=False, 109 | type=str) 110 | @click.option( 111 | "--stop/--no-stop", 112 | is_flag=True, 113 | default=True, 114 | help="Stop the cluster after the command finishes running.") 115 | @click.option( 116 | "--start/--no-start", 117 | is_flag=True, 118 | default=True, 119 | help="Start the cluster if needed.") 120 | @click.option( 121 | "--screen/--no-screen", 122 | is_flag=True, 123 | default=False, 124 | help="Run the command in a screen.") 125 | @click.option( 126 | "--tmux/--no-tmux", 127 | is_flag=True, 128 | default=True, 129 | help="Run the command in tmux.") 130 | @click.option( 131 | "--override-cluster-name", 132 | required=False, 133 | type=str, 134 | help="Override the configured cluster name.") 135 | @click.option( 136 | "--port-forward", required=False, type=int, help="Port to forward.") 137 | def launch_example_cluster_cmd(*args, **kwargs): 138 | """Launches the example on an autoscaled ray cluster through ray exec_cmd. 139 | 140 | This handles basic validation and sanity checks for the experiment, and 141 | then executes the command on the autoscaled ray cluster. If necessary, it will 142 | also fill in more useful defaults for our workflow (i.e. for tmux and 143 | override_cluster_name). 144 | """ 145 | return launch_example_cluster(*args, **kwargs) 146 | 147 | 148 | @cli.command( 149 | name='launch_example_gce', 150 | context_settings={ 151 | 'allow_extra_args': True, 152 | 'ignore_unknown_options': True 153 | }) 154 | @add_options(launch_example_cluster_cmd.params) 155 | def launch_example_gce_cmd(*args, **kwargs): 156 | """Forwards call to `launch_example_cluster` after adding gce defaults. 157 | 158 | This optionally sets the ray autoscaler configuration file to the default 159 | gce configuration file, and then calls `launch_example_cluster` to 160 | execute the original command on the autoscaled gce cluster by parsing the args. 161 | 162 | See `launch_example_cluster` for further details. 163 | """ 164 | return launch_example_gce(*args, **kwargs) 165 | 166 | 167 | @cli.command( 168 | name='launch_example_ec2', 169 | context_settings={ 170 | 'allow_extra_args': True, 171 | 'ignore_unknown_options': True 172 | }) 173 | @add_options(launch_example_cluster_cmd.params) 174 | def launch_example_ec2_cmd(*args, **kwargs): 175 | """Forwards call to `launch_example_cluster` after adding ec2 defaults. 176 | 177 | This optionally sets the ray autoscaler configuration file to the default 178 | ec2 configuration file, and then calls `launch_example_cluster` to 179 | execute the original command on the autoscaled ec2 cluster by parsing the args. 180 | 181 | See `launch_example_cluster` for further details.
182 | """ 183 | return launch_example_ec2(*args, **kwargs) 184 | 185 | 186 | cli.add_command(run_example_local_cmd) 187 | cli.add_command(run_example_dry_cmd) 188 | cli.add_command(run_example_cluster_cmd) 189 | 190 | # Alias for run_example_local 191 | cli.add_command(run_example_local_cmd, name='launch_example_local') 192 | # Alias for run_example_dry 193 | cli.add_command(run_example_dry_cmd, name='launch_example_dry') 194 | # Alias for run_example_debug 195 | cli.add_command(run_example_debug_cmd, name='launch_example_debug') 196 | cli.add_command(launch_example_cluster_cmd) 197 | cli.add_command(launch_example_gce_cmd) 198 | cli.add_command(launch_example_ec2_cmd) 199 | 200 | 201 | def main(): 202 | return cli() 203 | 204 | 205 | if __name__ == "__main__": 206 | main() 207 | -------------------------------------------------------------------------------- /softlearning/replay_pools/flexible_replay_pool.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import pickle 3 | 4 | import numpy as np 5 | 6 | from .replay_pool import ReplayPool 7 | 8 | 9 | class FlexibleReplayPool(ReplayPool): 10 | def __init__(self, max_size, fields_attrs, obs_filter=False, modify_rew=False): 11 | super(FlexibleReplayPool, self).__init__() 12 | 13 | max_size = int(max_size) 14 | self._max_size = max_size 15 | 16 | self.fields = {} 17 | self.fields_attrs = {} 18 | 19 | self.add_fields(fields_attrs) 20 | 21 | self.obs_filter = obs_filter 22 | self.modify_rew = modify_rew 23 | 24 | self._pointer = 0 25 | self._size = 0 26 | self._samples_since_save = 0 27 | 28 | @property 29 | def size(self): 30 | return self._size 31 | 32 | @property 33 | def field_names(self): 34 | return list(self.fields.keys()) 35 | 36 | def add_fields(self, fields_attrs): 37 | self.fields_attrs.update(fields_attrs) 38 | 39 | for field_name, field_attrs in fields_attrs.items(): 40 | field_shape = (self._max_size, *field_attrs['shape']) 41 | initializer = field_attrs.get('initializer', np.zeros) 42 | self.fields[field_name] = initializer( 43 | field_shape, dtype=field_attrs['dtype']) 44 | 45 | def _advance(self, count=1): 46 | self._pointer = (self._pointer + count) % self._max_size 47 | self._size = min(self._size + count, self._max_size) 48 | self._samples_since_save += count 49 | 50 | def add_sample(self, sample): 51 | samples = { 52 | key: value[None, ...] 
53 | for key, value in sample.items() 54 | } 55 | self.add_samples(samples) 56 | 57 | def add_samples(self, samples): 58 | # if 'infos' not in samples: 59 | # samples['infos'] = {} 60 | field_names = list(samples.keys()) 61 | num_samples = samples[field_names[0]].shape[0] 62 | index = np.arange( 63 | self._pointer, self._pointer + num_samples) % self._max_size 64 | for field_name in self.field_names: 65 | # print(field_name) 66 | default_value = ( 67 | self.fields_attrs[field_name].get('default_value', 0.0)) 68 | values = samples.get(field_name, default_value) 69 | if field_name not in samples.keys() and 'infos' in samples and field_name in samples['infos'][0].keys(): 70 | values = np.expand_dims(np.array([samples['infos'][i].get(field_name, default_value) for i in range(num_samples)]), axis=1) 71 | try: 72 | assert values.shape[0] == num_samples, f'value shape: {values.shape[0]}, expected: {num_samples}' 73 | if isinstance(values[0], dict): 74 | values = np.stack([np.concatenate([ 75 | value[key] 76 | for key in value.keys() 77 | ], axis=-1) for value in values]) 78 | self.fields[field_name][index] = values 79 | except Exception as e: 80 | import traceback 81 | traceback.print_exc(limit=10) 82 | print('[ DEBUG ] errors occurs: {}'.format(e)) 83 | 84 | import pdb; pdb.set_trace() 85 | self._advance(num_samples) 86 | 87 | def restore_samples(self, samples): 88 | num_samples = samples[list(samples.keys())[0]].shape[0] 89 | index = np.arange( 90 | 0, num_samples) % self._max_size 91 | for key, values in samples.items(): 92 | assert key in self.field_names 93 | self.fields[key][index] = values 94 | 95 | def random_indices(self, batch_size): 96 | if self._size == 0: return np.arange(0, 0) 97 | return np.random.randint(0, self._size, batch_size) 98 | 99 | def random_batch(self, batch_size, field_name_filter=None, **kwargs): 100 | random_indices = self.random_indices(batch_size) 101 | return self.batch_by_indices( 102 | random_indices, field_name_filter=field_name_filter, **kwargs) 103 | 104 | def last_n_batch(self, last_n, field_name_filter=None, **kwargs): 105 | last_n_indices = np.arange( 106 | self._pointer - min(self.size, last_n), self._pointer 107 | ) % self._max_size 108 | return self.batch_by_indices( 109 | last_n_indices, field_name_filter=field_name_filter, **kwargs) 110 | 111 | def filter_fields(self, field_names, field_name_filter): 112 | if isinstance(field_name_filter, str): 113 | field_name_filter = [field_name_filter] 114 | 115 | if isinstance(field_name_filter, (list, tuple)): 116 | field_name_list = field_name_filter 117 | 118 | def filter_fn(field_name): 119 | return field_name in field_name_list 120 | 121 | else: 122 | filter_fn = field_name_filter 123 | 124 | filtered_field_names = [ 125 | field_name for field_name in field_names 126 | if filter_fn(field_name) 127 | ] 128 | 129 | return filtered_field_names 130 | 131 | def batch_by_indices(self, indices, field_name_filter=None): 132 | if np.any(indices % self._max_size > self.size): 133 | raise ValueError( 134 | "Tried to retrieve batch with indices greater than current" 135 | " size") 136 | 137 | field_names = self.field_names 138 | if field_name_filter is not None: 139 | field_names = self.filter_fields( 140 | field_names, field_name_filter) 141 | 142 | return { 143 | field_name: self.fields[field_name][indices] 144 | for field_name in field_names 145 | } 146 | 147 | def save_latest_experience(self, pickle_path): 148 | latest_samples = self.last_n_batch(self._samples_since_save) 149 | 150 | with gzip.open(pickle_path, 
'wb') as f: 151 | pickle.dump(latest_samples, f) 152 | 153 | self._samples_since_save = 0 154 | 155 | def load_experience(self, experience_path): 156 | with gzip.open(experience_path, 'rb') as f: 157 | latest_samples = pickle.load(f) 158 | 159 | key = list(latest_samples.keys())[0] 160 | num_samples = latest_samples[key].shape[0] 161 | for field_name, data in latest_samples.items(): 162 | assert data.shape[0] == num_samples, data.shape 163 | 164 | self.add_samples(latest_samples) 165 | self._samples_since_save = 0 166 | 167 | def return_all_samples(self): 168 | return { 169 | field_name: self.fields[field_name][:self.size] 170 | for field_name in self.field_names 171 | } 172 | 173 | def __getstate__(self): 174 | state = self.__dict__.copy() 175 | state['fields'] = { 176 | field_name: self.fields[field_name][:self.size] 177 | for field_name in self.field_names 178 | } 179 | 180 | return state 181 | 182 | def __setstate__(self, state): 183 | if state['_size'] < state['_max_size']: 184 | pad_size = state['_max_size'] - state['_size'] 185 | for field_name in state['fields'].keys(): 186 | field_shape = state['fields_attrs'][field_name]['shape'] 187 | state['fields'][field_name] = np.concatenate(( 188 | state['fields'][field_name], 189 | np.zeros((pad_size, *field_shape)) 190 | ), axis=0) 191 | 192 | self.__dict__ = state 193 | -------------------------------------------------------------------------------- /run_scripts/base.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pdb 4 | 5 | from softlearning.misc.utils import get_git_rev, deep_update 6 | 7 | M = 256 8 | REPARAMETERIZE = True 9 | 10 | NUM_COUPLING_LAYERS = 2 11 | 12 | GAUSSIAN_POLICY_PARAMS_BASE = { 13 | 'type': 'GaussianPolicy', 14 | 'kwargs': { 15 | 'hidden_layer_sizes': (M, M), 16 | 'squash': True, 17 | } 18 | } 19 | 20 | GAUSSIAN_POLICY_PARAMS_FOR_DOMAIN = {} 21 | 22 | POLICY_PARAMS_BASE = { 23 | 'GaussianPolicy': GAUSSIAN_POLICY_PARAMS_BASE, 24 | } 25 | 26 | POLICY_PARAMS_BASE.update({ 27 | 'gaussian': POLICY_PARAMS_BASE['GaussianPolicy'], 28 | }) 29 | 30 | POLICY_PARAMS_FOR_DOMAIN = { 31 | 'GaussianPolicy': GAUSSIAN_POLICY_PARAMS_FOR_DOMAIN, 32 | } 33 | 34 | POLICY_PARAMS_FOR_DOMAIN.update({ 35 | 'gaussian': POLICY_PARAMS_FOR_DOMAIN['GaussianPolicy'], 36 | }) 37 | 38 | DEFAULT_MAX_PATH_LENGTH = 1000 39 | MAX_PATH_LENGTH_PER_DOMAIN = { 40 | 'Point2DEnv': 50, 41 | 'Point2DWallEnv': 50, 42 | 'Pendulum': 200, 43 | } 44 | import tensorflow as tf 45 | import os 46 | def get_package_path(): 47 | return os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 48 | 49 | ALGORITHM_PARAMS_ADDITIONAL = { 50 | 'MAPLE': { 51 | 'type': 'MAPLE', 52 | 'kwargs': { 53 | 'reparameterize': REPARAMETERIZE, 54 | 'lr': 3e-4, 55 | 'target_update_interval': 1, 56 | 'tau': 5e-3, 57 | 'store_extra_policy_info': False, 58 | 'action_prior': 'uniform', 59 | 'n_initial_exploration_steps': int(5000), 60 | "model_load_dir": os.path.join(get_package_path(), 'models'), 61 | "num_networks": 7, 62 | "network_kwargs": { 63 | "hidden_sizes": [256, 256], 64 | "activation": tf.nn.relu, 65 | "output_activation": None, 66 | "lstm_hidden_unit": 128, 67 | "embedding_size": 16 68 | } 69 | } 70 | }, 71 | } 72 | 73 | DEFAULT_NUM_EPOCHS = 200 74 | 75 | NUM_EPOCHS_PER_DOMAIN = { 76 | 'Hopper': int(1e3), 77 | 'HalfCheetah': int(1e3), 78 | 'Walker2d': int(1e3), 79 | } 80 | 81 | ALGORITHM_PARAMS_PER_DOMAIN = { 82 | **{ 83 | domain: { 84 | 'kwargs': { 85 | 'n_epochs': NUM_EPOCHS_PER_DOMAIN.get(domain, 
DEFAULT_NUM_EPOCHS), 86 | 'n_initial_exploration_steps': ( 87 | MAX_PATH_LENGTH_PER_DOMAIN.get(domain, DEFAULT_MAX_PATH_LENGTH) * 10), 88 | } 89 | } for domain in NUM_EPOCHS_PER_DOMAIN 90 | } 91 | } 92 | 93 | 94 | NUM_CHECKPOINTS = 10 95 | 96 | 97 | def get_variant_spec_base(universe, domain, task, policy, algorithm, env_params): 98 | print("get algorithms", algorithm) 99 | algorithm_params = deep_update( 100 | env_params, 101 | ALGORITHM_PARAMS_PER_DOMAIN.get(domain, {}) 102 | ) 103 | algorithm_params = deep_update( 104 | algorithm_params, 105 | ALGORITHM_PARAMS_ADDITIONAL.get(algorithm, {}) 106 | ) 107 | variant_spec = { 108 | # 'git_sha': get_git_rev(), 109 | 110 | 'environment_params': { 111 | 'training': { 112 | 'domain': domain, 113 | 'task': task, 114 | 'universe': universe, 115 | 'kwargs': {}, 116 | }, 117 | 'evaluation': lambda spec: ( 118 | spec['environment_params']['training']), 119 | }, 120 | 'policy_params': deep_update( 121 | POLICY_PARAMS_BASE[policy], 122 | POLICY_PARAMS_FOR_DOMAIN[policy].get(domain, {}) 123 | ), 124 | 'Q_params': { 125 | 'type': 'double_feedforward_Q_function', 126 | 'kwargs': { 127 | 'hidden_layer_sizes': (M, M), 128 | } 129 | }, 130 | 'algorithm_params': algorithm_params, 131 | 'replay_pool_params': { 132 | 'type': 'SimpleReplayPool', 133 | 'kwargs': { 134 | 'max_size': lambda spec: ( 135 | { 136 | 'SimpleReplayPool': int(1e6), 137 | 'TrajectoryReplayPool': int(1e4), 138 | }.get(spec['replay_pool_params']['type'], int(1e6)) 139 | ), 140 | } 141 | }, 142 | 'sampler_params': { 143 | 'type': 'SimpleSampler', 144 | 'kwargs': { 145 | 'max_path_length': MAX_PATH_LENGTH_PER_DOMAIN.get( 146 | domain, DEFAULT_MAX_PATH_LENGTH), 147 | 'min_pool_size': MAX_PATH_LENGTH_PER_DOMAIN.get( 148 | domain, DEFAULT_MAX_PATH_LENGTH), 149 | 'batch_size': 256, 150 | } 151 | }, 152 | 'run_params': { 153 | 'seed': 88, 154 | 'checkpoint_at_end': True, 155 | 'checkpoint_frequency': NUM_EPOCHS_PER_DOMAIN.get( 156 | domain, DEFAULT_NUM_EPOCHS) // NUM_CHECKPOINTS, 157 | 'checkpoint_replay_pool': False, 158 | 'info': '' 159 | }, 160 | } 161 | 162 | return variant_spec 163 | 164 | def get_variant_spec(args, env_params): 165 | universe, domain, task = env_params.universe, env_params.domain, env_params.task 166 | variant_spec = get_variant_spec_base( 167 | universe, domain, task, args.policy, env_params.type, env_params) 168 | return variant_spec 169 | 170 | NEORL_CONFIG = { 171 | "hopper": 172 | { 173 | 'common': { 174 | 'length': 10, 175 | 'penalty_coeff': 1.0, 176 | }, 177 | }, 178 | "halfcheetah": 179 | { 180 | 'common': { 181 | 'penalty_clip': 4, 182 | 'length': 15, 183 | 'penalty_coeff': 1.0, 184 | } 185 | }, 186 | 'walker2d': 187 | { 188 | 'common': { 189 | 'length': 15, 190 | 'penalty_coeff': 0.25, 191 | } 192 | } 193 | } 194 | D4RL_MAPLE_CONFIG = { 195 | 'common':{ 196 | 'length': 10, 197 | 'penalty_coeff': 0.25, 198 | }, 199 | 'halfcheetah':{ 200 | 'common': {}, 201 | 'medium-expert': 202 | { 203 | 'n_epochs': 2000, 204 | 'penalty_coeff': 5.0, 205 | } 206 | } 207 | } 208 | 209 | D4RL_MAPLE_200_CONFIG = { 210 | 'common': { 211 | 'length': 20, 212 | 'penalty_coeff': 0.25, 213 | }, 214 | 'halfcheetah': { 215 | 'common': {}, 216 | 'medium-expert': 217 | { 218 | 'n_epochs': 2000, 219 | 'length': 10, 220 | 'penalty_coeff': 5.0, 221 | }, 222 | 'mixed': { 223 | 'penalty_clip': 4.0, 224 | } 225 | }, 226 | 'hopper': { 227 | 'common': { 228 | 'penalty_coeff': 1.0, 229 | } 230 | }, 231 | } 232 | def get_task_spec(variant_spec): 233 | if variant_spec["custom_config"]: 234 | return 
variant_spec 235 | else: 236 | if variant_spec['environment_params']['training']['kwargs']['use_neorl']: 237 | if variant_spec['maple_200']: 238 | assert False, "maple_200 has not been tested on NeoRL yet" 239 | variant_spec['model_suffix'] = 50 240 | tasks = variant_spec['config'].split('.')[-1].split('_') 241 | variant_spec.update(NEORL_CONFIG[tasks[0]]['common']) 242 | else: 243 | tasks = variant_spec['config'].split('.')[-1].split('_') 244 | if variant_spec['maple_200']: 245 | variant_spec['model_suffix'] = 200 246 | config = D4RL_MAPLE_200_CONFIG 247 | else: 248 | variant_spec['model_suffix'] = 20 249 | config = D4RL_MAPLE_CONFIG 250 | variant_spec.update(config['common']) 251 | if tasks[0] in config.keys(): 252 | variant_spec.update(config[tasks[0]]['common']) 253 | behavior_type = '-'.join(tasks[1:]) 254 | if behavior_type in config[tasks[0]].keys(): 255 | variant_spec.update(config[tasks[0]][behavior_type]) 256 | return variant_spec 257 | -------------------------------------------------------------------------------- /maple/policy/fake_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | class FakeEnv: 5 | def __init__(self, model, config, 6 | penalty_coeff=0., 7 | max_penalty=4.0, 8 | # fix_env=False, 9 | penalty_learned_var=False, 10 | penalty_learned_var_random=False): 11 | self.model = model 12 | self.config = config 13 | self.max_penalty = max_penalty 14 | # self.fix_env = fix_env 15 | self.penalty_coeff = penalty_coeff 16 | self.penalty_learned_var = penalty_learned_var 17 | self.penalty_learned_var_random = penalty_learned_var_random 18 | 19 | ''' 20 | x : [ batch_size, obs_dim + 1 ] 21 | means : [ num_models, batch_size, obs_dim + 1 ] 22 | vars : [ num_models, batch_size, obs_dim + 1 ] 23 | ''' 24 | def _get_logprob(self, x, means, variances): 25 | 26 | k = x.shape[-1] 27 | 28 | ## [ num_networks, batch_size ] 29 | log_prob = -1/2 * (k * np.log(2*np.pi) + np.log(variances).sum(-1) + (np.power(x-means, 2)/variances).sum(-1)) 30 | 31 | ## [ batch_size ] 32 | prob = np.exp(log_prob).sum(0) 33 | 34 | ## [ batch_size ] 35 | log_prob = np.log(prob) 36 | 37 | stds = np.std(means,0).mean(-1) 38 | return log_prob, stds 39 | 40 | def reset(self, rollout_length, batch_size): 41 | 42 | self.ts = 0 43 | # Randomly select a subset of the elite dynamics models (at least 10) for this batch of rollouts, 44 | # rather than always using a single model, so that MAPLE learns to adapt to different combinations of the dynamics models.
45 | ensemble_number = np.random.randint(10, len(self.model._model_inds)) 46 | sub_model_idx = np.random.choice(self.model._model_inds, size=ensemble_number) 47 | self.model_inds = np.random.choice(sub_model_idx, size=batch_size) 48 | self.rollout_length = rollout_length 49 | self.reset_ratio = np.random.random() 50 | 51 | def step(self, obs, act, deterministic=False): 52 | assert len(obs.shape) == len(act.shape) 53 | if len(obs.shape) == 1: 54 | obs = obs[None] 55 | act = act[None] 56 | return_single = True 57 | else: 58 | return_single = False 59 | 60 | inputs = np.concatenate((obs, act), axis=-1) 61 | batch_length = inputs.shape[0] 62 | all_means = [] 63 | all_vars = [] 64 | if self.model.num_nets * batch_length > 500000: 65 | group_batch_num = int(500000 / self.model.num_nets) 66 | for i in range(int(np.ceil(batch_length / group_batch_num))): 67 | ensemble_model_means, ensemble_model_vars = self.model.predict( 68 | inputs[i * group_batch_num: (i + 1) * group_batch_num], factored=True) 69 | all_means.append(ensemble_model_means) 70 | all_vars.append(ensemble_model_vars) 71 | else: 72 | ensemble_model_means, ensemble_model_vars = self.model.predict( 73 | inputs[:], factored=True) 74 | all_means.append(ensemble_model_means) 75 | all_vars.append(ensemble_model_vars) 76 | # print(self.model.num_nets * batch_length) 77 | ensemble_model_means = np.concatenate(all_means, axis=1) 78 | ensemble_model_vars = np.concatenate(all_vars, axis=1) 79 | 80 | ensemble_model_means[:, :, 1:] += obs 81 | ensemble_model_stds = np.sqrt(ensemble_model_vars) 82 | 83 | if deterministic: 84 | ensemble_samples = ensemble_model_means 85 | else: 86 | ensemble_samples = ensemble_model_means + np.random.normal(size=ensemble_model_means.shape) * ensemble_model_stds 87 | 88 | if not deterministic: 89 | #### choose one model from ensemble 90 | num_models, batch_size, _ = ensemble_model_means.shape 91 | model_inds = self.model.random_inds(batch_size) 92 | batch_inds = np.arange(0, batch_size) 93 | samples = ensemble_samples[model_inds, batch_inds] 94 | model_means = ensemble_model_means[model_inds, batch_inds] 95 | model_stds = ensemble_model_stds[model_inds, batch_inds] 96 | else: 97 | samples = np.mean(ensemble_samples, axis=0) 98 | model_means = np.mean(ensemble_model_means, axis=0) 99 | model_stds = np.mean(ensemble_model_stds, axis=0) 100 | log_prob, dev = self._get_logprob(samples, ensemble_model_means, ensemble_model_vars) 101 | 102 | rewards, next_obs = samples[:,:1], samples[:,1:] 103 | terminals = self.config.termination_fn(obs, act, next_obs) 104 | 105 | batch_size = model_means.shape[0] 106 | return_means = np.concatenate((model_means[:,:1], terminals, model_means[:,1:]), axis=-1) 107 | return_stds = np.concatenate((model_stds[:,:1], np.zeros((batch_size,1)), model_stds[:,1:]), axis=-1) 108 | self.ts += 1 109 | assert self.penalty_coeff != 0 110 | if self.penalty_coeff != 0: 111 | if not self.penalty_learned_var: 112 | ensemble_means_obs = ensemble_model_means[:,:,1:] 113 | mean_obs_means = np.mean(ensemble_means_obs, axis=0) # average predictions over models 114 | diffs = ensemble_means_obs - mean_obs_means 115 | normalize_diffs = False 116 | if normalize_diffs: 117 | obs_dim = next_obs.shape[1] 118 | obs_sigma = self.model.scaler.cached_sigma[0,:obs_dim] 119 | diffs = diffs / obs_sigma 120 | dists = np.linalg.norm(diffs, axis=2) # distance in obs space 121 | penalty = np.max(dists, axis=0) # max distances over models 122 | else: 123 | penalty = np.amax(np.linalg.norm(ensemble_model_stds, axis=2), axis=0) 124 
| 125 | penalty = np.expand_dims(penalty, 1) 126 | penalty = np.clip(penalty, a_max=self.max_penalty, a_min=None) 127 | assert penalty.shape == rewards.shape 128 | unpenalized_rewards = rewards 129 | penalized_rewards = rewards - self.penalty_coeff * penalty 130 | else: 131 | penalty = None 132 | unpenalized_rewards = rewards 133 | penalized_rewards = rewards 134 | 135 | if return_single: 136 | next_obs = next_obs[0] 137 | return_means = return_means[0] 138 | return_stds = return_stds[0] 139 | unpenalized_rewards = unpenalized_rewards[0] 140 | penalized_rewards = penalized_rewards[0] 141 | terminals = terminals[0] 142 | 143 | info = {'mean': return_means, 'std': return_stds, 'log_prob': log_prob, 'dev': dev, 144 | 'unpenalized_rewards': unpenalized_rewards, 'penalty': penalty, 'penalized_rewards': penalized_rewards} 145 | return next_obs, penalized_rewards, terminals, info 146 | 147 | ## for debugging computation graph 148 | def step_ph(self, obs_ph, act_ph, deterministic=False): 149 | assert len(obs_ph.shape) == len(act_ph.shape) 150 | 151 | inputs = tf.concat([obs_ph, act_ph], axis=1) 152 | # inputs = np.concatenate((obs, act), axis=-1) 153 | ensemble_model_means, ensemble_model_vars = self.model.create_prediction_tensors(inputs, factored=True) 154 | # ensemble_model_means, ensemble_model_vars = self.model.predict(inputs, factored=True) 155 | ensemble_model_means = tf.concat([ensemble_model_means[:,:,0:1], ensemble_model_means[:,:,1:] + obs_ph[None]], axis=-1) 156 | # ensemble_model_means[:,:,1:] += obs_ph 157 | ensemble_model_stds = tf.sqrt(ensemble_model_vars) 158 | # ensemble_model_stds = np.sqrt(ensemble_model_vars) 159 | 160 | if deterministic: 161 | ensemble_samples = ensemble_model_means 162 | else: 163 | # ensemble_samples = ensemble_model_means + np.random.normal(size=ensemble_model_means.shape) * ensemble_model_stds 164 | ensemble_samples = ensemble_model_means + tf.random.normal(tf.shape(ensemble_model_means)) * ensemble_model_stds 165 | 166 | samples = ensemble_samples[0] 167 | 168 | rewards, next_obs = samples[:,:1], samples[:,1:] 169 | terminals = self.config.termination_ph_fn(obs_ph, act_ph, next_obs) 170 | info = {} 171 | 172 | return next_obs, rewards, terminals, info 173 | 174 | def close(self): 175 | pass 176 | 177 | 178 | 179 | --------------------------------------------------------------------------------
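The uncertainty penalty applied in FakeEnv.step above (the penalty_learned_var=False branch) can be reproduced in a few lines of NumPy. The following standalone sketch is not repository code: the shapes and coefficient values are illustrative assumptions, but the computation mirrors the source, where each imagined transition is penalized by the largest L2 distance between any ensemble member's predicted next observation and the ensemble mean, clipped at max_penalty and subtracted from the reward with weight penalty_coeff.

import numpy as np

# Illustrative shapes: 7 dynamics models, a rollout batch of 4, 11-dimensional observations.
num_models, batch_size, obs_dim = 7, 4, 11
ensemble_means_obs = np.random.randn(num_models, batch_size, obs_dim)  # per-model next-observation predictions
rewards = np.random.randn(batch_size, 1)                               # reward predictions for the chosen models
penalty_coeff, max_penalty = 0.25, 4.0                                 # values used by several configs above

mean_obs = ensemble_means_obs.mean(axis=0)             # ensemble-average prediction
diffs = ensemble_means_obs - mean_obs                  # deviation of each model from the average
dists = np.linalg.norm(diffs, axis=2)                  # [num_models, batch_size] distances in observation space
penalty = dists.max(axis=0)[:, None]                   # worst-case disagreement per transition
penalty = np.clip(penalty, a_min=None, a_max=max_penalty)
penalized_rewards = rewards - penalty_coeff * penalty  # shape (batch_size, 1), as returned by FakeEnv.step

When penalty_learned_var is True, the same clipping and subtraction are applied, but the disagreement term is replaced by the maximum norm of the predicted standard deviations across the ensemble.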