├── maple ├── __init__.py ├── dataset │ └── __init__.py ├── models │ ├── __init__.py │ ├── constructor.py │ └── utils.py ├── policy │ ├── __init__.py │ ├── static │ │ ├── pendulum.py │ │ ├── point2denv.py │ │ ├── point2dwallenv.py │ │ ├── halfcheetahveljump.py │ │ ├── humanoid.py │ │ ├── ant.py │ │ ├── halfcheetahvel.py │ │ ├── __init__.py │ │ ├── hopper.py │ │ ├── walker2d.py │ │ ├── halfcheetahjump.py │ │ ├── halfcheetah.py │ │ └── antangle.py │ └── fake_env.py ├── global_config.py ├── utils │ ├── __init__.py │ ├── filesystem.py │ ├── visualization.py │ └── logging.py └── env │ ├── __init__.py │ ├── halfcheetah_vel.py │ ├── halfcheetah_jump.py │ ├── humanoid.py │ ├── ant.py │ ├── ant_angle.py │ └── assert │ └── halfcheetah.xml ├── examples ├── __init__.py └── config │ ├── neorl │ ├── base.py │ ├── hopper_low.py │ ├── hopper_medium.py │ ├── walker2d_low.py │ ├── halfcheetah_low.py │ ├── walker2d_medium.py │ ├── halfcheetah_medium.py │ └── base_maple.py │ ├── d4rl │ ├── hopper_medium.py │ ├── hopper_random.py │ ├── walker2d_medium.py │ ├── walker2d_random.py │ ├── halfcheetah_medium.py │ ├── halfcheetah_random.py │ ├── hopper_mixed.py │ ├── hopper_medium_expert.py │ ├── walker2d_mixed.py │ ├── halfcheetah_mixed.py │ ├── walker2d_medium_expert.py │ ├── halfcheetah_medium_expert.py │ └── base_maple.py │ └── __init__.py ├── rla_scripts ├── __init__.py ├── config.py ├── view_expt.py ├── archive_expt.py ├── delete_expt.py └── start_pretty_plotter.py ├── run_scripts ├── __init__.py ├── utils.py ├── main.py └── base.py ├── softlearning ├── __init__.py ├── misc │ ├── __init__.py │ ├── plotter.py │ ├── kernel.py │ └── utils.py ├── models │ ├── __init__.py │ ├── utils.py │ └── feedforward.py ├── policies │ ├── __init__.py │ ├── utils.py │ ├── uniform_policy.py │ └── base_policy.py ├── scripts │ ├── __init__.py │ └── console_scripts.py ├── environments │ ├── __init__.py │ ├── gym │ │ ├── mujoco │ │ │ ├── __init__.py │ │ │ └── image_pusher_2d.py │ │ ├── robotics │ │ │ └── __init__.py │ │ ├── wrappers │ │ │ ├── __init__.py │ │ │ └── normalize_action.py │ │ ├── __init__.py │ │ └── multi_goal.py │ ├── adapters │ │ ├── __init__.py │ │ └── gym_adapter.py │ ├── dm_control │ │ └── __init__.py │ ├── helpers.py │ └── utils.py ├── preprocessors │ ├── __init__.py │ ├── utils.py │ └── convnet.py ├── value_functions │ ├── __init__.py │ ├── vanilla.py │ ├── utils.py │ └── value_function.py ├── algorithms │ ├── __init__.py │ └── utils.py ├── distributions │ ├── __init__.py │ └── squash_bijector.py ├── utils │ ├── numpy.py │ └── keras.py ├── replay_pools │ ├── __init__.py │ ├── extra_policy_info_replay_pool.py │ ├── replay_pool.py │ ├── utils.py │ ├── union_pool.py │ ├── trajectory_replay_pool.py │ └── flexible_replay_pool.py └── samplers │ ├── __init__.py │ ├── dummy_sampler.py │ ├── base_sampler.py │ ├── extra_policy_info_sampler.py │ ├── utils.py │ ├── explore_sampler.py │ ├── remote_sampler.py │ └── simple_sampler.py ├── resources ├── poster.png ├── plot_demo.png └── neorl-maple.png ├── log └── v2_examples.config.d4rl.walker2d_medium_expert │ └── 2022 │ └── 01 │ ├── 04 │ └── 09-30-38-583144 10.83.150.23 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=8&penalty_clip=20 │ │ ├── tb │ │ └── events │ │ │ └── events.out.tfevents.1641259841.ml-gpu-ser108.nmg01 │ │ └── warn.txt │ └── 01 │ ├── 15-38-56-149640 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=88&penalty_clip=20 │ ├── tb │ │ └── events │ │ │ └── 
events.out.tfevents.1641022739.ml-gpu-ser119.nmg01 │ └── warn.txt │ └── 15-42-56-985568 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=888&penalty_clip=20 │ ├── tb │ └── events │ │ └── events.out.tfevents.1641022980.ml-gpu-ser119.nmg01 │ └── warn.txt ├── rla_config_mopo.yaml ├── setup.py ├── LICENSE ├── .gitignore └── README.md /maple/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rla_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /run_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maple/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maple/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maple/policy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/config/neorl/base.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/misc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/policies/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/environments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/preprocessors/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /rla_scripts/config.py: -------------------------------------------------------------------------------- 1 | DATA_ROOT = '../' 2 | -------------------------------------------------------------------------------- 
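Note on rla_scripts/config.py above: the single DATA_ROOT constant is the only configuration the RLA maintenance scripts need; view_expt.py, archive_expt.py and delete_expt.py (shown later in this listing) all pass it as proj_root to the corresponding RLA log tool. A minimal usage sketch, assuming the RLA package is installed and the script is run from inside rla_scripts/; the task_table_name value is copied from the log directory in the tree above, and the regex value is a hypothetical placeholder:

    from RLA.easy_log.log_tools import ViewLogTool
    from config import DATA_ROOT  # rla_scripts/config.py, i.e. '../'

    # Browse the recorded logs of one task table; the regex narrows the runs to view.
    viewer = ViewLogTool(proj_root=DATA_ROOT,
                         task_table_name='v2_examples.config.d4rl.walker2d_medium_expert',
                         regex='2022/01/04/09-30*')  # hypothetical run pattern
    viewer.view_log()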
/softlearning/value_functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/environments/gym/mujoco/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /softlearning/environments/gym/robotics/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /maple/global_config.py: -------------------------------------------------------------------------------- 1 | STATE_CLIP_BOUND = 100 2 | MAX_PENALTY = 20 -------------------------------------------------------------------------------- /maple/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # from .filesystem import * 2 | # from .launcher import * -------------------------------------------------------------------------------- /softlearning/algorithms/__init__.py: -------------------------------------------------------------------------------- 1 | from .sql import SQL 2 | from .sac import SAC 3 | -------------------------------------------------------------------------------- /softlearning/distributions/__init__.py: -------------------------------------------------------------------------------- 1 | from .real_nvp_flow import ConditionalRealNVPFlow 2 | -------------------------------------------------------------------------------- /resources/poster.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xionghuichen/MAPLE/HEAD/resources/poster.png -------------------------------------------------------------------------------- /resources/plot_demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xionghuichen/MAPLE/HEAD/resources/plot_demo.png -------------------------------------------------------------------------------- /resources/neorl-maple.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xionghuichen/MAPLE/HEAD/resources/neorl-maple.png -------------------------------------------------------------------------------- /softlearning/environments/gym/wrappers/__init__.py: -------------------------------------------------------------------------------- 1 | from .normalize_action import NormalizeActionWrapper 2 | -------------------------------------------------------------------------------- /maple/utils/filesystem.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | def mkdir(path): 4 | if not os.path.exists(path): 5 | os.mkdir(path) -------------------------------------------------------------------------------- /softlearning/environments/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | """Module that provides adapters between SoftlearningEnv and other universes""" 2 | -------------------------------------------------------------------------------- /softlearning/utils/numpy.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def softmax(x): 5 | max_x = np.max(x) 6 | exp_x = np.exp(x - max_x) 7 | return exp_x / 
np.sum(exp_x) 8 | -------------------------------------------------------------------------------- /softlearning/replay_pools/__init__.py: -------------------------------------------------------------------------------- 1 | from .simple_replay_pool import SimpleReplayPool 2 | from .extra_policy_info_replay_pool import ExtraPolicyInfoReplayPool 3 | from .union_pool import UnionPool 4 | from .trajectory_replay_pool import TrajectoryReplayPool 5 | -------------------------------------------------------------------------------- /softlearning/environments/dm_control/__init__.py: -------------------------------------------------------------------------------- 1 | """Custom DeepMind Control Suite environments. 2 | 3 | Every class inside this module should extend a dm_control.suite.Task class. The 4 | # file structure should be similar to dm_control's file structure. 5 | """ 6 | -------------------------------------------------------------------------------- /maple/policy/static/pendulum.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | done = np.zeros((len(obs), 1)) 10 | return done 11 | -------------------------------------------------------------------------------- /softlearning/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_sampler import BaseSampler 2 | from .dummy_sampler import DummySampler 3 | from .simple_sampler import SimpleSampler 4 | # from .remote_sampler import RemoteSampler 5 | from .extra_policy_info_sampler import ExtraPolicyInfoSampler 6 | from .utils import rollout, rollouts 7 | -------------------------------------------------------------------------------- /maple/policy/static/point2denv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | done = np.array([False]).repeat(len(obs)) 10 | done = done[:,None] 11 | return done 12 | -------------------------------------------------------------------------------- /maple/policy/static/point2dwallenv.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | done = np.array([False]).repeat(len(obs)) 10 | done = done[:,None] 11 | return done 12 | -------------------------------------------------------------------------------- /maple/policy/static/halfcheetahveljump.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | done = np.array([False]).repeat(len(obs)) 10 | done = done[:,None] 11 | return done 12 | -------------------------------------------------------------------------------- /maple/policy/static/humanoid.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | 4 | class StaticFns: 5 | 6 | @staticmethod 7 
| def termination_fn(obs, act, next_obs): 8 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 9 | 10 | z = next_obs[:,0] 11 | done = (z < 1.0) + (z > 2.0) 12 | 13 | done = done[:,None] 14 | return done -------------------------------------------------------------------------------- /softlearning/environments/helpers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def random_point_in_circle(angle_range=(0, 2*np.pi), radius=(0, 25)): 5 | angle = np.random.uniform(*angle_range) 6 | radius = radius if np.isscalar(radius) else np.random.uniform(*radius) 7 | x, y = np.cos(angle) * radius, np.sin(angle) * radius 8 | point = np.array([x, y]) 9 | return point 10 | -------------------------------------------------------------------------------- /softlearning/samplers/dummy_sampler.py: -------------------------------------------------------------------------------- 1 | from .base_sampler import BaseSampler 2 | 3 | 4 | class DummySampler(BaseSampler): 5 | def __init__(self, batch_size, max_path_length): 6 | super(DummySampler, self).__init__( 7 | max_path_length=max_path_length, 8 | min_pool_size=0, 9 | batch_size=batch_size) 10 | 11 | def sample(self): 12 | pass 13 | -------------------------------------------------------------------------------- /examples/config/d4rl/hopper_medium.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'hopper', 6 | 'task': 'medium-v0', 7 | 'exp_name': 'hopper_medium' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/hopper-medium-v0', 11 | 'pool_load_max_size': 10**6, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 5.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/hopper_random.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'hopper', 6 | 'task': 'random-v0', 7 | 'exp_name': 'hopper_random' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/hopper-random-v0', 11 | 'pool_load_max_size': 10**6, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 1.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/walker2d_medium.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'walker2d', 6 | 'task': 'medium-v0', 7 | 'exp_name': 'walker2d_medium' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/walker2d-medium-v0', 11 | 'pool_load_max_size': 10**6, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 5.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/walker2d_random.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'walker2d', 6 | 'task': 'random-v0', 7 | 'exp_name': 'walker2d_random' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/walker2d-random-v0', 11 | 'pool_load_max_size': 10**6, 12 | 'rollout_length': 1, 13 | 
'penalty_coeff': 1.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/halfcheetah_medium.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'halfcheetah', 6 | 'task': 'medium-v0', 7 | 'exp_name': 'halfcheetah_medium' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/halfcheetah-medium-v0', 11 | 'pool_load_max_size': 10**6, 12 | 'rollout_length': 1, 13 | 'penalty_coeff': 1.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/halfcheetah_random.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'halfcheetah', 6 | 'task': 'random-v0', 7 | 'exp_name': 'halfcheetah_random' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/halfcheetah-random-v0', 11 | 'pool_load_max_size': 10**6, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 0.5 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/hopper_mixed.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'hopper', 6 | 'task': 'medium-replay-v0', 7 | 'exp_name': 'hopper_medium_replay' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/hopper-medium-replay-v0', 11 | 'pool_load_max_size': 200920, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 1.0, 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/hopper_medium_expert.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'hopper', 6 | 'task': 'medium-expert-v0', 7 | 'exp_name': 'hopper_medium_expert' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/hopper-medium-expert-v0', 11 | 'pool_load_max_size': 2 * 10**6, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 1.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/walker2d_mixed.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'walker2d', 6 | 'task': 'medium-replay-v0', 7 | 'exp_name': 'walker2d_medium_replay' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/walker2d-medium-replay-v0', 11 | 'pool_load_max_size': 100930, 12 | 'rollout_length': 1, 13 | 'penalty_coeff': 1.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/halfcheetah_mixed.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'halfcheetah', 6 | 'task': 'medium-replay-v0', 7 | 'exp_name': 'halfcheetah_medium_replay' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/halfcheetah-medium-replay-v0', 11 | 
'pool_load_max_size': 101000, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 1.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/walker2d_medium_expert.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'walker2d', 6 | 'task': 'medium-expert-v0', 7 | 'exp_name': 'walker2d_medium_expert' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/walker2d-medium-expert-v0', 11 | 'pool_load_max_size': 2 * 10**6, 12 | 'rollout_length': 1, 13 | 'penalty_coeff': 2.0 14 | }) -------------------------------------------------------------------------------- /examples/config/d4rl/halfcheetah_medium_expert.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'halfcheetah', 6 | 'task': 'medium-expert-v0', 7 | 'exp_name': 'halfcheetah_medium_expert' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'd4rl/halfcheetah-medium-expert-v0', 11 | 'pool_load_max_size': 2 * 10**6, 12 | 'rollout_length': 5, 13 | 'penalty_coeff': 5.0 14 | }) -------------------------------------------------------------------------------- /maple/policy/static/ant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | x = next_obs[:, 0] 10 | not_done = np.isfinite(next_obs).all(axis=-1) \ 11 | * (x >= 0.2) \ 12 | * (x <= 1.0) 13 | 14 | done = ~not_done 15 | done = done[:,None] 16 | return done 17 | -------------------------------------------------------------------------------- /examples/config/neorl/hopper_low.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'Hopper', 6 | 'task': 'v3', 7 | 'exp_name': 'hopper_neo_low' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'neorl/neorl_data/Hopper-v3-low-1000-train-noise.npz', 11 | 'pool_load_max_size': 101000, 12 | 'rollout_length': 10, 13 | 'penalty_coeff': 0.25, 14 | 'use_neorl': True 15 | }) -------------------------------------------------------------------------------- /examples/config/neorl/hopper_medium.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'Hopper', 6 | 'task': 'v3', 7 | 'exp_name': 'hopper_neo_medium' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'neorl/neorl_data/Hopper-v3-medium-1000-train-noise.npz', 11 | 'pool_load_max_size': 101000, 12 | 'rollout_length': 10, 13 | 'penalty_coeff': 0.25, 14 | 'use_neorl': True 15 | }) -------------------------------------------------------------------------------- /examples/config/neorl/walker2d_low.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'Walker2d', 6 | 'task': 'v3', 7 | 'exp_name': 'walker2d_neo_low' 8 | }) 9 | 
params['kwargs'].update({ 10 | 'pool_load_path': 'neorl/neorl_data/Walker2d-v3-low-1000-train-noise.npz', 11 | 'pool_load_max_size': 101000, 12 | 'rollout_length': 10, 13 | 'penalty_coeff': 0.25, 14 | 'use_neorl': True 15 | }) -------------------------------------------------------------------------------- /examples/config/neorl/halfcheetah_low.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'HalfCheetah', 6 | 'task': 'v3', 7 | 'exp_name': 'halfcheetah_low' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'neorl/neorl_data/HalfCheetah-v3-low-1000-train-noise.npz', 11 | 'pool_load_max_size': 101000, 12 | 'rollout_length': 10, 13 | 'penalty_coeff': 0.25, 14 | 'use_neorl': True 15 | }) -------------------------------------------------------------------------------- /examples/config/neorl/walker2d_medium.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'Walker2d', 6 | 'task': 'v3', 7 | 'exp_name': 'walker2d_neo_medium' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'neorl/neorl_data/Walker2d-v3-medium-1000-train-noise.npz', 11 | 'pool_load_max_size': 101000, 12 | 'rollout_length': 10, 13 | 'penalty_coeff': 0.25, 14 | 'use_neorl': True 15 | }) -------------------------------------------------------------------------------- /examples/config/neorl/halfcheetah_medium.py: -------------------------------------------------------------------------------- 1 | from .base_maple import maple_params, deepcopy 2 | 3 | params = deepcopy(maple_params) 4 | params.update({ 5 | 'domain': 'HalfCheetah', 6 | 'task': 'v3', 7 | 'exp_name': 'halfcheetah_neo_medium' 8 | }) 9 | params['kwargs'].update({ 10 | 'pool_load_path': 'neorl/neorl_data/HalfCheetah-v3-medium-1000-train-noise.npz', 11 | 'pool_load_max_size': 101000, 12 | 'rollout_length': 10, 13 | 'penalty_coeff': 0.25, 14 | 'use_neorl': True 15 | }) -------------------------------------------------------------------------------- /log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/04/09-30-38-583144 10.83.150.23 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=8&penalty_clip=20/tb/events/events.out.tfevents.1641259841.ml-gpu-ser108.nmg01: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xionghuichen/MAPLE/HEAD/log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/04/09-30-38-583144 10.83.150.23 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=8&penalty_clip=20/tb/events/events.out.tfevents.1641259841.ml-gpu-ser108.nmg01 -------------------------------------------------------------------------------- /log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/01/15-38-56-149640 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=88&penalty_clip=20/tb/events/events.out.tfevents.1641022739.ml-gpu-ser119.nmg01: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xionghuichen/MAPLE/HEAD/log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/01/15-38-56-149640 10.83.150.44 
&info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=88&penalty_clip=20/tb/events/events.out.tfevents.1641022739.ml-gpu-ser119.nmg01 -------------------------------------------------------------------------------- /log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/01/15-42-56-985568 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=888&penalty_clip=20/tb/events/events.out.tfevents.1641022980.ml-gpu-ser119.nmg01: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xionghuichen/MAPLE/HEAD/log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/01/15-42-56-985568 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=888&penalty_clip=20/tb/events/events.out.tfevents.1641022980.ml-gpu-ser119.nmg01 -------------------------------------------------------------------------------- /maple/env/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | # 3 | from .ant import AntEnv 4 | from .humanoid import HumanoidEnv 5 | from .halfcheetah_jump import HalfCheetahEnv as HalfCheetahJumpEnv 6 | from .halfcheetah_vel import HalfCheetahEnv as HalfCheetahVelEnv 7 | from .ant_angle import AntEnv as AngAngleEnv 8 | # import halfcheetah_vel 9 | # 10 | env_overwrite = {'Ant': AntEnv,'AntAngle': AngAngleEnv, 11 | 'Humanoid': HumanoidEnv, 'HalfCheetahVel':HalfCheetahVelEnv, 12 | 'HalfCheetahJump': HalfCheetahJumpEnv} 13 | # 14 | # sys.modules[__name__] = env_overwrite -------------------------------------------------------------------------------- /rla_scripts/view_expt.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to view data of experiments. 
3 | """ 4 | 5 | from RLA.easy_log.log_tools import ViewLogTool 6 | import argparse 7 | from config import * 8 | 9 | def argsparser(): 10 | parser = argparse.ArgumentParser("View Log") 11 | parser.add_argument('--task_table_name', type=str) 12 | parser.add_argument('--regex', type=str) 13 | args = parser.parse_args() 14 | return args 15 | 16 | if __name__=='__main__': 17 | args = argsparser() 18 | dlt = ViewLogTool(proj_root=DATA_ROOT, task_table_name=args.task_table_name, regex=args.regex) 19 | dlt.view_log() -------------------------------------------------------------------------------- /softlearning/models/utils.py: -------------------------------------------------------------------------------- 1 | def build_metric_learner_from_variant(variant, env, evaluation_data): 2 | sampler_params = variant['sampler_params'] 3 | metric_learner_params = variant['metric_learner_params'] 4 | metric_learner_params.update({ 5 | 'observation_shape': env.observation_space.shape, 6 | 'max_distance': sampler_params['kwargs']['max_path_length'], 7 | 'evaluation_data': evaluation_data 8 | }) 9 | 10 | metric_learner = MetricLearner(**metric_learner_params) 11 | return metric_learner 12 | 13 | 14 | def get_model_from_variant(variant, env, *args, **kwargs): 15 | pass 16 | -------------------------------------------------------------------------------- /maple/policy/static/halfcheetahvel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | done = np.array([False]).repeat(len(obs)) 10 | done = done[:,None] 11 | return done 12 | 13 | @staticmethod 14 | def recompute_reward_fn(obs, act, next_obs, rew): 15 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 16 | new_rew = -(rew + 0.1 * np.sum(np.square(act))) - 0.1 * np.sum(np.square(act)) 17 | return new_rew 18 | -------------------------------------------------------------------------------- /softlearning/replay_pools/extra_policy_info_replay_pool.py: -------------------------------------------------------------------------------- 1 | from .simple_replay_pool import SimpleReplayPool 2 | 3 | 4 | class ExtraPolicyInfoReplayPool(SimpleReplayPool): 5 | def __init__(self, *args, **kwargs): 6 | super(ExtraPolicyInfoReplayPool, self).__init__(*args, **kwargs) 7 | 8 | fields = { 9 | 'raw_actions': { 10 | 'shape': self._action_space.shape, 11 | 'dtype': 'float32' 12 | }, 13 | 'log_pis': { 14 | 'shape': (1, ), 15 | 'dtype': 'float32' 16 | } 17 | } 18 | 19 | self.add_fields(fields) 20 | -------------------------------------------------------------------------------- /softlearning/distributions/squash_bijector.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | import tensorflow_probability as tfp 4 | 5 | 6 | class SquashBijector(tfp.bijectors.Bijector): 7 | def __init__(self, validate_args=False, name="tanh"): 8 | super(SquashBijector, self).__init__( 9 | forward_min_event_ndims=0, 10 | validate_args=validate_args, 11 | name=name) 12 | 13 | def _forward(self, x): 14 | return tf.nn.tanh(x) 15 | 16 | def _inverse(self, y): 17 | return tf.atanh(y) 18 | 19 | def _forward_log_det_jacobian(self, x): 20 | return 2. * (np.log(2.) - x - tf.nn.softplus(-2. 
* x)) 21 | -------------------------------------------------------------------------------- /rla_scripts/archive_expt.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to archive benchmarking experiments. 3 | 4 | It is convenient to merge the archived experiments and the current task into tensorboard by: 5 | 6 | tensorboard --logdir ./log/your_task/,./log/archived/ 7 | 8 | """ 9 | 10 | from RLA.easy_log.log_tools import ArchiveLogTool 11 | import argparse 12 | from config import * 13 | 14 | def argsparser(): 15 | parser = argparse.ArgumentParser("Archive Log") 16 | # reduce setting 17 | parser.add_argument('--task_table_name', type=str) 18 | parser.add_argument('--regex', type=str) 19 | 20 | args = parser.parse_args() 21 | return args 22 | 23 | if __name__=='__main__': 24 | args = argsparser() 25 | dlt = ArchiveLogTool(proj_root=DATA_ROOT, task_table_name=args.task_table_name, regex=args.regex) 26 | dlt.archive_log() -------------------------------------------------------------------------------- /softlearning/environments/gym/wrappers/normalize_action.py: -------------------------------------------------------------------------------- 1 | import gym 2 | from gym import spaces 3 | import numpy as np 4 | 5 | 6 | __all__ = ['NormalizeActionWrapper'] 7 | 8 | 9 | class NormalizeActionWrapper(gym.ActionWrapper): 10 | """Rescale the action space of the environment.""" 11 | 12 | def action(self, action): 13 | if not isinstance(self.env.action_space, spaces.Box): 14 | return action 15 | 16 | # rescale the action 17 | low, high = self.env.action_space.low, self.env.action_space.high 18 | scaled_action = low + (action + 1.0) * (high - low) / 2.0 19 | scaled_action = np.clip(scaled_action, low, high) 20 | 21 | return scaled_action 22 | 23 | def reverse_action(self, action): 24 | raise NotImplementedError 25 | 26 | 27 | normalize = NormalizeActionWrapper 28 | -------------------------------------------------------------------------------- /maple/policy/static/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import importlib 4 | import pdb 5 | 6 | 7 | def import_fns(path, file, fns_name='StaticFns'): 8 | full_path = os.path.join("maple/policy/static", file) 9 | import_path = full_path.replace('/', '.') 10 | module = importlib.import_module(import_path) 11 | fns = getattr(module, fns_name) 12 | return fns 13 | def get_base_path(): 14 | return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 15 | cwd = os.path.join(get_base_path(), 'policy/static') 16 | files = os.listdir(cwd) 17 | ## remove __init__.py 18 | files = filter(lambda x: '__' not in x and x[0] != '.', files) 19 | ## env.py --> env 20 | files = map(lambda x: x.replace('.py', ''), files) 21 | 22 | ## {env: StaticFns, ... } 23 | static_fns = {file: import_fns(cwd, file) for file in files} 24 | 25 | sys.modules[__name__] = static_fns 26 | 27 | -------------------------------------------------------------------------------- /maple/policy/static/hopper.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from maple.global_config import * 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | if obs.shape[-1] == 12: # In neorl, another dim is inserted to observation space. 
9 | next_obs = next_obs[:, 1:] 10 | height = next_obs[:, 0] 11 | angle = next_obs[:, 1] 12 | # not_done = np.logical_and(np.all(next_obs > -100, axis=-1), 13 | # np.all(next_obs < 100, axis=-1)) * \ 14 | not_done = np.isfinite(next_obs).all(axis=-1) \ 15 | * (np.abs(next_obs) < STATE_CLIP_BOUND).all(axis=-1) \ 16 | * (height > .7) \ 17 | * (np.abs(angle) < .2) 18 | 19 | done = ~not_done 20 | done = done[:,None] 21 | return done 22 | -------------------------------------------------------------------------------- /maple/policy/static/walker2d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from maple.global_config import * 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | if obs.shape[-1] == 18: # In neorl, an extra dim is inserted into the observation space. 9 | # not_done = np.array([True]).repeat(len(obs)) 10 | next_obs = next_obs[:, 1:] 11 | height = next_obs[:, 0] 12 | angle = next_obs[:, 1] 13 | not_done = np.logical_and(np.all(next_obs > -1 * STATE_CLIP_BOUND, axis=-1), np.all(next_obs < STATE_CLIP_BOUND, axis=-1)) \ 14 | * (height > 0.8) \ 15 | * (height < 2.0) \ 16 | * (angle > -1.0) \ 17 | * (angle < 1.0) 18 | done = ~not_done 19 | done = done[:,None] 20 | return done 21 | -------------------------------------------------------------------------------- /rla_config_mopo.yaml: -------------------------------------------------------------------------------- 1 | PROJECT_TYPE: 2 | # lib: back up the project in YOUR_PROJECT_ROOT/build/lib. 3 | # It suits the situation where you run the code by building a package (e.g., "python setup.py install"). 4 | # source: back up the project in YOUR_PROJECT_ROOT/{backup_code_dir}. 5 | # It suits the situation where you run your code directly. 6 | # The backup ignores files that satisfy the rules in IGNORE_RULE (the default value is: YOUR_PROJECT_ROOT/.gitignore) 7 | # and all log files in easy_log. 
8 | backup_code_by: 'source' 9 | 10 | 11 | BACKUP_CONFIG: 12 | lib_dir: './build/lib/' 13 | backup_code_dir: 14 | - './maple' 15 | - './run_scripts' 16 | 17 | LOG_USED: 18 | - 'stdout' 19 | - 'log' 20 | - 'csv' 21 | - 'tensorboard' 22 | 23 | DL_FRAMEWORK: 'tensorflow' 24 | SEND_LOG_FILE: False 25 | 26 | REMOTE_SETTING: 27 | ftp_server: '' 28 | username: '' 29 | password: '' 30 | remote_log_root: '' 31 | -------------------------------------------------------------------------------- /examples/config/d4rl/base_maple.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | base_params = { 3 | 'type': 'MAPLE', 4 | 'universe': 'gym', 5 | 'kwargs': { 6 | 'epoch_length': 1000, 7 | 'train_every_n_steps': 1, 8 | 'n_train_repeat': 1, 9 | 'eval_render_mode': None, 10 | 'eval_n_episodes': 10, 11 | 'eval_deterministic': True, 12 | 13 | 'discount': 0.99, 14 | 'tau': 5e-3, 15 | 'reward_scale': 1.0, 16 | 17 | 'model_train_freq': 1000, 18 | 'model_retain_epochs': 5, 19 | 'rollout_batch_size': 50e3, 20 | 'deterministic': False, 21 | 'num_networks': 7, 22 | 'num_elites': 5, 23 | 'real_ratio': 0.05, 24 | 'target_entropy': -3, 25 | 'max_model_t': None 26 | } 27 | } 28 | 29 | maple_params = deepcopy(base_params) 30 | maple_params['kwargs'].update({ 31 | 'separate_mean_var': True, 32 | 'penalty_learned_var': True, 33 | }) -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | from setuptools import find_packages 3 | 4 | setup( 5 | name='MAPLE', 6 | packages=find_packages(), 7 | version='0.0.1', 8 | description='Offline Model-based Adaptable Policy Learning', 9 | long_description=open('./README.md').read(), 10 | author='Xiong-Hui Chen, Fan-Ming Luo', 11 | author_email='chenxh@lamda.nju.edu.cn, luofm@lamda.nju.edu.cn', 12 | entry_points={ 13 | 'console_scripts': ( 14 | 'mopo=softlearning.scripts.console_scripts:main', 15 | 'viskit=mopo.scripts.console_scripts:main' 16 | ) 17 | }, 18 | install_requires=[ 19 | "RLA @ git+https://github.com/polixir/RLAssistant.git@main#egg=RLA", 20 | "serializable @ git+https://github.com/hartikainen/serializable.git@76516385a3a716ed4a2a9ad877e2d5cbcf18d4e6#egg=serializable", 21 | 'gtimer', 22 | 'dotmap', 23 | ], 24 | zip_safe=True, 25 | license='MIT' 26 | ) 27 | -------------------------------------------------------------------------------- /examples/config/neorl/base_maple.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | base_params = { 3 | 'type': 'MAPLE', 4 | 'universe': 'gym', 5 | 6 | 'log_dir': './ray_mopo/', # Specify where to write log files here 7 | 8 | 'kwargs': { 9 | 'epoch_length': 1000, 10 | 'train_every_n_steps': 1, 11 | 'n_train_repeat': 1, 12 | 'eval_render_mode': None, 13 | 'eval_n_episodes': 10, 14 | 'eval_deterministic': True, 15 | 16 | 'discount': 0.99, 17 | 'tau': 5e-3, 18 | 'reward_scale': 1.0, 19 | 20 | 'model_train_freq': 1000, 21 | 'model_retain_epochs': 5, 22 | 'rollout_batch_size': 50e3, 23 | 'deterministic': False, 24 | 'num_networks': 7, 25 | 'num_elites': 5, 26 | 'real_ratio': 0.05, 27 | 'target_entropy': -3, 28 | 'max_model_t': None 29 | } 30 | } 31 | 32 | maple_params = deepcopy(base_params) 33 | maple_params['kwargs'].update({ 34 | 'separate_mean_var': True, 35 | 'penalty_learned_var': True, 36 | }) 
-------------------------------------------------------------------------------- /maple/policy/static/halfcheetahjump.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | # done = np.array([False]).repeat(len(obs)) 10 | not_done = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) 11 | done = ~not_done 12 | done = done[:,None] 13 | # not_done = np.isfinite(next_obs).all(axis=-1) 14 | # done = done[:,None] 15 | return done 16 | 17 | @staticmethod 18 | def recompute_reward_fn(obs, act, next_obs, rew): 19 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 20 | 21 | # new_rew = -(rew + 0.1 * np.sum(np.square(act))) - 0.1 * np.sum(np.square(act)) 22 | new_rew = np.clip(rew + 0.1 * np.sum(np.square(act), axis=-1), None, 3) \ 23 | - 0.1 * np.sum(np.square(act), axis=-1) + 15 * next_obs[..., 0] 24 | return new_rew 25 | -------------------------------------------------------------------------------- /softlearning/environments/utils.py: -------------------------------------------------------------------------------- 1 | from .adapters.gym_adapter import ( 2 | GYM_ENVIRONMENTS, 3 | GymAdapter, 4 | ) 5 | 6 | from maple.env import env_overwrite 7 | 8 | ENVIRONMENTS = { 9 | 'gym': GYM_ENVIRONMENTS, 10 | } 11 | 12 | ADAPTERS = { 13 | 'gym': GymAdapter, 14 | } 15 | 16 | 17 | def get_environment(universe, domain, task, environment_params): 18 | if domain in env_overwrite: 19 | print('[ environments/utils ] WARNING: Using overwritten {} environment'.format(domain)) 20 | env = env_overwrite[domain]() 21 | env = ADAPTERS[universe](None, None, env=env) 22 | else: 23 | env = ADAPTERS[universe](domain, task, **environment_params) 24 | return env 25 | 26 | 27 | def get_environment_from_params(environment_params): 28 | universe = environment_params['universe'] 29 | task = environment_params['task'] 30 | domain = environment_params['domain'] 31 | environment_kwargs = environment_params.get('kwargs', {}).copy() 32 | 33 | return get_environment(universe, domain, task, environment_kwargs) 34 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Xiong-Hui Chen 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /maple/policy/static/halfcheetah.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from maple.global_config import * 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | # done = np.array([False]).repeat(len(obs)) 10 | if obs.shape[-1] == 18: # neorl 11 | # not_done = np.array([True]).repeat(len(obs)) 12 | next_obs = next_obs[:, 1:] 13 | not_done = np.logical_and(np.all(next_obs >= -1 * STATE_CLIP_BOUND, axis=-1), np.all(next_obs <= STATE_CLIP_BOUND, axis=-1)) 14 | else: 15 | not_done = np.array([True]).repeat(len(obs)) 16 | not_done = np.logical_and(np.all(next_obs > -1 * STATE_CLIP_BOUND, axis=-1), np.all(next_obs < STATE_CLIP_BOUND, axis=-1)) 17 | done = ~not_done 18 | done = done[:,None] 19 | return done 20 | 21 | @staticmethod 22 | def recompute_reward_fn(obs, act, next_obs, rew): 23 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 24 | new_rew = -(rew + 0.1 * np.sum(np.square(act))) - 0.1 * np.sum(np.square(act)) 25 | return new_rew 26 | -------------------------------------------------------------------------------- /softlearning/utils/keras.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | 3 | import tensorflow as tf 4 | 5 | 6 | class PicklableKerasModel(tf.keras.Model): 7 | def __getstate__(self): 8 | with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd: 9 | tf.keras.models.save_model(self, fd.name, overwrite=True) 10 | model_str = fd.read() 11 | d = {'model_str': model_str} 12 | 13 | return d 14 | 15 | def __setstate__(self, state): 16 | with tempfile.NamedTemporaryFile(suffix='.hdf5', delete=True) as fd: 17 | fd.write(state['model_str']) 18 | fd.flush() 19 | 20 | loaded_model = tf.keras.models.load_model( 21 | fd.name, custom_objects={ 22 | self.__class__.__name__: self.__class__}) 23 | 24 | self.__dict__.update(loaded_model.__dict__.copy()) 25 | 26 | @classmethod 27 | def from_config(cls, *args, custom_objects=None, **kwargs): 28 | custom_objects = custom_objects or {} 29 | custom_objects[cls.__name__] = cls 30 | custom_objects['tf'] = tf 31 | return super(PicklableKerasModel, cls).from_config( 32 | *args, custom_objects=custom_objects, **kwargs) 33 | -------------------------------------------------------------------------------- /softlearning/replay_pools/replay_pool.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | 4 | class ReplayPool(object): 5 | """A class used to save and replay data.""" 6 | 7 | @abc.abstractmethod 8 | def add_sample(self, sample): 9 | """Add a transition tuple.""" 10 | pass 11 | 12 | @abc.abstractmethod 13 | def terminate_episode(self): 14 | """Clean up pool after episode termination.""" 15 | pass 16 | 17 | @property 18 | @abc.abstractmethod 19 | def size(self, **kwargs): 20 | pass 21 | 22 | def add_path(self, path): 23 | """Add a rollout to the replay pool. 24 | 25 | This default implementation naively goes through every step, but you 26 | may want to optimize this. 
27 | 28 | NOTE: You should NOT call "terminate_episode" after calling add_path. 29 | It's assumed that this function handles the episode termination. 30 | 31 | :param path: Dict like one outputted by railrl.samplers.util.rollout 32 | """ 33 | self.add_samples(path) 34 | self.terminate_episode() 35 | 36 | @abc.abstractmethod 37 | def random_batch(self, batch_size): 38 | """Return a random batch of size `batch_size`.""" 39 | pass 40 | -------------------------------------------------------------------------------- /examples/config/__init__.py: -------------------------------------------------------------------------------- 1 | params = { 2 | 'type': 'MAPLE', 3 | 'universe': 'gym', 4 | 'domain': 'Hopper', 5 | 'task': 'v2', 6 | 7 | 'log_dir': '~/ray_mopo/', 8 | 'exp_name': 'defaults', 9 | 10 | 'kwargs': { 11 | 'epoch_length': 1000, 12 | 'train_every_n_steps': 1, 13 | 'n_train_repeat': 2, #20, 14 | 'eval_render_mode': None, 15 | 'eval_n_episodes': 10, 16 | 'eval_deterministic': True, 17 | 18 | 'discount': 0.99, 19 | 'tau': 5e-3, 20 | 'reward_scale': 1.0, 21 | #### 22 | 'model_reset_freq': 1000, 23 | 'model_train_freq': 250, # 250 24 | # 'retain_model_epochs': 2, 25 | 'model_pool_size': 2e6, 26 | 'rollout_batch': 100e3, # 40e3 27 | 'rollout_length': 1, 28 | 'deterministic': False, 29 | 'num_networks': 7, 30 | 'num_elites': 5, 31 | 'real_ratio': 0.05, 32 | 'entropy_mult': 0.5, 33 | # 'target_entropy': -1.5, 34 | 'max_model_t': 1e10, 35 | # 'max_dev': 0.25, 36 | # 'marker': 'early-stop_10rep_stochastic', 37 | 'rollout_length_params': [20, 150, 1, 1], ## epoch, loss, length 38 | # 'marker': 'dump', 39 | } 40 | } -------------------------------------------------------------------------------- /maple/policy/static/antangle.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class StaticFns: 4 | 5 | @staticmethod 6 | def termination_fn(obs, act, next_obs): 7 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 8 | 9 | x = next_obs[:, 0] 10 | not_done = np.isfinite(next_obs).all(axis=-1) \ 11 | * (x >= 0.2) \ 12 | * (x <= 1.0) 13 | not_done2 = np.logical_and(np.all(next_obs > -100, axis=-1), np.all(next_obs < 100, axis=-1)) 14 | not_done = np.logical_and(not_done2, not_done) 15 | done = ~not_done 16 | done = done[:,None] 17 | return done 18 | 19 | @staticmethod 20 | def recompute_reward_fn(obs, act, next_obs, rew): 21 | survive_reward = 1 22 | ctrl_cost = .5 * np.square(act).sum(axis=-1) 23 | xy_velocity = next_obs[..., 111:] 24 | contact_cost = 0.5 * 1e-3 * np.sum((np.square(next_obs[..., 27:111])), axis=-1) 25 | assert len(obs.shape) == len(next_obs.shape) == len(act.shape) == 2 26 | new_rew = xy_velocity[..., 0] * np.cos(np.pi/6) + xy_velocity[..., 1] * np.sin(np.pi/6) - ctrl_cost - contact_cost + survive_reward 27 | # new_rew = -(rew + 0.1 * np.sum(np.square(act))) - 0.1 * np.sum(np.square(act)) 28 | return new_rew 29 | 30 | -------------------------------------------------------------------------------- /softlearning/value_functions/vanilla.py: -------------------------------------------------------------------------------- 1 | from softlearning.models.feedforward import feedforward_model 2 | 3 | 4 | def create_feedforward_Q_function(observation_shape, 5 | action_shape, 6 | *args, 7 | observation_preprocessor=None, 8 | name='feedforward_Q', 9 | **kwargs): 10 | input_shapes = (observation_shape, action_shape) 11 | preprocessors = (observation_preprocessor, None) 12 | return feedforward_model( 13 | input_shapes, 14 | 
*args, 15 | output_size=1, 16 | preprocessors=preprocessors, 17 | name=name, 18 | **kwargs) 19 | 20 | 21 | def create_feedforward_V_function(observation_shape, 22 | *args, 23 | observation_preprocessor=None, 24 | name='feedforward_V', 25 | **kwargs): 26 | input_shapes = (observation_shape, ) 27 | preprocessors = (observation_preprocessor, None) 28 | return feedforward_model( 29 | input_shapes, 30 | *args, 31 | output_size=1, 32 | preprocessors=preprocessors, 33 | **kwargs) 34 | -------------------------------------------------------------------------------- /rla_scripts/delete_expt.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to delete useless experiment logs matched by a regex string. 3 | 4 | """ 5 | from RLA.easy_log.log_tools import DeleteLogTool, Filter 6 | import argparse 7 | from config import * 8 | 9 | def argsparser(): 10 | parser = argparse.ArgumentParser("Delete Log") 11 | # reduce setting 12 | parser.add_argument('--task_table_name', type=str, default="") 13 | parser.add_argument('--regex', type=str) 14 | parser.add_argument('--timestep_bound', type=int, default=100) 15 | # Filter.ALL: delete all experiments that satisfy the regex 16 | # Filter.SMALL_TIMESTEP: delete all experiments whose names satisfy the regex 17 | # and whose recorded timesteps are less than args.timestep_bound. 18 | parser.add_argument('--delete_type', type=str, default=Filter.ALL) 19 | 20 | args = parser.parse_args() 21 | return args 22 | 23 | if __name__=='__main__': 24 | args = argsparser() 25 | filter = Filter() 26 | filter.config(type=args.delete_type, timstep_bound=args.timestep_bound) 27 | dlt = DeleteLogTool(proj_root=DATA_ROOT, task_table_name=args.task_table_name, regex=args.regex, 28 | filter=filter) 29 | if args.delete_type == Filter.ALL: 30 | dlt.delete_related_log() 31 | elif args.delete_type == Filter.SMALL_TIMESTEP: 32 | dlt.delete_small_timestep_log() 33 | else: 34 | raise NotImplementedError -------------------------------------------------------------------------------- /softlearning/replay_pools/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from . 
import ( 4 | simple_replay_pool, 5 | extra_policy_info_replay_pool, 6 | union_pool, 7 | trajectory_replay_pool) 8 | 9 | 10 | POOL_CLASSES = { 11 | 'SimpleReplayPool': simple_replay_pool.SimpleReplayPool, 12 | 'TrajectoryReplayPool': trajectory_replay_pool.TrajectoryReplayPool, 13 | 'ExtraPolicyInfoReplayPool': ( 14 | extra_policy_info_replay_pool.ExtraPolicyInfoReplayPool), 15 | 'UnionPool': union_pool.UnionPool, 16 | } 17 | 18 | DEFAULT_REPLAY_POOL = 'SimpleReplayPool' 19 | 20 | 21 | def get_replay_pool_from_variant(variant, env, *args, **kwargs): 22 | replay_pool_params = variant['replay_pool_params'] 23 | if isinstance(replay_pool_params["kwargs"]["max_size"], int): 24 | replay_pool_params["kwargs"]["max_size"] = replay_pool_params["kwargs"]["max_size"] 25 | else: 26 | replay_pool_params["kwargs"]["max_size"] = replay_pool_params["kwargs"]["max_size"](variant) 27 | replay_pool_type = replay_pool_params['type'] 28 | replay_pool_kwargs = deepcopy(replay_pool_params['kwargs']) 29 | print('[ DEBUG ]: replay pool config: ', replay_pool_kwargs) 30 | replay_pool = POOL_CLASSES[replay_pool_type]( 31 | *args, 32 | observation_space=env.observation_space, 33 | action_space=env.action_space, 34 | **replay_pool_kwargs, 35 | **kwargs) 36 | 37 | return replay_pool 38 | -------------------------------------------------------------------------------- /maple/env/halfcheetah_vel.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, "half_cheetah.xml", 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, action): 12 | xposbefore = self.sim.data.qpos[0] 13 | self.do_simulation(action, self.frame_skip) 14 | xposafter = self.sim.data.qpos[0] 15 | ob = self._get_obs() 16 | reward_ctrl = -0.1 * np.square(action).sum() 17 | reward_run = np.clip((xposafter - xposbefore) / self.dt, None, 3) 18 | reward = reward_ctrl + reward_run 19 | done = False 20 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 21 | 22 | def _get_obs(self): 23 | return np.concatenate( 24 | [ 25 | self.sim.data.qpos.flat[1:], 26 | self.sim.data.qvel.flat, 27 | ] 28 | ) 29 | 30 | def reset_model(self): 31 | qpos = self.init_qpos + self.np_random.uniform( 32 | low=-0.1, high=0.1, size=self.model.nq 33 | ) 34 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * 0.1 35 | self.set_state(qpos, qvel) 36 | return self._get_obs() 37 | 38 | def viewer_setup(self): 39 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /softlearning/replay_pools/union_pool.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .replay_pool import ReplayPool 4 | 5 | 6 | class UnionPool(ReplayPool): 7 | def __init__(self, pools): 8 | pool_sizes = np.array([b.size for b in pools]) 9 | self._total_size = sum(pool_sizes) 10 | self._normalized_pool_sizes = pool_sizes / self._total_size 11 | 12 | self.pools = pools 13 | 14 | def add_sample(self, *args, **kwargs): 15 | raise NotImplementedError 16 | 17 | def terminate_episode(self): 18 | raise NotImplementedError 19 | 20 | @property 21 | def size(self): 22 | return self._total_size 23 | 24 | def add_path(self, **kwargs): 25 | raise NotImplementedError 26 | 27 | def 
random_batch(self, batch_size): 28 | 29 | # TODO: Hack 30 | partial_batch_sizes = self._normalized_pool_sizes * batch_size 31 | partial_batch_sizes = partial_batch_sizes.astype(int) 32 | partial_batch_sizes[0] = batch_size - sum(partial_batch_sizes[1:]) 33 | 34 | partial_batches = [ 35 | pool.random_batch(partial_batch_size) for pool, 36 | partial_batch_size in zip(self.pools, partial_batch_sizes) 37 | ] 38 | 39 | def all_values(key): 40 | return [partial_batch[key] for partial_batch in partial_batches] 41 | 42 | keys = partial_batches[0].keys() 43 | 44 | return {key: np.concatenate(all_values(key), axis=0) for key in keys} 45 | -------------------------------------------------------------------------------- /softlearning/models/feedforward.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | 4 | from softlearning.utils.keras import PicklableKerasModel 5 | 6 | 7 | def feedforward_model(input_shapes, 8 | output_size, 9 | hidden_layer_sizes, 10 | activation='relu', 11 | output_activation='linear', 12 | preprocessors=None, 13 | name='feedforward_model', 14 | *args, 15 | **kwargs): 16 | inputs = [ 17 | tf.keras.layers.Input(shape=input_shape) 18 | for input_shape in input_shapes 19 | ] 20 | 21 | if preprocessors is None: 22 | preprocessors = (None, ) * len(inputs) 23 | 24 | preprocessed_inputs = [ 25 | preprocessor(input_) if preprocessor is not None else input_ 26 | for preprocessor, input_ in zip(preprocessors, inputs) 27 | ] 28 | 29 | concatenated = tf.keras.layers.Lambda( 30 | lambda x: tf.concat(x, axis=-1) 31 | )(preprocessed_inputs) 32 | 33 | out = concatenated 34 | for units in hidden_layer_sizes: 35 | out = tf.keras.layers.Dense( 36 | units, *args, activation=activation, **kwargs 37 | )(out) 38 | 39 | out = tf.keras.layers.Dense( 40 | output_size, *args, activation=output_activation, **kwargs 41 | )(out) 42 | 43 | model = PicklableKerasModel(inputs, out, name=name) 44 | 45 | return model 46 | 47 | -------------------------------------------------------------------------------- /maple/env/halfcheetah_jump.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | 6 | class HalfCheetahEnv(mujoco_env.MujocoEnv, utils.EzPickle): 7 | def __init__(self): 8 | mujoco_env.MujocoEnv.__init__(self, "half_cheetah.xml", 5) 9 | utils.EzPickle.__init__(self) 10 | 11 | def step(self, action): 12 | xposbefore = self.sim.data.qpos[0] 13 | self.do_simulation(action, self.frame_skip) 14 | xposafter = self.sim.data.qpos[0] 15 | 16 | ob = self._get_obs() 17 | reward_ctrl = -0.1 * np.square(action).sum() 18 | reward_run = np.clip((xposafter - xposbefore) / self.dt, None, 3) + 15 * (self.sim.data.qpos[1] - self.init_qpos[1]) 19 | reward = reward_ctrl + reward_run 20 | done = False 21 | return ob, reward, done, dict(reward_run=reward_run, reward_ctrl=reward_ctrl) 22 | 23 | def _get_obs(self): 24 | return np.concatenate( 25 | [ 26 | self.sim.data.qpos.flat[1:], 27 | self.sim.data.qvel.flat, 28 | ] 29 | ) 30 | 31 | def reset_model(self): 32 | qpos = self.init_qpos + self.np_random.uniform( 33 | low=-0.1, high=0.1, size=self.model.nq 34 | ) 35 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * 0.1 36 | self.set_state(qpos, qvel) 37 | self.z_init = self.init_qpos[1] 38 | return self._get_obs() 39 | 40 | def viewer_setup(self): 41 | self.viewer.cam.distance = self.model.stat.extent * 0.5 
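Editor's note: the customized environments under maple/env, such as halfcheetah_jump.py above, keep the standard MuJoCo Gym interface, so they can be exercised directly. A minimal usage sketch, assuming a working gym + mujoco-py setup; the random-action loop is only illustrative and is not part of the repository:
```
# Hedged sketch: roll a few random steps in the modified HalfCheetah "jump" env.
# Per step() above, the reward is a control cost plus forward velocity clipped
# at 3 plus 15 * (current torso height - initial torso height).
from maple.env.halfcheetah_jump import HalfCheetahEnv

env = HalfCheetahEnv()
obs = env.reset()
for _ in range(5):
    action = env.action_space.sample()
    obs, reward, done, info = env.step(action)
    print(reward, info['reward_run'], info['reward_ctrl'])
```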
-------------------------------------------------------------------------------- /softlearning/policies/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from softlearning.preprocessors.utils import get_preprocessor_from_params 4 | 5 | 6 | def get_gaussian_policy(env, Q, **kwargs): 7 | from .gaussian_policy import FeedforwardGaussianPolicy 8 | policy = FeedforwardGaussianPolicy( 9 | input_shapes=(env.active_observation_shape, ), 10 | output_shape=env.action_space.shape, 11 | **kwargs) 12 | 13 | return policy 14 | 15 | 16 | def get_uniform_policy(env, *args, **kwargs): 17 | from .uniform_policy import UniformPolicy 18 | policy = UniformPolicy( 19 | input_shapes=(env.active_observation_shape, ), 20 | output_shape=env.action_space.shape) 21 | 22 | return policy 23 | 24 | 25 | POLICY_FUNCTIONS = { 26 | 'GaussianPolicy': get_gaussian_policy, 27 | 'UniformPolicy': get_uniform_policy, 28 | } 29 | 30 | 31 | def get_policy(policy_type, *args, **kwargs): 32 | return POLICY_FUNCTIONS[policy_type](*args, **kwargs) 33 | 34 | 35 | def get_policy_from_variant(variant, env, Qs, *args, **kwargs): 36 | policy_params = variant['policy_params'] 37 | policy_type = policy_params['type'] 38 | policy_kwargs = deepcopy(policy_params['kwargs']) 39 | 40 | preprocessor_params = policy_kwargs.pop('preprocessor_params', None) 41 | preprocessor = get_preprocessor_from_params(env, preprocessor_params) 42 | 43 | policy = POLICY_FUNCTIONS[policy_type](env, *args, 44 | Q=Qs[0], 45 | preprocessor=preprocessor, 46 | **policy_kwargs, 47 | **kwargs) 48 | 49 | return policy 50 | -------------------------------------------------------------------------------- /log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/01/15-38-56-149640 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=88&penalty_clip=20/warn.txt: -------------------------------------------------------------------------------- 1 | [WARN] 0 : sync: start 2 | [WARN] 20 : sync: start 3 | [WARN] 40 : sync: start 4 | [WARN] 60 : sync: start 5 | [WARN] 80 : sync: start 6 | [WARN] 100 : sync: start 7 | [WARN] 120 : sync: start 8 | [WARN] 140 : sync: start 9 | [WARN] 160 : sync: start 10 | [WARN] 180 : sync: start 11 | [WARN] 200 : sync: start 12 | [WARN] 220 : sync: start 13 | [WARN] 240 : sync: start 14 | [WARN] 260 : sync: start 15 | [WARN] 280 : sync: start 16 | [WARN] 300 : sync: start 17 | [WARN] 320 : sync: start 18 | [WARN] 340 : sync: start 19 | [WARN] 360 : sync: start 20 | [WARN] 380 : sync: start 21 | [WARN] 400 : sync: start 22 | [WARN] 420 : sync: start 23 | [WARN] 440 : sync: start 24 | [WARN] 460 : sync: start 25 | [WARN] 480 : sync: start 26 | [WARN] 500 : sync: start 27 | [WARN] 520 : sync: start 28 | [WARN] 540 : sync: start 29 | [WARN] 560 : sync: start 30 | [WARN] 580 : sync: start 31 | [WARN] 600 : sync: start 32 | [WARN] 620 : sync: start 33 | [WARN] 640 : sync: start 34 | [WARN] 660 : sync: start 35 | [WARN] 680 : sync: start 36 | [WARN] 700 : sync: start 37 | [WARN] 720 : sync: start 38 | [WARN] 740 : sync: start 39 | [WARN] 760 : sync: start 40 | [WARN] 780 : sync: start 41 | [WARN] 800 : sync: start 42 | [WARN] 820 : sync: start 43 | [WARN] 840 : sync: start 44 | [WARN] 860 : sync: start 45 | [WARN] 880 : sync: start 46 | [WARN] 900 : sync: start 47 | [WARN] 920 : sync: start 48 | [WARN] 940 : sync: start 49 | [WARN] 960 : sync: start 50 | [WARN] 980 : sync: start 51 | 
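Editor's note: the policy constructors in softlearning/policies/utils.py above are dispatched through the POLICY_FUNCTIONS table and driven by a `variant` dictionary. A minimal sketch of the dispatch, assuming an adapted environment object that exposes `active_observation_shape` and `action_space` (both `env` and `observation_batch` below are placeholders, not defined in the repository):
```
# Hedged sketch of the policy dispatch in softlearning/policies/utils.py.
from softlearning.policies.utils import get_policy

uniform_policy = get_policy('UniformPolicy', env)
actions = uniform_policy.actions_np([observation_batch])  # uniform samples in [-1, 1]
```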
-------------------------------------------------------------------------------- /log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/04/09-30-38-583144 10.83.150.23 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=8&penalty_clip=20/warn.txt: -------------------------------------------------------------------------------- 1 | [WARN] 0 : sync: start 2 | [WARN] 20 : sync: start 3 | [WARN] 40 : sync: start 4 | [WARN] 60 : sync: start 5 | [WARN] 80 : sync: start 6 | [WARN] 100 : sync: start 7 | [WARN] 120 : sync: start 8 | [WARN] 140 : sync: start 9 | [WARN] 160 : sync: start 10 | [WARN] 180 : sync: start 11 | [WARN] 200 : sync: start 12 | [WARN] 220 : sync: start 13 | [WARN] 240 : sync: start 14 | [WARN] 260 : sync: start 15 | [WARN] 280 : sync: start 16 | [WARN] 300 : sync: start 17 | [WARN] 320 : sync: start 18 | [WARN] 340 : sync: start 19 | [WARN] 360 : sync: start 20 | [WARN] 380 : sync: start 21 | [WARN] 400 : sync: start 22 | [WARN] 420 : sync: start 23 | [WARN] 440 : sync: start 24 | [WARN] 460 : sync: start 25 | [WARN] 480 : sync: start 26 | [WARN] 500 : sync: start 27 | [WARN] 520 : sync: start 28 | [WARN] 540 : sync: start 29 | [WARN] 560 : sync: start 30 | [WARN] 580 : sync: start 31 | [WARN] 600 : sync: start 32 | [WARN] 620 : sync: start 33 | [WARN] 640 : sync: start 34 | [WARN] 660 : sync: start 35 | [WARN] 680 : sync: start 36 | [WARN] 700 : sync: start 37 | [WARN] 720 : sync: start 38 | [WARN] 740 : sync: start 39 | [WARN] 760 : sync: start 40 | [WARN] 780 : sync: start 41 | [WARN] 800 : sync: start 42 | [WARN] 820 : sync: start 43 | [WARN] 840 : sync: start 44 | [WARN] 860 : sync: start 45 | [WARN] 880 : sync: start 46 | [WARN] 900 : sync: start 47 | [WARN] 920 : sync: start 48 | [WARN] 940 : sync: start 49 | [WARN] 960 : sync: start 50 | [WARN] 980 : sync: start 51 | -------------------------------------------------------------------------------- /log/v2_examples.config.d4rl.walker2d_medium_expert/2022/01/01/15-42-56-985568 10.83.150.44 &info=test-4&model_suffix=200&penalty_coeff=0.25&length=20&maple_200=True&run_params.seed=888&penalty_clip=20/warn.txt: -------------------------------------------------------------------------------- 1 | [WARN] 0 : sync: start 2 | [WARN] 20 : sync: start 3 | [WARN] 40 : sync: start 4 | [WARN] 60 : sync: start 5 | [WARN] 80 : sync: start 6 | [WARN] 100 : sync: start 7 | [WARN] 120 : sync: start 8 | [WARN] 140 : sync: start 9 | [WARN] 160 : sync: start 10 | [WARN] 180 : sync: start 11 | [WARN] 200 : sync: start 12 | [WARN] 220 : sync: start 13 | [WARN] 240 : sync: start 14 | [WARN] 260 : sync: start 15 | [WARN] 280 : sync: start 16 | [WARN] 300 : sync: start 17 | [WARN] 320 : sync: start 18 | [WARN] 340 : sync: start 19 | [WARN] 360 : sync: start 20 | [WARN] 380 : sync: start 21 | [WARN] 400 : sync: start 22 | [WARN] 420 : sync: start 23 | [WARN] 440 : sync: start 24 | [WARN] 460 : sync: start 25 | [WARN] 480 : sync: start 26 | [WARN] 500 : sync: start 27 | [WARN] 520 : sync: start 28 | [WARN] 540 : sync: start 29 | [WARN] 560 : sync: start 30 | [WARN] 580 : sync: start 31 | [WARN] 600 : sync: start 32 | [WARN] 620 : sync: start 33 | [WARN] 640 : sync: start 34 | [WARN] 660 : sync: start 35 | [WARN] 680 : sync: start 36 | [WARN] 700 : sync: start 37 | [WARN] 720 : sync: start 38 | [WARN] 740 : sync: start 39 | [WARN] 760 : sync: start 40 | [WARN] 780 : sync: start 41 | [WARN] 800 : sync: start 42 | [WARN] 820 : sync: start 43 | [WARN] 840 : sync: start 44 | [WARN] 860 : sync: start 45 | [WARN] 
880 : sync: start 46 | [WARN] 900 : sync: start 47 | [WARN] 920 : sync: start 48 | [WARN] 940 : sync: start 49 | [WARN] 960 : sync: start 50 | [WARN] 980 : sync: start 51 | -------------------------------------------------------------------------------- /softlearning/preprocessors/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | 4 | def get_convnet_preprocessor(observation_shape, 5 | name='convnet_preprocessor', 6 | **kwargs): 7 | from .convnet import convnet_preprocessor 8 | preprocessor = convnet_preprocessor( 9 | input_shapes=(observation_shape, ), name=name, **kwargs) 10 | 11 | return preprocessor 12 | 13 | 14 | def get_feedforward_preprocessor(observation_shape, 15 | name='feedforward_preprocessor', 16 | **kwargs): 17 | from softlearning.models.feedforward import feedforward_model 18 | preprocessor = feedforward_model( 19 | input_shapes=(observation_shape, ), name=name, **kwargs) 20 | 21 | return preprocessor 22 | 23 | 24 | PREPROCESSOR_FUNCTIONS = { 25 | 'convnet_preprocessor': get_convnet_preprocessor, 26 | 'feedforward_preprocessor': get_feedforward_preprocessor, 27 | None: lambda *args, **kwargs: None 28 | } 29 | 30 | 31 | def get_preprocessor_from_params(env, preprocessor_params, *args, **kwargs): 32 | if preprocessor_params is None: 33 | return None 34 | 35 | preprocessor_type = preprocessor_params.get('type', None) 36 | preprocessor_kwargs = deepcopy(preprocessor_params.get('kwargs', {})) 37 | 38 | if preprocessor_type is None: 39 | return None 40 | 41 | preprocessor = PREPROCESSOR_FUNCTIONS[ 42 | preprocessor_type]( 43 | env.active_observation_shape, 44 | *args, 45 | **preprocessor_kwargs, 46 | **kwargs) 47 | 48 | return preprocessor 49 | 50 | 51 | def get_preprocessor_from_variant(variant, env, *args, **kwargs): 52 | preprocessor_params = variant['preprocessor_params'] 53 | return get_preprocessor_from_params( 54 | env, preprocessor_params, *args, **kwargs) 55 | -------------------------------------------------------------------------------- /softlearning/value_functions/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | from softlearning.preprocessors.utils import get_preprocessor_from_params 4 | from . import vanilla 5 | 6 | 7 | def create_double_value_function(value_fn, *args, **kwargs): 8 | # TODO(hartikainen): The double Q-function should support the same 9 | # interface as the regular ones. Implement the double min-thing 10 | # as a Keras layer. 
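    # [Editor's note: illustrative comment added to the dump; not part of the original file]
    # The "double min-thing" mentioned above is the clipped double-Q trick: this helper
    # builds two independent Q-functions, and downstream code takes their element-wise
    # minimum when forming targets, e.g. (hypothetical sketch)
    #     min_Q = tf.reduce_min(tf.stack([Q1_values, Q2_values], axis=0), axis=0)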
11 | value_fns = tuple(value_fn(*args, **kwargs) for i in range(2)) 12 | return value_fns 13 | 14 | 15 | VALUE_FUNCTIONS = { 16 | 'feedforward_V_function': ( 17 | vanilla.create_feedforward_V_function), 18 | 'double_feedforward_Q_function': lambda *args, **kwargs: ( 19 | create_double_value_function(vanilla.create_feedforward_Q_function, *args, **kwargs)), 20 | } 21 | 22 | 23 | def get_Q_function_from_variant(variant, env, *args, **kwargs): 24 | Q_params = variant['Q_params'] 25 | Q_type = Q_params['type'] 26 | Q_kwargs = deepcopy(Q_params['kwargs']) 27 | 28 | preprocessor_params = Q_kwargs.pop('preprocessor_params', None) 29 | preprocessor = get_preprocessor_from_params(env, preprocessor_params) 30 | 31 | return VALUE_FUNCTIONS[Q_type]( 32 | observation_shape=env.active_observation_shape, 33 | action_shape=env.action_space.shape, 34 | *args, 35 | observation_preprocessor=preprocessor, 36 | **Q_kwargs, 37 | **kwargs) 38 | 39 | 40 | def get_V_function_from_variant(variant, env, *args, **kwargs): 41 | V_params = variant['V_params'] 42 | V_type = V_params['type'] 43 | V_kwargs = deepcopy(V_params['kwargs']) 44 | 45 | preprocessor_params = V_kwargs.pop('preprocessor_params', None) 46 | preprocessor = get_preprocessor_from_params(env, preprocessor_params) 47 | return VALUE_FUNCTIONS[V_type]( 48 | observation_shape=env.active_observation_shape, 49 | *args, 50 | observation_preprocessor=preprocessor, 51 | **V_kwargs, 52 | **kwargs) 53 | -------------------------------------------------------------------------------- /softlearning/value_functions/value_function.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | 4 | from serializable import Serializable 5 | 6 | 7 | class SumQFunction(Serializable): 8 | def __init__(self, 9 | observation_shape, 10 | action_shape, 11 | q_functions): 12 | self._Serializable__initialize(locals()) 13 | 14 | self.q_functions = q_functions 15 | 16 | assert len(observation_shape) == 1, observation_shape 17 | self._Do = observation_shape[0] 18 | assert len(action_shape) == 1, action_shape 19 | self._Da = action_shape[0] 20 | 21 | self._observations_ph = tf.placeholder( 22 | tf.float32, shape=(None, self._Do), name='observations') 23 | self._actions_ph = tf.placeholder( 24 | tf.float32, shape=(None, self._Da), name='actions') 25 | 26 | self._output = self.output_for( 27 | self._observations_ph, self._actions_ph, reuse=True) 28 | 29 | def output_for(self, observations, actions, reuse=False): 30 | outputs = [ 31 | qf.output_for(observations, actions, reuse=reuse) 32 | for qf in self.q_functions 33 | ] 34 | output = tf.add_n(outputs) 35 | return output 36 | 37 | def _eval(self, observations, actions): 38 | feeds = { 39 | self._observations_ph: observations, 40 | self._actions_ph: actions 41 | } 42 | 43 | return tf.keras.backend.get_session().run(self._output, feeds) 44 | 45 | def get_param_values(self): 46 | all_values_list = [qf.get_param_values() for qf in self.q_functions] 47 | 48 | return np.concatenate(all_values_list) 49 | 50 | def set_param_values(self, all_values): 51 | param_sizes = [qf.get_param_values().size for qf in self.q_functions] 52 | split_points = np.cumsum(param_sizes)[:-1] 53 | 54 | all_values_list = np.split(all_values, split_points) 55 | 56 | for values, qf in zip(all_values_list, self.q_functions): 57 | qf.set_param_values(values) 58 | -------------------------------------------------------------------------------- /softlearning/policies/uniform_policy.py: 
-------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import tensorflow as tf 4 | 5 | from .base_policy import BasePolicy 6 | 7 | 8 | class UniformPolicy(BasePolicy): 9 | def __init__(self, input_shapes, output_shape, action_range=(-1.0, 1.0)): 10 | super(UniformPolicy, self).__init__() 11 | self._Serializable__initialize(locals()) 12 | 13 | self.inputs = [ 14 | tf.keras.layers.Input(shape=input_shape) 15 | for input_shape in input_shapes 16 | ] 17 | self._action_range = action_range 18 | 19 | x = tf.keras.layers.Lambda( 20 | lambda x: tf.concat(x, axis=-1) 21 | )(self.inputs) 22 | 23 | actions = tf.keras.layers.Lambda( 24 | lambda x: tf.random.uniform( 25 | (tf.shape(x)[0], output_shape[0]), 26 | *action_range) 27 | )(x) 28 | 29 | self.actions_model = tf.keras.Model(self.inputs, actions) 30 | 31 | self.actions_input = tf.keras.Input(shape=output_shape) 32 | 33 | log_pis = tf.keras.layers.Lambda( 34 | lambda x: tf.tile(tf.log([ 35 | (action_range[1] - action_range[0]) / 2.0 36 | ])[None], (tf.shape(x)[0], 1)) 37 | )(self.actions_input) 38 | 39 | self.log_pis_model = tf.keras.Model( 40 | (*self.inputs, self.actions_input), log_pis) 41 | 42 | def get_weights(self): 43 | return [] 44 | 45 | def set_weights(self, *args, **kwargs): 46 | return 47 | 48 | @property 49 | def trainable_variables(self): 50 | return [] 51 | 52 | def reset(self): 53 | pass 54 | 55 | def actions(self, conditions): 56 | return self.actions_model(conditions) 57 | 58 | def log_pis(self, conditions, actions): 59 | return self.log_pis_model([*conditions, actions]) 60 | 61 | def actions_np(self, conditions): 62 | return self.actions_model.predict(conditions) 63 | 64 | def log_pis_np(self, conditions, actions): 65 | return self.log_pis_model.predict([*conditions, actions]) 66 | 67 | def get_diagnostics(self, conditions): 68 | return OrderedDict({}) 69 | -------------------------------------------------------------------------------- /softlearning/samplers/base_sampler.py: -------------------------------------------------------------------------------- 1 | from collections import deque, OrderedDict 2 | from itertools import islice 3 | 4 | 5 | class BaseSampler(object): 6 | def __init__(self, 7 | max_path_length, 8 | min_pool_size, 9 | batch_size, 10 | store_last_n_paths=10): 11 | self._max_path_length = max_path_length 12 | self._min_pool_size = min_pool_size 13 | self._batch_size = batch_size 14 | self._store_last_n_paths = store_last_n_paths 15 | self._last_n_paths = deque(maxlen=store_last_n_paths) 16 | 17 | self.env = None 18 | self.policy = None 19 | self.pool = None 20 | 21 | def initialize(self, env, policy, pool): 22 | self.env = env 23 | self.policy = policy 24 | self.pool = pool 25 | 26 | def set_policy(self, policy): 27 | self.policy = policy 28 | 29 | def clear_last_n_paths(self): 30 | self._last_n_paths.clear() 31 | 32 | def get_last_n_paths(self, n=None): 33 | if n is None: 34 | n = self._store_last_n_paths 35 | 36 | last_n_paths = tuple(islice(self._last_n_paths, None, n)) 37 | 38 | return last_n_paths 39 | 40 | def sample(self): 41 | raise NotImplementedError 42 | 43 | def batch_ready(self): 44 | enough_samples = self.pool.size >= self._min_pool_size 45 | return enough_samples 46 | 47 | def random_batch(self, batch_size=None, **kwargs): 48 | batch_size = batch_size or self._batch_size 49 | return self.pool.random_batch(batch_size, **kwargs) 50 | 51 | def terminate(self): 52 | self.env.close() 53 | 54 | def get_diagnostics(self): 55 | 
diagnostics = OrderedDict({'pool-size': self.pool.size}) 56 | return diagnostics 57 | 58 | def __getstate__(self): 59 | state = { 60 | key: value for key, value in self.__dict__.items() 61 | if key not in ('env', 'policy', 'pool') 62 | } 63 | 64 | return state 65 | 66 | def __setstate__(self, state): 67 | self.__dict__.update(state) 68 | 69 | self.env = None 70 | self.policy = None 71 | self.pool = None 72 | -------------------------------------------------------------------------------- /maple/env/humanoid.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym.envs.mujoco import mujoco_env 3 | from gym import utils 4 | 5 | def mass_center(model, sim): 6 | mass = np.expand_dims(model.body_mass, 1) 7 | xpos = sim.data.xipos 8 | return (np.sum(mass * xpos, 0) / np.sum(mass))[0] 9 | 10 | class HumanoidEnv(mujoco_env.MujocoEnv, utils.EzPickle): 11 | def __init__(self): 12 | mujoco_env.MujocoEnv.__init__(self, 'humanoid.xml', 5) 13 | utils.EzPickle.__init__(self) 14 | 15 | def _get_obs(self): 16 | data = self.sim.data 17 | return np.concatenate([data.qpos.flat[2:], 18 | data.qvel.flat, 19 | # data.cinert.flat, 20 | # data.cvel.flat, 21 | # data.qfrc_actuator.flat, 22 | # data.cfrc_ext.flat 23 | ]) 24 | 25 | def step(self, a): 26 | pos_before = mass_center(self.model, self.sim) 27 | self.do_simulation(a, self.frame_skip) 28 | pos_after = mass_center(self.model, self.sim) 29 | alive_bonus = 5.0 30 | data = self.sim.data 31 | lin_vel_cost = 0.25 * (pos_after - pos_before) / self.model.opt.timestep 32 | quad_ctrl_cost = 0.1 * np.square(data.ctrl).sum() 33 | quad_impact_cost = .5e-6 * np.square(data.cfrc_ext).sum() 34 | quad_impact_cost = min(quad_impact_cost, 10) 35 | reward = lin_vel_cost - quad_ctrl_cost - quad_impact_cost + alive_bonus 36 | qpos = self.sim.data.qpos 37 | done = bool((qpos[2] < 1.0) or (qpos[2] > 2.0)) 38 | return self._get_obs(), reward, done, dict(reward_linvel=lin_vel_cost, reward_quadctrl=-quad_ctrl_cost, reward_alive=alive_bonus, reward_impact=-quad_impact_cost) 39 | 40 | def reset_model(self): 41 | c = 0.01 42 | self.set_state( 43 | self.init_qpos + self.np_random.uniform(low=-c, high=c, size=self.model.nq), 44 | self.init_qvel + self.np_random.uniform(low=-c, high=c, size=self.model.nv,) 45 | ) 46 | return self._get_obs() 47 | 48 | def viewer_setup(self): 49 | self.viewer.cam.trackbodyid = 1 50 | self.viewer.cam.distance = self.model.stat.extent * 1.0 51 | self.viewer.cam.lookat[2] = 2.0 52 | self.viewer.cam.elevation = -20 53 | 54 | -------------------------------------------------------------------------------- /softlearning/samplers/extra_policy_info_sampler.py: -------------------------------------------------------------------------------- 1 | """Sampler that stores raw actions and log pis from policy.""" 2 | 3 | 4 | from collections import defaultdict 5 | 6 | import numpy as np 7 | 8 | from .simple_sampler import SimpleSampler 9 | 10 | 11 | class ExtraPolicyInfoSampler(SimpleSampler): 12 | def sample(self): 13 | if self._current_observation is None: 14 | self._current_observation = self.env.reset() 15 | 16 | observations = self.env.convert_to_active_observation( 17 | self._current_observation)[None] 18 | actions = self.policy.actions_np([observations]) 19 | log_pis = self.policy.log_pis_np([observations], actions) 20 | 21 | action = actions[0] 22 | log_pi = log_pis[0] 23 | 24 | next_observation, reward, terminal, info = self.env.step(action) 25 | self._path_length += 1 26 | self._path_return += reward 
27 | self._total_samples += 1 28 | 29 | self._current_path['observations'].append(self._current_observation) 30 | self._current_path['actions'].append(action) 31 | self._current_path['rewards'].append([reward]) 32 | self._current_path['terminals'].append([terminal]) 33 | self._current_path['next_observations'].append(next_observation) 34 | self._current_path['infos'].append(info) 35 | # self._current_path['raw_actions'].append(raw_action) 36 | self._current_path['log_pis'].append(log_pi) 37 | 38 | if terminal or self._path_length >= self._max_path_length: 39 | last_path = { 40 | field_name: np.array(values) 41 | for field_name, values in self._current_path.items() 42 | } 43 | self.pool.add_path(last_path) 44 | self._last_n_paths.appendleft(last_path) 45 | 46 | self.policy.reset() 47 | self._current_observation = self.env.reset() 48 | 49 | self._max_path_return = max(self._max_path_return, 50 | self._path_return) 51 | self._last_path_return = self._path_return 52 | 53 | self._path_length = 0 54 | self._path_return = 0 55 | self._current_path = defaultdict(list) 56 | 57 | self._n_episodes += 1 58 | else: 59 | self._current_observation = next_observation 60 | 61 | return self._current_observation, reward, terminal, info 62 | -------------------------------------------------------------------------------- /rla_scripts/start_pretty_plotter.py: -------------------------------------------------------------------------------- 1 | """ 2 | A script to start a server of the pretty plotter. 3 | 4 | """ 5 | import os 6 | 7 | from RLA.easy_log.log_tools import PrettyPlotterTool, Filter 8 | import argparse 9 | from RLA.rla_argparser import boolean_flag 10 | from config import * 11 | 12 | from smart_logger.front_page.page import start_page_server 13 | import smart_logger.common.plot_config as plot_config 14 | import smart_logger.common.page_config as page_config 15 | 16 | 17 | def argsparser(): 18 | parser = argparse.ArgumentParser("Delete Log") 19 | # reduce setting 20 | parser.add_argument('--task_table_name', type=str, default="") 21 | parser.add_argument('--regex', type=str) 22 | parser.add_argument('--timestep_bound', type=int, default=100) 23 | parser.add_argument('--delete_type', type=str, default=Filter.ALL) 24 | parser.add_argument('--workspace_path', '-wks', type=str, default='~/.pretty_plotter_cache', 25 | help="Path to the workspace, used to saving cache data") 26 | parser.add_argument('--user_name', '-u', type=str, default='user', 27 | help="user name") 28 | parser.add_argument('--password', '-pw', type=str, default='123456', 29 | help="password") 30 | parser.add_argument('--port', '-p', type=int, default=7005, help="Server port") 31 | boolean_flag(parser, 'start_server', default=False) 32 | args = parser.parse_args() 33 | return args 34 | 35 | if __name__=='__main__': 36 | args = argsparser() 37 | filter = Filter() 38 | filter.config(type=args.delete_type, timstep_bound=args.timestep_bound) 39 | tool = PrettyPlotterTool(proj_root=DATA_ROOT, task_table_name=args.task_table_name, regex=args.regex) 40 | tool.gen_json(args.regex) 41 | if args.start_server: 42 | plot_config.DATA_PATH = os.path.abspath(DATA_ROOT) 43 | page_config.WORKSPAPCE = os.path.abspath(os.path.expanduser(args.workspace_path)) 44 | 45 | plot_config.DATA_MERGER = [] 46 | plot_config.PLOTTING_XY = [] 47 | plot_config.PLOT_LOG_PATH = f"{plot_config.DATA_PATH}" 48 | plot_config.PLOT_FIGURE_SAVING_PATH = f"{os.path.join(os.path.dirname(plot_config.DATA_PATH), 'figure')}" 49 | 50 | page_config.WEB_RAM_PATH = 
f"{page_config.WORKSPAPCE}/WEB_ROM" 51 | page_config.CONFIG_PATH = f"{page_config.WEB_RAM_PATH}/configs" 52 | page_config.FIGURE_PATH = f"{page_config.WEB_RAM_PATH}/figures" 53 | page_config.PORT = args.port 54 | page_config.USER_NAME = args.user_name 55 | page_config.PASSWD = args.password 56 | start_page_server() 57 | -------------------------------------------------------------------------------- /softlearning/preprocessors/convnet.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | 3 | from softlearning.models.feedforward import feedforward_model 4 | from softlearning.utils.keras import PicklableKerasModel 5 | 6 | 7 | def convnet_preprocessor( 8 | input_shapes, 9 | image_shape, 10 | output_size, 11 | conv_filters=(32, 32), 12 | conv_kernel_sizes=((5, 5), (5, 5)), 13 | pool_type='MaxPool2D', 14 | pool_sizes=((2, 2), (2, 2)), 15 | pool_strides=(2, 2), 16 | dense_hidden_layer_sizes=(64, 64), 17 | data_format='channels_last', 18 | name="convnet_preprocessor", 19 | make_picklable=True, 20 | *args, 21 | **kwargs): 22 | if data_format == 'channels_last': 23 | H, W, C = image_shape 24 | elif data_format == 'channels_first': 25 | C, H, W = image_shape 26 | 27 | inputs = [ 28 | tf.keras.layers.Input(shape=input_shape) 29 | for input_shape in input_shapes 30 | ] 31 | 32 | concatenated_input = tf.keras.layers.Lambda( 33 | lambda x: tf.concat(x, axis=-1) 34 | )(inputs) 35 | 36 | images_flat, input_raw = tf.keras.layers.Lambda( 37 | lambda x: [x[..., :H * W * C], x[..., H * W * C:]] 38 | )(concatenated_input) 39 | 40 | images = tf.keras.layers.Reshape(image_shape)(images_flat) 41 | 42 | conv_out = images 43 | for filters, kernel_size, pool_size, strides in zip( 44 | conv_filters, conv_kernel_sizes, pool_sizes, pool_strides): 45 | conv_out = tf.keras.layers.Conv2D( 46 | filters=filters, 47 | kernel_size=kernel_size, 48 | padding="SAME", 49 | activation=tf.nn.relu, 50 | *args, 51 | **kwargs 52 | )(conv_out) 53 | conv_out = getattr(tf.keras.layers, pool_type)( 54 | pool_size=pool_size, strides=strides 55 | )(conv_out) 56 | 57 | flattened = tf.keras.layers.Flatten()(conv_out) 58 | concatenated_output = tf.keras.layers.Lambda( 59 | lambda x: tf.concat(x, axis=-1) 60 | )([flattened, input_raw]) 61 | 62 | output = ( 63 | feedforward_model( 64 | input_shapes=(concatenated_output.shape[1:].as_list(), ), 65 | output_size=output_size, 66 | hidden_layer_sizes=dense_hidden_layer_sizes, 67 | activation='relu', 68 | output_activation='linear', 69 | *args, 70 | **kwargs 71 | )([concatenated_output]) 72 | if dense_hidden_layer_sizes 73 | else concatenated_output) 74 | 75 | model = PicklableKerasModel(inputs, output, name=name) 76 | 77 | return model 78 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | venv/ 108 | env.bak/ 109 | venv.bak/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | .dmypy.json 124 | dmypy.json 125 | 126 | # Pyre type checker 127 | .pyre/ 128 | 129 | .idea 130 | 131 | **.npz 132 | luban_start_* 133 | 134 | code/** 135 | checkpoint/** 136 | models/** 137 | results/** 138 | archive_tester/** 139 | log/** 140 | archived/** 141 | .DS_Store -------------------------------------------------------------------------------- /softlearning/misc/plotter.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | 5 | 6 | class QFPolicyPlotter: 7 | def __init__(self, Q, policy, obs_lst, default_action, n_samples): 8 | self._Q = Q 9 | self._policy = policy 10 | self._obs_lst = obs_lst 11 | self._default_action = np.array(default_action) 12 | self._n_samples = n_samples 13 | 14 | self._var_inds = np.where(np.isnan(default_action))[0] 15 | assert len(self._var_inds) == 2 16 | 17 | n_plots = len(obs_lst) 18 | 19 | x_size = 5 * n_plots 20 | y_size = 5 21 | 22 | fig = plt.figure(figsize=(x_size, y_size)) 23 | self._ax_lst = [] 24 | for i in range(n_plots): 25 | ax = fig.add_subplot(100 + n_plots * 10 + i + 1) 26 | ax.set_xlim((-1, 1)) 27 | ax.set_ylim((-1, 1)) 28 | ax.grid(True) 29 | self._ax_lst.append(ax) 30 | 31 | self._line_objects = list() 32 | 33 | def draw(self): 34 | # noinspection PyArgumentList 35 | [h.remove() for h in self._line_objects] 36 | self._line_objects = list() 37 | 38 | self._plot_level_curves() 39 | self._plot_action_samples() 40 | 41 | plt.draw() 42 | plt.pause(0.001) 43 | 44 | def _plot_level_curves(self): 45 | # Create mesh grid. 
46 | xs = np.linspace(-1, 1, 50) 47 | ys = np.linspace(-1, 1, 50) 48 | xgrid, ygrid = np.meshgrid(xs, ys) 49 | N = len(xs)*len(ys) 50 | 51 | # Copy default values along the first axis and replace nans with 52 | # the mesh grid points. 53 | actions = np.tile(self._default_action.astype(np.float32), (N, 1)) 54 | actions[:, self._var_inds[0]] = xgrid.ravel() 55 | actions[:, self._var_inds[1]] = ygrid.ravel() 56 | 57 | for ax, obs in zip(self._ax_lst, self._obs_lst): 58 | observations = np.tile( 59 | obs[None].astype(np.float32), (actions.shape[0], 1)) 60 | 61 | Q_np = self._Q.predict((observations, actions)) 62 | Q_np = np.reshape(Q_np, xgrid.shape) 63 | 64 | cs = ax.contour(xgrid, ygrid, Q_np, 20) 65 | self._line_objects += cs.collections 66 | self._line_objects += ax.clabel( 67 | cs, inline=1, fontsize=10, fmt='%.2f') 68 | 69 | def _plot_action_samples(self): 70 | for ax, obs in zip(self._ax_lst, self._obs_lst): 71 | observations = np.ones((self._n_samples, 1)) * obs[None, :] 72 | actions = self._policy.actions_np([observations]) 73 | 74 | x, y = actions[:, 0], actions[:, 1] 75 | self._line_objects += ax.plot(x, y, 'b*') 76 | -------------------------------------------------------------------------------- /maple/env/ant.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | # def __init__(self, goal=15.0/180*np.pi): 7 | def __init__(self, goal=30.0/180*np.pi): 8 | self._goal = goal 9 | mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 5) 10 | utils.EzPickle.__init__(self) 11 | 12 | def step(self, a): 13 | # self.render() 14 | xy_position_before = self.get_body_com("torso")[:2].copy() 15 | self.do_simulation(a, self.frame_skip) 16 | xy_position_after = self.get_body_com("torso")[:2].copy() 17 | direct = (np.cos(self._goal), np.sin(self._goal)) 18 | 19 | xy_velocity = (xy_position_after - xy_position_before) / self.dt 20 | x_velocity, y_velocity = xy_velocity 21 | 22 | # xposbefore = self.get_body_com("torso")[0] 23 | # self.do_simulation(a, self.frame_skip) 24 | # xposafter = self.get_body_com("torso")[0] 25 | 26 | # forward_reward = (xposafter - xposbefore)/self.dt 27 | forward_reward = x_velocity 28 | angle_reward = np.dot(np.array(xy_velocity), direct) 29 | ctrl_cost = .5 * np.square(a).sum() 30 | contact_cost = 0.5 * 1e-3 * np.sum( 31 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 32 | survive_reward = 1.0 33 | reward = forward_reward - ctrl_cost - contact_cost + survive_reward 34 | state = self.state_vector() 35 | notdone = np.isfinite(state).all() \ 36 | and state[2] >= 0.2 and state[2] <= 1.0 37 | done = not notdone 38 | ob = self._get_obs(xy_velocity) 39 | return ob, reward, done, dict( 40 | reward_ctrl=-ctrl_cost, 41 | reward_contact=-contact_cost, 42 | reward_survive=survive_reward, 43 | reward_forward=forward_reward, 44 | reward_angle=angle_reward, 45 | x_position=xy_position_after[0], 46 | y_position=xy_position_after[1]) 47 | 48 | def _get_obs(self, xy_velocity): 49 | return np.concatenate([ 50 | # self.get_body_com("torso")[:2].copy(), 51 | self.sim.data.qpos.flat[2:], 52 | self.sim.data.qvel.flat, 53 | np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 54 | np.array(xy_velocity), 55 | ]) 56 | 57 | def reset_model(self): 58 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 59 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 60 | 
self.set_state(qpos, qvel) 61 | return self._get_obs([0,0]) 62 | 63 | def viewer_setup(self): 64 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /maple/env/ant_angle.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from gym import utils 3 | from gym.envs.mujoco import mujoco_env 4 | 5 | class AntEnv(mujoco_env.MujocoEnv, utils.EzPickle): 6 | # def __init__(self, goal=15.0/180*np.pi): 7 | def __init__(self, goal=30.0/180*np.pi): 8 | self._goal = goal 9 | mujoco_env.MujocoEnv.__init__(self, 'ant.xml', 5) 10 | utils.EzPickle.__init__(self) 11 | 12 | def step(self, a): 13 | # self.render() 14 | xy_position_before = self.get_body_com("torso")[:2].copy() 15 | self.do_simulation(a, self.frame_skip) 16 | xy_position_after = self.get_body_com("torso")[:2].copy() 17 | direct = (np.cos(self._goal), np.sin(self._goal)) 18 | 19 | xy_velocity = (xy_position_after - xy_position_before) / self.dt 20 | x_velocity, y_velocity = xy_velocity 21 | 22 | # xposbefore = self.get_body_com("torso")[0] 23 | # self.do_simulation(a, self.frame_skip) 24 | # xposafter = self.get_body_com("torso")[0] 25 | 26 | # forward_reward = (xposafter - xposbefore)/self.dt 27 | forward_reward = x_velocity 28 | angle_reward = np.dot(np.array(xy_velocity), direct) 29 | ctrl_cost = .5 * np.square(a).sum() 30 | contact_cost = 0.5 * 1e-3 * np.sum( 31 | np.square(np.clip(self.sim.data.cfrc_ext, -1, 1))) 32 | survive_reward = 1.0 33 | reward = x_velocity * np.cos(np.pi/6) + y_velocity * np.sin(np.pi/6) - ctrl_cost - contact_cost + survive_reward 34 | state = self.state_vector() 35 | notdone = np.isfinite(state).all() \ 36 | and state[2] >= 0.2 and state[2] <= 1.0 37 | done = not notdone 38 | ob = self._get_obs(xy_velocity) 39 | return ob, reward, done, dict( 40 | reward_ctrl=-ctrl_cost, 41 | reward_contact=-contact_cost, 42 | reward_survive=survive_reward, 43 | reward_forward=forward_reward, 44 | reward_angle=angle_reward, 45 | x_position=xy_position_after[0], 46 | y_position=xy_position_after[1]) 47 | 48 | def _get_obs(self, xy_velocity): 49 | return np.concatenate([ 50 | # self.get_body_com("torso")[:2].copy(), 51 | self.sim.data.qpos.flat[2:], 52 | self.sim.data.qvel.flat, 53 | np.clip(self.sim.data.cfrc_ext, -1, 1).flat, 54 | np.array(xy_velocity), 55 | ]) 56 | 57 | def reset_model(self): 58 | qpos = self.init_qpos + self.np_random.uniform(size=self.model.nq, low=-.1, high=.1) 59 | qvel = self.init_qvel + self.np_random.randn(self.model.nv) * .1 60 | self.set_state(qpos, qvel) 61 | return self._get_obs([0, 0]) 62 | 63 | def viewer_setup(self): 64 | self.viewer.cam.distance = self.model.stat.extent * 0.5 -------------------------------------------------------------------------------- /softlearning/algorithms/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | 4 | def create_SAC_algorithm(variant, *args, **kwargs): 5 | from .sac import SAC 6 | 7 | algorithm = SAC(*args, **kwargs) 8 | 9 | return algorithm 10 | 11 | 12 | def create_SQL_algorithm(variant, *args, **kwargs): 13 | from .sql import SQL 14 | 15 | algorithm = SQL(*args, **kwargs) 16 | 17 | return algorithm 18 | 19 | def create_MVE_algorithm(variant, *args, **kwargs): 20 | from .mve_sac import MVESAC 21 | 22 | algorithm = MVESAC(*args, **kwargs) 23 | 24 | return algorithm 25 | 26 | def create_MOPO_algorithm(variant, *args, **kwargs): 27 | from 
mopo.algorithms.mopo import MOPO 28 | 29 | algorithm = MOPO(*args, **kwargs) 30 | 31 | return algorithm 32 | 33 | 34 | ALGORITHM_CLASSES = { 35 | 'SAC': create_SAC_algorithm, 36 | 'SQL': create_SQL_algorithm, 37 | 'MOPO': create_MOPO_algorithm, 38 | } 39 | 40 | 41 | def get_algorithm_from_variant(variant, *args, **kwargs): 42 | algorithm_params = variant['algorithm_params'] 43 | algorithm_type = algorithm_params['type'] 44 | algorithm_kwargs = deepcopy(algorithm_params['kwargs']) 45 | exp_name = variant['algorithm_params']["exp_name"] 46 | # vae = variant['use_vae'] 47 | retrain_model = variant['retrain_model'] 48 | exp_name = exp_name.replace('_', '-') 49 | if algorithm_kwargs['separate_mean_var']: 50 | exp_name += '_smv' 51 | algorithm_kwargs["model_name"] = exp_name + '_1_{}'.format(variant['model_suffix']) 52 | algorithm_kwargs["tester"] = kwargs['tester'] 53 | if variant['length'] > 0: 54 | algorithm_kwargs['rollout_length'] = variant['length'] 55 | if variant['penalty_coeff'] >= 0: 56 | algorithm_kwargs['penalty_coeff'] = variant['penalty_coeff'] 57 | if variant['elite_num'] > 0: 58 | algorithm_kwargs['num_elites'] = variant['elite_num'] 59 | algorithm_kwargs['fix_env'] = variant['fix_env'] 60 | kwargs = {**kwargs, **algorithm_kwargs.toDict()} 61 | kwargs['vae'] = variant['use_vae'] 62 | kwargs['clip_state'] = not variant["no_clip_state"] 63 | kwargs['res_dyn'] = variant["res_dyn"] 64 | kwargs['norm_input'] = not variant["no_norm_input"] 65 | kwargs['seed'] = variant['run_params']['seed'] 66 | # kwargs['load_task_name'] = variant['load_task_name'] 67 | # kwargs['load_date'] = variant['load_date'] 68 | print("[ DEBUG ]: kwargs to net is {}".format(kwargs)) 69 | # if retrain_model: 70 | # print('[ DEBUG ] retraining model... ') 71 | # print(kwargs) 72 | # kwargs['model_load_dir'] = None 73 | kwargs['retrain'] = retrain_model 74 | kwargs['network_kwargs']['embedding_size'] = variant['emb_size'] 75 | kwargs['n_epochs'] = variant['n_epochs'] 76 | kwargs['source'] = variant['config'].split('.')[-2] 77 | algorithm = ALGORITHM_CLASSES[algorithm_type](variant, *args, **kwargs) 78 | return algorithm 79 | -------------------------------------------------------------------------------- /softlearning/samplers/utils.py: -------------------------------------------------------------------------------- 1 | from copy import deepcopy 2 | 3 | import numpy as np 4 | 5 | from softlearning import replay_pools 6 | from . 
import ( 7 | dummy_sampler, 8 | extra_policy_info_sampler, 9 | # remote_sampler, 10 | base_sampler, 11 | simple_sampler) 12 | 13 | 14 | def get_sampler_from_variant(variant, *args, **kwargs): 15 | SAMPLERS = { 16 | 'DummySampler': dummy_sampler.DummySampler, 17 | 'ExtraPolicyInfoSampler': ( 18 | extra_policy_info_sampler.ExtraPolicyInfoSampler), 19 | # 'RemoteSampler': remote_sampler.RemoteSampler, 20 | 'Sampler': base_sampler.BaseSampler, 21 | 'SimpleSampler': simple_sampler.SimpleSampler, 22 | } 23 | 24 | sampler_params = variant['sampler_params'] 25 | sampler_type = sampler_params['type'] 26 | 27 | sampler_args = deepcopy(sampler_params.get('args', ())) 28 | sampler_kwargs = deepcopy(sampler_params.get('kwargs', {})) 29 | 30 | sampler = SAMPLERS[sampler_type]( 31 | *sampler_args, *args, **sampler_kwargs, **kwargs) 32 | 33 | return sampler 34 | 35 | 36 | def rollout(env, 37 | policy, 38 | path_length, 39 | callback=None, 40 | render_mode=None, 41 | break_on_terminal=True): 42 | observation_space = env.observation_space 43 | action_space = env.action_space 44 | 45 | pool = replay_pools.SimpleReplayPool( 46 | observation_space, action_space, max_size=path_length) 47 | sampler = simple_sampler.SimpleSampler( 48 | max_path_length=path_length, 49 | min_pool_size=None, 50 | batch_size=None) 51 | 52 | sampler.initialize(env, policy, pool) 53 | 54 | images = [] 55 | infos = [] 56 | 57 | t = 0 58 | for t in range(path_length): 59 | observation, reward, terminal, info = sampler.sample() 60 | infos.append(info) 61 | 62 | if callback is not None: 63 | callback(observation) 64 | 65 | if render_mode is not None: 66 | if render_mode == 'rgb_array': 67 | image = env.render(mode=render_mode) 68 | # import pdb; pdb.set_trace() 69 | # image = env._env.sim.render(mode='offscreen') 70 | images.append(image) 71 | else: 72 | env.render() 73 | 74 | if terminal: 75 | sampler.reset_policy() 76 | if break_on_terminal: break 77 | 78 | assert pool._size == t + 1 79 | 80 | path = pool.batch_by_indices( 81 | np.arange(pool._size), 82 | observation_keys=getattr(env, 'observation_keys', None)) 83 | path['infos'] = infos 84 | 85 | if render_mode == 'rgb_array': 86 | path['images'] = np.stack(images, axis=0) 87 | 88 | return path 89 | 90 | 91 | def rollouts(n_paths, *args, **kwargs): 92 | paths = [rollout(*args, **kwargs) for i in range(n_paths)] 93 | return paths 94 | -------------------------------------------------------------------------------- /maple/models/constructor.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | from maple.models.fc import FC 5 | from maple.models.bnn import BNN 6 | 7 | def construct_model(obs_dim=11, act_dim=3, rew_dim=1, hidden_dim=200, num_networks=7, 8 | num_elites=5, session=None, model_type='mlp', separate_mean_var=False, 9 | name=None, load_dir=None, deterministic=False, source=None): 10 | if name is None: 11 | name = 'BNN' 12 | print('[ BNN ] Name {} | Observation dim {} | Action dim: {} | Hidden dim: {}'.format(name, obs_dim, act_dim, hidden_dim)) 13 | params = {'name': name, 'num_networks': num_networks, 'num_elites': num_elites, 14 | 'sess': session, 'separate_mean_var': separate_mean_var, 'deterministic': deterministic, 15 | 'obs_dim': obs_dim, 'source': source} 16 | 17 | if load_dir is not None: 18 | print('Specified load dir', load_dir) 19 | params['model_dir'] = load_dir 20 | 21 | model = BNN(params) 22 | 23 | if not model.model_loaded: 24 | if model_type == 'identity': 25 | return 26 | 
elif model_type == 'linear': 27 | print('[ BNN ] Training linear model') 28 | model.add(FC(obs_dim+rew_dim, input_dim=obs_dim+act_dim, weight_decay=0.000025)) 29 | elif model_type == 'mlp': 30 | print('[ BNN ] Training non-linear model | Obs: {} | Act: {} | Rew: {}'.format(obs_dim, act_dim, rew_dim)) 31 | model.add(FC(hidden_dim, input_dim=obs_dim+act_dim, activation="swish", weight_decay=0.000025)) 32 | model.add(FC(hidden_dim, activation="swish", weight_decay=0.00005)) 33 | model.add(FC(hidden_dim, activation="swish", weight_decay=0.000075)) 34 | model.add(FC(hidden_dim, activation="swish", weight_decay=0.000075)) 35 | model.add(FC(obs_dim+rew_dim, weight_decay=0.0001)) 36 | if separate_mean_var: 37 | model.add(FC(obs_dim+rew_dim, input_dim=hidden_dim, weight_decay=0.0001), var_layer=True) 38 | 39 | if load_dir is not None: 40 | model.model_loaded = True 41 | if source == 'd4rl': 42 | model.finalize(tf.train.AdamOptimizer, {"learning_rate": 0.001}) 43 | elif source == 'neorl': 44 | model.finalize(tf.contrib.opt.AdamWOptimizer, {"learning_rate": 0.001, "weight_decay":0.000075}) 45 | print('[ BNN ] Model: {}'.format(model)) 46 | return model 47 | 48 | 49 | def format_samples_for_training(samples): 50 | # terminals = samples["terminals"] 51 | # terminals_idx = np.where(terminals)[0] 52 | obs = samples['observations'] 53 | act = samples['actions'] 54 | next_obs = samples['next_observations'] 55 | # next_obs[terminals_idx] = obs[terminals_idx] 56 | rew = samples['rewards'] 57 | delta_obs = next_obs - obs 58 | inputs = np.concatenate((obs, act), axis=-1) 59 | outputs = np.concatenate((rew, delta_obs), axis=-1) 60 | 61 | return inputs, outputs 62 | 63 | 64 | def reset_model(model): 65 | model_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=model.name) 66 | model.sess.run(tf.initialize_vars(model_vars)) 67 | 68 | if __name__ == '__main__': 69 | model = construct_model() 70 | -------------------------------------------------------------------------------- /softlearning/misc/kernel.py: -------------------------------------------------------------------------------- 1 | from distutils.version import LooseVersion 2 | 3 | import numpy as np 4 | import tensorflow as tf 5 | 6 | 7 | def adaptive_isotropic_gaussian_kernel(xs, ys, h_min=1e-3): 8 | """Gaussian kernel with dynamic bandwidth. 9 | 10 | The bandwidth is adjusted dynamically to match median_distance / log(Kx). 11 | See [2] for more information. 12 | 13 | Args: 14 | xs(`tf.Tensor`): A tensor of shape (N x Kx x D) containing N sets of Kx 15 | particles of dimension D. This is the first kernel argument. 16 | ys(`tf.Tensor`): A tensor of shape (N x Ky x D) containing N sets of Kx 17 | particles of dimension D. This is the second kernel argument. 18 | h_min(`float`): Minimum bandwidth. 19 | 20 | Returns: 21 | `dict`: Returned dictionary has two fields: 22 | 'output': A `tf.Tensor` object of shape (N x Kx x Ky) representing 23 | the kernel matrix for inputs `xs` and `ys`. 24 | 'gradient': A 'tf.Tensor` object of shape (N x Kx x Ky x D) 25 | representing the gradient of the kernel with respect to `xs`. 26 | 27 | Reference: 28 | [2] Qiang Liu,Dilin Wang, "Stein Variational Gradient Descent: A General 29 | Purpose Bayesian Inference Algorithm," Neural Information Processing 30 | Systems (NIPS), 2016. 31 | """ 32 | Kx, D = xs.get_shape().as_list()[-2:] 33 | Ky, D2 = ys.get_shape().as_list()[-2:] 34 | assert D == D2 35 | 36 | leading_shape = tf.shape(xs)[:-2] 37 | 38 | # Compute the pairwise distances of left and right particles. 
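    # [Editor's note: illustrative comment added to the dump; not part of the original file]
    # Shape bookkeeping: xs is (..., Kx, D) and ys is (..., Ky, D); the expand_dims
    # calls below give (..., Kx, 1, D) and (..., 1, Ky, D), so the subtraction
    # broadcasts to (..., Kx, Ky, D) and dist_sq reduces the last axis to (..., Kx, Ky).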
39 | diff = tf.expand_dims(xs, -2) - tf.expand_dims(ys, -3) 40 | # ... x Kx x Ky x D 41 | 42 | if LooseVersion(tf.__version__) <= LooseVersion('1.5.0'): 43 | dist_sq = tf.reduce_sum(diff**2, axis=-1, keep_dims=False) 44 | else: 45 | dist_sq = tf.reduce_sum(diff**2, axis=-1, keepdims=False) 46 | # ... x Kx x Ky 47 | 48 | # Get median. 49 | input_shape = tf.concat((leading_shape, [Kx * Ky]), axis=0) 50 | values, _ = tf.nn.top_k( 51 | input=tf.reshape(dist_sq, input_shape), 52 | k=(Kx * Ky // 2 + 1), # This is exactly true only if Kx*Ky is odd. 53 | sorted=True) # ... x floor(Ks*Kd/2) 54 | 55 | medians_sq = values[..., -1] # ... (shape) (last element is the median) 56 | 57 | h = medians_sq / np.log(Kx) # ... (shape) 58 | h = tf.maximum(h, h_min) 59 | h = tf.stop_gradient(h) # Just in case. 60 | h_expanded_twice = tf.expand_dims(tf.expand_dims(h, -1), -1) 61 | # ... x 1 x 1 62 | 63 | kappa = tf.exp(-dist_sq / h_expanded_twice) # ... x Kx x Ky 64 | 65 | # Construct the gradient 66 | h_expanded_thrice = tf.expand_dims(h_expanded_twice, -1) 67 | # ... x 1 x 1 x 1 68 | kappa_expanded = tf.expand_dims(kappa, -1) # ... x Kx x Ky x 1 69 | 70 | kappa_grad = -2 * diff / h_expanded_thrice * kappa_expanded 71 | # ... x Kx x Ky x D 72 | 73 | return {"output": kappa, "gradient": kappa_grad} 74 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # MAPLE 2 | The Official Code for "[MAPLE: Offline Model-based Adaptable Policy Learning](https://proceedings.neurips.cc/paper/2021/hash/470e7a4f017a5476afb7eeb3f8b96f9b-Abstract.html)". 3 | ![](./resources/poster.png) 4 | After the paper was accepted at NeurIPS'21, we additionally conducted experiments on [NeoRL](https://arxiv.org/abs/2102.00714). The results can be found in the following table. 5 | ![](./resources/neorl-maple.png) 6 | \* In this process, we introduced some of the implementation tricks from [the NeoRL version of MOPO](https://agit.ai/Polixir/OfflineRL/src/branch/master/offlinerl) into MAPLE, which also makes the training process of MAPLE more stable on NeoRL tasks and keeps (or further improves) the performance on D4RL. 7 | 8 | (The PyTorch version of MAPLE can also be found at: https://github.com/polixir/OfflineRL) 9 | 10 | # [optional] Download Resources 11 | 12 | For better reproducibility, we uploaded a backup of the datasets used in our experiments, since we found that the contents of the datasets in [D4RL](https://github.com/rail-berkeley/d4rl) and [NeoRL](https://github.com/polixir/NeoRL) might change. 13 | - D4RL: https://drive.google.com/drive/folders/1kgNg6xLHRTyb_tzDQULezB9XYGNuakCM?usp=sharing 14 | - NeoRL: https://drive.google.com/drive/folders/1gZdVQTY_7FLCFGqszHF9sfKcXT8epoze?usp=sharing 15 | 16 | After downloading, place the D4RL data in ~/.d4rl/datasets and the NeoRL data in {your path to MAPLE}/neorl_data/ 17 | 18 | We have also uploaded the dynamics models used for MAPLE-200 and MAPLE-NeoRL (which have 50 ensemble models) training; they can be found at: https://drive.google.com/drive/folders/1Ex9_RyJsafKaU2Eo5UgD34ZqJnJ25cru?usp=sharing. 19 | You can download the models to {path to MAPLE}/models to skip the dynamics model training process. 20 | 21 | # Installation 22 | 23 | We use [RLAssistant](https://github.com/xionghuichen/RLAssistant) to manage our experiments. You can download and install it via: 24 | ``` 25 | git clone https://github.com/xionghuichen/RLAssistant.git 26 | cd RLAssistant 27 | pip install -e . 
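# (Editor's note, illustrative) optional sanity check after installing; this assumes
# the package is importable as `RLA`, the name used by rla_scripts/delete_expt.py
python -c "from RLA.easy_log.log_tools import DeleteLogTool, Filter"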
28 | ``` 29 | Then you can install MAPLE via: 30 | ``` 31 | git clone https://github.com/xionghuichen/MAPLE.git 32 | cd MAPLE 33 | pip install -e . 34 | ``` 35 | 36 | # Quick Start 37 | 38 | You can train your MAPLE policy directly like this: 39 | ``` 40 | cd run_scripts 41 | # train the MAPLE policy for the hopper_low task in neorl 42 | python main.py --config examples.config.neorl.hopper_low 43 | # or 44 | 45 | # train the MAPLE policy for the walker2d_medium_expert task in d4rl 46 | python main.py --config examples.config.d4rl.walker2d_medium_expert 47 | 48 | # train the MAPLE policy for the walker2d_medium_expert task in d4rl with 200 dynamics models 49 | python main.py --config examples.config.d4rl.walker2d_medium_expert --maple_200 50 | 51 | 52 | # train the MAPLE policy for the walker2d_medium_expert task in d4rl with your custom configs 53 | python main.py --config examples.config.d4rl.walker2d_medium_expert --custom_config --penalty_coeff 1.0 54 | ``` 55 | 56 | The training logs can be found in {your MAPLE path}/log. You can use tensorboard to check them, and you can also use the tools in RLA to visualize them (e.g., ```RLA.easy_plot.plot_func.plot_res_func```). 57 | You can check plot_demo.ipynb for more details. The figure for the simplest setting will look something like this: 58 | 59 | ![](./resources/plot_demo.png) 60 | 61 | There are also some scripts in ``./rla_scripts`` to manage the experimental logs. 62 | -------------------------------------------------------------------------------- /run_scripts/utils.py: -------------------------------------------------------------------------------- 1 | import multiprocessing 2 | import argparse 3 | from distutils.util import strtobool 4 | import json 5 | 6 | # from ray.tune import sample_from 7 | 8 | import softlearning.algorithms.utils as alg_utils 9 | import softlearning.environments.utils as env_utils 10 | from softlearning.misc.utils import datetimestamp 11 | 12 | 13 | DEFAULT_UNIVERSE = 'gym' 14 | DEFAULT_DOMAIN = 'HalfCheetah' 15 | DEFAULT_TASK = 'v2' 16 | 17 | class AlgType(object): 18 | MAPLE_NEORL = 'maple_neorl' 19 | MAPLE_D4RL = 'maple_d4rl' 20 | MAPLE_D4RL_200 = 'maple_d4rl_200' 21 | 22 | 23 | TASKS_BY_DOMAIN_BY_UNIVERSE = { 24 | universe: { 25 | domain: tuple(tasks) 26 | for domain, tasks in domains.items() 27 | } 28 | for universe, domains in env_utils.ENVIRONMENTS.items() 29 | } 30 | 31 | AVAILABLE_TASKS = set(sum( 32 | [ 33 | tasks 34 | for universe, domains in TASKS_BY_DOMAIN_BY_UNIVERSE.items() 35 | for domain, tasks in domains.items() 36 | ], 37 | ())) 38 | 39 | DOMAINS_BY_UNIVERSE = { 40 | universe: tuple(domains) 41 | for universe, domains in env_utils.ENVIRONMENTS.items() 42 | } 43 | 44 | AVAILABLE_DOMAINS = set(sum(DOMAINS_BY_UNIVERSE.values(), ())) 45 | 46 | UNIVERSES = tuple(env_utils.ENVIRONMENTS) 47 | 48 | AVAILABLE_ALGORITHMS = set(alg_utils.ALGORITHM_CLASSES.keys()) 49 | 50 | 51 | 52 | 53 | 54 | def get_parser(allow_policy_list=False): 55 | parser = argparse.ArgumentParser() 56 | 57 | # parser.add_argument( 58 | # '--universe', 59 | # type=str, 60 | # choices=UNIVERSES, 61 | # default=DEFAULT_UNIVERSE) 62 | # parser.add_argument( 63 | # '--domain', 64 | # type=str, 65 | # choices=AVAILABLE_DOMAINS, 66 | # default=DEFAULT_DOMAIN) 67 | parser.add_argument( 68 | '--config', 69 | type=str, 70 | default='examples.config.d4rl.halfcheetah_medium_expert' 71 | ) 72 | parser.add_argument( 73 | '--info', type=str, default='default_info') 74 | parser.add_argument('--length', type=int, default=-1) 75 | parser.add_argument('--penalty_clip', type=float, 
default=20) 76 | parser.add_argument('--elite_num', type=int, default=-1) 77 | parser.add_argument( '--seed', type=int, default=88) 78 | parser.add_argument( '--n_epochs', type=int, default=1000) 79 | parser.add_argument( 80 | '--penalty_coeff', type=float, default=-1.0) 81 | parser.add_argument( 82 | '--emb_size', type=int, default=16) 83 | parser.add_argument( 84 | '--model_suffix', type=int, default=-1) 85 | parser.add_argument('--loaded_date', type=str, default='') 86 | parser.add_argument('--loaded_task_name', type=str, default='') 87 | parser.add_argument('--not_inherit_hp', action='store_false') 88 | parser.add_argument('--maple_200', action='store_true') 89 | parser.add_argument('--custom_config', action='store_true') 90 | parser.add_argument('--retrain_model', action='store_true') 91 | 92 | if allow_policy_list: 93 | parser.add_argument( 94 | '--policy', 95 | type=str, 96 | nargs='+', 97 | choices=('gaussian', ), 98 | default='gaussian') 99 | else: 100 | parser.add_argument( 101 | '--policy', 102 | type=str, 103 | choices=('gaussian', ), 104 | default='gaussian') 105 | 106 | 107 | 108 | return parser 109 | 110 | 111 | -------------------------------------------------------------------------------- /softlearning/policies/base_policy.py: -------------------------------------------------------------------------------- 1 | from contextlib import contextmanager 2 | from collections import OrderedDict 3 | 4 | import numpy as np 5 | from serializable import Serializable 6 | 7 | 8 | class BasePolicy(Serializable): 9 | def __init__(self): 10 | self._deterministic = False 11 | 12 | def reset(self): 13 | """Reset and clean the policy.""" 14 | raise NotImplementedError 15 | 16 | def actions(self, conditions): 17 | """Compute (symbolic) actions given conditions (observations)""" 18 | raise NotImplementedError 19 | 20 | def log_pis(self, conditions, actions): 21 | """Compute (symbolic) log probs for given observations and actions.""" 22 | raise NotImplementedError 23 | 24 | def actions_np(self, conditions): 25 | """Compute (numeric) actions given conditions (observations)""" 26 | raise NotImplementedError 27 | 28 | def log_pis_np(self, conditions, actions): 29 | """Compute (numeric) log probs for given observations and actions.""" 30 | raise NotImplementedError 31 | 32 | @contextmanager 33 | def set_deterministic(self, deterministic=True): 34 | """Context manager for changing the determinism of the policy. 35 | Args: 36 | set_deterministic (`bool`): Value to set the self._is_deterministic 37 | to during the context. The value will be reset back to the 38 | previous value when the context exits. 39 | """ 40 | was_deterministic = self._deterministic 41 | self._deterministic = deterministic 42 | yield 43 | self._deterministic = was_deterministic 44 | 45 | def get_diagnostics(self, conditions): 46 | """Return diagnostic information of the policy. 47 | 48 | Arguments: 49 | conditions: Observations to run the diagnostics for. 50 | Returns: 51 | diagnostics: OrderedDict of diagnostic information. 
52 | """ 53 | diagnostics = OrderedDict({}) 54 | return diagnostics 55 | 56 | def __getstate__(self): 57 | state = Serializable.__getstate__(self) 58 | state['pickled_weights'] = self.get_weights() 59 | 60 | return state 61 | 62 | def __setstate__(self, state): 63 | Serializable.__setstate__(self, state) 64 | self.set_weights(state['pickled_weights']) 65 | 66 | 67 | class LatentSpacePolicy(BasePolicy): 68 | def __init__(self, *args, smoothing_coefficient=None, **kwargs): 69 | super(LatentSpacePolicy, self).__init__(*args, **kwargs) 70 | 71 | assert smoothing_coefficient is None or 0 <= smoothing_coefficient <= 1 72 | self._smoothing_alpha = smoothing_coefficient or 0 73 | self._smoothing_beta = ( 74 | np.sqrt(1.0 - np.power(self._smoothing_alpha, 2.0)) 75 | / (1.0 - self._smoothing_alpha)) 76 | self._reset_smoothing_x() 77 | self._smooth_latents = False 78 | 79 | def _reset_smoothing_x(self): 80 | self._smoothing_x = np.zeros((1, *self._output_shape)) 81 | 82 | def actions_np(self, conditions): 83 | if self._deterministic: 84 | return self.deterministic_actions_model.predict(conditions) 85 | elif self._smoothing_alpha == 0: 86 | return self.actions_model.predict(conditions) 87 | else: 88 | alpha, beta = self._smoothing_alpha, self._smoothing_beta 89 | raw_latents = self.latents_model.predict(conditions) 90 | self._smoothing_x = ( 91 | alpha * self._smoothing_x + (1.0 - alpha) * raw_latents) 92 | latents = beta * self._smoothing_x 93 | 94 | return self.actions_model_for_fixed_latents.predict( 95 | [*conditions, latents]) 96 | 97 | def reset(self): 98 | self._reset_smoothing_x() 99 | -------------------------------------------------------------------------------- /softlearning/samplers/explore_sampler.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import numpy as np 4 | 5 | from .base_sampler import BaseSampler 6 | 7 | 8 | class ExploreSampler(BaseSampler): 9 | def __init__(self, **kwargs): 10 | super(ExploreSampler, self).__init__(**kwargs) 11 | 12 | self._path_length = 0 13 | self._path_return = 0 14 | self._current_path = defaultdict(list) 15 | self._last_path_return = 0 16 | self._max_path_return = -np.inf 17 | self._n_episodes = 0 18 | self._current_observation = None 19 | self._total_samples = 0 20 | 21 | def _process_observations(self, 22 | observation, 23 | action, 24 | reward, 25 | terminal, 26 | next_observation, 27 | info): 28 | processed_observation = { 29 | 'observations': observation, 30 | 'actions': action, 31 | 'rewards': [reward], 32 | 'terminals': [terminal], 33 | 'next_observations': next_observation, 34 | 'infos': info, 35 | } 36 | 37 | return processed_observation 38 | 39 | def sample(self): 40 | if self._current_observation is None: 41 | self._current_observation = self.env.reset() 42 | self._s0 = self.env.unwrapped.state_vector() 43 | 44 | action = self.policy.actions_np([ 45 | self.env.convert_to_active_observation( 46 | self._current_observation)[None] 47 | ])[0] 48 | 49 | next_observation, reward, terminal, info = self.env.step(action) 50 | self._path_length += 1 51 | self._path_return += reward 52 | self._total_samples += 1 53 | 54 | processed_sample = self._process_observations( 55 | observation=self._current_observation, 56 | action=action, 57 | reward=reward, 58 | terminal=terminal, 59 | next_observation=next_observation, 60 | info=info, 61 | ) 62 | 63 | for key, value in processed_sample.items(): 64 | self._current_path[key].append(value) 65 | 66 | if terminal or 
self._path_length >= self._max_path_length: 67 | last_path = { 68 | field_name: np.array(values) 69 | for field_name, values in self._current_path.items() 70 | } 71 | self.pool.add_path(last_path) 72 | self._last_n_paths.appendleft(last_path) 73 | 74 | self._max_path_return = max(self._max_path_return, 75 | self._path_return) 76 | self._last_path_return = self._path_return 77 | 78 | self.policy.reset() 79 | self._current_observation = None 80 | self._path_length = 0 81 | self._path_return = 0 82 | self._current_path = defaultdict(list) 83 | 84 | self._n_episodes += 1 85 | else: 86 | self._current_observation = next_observation 87 | 88 | return next_observation, reward, terminal, info 89 | 90 | def random_batch(self, batch_size=None, **kwargs): 91 | batch_size = batch_size or self._batch_size 92 | observation_keys = getattr(self.env, 'observation_keys', None) 93 | 94 | return self.pool.random_batch( 95 | batch_size, observation_keys=observation_keys, **kwargs) 96 | 97 | def get_diagnostics(self): 98 | diagnostics = super(ExploreSampler, self).get_diagnostics() 99 | diagnostics.update({ 100 | 'max-path-return': self._max_path_return, 101 | 'last-path-return': self._last_path_return, 102 | 'episodes': self._n_episodes, 103 | 'total-samples': self._total_samples, 104 | }) 105 | 106 | return diagnostics 107 | -------------------------------------------------------------------------------- /softlearning/samplers/remote_sampler.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | from collections import OrderedDict 3 | 4 | import ray 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | 9 | from .base_sampler import BaseSampler 10 | from .utils import rollout 11 | 12 | 13 | class RemoteSampler(BaseSampler): 14 | def __init__(self, **kwargs): 15 | super(RemoteSampler, self).__init__(**kwargs) 16 | 17 | self._remote_environment = None 18 | self._remote_path = None 19 | self._n_episodes = 0 20 | self._total_samples = 0 21 | self._last_path_return = 0 22 | self._max_path_return = -np.inf 23 | 24 | def _create_remote_environment(self, env, policy): 25 | env_pkl = pickle.dumps(env) 26 | policy_pkl = pickle.dumps(policy) 27 | 28 | if not ray.is_initialized(): 29 | ray.init() 30 | 31 | self._remote_environment = _RemoteEnv.remote(env_pkl, policy_pkl) 32 | 33 | # Block until the env and policy are ready 34 | initialized = ray.get(self._remote_environment.initialized.remote()) 35 | assert initialized, initialized 36 | 37 | def initialize(self, env, policy, pool): 38 | super(RemoteSampler, self).initialize(env, policy, pool) 39 | self._create_remote_environment(env, policy) 40 | 41 | def wait_for_path(self, timeout=1): 42 | if self._remote_path is None: 43 | return [True] 44 | 45 | path_ready, _ = ray.wait([self._remote_path], timeout=timeout) 46 | return path_ready 47 | 48 | def sample(self, timeout=0): 49 | if self._remote_path is None: 50 | policy_params = self.policy.get_weights() 51 | self._remote_path = self._remote_environment.rollout.remote( 52 | policy_params, self._max_path_length) 53 | 54 | path_ready = self.wait_for_path(timeout=timeout) 55 | 56 | if len(path_ready) or not self.batch_ready(): 57 | path = ray.get(self._remote_path) 58 | self._last_n_paths.appendleft(path) 59 | 60 | self.pool.add_path(path) 61 | 62 | self._remote_path = None 63 | self._total_samples += len(path['observations']) 64 | self._last_path_return = np.sum(path['rewards']) 65 | self._max_path_return = max(self._max_path_return, 66 | self._last_path_return) 67 |
self._n_episodes += 1 68 | 69 | def get_diagnostics(self): 70 | diagnostics = OrderedDict({ 71 | 'max-path-return': self._max_path_return, 72 | 'last-path-return': self._last_path_return, 73 | 'pool-size': self.pool.size, 74 | 'episodes': self._n_episodes, 75 | 'total-samples': self._total_samples, 76 | }) 77 | 78 | return diagnostics 79 | 80 | def __getstate__(self): 81 | super_state = super(RemoteSampler, self).__getstate__() 82 | state = { 83 | key: value for key, value in super_state.items() 84 | if key not in ('_remote_environment', '_remote_path') 85 | } 86 | 87 | return state 88 | 89 | def __setstate__(self, state): 90 | super(RemoteSampler, self).__setstate__(state) 91 | self._create_remote_environment(self.env, self.policy) 92 | self._remote_path = None 93 | 94 | 95 | @ray.remote 96 | class _RemoteEnv(object): 97 | def __init__(self, env_pkl, policy_pkl): 98 | self._session = tf.keras.backend.get_session() 99 | self._session.run(tf.global_variables_initializer()) 100 | 101 | self._env = pickle.loads(env_pkl) 102 | self._policy = pickle.loads(policy_pkl) 103 | 104 | if hasattr(self._env, 'initialize'): 105 | self._env.initialize() 106 | 107 | self._initialized = True 108 | 109 | def initialized(self): 110 | return self._initialized 111 | 112 | def rollout(self, policy_weights, path_length): 113 | self._policy.set_weights(policy_weights) 114 | path = rollout(self._env, self._policy, path_length) 115 | 116 | return path 117 | -------------------------------------------------------------------------------- /maple/models/utils.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from __future__ import print_function 3 | from __future__ import absolute_import 4 | 5 | import tensorflow as tf 6 | import numpy as np 7 | 8 | def get_required_argument(dotmap, key, message, default=None): 9 | val = dotmap.get(key, default) 10 | if val is default: 11 | raise ValueError(message) 12 | return val 13 | 14 | def spectral_norm(w, iteration=1): 15 | w_shape = w.shape.as_list() 16 | w = tf.reshape(w, [-1, w_shape[-1]]) 17 | 18 | u = tf.get_variable("u", [1, w_shape[-1]], initializer=tf.random_normal_initializer(), trainable=False) 19 | 20 | u_hat = u 21 | v_hat = None 22 | for i in range(iteration): 23 | """ 24 | power iteration 25 | Usually iteration = 1 will be enough 26 | """ 27 | v_ = tf.matmul(u_hat, tf.transpose(w)) 28 | v_hat = tf.nn.l2_normalize(v_) 29 | 30 | u_ = tf.matmul(v_hat, w) 31 | u_hat = tf.nn.l2_normalize(u_) 32 | 33 | u_hat = tf.stop_gradient(u_hat) 34 | v_hat = tf.stop_gradient(v_hat) 35 | 36 | sigma = tf.matmul(tf.matmul(v_hat, w), tf.transpose(u_hat)) 37 | 38 | with tf.control_dependencies([u.assign(u_hat)]): 39 | w_norm = w / sigma 40 | w_norm = tf.reshape(w_norm, w_shape) 41 | 42 | 43 | return w_norm, u 44 | 45 | class TensorStandardScaler: 46 | """Helper class for automatically normalizing inputs into the network. 47 | """ 48 | def __init__(self, x_dim): 49 | """Initializes a scaler. 50 | 51 | Arguments: 52 | x_dim (int): The dimensionality of the inputs into the scaler. 53 | 54 | Returns: None. 
55 | """ 56 | self.fitted = False 57 | with tf.variable_scope("Scaler"): 58 | self.mu = tf.get_variable( 59 | name="scaler_mu", shape=[1, x_dim], initializer=tf.constant_initializer(0.0), 60 | trainable=False 61 | ) 62 | self.sigma = tf.get_variable( 63 | name="scaler_std", shape=[1, x_dim], initializer=tf.constant_initializer(1.0), 64 | trainable=False 65 | ) 66 | 67 | self.cached_mu, self.cached_sigma = np.zeros([0, x_dim]), np.ones([1, x_dim]) 68 | 69 | def fit(self, data): 70 | """Runs two ops, one for assigning the mean of the data to the internal mean, and 71 | another for assigning the standard deviation of the data to the internal standard deviation. 72 | This function must be called within a 'with .as_default()' block. 73 | 74 | Arguments: 75 | data (np.ndarray): A numpy array containing the input 76 | 77 | Returns: None. 78 | """ 79 | mu = np.mean(data, axis=0, keepdims=True) 80 | sigma = np.std(data, axis=0, keepdims=True) 81 | sigma[sigma < 1e-12] = 1.0 82 | 83 | self.mu.load(mu) 84 | self.sigma.load(sigma) 85 | self.fitted = True 86 | self.cache() 87 | 88 | def transform(self, data): 89 | """Transforms the input matrix data using the parameters of this scaler. 90 | 91 | Arguments: 92 | data (np.array): A numpy array containing the points to be transformed. 93 | 94 | Returns: (np.array) The transformed dataset. 95 | """ 96 | return (data - self.mu) / self.sigma 97 | 98 | def inverse_transform(self, data): 99 | """Undoes the transformation performed by this scaler. 100 | 101 | Arguments: 102 | data (np.array): A numpy array containing the points to be transformed. 103 | 104 | Returns: (np.array) The transformed dataset. 105 | """ 106 | return self.sigma * data + self.mu 107 | 108 | def get_vars(self): 109 | """Returns a list of variables managed by this object. 110 | 111 | Returns: (list) The list of variables. 112 | """ 113 | return [self.mu, self.sigma] 114 | 115 | def cache(self): 116 | """Caches current values of this scaler. 117 | 118 | Returns: None. 119 | """ 120 | self.cached_mu = self.mu.eval() 121 | self.cached_sigma = self.sigma.eval() 122 | 123 | def load_cache(self): 124 | """Loads values from the cache 125 | Returns: None. 
126 | """ 127 | self.mu.load(self.cached_mu) 128 | self.sigma.load(self.cached_sigma) 129 | 130 | -------------------------------------------------------------------------------- /softlearning/misc/utils.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import datetime 3 | import os 4 | import random 5 | 6 | import tensorflow as tf 7 | import numpy as np 8 | 9 | 10 | PROJECT_PATH = os.path.dirname( 11 | os.path.realpath(os.path.join(__file__, '..', '..'))) 12 | 13 | 14 | DEFAULT_SNAPSHOT_MODE = 'none' 15 | DEFAULT_SNAPSHOT_GAP = 1000 16 | 17 | 18 | def initialize_tf_variables(session, only_uninitialized=True): 19 | variables = tf.global_variables() + tf.local_variables() 20 | 21 | def is_initialized(variable): 22 | try: 23 | session.run(variable) 24 | return True 25 | except tf.errors.FailedPreconditionError: 26 | return False 27 | 28 | return False 29 | 30 | if only_uninitialized: 31 | variables = [ 32 | variable for variable in variables 33 | if not is_initialized(variable) 34 | ] 35 | 36 | session.run(tf.variables_initializer(variables)) 37 | 38 | 39 | def set_seed(seed): 40 | seed %= 4294967294 41 | random.seed(seed) 42 | np.random.seed(seed) 43 | tf.set_random_seed(seed) 44 | print("Using seed {}".format(seed)) 45 | 46 | 47 | def datetimestamp(divider='-', datetime_divider='T'): 48 | now = datetime.datetime.now() 49 | return now.strftime( 50 | '%Y{d}%m{d}%dT%H{d}%M{d}%S' 51 | ''.format(d=divider, dtd=datetime_divider)) 52 | 53 | 54 | def datestamp(divider='-'): 55 | return datetime.date.today().isoformat().replace('-', divider) 56 | 57 | 58 | def timestamp(divider='-'): 59 | now = datetime.datetime.now() 60 | time_now = datetime.datetime.time(now) 61 | return time_now.strftime( 62 | '%H{d}%M{d}%S'.format(d=divider)) 63 | 64 | 65 | def concat_obs_z(obs, z, num_skills): 66 | """Concatenates the observation to a one-hot encoding of Z.""" 67 | assert np.isscalar(z) 68 | z_one_hot = np.zeros(num_skills) 69 | z_one_hot[z] = 1 70 | return np.hstack([obs, z_one_hot]) 71 | 72 | 73 | def split_aug_obs(aug_obs, num_skills): 74 | """Splits an augmented observation into the observation and Z.""" 75 | (obs, z_one_hot) = (aug_obs[:-num_skills], aug_obs[-num_skills:]) 76 | z = np.where(z_one_hot == 1)[0][0] 77 | return (obs, z) 78 | 79 | 80 | def _make_dir(filename): 81 | folder = os.path.dirname(filename) 82 | if not os.path.exists(folder): 83 | os.makedirs(folder) 84 | 85 | 86 | def save_video(video_frames, filename): 87 | import cv2 88 | _make_dir(filename) 89 | 90 | video_frames = np.flip(video_frames, axis=-1) 91 | 92 | # Define the codec and create VideoWriter object 93 | fourcc = cv2.VideoWriter_fourcc(*'MJPG') 94 | fps = 30.0 95 | (height, width, _) = video_frames[0].shape 96 | writer = cv2.VideoWriter(filename, fourcc, fps, (width, height)) 97 | for video_frame in video_frames: 98 | writer.write(video_frame) 99 | writer.release() 100 | 101 | 102 | def deep_update(d, *us): 103 | d = d.copy() 104 | 105 | for u in us: 106 | u = u.copy() 107 | for k, v in u.items(): 108 | d[k] = ( 109 | deep_update(d.get(k, {}), v) 110 | if isinstance(v, collections.Mapping) 111 | else v) 112 | 113 | return d 114 | 115 | 116 | def get_git_rev(): 117 | try: 118 | import git 119 | except ImportError: 120 | print( 121 | "Warning: gitpython not installed." 122 | " Unable to log git rev." 
123 | " Run `pip install gitpython` if you want git revs to be logged.") 124 | return None 125 | 126 | try: 127 | repo = git.Repo(os.getcwd()) 128 | git_rev = repo.active_branch.commit.name_rev 129 | except TypeError: 130 | git_rev = repo.head.object.name_rev 131 | 132 | return git_rev 133 | 134 | 135 | def flatten(unflattened, parent_key='', separator='.'): 136 | items = [] 137 | for k, v in unflattened.items(): 138 | if separator in k: 139 | raise ValueError( 140 | "Found separator ({}) from key ({})".format(separator, k)) 141 | new_key = parent_key + separator + k if parent_key else k 142 | if isinstance(v, collections.MutableMapping) and v: 143 | items.extend(flatten(v, new_key, separator=separator).items()) 144 | else: 145 | items.append((new_key, v)) 146 | 147 | return dict(items) 148 | 149 | 150 | def unflatten(flattened, separator='.'): 151 | result = {} 152 | for key, value in flattened.items(): 153 | parts = key.split(separator) 154 | d = result 155 | for part in parts[:-1]: 156 | if part not in d: 157 | d[part] = {} 158 | d = d[part] 159 | d[parts[-1]] = value 160 | 161 | return result 162 | -------------------------------------------------------------------------------- /softlearning/samplers/simple_sampler.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | 3 | import numpy as np 4 | 5 | from .base_sampler import BaseSampler 6 | 7 | 8 | class SimpleSampler(BaseSampler): 9 | def __init__(self, **kwargs): 10 | super(SimpleSampler, self).__init__(**kwargs) 11 | 12 | self._path_length = 0 13 | self._path_return = 0 14 | self._current_path = defaultdict(list) 15 | self._last_path_return = 0 16 | self._max_path_return = -np.inf 17 | self._n_episodes = 0 18 | self._current_observation = None 19 | self._total_samples = 0 20 | 21 | def initialize(self, env, policy, pool): 22 | super(SimpleSampler, self).initialize(env, policy, pool) 23 | self.get_action = self.policy[0] 24 | self.make_init_hidden = self.policy[1] 25 | self.hidden = self.make_init_hidden() 26 | 27 | def _process_observations(self, 28 | observation, 29 | action, 30 | last_action, 31 | reward, 32 | terminal, 33 | next_observation, 34 | info): 35 | processed_observation = { 36 | 'observations': observation, 37 | 'actions': action, 38 | 'last_actions': last_action, 39 | 'rewards': [reward], 40 | 'terminals': [terminal], 41 | 'next_observations': next_observation, 42 | 'valid': [1], 43 | 'infos': info, 44 | } 45 | 46 | return processed_observation 47 | 48 | def sample(self): 49 | if self._current_observation is None: 50 | self._current_observation = self.env.reset() 51 | #### EDIT 52 | if hasattr(self.env.unwrapped, "state_vector"): 53 | self._reset_state_vector = self.env.unwrapped.state_vector() 54 | #### 55 | lst_action = self.hidden[1] 56 | action, self.hidden = self.get_action(self.env.convert_to_active_observation( 57 | self._current_observation)[None], self.hidden) 58 | action = action[0] 59 | # print(action.shape) 60 | next_observation, reward, terminal, info = self.env.step(action) 61 | self._path_length += 1 62 | self._path_return += reward 63 | self._total_samples += 1 64 | # print(lst_action.shape, lst_action.squeeze(1).shape, action.shape) 65 | processed_sample = self._process_observations( 66 | observation=self._current_observation, 67 | action=action, 68 | reward=reward, 69 | terminal=terminal, 70 | next_observation=next_observation, 71 | last_action=lst_action.squeeze(1).squeeze(0), 72 | info=info, 73 | ) 74 | 75 | for key, value in 
processed_sample.items(): 76 | self._current_path[key].append(value) 77 | 78 | if terminal or self._path_length >= self._max_path_length: 79 | last_path = { 80 | field_name: np.array(values) 81 | for field_name, values in self._current_path.items() 82 | } 83 | ######## this function is siginificant for replaybuffer 84 | self.pool.add_path(last_path) 85 | self._last_n_paths.appendleft(last_path) 86 | 87 | self._max_path_return = max(self._max_path_return, 88 | self._path_return) 89 | self._last_path_return = self._path_return 90 | 91 | self.reset_policy() 92 | self._current_observation = None 93 | self._path_length = 0 94 | self._path_return = 0 95 | self._current_path = defaultdict(list) 96 | 97 | self._n_episodes += 1 98 | 99 | else: 100 | self._current_observation = next_observation 101 | 102 | return next_observation, reward, terminal, info 103 | 104 | def reset_policy(self): 105 | self.hidden = self.make_init_hidden(1) 106 | 107 | def random_batch(self, batch_size=None, **kwargs): 108 | batch_size = batch_size or self._batch_size 109 | observation_keys = getattr(self.env, 'observation_keys', None) 110 | 111 | return self.pool.random_batch( 112 | batch_size, observation_keys=observation_keys, **kwargs) 113 | 114 | def get_diagnostics(self): 115 | diagnostics = super(SimpleSampler, self).get_diagnostics() 116 | diagnostics.update({ 117 | 'max-path-return': self._max_path_return, 118 | 'last-path-return': self._last_path_return, 119 | 'episodes': self._n_episodes, 120 | 'total-samples': self._total_samples, 121 | }) 122 | 123 | return diagnostics 124 | -------------------------------------------------------------------------------- /maple/utils/visualization.py: -------------------------------------------------------------------------------- 1 | import io 2 | import math 3 | import numpy as np 4 | import cv2 5 | import matplotlib 6 | matplotlib.use('Agg') 7 | import matplotlib.pyplot as plt 8 | 9 | 10 | def plot_trajectories(writer, label, epoch, env_traj, model_traj, means, stds): 11 | state_dim = env_traj[0].size 12 | model_states = [[obs[s] for obs in model_traj] for s in range(state_dim)] 13 | env_states = [[obs[s] for obs in env_traj ] for s in range(state_dim)] 14 | 15 | means = [np.array([mean[s] for mean in means]) for s in range(state_dim)] 16 | stds = [np.array([std[s] for std in stds]) for s in range(state_dim)] 17 | 18 | cols = 1 19 | rows = math.ceil(state_dim / cols) 20 | 21 | plt.clf() 22 | fig, axes = plt.subplots(rows, cols, figsize = (9*cols, 3*rows)) 23 | axes = axes.ravel() 24 | 25 | for i in range(state_dim): 26 | ax = axes[i] 27 | X = range(len(model_states[i])) 28 | 29 | ax.fill_between(X, means[i]+stds[i], means[i]-stds[i], color='r', alpha=0.5) 30 | ax.plot(env_states[i], color='k') 31 | ax.plot(model_states[i], color='b') 32 | ax.plot(means[i], color='r') 33 | 34 | if i == 0: 35 | ax.set_title('reward') 36 | elif i == 1: 37 | ax.set_title('terminal') 38 | else: 39 | ax.set_title('state dim {}'.format(i-2)) 40 | plt.tight_layout() 41 | 42 | buf = io.BytesIO() 43 | plt.savefig(buf, format='png', layout = 'tight') 44 | buf.seek(0) 45 | 46 | img = cv2.imdecode(np.fromstring(buf.getvalue(), dtype=np.uint8), -1) 47 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 48 | img = img.transpose(2,0,1) / 255. 
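# The buffer above holds the matplotlib figure encoded as PNG bytes; cv2.imdecode decodes it into an
# HWC image in OpenCV's BGR channel order, which is then converted to RGB, transposed to channels-first
# (CHW), and scaled to [0, 1] before being logged with writer.add_image below.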
49 | 50 | writer.add_image(label, img, epoch) 51 | 52 | plt.close() 53 | 54 | 55 | ''' 56 | writer video : [ batch x channels x timesteps x height x width ] 57 | ''' 58 | def record_trajectories(writer, label, epoch, env_images, model_images=None): 59 | traj_length = len(env_images) 60 | if model_images is not None: 61 | assert len(env_images) == len(model_images) 62 | images = [np.concatenate((env_img, model_img)) for (env_img, model_img) in zip(env_images, model_images)] 63 | else: 64 | images = env_images 65 | 66 | ## [ traj_length, 2 * H, W, C ] 67 | images = np.array(images) 68 | images = torch.Tensor(images)  # FIXME: requires `import torch`, which is missing from the imports at the top of this file 69 | 70 | ## [ traj_length, C, 2 * H, W ] 71 | images = images.permute(0,3,1,2) 72 | ## [ B, traj_length, C, 2 * H, W ] 73 | images = images.unsqueeze(0) 74 | 75 | images = images / 255. 76 | images = images[:,:,0].unsqueeze(2) 77 | 78 | print('[ Visualization ] Saving to {}'.format(label)) 79 | fps = min(max(traj_length / 5, 2), 30) 80 | writer.add_video('video_' + label, images, epoch, fps = fps) 81 | 82 | 83 | def visualize_policy(real_env, fake_env, policy, writer, timestep, max_steps=100, focus=None, label='model_vis', img_dim=128): 84 | init_obs = real_env.reset() 85 | obs = init_obs.copy() 86 | 87 | observations_r = [obs] 88 | observations_f = [obs] 89 | rewards_r = [0] 90 | rewards_f = [0] 91 | terminals_r = [False] 92 | terminals_f = [False] 93 | means_f = [np.concatenate((np.zeros(2), obs))] 94 | stds_f = [np.concatenate((np.zeros(2), obs*0))] 95 | actions = [] 96 | 97 | i = 0 98 | term_r, term_f = False, False 99 | while not (term_r and term_f) and i <= max_steps: 100 | 101 | act = policy.actions_np(obs[None])[0] 102 | if not term_r: 103 | next_obs_r, rew_r, term_r, info_r = real_env.step(act) 104 | observations_r.append(next_obs_r) 105 | rewards_r.append(rew_r) 106 | terminals_r.append(term_r) 107 | 108 | if not term_f: 109 | next_obs_f, rew_f, term_f, info_f = fake_env.step(obs, act) 110 | observations_f.append(next_obs_f) 111 | rewards_f.append(rew_f) 112 | terminals_f.append(term_f) 113 | means_f.append(info_f['mean']) 114 | stds_f.append(info_f['std']) 115 | 116 | actions.append(act) 117 | 118 | if not term_f: 119 | obs = next_obs_f 120 | else: 121 | obs = next_obs_r 122 | 123 | i += 1 124 | 125 | terminals_r = np.array([terminals_r]).astype(np.uint8).T 126 | terminals_f = np.array([terminals_f]).astype(np.uint8).T 127 | rewards_r = np.array([rewards_r]).T 128 | rewards_f = np.array([rewards_f]).T 129 | 130 | rewards_observations_r = np.concatenate((rewards_r, terminals_r, np.array(observations_r)), -1) 131 | rewards_observations_f = np.concatenate((rewards_f, terminals_f, np.array(observations_f)), -1) 132 | plot_trajectories(writer, label, timestep, rewards_observations_r, rewards_observations_f, means_f, stds_f) 133 | # record_trajectories(writer, label, epoch, images_r)  # FIXME: disabled; `epoch` and `images_r` are not defined in this function (no image frames are collected above) 134 | 135 | -------------------------------------------------------------------------------- /maple/utils/logging.py: -------------------------------------------------------------------------------- 1 | import time 2 | import math 3 | 4 | class Progress: 5 | 6 | def __init__(self, total, name = 'Progress', ncol=3, max_length=20, indent=0, line_width=100, speed_update_freq=100): 7 | self.total = total 8 | self.name = name 9 | self.ncol = ncol 10 | self.max_length = max_length 11 | self.indent = indent 12 | self.line_width = line_width 13 | self._speed_update_freq = speed_update_freq 14 | self._speed = None 15 | 16 | self._step = 0 17 | self._prev_line = '\033[F' 18 | self._clear_line = ' ' *
self.line_width 19 | 20 | self._pbar_size = self.ncol * self.max_length 21 | self._complete_pbar = '#' * self._pbar_size 22 | self._incomplete_pbar = ' ' * self._pbar_size 23 | 24 | self.lines = [''] 25 | self.fraction = '{} / {}'.format(0, self.total) 26 | 27 | self.resume() 28 | 29 | 30 | def update(self, n=1): 31 | self._step += n 32 | if self._step % self._speed_update_freq == 0: 33 | self._time0 = time.time() 34 | self._step0 = self._step 35 | 36 | def resume(self): 37 | self._skip_lines = 1 38 | print('\n', end='') 39 | self._time0 = time.time() 40 | self._step0 = self._step 41 | 42 | def pause(self): 43 | self._clear() 44 | self._skip_lines = 1 45 | 46 | def set_description(self, params=[]): 47 | 48 | ############ 49 | # Position # 50 | ############ 51 | self._clear() 52 | 53 | ########### 54 | # Percent # 55 | ########### 56 | percent, fraction = self._format_percent(self._step, self.total) 57 | self.fraction = fraction 58 | 59 | ######### 60 | # Speed # 61 | ######### 62 | speed = self._format_speed(self._step) 63 | 64 | ########## 65 | # Params # 66 | ########## 67 | num_params = len(params) 68 | nrow = math.ceil(num_params / self.ncol) 69 | params_split = self._chunk(params, self.ncol) 70 | params_string, lines = self._format(params_split) 71 | self.lines = lines 72 | 73 | 74 | description = '{} | {}{}'.format(percent, speed, params_string) 75 | print(description) 76 | self._skip_lines = nrow + 1 77 | 78 | def append_description(self, descr): 79 | self.lines.append(descr) 80 | 81 | def _clear(self): 82 | position = self._prev_line * self._skip_lines 83 | empty = '\n'.join([self._clear_line for _ in range(self._skip_lines)]) 84 | print(position, end='') 85 | print(empty) 86 | print(position, end='') 87 | 88 | def _format_percent(self, n, total): 89 | if total: 90 | percent = n / float(total) 91 | 92 | complete_entries = int(percent * self._pbar_size) 93 | incomplete_entries = self._pbar_size - complete_entries 94 | 95 | pbar = self._complete_pbar[:complete_entries] + self._incomplete_pbar[:incomplete_entries] 96 | fraction = '{} / {}'.format(n, total) 97 | string = '{} [{}] {:3d}%'.format(fraction, pbar, int(percent*100)) 98 | else: 99 | fraction = '{}'.format(n) 100 | string = '{} iterations'.format(n) 101 | return string, fraction 102 | 103 | def _format_speed(self, n): 104 | num_steps = n - self._step0 105 | t = time.time() - self._time0 106 | speed = num_steps / t 107 | string = '{:.1f} Hz'.format(speed) 108 | if num_steps > 0: 109 | self._speed = string 110 | return string 111 | 112 | def _chunk(self, l, n): 113 | return [l[i:i+n] for i in range(0, len(l), n)] 114 | 115 | def _format(self, chunks): 116 | lines = [self._format_chunk(chunk) for chunk in chunks] 117 | lines.insert(0,'') 118 | padding = '\n' + ' '*self.indent 119 | string = padding.join(lines) 120 | return string, lines 121 | 122 | def _format_chunk(self, chunk): 123 | line = ' | '.join([self._format_param(param) for param in chunk]) 124 | return line 125 | 126 | def _format_param(self, param): 127 | k, v = param 128 | return '{} : {}'.format(k, v)[:self.max_length] 129 | 130 | def stamp(self): 131 | if self.lines != ['']: 132 | params = ' | '.join(self.lines) 133 | string = '[ {} ] {}{} | {}'.format(self.name, self.fraction, params, self._speed) 134 | self._clear() 135 | print(string, end='\n') 136 | self._skip_lines = 1 137 | else: 138 | self._clear() 139 | self._skip_lines = 0 140 | 141 | def close(self): 142 | self.pause() 143 | 144 | class Silent: 145 | 146 | def __init__(self, *args, **kwargs): 147 | pass 
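# Silent mirrors the Progress interface while doing nothing: any attribute access falls through to
# __getattr__ below, which returns a no-op callable, so calls like silent.update() or silent.stamp()
# are silently ignored.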
148 | 149 | def __getattr__(self, attr): 150 | return lambda *args: None 151 | 152 | 153 | if __name__ == '__main__': 154 | silent = Silent() 155 | silent.update() 156 | silent.stamp() 157 | 158 | num_steps = 1000 159 | progress = Progress(num_steps) 160 | for i in range(num_steps): 161 | progress.update() 162 | params = [ 163 | ['A', '{:06d}'.format(i)], 164 | ['B', '{:06d}'.format(i)], 165 | ['C', '{:06d}'.format(i)], 166 | ['D', '{:06d}'.format(i)], 167 | ['E', '{:06d}'.format(i)], 168 | ['F', '{:06d}'.format(i)], 169 | ['G', '{:06d}'.format(i)], 170 | ['H', '{:06d}'.format(i)], 171 | ] 172 | progress.set_description(params) 173 | time.sleep(0.01) 174 | progress.close() 175 | -------------------------------------------------------------------------------- /softlearning/environments/gym/mujoco/image_pusher_2d.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from softlearning.environments.helpers import random_point_in_circle 4 | from .pusher_2d import Pusher2dEnv 5 | 6 | 7 | class ImagePusher2dEnv(Pusher2dEnv): 8 | def __init__(self, image_shape, *args, **kwargs): 9 | self._Serializable__initialize(locals()) 10 | self.image_shape = image_shape 11 | Pusher2dEnv.__init__(self, *args, **kwargs) 12 | 13 | def _get_obs(self): 14 | width, height = self.image_shape[:2] 15 | image = self.render(mode='rgb_array', width=width, height=height) 16 | image = ((2.0 / 255.0) * image - 1.0) 17 | 18 | return np.concatenate([ 19 | image.reshape(-1), 20 | self.sim.data.qpos.flat[self.JOINT_INDS], 21 | self.sim.data.qvel.flat[self.JOINT_INDS], 22 | ]).reshape(-1) 23 | 24 | def step(self, action): 25 | """Step, computing reward from 'true' observations and not images.""" 26 | 27 | reward_observations = super(ImagePusher2dEnv, self)._get_obs() 28 | reward, info = self.compute_reward(reward_observations, action) 29 | 30 | self.do_simulation(action, self.frame_skip) 31 | 32 | observation = self._get_obs() 33 | done = False 34 | 35 | return observation, reward, done, info 36 | 37 | def viewer_setup(self): 38 | self.viewer.cam.trackbodyid = 0 39 | self.viewer.cam.lookat[:3] = [0, 0, 0] 40 | self.viewer.cam.distance = 3.5 41 | self.viewer.cam.elevation = -90 42 | self.viewer.cam.azimuth = 0 43 | self.viewer.cam.trackbodyid = -1 44 | 45 | 46 | class ImageForkReacher2dEnv(ImagePusher2dEnv): 47 | def __init__(self, 48 | arm_goal_distance_cost_coeff, 49 | arm_object_distance_cost_coeff, 50 | *args, 51 | **kwargs): 52 | self._Serializable__initialize(locals()) 53 | 54 | self._arm_goal_distance_cost_coeff = arm_goal_distance_cost_coeff 55 | self._arm_object_distance_cost_coeff = arm_object_distance_cost_coeff 56 | 57 | super(ImageForkReacher2dEnv, self).__init__(*args, **kwargs) 58 | 59 | def compute_reward(self, observations, actions): 60 | is_batch = True 61 | if observations.ndim == 1: 62 | observations = observations[None] 63 | actions = actions[None] 64 | is_batch = False 65 | else: 66 | raise NotImplementedError('Might be broken.') 67 | 68 | arm_pos = observations[:, -6:-4] 69 | goal_pos = self.get_body_com('goal')[:2][None] 70 | object_pos = observations[:, -3:-1] 71 | 72 | arm_goal_dists = np.linalg.norm(arm_pos - goal_pos, axis=1) 73 | arm_object_dists = np.linalg.norm(arm_pos - object_pos, axis=1) 74 | ctrl_costs = np.sum(actions**2, axis=1) 75 | 76 | costs = ( 77 | + self._arm_goal_distance_cost_coeff * arm_goal_dists 78 | + self._arm_object_distance_cost_coeff * arm_object_dists 79 | + self._ctrl_cost_coeff * ctrl_costs) 80 | 81 | rewards = 
-costs 82 | 83 | if not is_batch: 84 | rewards = rewards.squeeze() 85 | arm_goal_dists = arm_goal_dists.squeeze() 86 | arm_object_dists = arm_object_dists.squeeze() 87 | 88 | return rewards, { 89 | 'arm_goal_distance': arm_goal_dists, 90 | 'arm_object_distance': arm_object_dists, 91 | } 92 | 93 | def reset_model(self): 94 | qpos = np.random.uniform( 95 | low=-0.1, high=0.1, size=self.model.nq) + self.init_qpos.squeeze() 96 | 97 | # qpos[self.JOINT_INDS[0]] = np.random.uniform(-np.pi, np.pi) 98 | # qpos[self.JOINT_INDS[1]] = np.random.uniform( 99 | # -np.pi/2, np.pi/2) + np.pi/4 100 | # qpos[self.JOINT_INDS[2]] = np.random.uniform( 101 | # -np.pi/2, np.pi/2) + np.pi/2 102 | 103 | target_position = np.array(random_point_in_circle( 104 | angle_range=(0, 2*np.pi), radius=(0.6, 1.2))) 105 | target_position[1] += 1.0 106 | 107 | qpos[self.TARGET_INDS] = target_position 108 | # qpos[self.TARGET_INDS] = [1.0, 2.0] 109 | # qpos[self.TARGET_INDS] = self.init_qpos.squeeze()[self.TARGET_INDS] 110 | 111 | puck_position = np.random.uniform([-1.0], [1.0], size=[2]) 112 | puck_position = ( 113 | np.sign(puck_position) 114 | * np.maximum(np.abs(puck_position), 1/2)) 115 | puck_position[np.where(puck_position == 0)] = 1.0 116 | # puck_position[1] += 1.0 117 | # puck_position = np.random.uniform( 118 | # low=[0.3, -1.0], high=[1.0, -0.4]), 119 | 120 | qpos[self.PUCK_INDS] = puck_position 121 | 122 | qvel = self.init_qvel.copy().squeeze() 123 | qvel[self.PUCK_INDS] = 0 124 | qvel[self.TARGET_INDS] = 0 125 | 126 | # TODO: remnants from rllab -> gym conversion 127 | # qacc = np.zeros(self.sim.data.qacc.shape[0]) 128 | # ctrl = np.zeros(self.sim.data.ctrl.shape[0]) 129 | # full_state = np.concatenate((qpos, qvel, qacc, ctrl)) 130 | 131 | # super(Pusher2dEnv, self).reset(full_state) 132 | 133 | self.set_state(qpos, qvel) 134 | 135 | return self._get_obs() 136 | 137 | 138 | class BlindForkReacher2dEnv(ImageForkReacher2dEnv): 139 | def _get_obs(self): 140 | return np.concatenate([ 141 | self.sim.data.qpos.flat[self.JOINT_INDS], 142 | self.sim.data.qvel.flat[self.JOINT_INDS], 143 | ]).reshape(-1) 144 | -------------------------------------------------------------------------------- /softlearning/environments/gym/__init__.py: -------------------------------------------------------------------------------- 1 | """Custom Gym environments. 2 | 3 | Every class inside this module should extend a gym.Env class. The file 4 | structure should be similar to gym.envs file structure, e.g. if you're 5 | implementing a mujoco env, you would implement it under gym.mujoco submodule. 
6 | """ 7 | 8 | import gym 9 | import numpy as np 10 | 11 | 12 | CUSTOM_GYM_ENVIRONMENTS_PATH = __package__ 13 | MUJOCO_ENVIRONMENTS_PATH = '{}.mujoco'.format(CUSTOM_GYM_ENVIRONMENTS_PATH) 14 | 15 | MUJOCO_ENVIRONMENT_SPECS = ( 16 | { 17 | 'id': 'Swimmer-Parameterizable-v3', 18 | 'entry_point': ('gym.envs.mujoco.swimmer_v3:SwimmerEnv'), 19 | }, 20 | { 21 | 'id': 'Hopper-Parameterizable-v3', 22 | 'entry_point': ('gym.envs.mujoco.hopper_v3:HopperEnv'), 23 | }, 24 | { 25 | 'id': 'Walker2d-Parameterizable-v3', 26 | 'entry_point': ('gym.envs.mujoco.walker2d_v3:Walker2dEnv'), 27 | }, 28 | { 29 | 'id': 'HalfCheetah-Parameterizable-v3', 30 | 'entry_point': ('gym.envs.mujoco.half_cheetah_v3:HalfCheetahEnv'), 31 | }, 32 | { 33 | 'id': 'Ant-Parameterizable-v3', 34 | 'entry_point': ('gym.envs.mujoco.ant_v3:AntEnv'), 35 | }, 36 | { 37 | 'id': 'AntAngle-Parameterizable-v3', 38 | 'entry_point': ('gym.envs.mujoco.ant_v3_angle:AntEnv'), 39 | }, 40 | { 41 | 'id': 'Humanoid-Parameterizable-v3', 42 | 'entry_point': ('gym.envs.mujoco.humanoid_v3:HumanoidEnv'), 43 | }, 44 | { 45 | 'id': 'Pusher2d-Default-v0', 46 | 'entry_point': ('{}.pusher_2d:Pusher2dEnv'.format(MUJOCO_ENVIRONMENTS_PATH)), 47 | }, 48 | { 49 | 'id': 'Pusher2d-DefaultReach-v0', 50 | 'entry_point': ('{}.pusher_2d:ForkReacherEnv'.format(MUJOCO_ENVIRONMENTS_PATH)), 51 | }, 52 | { 53 | 'id': 'Pusher2d-ImageDefault-v0', 54 | 'entry_point': ('{}.image_pusher_2d:ImagePusher2dEnv'.format(MUJOCO_ENVIRONMENTS_PATH)), 55 | }, 56 | { 57 | 'id': 'Pusher2d-ImageReach-v0', 58 | 'entry_point': ('{}.image_pusher_2d:ImageForkReacher2dEnv'.format(MUJOCO_ENVIRONMENTS_PATH)), 59 | }, 60 | { 61 | 'id': 'Pusher2d-BlindReach-v0', 62 | 'entry_point': ('{}.image_pusher_2d:BlindForkReacher2dEnv'.format(MUJOCO_ENVIRONMENTS_PATH)), 63 | }, 64 | ) 65 | 66 | GENERAL_ENVIRONMENT_SPECS = ( 67 | { 68 | 'id': 'MultiGoal-Default-v0', 69 | 'entry_point': ('{}.multi_goal:MultiGoalEnv'.format(CUSTOM_GYM_ENVIRONMENTS_PATH)) 70 | }, 71 | ) 72 | 73 | MULTIWORLD_ENVIRONMENT_SPECS = ( 74 | { 75 | 'id': 'Point2DEnv-Default-v0', 76 | 'entry_point': 'multiworld.envs.pygame.point2d:Point2DEnv' 77 | }, 78 | { 79 | 'id': 'Point2DEnv-Wall-v0', 80 | 'entry_point': 'multiworld.envs.pygame.point2d:Point2DWallEnv' 81 | }, 82 | { 83 | 'id': 'Point2DEnv-Offline-v0', 84 | 'entry_point': 'multiworld.envs.pygame.point2d:Point2DEnv', 85 | 'kwargs': { 86 | 'initial_position' : np.array([-4, 0]), 87 | 'fixed_goal' : np.array([4, 0]), 88 | 'randomize_position_on_reset' : False 89 | } 90 | }, 91 | { 92 | 'id': 'Point2DWallEnv-Offline-v0', 93 | 'entry_point': 'multiworld.envs.pygame.point2d:Point2DWallEnv', 94 | 'kwargs': { 95 | 'wall_shape': "big-u", 96 | 'initial_position' : np.array([-4, 0]), 97 | 'fixed_goal' : np.array([4, 0]), 98 | 'randomize_position_on_reset' : False 99 | } 100 | }, 101 | ) 102 | 103 | MUJOCO_ENVIRONMENTS = tuple( 104 | environment_spec['id'] 105 | for environment_spec in MUJOCO_ENVIRONMENT_SPECS) 106 | 107 | 108 | GENERAL_ENVIRONMENTS = tuple( 109 | environment_spec['id'] 110 | for environment_spec in GENERAL_ENVIRONMENT_SPECS) 111 | 112 | 113 | MULTIWORLD_ENVIRONMENTS = tuple( 114 | environment_spec['id'] 115 | for environment_spec in MULTIWORLD_ENVIRONMENT_SPECS) 116 | 117 | GYM_ENVIRONMENTS = ( 118 | *MUJOCO_ENVIRONMENTS, 119 | *GENERAL_ENVIRONMENTS, 120 | *MULTIWORLD_ENVIRONMENTS, 121 | ) 122 | 123 | 124 | def register_mujoco_environments(): 125 | """Register softlearning mujoco environments.""" 126 | for mujoco_environment in MUJOCO_ENVIRONMENT_SPECS: 127 | 
gym.register(**mujoco_environment) 128 | 129 | gym_ids = tuple( 130 | environment_spec['id'] 131 | for environment_spec in MUJOCO_ENVIRONMENT_SPECS) 132 | 133 | return gym_ids 134 | 135 | 136 | def register_general_environments(): 137 | """Register gym environments that don't fall under a specific category.""" 138 | for general_environment in GENERAL_ENVIRONMENT_SPECS: 139 | gym.register(**general_environment) 140 | 141 | gym_ids = tuple( 142 | environment_spec['id'] 143 | for environment_spec in GENERAL_ENVIRONMENT_SPECS) 144 | 145 | return gym_ids 146 | 147 | 148 | def register_multiworld_environments(): 149 | """Register custom environments from multiworld package.""" 150 | for multiworld_environment in MULTIWORLD_ENVIRONMENT_SPECS: 151 | gym.register(**multiworld_environment) 152 | 153 | gym_ids = tuple( 154 | environment_spec['id'] 155 | for environment_spec in MULTIWORLD_ENVIRONMENT_SPECS) 156 | 157 | return gym_ids 158 | 159 | 160 | def register_environments(): 161 | registered_mujoco_environments = register_mujoco_environments() 162 | registered_general_environments = register_general_environments() 163 | registered_multiworld_environments = register_multiworld_environments() 164 | 165 | return ( 166 | *registered_mujoco_environments, 167 | *registered_general_environments, 168 | *registered_multiworld_environments, 169 | ) 170 | -------------------------------------------------------------------------------- /maple/env/assert/halfcheetah.xml: -------------------------------------------------------------------------------- 1 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /softlearning/replay_pools/trajectory_replay_pool.py: -------------------------------------------------------------------------------- 1 | from collections import deque 2 | import gzip 3 | import pickle 4 | from itertools import islice 5 | 6 | import numpy as np 7 | 8 | from softlearning.utils.numpy import softmax 9 | from .replay_pool import ReplayPool 10 | 11 | 12 | def random_int_with_variable_range(mins, maxs): 13 | result = np.floor(np.random.uniform(mins, maxs)).astype(int) 14 | return result 15 | 16 | 17 | class TrajectoryReplayPool(ReplayPool): 18 | def __init__(self, 19 | observation_space, 20 | action_space, 21 | max_size): 22 | super(TrajectoryReplayPool, self).__init__() 23 | 24 | max_size = int(max_size) 25 | self._max_size = max_size 26 | 27 | self._trajectories = deque(maxlen=max_size) 28 | self._trajectory_lengths = deque(maxlen=max_size) 29 | self._num_samples = 0 30 | self._trajectories_since_save = 0 31 | 32 | @property 33 | def num_trajectories(self): 34 | return len(self._trajectories) 35 | 36 | @property 37 | def size(self): 38 | return sum(self._trajectory_lengths) 39 | 40 | @property 41 | def num_samples(self): 42 | return self._num_samples 43 | 44 | def add_paths(self, trajectories): 45 | self._trajectories += trajectories 46 | self._trajectory_lengths += [ 47 | trajectory[next(iter(trajectory.keys()))].shape[0] 48 | for trajectory in trajectories 49 | ] 50 | self._trajectories_since_save += len(trajectories) 51 | 52 | def add_path(self, trajectory): 53 | self.add_paths([trajectory]) 54 | 55 | def add_sample(self, sample): 56 | raise NotImplementedError( 57 | "{} only supports adding full paths at once.".format(self.__class__.__name__)) 58 | 59 | def add_samples(self, samples): 60 | raise NotImplementedError( 61 | "{} only supports adding full paths at once.".format(self.__class__.__name__)) 62 | 63 | def 
batch_by_indices(self, 64 | episode_indices, 65 | step_indices, 66 | field_name_filter=None): 67 | assert len(episode_indices) == len(step_indices) 68 | 69 | batch_size = len(episode_indices) 70 | trajectories = [self._trajectories[i] for i in episode_indices] 71 | 72 | batch = { 73 | field_name: np.empty( 74 | (batch_size, *values.shape[1:]), dtype=values.dtype) 75 | for field_name, values in trajectories[0].items() 76 | } 77 | 78 | for i, episode in enumerate(trajectories): 79 | for field_name, episode_values in episode.items(): 80 | batch[field_name][i] = episode_values[step_indices[i]] 81 | 82 | return batch 83 | 84 | def random_batch(self, batch_size, *args, **kwargs): 85 | num_trajectories = len(self._trajectories) 86 | if num_trajectories < 1: 87 | return {} 88 | 89 | trajectory_lengths = np.array(self._trajectory_lengths) 90 | trajectory_weights = trajectory_lengths / np.sum(trajectory_lengths) 91 | trajectory_probabilities = softmax(trajectory_weights) 92 | 93 | trajectory_indices = np.random.choice( 94 | np.arange(num_trajectories), 95 | size=batch_size, 96 | replace=True, 97 | p=trajectory_probabilities) 98 | first_key = next(iter( 99 | self._trajectories[trajectory_indices[0]].keys())) 100 | trajectory_lengths = np.array([ 101 | self._trajectories[trajectory_index][first_key].shape[0] 102 | for trajectory_index in trajectory_indices 103 | ]) 104 | 105 | step_indices = random_int_with_variable_range( 106 | np.zeros_like(trajectory_lengths, dtype=np.int64), 107 | trajectory_lengths) 108 | 109 | batch = self.batch_by_indices(trajectory_indices, step_indices) 110 | 111 | return batch 112 | 113 | def last_n_batch(self, last_n, field_name_filter=None, **kwargs): 114 | num_trajectories = len(self._trajectories) 115 | if num_trajectories < 1: 116 | return {} 117 | 118 | trajectory_indices = [] 119 | step_indices = [] 120 | 121 | trajectory_lengths = 0 122 | for trajectory_index in range(num_trajectories-1, -1, -1): 123 | trajectory = self._trajectories[trajectory_index] 124 | trajectory_length = trajectory[list(trajectory.keys())[0]].shape[0] 125 | 126 | steps_from_this_episode = min(trajectory_length, last_n - trajectory_lengths) 127 | step_indices += list(range( 128 | trajectory_length-1, 129 | trajectory_length - steps_from_this_episode - 1, 130 | -1)) 131 | trajectory_indices += [trajectory_index] * steps_from_this_episode 132 | 133 | trajectory_lengths += trajectory_length 134 | 135 | if trajectory_lengths >= last_n: 136 | break 137 | 138 | trajectory_indices = trajectory_indices[::-1] 139 | step_indices = step_indices[::-1] 140 | 141 | batch = self.batch_by_indices(trajectory_indices, step_indices) 142 | 143 | return batch 144 | 145 | def save_latest_experience(self, pickle_path): 146 | # deque doesn't support direct slicing, thus need to use islice 147 | num_trajectories = self.num_trajectories 148 | start_index = max(num_trajectories - self._trajectories_since_save, 0) 149 | end_index = num_trajectories 150 | 151 | latest_trajectories = tuple(islice( 152 | self._trajectories, start_index, end_index)) 153 | 154 | with gzip.open(pickle_path, 'wb') as f: 155 | pickle.dump(latest_trajectories, f) 156 | 157 | self._trajectories_since_save = 0 158 | 159 | def load_experience(self, experience_path): 160 | with gzip.open(experience_path, 'rb') as f: 161 | latest_trajectories = pickle.load(f) 162 | 163 | self.add_paths(latest_trajectories) 164 | self._trajectories_since_save = 0 165 | -------------------------------------------------------------------------------- /run_scripts/main.py: 
-------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("../") 3 | import os 4 | from RLA.easy_log.tester import tester 5 | from utils import get_parser 6 | from maple.policy import maple 7 | from copy import deepcopy 8 | 9 | def get_params_from_file(filepath, params_name='params'): 10 | import importlib 11 | from dotmap import DotMap 12 | module = importlib.import_module(filepath) 13 | params = getattr(module, params_name) 14 | params = DotMap(params) 15 | return params 16 | 17 | 18 | def get_variant_spec(command_line_args): 19 | from base import get_variant_spec, get_task_spec 20 | params = get_params_from_file(command_line_args.config) 21 | variant_spec = get_variant_spec(command_line_args, params) 22 | print(variant_spec) 23 | if 'neorl' in command_line_args.config: 24 | variant_spec['environment_params']['training']['kwargs']['use_neorl'] = True 25 | else: 26 | variant_spec['environment_params']['training']['kwargs']['use_neorl'] = False 27 | for k,v in vars(command_line_args).items(): 28 | variant_spec[k] = v 29 | variant_spec['run_params']['seed'] = command_line_args.seed 30 | variant_spec = get_task_spec(variant_spec) 31 | return variant_spec 32 | 33 | 34 | 35 | import tensorflow as tf 36 | 37 | from softlearning.environments.utils import get_environment_from_params 38 | from softlearning.replay_pools.utils import get_replay_pool_from_variant 39 | from softlearning.samplers.utils import get_sampler_from_variant 40 | 41 | from softlearning.misc.utils import set_seed 42 | import copy 43 | import maple.policy.static as static 44 | 45 | 46 | def get_package_path(): 47 | return os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 48 | 49 | def main(): 50 | import sys 51 | example_args = get_parser().parse_args(sys.argv[1:]) 52 | 53 | variant_spec = get_variant_spec(example_args) 54 | # command_line_args = example_args 55 | print('vriant spec: {}'.format(variant_spec)) 56 | 57 | # if command_line_args.video_save_frequency is not None: 58 | # assert 'algorithm_params' in variant_spec 59 | # variant_spec['algorithm_params']['kwargs']['video_save_frequency'] = ( 60 | # command_line_args.video_save_frequency) 61 | 62 | variant = variant_spec 63 | # init 64 | set_seed(variant['run_params']['seed']) 65 | gpu_options = tf.GPUOptions(allow_growth=True) 66 | session = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) 67 | tester.set_hyper_param(**variant) 68 | tf.keras.backend.set_session(session) 69 | 70 | # build 71 | 72 | variant = copy.deepcopy(variant) 73 | # redundant code for compatibility to the older version. 
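# When --elite_num is not set (<= 0), the ensemble size is taken from --model_suffix: all
# model_suffix networks are used and the number of elites defaults to 5/7 of them
# (e.g., model_suffix=7 gives 5 elites, model_suffix=200 gives 142).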
74 | if variant['elite_num'] <= 0: 75 | variant['algorithm_params']['kwargs']['num_networks'] = int(variant['model_suffix']) 76 | variant['algorithm_params']['kwargs']['num_elites'] = int(int(variant['model_suffix']) / 7 * 5) 77 | 78 | if variant['loaded_task_name'] != '': 79 | from RLA import ExperimentLoader 80 | el = ExperimentLoader() 81 | el.config(task_name=variant['loaded_task_name'], 82 | record_date=variant['loaded_date'], root='../') 83 | args = el.import_hyper_parameters(hp_to_overwrite=['retrain_model']) 84 | tester.hyper_param = vars(args) 85 | tester.hyper_param['retrain_model'] = False 86 | tester.hyper_param['algorithm_params'].kwargs.model_load_dir = variant['algorithm_params'].kwargs.model_load_dir 87 | variant = copy.deepcopy(tester.hyper_param) 88 | else: 89 | el = None 90 | tester.add_record_param(['info', "model_suffix", "penalty_coeff", "length", 91 | 'maple_200', 'run_params.seed', 'penalty_clip']) 92 | tester.configure(task_name="v2_" + variant["config"], 93 | rla_config=os.path.join(get_package_path(), 'rla_config_mopo.yaml'), 94 | log_root=get_package_path()) 95 | tester.log_files_gen() 96 | tester.print_args() 97 | environment_params = variant['environment_params'] 98 | training_environment = (get_environment_from_params(environment_params['training'])) 99 | evaluation_environment = (get_environment_from_params(environment_params['evaluation'](variant)) 100 | if 'evaluation' in environment_params else training_environment) 101 | 102 | replay_pool = (get_replay_pool_from_variant(variant, training_environment)) 103 | sampler = get_sampler_from_variant(variant) 104 | 105 | 106 | #### get termination function 107 | domain = environment_params['training']['domain'] 108 | static_fns = static[domain.lower()] 109 | #### 110 | if variant['elite_num'] <= 0: 111 | variant['algorithm_params']['kwargs']['num_networks'] = int(variant['model_suffix']) 112 | variant['algorithm_params']['kwargs']['num_elites'] = int(int(variant['model_suffix']) / 7 * 5) 113 | # construct MAPLE parameters 114 | algorithm_params = variant['algorithm_params'] 115 | algorithm_kwargs = deepcopy(algorithm_params['kwargs']) 116 | exp_name = variant['algorithm_params']["exp_name"] 117 | retrain_model = variant['retrain_model'] 118 | exp_name = exp_name.replace('_', '-') 119 | if algorithm_kwargs['separate_mean_var']: 120 | exp_name += '_smv' 121 | algorithm_kwargs["model_name"] = exp_name + '_1_{}'.format(variant['model_suffix']) 122 | kwargs = algorithm_kwargs.toDict() 123 | 124 | kwargs['penalty_coeff'] = variant['penalty_coeff'] 125 | kwargs['penalty_clip'] = variant['penalty_clip'] 126 | kwargs['rollout_length'] = variant['length'] 127 | kwargs['seed'] = variant['run_params']['seed'] 128 | kwargs['retrain'] = retrain_model 129 | kwargs['network_kwargs']['embedding_size'] = variant['emb_size'] 130 | kwargs['n_epochs'] = variant['n_epochs'] 131 | kwargs['source'] = variant['config'].split('.')[-2] 132 | kwargs['training_environment'] = training_environment 133 | kwargs['evaluation_environment'] = evaluation_environment 134 | kwargs['pool'] = replay_pool 135 | kwargs['static_fns'] = static_fns 136 | kwargs['sampler'] = sampler # to be removed 137 | trainer = maple.MAPLE(**kwargs) 138 | if el is None: 139 | list(trainer.train()) 140 | else: 141 | trainer.vis(el) 142 | trainer.performance_ns(el) 143 | 144 | if __name__=='__main__': 145 | main() -------------------------------------------------------------------------------- /softlearning/environments/gym/multi_goal.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | 4 | from gym.utils import EzPickle 5 | from gym import spaces 6 | from gym.envs.mujoco.mujoco_env import MujocoEnv 7 | 8 | 9 | class MultiGoalEnv(MujocoEnv, EzPickle): 10 | """ 11 | Move a 2D point mass to one of the goal positions. Cost is the distance to 12 | the closest goal. 13 | 14 | State: position. 15 | Action: velocity. 16 | """ 17 | def __init__(self, 18 | goal_reward=10, 19 | actuation_cost_coeff=30.0, 20 | distance_cost_coeff=1.0, 21 | init_sigma=0.1): 22 | EzPickle.__init__(**locals()) 23 | 24 | self.dynamics = PointDynamics(dim=2, sigma=0) 25 | self.init_mu = np.zeros(2, dtype=np.float32) 26 | self.init_sigma = init_sigma 27 | self.goal_positions = np.array( 28 | ( 29 | (5, 0), 30 | (-5, 0), 31 | (0, 5), 32 | (0, -5) 33 | ), 34 | dtype=np.float32) 35 | self.goal_threshold = 1.0 36 | self.goal_reward = goal_reward 37 | self.action_cost_coeff = actuation_cost_coeff 38 | self.distance_cost_coeff = distance_cost_coeff 39 | self.xlim = (-7, 7) 40 | self.ylim = (-7, 7) 41 | self.vel_bound = 1. 42 | self.reset() 43 | self.observation = None 44 | 45 | self._ax = None 46 | self._env_lines = [] 47 | self.fixed_plots = None 48 | self.dynamic_plots = [] 49 | 50 | def reset(self): 51 | unclipped_observation = ( 52 | self.init_mu 53 | + self.init_sigma 54 | * np.random.normal(size=self.dynamics.s_dim)) 55 | self.observation = np.clip( 56 | unclipped_observation, 57 | self.observation_space.low, 58 | self.observation_space.high) 59 | return self.observation 60 | 61 | @property 62 | def observation_space(self): 63 | return spaces.Box( 64 | low=np.array((self.xlim[0], self.ylim[0])), 65 | high=np.array((self.xlim[1], self.ylim[1])), 66 | dtype=np.float32, 67 | shape=None) 68 | 69 | @property 70 | def action_space(self): 71 | return spaces.Box( 72 | low=-self.vel_bound, 73 | high=self.vel_bound, 74 | shape=(self.dynamics.a_dim, ), 75 | dtype=np.float32) 76 | 77 | def get_current_obs(self): 78 | return np.copy(self.observation) 79 | 80 | def step(self, action): 81 | action = action.ravel() 82 | 83 | action = np.clip( 84 | action, 85 | self.action_space.low, 86 | self.action_space.high).ravel() 87 | 88 | observation = self.dynamics.forward(self.observation, action) 89 | observation = np.clip( 90 | observation, 91 | self.observation_space.low, 92 | self.observation_space.high) 93 | 94 | reward = self.compute_reward(observation, action) 95 | dist_to_goal = np.amin([ 96 | np.linalg.norm(observation - goal_position) 97 | for goal_position in self.goal_positions 98 | ]) 99 | done = dist_to_goal < self.goal_threshold 100 | if done: 101 | reward += self.goal_reward 102 | 103 | self.observation = np.copy(observation) 104 | 105 | return observation, reward, done, {'pos': observation} 106 | 107 | def _init_plot(self): 108 | fig_env = plt.figure(figsize=(7, 7)) 109 | self._ax = fig_env.add_subplot(111) 110 | self._ax.axis('equal') 111 | 112 | self._env_lines = [] 113 | self._ax.set_xlim((-7, 7)) 114 | self._ax.set_ylim((-7, 7)) 115 | 116 | self._ax.set_title('Multigoal Environment') 117 | self._ax.set_xlabel('x') 118 | self._ax.set_ylabel('y') 119 | 120 | self._plot_position_cost(self._ax) 121 | 122 | def render_rollouts(self, paths=()): 123 | """Render for rendering the past rollouts of the environment.""" 124 | if self._ax is None: 125 | self._init_plot() 126 | 127 | # noinspection PyArgumentList 128 | [line.remove() for line in self._env_lines] 129 | 
self._env_lines = [] 130 | 131 | for path in paths: 132 | positions = np.stack([info['pos'] for info in path['infos']]) 133 | xx = positions[:, 0] 134 | yy = positions[:, 1] 135 | self._env_lines += self._ax.plot(xx, yy, 'b') 136 | 137 | plt.draw() 138 | plt.pause(0.01) 139 | 140 | def render(self, mode='human'): 141 | """Rendering the current state of the environment is a no-op; see render_rollouts.""" 142 | pass 143 | 144 | def compute_reward(self, observation, action): 145 | # penalize the squared norm of the action (control cost) 146 | # noinspection PyTypeChecker 147 | action_cost = np.sum(action ** 2) * self.action_cost_coeff 148 | 149 | # penalize squared distance to the closest goal 150 | cur_position = observation 151 | # noinspection PyTypeChecker 152 | goal_cost = self.distance_cost_coeff * np.amin([ 153 | np.sum((cur_position - goal_position) ** 2) 154 | for goal_position in self.goal_positions 155 | ]) 156 | 157 | # total cost: control cost plus distance-to-goal cost 158 | costs = [action_cost, goal_cost] 159 | reward = -np.sum(costs) 160 | return reward 161 | 162 | def _plot_position_cost(self, ax): 163 | delta = 0.01 164 | x_min, x_max = tuple(1.1 * np.array(self.xlim)) 165 | y_min, y_max = tuple(1.1 * np.array(self.ylim)) 166 | X, Y = np.meshgrid( 167 | np.arange(x_min, x_max, delta), 168 | np.arange(y_min, y_max, delta) 169 | ) 170 | goal_costs = np.amin([ 171 | (X - goal_x) ** 2 + (Y - goal_y) ** 2 172 | for goal_x, goal_y in self.goal_positions 173 | ], axis=0) 174 | costs = goal_costs 175 | 176 | contours = ax.contour(X, Y, costs, 20) 177 | ax.clabel(contours, inline=1, fontsize=10, fmt='%.0f') 178 | ax.set_xlim([x_min, x_max]) 179 | ax.set_ylim([y_min, y_max]) 180 | goal = ax.plot(self.goal_positions[:, 0], 181 | self.goal_positions[:, 1], 'ro') 182 | return [contours, goal] 183 | 184 | 185 | class PointDynamics(object): 186 | """ 187 | State: position. 188 | Action: velocity.
189 | """ 190 | def __init__(self, dim, sigma): 191 | self.dim = dim 192 | self.sigma = sigma 193 | self.s_dim = dim 194 | self.a_dim = dim 195 | 196 | def forward(self, state, action): 197 | mu_next = state + action 198 | state_next = mu_next + self.sigma * \ 199 | np.random.normal(size=self.s_dim) 200 | return state_next 201 | -------------------------------------------------------------------------------- /softlearning/environments/adapters/gym_adapter.py: -------------------------------------------------------------------------------- 1 | """Implements a GymAdapter that converts Gym envs into SoftlearningEnv.""" 2 | 3 | import numpy as np 4 | import copy 5 | import gym 6 | from gym import spaces, wrappers 7 | 8 | from .softlearning_env import SoftlearningEnv 9 | from softlearning.environments.gym import register_environments 10 | from softlearning.environments.gym.wrappers import NormalizeActionWrapper 11 | from collections import defaultdict, OrderedDict 12 | 13 | 14 | def parse_domain_task(gym_id): 15 | domain_task_parts = gym_id.split('-') 16 | domain = '-'.join(domain_task_parts[:1]) 17 | task = '-'.join(domain_task_parts[1:]) 18 | 19 | return domain, task 20 | 21 | 22 | CUSTOM_GYM_ENVIRONMENT_IDS = register_environments() 23 | CUSTOM_GYM_ENVIRONMENTS = defaultdict(list) 24 | 25 | for gym_id in CUSTOM_GYM_ENVIRONMENT_IDS: 26 | domain, task = parse_domain_task(gym_id) 27 | CUSTOM_GYM_ENVIRONMENTS[domain].append(task) 28 | 29 | CUSTOM_GYM_ENVIRONMENTS = dict(CUSTOM_GYM_ENVIRONMENTS) 30 | 31 | GYM_ENVIRONMENT_IDS = tuple(gym.envs.registry.env_specs.keys()) 32 | GYM_ENVIRONMENTS = defaultdict(list) 33 | 34 | 35 | for gym_id in GYM_ENVIRONMENT_IDS: 36 | domain, task = parse_domain_task(gym_id) 37 | GYM_ENVIRONMENTS[domain].append(task) 38 | 39 | GYM_ENVIRONMENTS = dict(GYM_ENVIRONMENTS) 40 | 41 | DEFAULT_OBSERVATION_KEY = 'observations' 42 | 43 | 44 | class GymAdapter(SoftlearningEnv): 45 | """Adapter that implements the SoftlearningEnv for Gym envs.""" 46 | 47 | def __init__(self, 48 | domain, 49 | task, 50 | *args, 51 | env=None, 52 | normalize=True, 53 | observation_keys=None, 54 | unwrap_time_limit=True, 55 | use_neorl=False, 56 | **kwargs): 57 | assert not args, ( 58 | "Gym environments don't support args. Use kwargs instead.") 59 | 60 | self.normalize = normalize 61 | self.observation_keys = observation_keys 62 | self.unwrap_time_limit = unwrap_time_limit 63 | 64 | super(GymAdapter, self).__init__(domain, task, *args, **kwargs) 65 | 66 | if env is None: 67 | assert (domain is not None and task is not None), (domain, task) 68 | env_id = "{}-{}".format(domain, task) 69 | if use_neorl: 70 | import neorl 71 | env = neorl.make(env_id) 72 | else: 73 | env = gym.envs.make(env_id, **kwargs) 74 | else: 75 | assert domain is None and task is None, (domain, task) 76 | 77 | if isinstance(env, wrappers.TimeLimit) and unwrap_time_limit: 78 | # Remove the TimeLimit wrapper that sets 'done = True' when 79 | # the time limit specified for each environment has been passed and 80 | # therefore the environment is not Markovian (terminal condition 81 | # depends on time rather than state). 
82 | env = env.env 83 | 84 | if normalize: 85 | env = NormalizeActionWrapper(env) 86 | 87 | self._env = env 88 | 89 | if isinstance(self._env.observation_space, spaces.Dict): 90 | dict_observation_space = self._env.observation_space 91 | self.observation_keys = ( 92 | observation_keys or (*self._env.observation_space.spaces.keys(), )) 93 | elif isinstance(self._env.observation_space, spaces.Box): 94 | dict_observation_space = spaces.Dict(OrderedDict(( 95 | (DEFAULT_OBSERVATION_KEY, self._env.observation_space), 96 | ))) 97 | self.observation_keys = (DEFAULT_OBSERVATION_KEY, ) 98 | 99 | self._observation_space = type(dict_observation_space)([ 100 | (name, copy.deepcopy(space)) 101 | for name, space in dict_observation_space.spaces.items() 102 | if name in self.observation_keys 103 | ]) 104 | 105 | @property 106 | def observation_space(self): 107 | observation_space = self._observation_space 108 | return observation_space 109 | 110 | @property 111 | def active_observation_shape(self): 112 | """Shape for the active observation based on observation_keys.""" 113 | # if not isinstance(self._env.observation_space, spaces.Dict): 114 | # return super(GymAdapter, self).active_observation_shape 115 | if not isinstance(self.observation_space, spaces.Dict): 116 | return super(GymAdapter, self).active_observation_shape 117 | 118 | observation_keys = ( 119 | self.observation_keys 120 | or list(self.observation_space.spaces.keys())) 121 | 122 | active_size = sum( 123 | np.prod(self.observation_space.spaces[key].shape) 124 | for key in observation_keys) 125 | 126 | active_observation_shape = (active_size, ) 127 | 128 | return active_observation_shape 129 | 130 | def convert_to_active_observation(self, observation): 131 | # if not isinstance(self._env.observation_space, spaces.Dict): 132 | # return observation 133 | if not isinstance(self.observation_space, spaces.Dict): 134 | return observation 135 | 136 | observation_keys = ( 137 | self.observation_keys 138 | or list(self.observation_space.spaces.keys())) 139 | 140 | observation = np.concatenate([ 141 | observation[key] for key in observation_keys 142 | ], axis=-1) 143 | 144 | return observation 145 | 146 | @property 147 | def action_space(self, *args, **kwargs): 148 | action_space = self._env.action_space 149 | if len(action_space.shape) > 1: 150 | raise NotImplementedError( 151 | "Action space ({}) is not flat, make sure to check the" 152 | " implementation.".format(action_space)) 153 | return action_space 154 | 155 | def step(self, action, *args, **kwargs): 156 | observation, reward, terminal, info = self._env.step( 157 | action, *args, **kwargs) 158 | 159 | if not isinstance(self._env.observation_space, spaces.Dict): 160 | observation = {DEFAULT_OBSERVATION_KEY: observation} 161 | 162 | observation = self._filter_observation(observation) 163 | return observation, reward, terminal, info 164 | 165 | def reset(self, *args, **kwargs): 166 | observation = self._env.reset() 167 | 168 | if not isinstance(self._env.observation_space, spaces.Dict): 169 | observation = {DEFAULT_OBSERVATION_KEY: observation} 170 | 171 | observation = self._filter_observation(observation) 172 | return observation 173 | 174 | def render(self, *args, **kwargs): 175 | return self._env.render(*args, **kwargs) 176 | 177 | def close(self, *args, **kwargs): 178 | return self._env.close(*args, **kwargs) 179 | 180 | def seed(self, *args, **kwargs): 181 | return self._env.seed(*args, **kwargs) 182 | 183 | @property 184 | def unwrapped(self): 185 | return self._env.unwrapped 186 | 187 | def
get_param_values(self, *args, **kwargs): 188 | raise NotImplementedError 189 | 190 | def set_param_values(self, *args, **kwargs): 191 | raise NotImplementedError 192 | -------------------------------------------------------------------------------- /softlearning/scripts/console_scripts.py: -------------------------------------------------------------------------------- 1 | """A command line interface that exposes softlearning examples to the user. 2 | 3 | This package exposes the functions in the examples.instrument module to the user 4 | through a cli, which allows seamless runs of examples in different modes (e.g. 5 | locally, in google compute engine, or ec2). 6 | 7 | 8 | There are two types of cli commands in this file (each has a corresponding 9 | function in examples.instrument): 10 | 1. run_example_* methods, which run the experiments by invoking the 11 | `tune.run_experiments` function. 12 | 2. launch_example_* methods, which are helper functions that submit an 13 | example to be run in the cloud. In practice, these launch a cluster, 14 | and then run the `run_example_cluster` method with the provided 15 | arguments and options. 16 | """ 17 | 18 | from __future__ import absolute_import 19 | from __future__ import division 20 | from __future__ import print_function 21 | 22 | import logging 23 | 24 | import click 25 | 26 | from examples.instrument import ( 27 | run_example_dry, 28 | run_example_local, 29 | run_example_debug, 30 | run_example_cluster, 31 | launch_example_cluster, 32 | launch_example_gce, 33 | launch_example_ec2) 34 | 35 | 36 | logging.basicConfig(level=logging.INFO) 37 | logger = logging.getLogger(__name__) 38 | logger.setLevel(logging.INFO) 39 | 40 | 41 | def add_options(options): 42 | def decorator(f): 43 | for option in options[::-1]: 44 | click.decorators._param_memo(f, option) 45 | return f 46 | return decorator 47 | 48 | 49 | @click.group() 50 | def cli(): 51 | pass 52 | 53 | 54 | @cli.command( 55 | name='run_example_dry', 56 | context_settings={'ignore_unknown_options': True}) 57 | @click.argument("example_module_name", required=True, type=str) 58 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 59 | def run_example_dry_cmd(example_module_name, example_argv): 60 | """Print the variant spec and related information of an example.""" 61 | return run_example_dry(example_module_name, example_argv) 62 | 63 | 64 | @cli.command( 65 | name='run_local', 66 | context_settings={'ignore_unknown_options': True}) 67 | @click.argument("example_module_name", required=True, type=str) 68 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 69 | def run_example_local_cmd(example_module_name, example_argv): 70 | """Run example locally, potentially parallelizing across cpus/gpus.""" 71 | return run_example_local(example_module_name, example_argv) 72 | 73 | 74 | @cli.command( 75 | name='run_example_debug', 76 | context_settings={'ignore_unknown_options': True}) 77 | @click.argument("example_module_name", required=True, type=str) 78 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 79 | def run_example_debug_cmd(example_module_name, example_argv): 80 | """The debug mode limits tune trial runs to enable use of the debugger.""" 81 | return run_example_debug(example_module_name, example_argv) 82 | 83 | 84 | @cli.command( 85 | name='run_example_cluster', 86 | context_settings={'ignore_unknown_options': True}) 87 | @click.argument("example_module_name", required=True, type=str) 88 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 89
| def run_example_cluster_cmd(example_module_name, example_argv): 90 | """Run example in cluster mode. 91 | 92 | This function is very similar to the local mode, except that it 93 | correctly sets the redis address to make ray/tune work on a cluster. 94 | """ 95 | run_example_cluster(example_module_name, example_argv) 96 | 97 | 98 | @cli.command( 99 | name='launch_example_cluster', 100 | context_settings={ 101 | 'allow_extra_args': True, 102 | 'ignore_unknown_options': True 103 | }) 104 | @click.argument("example_module_name", required=True, type=str) 105 | @click.argument('example_argv', nargs=-1, type=click.UNPROCESSED) 106 | @click.option( 107 | "--config_file", 108 | required=False, 109 | type=str) 110 | @click.option( 111 | "--stop/--no-stop", 112 | is_flag=True, 113 | default=True, 114 | help="Stop the cluster after the command finishes running.") 115 | @click.option( 116 | "--start/--no-start", 117 | is_flag=True, 118 | default=True, 119 | help="Start the cluster if needed.") 120 | @click.option( 121 | "--screen/--no-screen", 122 | is_flag=True, 123 | default=False, 124 | help="Run the command in a screen.") 125 | @click.option( 126 | "--tmux/--no-tmux", 127 | is_flag=True, 128 | default=True, 129 | help="Run the command in tmux.") 130 | @click.option( 131 | "--override-cluster-name", 132 | required=False, 133 | type=str, 134 | help="Override the configured cluster name.") 135 | @click.option( 136 | "--port-forward", required=False, type=int, help="Port to forward.") 137 | def launch_example_cluster_cmd(*args, **kwargs): 138 | """Launches the example on an autoscaled ray cluster through ray exec_cmd. 139 | 140 | This handles basic validation and sanity checks for the experiment, and 141 | then executes the command on the autoscaled ray cluster. If necessary, it will 142 | also fill in more useful defaults for our workflow (i.e. for tmux and 143 | override_cluster_name). 144 | """ 145 | return launch_example_cluster(*args, **kwargs) 146 | 147 | 148 | @cli.command( 149 | name='launch_example_gce', 150 | context_settings={ 151 | 'allow_extra_args': True, 152 | 'ignore_unknown_options': True 153 | }) 154 | @add_options(launch_example_cluster_cmd.params) 155 | def launch_example_gce_cmd(*args, **kwargs): 156 | """Forwards call to `launch_example_cluster` after adding gce defaults. 157 | 158 | This optionally sets the ray autoscaler configuration file to the default 159 | gce configuration file, and then calls `launch_example_cluster` to 160 | execute the original command on the autoscaled gce cluster by parsing the args. 161 | 162 | See `launch_example_cluster` for further details. 163 | """ 164 | return launch_example_gce(*args, **kwargs) 165 | 166 | 167 | @cli.command( 168 | name='launch_example_ec2', 169 | context_settings={ 170 | 'allow_extra_args': True, 171 | 'ignore_unknown_options': True 172 | }) 173 | @add_options(launch_example_cluster_cmd.params) 174 | def launch_example_ec2_cmd(*args, **kwargs): 175 | """Forwards call to `launch_example_cluster` after adding ec2 defaults. 176 | 177 | This optionally sets the ray autoscaler configuration file to the default 178 | ec2 configuration file, and then calls `launch_example_cluster` to 179 | execute the original command on the autoscaled ec2 cluster by parsing the args. 180 | 181 | See `launch_example_cluster` for further details.
182 | """ 183 | return launch_example_ec2(*args, **kwargs) 184 | 185 | 186 | cli.add_command(run_example_local_cmd) 187 | cli.add_command(run_example_dry_cmd) 188 | cli.add_command(run_example_cluster_cmd) 189 | 190 | # Alias for run_example_local 191 | cli.add_command(run_example_local_cmd, name='launch_example_local') 192 | # Alias for run_example_dry 193 | cli.add_command(run_example_dry_cmd, name='launch_example_dry') 194 | # Alias for run_example_debug 195 | cli.add_command(run_example_debug_cmd, name='launch_example_debug') 196 | cli.add_command(launch_example_cluster_cmd) 197 | cli.add_command(launch_example_gce_cmd) 198 | cli.add_command(launch_example_ec2_cmd) 199 | 200 | 201 | def main(): 202 | return cli() 203 | 204 | 205 | if __name__ == "__main__": 206 | main() 207 | -------------------------------------------------------------------------------- /softlearning/replay_pools/flexible_replay_pool.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import pickle 3 | 4 | import numpy as np 5 | 6 | from .replay_pool import ReplayPool 7 | 8 | 9 | class FlexibleReplayPool(ReplayPool): 10 | def __init__(self, max_size, fields_attrs, obs_filter=False, modify_rew=False): 11 | super(FlexibleReplayPool, self).__init__() 12 | 13 | max_size = int(max_size) 14 | self._max_size = max_size 15 | 16 | self.fields = {} 17 | self.fields_attrs = {} 18 | 19 | self.add_fields(fields_attrs) 20 | 21 | self.obs_filter = obs_filter 22 | self.modify_rew = modify_rew 23 | 24 | self._pointer = 0 25 | self._size = 0 26 | self._samples_since_save = 0 27 | 28 | @property 29 | def size(self): 30 | return self._size 31 | 32 | @property 33 | def field_names(self): 34 | return list(self.fields.keys()) 35 | 36 | def add_fields(self, fields_attrs): 37 | self.fields_attrs.update(fields_attrs) 38 | 39 | for field_name, field_attrs in fields_attrs.items(): 40 | field_shape = (self._max_size, *field_attrs['shape']) 41 | initializer = field_attrs.get('initializer', np.zeros) 42 | self.fields[field_name] = initializer( 43 | field_shape, dtype=field_attrs['dtype']) 44 | 45 | def _advance(self, count=1): 46 | self._pointer = (self._pointer + count) % self._max_size 47 | self._size = min(self._size + count, self._max_size) 48 | self._samples_since_save += count 49 | 50 | def add_sample(self, sample): 51 | samples = { 52 | key: value[None, ...] 
53 | for key, value in sample.items() 54 | } 55 | self.add_samples(samples) 56 | 57 | def add_samples(self, samples): 58 | # if 'infos' not in samples: 59 | # samples['infos'] = {} 60 | field_names = list(samples.keys()) 61 | num_samples = samples[field_names[0]].shape[0] 62 | index = np.arange( 63 | self._pointer, self._pointer + num_samples) % self._max_size 64 | for field_name in self.field_names: 65 | # print(field_name) 66 | default_value = ( 67 | self.fields_attrs[field_name].get('default_value', 0.0)) 68 | values = samples.get(field_name, default_value) 69 | if field_name not in samples.keys() and 'infos' in samples and field_name in samples['infos'][0].keys(): 70 | values = np.expand_dims(np.array([samples['infos'][i].get(field_name, default_value) for i in range(num_samples)]), axis=1) 71 | try: 72 | assert values.shape[0] == num_samples, f'value shape: {values.shape[0]}, expected: {num_samples}' 73 | if isinstance(values[0], dict): 74 | values = np.stack([np.concatenate([ 75 | value[key] 76 | for key in value.keys() 77 | ], axis=-1) for value in values]) 78 | self.fields[field_name][index] = values 79 | except Exception as e: 80 | import traceback 81 | traceback.print_exc(limit=10) 82 | print('[ DEBUG ] errors occurs: {}'.format(e)) 83 | 84 | import pdb; pdb.set_trace() 85 | self._advance(num_samples) 86 | 87 | def restore_samples(self, samples): 88 | num_samples = samples[list(samples.keys())[0]].shape[0] 89 | index = np.arange( 90 | 0, num_samples) % self._max_size 91 | for key, values in samples.items(): 92 | assert key in self.field_names 93 | self.fields[key][index] = values 94 | 95 | def random_indices(self, batch_size): 96 | if self._size == 0: return np.arange(0, 0) 97 | return np.random.randint(0, self._size, batch_size) 98 | 99 | def random_batch(self, batch_size, field_name_filter=None, **kwargs): 100 | random_indices = self.random_indices(batch_size) 101 | return self.batch_by_indices( 102 | random_indices, field_name_filter=field_name_filter, **kwargs) 103 | 104 | def last_n_batch(self, last_n, field_name_filter=None, **kwargs): 105 | last_n_indices = np.arange( 106 | self._pointer - min(self.size, last_n), self._pointer 107 | ) % self._max_size 108 | return self.batch_by_indices( 109 | last_n_indices, field_name_filter=field_name_filter, **kwargs) 110 | 111 | def filter_fields(self, field_names, field_name_filter): 112 | if isinstance(field_name_filter, str): 113 | field_name_filter = [field_name_filter] 114 | 115 | if isinstance(field_name_filter, (list, tuple)): 116 | field_name_list = field_name_filter 117 | 118 | def filter_fn(field_name): 119 | return field_name in field_name_list 120 | 121 | else: 122 | filter_fn = field_name_filter 123 | 124 | filtered_field_names = [ 125 | field_name for field_name in field_names 126 | if filter_fn(field_name) 127 | ] 128 | 129 | return filtered_field_names 130 | 131 | def batch_by_indices(self, indices, field_name_filter=None): 132 | if np.any(indices % self._max_size > self.size): 133 | raise ValueError( 134 | "Tried to retrieve batch with indices greater than current" 135 | " size") 136 | 137 | field_names = self.field_names 138 | if field_name_filter is not None: 139 | field_names = self.filter_fields( 140 | field_names, field_name_filter) 141 | 142 | return { 143 | field_name: self.fields[field_name][indices] 144 | for field_name in field_names 145 | } 146 | 147 | def save_latest_experience(self, pickle_path): 148 | latest_samples = self.last_n_batch(self._samples_since_save) 149 | 150 | with gzip.open(pickle_path, 
'wb') as f: 151 | pickle.dump(latest_samples, f) 152 | 153 | self._samples_since_save = 0 154 | 155 | def load_experience(self, experience_path): 156 | with gzip.open(experience_path, 'rb') as f: 157 | latest_samples = pickle.load(f) 158 | 159 | key = list(latest_samples.keys())[0] 160 | num_samples = latest_samples[key].shape[0] 161 | for field_name, data in latest_samples.items(): 162 | assert data.shape[0] == num_samples, data.shape 163 | 164 | self.add_samples(latest_samples) 165 | self._samples_since_save = 0 166 | 167 | def return_all_samples(self): 168 | return { 169 | field_name: self.fields[field_name][:self.size] 170 | for field_name in self.field_names 171 | } 172 | 173 | def __getstate__(self): 174 | state = self.__dict__.copy() 175 | state['fields'] = { 176 | field_name: self.fields[field_name][:self.size] 177 | for field_name in self.field_names 178 | } 179 | 180 | return state 181 | 182 | def __setstate__(self, state): 183 | if state['_size'] < state['_max_size']: 184 | pad_size = state['_max_size'] - state['_size'] 185 | for field_name in state['fields'].keys(): 186 | field_shape = state['fields_attrs'][field_name]['shape'] 187 | state['fields'][field_name] = np.concatenate(( 188 | state['fields'][field_name], 189 | np.zeros((pad_size, *field_shape)) 190 | ), axis=0) 191 | 192 | self.__dict__ = state 193 | -------------------------------------------------------------------------------- /run_scripts/base.py: -------------------------------------------------------------------------------- 1 | 2 | import numpy as np 3 | import pdb 4 | 5 | from softlearning.misc.utils import get_git_rev, deep_update 6 | 7 | M = 256 8 | REPARAMETERIZE = True 9 | 10 | NUM_COUPLING_LAYERS = 2 11 | 12 | GAUSSIAN_POLICY_PARAMS_BASE = { 13 | 'type': 'GaussianPolicy', 14 | 'kwargs': { 15 | 'hidden_layer_sizes': (M, M), 16 | 'squash': True, 17 | } 18 | } 19 | 20 | GAUSSIAN_POLICY_PARAMS_FOR_DOMAIN = {} 21 | 22 | POLICY_PARAMS_BASE = { 23 | 'GaussianPolicy': GAUSSIAN_POLICY_PARAMS_BASE, 24 | } 25 | 26 | POLICY_PARAMS_BASE.update({ 27 | 'gaussian': POLICY_PARAMS_BASE['GaussianPolicy'], 28 | }) 29 | 30 | POLICY_PARAMS_FOR_DOMAIN = { 31 | 'GaussianPolicy': GAUSSIAN_POLICY_PARAMS_FOR_DOMAIN, 32 | } 33 | 34 | POLICY_PARAMS_FOR_DOMAIN.update({ 35 | 'gaussian': POLICY_PARAMS_FOR_DOMAIN['GaussianPolicy'], 36 | }) 37 | 38 | DEFAULT_MAX_PATH_LENGTH = 1000 39 | MAX_PATH_LENGTH_PER_DOMAIN = { 40 | 'Point2DEnv': 50, 41 | 'Point2DWallEnv': 50, 42 | 'Pendulum': 200, 43 | } 44 | import tensorflow as tf 45 | import os 46 | def get_package_path(): 47 | return os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 48 | 49 | ALGORITHM_PARAMS_ADDITIONAL = { 50 | 'MAPLE': { 51 | 'type': 'MAPLE', 52 | 'kwargs': { 53 | 'reparameterize': REPARAMETERIZE, 54 | 'lr': 3e-4, 55 | 'target_update_interval': 1, 56 | 'tau': 5e-3, 57 | 'store_extra_policy_info': False, 58 | 'action_prior': 'uniform', 59 | 'n_initial_exploration_steps': int(5000), 60 | "model_load_dir": os.path.join(get_package_path(), 'models'), 61 | "num_networks": 7, 62 | "network_kwargs": { 63 | "hidden_sizes": [256, 256], 64 | "activation": tf.nn.relu, 65 | "output_activation": None, 66 | "lstm_hidden_unit": 128, 67 | "embedding_size": 16 68 | } 69 | } 70 | }, 71 | } 72 | 73 | DEFAULT_NUM_EPOCHS = 200 74 | 75 | NUM_EPOCHS_PER_DOMAIN = { 76 | 'Hopper': int(1e3), 77 | 'HalfCheetah': int(1e3), 78 | 'Walker2d': int(1e3), 79 | } 80 | 81 | ALGORITHM_PARAMS_PER_DOMAIN = { 82 | **{ 83 | domain: { 84 | 'kwargs': { 85 | 'n_epochs': NUM_EPOCHS_PER_DOMAIN.get(domain, 
DEFAULT_NUM_EPOCHS), 86 | 'n_initial_exploration_steps': ( 87 | MAX_PATH_LENGTH_PER_DOMAIN.get(domain, DEFAULT_MAX_PATH_LENGTH) * 10), 88 | } 89 | } for domain in NUM_EPOCHS_PER_DOMAIN 90 | } 91 | } 92 | 93 | 94 | NUM_CHECKPOINTS = 10 95 | 96 | 97 | def get_variant_spec_base(universe, domain, task, policy, algorithm, env_params): 98 | print("get algorithms", algorithm) 99 | algorithm_params = deep_update( 100 | env_params, 101 | ALGORITHM_PARAMS_PER_DOMAIN.get(domain, {}) 102 | ) 103 | algorithm_params = deep_update( 104 | algorithm_params, 105 | ALGORITHM_PARAMS_ADDITIONAL.get(algorithm, {}) 106 | ) 107 | variant_spec = { 108 | # 'git_sha': get_git_rev(), 109 | 110 | 'environment_params': { 111 | 'training': { 112 | 'domain': domain, 113 | 'task': task, 114 | 'universe': universe, 115 | 'kwargs': {}, 116 | }, 117 | 'evaluation': lambda spec: ( 118 | spec['environment_params']['training']), 119 | }, 120 | 'policy_params': deep_update( 121 | POLICY_PARAMS_BASE[policy], 122 | POLICY_PARAMS_FOR_DOMAIN[policy].get(domain, {}) 123 | ), 124 | 'Q_params': { 125 | 'type': 'double_feedforward_Q_function', 126 | 'kwargs': { 127 | 'hidden_layer_sizes': (M, M), 128 | } 129 | }, 130 | 'algorithm_params': algorithm_params, 131 | 'replay_pool_params': { 132 | 'type': 'SimpleReplayPool', 133 | 'kwargs': { 134 | 'max_size': lambda spec: ( 135 | { 136 | 'SimpleReplayPool': int(1e6), 137 | 'TrajectoryReplayPool': int(1e4), 138 | }.get(spec['replay_pool_params']['type'], int(1e6)) 139 | ), 140 | } 141 | }, 142 | 'sampler_params': { 143 | 'type': 'SimpleSampler', 144 | 'kwargs': { 145 | 'max_path_length': MAX_PATH_LENGTH_PER_DOMAIN.get( 146 | domain, DEFAULT_MAX_PATH_LENGTH), 147 | 'min_pool_size': MAX_PATH_LENGTH_PER_DOMAIN.get( 148 | domain, DEFAULT_MAX_PATH_LENGTH), 149 | 'batch_size': 256, 150 | } 151 | }, 152 | 'run_params': { 153 | 'seed': 88, 154 | 'checkpoint_at_end': True, 155 | 'checkpoint_frequency': NUM_EPOCHS_PER_DOMAIN.get( 156 | domain, DEFAULT_NUM_EPOCHS) // NUM_CHECKPOINTS, 157 | 'checkpoint_replay_pool': False, 158 | 'info': '' 159 | }, 160 | } 161 | 162 | return variant_spec 163 | 164 | def get_variant_spec(args, env_params): 165 | universe, domain, task = env_params.universe, env_params.domain, env_params.task 166 | variant_spec = get_variant_spec_base( 167 | universe, domain, task, args.policy, env_params.type, env_params) 168 | return variant_spec 169 | 170 | NEORL_CONFIG = { 171 | "hopper": 172 | { 173 | 'common': { 174 | 'length': 10, 175 | 'penalty_coeff': 1.0, 176 | }, 177 | }, 178 | "halfcheetah": 179 | { 180 | 'common': { 181 | 'penalty_clip': 4, 182 | 'length': 15, 183 | 'penalty_coeff': 1.0, 184 | } 185 | }, 186 | 'walker2d': 187 | { 188 | 'common': { 189 | 'length': 15, 190 | 'penalty_coeff': 0.25, 191 | } 192 | } 193 | } 194 | D4RL_MAPLE_CONFIG = { 195 | 'common':{ 196 | 'length': 10, 197 | 'penalty_coeff': 0.25, 198 | }, 199 | 'halfcheetah':{ 200 | 'common': {}, 201 | 'medium-expert': 202 | { 203 | 'n_epochs': 2000, 204 | 'penalty_coeff': 5.0, 205 | } 206 | } 207 | } 208 | 209 | D4RL_MAPLE_200_CONFIG = { 210 | 'common': { 211 | 'length': 20, 212 | 'penalty_coeff': 0.25, 213 | }, 214 | 'halfcheetah': { 215 | 'common': {}, 216 | 'medium-expert': 217 | { 218 | 'n_epochs': 2000, 219 | 'length': 10, 220 | 'penalty_coeff': 5.0, 221 | }, 222 | 'mixed': { 223 | 'penalty_clip': 4.0, 224 | } 225 | }, 226 | 'hopper': { 227 | 'common': { 228 | 'penalty_coeff': 1.0, 229 | } 230 | }, 231 | } 232 | def get_task_spec(variant_spec): 233 | if variant_spec["custom_config"]: 234 | return 
variant_spec 235 | else: 236 | if variant_spec['environment_params']['training']['kwargs']['use_neorl']: 237 | if variant_spec['maple_200']: 238 | assert False, "maple_200 has not been tested on NeoRL yet" 239 | variant_spec['model_suffix'] = 50 240 | tasks = variant_spec['config'].split('.')[-1].split('_') 241 | variant_spec.update(NEORL_CONFIG[tasks[0]]['common']) 242 | else: 243 | tasks = variant_spec['config'].split('.')[-1].split('_') 244 | if variant_spec['maple_200']: 245 | variant_spec['model_suffix'] = 200 246 | config = D4RL_MAPLE_200_CONFIG 247 | else: 248 | variant_spec['model_suffix'] = 20 249 | config = D4RL_MAPLE_CONFIG 250 | variant_spec.update(config['common']) 251 | if tasks[0] in config.keys(): 252 | variant_spec.update(config[tasks[0]]['common']) 253 | behavior_type = '-'.join(tasks[1:]) 254 | if behavior_type in config[tasks[0]].keys(): 255 | variant_spec.update(config[tasks[0]][behavior_type]) 256 | return variant_spec 257 | -------------------------------------------------------------------------------- /maple/policy/fake_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import tensorflow as tf 3 | 4 | class FakeEnv: 5 | def __init__(self, model, config, 6 | penalty_coeff=0., 7 | max_penalty=4.0, 8 | # fix_env=False, 9 | penalty_learned_var=False, 10 | penalty_learned_var_random=False): 11 | self.model = model 12 | self.config = config 13 | self.max_penalty = max_penalty 14 | # self.fix_env = fix_env 15 | self.penalty_coeff = penalty_coeff 16 | self.penalty_learned_var = penalty_learned_var 17 | self.penalty_learned_var_random = penalty_learned_var_random 18 | 19 | ''' 20 | x : [ batch_size, obs_dim + 1 ] 21 | means : [ num_models, batch_size, obs_dim + 1 ] 22 | vars : [ num_models, batch_size, obs_dim + 1 ] 23 | ''' 24 | def _get_logprob(self, x, means, variances): 25 | 26 | k = x.shape[-1] 27 | 28 | ## [ num_networks, batch_size ] 29 | log_prob = -1/2 * (k * np.log(2*np.pi) + np.log(variances).sum(-1) + (np.power(x-means, 2)/variances).sum(-1)) 30 | 31 | ## [ batch_size ] 32 | prob = np.exp(log_prob).sum(0) 33 | 34 | ## [ batch_size ] 35 | log_prob = np.log(prob) 36 | 37 | stds = np.std(means,0).mean(-1) 38 | return log_prob, stds 39 | 40 | def reset(self, rollout_length, batch_size): 41 | 42 | self.ts = 0 43 | # Randomly select a subset of the elite dynamics models (at least 10) for this batch of rollouts, 44 | # rather than always using a single model, so that MAPLE learns to adapt to different combinations of the dynamics models.
45 | ensemble_number = np.random.randint(10, len(self.model._model_inds)) 46 | sub_model_idx = np.random.choice(self.model._model_inds, size=ensemble_number) 47 | self.model_inds = np.random.choice(sub_model_idx, size=batch_size) 48 | self.rollout_length = rollout_length 49 | self.reset_ratio = np.random.random() 50 | 51 | def step(self, obs, act, deterministic=False): 52 | assert len(obs.shape) == len(act.shape) 53 | if len(obs.shape) == 1: 54 | obs = obs[None] 55 | act = act[None] 56 | return_single = True 57 | else: 58 | return_single = False 59 | 60 | inputs = np.concatenate((obs, act), axis=-1) 61 | batch_length = inputs.shape[0] 62 | all_means = [] 63 | all_vars = [] 64 | if self.model.num_nets * batch_length > 500000: 65 | group_batch_num = int(500000 / self.model.num_nets) 66 | for i in range(int(np.ceil(batch_length / group_batch_num))): 67 | ensemble_model_means, ensemble_model_vars = self.model.predict( 68 | inputs[i * group_batch_num: (i + 1) * group_batch_num], factored=True) 69 | all_means.append(ensemble_model_means) 70 | all_vars.append(ensemble_model_vars) 71 | else: 72 | ensemble_model_means, ensemble_model_vars = self.model.predict( 73 | inputs[:], factored=True) 74 | all_means.append(ensemble_model_means) 75 | all_vars.append(ensemble_model_vars) 76 | # print(self.model.num_nets * batch_length) 77 | ensemble_model_means = np.concatenate(all_means, axis=1) 78 | ensemble_model_vars = np.concatenate(all_vars, axis=1) 79 | 80 | ensemble_model_means[:, :, 1:] += obs 81 | ensemble_model_stds = np.sqrt(ensemble_model_vars) 82 | 83 | if deterministic: 84 | ensemble_samples = ensemble_model_means 85 | else: 86 | ensemble_samples = ensemble_model_means + np.random.normal(size=ensemble_model_means.shape) * ensemble_model_stds 87 | 88 | if not deterministic: 89 | #### choose one model from ensemble 90 | num_models, batch_size, _ = ensemble_model_means.shape 91 | model_inds = self.model.random_inds(batch_size) 92 | batch_inds = np.arange(0, batch_size) 93 | samples = ensemble_samples[model_inds, batch_inds] 94 | model_means = ensemble_model_means[model_inds, batch_inds] 95 | model_stds = ensemble_model_stds[model_inds, batch_inds] 96 | else: 97 | samples = np.mean(ensemble_samples, axis=0) 98 | model_means = np.mean(ensemble_model_means, axis=0) 99 | model_stds = np.mean(ensemble_model_stds, axis=0) 100 | log_prob, dev = self._get_logprob(samples, ensemble_model_means, ensemble_model_vars) 101 | 102 | rewards, next_obs = samples[:,:1], samples[:,1:] 103 | terminals = self.config.termination_fn(obs, act, next_obs) 104 | 105 | batch_size = model_means.shape[0] 106 | return_means = np.concatenate((model_means[:,:1], terminals, model_means[:,1:]), axis=-1) 107 | return_stds = np.concatenate((model_stds[:,:1], np.zeros((batch_size,1)), model_stds[:,1:]), axis=-1) 108 | self.ts += 1 109 | assert self.penalty_coeff != 0 110 | if self.penalty_coeff != 0: 111 | if not self.penalty_learned_var: 112 | ensemble_means_obs = ensemble_model_means[:,:,1:] 113 | mean_obs_means = np.mean(ensemble_means_obs, axis=0) # average predictions over models 114 | diffs = ensemble_means_obs - mean_obs_means 115 | normalize_diffs = False 116 | if normalize_diffs: 117 | obs_dim = next_obs.shape[1] 118 | obs_sigma = self.model.scaler.cached_sigma[0,:obs_dim] 119 | diffs = diffs / obs_sigma 120 | dists = np.linalg.norm(diffs, axis=2) # distance in obs space 121 | penalty = np.max(dists, axis=0) # max distances over models 122 | else: 123 | penalty = np.amax(np.linalg.norm(ensemble_model_stds, axis=2), axis=0) 124 
| 125 | penalty = np.expand_dims(penalty, 1) 126 | penalty = np.clip(penalty, a_max=self.max_penalty, a_min=None) 127 | assert penalty.shape == rewards.shape 128 | unpenalized_rewards = rewards 129 | penalized_rewards = rewards - self.penalty_coeff * penalty 130 | else: 131 | penalty = None 132 | unpenalized_rewards = rewards 133 | penalized_rewards = rewards 134 | 135 | if return_single: 136 | next_obs = next_obs[0] 137 | return_means = return_means[0] 138 | return_stds = return_stds[0] 139 | unpenalized_rewards = unpenalized_rewards[0] 140 | penalized_rewards = penalized_rewards[0] 141 | terminals = terminals[0] 142 | 143 | info = {'mean': return_means, 'std': return_stds, 'log_prob': log_prob, 'dev': dev, 144 | 'unpenalized_rewards': unpenalized_rewards, 'penalty': penalty, 'penalized_rewards': penalized_rewards} 145 | return next_obs, penalized_rewards, terminals, info 146 | 147 | ## for debugging computation graph 148 | def step_ph(self, obs_ph, act_ph, deterministic=False): 149 | assert len(obs_ph.shape) == len(act_ph.shape) 150 | 151 | inputs = tf.concat([obs_ph, act_ph], axis=1) 152 | # inputs = np.concatenate((obs, act), axis=-1) 153 | ensemble_model_means, ensemble_model_vars = self.model.create_prediction_tensors(inputs, factored=True) 154 | # ensemble_model_means, ensemble_model_vars = self.model.predict(inputs, factored=True) 155 | ensemble_model_means = tf.concat([ensemble_model_means[:,:,0:1], ensemble_model_means[:,:,1:] + obs_ph[None]], axis=-1) 156 | # ensemble_model_means[:,:,1:] += obs_ph 157 | ensemble_model_stds = tf.sqrt(ensemble_model_vars) 158 | # ensemble_model_stds = np.sqrt(ensemble_model_vars) 159 | 160 | if deterministic: 161 | ensemble_samples = ensemble_model_means 162 | else: 163 | # ensemble_samples = ensemble_model_means + np.random.normal(size=ensemble_model_means.shape) * ensemble_model_stds 164 | ensemble_samples = ensemble_model_means + tf.random.normal(tf.shape(ensemble_model_means)) * ensemble_model_stds 165 | 166 | samples = ensemble_samples[0] 167 | 168 | rewards, next_obs = samples[:,:1], samples[:,1:] 169 | terminals = self.config.termination_ph_fn(obs_ph, act_ph, next_obs) 170 | info = {} 171 | 172 | return next_obs, rewards, terminals, info 173 | 174 | def close(self): 175 | pass 176 | 177 | 178 | 179 | --------------------------------------------------------------------------------
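The uncertainty penalty applied in FakeEnv.step above (the penalty_learned_var=False branch) can be reproduced in a few lines of NumPy. The following standalone sketch is not repository code: the shapes and coefficient values are illustrative assumptions, but the computation mirrors the source, where each imagined transition is penalized by the largest L2 distance between any ensemble member's predicted next observation and the ensemble mean, clipped at max_penalty and subtracted from the reward with weight penalty_coeff.

import numpy as np

# Illustrative shapes: 7 dynamics models, a rollout batch of 4, 11-dimensional observations.
num_models, batch_size, obs_dim = 7, 4, 11
ensemble_means_obs = np.random.randn(num_models, batch_size, obs_dim)  # per-model next-observation predictions
rewards = np.random.randn(batch_size, 1)                               # reward predictions for the chosen models
penalty_coeff, max_penalty = 0.25, 4.0                                 # values used by several configs above

mean_obs = ensemble_means_obs.mean(axis=0)             # ensemble-average prediction
diffs = ensemble_means_obs - mean_obs                  # deviation of each model from the average
dists = np.linalg.norm(diffs, axis=2)                  # [num_models, batch_size] distances in observation space
penalty = dists.max(axis=0)[:, None]                   # worst-case disagreement per transition
penalty = np.clip(penalty, a_min=None, a_max=max_penalty)
penalized_rewards = rewards - penalty_coeff * penalty  # shape (batch_size, 1), as returned by FakeEnv.step

When penalty_learned_var is True, the same clipping and subtraction are applied, but the disagreement term is replaced by the maximum norm of the predicted standard deviations across the ensemble.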