├── __init__.py ├── common ├── __init__.py ├── adr │ ├── __init__.py │ └── adr.py ├── svpg │ ├── __init__.py │ ├── particles │ │ ├── __init__.py │ │ ├── utils.py │ │ ├── distributions.py │ │ └── svpg_particle.py │ ├── svpg_utils.py │ └── svpg.py ├── agents │ ├── __init__.py │ ├── ddpg │ │ ├── __init__.py │ │ ├── replay_buffer.py │ │ └── ddpg.py │ └── selfplay_models.py ├── models │ ├── __init__.py │ ├── actor_critic.py │ └── discriminator.py ├── randomizer │ ├── controllers │ │ ├── __init__.py │ │ ├── miscalibrated_push_controller.py │ │ ├── pick_and_place_controller.py │ │ └── hook_controller.py │ ├── residual_envs │ │ ├── __init__.py │ │ └── residual_fetch_push_env.py │ ├── config │ │ ├── __init__.py │ │ ├── ErgoPushRandomized │ │ │ ├── hard.json │ │ │ ├── default.json │ │ │ └── random.json │ │ ├── FetchPushRandomized │ │ │ ├── default.json │ │ │ └── random.json │ │ ├── FetchSlideRandomized │ │ │ ├── default.json │ │ │ └── random.json │ │ ├── ResidualPushRandomized │ │ │ ├── default.json │ │ │ └── random.json │ │ ├── FetchHookRandomized │ │ │ ├── default.json │ │ │ └── random.json │ │ ├── ResidualComplexHookRandomized │ │ │ ├── default.json │ │ │ └── random.json │ │ ├── ResidualFetchHookRandomized │ │ │ ├── default.json │ │ │ └── random.json │ │ └── ErgoReacherRandomized │ │ │ ├── default.json │ │ │ ├── random.json │ │ │ └── hard.json │ ├── assets │ │ ├── stls │ │ │ ├── estop_link.stl │ │ │ ├── laser_link.stl │ │ │ ├── gripper_link.stl │ │ │ ├── torso_fixed_link.stl │ │ │ ├── base_link_collision.stl │ │ │ ├── bellows_link_collision.stl │ │ │ ├── head_pan_link_collision.stl │ │ │ ├── l_wheel_link_collision.stl │ │ │ ├── r_wheel_link_collision.stl │ │ │ ├── elbow_flex_link_collision.stl │ │ │ ├── head_tilt_link_collision.stl │ │ │ ├── torso_lift_link_collision.stl │ │ │ ├── wrist_flex_link_collision.stl │ │ │ ├── wrist_roll_link_collision.stl │ │ │ ├── forearm_roll_link_collision.stl │ │ │ ├── shoulder_lift_link_collision.stl │ │ │ ├── shoulder_pan_link_collision.stl │ │ │ └── upperarm_roll_link_collision.stl │ │ ├── textures │ │ │ ├── block.png │ │ │ └── block_hidden.png │ │ ├── push.xml │ │ ├── slide.xml │ │ ├── hook.xml │ │ ├── shared.xml │ │ ├── pusher.xml │ │ └── robot.xml │ ├── test.py │ ├── randomized_fetchslide.py │ ├── dimension.py │ ├── randomized_fetchpush.py │ ├── randomized_residual_push.py │ ├── randomized_ergo_pusher.py │ ├── randomized_ergoreacher.py │ ├── miscalibrated_push_controller.py │ ├── ergo_objects.py │ ├── wrappers.py │ ├── residual_fetch_push_env.py │ ├── __init__.py │ ├── complex_hook_env.py │ ├── randomized_fetch.py │ ├── residual_push.py │ └── fetch_hook_env.py └── discriminator │ └── discriminator_rewarder.py ├── utils ├── __init__.py ├── learning_curve.py └── plot_utils.py ├── architecture-selfplay.png ├── setup.py ├── ergo_expts.sh ├── .gitignore ├── experiments ├── evaluate_ergo_envs.py └── ddpg_train.py ├── README.md ├── arguments.py └── realrobot_evals └── realrobot_test.py /__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /common/adr/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /common/svpg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /common/agents/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /common/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /common/agents/ddpg/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /common/randomizer/controllers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /common/randomizer/residual_envs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /common/svpg/particles/__init__.py: -------------------------------------------------------------------------------- 1 | from .svpg_particle import SVPGParticle -------------------------------------------------------------------------------- /common/randomizer/config/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | CONFIG_PATH = os.path.abspath(os.path.dirname(__file__)) 3 | -------------------------------------------------------------------------------- /architecture-selfplay.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/architecture-selfplay.png -------------------------------------------------------------------------------- /common/randomizer/assets/stls/estop_link.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/estop_link.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/laser_link.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/laser_link.stl -------------------------------------------------------------------------------- /common/randomizer/assets/textures/block.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/textures/block.png -------------------------------------------------------------------------------- /common/randomizer/assets/stls/gripper_link.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/gripper_link.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/torso_fixed_link.stl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/torso_fixed_link.stl -------------------------------------------------------------------------------- /common/randomizer/assets/textures/block_hidden.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/textures/block_hidden.png -------------------------------------------------------------------------------- /common/randomizer/assets/stls/base_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/base_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/bellows_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/bellows_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/head_pan_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/head_pan_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/l_wheel_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/l_wheel_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/r_wheel_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/r_wheel_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/elbow_flex_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/elbow_flex_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/head_tilt_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/head_tilt_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/torso_lift_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/torso_lift_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/wrist_flex_link_collision.stl: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/wrist_flex_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/wrist_roll_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/wrist_roll_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/forearm_roll_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/forearm_roll_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/shoulder_lift_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/shoulder_lift_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/shoulder_pan_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/shoulder_pan_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/assets/stls/upperarm_roll_link_collision.stl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/upperarm_roll_link_collision.stl -------------------------------------------------------------------------------- /common/randomizer/config/ErgoPushRandomized/hard.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ErgoPusherRandomized-v0", 3 | "dimensions": [ 4 | { 5 | "name": "friction", 6 | "default": 0.1, 7 | "multiplier_min": 1.0, 8 | "multiplier_max": 1.0 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='unsupadr', 5 | version='1.0', 6 | packages=['common'], 7 | install_requires=[ 8 | "torch>=1.0", "tqdm", "numpy", "matplotlib", 'gym>=0.10', 9 | 'mujoco_py>=1.15' 10 | ]) 11 | -------------------------------------------------------------------------------- /common/randomizer/config/ErgoPushRandomized/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ErgoPusherRandomized-v0", 3 | "dimensions": [ 4 | { 5 | "name": "friction", 6 | "default": 0.4, 7 | "multiplier_min": 1.0, 8 | "multiplier_max": 1.0 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /common/randomizer/config/ErgoPushRandomized/random.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ErgoPusherRandomized-v0", 3 | "dimensions": [ 4 | { 5 | "name": "friction", 6 
| "default": 0.4, 7 | "multiplier_min": 0.25, 8 | "multiplier_max": 2.0 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /common/randomizer/config/FetchPushRandomized/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "FetchPushDefault-v0", 3 | "dimensions": [ 4 | { 5 | "name": "friction", 6 | "default": 0.18, 7 | "multiplier_min": 1.0, 8 | "multiplier_max": 1.0 9 | } 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /common/randomizer/config/FetchPushRandomized/random.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "FetchPushDefault-v0", 3 | "dimensions": [ 4 | { 5 | "name": "friction", 6 | "default": 0.18, 7 | "multiplier_min": 0.1, 8 | "multiplier_max": 1.11 9 | } 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /common/randomizer/config/FetchSlideRandomized/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "FetchPushDefault-v0", 3 | "dimensions": [ 4 | { 5 | "name": "damping", 6 | "default": 0.01, 7 | "multiplier_min": 1.0, 8 | "multiplier_max": 1.0 9 | } 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /common/randomizer/config/FetchSlideRandomized/random.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "FetchPushDefault-v0", 3 | "dimensions": [ 4 | { 5 | "name": "friction", 6 | "default": 0.18, 7 | "multiplier_min": 0.2, 8 | "multiplier_max": 2.22 9 | } 10 | ] 11 | } 12 | -------------------------------------------------------------------------------- /common/randomizer/config/ResidualPushRandomized/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ResidualPushDefaultEnv-v0", 3 | "dimensions": [ 4 | { 5 | "name": "damping", 6 | "default": 0.01, 7 | "multiplier_min": 1.0, 8 | "multiplier_max": 1.0 9 | }, 10 | { 11 | "name": "friction", 12 | "default": 0.18, 13 | "multiplier_min": 1.0, 14 | "multiplier_max": 1.0 15 | } 16 | ] 17 | } 18 | -------------------------------------------------------------------------------- /common/randomizer/config/ResidualPushRandomized/random.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ResidualPushRandomizedEnv-v0", 3 | "dimensions": [ 4 | { 5 | "name": "damping", 6 | "default": 0.1, 7 | "multiplier_min": 0.1, 8 | "multiplier_max": 3 9 | }, 10 | 11 | { 12 | "name": "friction", 13 | "default": 0.18, 14 | "multiplier_min": 0.2777, 15 | "multiplier_max": 2.222 16 | } 17 | ] 18 | } 19 | -------------------------------------------------------------------------------- /common/randomizer/test.py: -------------------------------------------------------------------------------- 1 | import randomizer 2 | import gym 3 | import numpy as np 4 | from randomizer.wrappers import RandomizedEnvWrapper 5 | 6 | 7 | env = gym.make("FetchPushRandomizedEnv-v0") 8 | # env = RandomizedEnvWrapper(env, seed=12) 9 | obs = env.reset() 10 | 11 | 12 | for i in range(1000): 13 | obs, rew, _, _ = env.step(env.action_space.sample()) 14 | env.render() 15 | if i % 100 == 0: 16 | # print('done') 17 | print(rew) 18 | # Randomize the environment 19 | env.randomize([-1]) 20 | env.reset() 21 | 
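For reference (not part of test.py above), the wrapped usage that test.py leaves commented out looks roughly like the sketch below. It is only a sketch: the env id FetchPushRandomizedEnv-v0 is the one test.py makes, and it assumes that common/randomizer/__init__.py (not shown in this dump) registers that id together with its JSON config so that RandomizedEnvWrapper can read env.unwrapped.config_file.

import gym
import common.randomizer                                    # assumed to register the *Randomized* gym ids
from common.randomizer.wrappers import RandomizedEnvWrapper

env = RandomizedEnvWrapper(gym.make("FetchPushRandomizedEnv-v0"), seed=12)

# FetchPushRandomized's config has a single dimension (friction), so the list
# passed to randomize() has exactly one entry:
env.randomize(['default'])   # pin the dimension to its default value
env.randomize(['random'])    # draw uniformly inside [range_min, range_max]
env.randomize([-1])          # -1 is shorthand for 'random'
env.randomize([0.25])        # a float in [0, 1] is rescaled linearly into the range

obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())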
-------------------------------------------------------------------------------- /common/randomizer/config/FetchHookRandomized/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ResidualHookRandomizedEnv-v0", 3 | "dimensions": [ 4 | { 5 | "name": "block_mass", 6 | "default": 4, 7 | "multiplier_min": 1.0, 8 | "multiplier_max": 1.0 9 | }, 10 | { 11 | "name": "hook_mass", 12 | "default": 4, 13 | "multiplier_min": 1.0, 14 | "multiplier_max": 1.0 15 | }, 16 | { 17 | "name": "friction", 18 | "default": 1, 19 | "multiplier_min": 1.0, 20 | "multiplier_max": 1.0 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /common/randomizer/config/ResidualComplexHookRandomized/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ResidualHookRandomizedEnv-v0", 3 | "dimensions": [ 4 | { 5 | "name": "block_mass", 6 | "default": 4, 7 | "multiplier_min": 1.0, 8 | "multiplier_max": 1.0 9 | }, 10 | { 11 | "name": "hook_mass", 12 | "default": 4, 13 | "multiplier_min": 1.0, 14 | "multiplier_max": 1.0 15 | }, 16 | { 17 | "name": "friction", 18 | "default": 1, 19 | "multiplier_min": 1.0, 20 | "multiplier_max": 1.0 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /common/randomizer/config/ResidualFetchHookRandomized/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ResidualHookRandomizedEnv-v0", 3 | "dimensions": [ 4 | { 5 | "name": "block_mass", 6 | "default": 4, 7 | "multiplier_min": 1.0, 8 | "multiplier_max": 1.0 9 | }, 10 | { 11 | "name": "hook_mass", 12 | "default": 4, 13 | "multiplier_min": 1.0, 14 | "multiplier_max": 1.0 15 | }, 16 | { 17 | "name": "friction", 18 | "default": 1, 19 | "multiplier_min": 1.0, 20 | "multiplier_max": 1.0 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /common/randomizer/config/FetchHookRandomized/random.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ResidualHookRandomizedEnv-v0", 3 | "dimensions": [ 4 | { 5 | "name": "block_mass", 6 | "default": 2, 7 | "multiplier_min": 0.6, 8 | "multiplier_max": 4 9 | }, 10 | { 11 | "name": "hook_mass", 12 | "default": 4, 13 | "multiplier_min": 0.6, 14 | "multiplier_max": 2.5 15 | }, 16 | { 17 | "name": "friction", 18 | "default": 1, 19 | "multiplier_min": 0.5, 20 | "multiplier_max": 1.2 21 | } 22 | ] 23 | } 24 | 25 | -------------------------------------------------------------------------------- /common/randomizer/config/ResidualComplexHookRandomized/random.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ResidualHookRandomizedEnv-v0", 3 | "dimensions": [ 4 | { 5 | "name": "block_mass", 6 | "default": 4, 7 | "multiplier_min": 0.3, 8 | "multiplier_max": 3 9 | }, 10 | { 11 | "name": "hook_mass", 12 | "default": 4, 13 | "multiplier_min": 0.6, 14 | "multiplier_max": 4 15 | }, 16 | { 17 | "name": "friction", 18 | "default": 1, 19 | "multiplier_min": 0.8, 20 | "multiplier_max": 2.3 21 | } 22 | ] 23 | } 24 | 25 | -------------------------------------------------------------------------------- /common/randomizer/config/ResidualFetchHookRandomized/random.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ResidualHookRandomizedEnv-v0", 3 | "dimensions": [ 4 
| { 5 | "name": "block_mass", 6 | "default": 2, 7 | "multiplier_min": 0.3, 8 | "multiplier_max": 4 9 | }, 10 | { 11 | "name": "hook_mass", 12 | "default": 4, 13 | "multiplier_min": 0.6, 14 | "multiplier_max": 2.5 15 | }, 16 | { 17 | "name": "friction", 18 | "default": 1, 19 | "multiplier_min": 0.5, 20 | "multiplier_max": 1.2 21 | } 22 | ] 23 | } 24 | 25 | -------------------------------------------------------------------------------- /common/svpg/particles/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch.nn as nn 3 | 4 | 5 | class AddBias(nn.Module): 6 | def __init__(self, bias): 7 | super(AddBias, self).__init__() 8 | self._bias = nn.Parameter(bias.unsqueeze(1)) 9 | 10 | def forward(self, x): 11 | if x.dim() == 2: 12 | bias = self._bias.t().view(1, -1) 13 | else: 14 | bias = self._bias.t().view(1, -1, 1, 1) 15 | 16 | return x + bias 17 | 18 | 19 | def init(module, weight_init, bias_init, gain=1): 20 | weight_init(module.weight.data, gain=gain) 21 | bias_init(module.bias.data) 22 | return module 23 | 24 | 25 | orthogonal_init = lambda m: init(module=m, 26 | weight_init=nn.init.orthogonal_, 27 | bias_init=lambda x: nn.init.constant_(x, 0), 28 | gain=np.sqrt(2)) 29 | -------------------------------------------------------------------------------- /ergo_expts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --partition=long 3 | #SBATCH --gres=gpu:1 # Ask for 1 GPU 4 | # SBATCH --nodes=5 5 | # SBATCH --ntasks=8 6 | #SBATCH --mem=36G # Ask for 10 GB of RAM 7 | #SBATCH --time=72:00:00 # The job will run for 3 hours 8 | #SBATCH --array=1-5 9 | #SBATCH --mail-user=sharathraparthy@gmail.com 10 | #SBATCH --mail-type=ALL 11 | #SBATCH --mail-type=BEGIN 12 | #SBATCH --mail-type=END 13 | #SBATCH --mail-type=FAIL 14 | #SBATCH --mail-type=REQUEUE 15 | #SBATCH -o /network/home/raparths/onlysp-reacher-adr-ergo-%j.out 16 | 17 | module load python/3.6 18 | # module load mujoco/2.0 19 | module load cuda/10.0 20 | source $HOME/sharath/bin/activate 21 | 22 | python ddpg_train.py --sp-percent 1.0 --approach 'unsupervised-default' --only-sp=True --env_name='ErgoReacherRandomizedEnv-Headless-v0' --n-params=8 23 | -------------------------------------------------------------------------------- /common/randomizer/randomized_fetchslide.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | from gym import utils 5 | from .randomized_fetch import RandomizedFetchEnv 6 | 7 | 8 | # Ensure we get the path separator correct on windows 9 | MODEL_XML_PATH = os.path.join('fetch', 'slide.xml') 10 | 11 | 12 | class FetchSlideRandomizedEnv(RandomizedFetchEnv, utils.EzPickle): 13 | def __init__(self, reward_type='sparse', **kwargs): 14 | initial_qpos = { 15 | 'robot0:slide0': 0.05, 16 | 'robot0:slide1': 0.48, 17 | 'robot0:slide2': 0.0, 18 | 'object0:joint': [1.7, 1.1, 0.41, 1., 0., 0., 0.], 19 | } 20 | RandomizedFetchEnv.__init__( 21 | self, MODEL_XML_PATH, has_object=True, block_gripper=True, n_substeps=20, 22 | gripper_extra_height=-0.02, target_in_the_air=False, target_offset=np.array([0.4, 0.0, 0.0]), 23 | obj_range=0.1, target_range=0.3, distance_threshold=0.05, 24 | initial_qpos=initial_qpos, reward_type=reward_type, **kwargs) 25 | -------------------------------------------------------------------------------- /common/randomizer/dimension.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class Dimension(object): 5 | """Class which handles the machinery of randomizing a particular dimension 6 | """ 7 | def __init__(self, default_value, multiplier_min=0.0, multiplier_max=1.0, name=None): 8 | self.default_value = default_value 9 | self.current_value = default_value 10 | self.multiplier_min = multiplier_min 11 | self.multiplier_max = multiplier_max 12 | self.range_min = self.default_value * self.multiplier_min 13 | self.range_max = self.default_value * self.multiplier_max 14 | self.name = name 15 | 16 | def randomize(self): 17 | self.current_value = np.random.uniform(low=self.range_min, high=self.range_max) 18 | 19 | def rescale(self, value): 20 | return self.range_min + (self.range_max - self.range_min) * value 21 | 22 | def reset(self): 23 | self.current_value = self.default_value 24 | 25 | def set(self, value): 26 | self.current_value = value 27 | 28 | -------------------------------------------------------------------------------- /common/models/actor_critic.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | 6 | class Actor(nn.Module): 7 | def __init__(self, state_dim, action_dim, max_action): 8 | super(Actor, self).__init__() 9 | 10 | self.l1 = nn.Linear(state_dim, 400) 11 | self.l2 = nn.Linear(400, 300) 12 | self.l3 = nn.Linear(300, action_dim) 13 | 14 | self.max_action = max_action 15 | 16 | def forward(self, x): 17 | x = F.relu(self.l1(x)) 18 | x = F.relu(self.l2(x)) 19 | x = self.max_action * torch.tanh(self.l3(x)) 20 | return x 21 | 22 | class Critic(nn.Module): 23 | def __init__(self, state_dim, action_dim): 24 | super(Critic, self).__init__() 25 | 26 | self.l1 = nn.Linear(state_dim + action_dim, 400) 27 | self.l2 = nn.Linear(400, 300) 28 | self.l3 = nn.Linear(300, 1) 29 | 30 | def forward(self, x, u): 31 | x = F.relu(self.l1(torch.cat([x, u], 1))) 32 | x = F.relu(self.l2(x)) 33 | x = self.l3(x) 34 | return x 35 | -------------------------------------------------------------------------------- /common/randomizer/randomized_fetchpush.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from gym import utils 4 | from gym.envs.mujoco import mujoco_env 5 | 6 | from .randomized_fetch import RandomizedFetchEnv 7 | 8 | # Ensure we get the path separator correct on windows 9 | MODEL_XML_PATH = os.path.join('fetch', 'push.xml') 10 | 11 | class FetchPushRandomizedEnv(RandomizedFetchEnv, utils.EzPickle): 12 | def __init__(self, reward_type='dense', **kwargs): 13 | initial_qpos = { 14 | 'robot0:slide0': 0.405, 15 | 'robot0:slide1': 0.48, 16 | 'robot0:slide2': 0.0, 17 | 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.], 18 | } 19 | 20 | RandomizedFetchEnv.__init__(self, MODEL_XML_PATH, has_object=True, block_gripper=True, n_substeps=20, 21 | gripper_extra_height=0.0, target_in_the_air=False, target_offset=0.0, 22 | obj_range=0.15, target_range=0.15, distance_threshold=0.05, 23 | initial_qpos=initial_qpos, reward_type=reward_type, **kwargs) 24 | 25 | utils.EzPickle.__init__(self) -------------------------------------------------------------------------------- /common/randomizer/randomized_residual_push.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | from gym import utils 4 | from gym.envs.mujoco 
import mujoco_env 5 | 6 | from .residual_push import RandomizedResidualPushEnv 7 | 8 | # Ensure we get the path separator correct on windows 9 | MODEL_XML_PATH = os.path.join('fetch', 'push.xml') 10 | 11 | # RANDOMIZED RESIDUAL PUSH ENVIRONMENT 12 | class ResidualPushRandomizedEnv(RandomizedResidualPushEnv, utils.EzPickle): 13 | def __init__(self, reward_type='sparse', **kwargs): 14 | initial_qpos = { 15 | 'robot0:slide0': 0.405, 16 | 'robot0:slide1': 0.48, 17 | 'robot0:slide2': 0.0, 18 | 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.], 19 | } 20 | RandomizedResidualPushEnv.__init__(self, MODEL_XML_PATH, has_object=True, block_gripper=True, n_substeps=20, 21 | gripper_extra_height=0.0, target_in_the_air=False, target_offset=0.0, 22 | obj_range=0.15, target_range=0.15, distance_threshold=0.05, 23 | initial_qpos=initial_qpos, reward_type=reward_type, **kwargs) 24 | 25 | utils.EzPickle.__init__(self) -------------------------------------------------------------------------------- /common/randomizer/config/ErgoReacherRandomized/default.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ErgoReacherDefault-v0", 3 | "dimensions": [ 4 | { 5 | "name": "joint0gain", 6 | "default": 1, 7 | "multiplier_min": 1.0, 8 | "multiplier_max": 1.0 9 | }, 10 | { 11 | "name": "joint1gain", 12 | "default": 1, 13 | "multiplier_min": 1.0, 14 | "multiplier_max": 1.0 15 | }, 16 | { 17 | "name": "joint2gain", 18 | "default": 1, 19 | "multiplier_min": 1.0, 20 | "multiplier_max": 1.0 21 | }, 22 | { 23 | "name": "joint3gain", 24 | "default": 1, 25 | "multiplier_min": 1.0, 26 | "multiplier_max": 1.0 27 | }, 28 | { 29 | "name": "joint0maxtorque", 30 | "default": 18, 31 | "multiplier_min": 1.0, 32 | "multiplier_max": 1.0 33 | }, 34 | { 35 | "name": "joint1maxtorque", 36 | "default": 18, 37 | "multiplier_min": 1.0, 38 | "multiplier_max": 1.0 39 | }, 40 | { 41 | "name": "joint2maxtorque", 42 | "default": 18, 43 | "multiplier_min": 1.0, 44 | "multiplier_max": 1.0 45 | }, 46 | { 47 | "name": "joint3maxtorque", 48 | "default": 18, 49 | "multiplier_min": 1.0, 50 | "multiplier_max": 1.0 51 | } 52 | ] 53 | } -------------------------------------------------------------------------------- /common/randomizer/config/ErgoReacherRandomized/random.json: -------------------------------------------------------------------------------- 1 | { 2 | "env": "ErgoReacherRandomized-v0", 3 | "dimensions": [ 4 | { 5 | "name": "joint0gain", 6 | "default": 1, 7 | "multiplier_min": 0.3, 8 | "multiplier_max": 2.0 9 | }, 10 | { 11 | "name": "joint1gain", 12 | "default": 1, 13 | "multiplier_min": 0.3, 14 | "multiplier_max": 2.0 15 | }, 16 | { 17 | "name": "joint2gain", 18 | "default": 1, 19 | "multiplier_min": 0.3, 20 | "multiplier_max": 2.0 21 | }, 22 | { 23 | "name": "joint3gain", 24 | "default": 1, 25 | "multiplier_min": 0.3, 26 | "multiplier_max": 2.0 27 | }, 28 | { 29 | "name": "joint0maxtorque", 30 | "default": 18, 31 | "multiplier_min": 1.0, 32 | "multiplier_max": 4.0 33 | }, 34 | { 35 | "name": "joint1maxtorque", 36 | "default": 18, 37 | "multiplier_min": 1.0, 38 | "multiplier_max": 4.0 39 | }, 40 | { 41 | "name": "joint2maxtorque", 42 | "default": 18, 43 | "multiplier_min": 1.0, 44 | "multiplier_max": 4.0 45 | }, 46 | { 47 | "name": "joint3maxtorque", 48 | "default": 18, 49 | "multiplier_min": 1.0, 50 | "multiplier_max": 4.0 51 | } 52 | ] 53 | } -------------------------------------------------------------------------------- /common/randomizer/config/ErgoReacherRandomized/hard.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "env": "ErgoReacherRandomizedHard-v0", 3 | "dimensions": [ 4 | { 5 | "name": "joint0gain", 6 | "default": 0.25, 7 | "multiplier_min": 1.0, 8 | "multiplier_max": 1.0 9 | }, 10 | { 11 | "name": "joint1gain", 12 | "default": 0.25, 13 | "multiplier_min": 1.0, 14 | "multiplier_max": 1.0 15 | }, 16 | { 17 | "name": "joint2gain", 18 | "default": 0.25, 19 | "multiplier_min": 1.0, 20 | "multiplier_max": 1.0 21 | }, 22 | { 23 | "name": "joint3gain", 24 | "default": 0.25, 25 | "multiplier_min": 1.0, 26 | "multiplier_max": 1.0 27 | }, 28 | { 29 | "name": "joint0maxtorque", 30 | "default": 5.0, 31 | "multiplier_min": 1.0, 32 | "multiplier_max": 1.0 33 | }, 34 | { 35 | "name": "joint1maxtorque", 36 | "default": 5.0, 37 | "multiplier_min": 1.0, 38 | "multiplier_max": 1.0 39 | }, 40 | { 41 | "name": "joint2maxtorque", 42 | "default": 5.0, 43 | "multiplier_min": 1.0, 44 | "multiplier_max": 1.0 45 | }, 46 | { 47 | "name": "joint3maxtorque", 48 | "default": 5.0, 49 | "multiplier_min": 1.0, 50 | "multiplier_max": 1.0 51 | } 52 | ] 53 | } -------------------------------------------------------------------------------- /common/randomizer/assets/push.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /common/randomizer/assets/slide.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /utils/learning_curve.py: -------------------------------------------------------------------------------- 1 | from arguments import get_args 2 | import numpy as np 3 | import os 4 | import matplotlib.pyplot as plt 5 | from utils.plot_utils import learning_curve 6 | 7 | if __name__=='__main__': 8 | args = get_args() 9 | # load the model param 10 | selfplay_percent = [0.0, 1.0, 1.0] 11 | x_gen_labels = np.linspace(0, 199, 199) 12 | approach = ["udr", "adr", "unsupervised-default"] 13 | PLOTCOLORS = ['hotpink', 'red', 'darkolivegreen', 'hotpink', 'blue'] 14 | save_plots = os.getcwd() + f'/plots/{args.env_name}/' 15 | if not os.path.isdir(save_plots): 16 | os.makedirs(save_plots) 17 | plt.rcParams["figure.figsize"] = (10, 6) 18 | SEED = [42, 41, 44, 43] 19 | 20 | for selfplay_index, a in enumerate(approach): 21 | mean, std = learning_curve(args, selfplay_percent, selfplay_index, a, SEED) 22 | mean = np.reshape(mean, (-1)) 23 | std = np.reshape(std, (-1)) 24 | plt.plot(x_gen_labels, mean, label=f'{a} - sp{selfplay_percent[selfplay_index]}', alpha=0.7) 25 | plt.fill_between(x_gen_labels, mean + std/2, mean - std/2, alpha=0.2) 26 | plt.title(f'Learning Curve for {args.env_name} | Mode : {args.mode}') 27 | plt.xlabel("Number of evaluations | 1 eval per 5000 timesteps") 28 | plt.ylabel("Average Distance") 29 | plt.ylim(0, 0.3) 30 | plt.xlim(0, 200) 31 | plt.legend() 32 | plt.savefig(f'{save_plots}/{args.mode}_generalization_sp{selfplay_percent[1]}.png', figsize=(10, 6)) 33 | plt.show() -------------------------------------------------------------------------------- /common/agents/ddpg/replay_buffer.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | class ReplayBuffer(object): 6 | 7 | def __init__(self, state_dim, action_dim, max_size=int(1e6)): 8 | self.max_size = max_size 9 | self.ptr = 0 10 | self.size = 0 11 | 12 | self.state = np.zeros((max_size, state_dim)) 13 | self.action = np.zeros((max_size, action_dim)) 14 | self.next_state = np.zeros((max_size, state_dim)) 15 | self.reward = np.zeros((max_size, 1)) 16 | self.not_done = np.zeros((max_size, 1)) 17 | 18 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 19 | 20 | def add(self, state, action, next_state, reward, done): 21 | self.state[self.ptr] = state 22 | self.action[self.ptr] = action 23 | self.next_state[self.ptr] = next_state 24 | self.reward[self.ptr] = reward 25 | self.not_done[self.ptr] = 1. - done 26 | 27 | self.ptr = (self.ptr + 1) % self.max_size 28 | self.size = min(self.size + 1, self.max_size) 29 | 30 | def sample(self, batch_size): 31 | ind = np.random.randint(0, self.size, size=batch_size) 32 | 33 | return ( 34 | torch.FloatTensor(self.state[ind]).to(self.device), 35 | torch.FloatTensor(self.action[ind]).to(self.device), 36 | torch.FloatTensor(self.next_state[ind]).to(self.device), 37 | torch.FloatTensor(self.reward[ind]).to(self.device), 38 | torch.FloatTensor(self.not_done[ind]).to(self.device) 39 | ) -------------------------------------------------------------------------------- /common/models/discriminator.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class MLPDiscriminator(nn.Module): 5 | """Discriminator class based on Feedforward Network 6 | Input is a state-action-state' transition 7 | Output is probability that it was from a reference trajectory 8 | """ 9 | def __init__(self, state_dim, action_dim): 10 | super(MLPDiscriminator, self).__init__() 11 | 12 | self.l1 = nn.Linear((state_dim + action_dim + state_dim), 128) 13 | self.l2 = nn.Linear(128, 128) 14 | self.logic = nn.Linear(128, 1) 15 | 16 | self.logic.weight.data.mul_(0.1) 17 | self.logic.bias.data.mul_(0.0) 18 | 19 | # Tuple of S-A-S' 20 | def forward(self, x): 21 | x = torch.tanh(self.l1(x)) 22 | x = torch.tanh(self.l2(x)) 23 | x = self.logic(x) 24 | return torch.sigmoid(x) 25 | 26 | class GAILMLPDiscriminator(nn.Module): 27 | """Discriminator class based on Feedforward Network 28 | Input is a state-action-state' transition 29 | Output is probability that it was from a reference trajectory 30 | """ 31 | def __init__(self, state_dim, action_dim): 32 | super(GAILMLPDiscriminator, self).__init__() 33 | self.l1 = nn.Linear((state_dim + action_dim), 128) 34 | self.l2 = nn.Linear(128, 128) 35 | self.logic = nn.Linear(128, 1) 36 | 37 | self.logic.weight.data.mul_(0.1) 38 | self.logic.bias.data.mul_(0.0) 39 | 40 | # Tuple of S-A-S' 41 | def forward(self, x): 42 | x = torch.tanh(self.l1(x)) 43 | x = torch.tanh(self.l2(x)) 44 | x = self.logic(x) 45 | return torch.sigmoid(x) -------------------------------------------------------------------------------- /common/randomizer/randomized_ergo_pusher.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import gym_ergojr 3 | from gym_ergojr.envs.ergo_pusher_env import ErgoPusherEnv 4 | 5 | def goal_distance(goal_a, goal_b): 6 | assert goal_a.shape == goal_b.shape 7 | return np.linalg.norm(goal_a - goal_b, axis=-1) 8 | 9 | 10 | class 
ErgoPusherRandomizedEnv(ErgoPusherEnv): 11 | def __init__(self, **kwargs): 12 | self.dimensions = [] 13 | self.config_file = kwargs.get('config') 14 | del kwargs['config'] 15 | 16 | super().__init__(**kwargs) 17 | self.reward_type = 'sparse' 18 | self.distance_threshold = 0.05 19 | 20 | # # this is affected by the DR 21 | # self.puck.friction 22 | 23 | def step(self, action): 24 | obs, reward, done, info = super().step(action) 25 | observation = {"observation": obs, 26 | "achieved_goal": obs[6:8], 27 | "desired_goal": obs[8:]} 28 | return observation, reward, done, info 29 | 30 | def reset(self, forced=False): 31 | obs = super(ErgoPusherRandomizedEnv, self).reset() 32 | observation = {"observation": obs, 33 | "achieved_goal": obs[6:8], 34 | "desired_goal": obs[8:]} 35 | return observation 36 | 37 | def update_randomized_params(self): 38 | 39 | self.puck.friction = self.dimensions[0].current_value 40 | 41 | def compute_reward(self, achieved_goal, goal, info): 42 | # Compute distance between goal and the achieved goal. 43 | d = goal_distance(achieved_goal, goal) 44 | if self.reward_type == 'sparse': 45 | return -(d > self.distance_threshold).astype(np.float32) 46 | else: 47 | return -d -------------------------------------------------------------------------------- /common/randomizer/randomized_ergoreacher.py: -------------------------------------------------------------------------------- 1 | from gym_ergojr.envs import ErgoReacherEnv 2 | import numpy as np 3 | 4 | 5 | class ErgoReacherRandomizedEnv(ErgoReacherEnv): 6 | def __init__(self, **kwargs): 7 | self.dimensions = [] # this will be 8 elements long after wrapper init 8 | self.config_file = kwargs.get('config') 9 | 10 | del kwargs['config'] 11 | 12 | super().__init__(**kwargs) 13 | 14 | # # these two are affected by the DR 15 | # self.max_force 16 | # self.max_vel 17 | 18 | def step(self, action): 19 | obs, reward, done, info = super().step(action) 20 | info = {'distance': self.dist.query()} 21 | observation = {"observation": obs, 22 | "achieved_goal": np.asarray(self.get_tip()[0][1:]), 23 | "desired_goal": obs[8:]} 24 | return observation, reward, done, info 25 | 26 | def reset(self, forced=False): 27 | obs = super(ErgoReacherRandomizedEnv, self).reset() 28 | observation = {"observation": obs, 29 | "achieved_goal": np.asarray(self.get_tip()[0][1:]), 30 | "desired_goal": obs[8:]} 31 | return observation 32 | 33 | def update_randomized_params(self): 34 | # these are used automatically in the `step` function 35 | self.max_force = np.zeros(6, np.float32) 36 | self.max_vel = np.zeros(6, np.float32) 37 | 38 | if self.simple: 39 | self.max_force[[1, 2, 4, 5]] = [x.current_value for x in self.dimensions[:4]] 40 | self.max_vel[[1, 2, 4, 5]] = [x.current_value for x in self.dimensions[4:]] 41 | else: 42 | self.max_force[:] = [x.current_value for x in self.dimensions[:6]] 43 | self.max_vel[:] = [x.current_value for x in self.dimensions[6:]] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | saved-models/ 2 | .idea/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | *.egg-info/ 27 | .installed.cfg 28 | *.egg 29 | MANIFEST 30 | 31 | # PyInstaller 
32 | # Usually these files are written by a python script from a template 33 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 34 | *.manifest 35 | *.spec 36 | 37 | # Installer logs 38 | pip-log.txt 39 | pip-delete-this-directory.txt 40 | 41 | # Unit test / coverage reports 42 | htmlcov/ 43 | .tox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | .DS_Store 109 | 110 | *.pth 111 | *.pt 112 | *.npy 113 | -------------------------------------------------------------------------------- /common/randomizer/assets/hook.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | -------------------------------------------------------------------------------- /common/svpg/particles/distributions.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from common.svpg.particles.utils import AddBias, init 6 | 7 | """ 8 | Modify standard PyTorch distributions so they are compatible with this code. 
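Concretely, both Categorical and Normal gain .log_probs() and .mode() helpers with consistent shapes (and Normal's entropy is summed over the action dimension), so the SVPG particles can treat discrete and continuous action spaces through the same interface.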
9 | """ 10 | 11 | # Categorical 12 | FixedCategorical = torch.distributions.Categorical 13 | 14 | old_sample = FixedCategorical.sample 15 | FixedCategorical.sample = lambda self: old_sample(self).unsqueeze(-1) 16 | 17 | log_prob_cat = FixedCategorical.log_prob 18 | FixedCategorical.log_probs = lambda self, actions: log_prob_cat(self, actions.squeeze(-1)).view(actions.size(0), -1).sum(-1).unsqueeze(-1) 19 | 20 | FixedCategorical.mode = lambda self: self.probs.argmax(dim=-1, keepdim=True) 21 | 22 | 23 | # Normal 24 | FixedNormal = torch.distributions.Normal 25 | 26 | log_prob_normal = FixedNormal.log_prob 27 | FixedNormal.log_probs = lambda self, actions: log_prob_normal(self, actions).sum(-1, keepdim=True) 28 | 29 | normal_entropy = FixedNormal.entropy 30 | FixedNormal.entropy = lambda self: normal_entropy(self).sum(-1) 31 | 32 | FixedNormal.mode = lambda self: self.mean 33 | 34 | 35 | class Categorical(nn.Module): 36 | def __init__(self, num_inputs, num_outputs): 37 | super(Categorical, self).__init__() 38 | 39 | init_ = lambda m: init(m, 40 | nn.init.orthogonal_, 41 | lambda x: nn.init.constant_(x, 0), 42 | gain=0.01) 43 | 44 | self.linear = init_(nn.Linear(num_inputs, num_outputs)) 45 | 46 | def forward(self, x): 47 | x = self.linear(x) 48 | return FixedCategorical(logits=x) 49 | 50 | 51 | class DiagGaussian(nn.Module): 52 | def __init__(self, num_inputs, num_outputs): 53 | super(DiagGaussian, self).__init__() 54 | 55 | init_ = lambda m: init(m, 56 | nn.init.orthogonal_, 57 | lambda x: nn.init.constant_(x, 0)) 58 | 59 | self.fc_mean = init_(nn.Linear(num_inputs, num_outputs)) 60 | self.logstd = AddBias(torch.zeros(num_outputs)) 61 | 62 | def forward(self, x): 63 | action_mean = self.fc_mean(x) 64 | 65 | # An ugly hack for my KFAC implementation. 66 | zeros = torch.zeros(action_mean.size()) 67 | if x.is_cuda: 68 | zeros = zeros.cuda() 69 | 70 | action_logstd = self.logstd(zeros) 71 | return FixedNormal(action_mean, action_logstd.exp()) -------------------------------------------------------------------------------- /common/randomizer/controllers/miscalibrated_push_controller.py: -------------------------------------------------------------------------------- 1 | from .pick_and_place_controller import get_move_action 2 | 3 | import numpy as np 4 | 5 | DEBUG = False 6 | 7 | 8 | def get_push_control(obs, atol=1e-3, block_width=0.15, workspace_height=0.1): 9 | 10 | gripper_position = obs['observation'][:3] 11 | block_position = obs['observation'][3:6] 12 | goal = obs['desired_goal'] 13 | 14 | desired_block_angle = np.arctan2(goal[0] - block_position[0], goal[1] - block_position[1]) 15 | gripper_angle = np.arctan2(goal[0] - gripper_position[0], goal[1] - gripper_position[1]) 16 | 17 | push_position = block_position.copy() 18 | push_position[0] += -1. * np.sin(desired_block_angle) * block_width / 2. 19 | push_position[1] += -1. * np.cos(desired_block_angle) * block_width / 2. 
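# The x/y offsets above place the push point half a block-width behind the
# block, on the side opposite the goal, so pushing from there drives the block
# toward the goal; the small z offset below keeps the target slightly above
# the block's centre height.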
20 | push_position[2] += 0.005 21 | 22 | # If the block is already at the place position, do nothing 23 | if np.sum(np.subtract(block_position, goal)**2) < atol: 24 | if DEBUG: 25 | print("The block is already at the place position; do nothing") 26 | return np.array([0., 0., 0., 0.]) 27 | 28 | # Angle between gripper and goal vs block and goal is roughly the same 29 | angle_diff = abs((desired_block_angle - gripper_angle + np.pi) % (2*np.pi) - np.pi) 30 | 31 | gripper_sq_distance = (gripper_position[0] - goal[0])**2 + (gripper_position[1] - goal[1])**2 32 | block_sq_distance = (block_position[0] - goal[0])**2 + (block_position[1] - goal[1])**2 33 | 34 | if (gripper_position[2] - push_position[2])**2 < atol and angle_diff < np.pi/4 and block_sq_distance < gripper_sq_distance: 35 | 36 | # Push towards the goal 37 | target_position = goal 38 | target_position[2] = gripper_position[2] 39 | if DEBUG: 40 | print("Push") 41 | return get_move_action(obs, target_position, atol=atol, gain=5.0) 42 | 43 | # If the gripper is above the push position 44 | if (gripper_position[0] - push_position[0])**2 + (gripper_position[1] - push_position[1])**2 < atol: 45 | 46 | # Move down to prepare for push 47 | if DEBUG: 48 | print("Move down to prepare for push") 49 | return get_move_action(obs, push_position, atol=atol) 50 | 51 | 52 | # Else move the gripper to above the push 53 | target_position = push_position.copy() 54 | target_position[2] += workspace_height 55 | if DEBUG: 56 | print("Move to above the push position") 57 | 58 | return get_move_action(obs, target_position, atol=atol) 59 | 60 | 61 | -------------------------------------------------------------------------------- /common/randomizer/miscalibrated_push_controller.py: -------------------------------------------------------------------------------- 1 | from .controllers.pick_and_place_controller import get_move_action 2 | 3 | import numpy as np 4 | 5 | DEBUG = False 6 | 7 | 8 | def get_push_control(obs, atol=1e-3, block_width=0.15, workspace_height=0.1): 9 | 10 | gripper_position = obs['observation'][:3] 11 | block_position = obs['observation'][3:6] 12 | goal = obs['desired_goal'] 13 | 14 | desired_block_angle = np.arctan2(goal[0] - block_position[0], goal[1] - block_position[1]) 15 | gripper_angle = np.arctan2(goal[0] - gripper_position[0], goal[1] - gripper_position[1]) 16 | 17 | push_position = block_position.copy() 18 | push_position[0] += -1. * np.sin(desired_block_angle) * block_width / 2. 19 | push_position[1] += -1. * np.cos(desired_block_angle) * block_width / 2. 
20 | push_position[2] += 0.005 21 | 22 | # If the block is already at the place position, do nothing 23 | if np.sum(np.subtract(block_position, goal)**2) < atol: 24 | if DEBUG: 25 | print("The block is already at the place position; do nothing") 26 | return np.array([0., 0., 0., 0.]) 27 | 28 | # Angle between gripper and goal vs block and goal is roughly the same 29 | angle_diff = abs((desired_block_angle - gripper_angle + np.pi) % (2*np.pi) - np.pi) 30 | 31 | gripper_sq_distance = (gripper_position[0] - goal[0])**2 + (gripper_position[1] - goal[1])**2 32 | block_sq_distance = (block_position[0] - goal[0])**2 + (block_position[1] - goal[1])**2 33 | 34 | if (gripper_position[2] - push_position[2])**2 < atol and angle_diff < np.pi/4 and block_sq_distance < gripper_sq_distance: 35 | 36 | # Push towards the goal 37 | target_position = goal 38 | target_position[2] = gripper_position[2] 39 | if DEBUG: 40 | print("Push") 41 | return get_move_action(obs, target_position, atol=atol, gain=5.0) 42 | 43 | # If the gripper is above the push position 44 | if (gripper_position[0] - push_position[0])**2 + (gripper_position[1] - push_position[1])**2 < atol: 45 | 46 | # Move down to prepare for push 47 | if DEBUG: 48 | print("Move down to prepare for push") 49 | return get_move_action(obs, push_position, atol=atol) 50 | 51 | 52 | # Else move the gripper to above the push 53 | target_position = push_position.copy() 54 | target_position[2] += workspace_height 55 | if DEBUG: 56 | print("Move to above the push position") 57 | 58 | return get_move_action(obs, target_position, atol=atol) 59 | 60 | 61 | -------------------------------------------------------------------------------- /common/svpg/particles/svpg_particle.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from .utils import orthogonal_init 6 | from .distributions import Categorical, DiagGaussian 7 | 8 | 9 | class SVPGParticleCritic(nn.Module): 10 | def __init__(self, input_dim, output_dim, hidden_dim): 11 | super(SVPGParticleCritic, self).__init__() 12 | 13 | self.critic = nn.Sequential( 14 | orthogonal_init(nn.Linear(input_dim, hidden_dim)), 15 | nn.Tanh(), 16 | orthogonal_init(nn.Linear(hidden_dim, hidden_dim)), 17 | nn.Tanh(), 18 | orthogonal_init(nn.Linear(hidden_dim, 1)) 19 | ) 20 | 21 | def forward(self, x): 22 | return self.critic(x) 23 | 24 | class SVPGParticleActorBase(nn.Module): 25 | def __init__(self, input_dim, hidden_dim): 26 | super(SVPGParticleActorBase, self).__init__() 27 | 28 | self.actor_hidden = nn.Sequential( 29 | orthogonal_init(nn.Linear(input_dim, hidden_dim)), 30 | nn.Tanh(), 31 | orthogonal_init(nn.Linear(hidden_dim, hidden_dim)), 32 | nn.Tanh(), 33 | ) 34 | 35 | def forward(self, x): 36 | return self.actor_hidden(x) 37 | 38 | 39 | class SVPGParticle(nn.Module): 40 | """Implements a AC architecture for a Discrete Advantage 41 | Actor Critic Policy, used inside of SVPG 42 | """ 43 | def __init__(self, input_dim, output_dim, hidden_dim, discrete, freeze=False): 44 | super(SVPGParticle, self).__init__() 45 | 46 | self.critic = SVPGParticleCritic(input_dim, output_dim, hidden_dim) 47 | self.actor_hidden = SVPGParticleActorBase(input_dim, hidden_dim) 48 | 49 | if discrete: 50 | self.dist = Categorical(hidden_dim, output_dim) 51 | else: 52 | self.dist = DiagGaussian(hidden_dim, output_dim) 53 | 54 | if freeze: 55 | self.freeze() 56 | 57 | self.reset() 58 | 59 | def forward(self, x): 60 | actor_hidden = 
self.actor_hidden(x) 61 | dist = self.dist(actor_hidden) 62 | value = self.critic(x) 63 | 64 | return dist, value 65 | 66 | def freeze(self): 67 | for param in self.critic.parameters(): 68 | param.requires_grad = False 69 | 70 | for param in self.actor_hidden.parameters(): 71 | param.requires_grad = False 72 | 73 | for param in self.dist.parameters(): 74 | param.requires_grad = False 75 | 76 | def reset(self): 77 | self.saved_log_probs = [] 78 | self.saved_klds = [] 79 | self.rewards = [] -------------------------------------------------------------------------------- /common/randomizer/ergo_objects.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import gym 4 | import numpy as np 5 | from gym import spaces 6 | from tqdm import tqdm 7 | from gym_ergojr.sim.objects import Puck 8 | import pybullet as p 9 | 10 | PUSHER_GOAL_X = [-.2, -.1] # Get three points 11 | PUSHER_GOAL_Y = [-.1, .05] # Get three points 12 | # Define a circle equation with three points. After that choose four equidistant points on that circle. 13 | 14 | circle1 = np.array([[-0.1, -0.1], 15 | [-0.15, -0.1], 16 | [-0.1, 0.05]]) 17 | shift = [-0.029, -0.0475, -0.066] 18 | 19 | 20 | def define_circle(p1, p2, p3): 21 | temp = p2[0] * p2[0] + p2[1] * p2[1] 22 | bc = (p1[0] * p1[0] + p1[1] * p1[1] - temp) / 2 23 | cd = (temp - p3[0] * p3[0] - p3[1] * p3[1]) / 2 24 | det = (p1[0] - p2[0]) * (p2[1] - p3[1]) - (p2[0] - p3[0]) * (p1[1] - p2[1]) 25 | 26 | if abs(det) < 1.0e-6: 27 | return (None, np.inf) 28 | 29 | # Center of circle 30 | cx = (bc*(p2[1] - p3[1]) - cd*(p1[1] - p2[1])) / det 31 | cy = ((p1[0] - p2[0]) * cd - (p2[0] - p3[0]) * bc) / det 32 | 33 | radius = np.sqrt((cx - p1[0])**2 + (cy - p1[1])**2) 34 | return ((cx, cy), radius) 35 | 36 | 37 | def angle(line1, line2): 38 | return np.arccos(np.dot(line1, line2)/(np.linalg.norm(line1)*np.linalg.norm(line2))) 39 | 40 | 41 | goal_pos = np.asarray([[-0.154, 0.0, 0], [-0.15, 0.032, 0], [-0.15, -0.032, 0], [-0.137, 0.062, 0], 42 | [-0.137, -0.062, 0], [-0.117, 0.088, 0], [-0.117, -0.088, 0], [-0.172, 0.0, 0], 43 | [-0.168, 0.032, 0], [-0.168, -0.032, 0],[-0.155, 0.062, 0],[-0.155, -0.062, 0], [-0.135, 0.088, 0], 44 | [-0.135, -0.088, 0], [-0.191, 0.0, 0], [-0.191, -0.0, 0], [-0.186, 0.032, 0], [-0.186, -0.032, 0], 45 | [-0.174, 0.062, 0], [-0.174, -0.062, 0], [-0.154, 0.088, 0], [-0.154, -0.08, 0]]) # Don't hard code. 
Change afterwards 46 | 47 | puck_positions = np.array([[-0.07, 0.05], 48 | [-0.85, 0.65], 49 | [-0.10, 0.08]]) 50 | class PuckEval(Puck): 51 | def __init__(self, friction=0.4): 52 | super(PuckEval, self).__init__() 53 | self.goal = None 54 | 55 | def reset(self): 56 | super().reset() 57 | 58 | def hard_reset(self): 59 | super().hard_reset() 60 | 61 | def add_target(self): 62 | self.dbo.goal = goal_pos[int(os.environ['goal_index'])] 63 | self.obj_visual = p.createVisualShape( 64 | p.GEOM_CYLINDER, radius=0.02, length=0.01, rgbaColor=[0, 1, 0, 1]) 65 | self.target = p.createMultiBody( 66 | baseVisualShapeIndex=self.obj_visual, basePosition=self.dbo.goal) 67 | 68 | -------------------------------------------------------------------------------- /common/randomizer/wrappers.py: -------------------------------------------------------------------------------- 1 | import gym 2 | import json 3 | import numpy as np 4 | 5 | import gym.spaces as spaces 6 | 7 | from common.randomizer.dimension import Dimension 8 | 9 | 10 | class RandomizedEnvWrapper(gym.Wrapper): 11 | """Creates a randomization-enabled enviornment, which can change 12 | physics / simulation parameters without relaunching everything 13 | """ 14 | 15 | def __init__(self, env, seed): 16 | super(RandomizedEnvWrapper, self).__init__(env) 17 | self.config_file = self.unwrapped.config_file 18 | 19 | self._load_randomization_dimensions(seed) 20 | self.unwrapped.update_randomized_params() 21 | self.randomized_default = ['random'] * len(self.unwrapped.dimensions) 22 | 23 | def _load_randomization_dimensions(self, seed): 24 | """ Helper function to load environment defaults ranges 25 | """ 26 | self.unwrapped.dimensions = [] 27 | 28 | with open(self.config_file, mode='r') as f: 29 | config = json.load(f) 30 | 31 | for dimension in config['dimensions']: 32 | self.unwrapped.dimensions.append( 33 | Dimension( 34 | default_value=dimension['default'], 35 | multiplier_min=dimension['multiplier_min'], 36 | multiplier_max=dimension['multiplier_max'], 37 | name=dimension['name'] 38 | ) 39 | ) 40 | 41 | nrand = len(self.unwrapped.dimensions) 42 | self.unwrapped.randomization_space = spaces.Box(0, 1, shape=(nrand,), dtype=np.float32) 43 | 44 | def randomize(self, randomized_values=[-1]): 45 | """Sets the parameter values such that a call to`update_randomized_params()` 46 | will generate an environment with those settings. 47 | 48 | Passing a list of 'default' strings will give the default value 49 | Passing a list of 'random' strings will give a purely random value for that dimension 50 | Passing a list of -1 integers will have the same effect. 
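Passing floats in [0, 1] rescales each dimension linearly between its
configured range_min and range_max (see Dimension.rescale).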
51 | """ 52 | for dimension, randomized_value in enumerate(randomized_values): 53 | if randomized_value == 'default': 54 | self.unwrapped.dimensions[dimension].current_value = \ 55 | self.unwrapped.dimensions[dimension].default_value 56 | elif randomized_value != 'random' and randomized_value != -1: 57 | assert 0.0 <= randomized_value <= 1.0, "using incorrect: {}".format(randomized_value) 58 | self.unwrapped.dimensions[dimension].current_value = \ 59 | self.unwrapped.dimensions[dimension].rescale(randomized_value) 60 | else: # random 61 | self.unwrapped.dimensions[dimension].randomize() 62 | 63 | self.unwrapped.update_randomized_params() 64 | 65 | def step(self, action): 66 | return self.env.step(action) 67 | 68 | def reset(self, **kwargs): 69 | return self.env.reset(**kwargs) -------------------------------------------------------------------------------- /experiments/evaluate_ergo_envs.py: -------------------------------------------------------------------------------- 1 | import re 2 | from common.randomizer.wrappers import RandomizedEnvWrapper 3 | from locale import atof 4 | import time 5 | import torch 6 | import common.randomizer as randomizer 7 | from arguments import get_args 8 | import gym 9 | import gym_ergojr 10 | import numpy as np 11 | import os.path as osp 12 | import os 13 | import time 14 | from common.agents.ddpg.ddpg import DDPG 15 | 16 | 17 | def atof(text): 18 | try: 19 | retval = float(text) 20 | except ValueError: 21 | retval = text 22 | return retval 23 | 24 | 25 | def natural_keys(text): 26 | return [atof(c) for c in re.split(r'[+-]?([0-9]+(?:[.][0-9]*)?|[.][0-9]+)', text) ] 27 | 28 | 29 | if __name__ == '__main__': 30 | args = get_args() 31 | # load the model param 32 | SEED = [41, 42, 43, 44] 33 | APPROACH = ['udr'] 34 | SP_PERCENT = [0.0] 35 | for i, approach in enumerate(APPROACH): 36 | for seed in SEED: 37 | print('------------------------------------------------------------------------') 38 | print(f'Approach : {approach} | Seed : {seed} | Env : {args.env_name}') 39 | print('------------------------------------------------------------------------') 40 | args.save_dir = osp.join('/saved_models', 'sp{}polyak{}'.format(SP_PERCENT[i], args.sp_polyak) + '-' + approach) 41 | model_path = osp.join(os.getcwd() + args.save_dir, str(seed), args.env_name) 42 | models_path = os.listdir(model_path + '/') 43 | 44 | # List the file names that start with model and sort according to number. 45 | models_path = list(filter(lambda x: x.lower().endswith("actor.pth"), models_path)) 46 | models_path.sort(key=natural_keys) 47 | 48 | env = gym.make(args.env_name) 49 | env = RandomizedEnvWrapper(env, seed=12) 50 | env.randomize(['default'] * args.n_param) 51 | env.seed(args.seed + 100) 52 | eval_episodes = 1 53 | 54 | state_dim = env.observation_space.shape[0] 55 | action_dim = env.action_space.shape[0] 56 | max_action = float(env.action_space.high[0]) 57 | goal_dim = 2 58 | final_dist = [] 59 | policy = DDPG(args, state_dim, action_dim, max_action, goal_dim) 60 | for model in models_path: 61 | print(model) 62 | policy.load(model, model_path) 63 | avg_reward = 0. 
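            # Roll out the loaded checkpoint for `eval_episodes` episodes, accumulating the
            # per-step reward and the end-of-episode distance reported in `info["distance"]`.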
64 | avg_dist = 0 65 | model_dist = [] 66 | for key in range(eval_episodes): 67 | obs = env.reset() 68 | done = False 69 | while not done: 70 | time.sleep(0.01) 71 | action = policy.select_action(np.array(obs, dtype=np.float64)) 72 | obs, reward, done, info = env.step(action) 73 | avg_reward += reward 74 | 75 | avg_dist += info["distance"] 76 | final_dist.append(avg_dist/eval_episodes) 77 | np.save(model_path + f'{args.mode}_evaluation.npy', np.asarray(final_dist)) 78 | 79 | 80 | -------------------------------------------------------------------------------- /utils/plot_utils.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import numpy as np 3 | import os 4 | import os.path as osp 5 | 6 | 7 | def learning_curve(args, selfplay_percent, selplay_index, approach, seed): 8 | ''' 9 | args: arguments 10 | selfplay_percent: list of selfplay percentages (list) 11 | selplay_index: index of particular approach (integer) 12 | approach: list of different approaches ( Ex: ['udr', 'adr', 'unsupervised-default'] 13 | seed: list of seed numbers 14 | return: returns mean and standard deviation across all seeds. 15 | ''' 16 | default_dist = [] 17 | for s in seed: 18 | a = [] 19 | save_dir = osp.join(os.getcwd() + '/' + args.save_dir, "sp" + str(selfplay_percent[selplay_index]) + "polyak" + 20 | str(args.polyak) + '-' + str(approach), str(s) + '/') 21 | array = np.load(save_dir + f'{args.env_name}{args.mode}_evaluation.npy', allow_pickle=True) 22 | for arr in array: 23 | a.append(arr) 24 | array = np.asarray(a) 25 | print(f'Array shape : {array.shape} | Seed : {s} | Approach : {approach} | selplay_index : {selplay_index}') 26 | default_dist.append([array[:199]]) 27 | 28 | mean = np.mean(np.asarray(default_dist), axis=0) 29 | N = args.window_size 30 | mode = 'causal' 31 | mean = smooth(np.reshape(mean, (mean.shape[1])), N, mode=mode) 32 | std = np.std(np.asarray(default_dist), axis=0) 33 | std = smooth(np.reshape(std, (std.shape[1])), N, mode=mode) 34 | 35 | return mean, std 36 | 37 | 38 | def smooth(y, radius, mode='two_sided', valid_only=False): 39 | ''' 40 | Smooth signal y, where radius is determines the size of the window 41 | mode='twosided': 42 | average over the window [max(index - radius, 0), min(index + radius, len(y)-1)] 43 | mode='causal': 44 | average over the window [max(index - radius, 0), index] 45 | valid_only: put nan in entries where the full-sized window is not available 46 | ''' 47 | assert mode in ('two_sided', 'causal') 48 | if len(y) < 2*radius+1: 49 | return np.ones_like(y) * y.mean() 50 | elif mode == 'two_sided': 51 | convkernel = np.ones(2 * radius+1) 52 | out = np.convolve(y, convkernel,mode='same') / np.convolve(np.ones_like(y), convkernel, mode='same') 53 | if valid_only: 54 | out[:radius] = out[-radius:] = np.nan 55 | elif mode == 'causal': 56 | convkernel = np.ones(radius) 57 | out = np.convolve(y, convkernel,mode='full') / np.convolve(np.ones_like(y), convkernel, mode='full') 58 | out = out[:-radius+1] 59 | if valid_only: 60 | out[:radius] = np.nan 61 | return out 62 | 63 | 64 | def sampling_plot(args, seed, sp_index, approach='adr'): 65 | 66 | for s in seed: 67 | save_dir = osp.join(args.save_dir, "sp" + str(sp_index) + "polyak" + 68 | str(args.polyak) + '-' + str(approach), str(s), args.env_name + '/') 69 | print(save_dir) 70 | alice_envs = np.load(save_dir + f'alice_envs.npy', allow_pickle=True) 71 | envs = alice_envs 72 | envs = list(itertools.chain(*envs)) 73 | list_ = np.reshape(envs, (-1, 1)) 74 | return 
list_ 75 | 76 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Self-Supervised Active Domain Randomization 2 | This repository provides official code base for the paper "Generating Automatic Curricula via Self-Supervised Active Domain Randomization". 3 | ![Self-Supervised Active Domain Randomization Architecture](architecture-selfplay.png) 4 | 5 | ## Experiments 6 | We perform our experiments on ErgoReacher, a 4 DoF arm and ErgoPusher, a 3-DoF arm from both in simulation and on the real robot. 7 | ### Important Flags 8 | There are few important flags which differentiate various experiments. `--approach` specifies the approach ['udr' | 'unsupervised-default' | 'unsupervised-adr'], `--sp-percent` flag specifies the self-play percentage. For all out experiments we use either `--sp-percent=1.0` (full self-play/completely unsupervised) or `--sp-percent=0.0` (no self-play,/completely supervised) depending on the `--approach`. `--only-sp` specifies where the bob operates. It is `True` for `--approach=unsupervised-default` and `False` for `--approach=unsupervised-adr`. For all the reacher experiments we used `--n-params=8` and for pusher experiments we used `--n-params=1`. 9 | 10 | ### Uniform Domain Randomization 11 | For `ErgoReacher` baseline experiments: 12 | 13 | `python experiments/ddpg_train.py --sp-percent 0.0 --approach 'udr' --env-name='ErgoReacherRandomizedEnv-Headless-v0' --n-params=8 ` 14 | 15 | For `ErgoPusher` baseline experiments: 16 | 17 | `python experiments/ddpg_train.py --sp-percent 0.0 --approach 'udr' --env-name='ErgoPushRandomizedEnv-Headless-v0' --n-params=1` 18 | 19 | ### Unsupervised Default 20 | For `ErgoReacher` experiments: 21 | 22 | `python experiments/ddpg_train.py --sp-percent 1.0 --approach 'unsupervised-default' --only-sp=True --env-name='ErgoReacherRandomizedEnv-Headless-v0' --n-params=8 ` 23 | 24 | For `ErgoPusher` experiments: 25 | 26 | `python experiments/ddpg_train.py --sp-percent 1.0 --approach 'unsupervised-default' --only-sp=True --env-name='ErgoPushRandomizedEnv-Headless-v0' --n-params=1` 27 | 28 | ### Self-Supervised Active Domain Randomization 29 | For `ErgoReacher` experiments: 30 | 31 | `python experiments/ddpg_train.py --sp-percent 1.0 --approach 'unsupervised-adr' --only-sp=True --env-name='ErgoReacherRandomizedEnv-Headless-v0' --n-params=8` 32 | 33 | For `ErgoPusher` experiments: 34 | 35 | `python experiments/ddpg_train.py --sp-percent 1.0 --approach 'unsupervised-adr' --only-sp=True --env-name='ErgoPushRandomizedEnv-Headless-v0' --n-params=1` 36 | 37 | ## Evaluations 38 | In order to evaluate the trained models on simulator, on the command line execute the following. 39 | 40 | Here `--mode` can be `[default | hard]`. Here is an example to evaluate `ErgoReacher` in default environment. You can likewise evaluate `ErgoPusher` with different modes. 
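For instance, an `ErgoPusher` evaluation in the hard environment would take the form `python experiments/evaluate_ergo_envs.py --env-name "ErgoPushRandomizedEnv-Headless-v0" --mode='hard' --sp-polyak 0.95 --n-params=1` (assuming the same flag conventions as the `ErgoReacher` command shown below).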
41 | 42 | `python experiments/evaluate_ergo_envs.py --env-name "ErgoReacherRandomizedEnv-Headless-v0" --mode='default' --sp-polyak 0.95 --n-params=8` 43 | 44 | ## Reference 45 | 46 | ``` 47 | @article{raparthy2020selfsupadr, 48 | title={Generating Automatic Curricula via Self-Supervised Active Domain Randomization}, 49 | author={Raparthy, Sharath Chandra and Mehta, Bhairav and Golemo, Florian and Paull, Liam}, 50 | url={https://arxiv.org/abs/2002.07911}, 51 | year={2020} 52 | } 53 | ``` 54 | 55 | Built by [@Sharath](https://sharathraparthy.github.io/) and [@Bhairav Mehta](https://bhairavmehta95.github.io/) 56 | -------------------------------------------------------------------------------- /common/discriminator/discriminator_rewarder.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import torch 4 | import torch.nn as nn 5 | from torch.autograd import Variable 6 | 7 | from common.models.discriminator import MLPDiscriminator 8 | 9 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 10 | 11 | 12 | class DiscriminatorRewarder(object): 13 | def __init__(self, state_dim, action_dim, discriminator_batchsz, reward_scale, 14 | load_discriminator=False, discriminator_lr=3e-3, add_pz=True): 15 | self.discriminator = MLPDiscriminator( 16 | state_dim=state_dim, 17 | action_dim=action_dim 18 | ) 19 | 20 | self.discriminator_criterion = nn.BCELoss() 21 | self.discriminator_optimizer = torch.optim.Adam(self.discriminator.parameters(), lr=discriminator_lr) 22 | self.reward_scale = reward_scale 23 | self.batch_size = discriminator_batchsz 24 | self.add_pz = add_pz 25 | 26 | if load_discriminator: 27 | self._load_discriminator(randomized_env_id) 28 | 29 | def calculate_rewards(self, randomized_trajectory): 30 | """Discriminator based reward calculation 31 | We want to use the negative of the adversarial calculation (Normally, -log(D)). We want to *reward* 32 | our simulator for making it easier to discriminate between the reference env + randomized onea 33 | """ 34 | score, _, _ = self.get_score(randomized_trajectory) 35 | reward = np.log(score) 36 | 37 | if self.add_pz: 38 | reward -= np.log(0.5) 39 | 40 | return self.reward_scale * reward 41 | 42 | def get_score(self, trajectory): 43 | """Discriminator based reward calculation 44 | We want to use the negative of the adversarial calculation (Normally, -log(D)). 
We want to *reward* 45 | our simulator for making it easier to discriminate between the reference env + randomized onea 46 | """ 47 | traj_tensor = self._trajectory2tensor(trajectory).float() 48 | 49 | with torch.no_grad(): 50 | score = (self.discriminator(traj_tensor).cpu().detach().numpy()+1e-8) 51 | return score.mean(), np.median(score), np.sum(score) 52 | 53 | def train_discriminator(self, reference_trajectory, randomized_trajectory, iterations): 54 | """Trains discriminator to distinguish between reference and randomized state action tuples 55 | """ 56 | for _ in range(iterations): 57 | randind = np.random.randint(0, len(randomized_trajectory[0]), size=int(self.batch_size)) 58 | refind = np.random.randint(0, len(reference_trajectory[0]), size=int(self.batch_size)) 59 | 60 | randomized_batch = self._trajectory2tensor(randomized_trajectory[randind]) 61 | reference_batch = self._trajectory2tensor(reference_trajectory[refind]) 62 | 63 | g_o = self.discriminator(randomized_batch) 64 | e_o = self.discriminator(reference_batch) 65 | 66 | self.discriminator_optimizer.zero_grad() 67 | 68 | discrim_loss = self.discriminator_criterion(g_o, torch.ones((len(randomized_batch), 1), device=device)) + \ 69 | self.discriminator_criterion(e_o, torch.zeros((len(reference_batch), 1), device=device)) 70 | discrim_loss.backward() 71 | 72 | self.discriminator_optimizer.step() 73 | 74 | def _load_discriminator(self, name, path='saved-models/discriminator/discriminator_{}.pth'): 75 | self.discriminator.load_state_dict(torch.load(path.format(name), map_location=device)) 76 | 77 | def _save_discriminator(self, name, path='saved-models/discriminator/discriminator_{}.pth'): 78 | torch.save(self.discriminator.state_dict(), path.format(name)) 79 | 80 | def _trajectory2tensor(self, trajectory): 81 | return torch.from_numpy(trajectory).float().to(device) 82 | -------------------------------------------------------------------------------- /common/agents/selfplay_models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.optim as optim 4 | from torch.distributions import Bernoulli 5 | import numpy as np 6 | 7 | eps = np.finfo(np.float32).eps.item() 8 | 9 | 10 | class BernoulliPolicyFetch(nn.Module): 11 | def __init__(self, goal_dim, action_dim=1): 12 | super(BernoulliPolicyFetch, self).__init__() 13 | self.base = nn.Sequential( 14 | nn.Linear(goal_dim * 2, 300), 15 | nn.ReLU(), 16 | nn.Linear(300, 300), 17 | nn.ReLU(), 18 | ) 19 | 20 | self.out = nn.Linear(300, 1) 21 | self.value = nn.Linear(300, 1) 22 | 23 | self.saved_log_probs = [] 24 | self.rewards = [] 25 | self.values = [] 26 | 27 | def forward(self, x): 28 | x = self.base(x) 29 | termination_prob = self.out(x) 30 | value = self.value(x) 31 | return torch.sigmoid(termination_prob), value 32 | 33 | 34 | class AlicePolicyFetch: 35 | def __init__(self, args, goal_dim, action_dim=1): 36 | self.policy = BernoulliPolicyFetch(goal_dim, action_dim=1) 37 | self.optimizer = optim.Adam(self.policy.parameters(), lr=3e-3) 38 | # self.comm = setup_mpi(self.policy) 39 | self.args = args 40 | 41 | def select_action(self, state, deterministic=False, save_log_probs=True): 42 | state = torch.from_numpy(state).float().unsqueeze(0) 43 | probs, value = self.policy(state) 44 | m = Bernoulli(probs) 45 | action = m.sample() 46 | if save_log_probs: 47 | self.policy.saved_log_probs.append(m.log_prob(action)) 48 | self.policy.values.append(value) 49 | 50 | return action.cpu().data.numpy()[0] 51 | 52 
| def log(self, reward): 53 | self.policy.rewards.append(reward) 54 | 55 | def load_from_file(self, file): 56 | self.policy.load_state_dict(torch.load(file)) 57 | 58 | def load_from_policy(self, original): 59 | with torch.no_grad(): 60 | for param, target_param in zip(self.policy.parameters(), original.policy.parameters()): 61 | param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data) 62 | param.requires_grad = False 63 | 64 | def perturb(self, alpha, weight_noise, bias_noise): 65 | with torch.no_grad(): 66 | weight_noise = torch.from_numpy(alpha * weight_noise).float() 67 | bias_noise = torch.from_numpy(alpha * bias_noise).float() 68 | for param, noise in zip(self.policy.parameters(), [weight_noise, bias_noise]): 69 | param.add_(noise) 70 | param.requires_grad = False 71 | 72 | def finish_episode(self, gamma): 73 | R = 0 74 | returns = [] 75 | 76 | # Normalize 77 | self.policy.rewards = np.array(self.policy.rewards) 78 | self.policy.rewards = (self.policy.rewards - self.policy.rewards.mean()) / (self.policy.rewards.std() + eps) 79 | 80 | for r in self.policy.rewards[::-1]: 81 | R = r + gamma * R 82 | returns.insert(0, torch.FloatTensor([R]).unsqueeze(1)) 83 | 84 | log_probs = torch.cat(self.policy.saved_log_probs) 85 | returns = torch.cat(returns).detach() 86 | values = torch.cat(self.policy.values) 87 | 88 | advantage = returns - values 89 | 90 | actor_loss = -(log_probs * advantage.detach()).mean() 91 | critic_loss = advantage.pow(2).mean() 92 | loss = actor_loss + 0.5 * critic_loss 93 | 94 | self.optimizer.zero_grad() 95 | loss.backward() 96 | # sync_grads(self.comm, self.policy) 97 | self.optimizer.step() 98 | 99 | self.policy.rewards = [] 100 | self.policy.saved_log_probs = [] 101 | self.policy.values = [] -------------------------------------------------------------------------------- /common/randomizer/residual_fetch_push_env.py: -------------------------------------------------------------------------------- 1 | from .miscalibrated_push_controller import get_push_control 2 | 3 | from gym.envs.robotics import FetchPushEnv 4 | 5 | import gym 6 | from gym import error, spaces, utils 7 | from gym.utils import seeding 8 | 9 | import numpy as np 10 | import time 11 | 12 | 13 | class ResidualSlipperyPushEnv(gym.Env): 14 | 15 | def __init__(self, *args, **kwargs): 16 | self.fetch_env = gym.make("FetchPush-v1") 17 | 18 | for i in range(len(self.fetch_env.env.sim.model.geom_friction)): 19 | self.fetch_env.env.sim.model.geom_friction[i] = [18e-2, 5.e-3, 1e-4] 20 | 21 | self.metadata = self.fetch_env.metadata 22 | self.hardcoded_controller = None 23 | self.action_space = self.fetch_env.action_space 24 | self.observation_space = self.fetch_env.observation_space 25 | 26 | 27 | def step(self, residual_action): 28 | residual_action = 2. 
* residual_action 29 | 30 | action = np.add(residual_action, get_push_control(self._last_observation)) 31 | action = np.clip(action, -1, 1) 32 | observation, reward, done, debug_info = self.fetch_env.step(action) 33 | self._last_observation = observation 34 | 35 | return observation, reward, done, debug_info 36 | 37 | def reset(self): 38 | observation = self.fetch_env.reset() 39 | self._last_observation = observation 40 | return observation 41 | 42 | def seed(self, seed=0): 43 | return self.fetch_env.seed(seed=seed) 44 | 45 | def render(self, mode="human", *args, **kwargs): 46 | # See https://github.com/openai/gym/issues/1081 47 | self.fetch_env.env._render_callback() 48 | if mode == 'rgb_array': 49 | self.fetch_env.env._get_viewer().render() 50 | width, height = 3350, 1800 51 | data = self.fetch_env.env._get_viewer().read_pixels(width, height, depth=False) 52 | # original image is upside-down, so flip it 53 | return data[::-1, :, :] 54 | elif mode == 'human': 55 | self.fetch_env.env._get_viewer().render() 56 | 57 | return self.fetch_env.render(*args, **kwargs) 58 | 59 | def compute_reward(self, *args, **kwargs): 60 | return self.fetch_env.compute_reward(*args, **kwargs) 61 | 62 | class SlipperyPushEnv(gym.Env): 63 | 64 | def __init__(self, *args, **kwargs): 65 | self.fetch_env = gym.make("FetchPush-v1") 66 | 67 | for i in range(len(self.fetch_env.env.sim.model.geom_friction)): 68 | self.fetch_env.env.sim.model.geom_friction[i] = [18e-2, 5.e-3, 1e-4] 69 | 70 | self.metadata = self.fetch_env.metadata 71 | 72 | self.action_space = self.fetch_env.action_space 73 | self.observation_space = self.fetch_env.observation_space 74 | 75 | def step(self, action): 76 | return self.fetch_env.step(action) 77 | 78 | def reset(self): 79 | return self.fetch_env.reset() 80 | 81 | def seed(self, seed=0): 82 | return self.fetch_env.seed(seed=seed) 83 | 84 | def render(self, mode="human", *args, **kwargs): 85 | # See https://github.com/openai/gym/issues/1081 86 | self.fetch_env.env._render_callback() 87 | if mode == 'rgb_array': 88 | self.fetch_env.env._get_viewer().render() 89 | width, height = 3350, 1800 90 | data = self.fetch_env.env._get_viewer().read_pixels(width, height, depth=False) 91 | # original image is upside-down, so flip it 92 | return data[::-1, :, :] 93 | elif mode == 'human': 94 | self.fetch_env.env._get_viewer().render() 95 | 96 | return self.fetch_env.render(*args, **kwargs) 97 | 98 | def compute_reward(self, *args, **kwargs): 99 | return self.fetch_env.compute_reward(*args, **kwargs) 100 | 101 | -------------------------------------------------------------------------------- /common/randomizer/controllers/pick_and_place_controller.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | DEBUG = False 4 | 5 | def get_move_action(observation, target_position, atol=1e-3, gain=10., close_gripper=False): 6 | """ 7 | Move an end effector to a position and orientation. 8 | """ 9 | # Get the currents 10 | current_position = observation['observation'][:3] 11 | 12 | action = gain * np.subtract(target_position, current_position) 13 | if close_gripper: 14 | gripper_action = -1. 15 | else: 16 | gripper_action = 0. 
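    # Proportional control toward the target position, with the gripper command appended as the
    # fourth action dimension.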
17 | action = np.hstack((action, gripper_action)) 18 | 19 | return action 20 | 21 | def block_is_grasped(obs, relative_grasp_position, atol=1e-3): 22 | return block_inside_grippers(obs, relative_grasp_position, atol=atol) and grippers_are_closed(obs, atol=atol) 23 | 24 | def block_inside_grippers(obs, relative_grasp_position, atol=1e-3): 25 | gripper_position = obs['observation'][:3] 26 | block_position = obs['observation'][3:6] 27 | 28 | relative_position = np.subtract(gripper_position, block_position) 29 | 30 | return np.sum(np.subtract(relative_position, relative_grasp_position)**2) < atol 31 | 32 | def grippers_are_closed(obs, atol=1e-3): 33 | gripper_state = obs['observation'][9:11] 34 | return abs(gripper_state[0] - 0.024) < atol 35 | 36 | def grippers_are_open(obs, atol=1e-3): 37 | gripper_state = obs['observation'][9:11] 38 | return abs(gripper_state[0] - 0.05) < atol 39 | 40 | def get_pick_and_place_control(obs, relative_grasp_position=(0., 0., -0.02), workspace_height=0.1, atol=1e-3, gain=10): 41 | """ 42 | Returns 43 | ------- 44 | action : [float] * 4 45 | """ 46 | gripper_position = obs['observation'][:3] 47 | block_position = obs['observation'][3:6] 48 | place_position = obs['desired_goal'] 49 | 50 | # If the gripper is already grasping the block 51 | if block_is_grasped(obs, relative_grasp_position, atol=atol): 52 | 53 | # If the block is already at the place position, do nothing except keep the gripper closed 54 | if np.sum(np.subtract(block_position, place_position)**2) < atol: 55 | if DEBUG: 56 | print("The block is already at the place position; do nothing") 57 | return np.array([0., 0., 0., -1.]) 58 | 59 | # Move to the place position while keeping the gripper closed 60 | target_position = np.add(place_position, relative_grasp_position) 61 | target_position[2] += workspace_height/2. 
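        # Offset the target upward by half the workspace height so the grasped block is carried
        # clear of the table on the way to the place position.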
62 | if DEBUG: 63 | print("Move to above the place position") 64 | return get_move_action(obs, target_position, atol=atol, close_gripper=True, gain=gain) 65 | 66 | # If the block is ready to be grasped 67 | if block_inside_grippers(obs, relative_grasp_position, atol=atol): 68 | 69 | # Close the grippers 70 | if DEBUG: 71 | print("Close the grippers") 72 | return np.array([0., 0., 0., -1.]) 73 | 74 | # If the gripper is above the block 75 | if (gripper_position[0] - block_position[0])**2 + (gripper_position[1] - block_position[1])**2 < atol: 76 | 77 | # If the grippers are closed, open them 78 | if not grippers_are_open(obs, atol=atol): 79 | if DEBUG: 80 | print("Open the grippers") 81 | return np.array([0., 0., 0., 1.]) 82 | 83 | # Move down to grasp 84 | target_position = np.add(block_position, relative_grasp_position) 85 | if DEBUG: 86 | print("Move down to grasp") 87 | return get_move_action(obs, target_position, atol=atol, gain=gain) 88 | 89 | 90 | # Else move the gripper to above the block 91 | target_position = np.add(block_position, relative_grasp_position) 92 | target_position[2] += workspace_height 93 | if DEBUG: 94 | print("Move to above the block") 95 | return get_move_action(obs, target_position, atol=atol, gain=gain) 96 | 97 | 98 | -------------------------------------------------------------------------------- /common/randomizer/residual_envs/residual_fetch_push_env.py: -------------------------------------------------------------------------------- 1 | from randomizer.controllers.miscalibrated_push_controller import get_push_control 2 | 3 | from gym.envs.robotics import FetchPushEnv 4 | 5 | import gym 6 | from gym import error, spaces, utils 7 | from gym.utils import seeding 8 | 9 | import numpy as np 10 | import time 11 | 12 | 13 | class ResidualSlipperyPushEnv(gym.Env): 14 | 15 | def __init__(self, *args, **kwargs): 16 | self.fetch_env = gym.make("FetchPush-v1") 17 | 18 | for i in range(len(self.fetch_env.env.sim.model.geom_friction)): 19 | self.fetch_env.env.sim.model.geom_friction[i] = [18e-2, 5.e-3, 1e-4] 20 | 21 | self.metadata = self.fetch_env.metadata 22 | self.hardcoded_controller = None 23 | self.action_space = self.fetch_env.action_space 24 | self.observation_space = self.fetch_env.observation_space 25 | 26 | 27 | def step(self, residual_action): 28 | residual_action = 2. 
* residual_action 29 | 30 | action = np.add(residual_action, get_push_control(self._last_observation)) 31 | action = np.clip(action, -1, 1) 32 | observation, reward, done, debug_info = self.fetch_env.step(action) 33 | self._last_observation = observation 34 | 35 | return observation, reward, done, debug_info 36 | 37 | def reset(self): 38 | observation = self.fetch_env.reset() 39 | self._last_observation = observation 40 | return observation 41 | 42 | def seed(self, seed=0): 43 | return self.fetch_env.seed(seed=seed) 44 | 45 | def render(self, mode="human", *args, **kwargs): 46 | # See https://github.com/openai/gym/issues/1081 47 | self.fetch_env.env._render_callback() 48 | if mode == 'rgb_array': 49 | self.fetch_env.env._get_viewer(mode=mode).render() 50 | width, height = 3350, 1800 51 | data = self.fetch_env.env._get_viewer().read_pixels(width, height, depth=False) 52 | # original image is upside-down, so flip it 53 | return data[::-1, :, :] 54 | elif mode == 'human': 55 | self.fetch_env.env._get_viewer().render() 56 | 57 | return self.fetch_env.render(*args, **kwargs) 58 | 59 | def compute_reward(self, *args, **kwargs): 60 | return self.fetch_env.compute_reward(*args, **kwargs) 61 | 62 | class SlipperyPushEnv(gym.Env): 63 | 64 | def __init__(self, *args, **kwargs): 65 | self.fetch_env = gym.make("FetchPush-v1") 66 | 67 | for i in range(len(self.fetch_env.env.sim.model.geom_friction)): 68 | self.fetch_env.env.sim.model.geom_friction[i] = [18e-2, 5.e-3, 1e-4] 69 | 70 | self.metadata = self.fetch_env.metadata 71 | 72 | self.action_space = self.fetch_env.action_space 73 | self.observation_space = self.fetch_env.observation_space 74 | 75 | def step(self, action): 76 | return self.fetch_env.step(action) 77 | 78 | def reset(self): 79 | return self.fetch_env.reset() 80 | 81 | def seed(self, seed=0): 82 | return self.fetch_env.seed(seed=seed) 83 | 84 | def render(self, mode="human", *args, **kwargs): 85 | # See https://github.com/openai/gym/issues/1081 86 | self.fetch_env.env._render_callback() 87 | if mode == 'rgb_array': 88 | self.fetch_env.env._get_viewer().render() 89 | width, height = 3350, 1800 90 | data = self.fetch_env.env._get_viewer().read_pixels(width, height, depth=False) 91 | # original image is upside-down, so flip it 92 | return data[::-1, :, :] 93 | elif mode == 'human': 94 | self.fetch_env.env._get_viewer().render() 95 | 96 | return self.fetch_env.render(*args, **kwargs) 97 | 98 | def compute_reward(self, *args, **kwargs): 99 | return self.fetch_env.compute_reward(*args, **kwargs) 100 | 101 | -------------------------------------------------------------------------------- /common/adr/adr.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from common.discriminator.discriminator_rewarder import DiscriminatorRewarder 4 | from common.svpg.svpg import SVPG 5 | 6 | class ADR: 7 | def __init__( 8 | self, 9 | nparticles, 10 | nparams, 11 | state_dim, 12 | action_dim, 13 | temperature, 14 | svpg_rollout_length, 15 | svpg_horizon, 16 | max_step_length, 17 | reward_scale, 18 | initial_svpg_steps, 19 | seed, 20 | discriminator_batchsz 21 | ): 22 | assert nparticles > 2 23 | 24 | self.nparticles = nparticles 25 | self.nparams = nparams 26 | 27 | self.svpg_rollout_length = svpg_rollout_length 28 | self.svpg_horizon = svpg_horizon 29 | self.initial_svpg_steps = initial_svpg_steps 30 | 31 | self.seed = seed 32 | self.svpg_timesteps = 0 33 | 34 | self.discriminator_rewarder = DiscriminatorRewarder( 35 | state_dim=state_dim, 36 | 
action_dim=action_dim, 37 | discriminator_batchsz=discriminator_batchsz, 38 | reward_scale=reward_scale, 39 | ) 40 | 41 | self.svpg = SVPG( 42 | nparticles=nparticles, 43 | nparams=self.nparams, 44 | max_step_length=max_step_length, 45 | svpg_rollout_length=svpg_rollout_length, 46 | svpg_horizon=svpg_horizon, 47 | temperature=temperature, 48 | ) 49 | 50 | self.parameter_settings = np.ones( 51 | (self.nparticles, 52 | self.svpg_horizon, 53 | self.svpg.svpg_rollout_length, 54 | self.svpg.nparams) 55 | ) * -1 56 | 57 | def score_trajectories(self, randomized_trajectories): 58 | rewards = np.zeros((self.nparticles, self.svpg.svpg_rollout_length)) 59 | 60 | for i in range(self.nparticles): 61 | for t in range(self.svpg.svpg_rollout_length): 62 | # flatten and combine all randomized and reference trajectories for discriminator 63 | randomized_discrim_score_mean, _, _ = \ 64 | self.discriminator_rewarder.get_score(randomized_trajectories[i][t]) 65 | 66 | rewards[i][t] = randomized_discrim_score_mean 67 | 68 | return rewards 69 | 70 | def step_particles(self): 71 | if self.svpg_timesteps >= self.initial_svpg_steps: 72 | # Get sim instances from SVPG policy 73 | simulation_instances = self.svpg.step() 74 | 75 | index = self.svpg_timesteps % self.svpg_horizon 76 | self.parameter_settings[:, index, :, :] = simulation_instances 77 | 78 | else: 79 | # Creates completely randomized environment 80 | simulation_instances = np.ones((self.nparticles, 81 | self.svpg.svpg_rollout_length, 82 | self.svpg.nparams)) * -1 83 | 84 | assert (self.nparticles, self.svpg.svpg_rollout_length, self.svpg.nparams) \ 85 | == simulation_instances.shape 86 | 87 | # Reshape to work with vectorized environments 88 | simulation_instances = np.transpose(simulation_instances, (1, 0, 2)) 89 | 90 | self.svpg_timesteps += 1 91 | return simulation_instances 92 | 93 | def train(self, reference_trajectories, randomized_trajectories): 94 | rewards = self.score_trajectories(randomized_trajectories) 95 | self._train_particles(rewards) 96 | self._train_discriminator(reference_trajectories, randomized_trajectories) 97 | 98 | def _train_discriminator(self, reference_trajectories, randomized_trajectories): 99 | flattened_randomized = [randomized_trajectories[i][t] for i in range(self.nagents)] 100 | flattened_randomized = np.concatenate(flattened_randomized) 101 | 102 | flattened_reference = [reference_trajectories[i][t] for i in range(self.nagents)] 103 | flattened_reference = np.concatenate(flattened_reference) 104 | 105 | self.discriminator_rewarder.train_discriminator( 106 | flattened_reference, 107 | flattened_randomized, 108 | iterations=len(flattened_randomized) 109 | ) 110 | 111 | def _train_particles(self, rewards): 112 | self.svpg.train(rewards) 113 | 114 | -------------------------------------------------------------------------------- /common/randomizer/assets/shared.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /common/svpg/svpg_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | 5 | 6 | def 
parameters_to_vector(parameters, grad=False, both=False): 7 | """Convert parameters or/and their gradients to one vector 8 | Arguments: 9 | parameters (Iterable[Variable]): an iterator of Variables that are the 10 | parameters of a model. 11 | grad (bool): Vectorizes gradients if true, otherwise vectorizes params 12 | both (bool): If True, vectorizes both parameters and their gradients, 13 | `grad` has no effect in this case. Otherwise vectorizes parameters 14 | or gradients according to `grad`. 15 | Returns: 16 | The parameters or/and their gradients (each) represented by a single 17 | vector (th.Tensor, not Variable) 18 | """ 19 | # Flag for the device where the parameter is located 20 | param_device = None 21 | 22 | if not both: 23 | vec = [] 24 | if not grad: 25 | for param in parameters: 26 | # Ensure the parameters are located in the same device 27 | param_device = _check_param_device(param, param_device) 28 | vec.append(param.data.view(-1)) 29 | else: 30 | for param in parameters: 31 | param_device = _check_param_device(param, param_device) 32 | vec.append(param.grad.data.view(-1)) 33 | return torch.cat(vec) 34 | else: 35 | vec_params, vec_grads = [], [] 36 | for param in parameters: 37 | param_device = _check_param_device(param, param_device) 38 | vec_params.append(param.data.view(-1)) 39 | vec_grads.append(param.grad.data.view(-1)) 40 | return torch.cat(vec_params), torch.cat(vec_grads) 41 | 42 | def vector_to_parameters(vec, parameters, grad=True): 43 | """Convert one vector to the parameters or gradients of the parameters 44 | Arguments: 45 | vec (torch.Tensor): a single vector represents the parameters of a model. 46 | parameters (Iterable[Variable]): an iterator of Variables that are the 47 | parameters of a model. 48 | grad (bool): True for assigning de-vectorized `vec` to gradients 49 | """ 50 | # Ensure vec of type Variable 51 | if not isinstance(vec, torch.cuda.FloatTensor): 52 | raise TypeError('expected torch.Tensor, but got: {}' 53 | .format(torch.typename(vec))) 54 | # Flag for the device where the parameter is located 55 | param_device = None 56 | 57 | # Pointer for slicing the vector for each parameter 58 | pointer = 0 59 | if grad: 60 | for param in parameters: 61 | # Ensure the parameters are located in the same device 62 | param_device = _check_param_device(param, param_device) 63 | # The length of the parameter 64 | num_param = torch.prod(torch.LongTensor(list(param.size()))) 65 | param.grad.data = vec[pointer:pointer + num_param].view( 66 | param.size()) 67 | # Increment the pointer 68 | pointer += num_param 69 | else: 70 | for param in parameters: 71 | # Ensure the parameters are located in the same device 72 | param_device = _check_param_device(param, param_device) 73 | # The length of the parameter 74 | num_param = torch.prod(torch.LongTensor(list(param.size()))) 75 | param.data = vec[pointer:pointer + num_param].view( 76 | param.size()) 77 | # Increment the pointer 78 | pointer += num_param 79 | 80 | 81 | def _check_param_device(param, old_param_device): 82 | """This helper function is to check if the parameters are located 83 | in the same device. Currently, the conversion between model parameters 84 | and single vector form is not supported for multiple allocations, 85 | e.g. parameters in different GPUs, or mixture of CPU/GPU. 86 | Arguments: 87 | param ([Variable]): a Variable of a parameter of a model 88 | old_param_device (int): the device where the first parameter of a 89 | model is allocated. 
90 | Returns: 91 | old_param_device (int): report device for the first time 92 | """ 93 | 94 | # Meet the first parameter 95 | if old_param_device is None: 96 | old_param_device = param.get_device() if param.is_cuda else -1 97 | else: 98 | warn = False 99 | if param.is_cuda: # Check if in same GPU 100 | warn = (param.get_device() != old_param_device) 101 | else: # Check if in CPU 102 | warn = (old_param_device != -1) 103 | if warn: 104 | raise TypeError('Found two parameters on different devices, ' 105 | 'this is currently not supported.') 106 | return old_param_device -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | """ 4 | Here are the param for the training 5 | 6 | """ 7 | def get_args(): 8 | parser = argparse.ArgumentParser() 9 | # the environment setting 10 | parser.add_argument('--n-epochs', type=int, default=50, help='the number of epochs to train the agent') 11 | parser.add_argument('--n-cycles', type=int, default=50, help='the times to collect samples per epoch') 12 | parser.add_argument('--n-batches', type=int, default=40, help='the times to update the network') 13 | parser.add_argument('--save-interval', type=int, default=5, help='the interval that save the trajectory') 14 | parser.add_argument('--seed', type=int, default=123, help='random seed') 15 | parser.add_argument('--num-workers', type=int, default=1, help='the number of cpus to collect samples') 16 | parser.add_argument('--replay-strategy', type=str, default='future', help='the HER strategy') 17 | parser.add_argument('--clip-return', type=float, default=50, help='if clip the returns') 18 | parser.add_argument('--save-dir', type=str, default='saved_models/', help='the path to save the models') 19 | parser.add_argument('--noise-eps', type=float, default=0.2, help='noise eps') 20 | parser.add_argument('--random-eps', type=float, default=0.3, help='random eps') 21 | parser.add_argument('--buffer-size', type=int, default=int(1e6), help='the size of the buffer') 22 | parser.add_argument('--replay-k', type=int, default=4, help='ratio to be replace') 23 | parser.add_argument('--clip-obs', type=float, default=200, help='the clip ratio') 24 | parser.add_argument('--batch-size', type=int, default=256, help='the sample batch size') 25 | parser.add_argument('--gamma', type=float, default=0.98, help='the discount factor') 26 | parser.add_argument('--action-l2', type=float, default=1, help='l2 reg') 27 | parser.add_argument('--lr-actor', type=float, default=0.001, help='the learning rate of the actor') 28 | parser.add_argument('--lr-critic', type=float, default=0.001, help='the learning rate of the critic') 29 | parser.add_argument('--polyak', type=float, default=0.95, help='the average coefficient') 30 | parser.add_argument('--n-test-rollouts', type=int, default=10, help='the number of tests') 31 | parser.add_argument('--clip-range', type=float, default=5, help='the clip range') 32 | parser.add_argument('--demo-length', type=int, default=20, help='the demo length') 33 | parser.add_argument('--cuda', action='store_true', help='if use gpu do the acceleration') 34 | parser.add_argument('--num-rollouts-per-mpi', type=int, default=10, help='the rollouts per mpi') 35 | parser.add_argument('--sp-polyak', type=float, default=0.1, help='Polyak Averaging Coefficient') 36 | parser.add_argument('--sp-gamma', type=float, default=0.1, help='Self play gamma') 37 | 
parser.add_argument('--sp-percent', type=float, default=0.1, help='Self Play Percentage') 38 | parser.add_argument('--friction', type=float, default=0.18, help='friction parameter to set') 39 | parser.add_argument('--approach', type=str, default='baseline', help='Different approaches for experiments') 40 | parser.add_argument('--svpg-rollout-length', type=int, default=5) 41 | parser.add_argument('--nmpi', type=int, default=8) # TODO! 42 | parser.add_argument('--mode', type=str, default='default') 43 | parser.add_argument('--window-size', type=int, default=20) 44 | parser.add_argument("--policy-name", default="OurDDPG") # Policy name 45 | parser.add_argument("--env-name", default="ErgoPushRandomizedEnv-Headless-v0") # OpenAI gym environment name 46 | parser.add_argument("--start-timesteps", default=1e4, 47 | type=int) # How many time steps purely random policy is run for 48 | parser.add_argument("--eval-freq", default=5e3, type=float) # How often (time steps) we evaluate 49 | parser.add_argument("--max-timesteps", default=1e6, type=float) # Max time steps to run environment for 50 | parser.add_argument("--save-models", default=True, action="store_true") # Whether or not models are saved 51 | parser.add_argument("--expl-noise", default=0.1, type=float) # Std of Gaussian exploration noise 52 | parser.add_argument("--discount", default=0.99, type=float) # Discount factor 53 | parser.add_argument("--tau", default=0.005, type=float) # Target network update rate 54 | parser.add_argument("--policy-noise", default=0.2, type=float) # Noise added to target policy during critic update 55 | parser.add_argument("--noise-clip", default=0.5, type=float) # Range to clip target policy noise 56 | parser.add_argument("--policy-freq", default=2, type=int) # Frequency of delayed policy updates 57 | parser.add_argument("--nparticles", default=1, type=int) 58 | parser.add_argument('--n-params', type=int, default=1) 59 | parser.add_argument('--only-sp', type=bool, default=False) 60 | 61 | 62 | # Add more arguments if needed. 63 | 64 | args = parser.parse_args() 65 | 66 | return args 67 | -------------------------------------------------------------------------------- /common/randomizer/controllers/hook_controller.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | DEBUG = False 4 | 5 | def get_move_action(observation, target_position, atol=1e-3, gain=10., close_gripper=False): 6 | """ 7 | Move an end effector to a position and orientation. 8 | """ 9 | # Get the currents 10 | current_position = observation['observation'][:3] 11 | 12 | action = gain * np.subtract(target_position, current_position) 13 | if close_gripper: 14 | gripper_action = -1. 15 | else: 16 | gripper_action = 0. 
17 | action = np.hstack((action, gripper_action)) 18 | 19 | return action 20 | 21 | def block_is_grasped(obs, gripper_position, block_position, relative_grasp_position, atol=1e-3): 22 | block_inside = block_inside_grippers(gripper_position, block_position, relative_grasp_position, atol=atol) 23 | grippers_closed = grippers_are_closed(obs, atol=atol) 24 | 25 | return block_inside and grippers_closed 26 | 27 | def block_inside_grippers(gripper_position, block_position, relative_grasp_position, atol=1e-3): 28 | relative_position = np.subtract(gripper_position, block_position) 29 | 30 | return np.sum(np.subtract(relative_position, relative_grasp_position)**2) < atol 31 | 32 | def grippers_are_closed(obs, atol=1e-3): 33 | gripper_state = obs['observation'][9:11] 34 | return abs(gripper_state[0]) - 0.024 <= atol 35 | 36 | def grippers_are_open(obs, atol=1e-3): 37 | gripper_state = obs['observation'][9:11] 38 | 39 | return abs(gripper_state[0] - 0.05) <= atol 40 | 41 | 42 | def pick_at_position(obs, block_position, place_position, relative_grasp_position=(0., 0., -0.02), workspace_height=0.1, atol=1e-3): 43 | """ 44 | Returns 45 | ------- 46 | action : [float] * 4 47 | """ 48 | gripper_position = obs['observation'][:3] 49 | 50 | # If the gripper is already grasping the block 51 | if block_is_grasped(obs, gripper_position, block_position, relative_grasp_position, atol=atol): 52 | 53 | # If the block is already at the place position, do nothing except keep the gripper closed 54 | if np.sum(np.subtract(block_position, place_position)**2) < atol: 55 | if DEBUG: 56 | print("The block is already at the place position; do nothing") 57 | return np.array([0., 0., 0., -1.]) 58 | 59 | # Move to the place position while keeping the gripper closed 60 | target_position = np.add(place_position, relative_grasp_position) 61 | target_position[2] += workspace_height/2. 
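        # Keep the carry target slightly above the place position so the grasped object clears
        # the table while it is moved into place.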
62 | if DEBUG: 63 | print("Move to above the place position") 64 | return get_move_action(obs, target_position, atol=atol, close_gripper=True) 65 | 66 | # If the block is ready to be grasped 67 | if block_inside_grippers(gripper_position, block_position, relative_grasp_position, atol=atol): 68 | 69 | # Close the grippers 70 | if DEBUG: 71 | print("Close the grippers") 72 | return np.array([0., 0., 0., -1.]) 73 | 74 | # If the gripper is above the block 75 | target_position = np.add(block_position, relative_grasp_position) 76 | if (gripper_position[0] - target_position[0])**2 + (gripper_position[1] - target_position[1])**2 < atol: 77 | 78 | # If the grippers are closed, open them 79 | if not grippers_are_open(obs, atol=atol): 80 | if DEBUG: 81 | print("Open the grippers") 82 | return np.array([0., 0., 0., 1.]) 83 | 84 | # Move down to grasp 85 | if DEBUG: 86 | print("Move down to grasp") 87 | return get_move_action(obs, target_position, atol=atol) 88 | 89 | 90 | # Else move the gripper to above the block 91 | target_position[2] += workspace_height 92 | if DEBUG: 93 | print("Move to above the block") 94 | return get_move_action(obs, target_position, atol=atol) 95 | 96 | 97 | def get_hook_control(obs, atol=1e-2): 98 | """ 99 | Returns 100 | ------- 101 | action : [float] * 4 102 | """ 103 | gripper_position = obs['observation'][:3] 104 | block_position = obs['observation'][3:6] 105 | hook_position = obs['observation'][25:28] 106 | place_position = obs['desired_goal'] 107 | 108 | # Done 109 | if abs(block_position[0] - place_position[0]) + abs(block_position[1] - place_position[1]) <= 1e-2: 110 | if DEBUG: 111 | print("DONE") 112 | return np.array([0., 0., 0., -1.]) 113 | 114 | # Grasp and lift the hook 115 | if not block_is_grasped(obs, gripper_position, hook_position, relative_grasp_position=(0., 0., -0.05), atol=atol): 116 | if DEBUG: 117 | print("Grasping and lifting the hook") 118 | hook_target = hook_position.copy() 119 | hook_target[2] = 0.5 120 | return pick_at_position(obs, hook_position, hook_target, relative_grasp_position=(0., 0., -0.05)) 121 | 122 | # Align the hook to sweep 123 | hook_target = np.array([block_position[0] - 0.5, block_position[1] - 0.05, 0.45]) 124 | 125 | if hook_position[0] >= hook_target[0] + 0.1 or hook_position[1] + 0.1 <= hook_target[1]: 126 | if DEBUG: 127 | print("Aligning to sweep", hook_position[0] + atol, hook_target[0], hook_position[1] + atol, hook_target[1]) 128 | return get_move_action(obs, hook_target, close_gripper=True) 129 | 130 | if DEBUG: 131 | print("Sweeping back") 132 | 133 | direction = np.subtract(place_position, block_position) 134 | direction = direction[:2] / np.linalg.norm(direction[:2]) 135 | 136 | return np.array([0.4 * direction[0], 0.4 * direction[1], 0., -1.]) 137 | 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /realrobot_evals/realrobot_test.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | import torch 3 | import time 4 | import numpy as np 5 | import gym 6 | import argparse 7 | import os 8 | import os.path as osp 9 | from tqdm import tqdm, trange 10 | 11 | from common.agents.ddpg.ddpg import DDPG 12 | import gym_ergojr 13 | 14 | parser = argparse.ArgumentParser(description='Real Robot Experiment Driver') 15 | 16 | parser.add_argument('--nepisodes', type=int, default=25, help='Number of trials per *seed*') 17 | parser.add_argument('--experiment-prefix', type=str, default='real', help='Prefix to append to 
logs') 18 | parser.add_argument('--log-dir', type=str, default='/data/fgolemo/UADR-results/real-robot', help='Log Directory Prefix') 19 | parser.add_argument('--model-dir', type=str, default='saved_models/', help='Model Directory Prefix') # TODO 20 | parser.add_argument('--cont', type=str, default='190329-180631', help='To continue existing file, enter timestamp here') 21 | parser.add_argument('--cont', type=str, default='', help='To continue existing file, enter timestamp here') 22 | 23 | args = parser.parse_args() 24 | 25 | if len(args.cont) == 0: 26 | TIMESTAMP = time.strftime("%y%m%d-%H%M%S") 27 | file_flag = "w" 28 | 29 | else: 30 | TIMESTAMP = args.cont 31 | file_flag = "r+" 32 | 33 | file_path = "{}/{}-{}.hdf5".format(args.log_dir, args.experiment_prefix, TIMESTAMP) 34 | 35 | MAX_EPISODE_STEPS = 100 36 | EPISODES = args.nepisodes 37 | SEED = ["31", "32", "33", "34", "35"] 38 | # Policies to look for 39 | policies = ['udr', 'adr-1.0', 'baseline-sp-1.0'] 40 | 41 | env_name = "ErgoReacher-MultiGoal-Live-v1" 42 | npa = np.array 43 | 44 | img_buffer = [] 45 | 46 | if not osp.exists(args.log_dir): 47 | os.makedirs(args.log_dir) 48 | 49 | 50 | env = gym.make(env_name) 51 | state_dim = env.observation_space.shape[0] 52 | action_dim = env.action_space.shape[0] 53 | 54 | n_episodes = 3 # num of episodes to run 55 | max_timesteps = 100 # max timesteps in one episode 56 | render = True # render the environment 57 | save_gif = False # png images are saved in gif folder 58 | 59 | state_dim = env.observation_space.shape[0] 60 | action_dim = env.action_space.shape[0] 61 | max_action = float(env.action_space.high[0]) 62 | goal_dim = 2 63 | 64 | policy = DDPG(args, state_dim, action_dim, max_action, goal_dim) 65 | 66 | obs = env.reset() 67 | 68 | 69 | with h5py.File(file_path, file_flag) as f: 70 | for policy_type in tqdm(policies, desc="approaches"): 71 | for torque_idx, torque in enumerate(tqdm(SEED, desc="torques...")): 72 | 73 | model_path = osp.join(args.model_dir, policy_type, f"Variant-{torque}") 74 | print(model_path) 75 | no_models = len(os.listdir(model_path)) 76 | 77 | if policy_type not in f: # if dataset doesn't have these tables 78 | log_group = f.create_group(policy_type) 79 | rewards = log_group.create_dataset("rewards", (no_models, len(SEED), EPISODES, MAX_EPISODE_STEPS), 80 | dtype=np.float32) 81 | distances = log_group.create_dataset("distances", (no_models, len(SEED), EPISODES, MAX_EPISODE_STEPS), 82 | dtype=np.float32) 83 | trajectories = log_group.create_dataset("trajectories", 84 | (no_models, len(SEED), EPISODES, MAX_EPISODE_STEPS, 24), 85 | dtype=np.float32) 86 | imgs = log_group.create_dataset("images", 87 | (no_models, len(SEED), EPISODES, MAX_EPISODE_STEPS, 480, 640, 3), 88 | dtype=np.uint8, compression="lzf") 89 | else: # if tables are in dataset, grab their pointers 90 | rewards = f.get(f"/{policy_type}/rewards") 91 | distances = f.get(f"/{policy_type}/distances") 92 | trajectories = f.get(f"/{policy_type}/trajectories") 93 | imgs = f.get(f"/{policy_type}/images") 94 | 95 | tqdm.write(f'Starting analysis of {policy_type} - variant {torque}') 96 | 97 | for model_idx, actorpth in enumerate(tqdm(os.listdir(model_path), desc="models....")): 98 | # Load model weights 99 | policy.load(os.path.join(model_path, actorpth)) 100 | 101 | for ep_num in trange(EPISODES, desc="episodes.."): 102 | non_zero_steps = np.count_nonzero(trajectories[model_idx, torque_idx, ep_num], axis=1) 103 | 104 | if np.count_nonzero(non_zero_steps) == 0: 105 | obs = env.reset() 106 | done = False 107 | 
cumulative = 0 108 | counter = 0 109 | img_buffer = [] 110 | while counter < MAX_EPISODE_STEPS: 111 | with torch.no_grad(): 112 | # print(counter, obs) 113 | action = policy.select_action(np.array(obs)) 114 | 115 | nobs, reward, _, misc = env.step(action) 116 | cumulative += reward 117 | 118 | trajectories[model_idx, torque_idx, ep_num, counter, :] = np.concatenate( 119 | [obs, action, nobs]) 120 | rewards[model_idx, torque_idx, ep_num, counter] = reward 121 | distances[model_idx, torque_idx, ep_num, counter] = misc["distance"] 122 | img_buffer.append(np.copy(misc["img"])) 123 | 124 | obs = np.copy(nobs) 125 | counter += 1 126 | 127 | imgs[model_idx, torque_idx, ep_num, :counter, :, :, :] = img_buffer 128 | 129 | f.flush() 130 | 131 | env.reset() 132 | -------------------------------------------------------------------------------- /common/randomizer/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | import os.path as osp 3 | 4 | 5 | register( 6 | id='FetchPushRandomizedEnv-v0', 7 | entry_point='common.randomizer.randomized_fetchpush:FetchPushRandomizedEnv', 8 | max_episode_steps=50, 9 | kwargs={ 10 | 'config': 'common/randomizer/config/FetchPushRandomized/random.json', 11 | 'xml_name': 'push.xml' 12 | } 13 | ) 14 | register( 15 | id='FetchSlideRandomizedEnv-v0', 16 | entry_point='common.randomizer.randomized_fetchslide:FetchSlideRandomizedEnv', 17 | max_episode_steps=50, 18 | kwargs={ 19 | 'config': 'common/randomizer/config/FetchSlideRandomized/random.json', 20 | 'xml_name': 'slide.xml' 21 | } 22 | ) 23 | 24 | register( 25 | id='FetchHookRandomizedEnv-v0', 26 | entry_point='common.randomizer.randomized_residual_hook:FetchHookEnv', 27 | max_episode_steps=100, 28 | kwargs={ 29 | 'config': 'common/randomizer/config/FetchHookRandomized/random.json', 30 | 'xml_name': 'hook.xml' 31 | } 32 | ) 33 | register( 34 | id='FetchHookDefaultEnv-v0', 35 | entry_point='common.randomizer.randomized_residual_hook:FetchHookEnv', 36 | max_episode_steps=100, 37 | kwargs={ 38 | 'config': 'common/randomizer/config/FetchHookRandomized/default.json', 39 | 'xml_name': 'hook.xml' 40 | } 41 | ) 42 | register( 43 | id='NoisyFetchHookRandomizedEnv-v0', 44 | entry_point='common.randomizer.randomized_residual_hook:NoisyResidualFetchHookEnv', 45 | max_episode_steps=100, 46 | kwargs={ 47 | 'config': 'common/randomizer/config/FetchHookRandomized/random.json', 48 | 'xml_name': 'hook.xml' 49 | } 50 | ) 51 | register( 52 | id='NoisyFetchHookDefaultEnv-v0', 53 | entry_point='common.randomizer.randomized_residual_hook:NoisyResidualFetchHookEnv', 54 | max_episode_steps=100, 55 | kwargs={ 56 | 'config': 'common/randomizer/config/FetchHookRandomized/default.json', 57 | 'xml_name': 'hook.xml' 58 | } 59 | ) 60 | register( 61 | id='ErgoPushRandomizedEnv-Headless-v0', 62 | entry_point='common.randomizer.randomized_ergo_pusher:ErgoPusherRandomizedEnv', 63 | max_episode_steps=100, 64 | reward_threshold=0, 65 | kwargs={ 66 | 'config': 'common/randomizer/config/ErgoPushRandomized/random.json', 67 | 'headless': True 68 | } 69 | ) 70 | register( 71 | id='ErgoPushDefaultEnv-Headless-v0', 72 | entry_point='common.randomizer.randomized_ergo_pusher:ErgoPusherRandomizedEnv', 73 | max_episode_steps=100, 74 | reward_threshold=0, 75 | kwargs={ 76 | 'config': 'common/randomizer/config/ErgoPushRandomized/default.json', 77 | 'headless': True 78 | } 79 | ) 80 | register( 81 | id='ErgoPushHardEnv-Headless-v0', 82 | 
entry_point='common.randomizer.randomized_ergo_pusher:ErgoPusherRandomizedEnv', 83 | max_episode_steps=100, 84 | reward_threshold=0, 85 | kwargs={ 86 | 'config': 'common/randomizer/config/ErgoPushRandomized/hard.json', 87 | 'headless': True 88 | } 89 | ) 90 | register( 91 | id='ErgoReacherRandomizedEnv-Headless-v0', 92 | entry_point='common.randomizer.randomized_ergoreacher:ErgoReacherRandomizedEnv', 93 | max_episode_steps=100, 94 | reward_threshold=0, 95 | kwargs={ 96 | 'config': 'common/randomizer/config/ErgoReacherRandomized/random.json', 97 | 'headless': True, 98 | 'simple': True, 99 | 'goal_halfsphere': True, 100 | 'multi_goal': True 101 | } 102 | ) 103 | register( 104 | id='ErgoReacherDefaultEnv-Headless-v0', 105 | entry_point='common.randomizer.randomized_ergoreacher:ErgoReacherRandomizedEnv', 106 | max_episode_steps=100, 107 | kwargs={ 108 | 'config': 'common/randomizer/config/ErgoReacherRandomized/default.json', 109 | 'headless': True, 110 | 'simple': True, 111 | 'goal_halfsphere': True, 112 | 'multi_goal': True 113 | } 114 | ) 115 | register( 116 | id='ErgoReacherHardEnv-Headless-v0', 117 | entry_point='common.randomizer.randomized_ergoreacher:ErgoReacherRandomizedEnv', 118 | max_episode_steps=100, 119 | kwargs={ 120 | 'config': 'common/randomizer/config/ErgoReacherRandomized/hard.json', 121 | 'headless': True, 122 | 'simple': True, 123 | 'goal_halfsphere': True, 124 | 'multi_goal': True 125 | } 126 | ) 127 | register( 128 | id='ErgoReacherDefaultEnv-Graphical-v0', 129 | entry_point='common.randomizer.randomized_ergoreacher:ErgoReacherRandomizedEnv', 130 | max_episode_steps=100, 131 | kwargs={ 132 | 'config': 'common/randomizer/config/ErgoReacherRandomized/hard.json', 133 | 'headless': False, 134 | 'simple': True, 135 | 'goal_halfsphere': True, 136 | 'multi_goal': True 137 | } 138 | ) 139 | register( 140 | id='ErgoPushRandomizedEnv-Graphical-v0', 141 | entry_point='common.randomizer.randomized_ergo_pusher:ErgoPusherRandomizedEnv', 142 | max_episode_steps=100, 143 | reward_threshold=0, 144 | kwargs={ 145 | 'config': 'common/randomizer/config/ErgoPushRandomized/random.json', 146 | 'headless': False 147 | } 148 | ) 149 | register( 150 | id='ErgoPushHardEnv-Graphical-v0', 151 | entry_point='common.randomizer.randomized_ergo_pusher:ErgoPusherRandomizedEnv', 152 | max_episode_steps=100, 153 | reward_threshold=0, 154 | kwargs={ 155 | 'config': 'common/randomizer/config/ErgoPushRandomized/hard.json', 156 | 'headless': False 157 | } 158 | ) 159 | 160 | register( 161 | id='ResFetchPushRandomizedEnv-v0', 162 | entry_point='common.randomizer.randomized_residual_push:ResidualPushRandomizedEnv', 163 | max_episode_steps=50, 164 | kwargs={ 165 | 'config': 'common/randomizer/config/FetchPushRandomized/random.json', 166 | 'xml_name': 'push.xml' 167 | } 168 | ) 169 | register( 170 | id='ResFetchPushDefaultEnv-v0', 171 | entry_point='common.randomizer.randomized_residual_push:ResidualPushRandomizedEnv', 172 | max_episode_steps=50, 173 | kwargs={ 174 | 'config': 'common/randomizer/config/FetchPushRandomized/default.json', 175 | 'xml_name': 'push.xml' 176 | } 177 | ) -------------------------------------------------------------------------------- /common/randomizer/assets/pusher.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 102 | -------------------------------------------------------------------------------- /experiments/ddpg_train.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import 
torch 3 | import gym 4 | import argparse 5 | import os 6 | import gym_ergojr 7 | from common.agents.ddpg.replay_buffer import ReplayBuffer 8 | from common.agents.ddpg.ddpg import DDPG 9 | import common.randomizer 10 | from common.randomizer.wrappers import RandomizedEnvWrapper 11 | from common.adr.adr import ADR 12 | import multiprocessing as mp 13 | from arguments import get_args 14 | 15 | 16 | def get_env_params(env): 17 | observation = env.reset() 18 | # close the environment 19 | params = {'obs': observation["observation"].shape[0], 'goal': observation["achieved_goal"].shape[0], 'action': env.action_space.shape[0], 20 | 'action_max': env.action_space.high[0], 'max_timesteps': env._max_episode_steps} 21 | return params 22 | 23 | 24 | args = get_args() 25 | args.save_models = True 26 | args.num_workers = 8 27 | 28 | env = gym.make(args.env_name) 29 | env_param = get_env_params(env) 30 | 31 | # jobid = os.environ['SLURM_ARRAY_TASK_ID'] 32 | # seed = [40, 41, 42, 43, 44] 33 | # args.seed = seed[int(jobid) - 1] # Set seeds 34 | 35 | env.seed(args.seed) 36 | torch.manual_seed(args.seed) 37 | np.random.seed(args.seed) 38 | args.nparticles = mp.cpu_count() - 1 39 | 40 | state_dim = env_param["obs"] 41 | action_dim = env_param["action"] 42 | max_action = env_param["action_max"] 43 | goal_dim = env_param["goal"] 44 | total_timesteps = 0 45 | timesteps_since_eval = 0 46 | episode_num = 0 47 | done = True 48 | env = RandomizedEnvWrapper(env, seed=args.seed) 49 | env.reset() 50 | svpg_rewards = [] 51 | 52 | # Initialize policy 53 | policy = DDPG(args, state_dim, action_dim, max_action, goal_dim) 54 | 55 | replay_buffer = ReplayBuffer(state_dim, action_dim) 56 | 57 | # ADR integration 58 | adr = ADR( 59 | nparticles=args.num_workers, 60 | nparams=args.n_params, 61 | state_dim=1, 62 | action_dim=1, 63 | temperature=10, 64 | svpg_rollout_length=args.svpg_rollout_length, 65 | svpg_horizon=25, 66 | max_step_length=0.05, 67 | reward_scale=1, 68 | initial_svpg_steps=0, 69 | seed=args.seed, 70 | discriminator_batchsz=320, 71 | ) 72 | count = 0 73 | args.save_dir = os.getcwd() + '/' + os.path.join(args.save_dir, 74 | 'sp{}polyak{}'.format(args.sp_percent, args.polyak) + '-' + args.approach, 75 | str(args.seed)) 76 | model_path = os.path.join(args.save_dir, args.env_name) 77 | 78 | if not os.path.isdir(model_path): 79 | os.makedirs(model_path) 80 | args.save_dir = model_path 81 | alice_envs = [] 82 | alice_envs_total = [] 83 | 84 | while total_timesteps < args.max_timesteps: 85 | if done: 86 | env.randomize([-1] * args.n_params) # Randomize the environment 87 | if total_timesteps != 0: 88 | print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format(total_timesteps, episode_num, 89 | episode_timesteps, episode_reward)) 90 | # Reset environment 91 | obs = env.reset() 92 | done = False 93 | episode_reward = 0 94 | episode_timesteps = 0 95 | episode_num += 1 96 | 97 | env_settings = adr.step_particles() 98 | env_settings = np.ascontiguousarray(env_settings) 99 | count = 0 100 | svpg_index = total_timesteps % args.svpg_rollout_length 101 | 102 | if np.random.random() < args.sp_percent: # Self-play loop 103 | env.randomize(["default"] * args.n_params) 104 | 105 | bobs_goal_state, alice_time = policy.alice_loop(args, env, env_param) # Alice Loop 106 | 107 | if total_timesteps % int(1e4) == 0: 108 | alice_envs_total.append(alice_envs) 109 | alice_envs = [] 110 | 111 | if not args.only_sp: 112 | multiplier = np.clip(env_settings[svpg_index][0][:args.n_params], 0, 1.0) 113 | alice_envs.append(multiplier) 114 
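                # `multiplier` holds one ADR/SVPG particle's proposed randomization
                # setting for this rollout step: adr.step_particles() returns
                # normalised parameter values, clipped to [0, 1] above, which the
                # RandomizedEnvWrapper presumably rescales into the physical ranges
                # declared in the env's JSON config (common/randomizer/config/...).
                # Illustrative usage, assuming n_params == 2 (values made up):
                #     env.randomize([0.25, 0.90])        # scaled dynamics for Bob
                #     env.randomize(["default"] * 2)     # back to nominal dynamics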
| env.randomize(multiplier) # Randomize the env for bob 115 | bob_time, done = policy.bob_loop(env, env_param, bobs_goal_state, alice_time, replay_buffer) # Bob Loop 116 | alice_reward = policy.train_alice(alice_time, bob_time) # Train alice 117 | svpg_rewards.append(alice_reward) 118 | if len(svpg_rewards) == args.num_workers * args.svpg_rollout_length: # ADR training 119 | all_rewards = np.reshape(np.asarray(svpg_rewards), (args.num_workers, args.svpg_rollout_length)) 120 | adr._train_particles(all_rewards) 121 | svpg_rewards = [] 122 | else: 123 | env.randomize(["default"] * args.n_params) 124 | bob_time, done = policy.bob_loop(env, env_param, bobs_goal_state, alice_time, replay_buffer) # Bob Loop with only_sp=True 125 | alice_reward = policy.train_alice(alice_time, bob_time) 126 | else: 127 | observation = env.reset() 128 | obs = observation["observation"] 129 | # Select action randomly or according to policy 130 | if total_timesteps < args.start_timesteps: 131 | action = env.action_space.sample() 132 | else: 133 | action = policy.select_action(np.array(obs)) 134 | if args.expl_noise != 0: 135 | action = (action + np.random.normal(0, args.expl_noise, size=env.action_space.shape[0])).clip( 136 | env.action_space.low, env.action_space.high) 137 | 138 | # Perform action 139 | new_obs, reward, done, _ = env.step(action) 140 | done_bool = 0 if episode_timesteps + 1 == env_param['max_timesteps'] else float(done) 141 | done = done or episode_timesteps + 1 == env_param['max_timesteps'] 142 | episode_reward += reward 143 | 144 | # Store data in replay buffer 145 | replay_buffer.add(obs, action, new_obs["observation"], reward, done_bool) 146 | obs = new_obs["observation"] 147 | # Train the policy after collecting sufficient data 148 | if total_timesteps >= args.start_timesteps: 149 | policy.train(replay_buffer, args.batch_size) 150 | 151 | if timesteps_since_eval >= args.eval_freq: 152 | timesteps_since_eval %= args.eval_freq 153 | 154 | if args.save_models: policy.save(f'model_{total_timesteps}', 155 | directory=args.save_dir) 156 | np.save(f"{args.save_dir}/alice_envs.npy", alice_envs_total) 157 | print("---------------------------------------") 158 | print("Env Name: %s | Seed : %s | sp-percent : %s" % (args.env_name, args.seed, args.sp_percent)) 159 | print("---------------------------------------") 160 | 161 | episode_timesteps += 1 162 | total_timesteps += 1 163 | timesteps_since_eval += 1 164 | 165 | if args.save_models: policy.save(f'model_{total_timesteps}', directory=args.save_dir) 166 | -------------------------------------------------------------------------------- /common/agents/ddpg/ddpg.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import numpy as np 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | from common.agents.selfplay_models import AlicePolicyFetch 7 | 8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 9 | 10 | 11 | # Re-tuned version of Deep Deterministic Policy Gradients (DDPG) 12 | # Paper: https://arxiv.org/abs/1509.02971 13 | 14 | 15 | class Actor(nn.Module): 16 | def __init__(self, state_dim, action_dim, max_action): 17 | super(Actor, self).__init__() 18 | 19 | self.l1 = nn.Linear(state_dim, 400) 20 | self.l2 = nn.Linear(400, 300) 21 | self.l3 = nn.Linear(300, action_dim) 22 | 23 | self.max_action = max_action 24 | 25 | def forward(self, state): 26 | a = F.relu(self.l1(state)) 27 | a = F.relu(self.l2(a)) 28 | return self.max_action * 
torch.tanh(self.l3(a)) 29 | 30 | 31 | class Critic(nn.Module): 32 | def __init__(self, state_dim, action_dim): 33 | super(Critic, self).__init__() 34 | 35 | self.l1 = nn.Linear(state_dim + action_dim, 400) 36 | self.l2 = nn.Linear(400, 300) 37 | self.l3 = nn.Linear(300, 1) 38 | 39 | def forward(self, state, action): 40 | q = F.relu(self.l1(torch.cat([state, action], 1))) 41 | q = F.relu(self.l2(q)) 42 | return self.l3(q) 43 | 44 | 45 | class DDPG(object): 46 | def __init__(self, args, state_dim, action_dim, max_action, goal_dim, discount=0.99, tau=0.005): 47 | self.args = args 48 | self.goal_dim = goal_dim 49 | self.actor = Actor(state_dim, action_dim, max_action).to(device) # Bob's policy 50 | self.actor_target = copy.deepcopy(self.actor) # Alice acting policy 51 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters()) 52 | 53 | self.critic = Critic(state_dim, action_dim).to(device) # Bob's critic network 54 | self.critic_target = copy.deepcopy(self.critic) # Alice's critic network 55 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters()) 56 | 57 | self.alice_policy = AlicePolicyFetch(self.args, goal_dim, action_dim=1) # Alice Policy 58 | 59 | self.discount = discount 60 | self.tau = tau 61 | 62 | def select_action(self, state): 63 | state = torch.FloatTensor(state.reshape(1, -1)).to(device) 64 | return self.actor(state).cpu().data.numpy().flatten() 65 | 66 | 67 | def train(self, replay_buffer, batch_size=100): 68 | 69 | # Sample replay buffer 70 | state, action, next_state, reward, not_done = replay_buffer.sample(batch_size) 71 | 72 | # Compute the target Q value 73 | target_Q = self.critic_target(next_state, self.actor_target(next_state)) 74 | target_Q = reward + (not_done * self.discount * target_Q).detach() 75 | 76 | # Get current Q estimate 77 | current_Q = self.critic(state, action) 78 | 79 | # Compute critic loss 80 | critic_loss = F.mse_loss(current_Q, target_Q) 81 | 82 | # Optimize the critic 83 | self.critic_optimizer.zero_grad() 84 | critic_loss.backward() 85 | self.critic_optimizer.step() 86 | 87 | # Compute actor loss 88 | actor_loss = -self.critic(state, self.actor(state)).mean() 89 | 90 | # Optimize the actor 91 | self.actor_optimizer.zero_grad() 92 | actor_loss.backward() 93 | self.actor_optimizer.step() 94 | 95 | # Update the frozen target models 96 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()): 97 | target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) 98 | 99 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()): 100 | target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data) 101 | 102 | def alice_loop(self, args, env, env_params): 103 | alice_done = False 104 | alice_time = 0 105 | env.randomize(["default"] * args.n_params) 106 | observation = env.reset() 107 | 108 | obs = observation["observation"] 109 | ag = observation["achieved_goal"] 110 | alice_state = np.concatenate([ag, np.zeros(ag.shape[0])]) 111 | 112 | while not alice_done and (alice_time < env_params['max_timesteps']): 113 | observation = torch.FloatTensor(obs.reshape(1, -1)).to(device) 114 | pi = self.actor_target(observation) 115 | action = pi.cpu().data.numpy().flatten() 116 | 117 | observation_new, reward, env_done, _ = env.step(action) 118 | obs_new = observation_new 119 | ag_new = observation_new["achieved_goal"] 120 | alice_signal = self.alice_policy.select_action(alice_state) 121 | bobs_goal_state = ag_new 122 | 123 | # Stopping Criteria 
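            # Alice stops when the env terminates, when the step budget runs out,
            # or when her stop-signal fires (after at least one step); her last
            # achieved_goal is handed to Bob as `bobs_goal_state`. train_alice()
            # then pays her
            #     reward_alice = sp_gamma * max(0, bob_time - alice_time)
            # so she is rewarded for goals she reaches quickly but Bob reaches
            # slowly, the usual asymmetric self-play incentive. E.g. with an
            # illustrative sp_gamma = 0.1, alice_time = 10 and bob_time = 30 give
            # reward_alice = 0.1 * 20 = 2.0.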
124 | if alice_signal > np.random.random(): alice_done = True 125 | alice_done = env_done or alice_time + 1 == env_params['max_timesteps'] or alice_signal and alice_time >= 1 126 | if not alice_done: 127 | alice_state[env_params["goal"]:] = ag_new 128 | bobs_goal_state = ag_new 129 | alice_time += 1 130 | self.alice_policy.log(0.0) 131 | obs = obs_new["observation"] 132 | ag = ag_new 133 | 134 | return bobs_goal_state, alice_time 135 | 136 | def bob_loop(self, env, env_params, bobs_goal_state, alice_time, replay_buffer): 137 | observation = env.reset() 138 | obs = observation["observation"] 139 | bob_done = False 140 | bob_time = 0 141 | while not bob_done and alice_time + bob_time < env_params['max_timesteps']: 142 | action = self.select_action(obs) 143 | new_obs, reward, env_done, _ = env.step(action) 144 | bob_signal = self._check_closeness(new_obs["achieved_goal"], bobs_goal_state) 145 | bob_done = env_done or bob_signal or bob_done 146 | 147 | if not bob_done: 148 | replay_buffer.add(obs, action, new_obs["observation"], reward, bob_done) 149 | obs = new_obs["observation"] 150 | bob_time += 1 151 | 152 | return bob_time, bob_done 153 | 154 | def train_alice(self, alice_time, bob_time): 155 | reward_alice = self.args.sp_gamma * max(0, bob_time - alice_time) 156 | self.alice_policy.log(reward_alice) 157 | self.alice_policy.finish_episode(gamma=0.99) 158 | return reward_alice 159 | 160 | @staticmethod 161 | def _check_closeness(state, goal): 162 | dist = np.linalg.norm(state - goal) 163 | return dist < 0.025 164 | 165 | def save(self, filename, directory): 166 | torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename)) 167 | torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename)) 168 | 169 | def load(self, filename, directory): 170 | self.actor.load_state_dict(torch.load('%s/%s' % (directory, filename), map_location=lambda storage, loc: storage)) 171 | # self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename), map_location='cpu')) 172 | -------------------------------------------------------------------------------- /common/randomizer/assets/robot.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /common/randomizer/complex_hook_env.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from gym import utils, spaces 4 | from gym.utils import seeding 5 | from gym.envs.robotics import rotations, fetch_env 6 | import gym 7 | 8 | 9 | import glob 10 | import os 11 | 12 | 13 | from .controllers.hook_controller import get_hook_control 14 | 15 | DIR_PATH = os.path.dirname(os.path.abspath(__file__)) 16 | 17 | 18 | class ComplexHookSingleObjectEnv(fetch_env.FetchEnv, utils.EzPickle): 19 | def __init__(self, xml_file=None): 20 | 
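        # The dict below pins the robot's base slide joints and sets the free
        # joints of the block (object0) and the hook; xml_file defaults to
        # assets/hook.xml, and ComplexHookEnv further down instead passes one of
        # the pre-generated train/test scene XMLs. _goal_pos and _object_xpos are
        # nominal poses that _sample_goal() and _reset_sim() perturb by about
        # +/- 0.05 m on every reset.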
initial_qpos = { 21 | 'robot0:slide0': 0.405, 22 | 'robot0:slide1': 0.48, 23 | 'robot0:slide2': 0.0, 24 | 'object0:joint': [1.25, 0.53, 0.6, 1., 0., 0., 0.], 25 | 'hook:joint': [1.35, 0.35, 0.6, 1., 0., 0., 0.], 26 | } 27 | 28 | if xml_file is None: 29 | xml_file = os.path.join(DIR_PATH, 'assets', 'hook.xml') 30 | 31 | self._goal_pos = np.array([1.65, 0.75, 0.42]) 32 | self._object_xpos = np.array([1.8, 0.75]) 33 | 34 | fetch_env.FetchEnv.__init__( 35 | self, xml_file, has_object=True, block_gripper=False, n_substeps=20, 36 | gripper_extra_height=0.2, target_in_the_air=True, target_offset=0.0, 37 | obj_range=None, target_range=None, distance_threshold=0.05, 38 | initial_qpos=initial_qpos, reward_type='sparse') 39 | 40 | utils.EzPickle.__init__(self) 41 | 42 | 43 | 44 | def render(self, mode="human", *args, **kwargs): 45 | # See https://github.com/openai/gym/issues/1081 46 | self._render_callback() 47 | if mode == 'rgb_array': 48 | self._get_viewer().render() 49 | width, height = 3350, 1800 50 | data = self._get_viewer().read_pixels(width, height, depth=False) 51 | # original image is upside-down, so flip it 52 | return data[::-1, :, :] 53 | elif mode == 'human': 54 | self._get_viewer().render() 55 | 56 | return super(ComplexHookSingleObjectEnv, self).render(*args, **kwargs) 57 | 58 | def _sample_goal(self): 59 | goal_pos = self._goal_pos.copy() 60 | goal_pos[:2] += self.np_random.uniform(-0.05, 0.05) 61 | return goal_pos 62 | 63 | def _viewer_setup(self): 64 | body_id = self.sim.model.body_name2id('robot0:gripper_link') 65 | lookat = self.sim.data.body_xpos[body_id] 66 | for idx, value in enumerate(lookat): 67 | self.viewer.cam.lookat[idx] = value 68 | self.viewer.cam.distance = 2.5 69 | self.viewer.cam.azimuth = 180. 70 | self.viewer.cam.elevation = -24. 71 | 72 | def _reset_sim(self): 73 | self.sim.set_state(self.initial_state) 74 | 75 | object_xpos_x = 1.65 + self.np_random.uniform(-0.05, 0.05) 76 | while True: 77 | object_xpos_x = 1.8 + self.np_random.uniform(-0.05, 0.05) 78 | object_xpos_y = 0.75 + self.np_random.uniform(-0.05, 0.05) 79 | if (object_xpos_x - self._goal_pos[0]) ** 2 + (object_xpos_y - self._goal_pos[1]) ** 2 >= 0.01: 80 | break 81 | self._object_xpos = np.array([object_xpos_x, object_xpos_y]) 82 | 83 | object_qpos = self.sim.data.get_joint_qpos('object0:joint') 84 | assert object_qpos.shape == (7,) 85 | object_qpos[:2] = self._object_xpos 86 | self.sim.data.set_joint_qpos('object0:joint', object_qpos) 87 | 88 | self.sim.forward() 89 | return True 90 | 91 | def _get_obs(self): 92 | obs = fetch_env.FetchEnv._get_obs(self) 93 | 94 | grip_pos = self.sim.data.get_site_xpos('robot0:grip') 95 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep 96 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt 97 | 98 | hook_pos = self.sim.data.get_site_xpos('hook') 99 | # rotations 100 | hook_rot = rotations.mat2euler(self.sim.data.get_site_xmat('hook')) 101 | # velocities 102 | hook_velp = self.sim.data.get_site_xvelp('hook') * dt 103 | hook_velr = self.sim.data.get_site_xvelr('hook') * dt 104 | # gripper state 105 | hook_rel_pos = hook_pos - grip_pos 106 | hook_velp -= grip_velp 107 | 108 | hook_observation = np.concatenate([hook_pos, hook_rot, hook_velp, hook_velr, hook_rel_pos]) 109 | 110 | obs['observation'] = np.concatenate([obs['observation'], hook_observation]) 111 | 112 | return obs 113 | 114 | 115 | class ResidualComplexHookSingleObjectEnv(ComplexHookSingleObjectEnv): 116 | def step(self, residual_action): 117 | residual_action = 2. 
* residual_action 118 | 119 | action = np.add(residual_action, get_hook_control(self._last_observation)) 120 | action = np.clip(action, -1, 1) 121 | observation, reward, done, debug_info = ComplexHookSingleObjectEnv.step(self, action) 122 | 123 | self._last_observation = observation 124 | 125 | return observation, reward, done, debug_info 126 | 127 | def compute_reward(self, *args, **kwargs): 128 | return ComplexHookSingleObjectEnv.compute_reward(self, *args, **kwargs) 129 | 130 | def reset(self): 131 | observation = ComplexHookSingleObjectEnv.reset(self) 132 | self._last_observation = observation 133 | 134 | return observation 135 | 136 | 137 | class TwoFrameResidualComplexHookSingleObjectEnv(ComplexHookSingleObjectEnv): 138 | def __init__(self, xml_file): 139 | super(TwoFrameResidualComplexHookSingleObjectEnv, self).__init__(xml_file) 140 | self.observation_space.spaces['observation'] = spaces.Box(low=np.hstack( 141 | (self.observation_space.spaces['observation'].low, self.observation_space.spaces['observation'].low)), 142 | high=np.hstack((self.observation_space.spaces[ 143 | 'observation'].high, 144 | self.observation_space.spaces[ 145 | 'observation'].high)), 146 | dtype=np.float32) 147 | 148 | def step(self, residual_action): 149 | residual_action = 2. * residual_action 150 | 151 | action = np.add(residual_action, get_hook_control(self._last_observation)) 152 | action = np.clip(action, -1, 1) 153 | observation, reward, done, debug_info = ComplexHookSingleObjectEnv.step(self, action) 154 | 155 | obs_out = observation.copy() 156 | obs_out['observation'] = np.hstack((self._last_observation['observation'], observation['observation'])) 157 | self._last_observation = observation 158 | 159 | return obs_out, reward, done, debug_info 160 | 161 | def reset(self): 162 | observation = ComplexHookSingleObjectEnv.reset(self) 163 | self._last_observation = observation.copy() 164 | observation['observation'] = np.hstack((self._last_observation['observation'], observation['observation'])) 165 | return observation 166 | 167 | 168 | class ComplexHookEnv(gym.Env): 169 | def __init__(self, train=True, num_scenes=100, scene_offset=0, subenv_type='normal', **kwargs): 170 | if train: 171 | xml_files = list(glob.glob(os.path.join(DIR_PATH, 'assets', 'fetch_complex_objects', 'fetch', 172 | 'train_scene_hook_*.xml'))) 173 | else: 174 | xml_files = list(glob.glob(os.path.join(DIR_PATH, 'assets', 'fetch_complex_objects', 'fetch', 175 | 'test_scene_hook_*.xml'))) 176 | 177 | xml_files = xml_files[scene_offset:scene_offset + num_scenes] 178 | 179 | self._subenvs = [] 180 | 181 | self.seed() 182 | 183 | for xml_file in xml_files: 184 | try: 185 | if subenv_type == 'residual': 186 | subenv = ResidualComplexHookSingleObjectEnv(xml_file) 187 | elif subenv_type == 'twoframe_residual': 188 | subenv = TwoFrameResidualComplexHookSingleObjectEnv(xml_file) 189 | elif subenv_type == 'normal': 190 | subenv = ComplexHookSingleObjectEnv(xml_file) 191 | else: 192 | print('subenv_type not recognized') 193 | self._subenvs.append(subenv) 194 | except: 195 | print("FAILED; skipping") 196 | 197 | self._num_envs = len(self._subenvs) 198 | 199 | self.action_space = self._subenvs[0].action_space 200 | self.observation = self._subenvs[0].observation_space 201 | self.metadata = self._subenvs[0].metadata 202 | self.config_file = kwargs.get('config') 203 | self.dimensions = [] 204 | self.reset() 205 | 206 | def _randomize_block_mass(self): 207 | block_mass = self.dimensions[0].current_value 208 | self._env.sim.model.body_mass[-2] = 
block_mass 209 | 210 | def _randomize_hook_mass(self): 211 | hook_mass = self.dimensions[1].current_value 212 | self._env.sim.model.body_mass[-1] = hook_mass 213 | 214 | def _randomize_friction(self): 215 | current_friction = self.dimensions[2].current_value 216 | for i in range(len(self._env.sim.model.geom_friction)): 217 | self._env.sim.model.geom_friction[i] = [current_friction, 5.e-3, 1e-4] 218 | 219 | def update_randomized_params(self): 220 | self._randomize_block_mass() 221 | self._randomize_hook_mass() 222 | self._randomize_friction() 223 | 224 | def reset(self): 225 | env_id = self.np_random.randint(self._num_envs) 226 | self._env = self._subenvs[env_id] 227 | return self._env.reset() 228 | 229 | def step(self, action): 230 | return self._env.step(action) 231 | 232 | def render(self, *args, **kwargs): 233 | return self._env.render(*args, **kwargs) 234 | 235 | def seed(self, seed=0): 236 | self.np_random, seed = seeding.np_random(seed) 237 | 238 | for subenv in self._subenvs: 239 | subenv.seed(seed=self.np_random.randint(1e8)) 240 | 241 | return [seed] 242 | 243 | def compute_reward(self, *args, **kwargs): 244 | return self._env.compute_reward(*args, **kwargs) 245 | 246 | 247 | class ResidualComplexHookEnv(ComplexHookEnv): 248 | def __init__(self, **kwargs): 249 | ComplexHookEnv.__init__(self, train=True, subenv_type='residual', **kwargs) 250 | 251 | 252 | class TwoFrameResidualComplexHookEnv(ComplexHookEnv): 253 | def __init__(self): 254 | ComplexHookEnv.__init__(self, train=True, subenv_type='twoframe_residual') 255 | 256 | 257 | -------------------------------------------------------------------------------- /common/randomizer/randomized_fetch.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | import xml.etree.ElementTree as et 5 | 6 | from gym.envs.robotics import rotations, robot_env, utils 7 | 8 | import mujoco_py 9 | 10 | def goal_distance(goal_a, goal_b): 11 | assert goal_a.shape == goal_b.shape 12 | return np.linalg.norm(goal_a - goal_b, axis=-1) 13 | 14 | 15 | class RandomizedFetchEnv(robot_env.RobotEnv): 16 | """Superclass for all Fetch environments. 17 | """ 18 | 19 | def __init__( 20 | self, model_path, n_substeps, gripper_extra_height, block_gripper, 21 | has_object, target_in_the_air, target_offset, obj_range, target_range, 22 | distance_threshold, initial_qpos, reward_type, **kwargs, 23 | ): 24 | """Initializes a new Fetch environment. 25 | 26 | Args: 27 | model_path (string): path to the environments XML file 28 | n_substeps (int): number of substeps the simulation runs on every call to step 29 | gripper_extra_height (float): additional height above the table when positioning the gripper 30 | block_gripper (boolean): whether or not the gripper is blocked (i.e. not movable) or not 31 | has_object (boolean): whether or not the environment has an object 32 | target_in_the_air (boolean): whether or not the target should be in the air above the table or on the table surface 33 | target_offset (float or array with 3 elements): offset of the target 34 | obj_range (float): range of a uniform distribution for sampling initial object positions 35 | target_range (float): range of a uniform distribution for sampling a target 36 | distance_threshold (float): the threshold after which a goal is considered achieved 37 | initial_qpos (dict): a dictionary of joint names and values that define the initial configuration 38 | reward_type ('sparse' or 'dense'): the reward type, i.e. 
sparse or dense 39 | """ 40 | self.gripper_extra_height = gripper_extra_height 41 | self.block_gripper = block_gripper 42 | self.has_object = has_object 43 | self.target_in_the_air = target_in_the_air 44 | self.target_offset = target_offset 45 | self.obj_range = obj_range 46 | self.target_range = target_range 47 | self.distance_threshold = distance_threshold 48 | self.reward_type = reward_type 49 | 50 | super(RandomizedFetchEnv, self).__init__( 51 | model_path=model_path, n_substeps=n_substeps, n_actions=4, 52 | initial_qpos=initial_qpos) 53 | 54 | # randomization 55 | self.xml_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "assets") 56 | self.reference_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "assets", kwargs.get('xml_name')) 57 | self.reference_xml = et.parse(self.reference_path) 58 | self.config_file = kwargs.get('config') 59 | self.dimensions = [] 60 | self.dimension_map = [] 61 | self.suffixes = [] 62 | self._locate_randomize_parameters() 63 | 64 | # Randomization methods 65 | # ---------------------------- 66 | def _locate_randomize_parameters(self): 67 | self.root = self.reference_xml.getroot() 68 | self.object_joints = self.root.findall(".//body[@name='object0']/joint") 69 | 70 | def _randomize_damping(self): 71 | damping = self.dimensions[0].current_value 72 | for joint in self.object_joints: 73 | joint.set('damping', '{:3f}'.format(damping)) 74 | 75 | def _randomize_friction(self): 76 | current_friction = self.dimensions[0].current_value 77 | for i in range(len(self.sim.model.geom_friction)): 78 | self.sim.model.geom_friction[i] = [current_friction, 5.e-3, 1e-4] 79 | 80 | def _create_xml(self): 81 | self._randomize_damping() 82 | return et.tostring(self.root, encoding='unicode', method='xml') 83 | 84 | def update_randomized_params(self): 85 | self._randomize_friction() 86 | xml = self._create_xml() 87 | # self._re_init(xml) 88 | 89 | def _re_init(self, xml): 90 | # TODO: Now, likely needs rank 91 | randomized_path = os.path.join(self.xml_dir, "randomizedfetch.xml") 92 | with open(randomized_path, 'wb') as fp: 93 | fp.write(xml.encode()) 94 | fp.flush() 95 | 96 | self.model = mujoco_py.load_model_from_path(randomized_path) 97 | self.sim = mujoco_py.MjSim(self.model) 98 | self.data = self.sim.data 99 | self.init_qpos = self.data.qpos.ravel().copy() 100 | self.init_qvel = self.data.qvel.ravel().copy() 101 | observation, _reward, done, _info = self.step(np.zeros(4)) 102 | assert not done 103 | if self.viewer: 104 | self.viewer.update_sim(self.sim) 105 | 106 | # GoalEnv methods 107 | # ---------------------------- 108 | 109 | def compute_reward(self, achieved_goal, goal, info): 110 | # Compute distance between goal and the achieved goal. 111 | d = goal_distance(achieved_goal, goal) 112 | if self.reward_type == 'sparse': 113 | return -(d > self.distance_threshold).astype(np.float32) 114 | else: 115 | return -d 116 | 117 | # RobotEnv methods 118 | # ---------------------------- 119 | 120 | def _step_callback(self): 121 | if self.block_gripper: 122 | self.sim.data.set_joint_qpos('robot0:l_gripper_finger_joint', 0.) 123 | self.sim.data.set_joint_qpos('robot0:r_gripper_finger_joint', 0.) 124 | self.sim.forward() 125 | 126 | def _set_action(self, action): 127 | assert action.shape == (4,) 128 | action = action.copy() # ensure that we don't change the action outside of this scope 129 | pos_ctrl, gripper_ctrl = action[:3], action[3] 130 | 131 | pos_ctrl *= 0.05 # limit maximum change in position 132 | rot_ctrl = [1., 0., 1., 0.] 
# fixed rotation of the end effector, expressed as a quaternion 133 | gripper_ctrl = np.array([gripper_ctrl, gripper_ctrl]) 134 | assert gripper_ctrl.shape == (2,) 135 | if self.block_gripper: 136 | gripper_ctrl = np.zeros_like(gripper_ctrl) 137 | action = np.concatenate([pos_ctrl, rot_ctrl, gripper_ctrl]) 138 | 139 | # Apply action to simulation. 140 | utils.ctrl_set_action(self.sim, action) 141 | utils.mocap_set_action(self.sim, action) 142 | 143 | def _get_obs(self): 144 | # positions 145 | grip_pos = self.sim.data.get_site_xpos('robot0:grip') 146 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep 147 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt 148 | robot_qpos, robot_qvel = utils.robot_get_obs(self.sim) 149 | if self.has_object: 150 | object_pos = self.sim.data.get_site_xpos('object0') 151 | # rotations 152 | object_rot = rotations.mat2euler(self.sim.data.get_site_xmat('object0')) 153 | # velocities 154 | object_velp = self.sim.data.get_site_xvelp('object0') * dt 155 | object_velr = self.sim.data.get_site_xvelr('object0') * dt 156 | # gripper state 157 | object_rel_pos = object_pos - grip_pos 158 | object_velp -= grip_velp 159 | else: 160 | object_pos = object_rot = object_velp = object_velr = object_rel_pos = np.zeros(0) 161 | gripper_state = robot_qpos[-2:] 162 | gripper_vel = robot_qvel[-2:] * dt # change to a scalar if the gripper is made symmetric 163 | 164 | if not self.has_object: 165 | achieved_goal = grip_pos.copy() 166 | else: 167 | achieved_goal = np.squeeze(object_pos.copy()) 168 | obs = np.concatenate([ 169 | grip_pos, object_pos.ravel(), object_rel_pos.ravel(), gripper_state, object_rot.ravel(), 170 | object_velp.ravel(), object_velr.ravel(), grip_velp, gripper_vel, 171 | ]) 172 | 173 | return { 174 | 'observation': obs.copy(), 175 | 'achieved_goal': achieved_goal.copy(), 176 | 'desired_goal': self.goal.copy(), 177 | } 178 | 179 | def _viewer_setup(self): 180 | body_id = self.sim.model.body_name2id('robot0:gripper_link') 181 | lookat = self.sim.data.body_xpos[body_id] 182 | for idx, value in enumerate(lookat): 183 | self.viewer.cam.lookat[idx] = value 184 | self.viewer.cam.distance = 2.5 185 | self.viewer.cam.azimuth = 132. 186 | self.viewer.cam.elevation = -14. 187 | 188 | def _render_callback(self): 189 | # Visualize target. 190 | sites_offset = (self.sim.data.site_xpos - self.sim.model.site_pos).copy() 191 | site_id = self.sim.model.site_name2id('target0') 192 | self.sim.model.site_pos[site_id] = self.goal - sites_offset[0] 193 | self.sim.forward() 194 | 195 | def _reset_sim(self): 196 | self.sim.set_state(self.initial_state) 197 | 198 | # Randomize start position of object. 
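        # Rejection sampling: redraw the object's XY offset (within +/- obj_range
        # of the gripper) until the object starts at least 0.1 m away from the
        # gripper, so it never spawns directly underneath it.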
199 | if self.has_object: 200 | object_xpos = self.initial_gripper_xpos[:2] 201 | while np.linalg.norm(object_xpos - self.initial_gripper_xpos[:2]) < 0.1: 202 | object_xpos = self.initial_gripper_xpos[:2] + self.np_random.uniform(-self.obj_range, self.obj_range, size=2) 203 | object_qpos = self.sim.data.get_joint_qpos('object0:joint') 204 | assert object_qpos.shape == (7,) 205 | object_qpos[:2] = object_xpos 206 | self.sim.data.set_joint_qpos('object0:joint', object_qpos) 207 | 208 | self.sim.forward() 209 | return True 210 | 211 | def _sample_goal(self): 212 | if self.has_object: 213 | goal = self.initial_gripper_xpos[:3] + self.np_random.uniform(-self.target_range, self.target_range, size=3) 214 | goal += self.target_offset 215 | goal[2] = self.height_offset 216 | if self.target_in_the_air and self.np_random.uniform() < 0.5: 217 | goal[2] += self.np_random.uniform(0, 0.45) 218 | else: 219 | goal = self.initial_gripper_xpos[:3] + self.np_random.uniform(-0.15, 0.15, size=3) 220 | return goal.copy() 221 | 222 | def _is_success(self, achieved_goal, desired_goal): 223 | d = goal_distance(achieved_goal, desired_goal) 224 | return (d < self.distance_threshold).astype(np.float32) 225 | 226 | def _env_setup(self, initial_qpos): 227 | for name, value in initial_qpos.items(): 228 | self.sim.data.set_joint_qpos(name, value) 229 | utils.reset_mocap_welds(self.sim) 230 | self.sim.forward() 231 | 232 | # Move end effector into position. 233 | gripper_target = np.array([-0.498, 0.005, -0.431 + self.gripper_extra_height]) + self.sim.data.get_site_xpos('robot0:grip') 234 | gripper_rotation = np.array([1., 0., 1., 0.]) 235 | self.sim.data.set_mocap_pos('robot0:mocap', gripper_target) 236 | self.sim.data.set_mocap_quat('robot0:mocap', gripper_rotation) 237 | for _ in range(10): 238 | self.sim.step() 239 | 240 | # Extract information for sampling goals. 241 | self.initial_gripper_xpos = self.sim.data.get_site_xpos('robot0:grip').copy() 242 | if self.has_object: 243 | self.height_offset = self.sim.data.get_site_xpos('object0')[2] 244 | 245 | def render(self, mode='human', width=500, height=500): 246 | return super(RandomizedFetchEnv, self).render() 247 | -------------------------------------------------------------------------------- /common/randomizer/residual_push.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | import xml.etree.ElementTree as et 5 | 6 | from .residual_envs.residual_fetch_push_env import ResidualSlipperyPushEnv 7 | from gym.envs.robotics import rotations, robot_env, utils 8 | import mujoco_py 9 | 10 | def goal_distance(goal_a, goal_b): 11 | assert goal_a.shape == goal_b.shape 12 | return np.linalg.norm(goal_a - goal_b, axis=-1) 13 | 14 | 15 | class RandomizedResidualPushEnv(ResidualSlipperyPushEnv): 16 | """Superclass for all Fetch environments. 17 | """ 18 | 19 | def __init__( 20 | self, model_path, n_substeps, gripper_extra_height, block_gripper, 21 | has_object, target_in_the_air, target_offset, obj_range, target_range, 22 | distance_threshold, initial_qpos, reward_type, **kwargs 23 | ): 24 | """Initializes a new Fetch environment. 25 | 26 | Args: 27 | model_path (string): path to the environments XML file 28 | n_substeps (int): number of substeps the simulation runs on every call to step 29 | gripper_extra_height (float): additional height above the table when positioning the gripper 30 | block_gripper (boolean): whether or not the gripper is blocked (i.e. 
not movable) or not 31 | has_object (boolean): whether or not the environment has an object 32 | target_in_the_air (boolean): whether or not the target should be in the air above the table or on the table surface 33 | target_offset (float or array with 3 elements): offset of the target 34 | obj_range (float): range of a uniform distribution for sampling initial object positions 35 | target_range (float): range of a uniform distribution for sampling a target 36 | distance_threshold (float): the threshold after which a goal is considered achieved 37 | initial_qpos (dict): a dictionary of joint names and values that define the initial configuration 38 | reward_type ('sparse' or 'dense'): the reward type, i.e. sparse or dense 39 | """ 40 | 41 | self.gripper_extra_height = gripper_extra_height 42 | self.block_gripper = block_gripper 43 | self.has_object = has_object 44 | self.target_in_the_air = target_in_the_air 45 | self.target_offset = target_offset 46 | self.obj_range = obj_range 47 | self.target_range = target_range 48 | self.distance_threshold = distance_threshold 49 | self.reward_type = reward_type 50 | 51 | super(RandomizedResidualPushEnv, self).__init__( 52 | model_path=model_path, n_substeps=n_substeps, n_actions=4, 53 | initial_qpos=initial_qpos) 54 | 55 | # randomization 56 | self.xml_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "assets") 57 | self.reference_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "assets", 58 | kwargs.get('xml_name')) 59 | self.reference_xml = et.parse(self.reference_path) 60 | self.config_file = kwargs.get('config') 61 | self.dimensions = [] 62 | self.dimension_map = [] 63 | self.suffixes = [] 64 | self.reset() 65 | self._locate_randomize_parameters() 66 | 67 | 68 | # Randomization methods 69 | # ---------------------------- 70 | def _locate_randomize_parameters(self): 71 | self.root = self.reference_xml.getroot() 72 | self.object_joints = self.root.findall(".//body[@name='object0']/joint") 73 | 74 | def _randomize_damping(self): 75 | damping = self.dimensions[0].current_value 76 | for joint in self.object_joints: 77 | joint.set('damping', '{:3f}'.format(damping)) 78 | 79 | def _randomize_friction(self): 80 | current_friction = self.dimensions[0].current_value 81 | for i in range(len(self.fetch_env.env.sim.model.geom_friction)): 82 | self.fetch_env.env.sim.model.geom_friction[i] = [current_friction, 5.e-3, 1e-4] 83 | 84 | def _create_xml(self): 85 | self._randomize_friction() 86 | return et.tostring(self.root, encoding='unicode', method='xml') 87 | 88 | def update_randomized_params(self): 89 | self._randomize_friction() 90 | xml = self._create_xml() 91 | # # self._re_init(xml) 92 | 93 | def _re_init(self, xml): 94 | # TODO: Now, likely needs rank 95 | randomized_path = os.path.join(self.xml_dir, "pusher.xml") 96 | with open(randomized_path, 'wb') as fp: 97 | fp.write(xml.encode()) 98 | fp.flush() 99 | 100 | self.model = mujoco_py.load_model_from_path(randomized_path) 101 | self.sim = mujoco_py.MjSim(self.model) 102 | self.data = self.sim.data 103 | self.init_qpos = self.data.qpos.ravel().copy() 104 | self.init_qvel = self.data.qvel.ravel().copy() 105 | 106 | observation, _reward, done, _info = self.step(np.zeros(4)) 107 | assert not done 108 | if self.viewer: 109 | self.viewer.update_sim(self.sim) 110 | 111 | # GoalEnv methods 112 | # ---------------------------- 113 | 114 | def compute_reward(self, achieved_goal, goal, info): 115 | # Compute distance between goal and the achieved goal. 
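        # 'sparse' returns 0.0 when the achieved goal lies within
        # distance_threshold of the desired goal and -1.0 otherwise; 'dense'
        # returns the negative distance itself. For instance, with
        # distance_threshold = 0.05: d = 0.03 -> 0.0 (sparse) / -0.03 (dense),
        # d = 0.20 -> -1.0 (sparse) / -0.20 (dense).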
116 | d = goal_distance(achieved_goal, goal) 117 | if self.reward_type == 'sparse': 118 | return -(d > self.distance_threshold).astype(np.float32) 119 | else: 120 | return -d 121 | 122 | # RobotEnv methods 123 | # ---------------------------- 124 | 125 | def _step_callback(self): 126 | if self.block_gripper: 127 | self.sim.data.set_joint_qpos('robot0:l_gripper_finger_joint', 0.) 128 | self.sim.data.set_joint_qpos('robot0:r_gripper_finger_joint', 0.) 129 | self.sim.forward() 130 | 131 | def _set_action(self, action): 132 | assert action.shape == (4,) 133 | action = action.copy() # ensure that we don't change the action outside of this scope 134 | pos_ctrl, gripper_ctrl = action[:3], action[3] 135 | 136 | pos_ctrl *= 0.05 # limit maximum change in position 137 | rot_ctrl = [1., 0., 1., 0.] # fixed rotation of the end effector, expressed as a quaternion 138 | gripper_ctrl = np.array([gripper_ctrl, gripper_ctrl]) 139 | assert gripper_ctrl.shape == (2,) 140 | if self.block_gripper: 141 | gripper_ctrl = np.zeros_like(gripper_ctrl) 142 | action = np.concatenate([pos_ctrl, rot_ctrl, gripper_ctrl]) 143 | 144 | # Apply action to simulation. 145 | utils.ctrl_set_action(self.sim, action) 146 | utils.mocap_set_action(self.sim, action) 147 | 148 | 149 | def _get_obs(self): 150 | # positions 151 | grip_pos = self.sim.data.get_site_xpos('robot0:grip') 152 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep 153 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt 154 | robot_qpos, robot_qvel = utils.robot_get_obs(self.sim) 155 | if self.has_object: 156 | object_pos = self.sim.data.get_site_xpos('object0') 157 | # rotations 158 | object_rot = rotations.mat2euler(self.sim.data.get_site_xmat('object0')) 159 | # velocities 160 | object_velp = self.sim.data.get_site_xvelp('object0') * dt 161 | object_velr = self.sim.data.get_site_xvelr('object0') * dt 162 | # gripper state 163 | object_rel_pos = object_pos - grip_pos 164 | object_velp -= grip_velp 165 | else: 166 | object_pos = object_rot = object_velp = object_velr = object_rel_pos = np.zeros(0) 167 | gripper_state = robot_qpos[-2:] 168 | gripper_vel = robot_qvel[-2:] * dt # change to a scalar if the gripper is made symmetric 169 | 170 | if not self.has_object: 171 | achieved_goal = grip_pos.copy() 172 | else: 173 | achieved_goal = np.squeeze(object_pos.copy()) 174 | obs = np.concatenate([ 175 | grip_pos, object_pos.ravel(), object_rel_pos.ravel(), gripper_state, object_rot.ravel(), 176 | object_velp.ravel(), object_velr.ravel(), grip_velp, gripper_vel, 177 | ]) 178 | 179 | return { 180 | 'observation': obs.copy(), 181 | 'achieved_goal': achieved_goal.copy(), 182 | 'desired_goal': self.goal.copy(), 183 | } 184 | 185 | # def _viewer_setup(self): 186 | # body_id = self.sim.model.body_name2id('robot0:gripper_link') 187 | # lookat = self.sim.data.body_xpos[body_id] 188 | # for idx, value in enumerate(lookat): 189 | # self.viewer.cam.lookat[idx] = value 190 | # self.viewer.cam.distance = 2.5 191 | # self.viewer.cam.azimuth = 132. 192 | # self.viewer.cam.elevation = -14. 193 | 194 | def _render_callback(self): 195 | # Visualize target. 196 | sites_offset = (self.sim.data.site_xpos - self.sim.model.site_pos).copy() 197 | site_id = self.sim.model.site_name2id('target0') 198 | self.sim.model.site_pos[site_id] = self.goal - sites_offset[0] 199 | self.sim.forward() 200 | 201 | def _reset_sim(self): 202 | self.sim.set_state(self.initial_state) 203 | 204 | # Randomize start position of object. 
205 | if self.has_object: 206 | object_xpos = self.initial_gripper_xpos[:2] 207 | while np.linalg.norm(object_xpos - self.initial_gripper_xpos[:2]) < 0.1: 208 | object_xpos = self.initial_gripper_xpos[:2] + self.np_random.uniform(-self.obj_range, self.obj_range, 209 | size=2) 210 | object_qpos = self.sim.data.get_joint_qpos('object0:joint') 211 | assert object_qpos.shape == (7,) 212 | object_qpos[:2] = object_xpos 213 | self.sim.data.set_joint_qpos('object0:joint', object_qpos) 214 | 215 | self.sim.forward() 216 | return True 217 | 218 | def _sample_goal(self): 219 | if self.has_object: 220 | goal = self.initial_gripper_xpos[:3] + self.np_random.uniform(-self.target_range, self.target_range, size=3) 221 | goal += self.target_offset 222 | goal[2] = self.height_offset 223 | if self.target_in_the_air and self.np_random.uniform() < 0.5: 224 | goal[2] += self.np_random.uniform(0, 0.45) 225 | else: 226 | goal = self.initial_gripper_xpos[:3] + self.np_random.uniform(-0.15, 0.15, size=3) 227 | return goal.copy() 228 | 229 | def _is_success(self, achieved_goal, desired_goal): 230 | d = goal_distance(achieved_goal, desired_goal) 231 | return (d < self.distance_threshold).astype(np.float32) 232 | 233 | def _env_setup(self, initial_qpos): 234 | for name, value in initial_qpos.items(): 235 | self.sim.data.set_joint_qpos(name, value) 236 | utils.reset_mocap_welds(self.sim) 237 | self.sim.forward() 238 | 239 | # Move end effector into position. 240 | gripper_target = np.array([-0.498, 0.005, -0.431 + self.gripper_extra_height]) + self.sim.data.get_site_xpos( 241 | 'robot0:grip') 242 | gripper_rotation = np.array([1., 0., 1., 0.]) 243 | self.sim.data.set_mocap_pos('robot0:mocap', gripper_target) 244 | self.sim.data.set_mocap_quat('robot0:mocap', gripper_rotation) 245 | for _ in range(10): 246 | self.sim.step() 247 | 248 | # Extract information for sampling goals. 
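        # initial_gripper_xpos anchors goal sampling: _sample_goal() above draws
        # goals uniformly within +/- target_range of it, snaps them to the table
        # surface via height_offset (the object's initial height), and, when
        # target_in_the_air is set, lifts about half of them by up to 0.45 m.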
249 | self.initial_gripper_xpos = self.sim.data.get_site_xpos('robot0:grip').copy() 250 | if self.has_object: 251 | self.height_offset = self.sim.data.get_site_xpos('object0')[2] 252 | 253 | def render(self, mode='human', width=500, height=500): 254 | 255 | return super(RandomizedResidualPushEnv, self).render() -------------------------------------------------------------------------------- /common/svpg/svpg.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch.optim as optim 4 | from torch.distributions.kl import kl_divergence 5 | 6 | from scipy.spatial.distance import squareform, pdist 7 | 8 | from common.svpg.particles import SVPGParticle 9 | from common.svpg.svpg_utils import * 10 | 11 | 12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 13 | torch.set_default_tensor_type(torch.FloatTensor) 14 | eps = np.finfo(np.float32).eps.item() 15 | 16 | LEARNING_RATE = 3e-4 17 | HIDDEN_DIMENSIONS = 100 18 | 19 | class SVPG: 20 | """Class that implements Stein Variational Policy Gradient 21 | Input is the current randomization settings, and output is either a 22 | direction to move in (Discrete - for 1D/2D) or a delta across all parameter (Continuous) 23 | """ 24 | def __init__(self, nparticles, nparams, max_step_length, svpg_rollout_length, svpg_horizon, temperature, discrete=False, kld_coefficient=0.0): 25 | self.particles = [] 26 | self.prior_particles = [] 27 | self.optimizers = [] 28 | self.max_step_length = max_step_length 29 | self.svpg_rollout_length = svpg_rollout_length 30 | self.svpg_horizon = svpg_horizon 31 | self.temperature = temperature 32 | self.nparticles = nparticles 33 | self.gamma = 0.99 34 | 35 | self.nparams = nparams 36 | self.noutputs = nparams * 2 if discrete else nparams 37 | self.discrete = discrete 38 | self.kld_coefficient = kld_coefficient 39 | 40 | self.last_states = np.random.uniform(0, 1, (self.nparticles, self.nparams)) 41 | self.timesteps = np.zeros(self.nparticles) 42 | 43 | for i in range(self.nparticles): 44 | # Initialize each of the individual particles 45 | policy = SVPGParticle(input_dim=self.nparams, 46 | output_dim=self.noutputs, 47 | hidden_dim=HIDDEN_DIMENSIONS, 48 | discrete=discrete).to(device) 49 | 50 | prior_policy = SVPGParticle(input_dim=self.nparams, 51 | output_dim=self.noutputs, 52 | hidden_dim=HIDDEN_DIMENSIONS, 53 | discrete=discrete, 54 | freeze=True).to(device) 55 | 56 | optimizer = optim.Adam(policy.parameters(), lr=LEARNING_RATE) 57 | self.particles.append(policy) 58 | self.prior_particles.append(prior_policy) 59 | self.optimizers.append(optimizer) 60 | 61 | def _squared_dist(self, X): 62 | # Compute squared distance matrix using torch.pdist 63 | dists = torch.pdist(X) 64 | inds_l, inds_r = np.triu_indices(X.shape[-2], 1) 65 | res = torch.zeros(*X.shape[:-2], X.shape[-2], X.shape[-2], dtype=X.dtype, device=X.device) 66 | res[..., inds_l, inds_r] = dists 67 | res[..., inds_r, inds_l] = dists 68 | 69 | return res 70 | 71 | def _Kxx_dxKxx(self, X): 72 | """ 73 | Computes covariance matrix K(X,X) and its gradient w.r.t. X 74 | for RBF kernel with design matrix X, as in the second term in eqn (8) 75 | of reference SVGD paper. 76 | 77 | Args: 78 | X (Tensor): (S, P), design matrix of samples, where S is num of 79 | samples, P is the dim of each sample which stacks all params 80 | into a (1, P) row. Thus P could be 1 millions. 
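        The kernel used below is the RBF kernel
            K(x_i, x_j) = exp(-||x_i - x_j||^2 / (2 * h^2)),
        with the bandwidth h set by the median heuristic,
            h = sqrt(0.5 * median(||x_i - x_j||^2) / log(n + 1)),
        n being the number of particles; Kxx is K(X, X) and dxKxx is the
        kernel-gradient (repulsive) term that enters the SVGD update.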
81 | """ 82 | 83 | X_np = X.cpu().data.numpy() 84 | pairwise_dists = squareform(pdist(X_np))**2 85 | 86 | # Median trick 87 | h = np.median(pairwise_dists) 88 | h = np.sqrt(0.5 * h / np.log(self.nparticles+1)) 89 | 90 | # Compute RBF Kernel 91 | Kxx = torch.exp(-torch.from_numpy(pairwise_dists).to(device).float() / h**2 / 2) 92 | 93 | # Compute kernel gradient 94 | dxKxx = -(Kxx).matmul(X) 95 | sumKxx = Kxx.sum(1) 96 | for i in range(X.shape[1]): 97 | dxKxx[:, i] = dxKxx[:, i] + X[:, i].matmul(sumKxx) 98 | dxKxx /= (h ** 2) 99 | 100 | return Kxx, dxKxx 101 | 102 | def select_action(self, policy_idx, state): 103 | state = torch.from_numpy(state).float().unsqueeze(0).to(device) 104 | policy = self.particles[policy_idx] 105 | prior_policy = self.prior_particles[policy_idx] 106 | dist, value = policy(state) 107 | prior_dist, _ = prior_policy(state) 108 | 109 | action = dist.sample() 110 | 111 | policy.saved_log_probs.append(dist.log_prob(action)) 112 | policy.saved_klds.append(kl_divergence(dist, prior_dist)) 113 | 114 | # TODO: Gross temporary hack 115 | if self.nparams == 1 or self.discrete: 116 | action = action.item() 117 | else: 118 | action = action.squeeze().cpu().detach().numpy() 119 | 120 | return action, value 121 | 122 | def compute_returns(self, next_value, rewards, masks, klds): 123 | R = next_value 124 | returns = [] 125 | for step in reversed(range(len(rewards))): 126 | # Eq. 80: https://arxiv.org/abs/1704.06440 127 | R = self.gamma * masks[step] * R + (rewards[step] - self.kld_coefficient * klds[step]) 128 | returns.insert(0, R) 129 | 130 | return returns 131 | 132 | def step(self): 133 | """Rollout trajectories, starting from random initializations, 134 | of randomization settings, each of svpg_rollout_length size 135 | Then, send it to agent for further training and reward calculation 136 | """ 137 | self.simulation_instances = np.zeros((self.nparticles, self.svpg_rollout_length, self.nparams)) 138 | 139 | # Store the values of each state - for advantage estimation 140 | self.values = torch.zeros((self.nparticles, self.svpg_rollout_length, 1)).float().to(device) 141 | # Store the last states for each particle (calculating rewards) 142 | self.masks = np.ones((self.nparticles, self.svpg_rollout_length)) 143 | 144 | for i in range(self.nparticles): 145 | self.particles[i].reset() 146 | current_sim_params = self.last_states[i] 147 | 148 | for t in range(self.svpg_rollout_length): 149 | self.simulation_instances[i][t] = current_sim_params 150 | 151 | action, value = self.select_action(i, current_sim_params) 152 | self.values[i][t] = value 153 | 154 | action = self._process_action(action) 155 | clipped_action = action * self.max_step_length 156 | next_params = np.clip(current_sim_params + clipped_action, 0, 1) 157 | 158 | if np.array_equal(next_params, current_sim_params) or self.timesteps[i] + 1 == self.svpg_horizon: 159 | next_params = np.random.uniform(0, 1, (self.nparams,)) 160 | 161 | self.masks[i][t] = 0 # done = True 162 | self.timesteps[i] = 0 163 | 164 | current_sim_params = next_params 165 | self.timesteps[i] += 1 166 | 167 | self.last_states[i] = current_sim_params 168 | 169 | return np.array(self.simulation_instances) 170 | 171 | def train(self, simulator_rewards): 172 | """Train SVPG agent with rewards from rollouts 173 | """ 174 | policy_grads = [] 175 | parameters = [] 176 | 177 | for i in range(self.nparticles): 178 | policy_grad_particle = [] 179 | 180 | # Calculate the value of last state - for Return Computation 181 | _, next_value = self.select_action(i, 
self.last_states[i]) 182 | 183 | particle_rewards = torch.from_numpy(simulator_rewards[i]).float().to(device) 184 | masks = torch.from_numpy(self.masks[i]).float().to(device) 185 | 186 | # Calculate entropy-augmented returns, advantages 187 | returns = self.compute_returns(next_value, particle_rewards, masks, self.particles[i].saved_klds) 188 | returns = torch.cat(returns).detach() 189 | advantages = returns - self.values[i] 190 | # for s, v, r in zip(self.simulation_instances[i], self.values[i], simulator_rewards[i]): 191 | # print('Setting: {}, Reward: {}, Value: {}'.format(s,r,v)) 192 | 193 | # logprob * A = policy gradient (before backwards) 194 | for log_prob, advantage in zip(self.particles[i].saved_log_probs, advantages): 195 | policy_grad_particle.append(log_prob * advantage.detach()) 196 | 197 | # Compute value loss, update critic 198 | self.optimizers[i].zero_grad() 199 | critic_loss = 0.5 * advantages.pow(2).mean() 200 | critic_loss.backward(retain_graph=True) 201 | self.optimizers[i].step() 202 | 203 | # Store policy gradients for SVPG update 204 | self.optimizers[i].zero_grad() 205 | policy_grad = -torch.cat(policy_grad_particle).mean() 206 | policy_grad.backward() 207 | 208 | # Vectorize parameters and PGs 209 | vec_param, vec_policy_grad = parameters_to_vector( 210 | self.particles[i].parameters(), both=True) 211 | 212 | policy_grads.append(vec_policy_grad.unsqueeze(0)) 213 | parameters.append(vec_param.unsqueeze(0)) 214 | 215 | # calculating the kernel matrix and its gradients 216 | parameters = torch.cat(parameters) 217 | Kxx, dxKxx = self._Kxx_dxKxx(parameters) 218 | 219 | policy_grads = 1 / self.temperature * torch.cat(policy_grads) 220 | grad_logp = torch.mm(Kxx, policy_grads) 221 | 222 | grad_theta = (grad_logp + dxKxx) / self.nparticles 223 | # explicitly deleting variables does not release memory :( 224 | 225 | # update param gradients 226 | for i in range(self.nparticles): 227 | vector_to_parameters(grad_theta[i], 228 | self.particles[i].parameters(), grad=True) 229 | self.optimizers[i].step() 230 | 231 | def save(self, directory): 232 | for i in range(self.nparticles): 233 | torch.save(self.particles[i].state_dict(), '{}/particle_{}.pth'.format(directory, i)) 234 | torch.save(self.particles[i].critic.state_dict(), '{}/particle_critic{}.pth'.format(directory, i)) 235 | 236 | def load(self, directory): 237 | return # TODO: Does this solve the value function issue? 238 | for i in range(self.nparticles): 239 | prior = torch.load('{}/particle_{}.pth'.format(directory, i)) 240 | particle = self.particles[i].state_dict() 241 | actor = {k: v for k, v in prior.items() if k.find('critic') == -1} 242 | 243 | # Only load the actor! 244 | self.particles[i].load_state_dict(actor, strict=False) 245 | 246 | def load_prior_particles(self, directory): 247 | for i in range(self.nparticles): 248 | self.prior_particles[i].load_state_dict(torch.load('{}/particle_{}.pth'.format(directory, i), map_location=device)) 249 | self.prior_particles[i].freeze() 250 | 251 | def _process_action(self, action): 252 | """Transform policy output into environment-action 253 | """ 254 | if self.discrete: 255 | if self.nparams == 1: 256 | if action == 0: 257 | action = [-1.] 258 | elif action == 1: 259 | action = [1.] 260 | elif self.nparams == 2: 261 | if action == 0: 262 | action = [-1., 0] 263 | elif action == 1: 264 | action = [1., 0] 265 | elif action == 2: 266 | action = [0, -1.] 267 | elif action == 3: 268 | action = [0, 1.] 
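        # Continuous case below: a scalar action is clipped to [-1, 1], while a
        # vector action is L2-normalised to a unit step direction; step() then
        # scales it by max_step_length before moving the randomization setting.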
269 | else: 270 | if isinstance(action, float): 271 | action = np.clip(action, -1, 1) 272 | else: 273 | action /= np.linalg.norm(action, ord=2) 274 | 275 | return np.array(action) 276 | -------------------------------------------------------------------------------- /common/randomizer/fetch_hook_env.py: -------------------------------------------------------------------------------- 1 | from gym.envs.robotics import rotations, fetch_env 2 | from gym import utils, spaces 3 | import numpy as np 4 | import os 5 | 6 | from .controllers.hook_controller import get_hook_control 7 | 8 | import pdb 9 | 10 | DIR_PATH = os.path.dirname(os.path.abspath(__file__)) 11 | 12 | 13 | class FetchHookEnv(fetch_env.FetchEnv, utils.EzPickle): 14 | def __init__(self, xml_file=None): 15 | initial_qpos = { 16 | 'robot0:slide0': 0.405, 17 | 'robot0:slide1': 0.48, 18 | 'robot0:slide2': 0.0, 19 | 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.], 20 | 'hook:joint': [1.35, 0.35, 0.4, 1., 0., 0., 0.], 21 | } 22 | 23 | if xml_file is None: 24 | xml_file = os.path.join(DIR_PATH, 'assets_residual', 'hook.xml') 25 | 26 | self._goal_pos = np.array([1.65, 0.75, 0.42]) 27 | self._object_xpos = np.array([1.8, 0.75]) 28 | 29 | fetch_env.FetchEnv.__init__( 30 | self, xml_file, has_object=True, block_gripper=False, n_substeps=20, 31 | gripper_extra_height=0.2, target_in_the_air=True, target_offset=0.0, 32 | obj_range=None, target_range=None, distance_threshold=0.05, 33 | initial_qpos=initial_qpos, reward_type='sparse') 34 | 35 | utils.EzPickle.__init__(self) 36 | 37 | def render(self, mode="human", *args, **kwargs): 38 | # See https://github.com/openai/gym/issues/1081 39 | self._render_callback() 40 | if mode == 'rgb_array': 41 | self._get_viewer().render() 42 | width, height = 3350, 1800 43 | data = self._get_viewer().read_pixels(width, height, depth=False) 44 | # original image is upside-down, so flip it 45 | return data[::-1, :, :] 46 | elif mode == 'human': 47 | self._get_viewer().render() 48 | 49 | return super(FetchHookEnv, self).render(*args, **kwargs) 50 | 51 | def _sample_goal(self): 52 | goal_pos = self._goal_pos.copy() 53 | goal_pos[:2] += self.np_random.uniform(-0.05, 0.05) 54 | return goal_pos 55 | 56 | def _viewer_setup(self): 57 | body_id = self.sim.model.body_name2id('robot0:gripper_link') 58 | lookat = self.sim.data.body_xpos[body_id] 59 | for idx, value in enumerate(lookat): 60 | self.viewer.cam.lookat[idx] = value 61 | self.viewer.cam.distance = 2.5 62 | self.viewer.cam.azimuth = 180. 63 | self.viewer.cam.elevation = -24. 
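    # _reset_sim below rejection-samples the block's start position: its XY is
    # redrawn around (1.8, 0.75) until the squared distance to the nominal goal
    # at (1.65, 0.75) is at least 0.01 (i.e. 0.1 m apart), so an episode never
    # starts with the block already on the goal.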
64 | 65 | def _reset_sim(self): 66 | self.sim.set_state(self.initial_state) 67 | 68 | object_xpos_x = 1.65 + self.np_random.uniform(-0.05, 0.05) 69 | while True: 70 | object_xpos_x = 1.8 + self.np_random.uniform(-0.05, 0.10) 71 | object_xpos_y = 0.75 + self.np_random.uniform(-0.05, 0.05) 72 | if (object_xpos_x - self._goal_pos[0]) ** 2 + (object_xpos_y - self._goal_pos[1]) ** 2 >= 0.01: 73 | break 74 | self._object_xpos = np.array([object_xpos_x, object_xpos_y]) 75 | 76 | object_qpos = self.sim.data.get_joint_qpos('object0:joint') 77 | assert object_qpos.shape == (7,) 78 | object_qpos[:2] = self._object_xpos 79 | self.sim.data.set_joint_qpos('object0:joint', object_qpos) 80 | 81 | self.sim.forward() 82 | return True 83 | 84 | def _get_obs(self): 85 | obs = fetch_env.FetchEnv._get_obs(self) 86 | 87 | grip_pos = self.sim.data.get_site_xpos('robot0:grip') 88 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep 89 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt 90 | 91 | hook_pos = self.sim.data.get_site_xpos('hook') 92 | # rotations 93 | hook_rot = rotations.mat2euler(self.sim.data.get_site_xmat('hook')) 94 | # velocities 95 | hook_velp = self.sim.data.get_site_xvelp('hook') * dt 96 | hook_velr = self.sim.data.get_site_xvelr('hook') * dt 97 | # gripper state 98 | hook_rel_pos = hook_pos - grip_pos 99 | hook_velp -= grip_velp 100 | 101 | hook_observation = np.concatenate([hook_pos, hook_rot, hook_velp, hook_velr, hook_rel_pos]) 102 | 103 | obs['observation'] = np.concatenate([obs['observation'], hook_observation]) 104 | 105 | return obs 106 | 107 | def _noisify_obs(self, obs, noise=1.): 108 | return obs + np.random.normal(0, noise, size=obs.shape) 109 | 110 | 111 | class ResidualFetchHookEnv(FetchHookEnv): 112 | 113 | def step(self, residual_action): 114 | residual_action = 2. 
* residual_action 115 | 116 | action = np.add(residual_action, get_hook_control(self._last_observation)) 117 | action = np.clip(action, -1, 1) 118 | observation, reward, done, debug_info = FetchHookEnv.step(self, action) 119 | 120 | self._last_observation = observation 121 | 122 | return observation, reward, done, debug_info 123 | 124 | def compute_reward(self, *args, **kwargs): 125 | return FetchHookEnv.compute_reward(self, *args, **kwargs) 126 | 127 | def reset(self): 128 | observation = FetchHookEnv.reset(self) 129 | self._last_observation = observation 130 | 131 | return observation 132 | 133 | 134 | class NoisyFetchHookEnv(FetchHookEnv): 135 | 136 | def _get_obs(self): 137 | obs = fetch_env.FetchEnv._get_obs(self) 138 | 139 | grip_pos = self.sim.data.get_site_xpos('robot0:grip') 140 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep 141 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt 142 | 143 | hook_pos = self.sim.data.get_site_xpos('hook') 144 | hook_pos = self._noisify_obs(hook_pos, noise=0.025) 145 | # rotations 146 | hook_rot = rotations.mat2euler(self.sim.data.get_site_xmat('hook')) 147 | hook_rot = self._noisify_obs(hook_rot, noise=0.025) 148 | # velocities 149 | hook_velp = self.sim.data.get_site_xvelp('hook') * dt 150 | hook_velr = self.sim.data.get_site_xvelr('hook') * dt 151 | # gripper state 152 | hook_rel_pos = hook_pos - grip_pos 153 | hook_velp -= grip_velp 154 | 155 | hook_observation = np.concatenate([hook_pos, hook_rot, hook_velp, hook_velr, hook_rel_pos]) 156 | 157 | obs['observation'] = np.concatenate([obs['observation'], hook_observation]) 158 | obs['observation'][3:5] = self._noisify_obs(obs['observation'][3:5], noise=0.025) 159 | obs['observation'][6:9] = obs['observation'][3:6] - obs['observation'][:3] # object_pos - grip_pos 160 | obs['observation'][12:15] = self._noisify_obs(obs['observation'][6:9], noise=0.025) 161 | return obs 162 | 163 | def _noisify_obs(self, obs, noise=1.): 164 | return obs + np.random.normal(0, noise, size=obs.shape) 165 | 166 | def compute_reward(self, *args, **kwargs): 167 | return FetchHookEnv.compute_reward(self, *args, **kwargs) 168 | 169 | def reset(self): 170 | observation = FetchHookEnv.reset(self) 171 | self._last_observation = observation 172 | 173 | return observation 174 | 175 | 176 | class TwoFrameResidualHookNoisyEnv(FetchHookEnv): 177 | def __init__(self, xml_file=None): 178 | super(TwoFrameResidualHookNoisyEnv, self).__init__() 179 | self.observation_space.spaces['observation'] = spaces.Box(low=np.hstack( 180 | (self.observation_space.spaces['observation'].low, self.observation_space.spaces['observation'].low)), 181 | high=np.hstack((self.observation_space.spaces[ 182 | 'observation'].high, 183 | self.observation_space.spaces[ 184 | 'observation'].high)), 185 | dtype=np.float32) 186 | 187 | def _get_obs(self): 188 | obs = fetch_env.FetchEnv._get_obs(self) 189 | 190 | grip_pos = self.sim.data.get_site_xpos('robot0:grip') 191 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep 192 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt 193 | 194 | hook_pos = self.sim.data.get_site_xpos('hook') 195 | hook_pos = self._noisify_obs(hook_pos, noise=0.025) 196 | # rotations 197 | hook_rot = rotations.mat2euler(self.sim.data.get_site_xmat('hook')) 198 | hook_rot = self._noisify_obs(hook_rot, noise=0.025) 199 | # velocities 200 | hook_velp = self.sim.data.get_site_xvelp('hook') * dt 201 | hook_velr = self.sim.data.get_site_xvelr('hook') * dt 202 | # gripper state 203 | hook_rel_pos = hook_pos - 
grip_pos 204 | hook_velp -= grip_velp 205 | 206 | hook_observation = np.concatenate([hook_pos, hook_rot, hook_velp, hook_velr, hook_rel_pos]) 207 | 208 | obs['observation'] = np.concatenate([obs['observation'], hook_observation]) 209 | obs['observation'][3:5] = self._noisify_obs(obs['observation'][3:5], noise=0.025) 210 | obs['observation'][6:9] = obs['observation'][3:6] - obs['observation'][:3] # object_pos - grip_pos 211 | obs['observation'][12:15] = self._noisify_obs(obs['observation'][6:9], noise=0.025) 212 | return obs 213 | 214 | def step(self, residual_action): 215 | residual_action = 2. * residual_action 216 | 217 | action = np.add(residual_action, get_hook_control(self._last_observation)) 218 | action = np.clip(action, -1, 1) 219 | observation, reward, done, debug_info = FetchHookEnv.step(self, action) 220 | 221 | obs_out = observation.copy() 222 | obs_out['observation'] = np.hstack((self._last_observation['observation'], observation['observation'])) 223 | self._last_observation = observation 224 | 225 | return obs_out, reward, done, debug_info 226 | 227 | def reset(self): 228 | observation = FetchHookEnv.reset(self) 229 | self._last_observation = observation.copy() 230 | observation['observation'] = np.hstack((self._last_observation['observation'], observation['observation'])) 231 | return observation 232 | 233 | def _noisify_obs(self, obs, noise=1.): 234 | return obs + np.random.normal(0, noise, size=obs.shape) 235 | 236 | 237 | class TwoFrameHookNoisyEnv(FetchHookEnv): 238 | def __init__(self, xml_file=None): 239 | super(TwoFrameHookNoisyEnv, self).__init__() 240 | self.observation_space.spaces['observation'] = spaces.Box(low=np.hstack( 241 | (self.observation_space.spaces['observation'].low, self.observation_space.spaces['observation'].low)), 242 | high=np.hstack((self.observation_space.spaces[ 243 | 'observation'].high, 244 | self.observation_space.spaces[ 245 | 'observation'].high)), 246 | dtype=np.float32) 247 | 248 | def _get_obs(self): 249 | obs = fetch_env.FetchEnv._get_obs(self) 250 | 251 | grip_pos = self.sim.data.get_site_xpos('robot0:grip') 252 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep 253 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt 254 | 255 | hook_pos = self.sim.data.get_site_xpos('hook') 256 | hook_pos = self._noisify_obs(hook_pos, noise=0.025) 257 | # rotations 258 | hook_rot = rotations.mat2euler(self.sim.data.get_site_xmat('hook')) 259 | hook_rot = self._noisify_obs(hook_rot, noise=0.025) 260 | # velocities 261 | hook_velp = self.sim.data.get_site_xvelp('hook') * dt 262 | hook_velr = self.sim.data.get_site_xvelr('hook') * dt 263 | # gripper state 264 | hook_rel_pos = hook_pos - grip_pos 265 | hook_velp -= grip_velp 266 | 267 | hook_observation = np.concatenate([hook_pos, hook_rot, hook_velp, hook_velr, hook_rel_pos]) 268 | 269 | obs['observation'] = np.concatenate([obs['observation'], hook_observation]) 270 | obs['observation'][3:5] = self._noisify_obs(obs['observation'][3:5], noise=0.025) 271 | obs['observation'][6:9] = obs['observation'][3:6] - obs['observation'][:3] # object_pos - grip_pos 272 | obs['observation'][12:15] = self._noisify_obs(obs['observation'][6:9], noise=0.025) 273 | return obs 274 | 275 | def step(self, action): 276 | observation, reward, done, debug_info = FetchHookEnv.step(self, action) 277 | 278 | obs_out = observation.copy() 279 | obs_out['observation'] = np.hstack((self._last_observation['observation'], observation['observation'])) 280 | self._last_observation = observation 281 | 282 | return obs_out, 
reward, done, debug_info 283 | 284 | def reset(self): 285 | observation = FetchHookEnv.reset(self) 286 | self._last_observation = observation.copy() 287 | observation['observation'] = np.hstack((self._last_observation['observation'], observation['observation'])) 288 | return observation 289 | 290 | def _noisify_obs(self, obs, noise=1.): 291 | return obs + np.random.normal(0, noise, size=obs.shape) 292 | 293 | 294 | class NoisyResidualFetchHookEnv(FetchHookEnv): 295 | 296 | def step(self, residual_action): 297 | residual_action = 2. * residual_action 298 | 299 | action = np.add(residual_action, get_hook_control(self._last_observation)) 300 | action = np.clip(action, -1, 1) 301 | observation, reward, done, debug_info = FetchHookEnv.step(self, action) 302 | 303 | self._last_observation = observation 304 | 305 | return observation, reward, done, debug_info 306 | 307 | def _get_obs(self): 308 | obs = fetch_env.FetchEnv._get_obs(self) 309 | 310 | grip_pos = self.sim.data.get_site_xpos('robot0:grip') 311 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep 312 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt 313 | 314 | hook_pos = self.sim.data.get_site_xpos('hook') 315 | hook_pos = self._noisify_obs(hook_pos, noise=0.025) 316 | # rotations 317 | hook_rot = rotations.mat2euler(self.sim.data.get_site_xmat('hook')) 318 | hook_rot = self._noisify_obs(hook_rot, noise=0.025) 319 | # velocities 320 | hook_velp = self.sim.data.get_site_xvelp('hook') * dt 321 | hook_velr = self.sim.data.get_site_xvelr('hook') * dt 322 | # gripper state 323 | hook_rel_pos = hook_pos - grip_pos 324 | hook_velp -= grip_velp 325 | 326 | hook_observation = np.concatenate([hook_pos, hook_rot, hook_velp, hook_velr, hook_rel_pos]) 327 | 328 | obs['observation'] = np.concatenate([obs['observation'], hook_observation]) 329 | obs['observation'][3:5] = self._noisify_obs(obs['observation'][3:5], noise=0.025) 330 | obs['observation'][6:9] = obs['observation'][3:6] - obs['observation'][:3] # object_pos - grip_pos 331 | obs['observation'][12:15] = self._noisify_obs(obs['observation'][6:9], noise=0.025) 332 | return obs 333 | 334 | def _noisify_obs(self, obs, noise=1.): 335 | return obs + np.random.normal(0, noise, size=obs.shape) 336 | 337 | def compute_reward(self, *args, **kwargs): 338 | return FetchHookEnv.compute_reward(self, *args, **kwargs) 339 | 340 | def reset(self): 341 | observation = FetchHookEnv.reset(self) 342 | self._last_observation = observation 343 | 344 | return observation 345 | --------------------------------------------------------------------------------
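
The residual classes in this file (ResidualFetchHookEnv, TwoFrameResidualHookNoisyEnv, NoisyResidualFetchHookEnv) all build the executed action the same way: scale the learned residual by 2, add the scripted get_hook_control output, and clip to the Fetch action range [-1, 1]. The standalone sketch below isolates that composition; `compose_residual_action` and the stand-in controller values are illustrative names, not part of the repo.

```python
import numpy as np


def compose_residual_action(residual_action, controller_action, scale=2.0):
    """Combine a learned residual with a scripted controller action.

    Mirrors ResidualFetchHookEnv.step: the residual is scaled, added to the
    hand-designed controller output, and the sum is clipped to [-1, 1].
    """
    action = scale * np.asarray(residual_action, dtype=np.float64)
    action = action + np.asarray(controller_action, dtype=np.float64)
    return np.clip(action, -1.0, 1.0)


# Made-up numbers: a zero residual leaves the scripted action unchanged,
# while a large residual saturates at the edge of the valid range.
scripted = np.array([0.3, -0.2, 0.1, 0.0])  # stand-in for get_hook_control(obs)
print(compose_residual_action(np.zeros(4), scripted))                     # [ 0.3 -0.2  0.1  0. ]
print(compose_residual_action(np.array([0.5, 0.0, 0.0, 0.0]), scripted))  # 0.5*2 + 0.3 = 1.3 -> 1.0
```

Clipping after the addition means the residual can at most push the scripted action to the boundary of the valid range, which is what makes a zero-initialized residual policy a reasonable starting point.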
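
TwoFrameHookNoisyEnv and TwoFrameResidualHookNoisyEnv widen the 'observation' entry of the observation space in `__init__` and concatenate the previous and current observation vectors in `step()`/`reset()`. A minimal sketch of that pattern follows, with illustrative helper names (`double_observation_space`, `stack_two_frames`) and a made-up 3-dimensional space; it is not code from the repo.

```python
import numpy as np
from gym import spaces


def double_observation_space(obs_space):
    """Return a Box twice as wide, matching how the TwoFrame* envs widen
    observation_space.spaces['observation']."""
    return spaces.Box(low=np.hstack((obs_space.low, obs_space.low)),
                      high=np.hstack((obs_space.high, obs_space.high)),
                      dtype=np.float32)


def stack_two_frames(last_obs, current_obs):
    """Concatenate the previous and current 'observation' vectors, as the
    TwoFrame* step()/reset() methods do (goal entries are left untouched)."""
    stacked = dict(current_obs)
    stacked['observation'] = np.hstack((last_obs['observation'],
                                        current_obs['observation']))
    return stacked


# Example with a hypothetical 3-dimensional observation.
space = spaces.Box(low=-np.ones(3, dtype=np.float32),
                   high=np.ones(3, dtype=np.float32), dtype=np.float32)
print(double_observation_space(space).shape)        # (6,)
prev = {'observation': np.zeros(3), 'desired_goal': np.zeros(3)}
curr = {'observation': np.ones(3), 'desired_goal': np.zeros(3)}
print(stack_two_frames(prev, curr)['observation'])  # [0. 0. 0. 1. 1. 1.]
```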
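
A hedged smoke test for the environments in this file, not part of the repo: it assumes mujoco-py and the gym robotics assets are installed and that the package is importable as `common.randomizer.fetch_hook_env`; if `hook.xml` does not resolve at the default `assets_residual` path used in `__init__`, pass `xml_file` explicitly. With a zero residual, ResidualFetchHookEnv reduces to the scripted hook controller.

```python
import numpy as np

# Hypothetical smoke test; import path assumes the package layout above.
from common.randomizer.fetch_hook_env import ResidualFetchHookEnv

env = ResidualFetchHookEnv()  # pass xml_file=... if hook.xml lives elsewhere
obs = env.reset()
for _ in range(50):
    # Zero residual: the executed action is exactly get_hook_control(obs).
    obs, reward, done, info = env.step(np.zeros(env.action_space.shape))
    if done:
        obs = env.reset()
env.close()
```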