├── __init__.py
├── common
│   ├── __init__.py
│   ├── adr
│   │   ├── __init__.py
│   │   └── adr.py
│   ├── svpg
│   │   ├── __init__.py
│   │   ├── particles
│   │   │   ├── __init__.py
│   │   │   ├── utils.py
│   │   │   ├── distributions.py
│   │   │   └── svpg_particle.py
│   │   ├── svpg_utils.py
│   │   └── svpg.py
│   ├── agents
│   │   ├── __init__.py
│   │   ├── ddpg
│   │   │   ├── __init__.py
│   │   │   ├── replay_buffer.py
│   │   │   └── ddpg.py
│   │   └── selfplay_models.py
│   ├── models
│   │   ├── __init__.py
│   │   ├── actor_critic.py
│   │   └── discriminator.py
│   ├── randomizer
│   │   ├── controllers
│   │   │   ├── __init__.py
│   │   │   ├── miscalibrated_push_controller.py
│   │   │   ├── pick_and_place_controller.py
│   │   │   └── hook_controller.py
│   │   ├── residual_envs
│   │   │   ├── __init__.py
│   │   │   └── residual_fetch_push_env.py
│   │   ├── config
│   │   │   ├── __init__.py
│   │   │   ├── ErgoPushRandomized
│   │   │   │   ├── hard.json
│   │   │   │   ├── default.json
│   │   │   │   └── random.json
│   │   │   ├── FetchPushRandomized
│   │   │   │   ├── default.json
│   │   │   │   └── random.json
│   │   │   ├── FetchSlideRandomized
│   │   │   │   ├── default.json
│   │   │   │   └── random.json
│   │   │   ├── ResidualPushRandomized
│   │   │   │   ├── default.json
│   │   │   │   └── random.json
│   │   │   ├── FetchHookRandomized
│   │   │   │   ├── default.json
│   │   │   │   └── random.json
│   │   │   ├── ResidualComplexHookRandomized
│   │   │   │   ├── default.json
│   │   │   │   └── random.json
│   │   │   ├── ResidualFetchHookRandomized
│   │   │   │   ├── default.json
│   │   │   │   └── random.json
│   │   │   └── ErgoReacherRandomized
│   │   │       ├── default.json
│   │   │       ├── random.json
│   │   │       └── hard.json
│   │   ├── assets
│   │   │   ├── stls
│   │   │   │   ├── estop_link.stl
│   │   │   │   ├── laser_link.stl
│   │   │   │   ├── gripper_link.stl
│   │   │   │   ├── torso_fixed_link.stl
│   │   │   │   ├── base_link_collision.stl
│   │   │   │   ├── bellows_link_collision.stl
│   │   │   │   ├── head_pan_link_collision.stl
│   │   │   │   ├── l_wheel_link_collision.stl
│   │   │   │   ├── r_wheel_link_collision.stl
│   │   │   │   ├── elbow_flex_link_collision.stl
│   │   │   │   ├── head_tilt_link_collision.stl
│   │   │   │   ├── torso_lift_link_collision.stl
│   │   │   │   ├── wrist_flex_link_collision.stl
│   │   │   │   ├── wrist_roll_link_collision.stl
│   │   │   │   ├── forearm_roll_link_collision.stl
│   │   │   │   ├── shoulder_lift_link_collision.stl
│   │   │   │   ├── shoulder_pan_link_collision.stl
│   │   │   │   └── upperarm_roll_link_collision.stl
│   │   │   ├── textures
│   │   │   │   ├── block.png
│   │   │   │   └── block_hidden.png
│   │   │   ├── push.xml
│   │   │   ├── slide.xml
│   │   │   ├── hook.xml
│   │   │   ├── shared.xml
│   │   │   ├── pusher.xml
│   │   │   └── robot.xml
│   │   ├── test.py
│   │   ├── randomized_fetchslide.py
│   │   ├── dimension.py
│   │   ├── randomized_fetchpush.py
│   │   ├── randomized_residual_push.py
│   │   ├── randomized_ergo_pusher.py
│   │   ├── randomized_ergoreacher.py
│   │   ├── miscalibrated_push_controller.py
│   │   ├── ergo_objects.py
│   │   ├── wrappers.py
│   │   ├── residual_fetch_push_env.py
│   │   ├── __init__.py
│   │   ├── complex_hook_env.py
│   │   ├── randomized_fetch.py
│   │   ├── residual_push.py
│   │   └── fetch_hook_env.py
│   └── discriminator
│       └── discriminator_rewarder.py
├── utils
│   ├── __init__.py
│   ├── learning_curve.py
│   └── plot_utils.py
├── architecture-selfplay.png
├── setup.py
├── ergo_expts.sh
├── .gitignore
├── experiments
│   ├── evaluate_ergo_envs.py
│   └── ddpg_train.py
├── README.md
├── arguments.py
└── realrobot_evals
    └── realrobot_test.py
/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/common/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/common/adr/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/common/svpg/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/common/agents/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/common/models/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/common/agents/ddpg/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/common/randomizer/controllers/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/common/randomizer/residual_envs/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/common/svpg/particles/__init__.py:
--------------------------------------------------------------------------------
1 | from .svpg_particle import SVPGParticle
--------------------------------------------------------------------------------
/common/randomizer/config/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | CONFIG_PATH = os.path.abspath(os.path.dirname(__file__))
3 |
--------------------------------------------------------------------------------
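
The CONFIG_PATH constant above is how the randomized environments locate their JSON range files. A minimal, illustrative sketch (not itself a repository file) of loading one of the configs listed further down:

    import json
    import os

    from common.randomizer.config import CONFIG_PATH

    # Read the ErgoPushRandomized defaults shipped in this folder.
    with open(os.path.join(CONFIG_PATH, "ErgoPushRandomized", "default.json")) as f:
        config = json.load(f)

    print(config["env"])                              # "ErgoPusherRandomized-v0"
    print([d["name"] for d in config["dimensions"]])  # ["friction"]
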
/architecture-selfplay.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/architecture-selfplay.png
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/estop_link.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/estop_link.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/laser_link.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/laser_link.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/textures/block.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/textures/block.png
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/gripper_link.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/gripper_link.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/torso_fixed_link.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/torso_fixed_link.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/textures/block_hidden.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/textures/block_hidden.png
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/base_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/base_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/bellows_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/bellows_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/head_pan_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/head_pan_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/l_wheel_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/l_wheel_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/r_wheel_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/r_wheel_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/elbow_flex_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/elbow_flex_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/head_tilt_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/head_tilt_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/torso_lift_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/torso_lift_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/wrist_flex_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/wrist_flex_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/wrist_roll_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/wrist_roll_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/forearm_roll_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/forearm_roll_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/shoulder_lift_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/shoulder_lift_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/shoulder_pan_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/shoulder_pan_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/assets/stls/upperarm_roll_link_collision.stl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/montrealrobotics/unsupervised-adr/HEAD/common/randomizer/assets/stls/upperarm_roll_link_collision.stl
--------------------------------------------------------------------------------
/common/randomizer/config/ErgoPushRandomized/hard.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ErgoPusherRandomized-v0",
3 | "dimensions": [
4 | {
5 | "name": "friction",
6 | "default": 0.1,
7 | "multiplier_min": 1.0,
8 | "multiplier_max": 1.0
9 | }
10 | ]
11 | }
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import find_packages, setup
2 |
3 | setup(
4 | name='unsupadr',
5 | version='1.0',
6 | packages=find_packages(),  # include common.* subpackages (listing only 'common' would omit e.g. common.svpg)
7 | install_requires=[
8 | "torch>=1.0", "tqdm", "numpy", "matplotlib", 'gym>=0.10',
9 | 'mujoco_py>=1.15'
10 | ])
11 |
--------------------------------------------------------------------------------
/common/randomizer/config/ErgoPushRandomized/default.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ErgoPusherRandomized-v0",
3 | "dimensions": [
4 | {
5 | "name": "friction",
6 | "default": 0.4,
7 | "multiplier_min": 1.0,
8 | "multiplier_max": 1.0
9 | }
10 | ]
11 | }
--------------------------------------------------------------------------------
/common/randomizer/config/ErgoPushRandomized/random.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ErgoPusherRandomized-v0",
3 | "dimensions": [
4 | {
5 | "name": "friction",
6 | "default": 0.4,
7 | "multiplier_min": 0.25,
8 | "multiplier_max": 2.0
9 | }
10 | ]
11 | }
--------------------------------------------------------------------------------
/common/randomizer/config/FetchPushRandomized/default.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "FetchPushDefault-v0",
3 | "dimensions": [
4 | {
5 | "name": "friction",
6 | "default": 0.18,
7 | "multiplier_min": 1.0,
8 | "multiplier_max": 1.0
9 | }
10 | ]
11 | }
12 |
--------------------------------------------------------------------------------
/common/randomizer/config/FetchPushRandomized/random.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "FetchPushDefault-v0",
3 | "dimensions": [
4 | {
5 | "name": "friction",
6 | "default": 0.18,
7 | "multiplier_min": 0.1,
8 | "multiplier_max": 1.11
9 | }
10 | ]
11 | }
12 |
--------------------------------------------------------------------------------
/common/randomizer/config/FetchSlideRandomized/default.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "FetchPushDefault-v0",
3 | "dimensions": [
4 | {
5 | "name": "damping",
6 | "default": 0.01,
7 | "multiplier_min": 1.0,
8 | "multiplier_max": 1.0
9 | }
10 | ]
11 | }
12 |
--------------------------------------------------------------------------------
/common/randomizer/config/FetchSlideRandomized/random.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "FetchPushDefault-v0",
3 | "dimensions": [
4 | {
5 | "name": "friction",
6 | "default": 0.18,
7 | "multiplier_min": 0.2,
8 | "multiplier_max": 2.22
9 | }
10 | ]
11 | }
12 |
--------------------------------------------------------------------------------
/common/randomizer/config/ResidualPushRandomized/default.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ResidualPushDefaultEnv-v0",
3 | "dimensions": [
4 | {
5 | "name": "damping",
6 | "default": 0.01,
7 | "multiplier_min": 1.0,
8 | "multiplier_max": 1.0
9 | },
10 | {
11 | "name": "friction",
12 | "default": 0.18,
13 | "multiplier_min": 1.0,
14 | "multiplier_max": 1.0
15 | }
16 | ]
17 | }
18 |
--------------------------------------------------------------------------------
/common/randomizer/config/ResidualPushRandomized/random.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ResidualPushRandomizedEnv-v0",
3 | "dimensions": [
4 | {
5 | "name": "damping",
6 | "default": 0.1,
7 | "multiplier_min": 0.1,
8 | "multiplier_max": 3
9 | },
10 |
11 | {
12 | "name": "friction",
13 | "default": 0.18,
14 | "multiplier_min": 0.2777,
15 | "multiplier_max": 2.222
16 | }
17 | ]
18 | }
19 |
--------------------------------------------------------------------------------
/common/randomizer/test.py:
--------------------------------------------------------------------------------
1 | import randomizer
2 | import gym
3 | import numpy as np
4 | from randomizer.wrappers import RandomizedEnvWrapper
5 |
6 |
7 | env = gym.make("FetchPushRandomizedEnv-v0")
8 | env = RandomizedEnvWrapper(env, seed=12)  # the wrapper provides env.randomize() used below
9 | obs = env.reset()
10 |
11 |
12 | for i in range(1000):
13 | obs, rew, _, _ = env.step(env.action_space.sample())
14 | env.render()
15 | if i % 100 == 0:
16 | # print('done')
17 | print(rew)
18 | # Randomize the environment
19 | env.randomize([-1])
20 | env.reset()
21 |
--------------------------------------------------------------------------------
/common/randomizer/config/FetchHookRandomized/default.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ResidualHookRandomizedEnv-v0",
3 | "dimensions": [
4 | {
5 | "name": "block_mass",
6 | "default": 4,
7 | "multiplier_min": 1.0,
8 | "multiplier_max": 1.0
9 | },
10 | {
11 | "name": "hook_mass",
12 | "default": 4,
13 | "multiplier_min": 1.0,
14 | "multiplier_max": 1.0
15 | },
16 | {
17 | "name": "friction",
18 | "default": 1,
19 | "multiplier_min": 1.0,
20 | "multiplier_max": 1.0
21 | }
22 | ]
23 | }
--------------------------------------------------------------------------------
/common/randomizer/config/ResidualComplexHookRandomized/default.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ResidualHookRandomizedEnv-v0",
3 | "dimensions": [
4 | {
5 | "name": "block_mass",
6 | "default": 4,
7 | "multiplier_min": 1.0,
8 | "multiplier_max": 1.0
9 | },
10 | {
11 | "name": "hook_mass",
12 | "default": 4,
13 | "multiplier_min": 1.0,
14 | "multiplier_max": 1.0
15 | },
16 | {
17 | "name": "friction",
18 | "default": 1,
19 | "multiplier_min": 1.0,
20 | "multiplier_max": 1.0
21 | }
22 | ]
23 | }
--------------------------------------------------------------------------------
/common/randomizer/config/ResidualFetchHookRandomized/default.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ResidualHookRandomizedEnv-v0",
3 | "dimensions": [
4 | {
5 | "name": "block_mass",
6 | "default": 4,
7 | "multiplier_min": 1.0,
8 | "multiplier_max": 1.0
9 | },
10 | {
11 | "name": "hook_mass",
12 | "default": 4,
13 | "multiplier_min": 1.0,
14 | "multiplier_max": 1.0
15 | },
16 | {
17 | "name": "friction",
18 | "default": 1,
19 | "multiplier_min": 1.0,
20 | "multiplier_max": 1.0
21 | }
22 | ]
23 | }
--------------------------------------------------------------------------------
/common/randomizer/config/FetchHookRandomized/random.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ResidualHookRandomizedEnv-v0",
3 | "dimensions": [
4 | {
5 | "name": "block_mass",
6 | "default": 2,
7 | "multiplier_min": 0.6,
8 | "multiplier_max": 4
9 | },
10 | {
11 | "name": "hook_mass",
12 | "default": 4,
13 | "multiplier_min": 0.6,
14 | "multiplier_max": 2.5
15 | },
16 | {
17 | "name": "friction",
18 | "default": 1,
19 | "multiplier_min": 0.5,
20 | "multiplier_max": 1.2
21 | }
22 | ]
23 | }
24 |
25 |
--------------------------------------------------------------------------------
/common/randomizer/config/ResidualComplexHookRandomized/random.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ResidualHookRandomizedEnv-v0",
3 | "dimensions": [
4 | {
5 | "name": "block_mass",
6 | "default": 4,
7 | "multiplier_min": 0.3,
8 | "multiplier_max": 3
9 | },
10 | {
11 | "name": "hook_mass",
12 | "default": 4,
13 | "multiplier_min": 0.6,
14 | "multiplier_max": 4
15 | },
16 | {
17 | "name": "friction",
18 | "default": 1,
19 | "multiplier_min": 0.8,
20 | "multiplier_max": 2.3
21 | }
22 | ]
23 | }
24 |
25 |
--------------------------------------------------------------------------------
/common/randomizer/config/ResidualFetchHookRandomized/random.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ResidualHookRandomizedEnv-v0",
3 | "dimensions": [
4 | {
5 | "name": "block_mass",
6 | "default": 2,
7 | "multiplier_min": 0.3,
8 | "multiplier_max": 4
9 | },
10 | {
11 | "name": "hook_mass",
12 | "default": 4,
13 | "multiplier_min": 0.6,
14 | "multiplier_max": 2.5
15 | },
16 | {
17 | "name": "friction",
18 | "default": 1,
19 | "multiplier_min": 0.5,
20 | "multiplier_max": 1.2
21 | }
22 | ]
23 | }
24 |
25 |
--------------------------------------------------------------------------------
/common/svpg/particles/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch.nn as nn
3 |
4 |
5 | class AddBias(nn.Module):
6 | def __init__(self, bias):
7 | super(AddBias, self).__init__()
8 | self._bias = nn.Parameter(bias.unsqueeze(1))
9 |
10 | def forward(self, x):
11 | if x.dim() == 2:
12 | bias = self._bias.t().view(1, -1)
13 | else:
14 | bias = self._bias.t().view(1, -1, 1, 1)
15 |
16 | return x + bias
17 |
18 |
19 | def init(module, weight_init, bias_init, gain=1):
20 | weight_init(module.weight.data, gain=gain)
21 | bias_init(module.bias.data)
22 | return module
23 |
24 |
25 | orthogonal_init = lambda m: init(module=m,
26 | weight_init=nn.init.orthogonal_,
27 | bias_init=lambda x: nn.init.constant_(x, 0),
28 | gain=np.sqrt(2))
29 |
--------------------------------------------------------------------------------
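
A small usage sketch for the helpers in utils.py above (illustrative only; the layer sizes are arbitrary). `orthogonal_init` returns the module with orthogonally initialized weights and a zero bias, and `AddBias` is the learnable bias that `DiagGaussian` uses for its log-std:

    import torch
    import torch.nn as nn

    from common.svpg.particles.utils import AddBias, orthogonal_init

    layer = orthogonal_init(nn.Linear(8, 32))   # orthogonal weights (gain sqrt(2)), zero bias

    logstd = AddBias(torch.zeros(4))            # broadcastable learnable bias
    print(logstd(torch.zeros(1, 4)).shape)      # torch.Size([1, 4])
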
/ergo_expts.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --partition=long
3 | #SBATCH --gres=gpu:1 # Ask for 1 GPU
4 | # SBATCH --nodes=5
5 | # SBATCH --ntasks=8
6 | #SBATCH --mem=36G                             # Ask for 36 GB of RAM
7 | #SBATCH --time=72:00:00                       # The job will run for up to 72 hours
8 | #SBATCH --array=1-5
9 | #SBATCH --mail-user=sharathraparthy@gmail.com
10 | #SBATCH --mail-type=ALL
11 | #SBATCH --mail-type=BEGIN
12 | #SBATCH --mail-type=END
13 | #SBATCH --mail-type=FAIL
14 | #SBATCH --mail-type=REQUEUE
15 | #SBATCH -o /network/home/raparths/onlysp-reacher-adr-ergo-%j.out
16 |
17 | module load python/3.6
18 | # module load mujoco/2.0
19 | module load cuda/10.0
20 | source $HOME/sharath/bin/activate
21 |
22 | python ddpg_train.py --sp-percent 1.0 --approach 'unsupervised-default' --only-sp=True --env_name='ErgoReacherRandomizedEnv-Headless-v0' --n-params=8
23 |
--------------------------------------------------------------------------------
/common/randomizer/randomized_fetchslide.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 |
4 | from gym import utils
5 | from .randomized_fetch import RandomizedFetchEnv
6 |
7 |
8 | # Ensure we get the path separator correct on windows
9 | MODEL_XML_PATH = os.path.join('fetch', 'slide.xml')
10 |
11 |
12 | class FetchSlideRandomizedEnv(RandomizedFetchEnv, utils.EzPickle):
13 | def __init__(self, reward_type='sparse', **kwargs):
14 | initial_qpos = {
15 | 'robot0:slide0': 0.05,
16 | 'robot0:slide1': 0.48,
17 | 'robot0:slide2': 0.0,
18 | 'object0:joint': [1.7, 1.1, 0.41, 1., 0., 0., 0.],
19 | }
20 | RandomizedFetchEnv.__init__(
21 | self, MODEL_XML_PATH, has_object=True, block_gripper=True, n_substeps=20,
22 | gripper_extra_height=-0.02, target_in_the_air=False, target_offset=np.array([0.4, 0.0, 0.0]),
23 | obj_range=0.1, target_range=0.3, distance_threshold=0.05,
24 | initial_qpos=initial_qpos, reward_type=reward_type, **kwargs)
25 |
--------------------------------------------------------------------------------
/common/randomizer/dimension.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class Dimension(object):
5 | """Class which handles the machinery of randomizing a particular dimension
6 | """
7 | def __init__(self, default_value, multiplier_min=0.0, multiplier_max=1.0, name=None):
8 | self.default_value = default_value
9 | self.current_value = default_value
10 | self.multiplier_min = multiplier_min
11 | self.multiplier_max = multiplier_max
12 | self.range_min = self.default_value * self.multiplier_min
13 | self.range_max = self.default_value * self.multiplier_max
14 | self.name = name
15 |
16 | def randomize(self):
17 | self.current_value = np.random.uniform(low=self.range_min, high=self.range_max)
18 |
19 | def rescale(self, value):
20 | return self.range_min + (self.range_max - self.range_min) * value
21 |
22 | def reset(self):
23 | self.current_value = self.default_value
24 |
25 | def set(self, value):
26 | self.current_value = value
27 |
28 |
--------------------------------------------------------------------------------
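
A usage sketch for the Dimension class above; the numbers mirror the friction entry in config/ErgoPushRandomized/random.json (the snippet itself is illustrative, not a repository file):

    from common.randomizer.dimension import Dimension

    # default 0.4 with multipliers [0.25, 2.0] gives the sampled range [0.1, 0.8]
    friction = Dimension(default_value=0.4, multiplier_min=0.25,
                         multiplier_max=2.0, name="friction")
    friction.randomize()          # uniform draw from [0.1, 0.8]
    print(friction.rescale(0.5))  # 0.45, the midpoint of the range
    friction.reset()              # back to the default 0.4
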
/common/models/actor_critic.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 |
6 | class Actor(nn.Module):
7 | def __init__(self, state_dim, action_dim, max_action):
8 | super(Actor, self).__init__()
9 |
10 | self.l1 = nn.Linear(state_dim, 400)
11 | self.l2 = nn.Linear(400, 300)
12 | self.l3 = nn.Linear(300, action_dim)
13 |
14 | self.max_action = max_action
15 |
16 | def forward(self, x):
17 | x = F.relu(self.l1(x))
18 | x = F.relu(self.l2(x))
19 | x = self.max_action * torch.tanh(self.l3(x))
20 | return x
21 |
22 | class Critic(nn.Module):
23 | def __init__(self, state_dim, action_dim):
24 | super(Critic, self).__init__()
25 |
26 | self.l1 = nn.Linear(state_dim + action_dim, 400)
27 | self.l2 = nn.Linear(400, 300)
28 | self.l3 = nn.Linear(300, 1)
29 |
30 | def forward(self, x, u):
31 | x = F.relu(self.l1(torch.cat([x, u], 1)))
32 | x = F.relu(self.l2(x))
33 | x = self.l3(x)
34 | return x
35 |
--------------------------------------------------------------------------------
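
An illustrative shape check for the DDPG actor/critic pair above (the 10-D state and 4-D action are hypothetical, not tied to any environment in the repository):

    import torch

    from common.models.actor_critic import Actor, Critic

    actor = Actor(state_dim=10, action_dim=4, max_action=1.0)
    critic = Critic(state_dim=10, action_dim=4)

    state = torch.randn(32, 10)
    action = actor(state)             # (32, 4), squashed into [-1, 1] by tanh
    q_value = critic(state, action)   # (32, 1)
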
/common/randomizer/randomized_fetchpush.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | from gym import utils
4 | from gym.envs.mujoco import mujoco_env
5 |
6 | from .randomized_fetch import RandomizedFetchEnv
7 |
8 | # Ensure we get the path separator correct on windows
9 | MODEL_XML_PATH = os.path.join('fetch', 'push.xml')
10 |
11 | class FetchPushRandomizedEnv(RandomizedFetchEnv, utils.EzPickle):
12 | def __init__(self, reward_type='dense', **kwargs):
13 | initial_qpos = {
14 | 'robot0:slide0': 0.405,
15 | 'robot0:slide1': 0.48,
16 | 'robot0:slide2': 0.0,
17 | 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.],
18 | }
19 |
20 | RandomizedFetchEnv.__init__(self, MODEL_XML_PATH, has_object=True, block_gripper=True, n_substeps=20,
21 | gripper_extra_height=0.0, target_in_the_air=False, target_offset=0.0,
22 | obj_range=0.15, target_range=0.15, distance_threshold=0.05,
23 | initial_qpos=initial_qpos, reward_type=reward_type, **kwargs)
24 |
25 | utils.EzPickle.__init__(self)
--------------------------------------------------------------------------------
/common/randomizer/randomized_residual_push.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | from gym import utils
4 | from gym.envs.mujoco import mujoco_env
5 |
6 | from .residual_push import RandomizedResidualPushEnv
7 |
8 | # Ensure we get the path separator correct on windows
9 | MODEL_XML_PATH = os.path.join('fetch', 'push.xml')
10 |
11 | # RANDOMIZED RESIDUAL PUSH ENVIRONMENT
12 | class ResidualPushRandomizedEnv(RandomizedResidualPushEnv, utils.EzPickle):
13 | def __init__(self, reward_type='sparse', **kwargs):
14 | initial_qpos = {
15 | 'robot0:slide0': 0.405,
16 | 'robot0:slide1': 0.48,
17 | 'robot0:slide2': 0.0,
18 | 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.],
19 | }
20 | RandomizedResidualPushEnv.__init__(self, MODEL_XML_PATH, has_object=True, block_gripper=True, n_substeps=20,
21 | gripper_extra_height=0.0, target_in_the_air=False, target_offset=0.0,
22 | obj_range=0.15, target_range=0.15, distance_threshold=0.05,
23 | initial_qpos=initial_qpos, reward_type=reward_type, **kwargs)
24 |
25 | utils.EzPickle.__init__(self)
--------------------------------------------------------------------------------
/common/randomizer/config/ErgoReacherRandomized/default.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ErgoReacherDefault-v0",
3 | "dimensions": [
4 | {
5 | "name": "joint0gain",
6 | "default": 1,
7 | "multiplier_min": 1.0,
8 | "multiplier_max": 1.0
9 | },
10 | {
11 | "name": "joint1gain",
12 | "default": 1,
13 | "multiplier_min": 1.0,
14 | "multiplier_max": 1.0
15 | },
16 | {
17 | "name": "joint2gain",
18 | "default": 1,
19 | "multiplier_min": 1.0,
20 | "multiplier_max": 1.0
21 | },
22 | {
23 | "name": "joint3gain",
24 | "default": 1,
25 | "multiplier_min": 1.0,
26 | "multiplier_max": 1.0
27 | },
28 | {
29 | "name": "joint0maxtorque",
30 | "default": 18,
31 | "multiplier_min": 1.0,
32 | "multiplier_max": 1.0
33 | },
34 | {
35 | "name": "joint1maxtorque",
36 | "default": 18,
37 | "multiplier_min": 1.0,
38 | "multiplier_max": 1.0
39 | },
40 | {
41 | "name": "joint2maxtorque",
42 | "default": 18,
43 | "multiplier_min": 1.0,
44 | "multiplier_max": 1.0
45 | },
46 | {
47 | "name": "joint3maxtorque",
48 | "default": 18,
49 | "multiplier_min": 1.0,
50 | "multiplier_max": 1.0
51 | }
52 | ]
53 | }
--------------------------------------------------------------------------------
/common/randomizer/config/ErgoReacherRandomized/random.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ErgoReacherRandomized-v0",
3 | "dimensions": [
4 | {
5 | "name": "joint0gain",
6 | "default": 1,
7 | "multiplier_min": 0.3,
8 | "multiplier_max": 2.0
9 | },
10 | {
11 | "name": "joint1gain",
12 | "default": 1,
13 | "multiplier_min": 0.3,
14 | "multiplier_max": 2.0
15 | },
16 | {
17 | "name": "joint2gain",
18 | "default": 1,
19 | "multiplier_min": 0.3,
20 | "multiplier_max": 2.0
21 | },
22 | {
23 | "name": "joint3gain",
24 | "default": 1,
25 | "multiplier_min": 0.3,
26 | "multiplier_max": 2.0
27 | },
28 | {
29 | "name": "joint0maxtorque",
30 | "default": 18,
31 | "multiplier_min": 1.0,
32 | "multiplier_max": 4.0
33 | },
34 | {
35 | "name": "joint1maxtorque",
36 | "default": 18,
37 | "multiplier_min": 1.0,
38 | "multiplier_max": 4.0
39 | },
40 | {
41 | "name": "joint2maxtorque",
42 | "default": 18,
43 | "multiplier_min": 1.0,
44 | "multiplier_max": 4.0
45 | },
46 | {
47 | "name": "joint3maxtorque",
48 | "default": 18,
49 | "multiplier_min": 1.0,
50 | "multiplier_max": 4.0
51 | }
52 | ]
53 | }
--------------------------------------------------------------------------------
/common/randomizer/config/ErgoReacherRandomized/hard.json:
--------------------------------------------------------------------------------
1 | {
2 | "env": "ErgoReacherRandomizedHard-v0",
3 | "dimensions": [
4 | {
5 | "name": "joint0gain",
6 | "default": 0.25,
7 | "multiplier_min": 1.0,
8 | "multiplier_max": 1.0
9 | },
10 | {
11 | "name": "joint1gain",
12 | "default": 0.25,
13 | "multiplier_min": 1.0,
14 | "multiplier_max": 1.0
15 | },
16 | {
17 | "name": "joint2gain",
18 | "default": 0.25,
19 | "multiplier_min": 1.0,
20 | "multiplier_max": 1.0
21 | },
22 | {
23 | "name": "joint3gain",
24 | "default": 0.25,
25 | "multiplier_min": 1.0,
26 | "multiplier_max": 1.0
27 | },
28 | {
29 | "name": "joint0maxtorque",
30 | "default": 5.0,
31 | "multiplier_min": 1.0,
32 | "multiplier_max": 1.0
33 | },
34 | {
35 | "name": "joint1maxtorque",
36 | "default": 5.0,
37 | "multiplier_min": 1.0,
38 | "multiplier_max": 1.0
39 | },
40 | {
41 | "name": "joint2maxtorque",
42 | "default": 5.0,
43 | "multiplier_min": 1.0,
44 | "multiplier_max": 1.0
45 | },
46 | {
47 | "name": "joint3maxtorque",
48 | "default": 5.0,
49 | "multiplier_min": 1.0,
50 | "multiplier_max": 1.0
51 | }
52 | ]
53 | }
--------------------------------------------------------------------------------
/common/randomizer/assets/push.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/common/randomizer/assets/slide.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/utils/learning_curve.py:
--------------------------------------------------------------------------------
1 | from arguments import get_args
2 | import numpy as np
3 | import os
4 | import matplotlib.pyplot as plt
5 | from utils.plot_utils import learning_curve
6 |
7 | if __name__=='__main__':
8 | args = get_args()
9 | # load the model param
10 | selfplay_percent = [0.0, 1.0, 1.0]
11 | x_gen_labels = np.linspace(0, 199, 199)
12 | approach = ["udr", "adr", "unsupervised-default"]
13 | PLOTCOLORS = ['hotpink', 'red', 'darkolivegreen', 'hotpink', 'blue']
14 | save_plots = os.getcwd() + f'/plots/{args.env_name}/'
15 | if not os.path.isdir(save_plots):
16 | os.makedirs(save_plots)
17 | plt.rcParams["figure.figsize"] = (10, 6)
18 | SEED = [42, 41, 44, 43]
19 |
20 | for selfplay_index, a in enumerate(approach):
21 | mean, std = learning_curve(args, selfplay_percent, selfplay_index, a, SEED)
22 | mean = np.reshape(mean, (-1))
23 | std = np.reshape(std, (-1))
24 | plt.plot(x_gen_labels, mean, label=f'{a} - sp{selfplay_percent[selfplay_index]}', alpha=0.7)
25 | plt.fill_between(x_gen_labels, mean + std/2, mean - std/2, alpha=0.2)
26 | plt.title(f'Learning Curve for {args.env_name} | Mode : {args.mode}')
27 | plt.xlabel("Number of evaluations | 1 eval per 5000 timesteps")
28 | plt.ylabel("Average Distance")
29 | plt.ylim(0, 0.3)
30 | plt.xlim(0, 200)
31 | plt.legend()
32 | plt.savefig(f'{save_plots}/{args.mode}_generalization_sp{selfplay_percent[1]}.png')  # figure size already set via rcParams above
33 | plt.show()
--------------------------------------------------------------------------------
/common/agents/ddpg/replay_buffer.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 |
5 | class ReplayBuffer(object):
6 |
7 | def __init__(self, state_dim, action_dim, max_size=int(1e6)):
8 | self.max_size = max_size
9 | self.ptr = 0
10 | self.size = 0
11 |
12 | self.state = np.zeros((max_size, state_dim))
13 | self.action = np.zeros((max_size, action_dim))
14 | self.next_state = np.zeros((max_size, state_dim))
15 | self.reward = np.zeros((max_size, 1))
16 | self.not_done = np.zeros((max_size, 1))
17 |
18 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
19 |
20 | def add(self, state, action, next_state, reward, done):
21 | self.state[self.ptr] = state
22 | self.action[self.ptr] = action
23 | self.next_state[self.ptr] = next_state
24 | self.reward[self.ptr] = reward
25 | self.not_done[self.ptr] = 1. - done
26 |
27 | self.ptr = (self.ptr + 1) % self.max_size
28 | self.size = min(self.size + 1, self.max_size)
29 |
30 | def sample(self, batch_size):
31 | ind = np.random.randint(0, self.size, size=batch_size)
32 |
33 | return (
34 | torch.FloatTensor(self.state[ind]).to(self.device),
35 | torch.FloatTensor(self.action[ind]).to(self.device),
36 | torch.FloatTensor(self.next_state[ind]).to(self.device),
37 | torch.FloatTensor(self.reward[ind]).to(self.device),
38 | torch.FloatTensor(self.not_done[ind]).to(self.device)
39 | )
--------------------------------------------------------------------------------
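
An illustrative sketch of filling and sampling the replay buffer above (the 10-D/4-D dimensions and batch size are hypothetical):

    import numpy as np

    from common.agents.ddpg.replay_buffer import ReplayBuffer

    buffer = ReplayBuffer(state_dim=10, action_dim=4, max_size=10000)
    for _ in range(256):
        s, a, s2 = np.random.randn(10), np.random.randn(4), np.random.randn(10)
        buffer.add(s, a, s2, reward=0.0, done=False)

    # Sampled tensors are placed on GPU automatically when CUDA is available.
    state, action, next_state, reward, not_done = buffer.sample(batch_size=64)
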
/common/models/discriminator.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | class MLPDiscriminator(nn.Module):
5 | """Discriminator class based on Feedforward Network
6 | Input is a state-action-state' transition
7 | Output is probability that it was from a reference trajectory
8 | """
9 | def __init__(self, state_dim, action_dim):
10 | super(MLPDiscriminator, self).__init__()
11 |
12 | self.l1 = nn.Linear((state_dim + action_dim + state_dim), 128)
13 | self.l2 = nn.Linear(128, 128)
14 | self.logic = nn.Linear(128, 1)
15 |
16 | self.logic.weight.data.mul_(0.1)
17 | self.logic.bias.data.mul_(0.0)
18 |
19 | # Tuple of S-A-S'
20 | def forward(self, x):
21 | x = torch.tanh(self.l1(x))
22 | x = torch.tanh(self.l2(x))
23 | x = self.logic(x)
24 | return torch.sigmoid(x)
25 |
26 | class GAILMLPDiscriminator(nn.Module):
27 | """Discriminator class based on Feedforward Network
28 | Input is a state-action pair
29 | Output is probability that it was from a reference trajectory
30 | """
31 | def __init__(self, state_dim, action_dim):
32 | super(GAILMLPDiscriminator, self).__init__()
33 | self.l1 = nn.Linear((state_dim + action_dim), 128)
34 | self.l2 = nn.Linear(128, 128)
35 | self.logic = nn.Linear(128, 1)
36 |
37 | self.logic.weight.data.mul_(0.1)
38 | self.logic.bias.data.mul_(0.0)
39 |
40 | # Tuple of S-A
41 | def forward(self, x):
42 | x = torch.tanh(self.l1(x))
43 | x = torch.tanh(self.l2(x))
44 | x = self.logic(x)
45 | return torch.sigmoid(x)
--------------------------------------------------------------------------------
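
A sketch of scoring a batch of (s, a, s') transitions with MLPDiscriminator above (the dimensions are hypothetical; the concatenation order matches the input layer size state_dim + action_dim + state_dim):

    import torch

    from common.models.discriminator import MLPDiscriminator

    state_dim, action_dim = 10, 4
    disc = MLPDiscriminator(state_dim, action_dim)

    s = torch.randn(32, state_dim)
    a = torch.randn(32, action_dim)
    s_next = torch.randn(32, state_dim)
    prob_reference = disc(torch.cat([s, a, s_next], dim=1))  # (32, 1), in (0, 1)
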
/common/randomizer/randomized_ergo_pusher.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import gym_ergojr
3 | from gym_ergojr.envs.ergo_pusher_env import ErgoPusherEnv
4 |
5 | def goal_distance(goal_a, goal_b):
6 | assert goal_a.shape == goal_b.shape
7 | return np.linalg.norm(goal_a - goal_b, axis=-1)
8 |
9 |
10 | class ErgoPusherRandomizedEnv(ErgoPusherEnv):
11 | def __init__(self, **kwargs):
12 | self.dimensions = []
13 | self.config_file = kwargs.get('config')
14 | del kwargs['config']
15 |
16 | super().__init__(**kwargs)
17 | self.reward_type = 'sparse'
18 | self.distance_threshold = 0.05
19 |
20 | # # this is affected by the DR
21 | # self.puck.friction
22 |
23 | def step(self, action):
24 | obs, reward, done, info = super().step(action)
25 | observation = {"observation": obs,
26 | "achieved_goal": obs[6:8],
27 | "desired_goal": obs[8:]}
28 | return observation, reward, done, info
29 |
30 | def reset(self, forced=False):
31 | obs = super(ErgoPusherRandomizedEnv, self).reset()
32 | observation = {"observation": obs,
33 | "achieved_goal": obs[6:8],
34 | "desired_goal": obs[8:]}
35 | return observation
36 |
37 | def update_randomized_params(self):
38 |
39 | self.puck.friction = self.dimensions[0].current_value
40 |
41 | def compute_reward(self, achieved_goal, goal, info):
42 | # Compute distance between goal and the achieved goal.
43 | d = goal_distance(achieved_goal, goal)
44 | if self.reward_type == 'sparse':
45 | return -(d > self.distance_threshold).astype(np.float32)
46 | else:
47 | return -d
--------------------------------------------------------------------------------
/common/randomizer/randomized_ergoreacher.py:
--------------------------------------------------------------------------------
1 | from gym_ergojr.envs import ErgoReacherEnv
2 | import numpy as np
3 |
4 |
5 | class ErgoReacherRandomizedEnv(ErgoReacherEnv):
6 | def __init__(self, **kwargs):
7 | self.dimensions = [] # this will be 8 elements long after wrapper init
8 | self.config_file = kwargs.get('config')
9 |
10 | del kwargs['config']
11 |
12 | super().__init__(**kwargs)
13 |
14 | # # these two are affected by the DR
15 | # self.max_force
16 | # self.max_vel
17 |
18 | def step(self, action):
19 | obs, reward, done, info = super().step(action)
20 | info = {'distance': self.dist.query()}
21 | observation = {"observation": obs,
22 | "achieved_goal": np.asarray(self.get_tip()[0][1:]),
23 | "desired_goal": obs[8:]}
24 | return observation, reward, done, info
25 |
26 | def reset(self, forced=False):
27 | obs = super(ErgoReacherRandomizedEnv, self).reset()
28 | observation = {"observation": obs,
29 | "achieved_goal": np.asarray(self.get_tip()[0][1:]),
30 | "desired_goal": obs[8:]}
31 | return observation
32 |
33 | def update_randomized_params(self):
34 | # these are used automatically in the `step` function
35 | self.max_force = np.zeros(6, np.float32)
36 | self.max_vel = np.zeros(6, np.float32)
37 |
38 | if self.simple:
39 | self.max_force[[1, 2, 4, 5]] = [x.current_value for x in self.dimensions[:4]]
40 | self.max_vel[[1, 2, 4, 5]] = [x.current_value for x in self.dimensions[4:]]
41 | else:
42 | self.max_force[:] = [x.current_value for x in self.dimensions[:6]]
43 | self.max_vel[:] = [x.current_value for x in self.dimensions[6:]]
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | saved-models/
2 | .idea/
3 |
4 | # Byte-compiled / optimized / DLL files
5 | __pycache__/
6 | *.py[cod]
7 | *$py.class
8 |
9 | # C extensions
10 | *.so
11 |
12 | # Distribution / packaging
13 | .Python
14 | build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | *.egg-info/
27 | .installed.cfg
28 | *.egg
29 | MANIFEST
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | .hypothesis/
51 | .pytest_cache/
52 |
53 | # Translations
54 | *.mo
55 | *.pot
56 |
57 | # Django stuff:
58 | *.log
59 | local_settings.py
60 | db.sqlite3
61 |
62 | # Flask stuff:
63 | instance/
64 | .webassets-cache
65 |
66 | # Scrapy stuff:
67 | .scrapy
68 |
69 | # Sphinx documentation
70 | docs/_build/
71 |
72 | # PyBuilder
73 | target/
74 |
75 | # Jupyter Notebook
76 | .ipynb_checkpoints
77 |
78 | # pyenv
79 | .python-version
80 |
81 | # celery beat schedule file
82 | celerybeat-schedule
83 |
84 | # SageMath parsed files
85 | *.sage.py
86 |
87 | # Environments
88 | .env
89 | .venv
90 | env/
91 | venv/
92 | ENV/
93 | env.bak/
94 | venv.bak/
95 |
96 | # Spyder project settings
97 | .spyderproject
98 | .spyproject
99 |
100 | # Rope project settings
101 | .ropeproject
102 |
103 | # mkdocs documentation
104 | /site
105 |
106 | # mypy
107 | .mypy_cache/
108 | .DS_Store
109 |
110 | *.pth
111 | *.pt
112 | *.npy
113 |
--------------------------------------------------------------------------------
/common/randomizer/assets/hook.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/common/svpg/particles/distributions.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from common.svpg.particles.utils import AddBias, init
6 |
7 | """
8 | Modify standard PyTorch distributions so they are compatible with this code.
9 | """
10 |
11 | # Categorical
12 | FixedCategorical = torch.distributions.Categorical
13 |
14 | old_sample = FixedCategorical.sample
15 | FixedCategorical.sample = lambda self: old_sample(self).unsqueeze(-1)
16 |
17 | log_prob_cat = FixedCategorical.log_prob
18 | FixedCategorical.log_probs = lambda self, actions: log_prob_cat(self, actions.squeeze(-1)).view(actions.size(0), -1).sum(-1).unsqueeze(-1)
19 |
20 | FixedCategorical.mode = lambda self: self.probs.argmax(dim=-1, keepdim=True)
21 |
22 |
23 | # Normal
24 | FixedNormal = torch.distributions.Normal
25 |
26 | log_prob_normal = FixedNormal.log_prob
27 | FixedNormal.log_probs = lambda self, actions: log_prob_normal(self, actions).sum(-1, keepdim=True)
28 |
29 | normal_entropy = FixedNormal.entropy
30 | FixedNormal.entropy = lambda self: normal_entropy(self).sum(-1)
31 |
32 | FixedNormal.mode = lambda self: self.mean
33 |
34 |
35 | class Categorical(nn.Module):
36 | def __init__(self, num_inputs, num_outputs):
37 | super(Categorical, self).__init__()
38 |
39 | init_ = lambda m: init(m,
40 | nn.init.orthogonal_,
41 | lambda x: nn.init.constant_(x, 0),
42 | gain=0.01)
43 |
44 | self.linear = init_(nn.Linear(num_inputs, num_outputs))
45 |
46 | def forward(self, x):
47 | x = self.linear(x)
48 | return FixedCategorical(logits=x)
49 |
50 |
51 | class DiagGaussian(nn.Module):
52 | def __init__(self, num_inputs, num_outputs):
53 | super(DiagGaussian, self).__init__()
54 |
55 | init_ = lambda m: init(m,
56 | nn.init.orthogonal_,
57 | lambda x: nn.init.constant_(x, 0))
58 |
59 | self.fc_mean = init_(nn.Linear(num_inputs, num_outputs))
60 | self.logstd = AddBias(torch.zeros(num_outputs))
61 |
62 | def forward(self, x):
63 | action_mean = self.fc_mean(x)
64 |
65 | # An ugly hack for my KFAC implementation.
66 | zeros = torch.zeros(action_mean.size())
67 | if x.is_cuda:
68 | zeros = zeros.cuda()
69 |
70 | action_logstd = self.logstd(zeros)
71 | return FixedNormal(action_mean, action_logstd.exp())
--------------------------------------------------------------------------------
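
A sketch of how the patched distributions above are consumed (sizes are arbitrary); the monkey-patched `log_probs`, `entropy`, and `mode` are the methods the SVPG particle relies on:

    import torch

    from common.svpg.particles.distributions import DiagGaussian

    dist_head = DiagGaussian(num_inputs=64, num_outputs=2)
    features = torch.randn(5, 64)

    dist = dist_head(features)
    action = dist.sample()             # (5, 2)
    log_prob = dist.log_probs(action)  # (5, 1), summed over action dims
    entropy = dist.entropy()           # (5,)
    mode = dist.mode()                 # the Gaussian mean
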
/common/randomizer/controllers/miscalibrated_push_controller.py:
--------------------------------------------------------------------------------
1 | from .pick_and_place_controller import get_move_action
2 |
3 | import numpy as np
4 |
5 | DEBUG = False
6 |
7 |
8 | def get_push_control(obs, atol=1e-3, block_width=0.15, workspace_height=0.1):
9 |
10 | gripper_position = obs['observation'][:3]
11 | block_position = obs['observation'][3:6]
12 | goal = obs['desired_goal']
13 |
14 | desired_block_angle = np.arctan2(goal[0] - block_position[0], goal[1] - block_position[1])
15 | gripper_angle = np.arctan2(goal[0] - gripper_position[0], goal[1] - gripper_position[1])
16 |
17 | push_position = block_position.copy()
18 | push_position[0] += -1. * np.sin(desired_block_angle) * block_width / 2.
19 | push_position[1] += -1. * np.cos(desired_block_angle) * block_width / 2.
20 | push_position[2] += 0.005
21 |
22 | # If the block is already at the place position, do nothing
23 | if np.sum(np.subtract(block_position, goal)**2) < atol:
24 | if DEBUG:
25 | print("The block is already at the place position; do nothing")
26 | return np.array([0., 0., 0., 0.])
27 |
28 | # Angle between gripper and goal vs block and goal is roughly the same
29 | angle_diff = abs((desired_block_angle - gripper_angle + np.pi) % (2*np.pi) - np.pi)
30 |
31 | gripper_sq_distance = (gripper_position[0] - goal[0])**2 + (gripper_position[1] - goal[1])**2
32 | block_sq_distance = (block_position[0] - goal[0])**2 + (block_position[1] - goal[1])**2
33 |
34 | if (gripper_position[2] - push_position[2])**2 < atol and angle_diff < np.pi/4 and block_sq_distance < gripper_sq_distance:
35 |
36 | # Push towards the goal
37 | target_position = goal
38 | target_position[2] = gripper_position[2]
39 | if DEBUG:
40 | print("Push")
41 | return get_move_action(obs, target_position, atol=atol, gain=5.0)
42 |
43 | # If the gripper is above the push position
44 | if (gripper_position[0] - push_position[0])**2 + (gripper_position[1] - push_position[1])**2 < atol:
45 |
46 | # Move down to prepare for push
47 | if DEBUG:
48 | print("Move down to prepare for push")
49 | return get_move_action(obs, push_position, atol=atol)
50 |
51 |
52 | # Else move the gripper to above the push
53 | target_position = push_position.copy()
54 | target_position[2] += workspace_height
55 | if DEBUG:
56 | print("Move to above the push position")
57 |
58 | return get_move_action(obs, target_position, atol=atol)
59 |
60 |
61 |
--------------------------------------------------------------------------------
/common/randomizer/miscalibrated_push_controller.py:
--------------------------------------------------------------------------------
1 | from .controllers.pick_and_place_controller import get_move_action
2 |
3 | import numpy as np
4 |
5 | DEBUG = False
6 |
7 |
8 | def get_push_control(obs, atol=1e-3, block_width=0.15, workspace_height=0.1):
9 |
10 | gripper_position = obs['observation'][:3]
11 | block_position = obs['observation'][3:6]
12 | goal = obs['desired_goal']
13 |
14 | desired_block_angle = np.arctan2(goal[0] - block_position[0], goal[1] - block_position[1])
15 | gripper_angle = np.arctan2(goal[0] - gripper_position[0], goal[1] - gripper_position[1])
16 |
17 | push_position = block_position.copy()
18 | push_position[0] += -1. * np.sin(desired_block_angle) * block_width / 2.
19 | push_position[1] += -1. * np.cos(desired_block_angle) * block_width / 2.
20 | push_position[2] += 0.005
21 |
22 | # If the block is already at the place position, do nothing
23 | if np.sum(np.subtract(block_position, goal)**2) < atol:
24 | if DEBUG:
25 | print("The block is already at the place position; do nothing")
26 | return np.array([0., 0., 0., 0.])
27 |
28 | # Angle between gripper and goal vs block and goal is roughly the same
29 | angle_diff = abs((desired_block_angle - gripper_angle + np.pi) % (2*np.pi) - np.pi)
30 |
31 | gripper_sq_distance = (gripper_position[0] - goal[0])**2 + (gripper_position[1] - goal[1])**2
32 | block_sq_distance = (block_position[0] - goal[0])**2 + (block_position[1] - goal[1])**2
33 |
34 | if (gripper_position[2] - push_position[2])**2 < atol and angle_diff < np.pi/4 and block_sq_distance < gripper_sq_distance:
35 |
36 | # Push towards the goal
37 | target_position = goal
38 | target_position[2] = gripper_position[2]
39 | if DEBUG:
40 | print("Push")
41 | return get_move_action(obs, target_position, atol=atol, gain=5.0)
42 |
43 | # If the gripper is above the push position
44 | if (gripper_position[0] - push_position[0])**2 + (gripper_position[1] - push_position[1])**2 < atol:
45 |
46 | # Move down to prepare for push
47 | if DEBUG:
48 | print("Move down to prepare for push")
49 | return get_move_action(obs, push_position, atol=atol)
50 |
51 |
52 | # Else move the gripper to above the push
53 | target_position = push_position.copy()
54 | target_position[2] += workspace_height
55 | if DEBUG:
56 | print("Move to above the push position")
57 |
58 | return get_move_action(obs, target_position, atol=atol)
59 |
60 |
61 |
--------------------------------------------------------------------------------
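
For context, a hedged sketch of driving a push environment with the scripted controller above. The env id is the one used in randomizer/test.py, and its registration lives in common/randomizer/__init__.py, which is not shown in this section:

    import gym

    import common.randomizer  # registers the randomized env ids (module not shown here)
    from common.randomizer.miscalibrated_push_controller import get_push_control

    env = gym.make("FetchPushRandomizedEnv-v0")
    obs = env.reset()
    for _ in range(100):
        action = get_push_control(obs)              # 4-D gripper action
        obs, reward, done, info = env.step(action)
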
/common/svpg/particles/svpg_particle.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 |
5 | from .utils import orthogonal_init
6 | from .distributions import Categorical, DiagGaussian
7 |
8 |
9 | class SVPGParticleCritic(nn.Module):
10 | def __init__(self, input_dim, output_dim, hidden_dim):
11 | super(SVPGParticleCritic, self).__init__()
12 |
13 | self.critic = nn.Sequential(
14 | orthogonal_init(nn.Linear(input_dim, hidden_dim)),
15 | nn.Tanh(),
16 | orthogonal_init(nn.Linear(hidden_dim, hidden_dim)),
17 | nn.Tanh(),
18 | orthogonal_init(nn.Linear(hidden_dim, 1))
19 | )
20 |
21 | def forward(self, x):
22 | return self.critic(x)
23 |
24 | class SVPGParticleActorBase(nn.Module):
25 | def __init__(self, input_dim, hidden_dim):
26 | super(SVPGParticleActorBase, self).__init__()
27 |
28 | self.actor_hidden = nn.Sequential(
29 | orthogonal_init(nn.Linear(input_dim, hidden_dim)),
30 | nn.Tanh(),
31 | orthogonal_init(nn.Linear(hidden_dim, hidden_dim)),
32 | nn.Tanh(),
33 | )
34 |
35 | def forward(self, x):
36 | return self.actor_hidden(x)
37 |
38 |
39 | class SVPGParticle(nn.Module):
40 | """Implements a AC architecture for a Discrete Advantage
41 | Actor Critic Policy, used inside of SVPG
42 | """
43 | def __init__(self, input_dim, output_dim, hidden_dim, discrete, freeze=False):
44 | super(SVPGParticle, self).__init__()
45 |
46 | self.critic = SVPGParticleCritic(input_dim, output_dim, hidden_dim)
47 | self.actor_hidden = SVPGParticleActorBase(input_dim, hidden_dim)
48 |
49 | if discrete:
50 | self.dist = Categorical(hidden_dim, output_dim)
51 | else:
52 | self.dist = DiagGaussian(hidden_dim, output_dim)
53 |
54 | if freeze:
55 | self.freeze()
56 |
57 | self.reset()
58 |
59 | def forward(self, x):
60 | actor_hidden = self.actor_hidden(x)
61 | dist = self.dist(actor_hidden)
62 | value = self.critic(x)
63 |
64 | return dist, value
65 |
66 | def freeze(self):
67 | for param in self.critic.parameters():
68 | param.requires_grad = False
69 |
70 | for param in self.actor_hidden.parameters():
71 | param.requires_grad = False
72 |
73 | for param in self.dist.parameters():
74 | param.requires_grad = False
75 |
76 | def reset(self):
77 | self.saved_log_probs = []
78 | self.saved_klds = []
79 | self.rewards = []
--------------------------------------------------------------------------------
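
An illustrative instantiation of SVPGParticle above (the 1-D input/output and the hidden size of 100 are arbitrary placeholders):

    import torch

    from common.svpg.particles.svpg_particle import SVPGParticle

    particle = SVPGParticle(input_dim=1, output_dim=1, hidden_dim=100, discrete=False)

    state = torch.randn(4, 1)
    dist, value = particle(state)       # action distribution and state value
    action = dist.sample()              # (4, 1)
    log_prob = dist.log_probs(action)   # (4, 1)
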
/common/randomizer/ergo_objects.py:
--------------------------------------------------------------------------------
1 | import random
2 | import os
3 | import gym
4 | import numpy as np
5 | from gym import spaces
6 | from tqdm import tqdm
7 | from gym_ergojr.sim.objects import Puck
8 | import pybullet as p
9 |
10 | PUSHER_GOAL_X = [-.2, -.1] # Get three points
11 | PUSHER_GOAL_Y = [-.1, .05] # Get three points
12 | # Define a circle equation with three points. After that choose four equidistant points on that circle.
13 |
14 | circle1 = np.array([[-0.1, -0.1],
15 | [-0.15, -0.1],
16 | [-0.1, 0.05]])
17 | shift = [-0.029, -0.0475, -0.066]
18 |
19 |
20 | def define_circle(p1, p2, p3):
21 | temp = p2[0] * p2[0] + p2[1] * p2[1]
22 | bc = (p1[0] * p1[0] + p1[1] * p1[1] - temp) / 2
23 | cd = (temp - p3[0] * p3[0] - p3[1] * p3[1]) / 2
24 | det = (p1[0] - p2[0]) * (p2[1] - p3[1]) - (p2[0] - p3[0]) * (p1[1] - p2[1])
25 |
26 | if abs(det) < 1.0e-6:
27 | return (None, np.inf)
28 |
29 | # Center of circle
30 | cx = (bc*(p2[1] - p3[1]) - cd*(p1[1] - p2[1])) / det
31 | cy = ((p1[0] - p2[0]) * cd - (p2[0] - p3[0]) * bc) / det
32 |
33 | radius = np.sqrt((cx - p1[0])**2 + (cy - p1[1])**2)
34 | return ((cx, cy), radius)
35 |
36 |
37 | def angle(line1, line2):
38 | return np.arccos(np.dot(line1, line2)/(np.linalg.norm(line1)*np.linalg.norm(line2)))
39 |
40 |
41 | goal_pos = np.asarray([[-0.154, 0.0, 0], [-0.15, 0.032, 0], [-0.15, -0.032, 0], [-0.137, 0.062, 0],
42 | [-0.137, -0.062, 0], [-0.117, 0.088, 0], [-0.117, -0.088, 0], [-0.172, 0.0, 0],
43 | [-0.168, 0.032, 0], [-0.168, -0.032, 0],[-0.155, 0.062, 0],[-0.155, -0.062, 0], [-0.135, 0.088, 0],
44 | [-0.135, -0.088, 0], [-0.191, 0.0, 0], [-0.191, -0.0, 0], [-0.186, 0.032, 0], [-0.186, -0.032, 0],
45 | [-0.174, 0.062, 0], [-0.174, -0.062, 0], [-0.154, 0.088, 0], [-0.154, -0.08, 0]]) # Don't hard code. Change afterwards
46 |
47 | puck_positions = np.array([[-0.07, 0.05],
48 | [-0.85, 0.65],
49 | [-0.10, 0.08]])
50 | class PuckEval(Puck):
51 | def __init__(self, friction=0.4):
52 | super(PuckEval, self).__init__()
53 | self.goal = None
54 |
55 | def reset(self):
56 | super().reset()
57 |
58 | def hard_reset(self):
59 | super().hard_reset()
60 |
61 | def add_target(self):
62 | self.dbo.goal = goal_pos[int(os.environ['goal_index'])]
63 | self.obj_visual = p.createVisualShape(
64 | p.GEOM_CYLINDER, radius=0.02, length=0.01, rgbaColor=[0, 1, 0, 1])
65 | self.target = p.createMultiBody(
66 | baseVisualShapeIndex=self.obj_visual, basePosition=self.dbo.goal)
67 |
68 |
--------------------------------------------------------------------------------
/common/randomizer/wrappers.py:
--------------------------------------------------------------------------------
1 | import gym
2 | import json
3 | import numpy as np
4 |
5 | import gym.spaces as spaces
6 |
7 | from common.randomizer.dimension import Dimension
8 |
9 |
10 | class RandomizedEnvWrapper(gym.Wrapper):
11 | """Creates a randomization-enabled enviornment, which can change
12 | physics / simulation parameters without relaunching everything
13 | """
14 |
15 | def __init__(self, env, seed):
16 | super(RandomizedEnvWrapper, self).__init__(env)
17 | self.config_file = self.unwrapped.config_file
18 |
19 | self._load_randomization_dimensions(seed)
20 | self.unwrapped.update_randomized_params()
21 | self.randomized_default = ['random'] * len(self.unwrapped.dimensions)
22 |
23 | def _load_randomization_dimensions(self, seed):
24 | """ Helper function to load environment defaults ranges
25 | """
26 | self.unwrapped.dimensions = []
27 |
28 | with open(self.config_file, mode='r') as f:
29 | config = json.load(f)
30 |
31 | for dimension in config['dimensions']:
32 | self.unwrapped.dimensions.append(
33 | Dimension(
34 | default_value=dimension['default'],
35 | multiplier_min=dimension['multiplier_min'],
36 | multiplier_max=dimension['multiplier_max'],
37 | name=dimension['name']
38 | )
39 | )
40 |
41 | nrand = len(self.unwrapped.dimensions)
42 | self.unwrapped.randomization_space = spaces.Box(0, 1, shape=(nrand,), dtype=np.float32)
43 |
44 | def randomize(self, randomized_values=[-1]):
45 | """Sets the parameter values such that a call to`update_randomized_params()`
46 | will generate an environment with those settings.
47 |
48 | Passing a list of 'default' strings will give the default value
49 | Passing a list of 'random' strings will give a purely random value for that dimension
50 | Passing a list of -1 integers will have the same effect.
51 | """
52 | for dimension, randomized_value in enumerate(randomized_values):
53 | if randomized_value == 'default':
54 | self.unwrapped.dimensions[dimension].current_value = \
55 | self.unwrapped.dimensions[dimension].default_value
56 | elif randomized_value != 'random' and randomized_value != -1:
57 | assert 0.0 <= randomized_value <= 1.0, "using incorrect: {}".format(randomized_value)
58 | self.unwrapped.dimensions[dimension].current_value = \
59 | self.unwrapped.dimensions[dimension].rescale(randomized_value)
60 | else: # random
61 | self.unwrapped.dimensions[dimension].randomize()
62 |
63 | self.unwrapped.update_randomized_params()
64 |
65 | def step(self, action):
66 | return self.env.step(action)
67 |
68 | def reset(self, **kwargs):
69 | return self.env.reset(**kwargs)
--------------------------------------------------------------------------------
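A hedged usage sketch of `RandomizedEnvWrapper` (running this needs `gym_ergojr`/pybullet for this particular env id; the single-element list assumes the pusher config exposes one randomization dimension, matching `--n-params=1` in the README):

```
import gym
import common.randomizer  # noqa: F401 -- executes the register() calls for the env ids
from common.randomizer.wrappers import RandomizedEnvWrapper

env = RandomizedEnvWrapper(gym.make('ErgoPushRandomizedEnv-Headless-v0'), seed=123)

env.randomize(['default'])   # simulator defaults for every dimension
env.randomize(['random'])    # uniformly sampled value (a -1 entry has the same effect)
env.randomize([0.75])        # a float in [0, 1] is rescaled into the configured range

obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
```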
/experiments/evaluate_ergo_envs.py:
--------------------------------------------------------------------------------
1 | import re
2 | from common.randomizer.wrappers import RandomizedEnvWrapper
3 | from locale import atof
4 | import time
5 | import torch
6 | import common.randomizer as randomizer
7 | from arguments import get_args
8 | import gym
9 | import gym_ergojr
10 | import numpy as np
11 | import os.path as osp
12 | import os
13 | import time
14 | from common.agents.ddpg.ddpg import DDPG
15 |
16 |
17 | def atof(text):
18 | try:
19 | retval = float(text)
20 | except ValueError:
21 | retval = text
22 | return retval
23 |
24 |
25 | def natural_keys(text):
26 | return [atof(c) for c in re.split(r'[+-]?([0-9]+(?:[.][0-9]*)?|[.][0-9]+)', text) ]
27 |
28 |
29 | if __name__ == '__main__':
30 | args = get_args()
31 | # load the model param
32 | SEED = [41, 42, 43, 44]
33 | APPROACH = ['udr']
34 | SP_PERCENT = [0.0]
35 | for i, approach in enumerate(APPROACH):
36 | for seed in SEED:
37 | print('------------------------------------------------------------------------')
38 | print(f'Approach : {approach} | Seed : {seed} | Env : {args.env_name}')
39 | print('------------------------------------------------------------------------')
40 | args.save_dir = osp.join('/saved_models', 'sp{}polyak{}'.format(SP_PERCENT[i], args.sp_polyak) + '-' + approach)
41 | model_path = osp.join(os.getcwd() + args.save_dir, str(seed), args.env_name)
42 | models_path = os.listdir(model_path + '/')
43 |
44 | # List the file names that start with model and sort according to number.
45 | models_path = list(filter(lambda x: x.lower().endswith("actor.pth"), models_path))
46 | models_path.sort(key=natural_keys)
47 |
48 | env = gym.make(args.env_name)
49 | env = RandomizedEnvWrapper(env, seed=12)
50 | env.randomize(['default'] * args.n_param)
51 | env.seed(args.seed + 100)
52 | eval_episodes = 1
53 |
54 | state_dim = env.observation_space.shape[0]
55 | action_dim = env.action_space.shape[0]
56 | max_action = float(env.action_space.high[0])
57 | goal_dim = 2
58 | final_dist = []
59 | policy = DDPG(args, state_dim, action_dim, max_action, goal_dim)
60 | for model in models_path:
61 | print(model)
62 | policy.load(model, model_path)
63 | avg_reward = 0.
64 | avg_dist = 0
65 | model_dist = []
66 | for key in range(eval_episodes):
67 | obs = env.reset()
68 | done = False
69 | while not done:
70 | time.sleep(0.01)
71 | action = policy.select_action(np.array(obs, dtype=np.float64))
72 | obs, reward, done, info = env.step(action)
73 | avg_reward += reward
74 |
75 | avg_dist += info["distance"]
76 | final_dist.append(avg_dist/eval_episodes)
77 | np.save(model_path + f'{args.mode}_evaluation.npy', np.asarray(final_dist))
78 |
79 |
80 |
--------------------------------------------------------------------------------
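The `atof`/`natural_keys` pair above exists so checkpoint files sort numerically rather than lexically; a quick standalone check (file names made up for illustration):

```
import re

def atof(text):
    try:
        return float(text)
    except ValueError:
        return text

def natural_keys(text):
    return [atof(c) for c in re.split(r'[+-]?([0-9]+(?:[.][0-9]*)?|[.][0-9]+)', text)]

files = ['model_10_actor.pth', 'model_2_actor.pth', 'model_1_actor.pth']
print(sorted(files))                    # lexical order: 1, 10, 2
print(sorted(files, key=natural_keys))  # numeric order: 1, 2, 10
```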
/utils/plot_utils.py:
--------------------------------------------------------------------------------
1 | import itertools
2 | import numpy as np
3 | import os
4 | import os.path as osp
5 |
6 |
7 | def learning_curve(args, selfplay_percent, selplay_index, approach, seed):
8 | '''
9 | args: arguments
10 | selfplay_percent: list of selfplay percentages (list)
11 | selplay_index: index of particular approach (integer)
12 |     approach: name of the approach (e.g. 'udr', 'adr', 'unsupervised-default')
13 | seed: list of seed numbers
14 | return: returns mean and standard deviation across all seeds.
15 | '''
16 | default_dist = []
17 | for s in seed:
18 | a = []
19 | save_dir = osp.join(os.getcwd() + '/' + args.save_dir, "sp" + str(selfplay_percent[selplay_index]) + "polyak" +
20 | str(args.polyak) + '-' + str(approach), str(s) + '/')
21 | array = np.load(save_dir + f'{args.env_name}{args.mode}_evaluation.npy', allow_pickle=True)
22 | for arr in array:
23 | a.append(arr)
24 | array = np.asarray(a)
25 | print(f'Array shape : {array.shape} | Seed : {s} | Approach : {approach} | selplay_index : {selplay_index}')
26 | default_dist.append([array[:199]])
27 |
28 | mean = np.mean(np.asarray(default_dist), axis=0)
29 | N = args.window_size
30 | mode = 'causal'
31 | mean = smooth(np.reshape(mean, (mean.shape[1])), N, mode=mode)
32 | std = np.std(np.asarray(default_dist), axis=0)
33 | std = smooth(np.reshape(std, (std.shape[1])), N, mode=mode)
34 |
35 | return mean, std
36 |
37 |
38 | def smooth(y, radius, mode='two_sided', valid_only=False):
39 | '''
40 |     Smooth signal y, where radius determines the size of the window
41 |     mode='two_sided':
42 | average over the window [max(index - radius, 0), min(index + radius, len(y)-1)]
43 | mode='causal':
44 | average over the window [max(index - radius, 0), index]
45 | valid_only: put nan in entries where the full-sized window is not available
46 | '''
47 | assert mode in ('two_sided', 'causal')
48 | if len(y) < 2*radius+1:
49 | return np.ones_like(y) * y.mean()
50 | elif mode == 'two_sided':
51 | convkernel = np.ones(2 * radius+1)
52 | out = np.convolve(y, convkernel,mode='same') / np.convolve(np.ones_like(y), convkernel, mode='same')
53 | if valid_only:
54 | out[:radius] = out[-radius:] = np.nan
55 | elif mode == 'causal':
56 | convkernel = np.ones(radius)
57 | out = np.convolve(y, convkernel,mode='full') / np.convolve(np.ones_like(y), convkernel, mode='full')
58 | out = out[:-radius+1]
59 | if valid_only:
60 | out[:radius] = np.nan
61 | return out
62 |
63 |
64 | def sampling_plot(args, seed, sp_index, approach='adr'):
65 |
66 | for s in seed:
67 | save_dir = osp.join(args.save_dir, "sp" + str(sp_index) + "polyak" +
68 | str(args.polyak) + '-' + str(approach), str(s), args.env_name + '/')
69 | print(save_dir)
70 | alice_envs = np.load(save_dir + f'alice_envs.npy', allow_pickle=True)
71 | envs = alice_envs
72 | envs = list(itertools.chain(*envs))
73 | list_ = np.reshape(envs, (-1, 1))
74 | return list_
75 |
76 |
--------------------------------------------------------------------------------
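A quick check of `smooth()` on a step signal, assuming the repository root is on `PYTHONPATH`; `causal` only averages backwards in time, while `two_sided` uses a symmetric window:

```
import numpy as np
from utils.plot_utils import smooth

y = np.concatenate([np.zeros(10), np.ones(10)])
print(smooth(y, radius=3, mode='causal').shape)   # (20,) -- same length as y
print(smooth(y, radius=3, mode='two_sided')[:5])  # still all zeros near the start
```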
/README.md:
--------------------------------------------------------------------------------
1 | # Self-Supervised Active Domain Randomization
2 | This repository provides official code base for the paper "Generating Automatic Curricula via Self-Supervised Active Domain Randomization".
3 | 
4 |
5 | ## Experiments
6 | We perform our experiments on ErgoReacher, a 4-DoF arm, and ErgoPusher, a 3-DoF arm, both in simulation and on the real robot.
7 | ### Important Flags
8 | There are a few important flags that differentiate the various experiments. `--approach` specifies the approach ['udr' | 'unsupervised-default' | 'unsupervised-adr'], and `--sp-percent` specifies the self-play percentage. For all our experiments we use either `--sp-percent=1.0` (full self-play / completely unsupervised) or `--sp-percent=0.0` (no self-play / completely supervised), depending on the `--approach`. `--only-sp` specifies where Bob operates: it is `True` for `--approach=unsupervised-default` and `False` for `--approach=unsupervised-adr`. For all reacher experiments we used `--n-params=8`, and for pusher experiments we used `--n-params=1`.
9 |
10 | ### Uniform Domain Randomization
11 | For `ErgoReacher` baseline experiments:
12 |
13 | `python experiments/ddpg_train.py --sp-percent 0.0 --approach 'udr' --env-name='ErgoReacherRandomizedEnv-Headless-v0' --n-params=8 `
14 |
15 | For `ErgoPusher` baseline experiments:
16 |
17 | `python experiments/ddpg_train.py --sp-percent 0.0 --approach 'udr' --env-name='ErgoPushRandomizedEnv-Headless-v0' --n-params=1`
18 |
19 | ### Unsupervised Default
20 | For `ErgoReacher` experiments:
21 |
22 | `python experiments/ddpg_train.py --sp-percent 1.0 --approach 'unsupervised-default' --only-sp=True --env-name='ErgoReacherRandomizedEnv-Headless-v0' --n-params=8 `
23 |
24 | For `ErgoPusher` experiments:
25 |
26 | `python experiments/ddpg_train.py --sp-percent 1.0 --approach 'unsupervised-default' --only-sp=True --env-name='ErgoPushRandomizedEnv-Headless-v0' --n-params=1`
27 |
28 | ### Self-Supervised Active Domain Randomization
29 | For `ErgoReacher` experiments:
30 |
31 | `python experiments/ddpg_train.py --sp-percent 1.0 --approach 'unsupervised-adr' --only-sp=True --env-name='ErgoReacherRandomizedEnv-Headless-v0' --n-params=8`
32 |
33 | For `ErgoPusher` experiments:
34 |
35 | `python experiments/ddpg_train.py --sp-percent 1.0 --approach 'unsupervised-adr' --only-sp=True --env-name='ErgoPushRandomizedEnv-Headless-v0' --n-params=1`
36 |
37 | ## Evaluations
38 | To evaluate the trained models in simulation, execute the following from the command line.
39 |
40 | Here `--mode` can be `[default | hard]`. Below is an example that evaluates `ErgoReacher` in the default environment; you can likewise evaluate `ErgoPusher` with different modes.
41 |
42 | `python experiments/evaluate_ergo_envs.py --env-name "ErgoReacherRandomizedEnv-Headless-v0" --mode='default' --sp-polyak 0.95 --n-params=8`
43 |
44 | ## Reference
45 |
46 | ```
47 | @article{raparthy2020selfsupadr,
48 | title={Generating Automatic Curricula via Self-Supervised Active Domain Randomization},
49 | author={Raparthy, Sharath Chandra and Mehta, Bhairav and Golemo, Florian and Paull, Liam},
50 | url={https://arxiv.org/abs/2002.07911},
51 | year={2020}
52 | }
53 | ```
54 |
55 | Built by [@Sharath](https://sharathraparthy.github.io/) and [@Bhairav Mehta](https://bhairavmehta95.github.io/)
56 |
--------------------------------------------------------------------------------
/common/discriminator/discriminator_rewarder.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import torch
4 | import torch.nn as nn
5 | from torch.autograd import Variable
6 |
7 | from common.models.discriminator import MLPDiscriminator
8 |
9 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
10 |
11 |
12 | class DiscriminatorRewarder(object):
13 | def __init__(self, state_dim, action_dim, discriminator_batchsz, reward_scale,
14 |                  load_discriminator=False, discriminator_lr=3e-3, add_pz=True, randomized_env_id=None):
15 | self.discriminator = MLPDiscriminator(
16 | state_dim=state_dim,
17 | action_dim=action_dim
18 | )
19 |
20 | self.discriminator_criterion = nn.BCELoss()
21 | self.discriminator_optimizer = torch.optim.Adam(self.discriminator.parameters(), lr=discriminator_lr)
22 | self.reward_scale = reward_scale
23 | self.batch_size = discriminator_batchsz
24 | self.add_pz = add_pz
25 |
26 | if load_discriminator:
27 | self._load_discriminator(randomized_env_id)
28 |
29 | def calculate_rewards(self, randomized_trajectory):
30 | """Discriminator based reward calculation
31 | We want to use the negative of the adversarial calculation (Normally, -log(D)). We want to *reward*
32 | our simulator for making it easier to discriminate between the reference env + randomized onea
33 | """
34 | score, _, _ = self.get_score(randomized_trajectory)
35 | reward = np.log(score)
36 |
37 | if self.add_pz:
38 | reward -= np.log(0.5)
39 |
40 | return self.reward_scale * reward
41 |
42 | def get_score(self, trajectory):
43 | """Discriminator based reward calculation
44 | We want to use the negative of the adversarial calculation (Normally, -log(D)). We want to *reward*
45 | our simulator for making it easier to discriminate between the reference env + randomized onea
46 | """
47 | traj_tensor = self._trajectory2tensor(trajectory).float()
48 |
49 | with torch.no_grad():
50 | score = (self.discriminator(traj_tensor).cpu().detach().numpy()+1e-8)
51 | return score.mean(), np.median(score), np.sum(score)
52 |
53 | def train_discriminator(self, reference_trajectory, randomized_trajectory, iterations):
54 | """Trains discriminator to distinguish between reference and randomized state action tuples
55 | """
56 | for _ in range(iterations):
57 | randind = np.random.randint(0, len(randomized_trajectory[0]), size=int(self.batch_size))
58 | refind = np.random.randint(0, len(reference_trajectory[0]), size=int(self.batch_size))
59 |
60 | randomized_batch = self._trajectory2tensor(randomized_trajectory[randind])
61 | reference_batch = self._trajectory2tensor(reference_trajectory[refind])
62 |
63 | g_o = self.discriminator(randomized_batch)
64 | e_o = self.discriminator(reference_batch)
65 |
66 | self.discriminator_optimizer.zero_grad()
67 |
68 | discrim_loss = self.discriminator_criterion(g_o, torch.ones((len(randomized_batch), 1), device=device)) + \
69 | self.discriminator_criterion(e_o, torch.zeros((len(reference_batch), 1), device=device))
70 | discrim_loss.backward()
71 |
72 | self.discriminator_optimizer.step()
73 |
74 | def _load_discriminator(self, name, path='saved-models/discriminator/discriminator_{}.pth'):
75 | self.discriminator.load_state_dict(torch.load(path.format(name), map_location=device))
76 |
77 | def _save_discriminator(self, name, path='saved-models/discriminator/discriminator_{}.pth'):
78 | torch.save(self.discriminator.state_dict(), path.format(name))
79 |
80 | def _trajectory2tensor(self, trajectory):
81 | return torch.from_numpy(trajectory).float().to(device)
82 |
--------------------------------------------------------------------------------
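Numerically, `calculate_rewards()` with `add_pz=True` amounts to `reward_scale * (log(D) - log(0.5))`: a perfectly confused discriminator (D = 0.5) yields zero reward, while easily distinguishable environments yield positive reward. A standalone restatement of that formula:

```
import numpy as np

def adr_reward(score, reward_scale=1.0, add_pz=True):
    # Mirrors calculate_rewards(): log(D), shifted by -log(0.5) when add_pz is set.
    reward = np.log(score)
    if add_pz:
        reward -= np.log(0.5)
    return reward_scale * reward

for d in (0.5, 0.75, 0.95):
    print(d, round(float(adr_reward(d)), 3))   # 0.0, 0.405, 0.642
```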
/common/agents/selfplay_models.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.optim as optim
4 | from torch.distributions import Bernoulli
5 | import numpy as np
6 |
7 | eps = np.finfo(np.float32).eps.item()
8 |
9 |
10 | class BernoulliPolicyFetch(nn.Module):
11 | def __init__(self, goal_dim, action_dim=1):
12 | super(BernoulliPolicyFetch, self).__init__()
13 | self.base = nn.Sequential(
14 | nn.Linear(goal_dim * 2, 300),
15 | nn.ReLU(),
16 | nn.Linear(300, 300),
17 | nn.ReLU(),
18 | )
19 |
20 | self.out = nn.Linear(300, 1)
21 | self.value = nn.Linear(300, 1)
22 |
23 | self.saved_log_probs = []
24 | self.rewards = []
25 | self.values = []
26 |
27 | def forward(self, x):
28 | x = self.base(x)
29 | termination_prob = self.out(x)
30 | value = self.value(x)
31 | return torch.sigmoid(termination_prob), value
32 |
33 |
34 | class AlicePolicyFetch:
35 | def __init__(self, args, goal_dim, action_dim=1):
36 | self.policy = BernoulliPolicyFetch(goal_dim, action_dim=1)
37 | self.optimizer = optim.Adam(self.policy.parameters(), lr=3e-3)
38 | # self.comm = setup_mpi(self.policy)
39 | self.args = args
40 |
41 | def select_action(self, state, deterministic=False, save_log_probs=True):
42 | state = torch.from_numpy(state).float().unsqueeze(0)
43 | probs, value = self.policy(state)
44 | m = Bernoulli(probs)
45 | action = m.sample()
46 | if save_log_probs:
47 | self.policy.saved_log_probs.append(m.log_prob(action))
48 | self.policy.values.append(value)
49 |
50 | return action.cpu().data.numpy()[0]
51 |
52 | def log(self, reward):
53 | self.policy.rewards.append(reward)
54 |
55 | def load_from_file(self, file):
56 | self.policy.load_state_dict(torch.load(file))
57 |
58 | def load_from_policy(self, original):
59 | with torch.no_grad():
60 | for param, target_param in zip(self.policy.parameters(), original.policy.parameters()):
61 | param.data.copy_((1 - self.args.polyak) * param.data + self.args.polyak * target_param.data)
62 | param.requires_grad = False
63 |
64 | def perturb(self, alpha, weight_noise, bias_noise):
65 | with torch.no_grad():
66 | weight_noise = torch.from_numpy(alpha * weight_noise).float()
67 | bias_noise = torch.from_numpy(alpha * bias_noise).float()
68 | for param, noise in zip(self.policy.parameters(), [weight_noise, bias_noise]):
69 | param.add_(noise)
70 | param.requires_grad = False
71 |
72 | def finish_episode(self, gamma):
73 | R = 0
74 | returns = []
75 |
76 | # Normalize
77 | self.policy.rewards = np.array(self.policy.rewards)
78 | self.policy.rewards = (self.policy.rewards - self.policy.rewards.mean()) / (self.policy.rewards.std() + eps)
79 |
80 | for r in self.policy.rewards[::-1]:
81 | R = r + gamma * R
82 | returns.insert(0, torch.FloatTensor([R]).unsqueeze(1))
83 |
84 | log_probs = torch.cat(self.policy.saved_log_probs)
85 | returns = torch.cat(returns).detach()
86 | values = torch.cat(self.policy.values)
87 |
88 | advantage = returns - values
89 |
90 | actor_loss = -(log_probs * advantage.detach()).mean()
91 | critic_loss = advantage.pow(2).mean()
92 | loss = actor_loss + 0.5 * critic_loss
93 |
94 | self.optimizer.zero_grad()
95 | loss.backward()
96 | # sync_grads(self.comm, self.policy)
97 | self.optimizer.step()
98 |
99 | self.policy.rewards = []
100 | self.policy.saved_log_probs = []
101 | self.policy.values = []
--------------------------------------------------------------------------------
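A hedged sketch of how `AlicePolicyFetch` is driven (the state features and reward below are placeholders; only `args.polyak` is read by `load_from_policy`, so a plain namespace stands in for the full argument parser):

```
from types import SimpleNamespace
import numpy as np
from common.agents.selfplay_models import AlicePolicyFetch

args = SimpleNamespace(polyak=0.95)          # only used by load_from_policy()
alice = AlicePolicyFetch(args, goal_dim=2)

for _ in range(5):                           # one short pseudo-episode
    state = np.random.randn(4).astype(np.float32)   # 2 * goal_dim features
    stop = alice.select_action(state)               # Bernoulli sample: 1 = hand control to Bob
    alice.log(reward=float(np.random.rand()))       # placeholder per-step reward

alice.finish_episode(gamma=0.1)              # REINFORCE update with a value baseline
```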
/common/randomizer/residual_fetch_push_env.py:
--------------------------------------------------------------------------------
1 | from .miscalibrated_push_controller import get_push_control
2 |
3 | from gym.envs.robotics import FetchPushEnv
4 |
5 | import gym
6 | from gym import error, spaces, utils
7 | from gym.utils import seeding
8 |
9 | import numpy as np
10 | import time
11 |
12 |
13 | class ResidualSlipperyPushEnv(gym.Env):
14 |
15 | def __init__(self, *args, **kwargs):
16 | self.fetch_env = gym.make("FetchPush-v1")
17 |
18 | for i in range(len(self.fetch_env.env.sim.model.geom_friction)):
19 | self.fetch_env.env.sim.model.geom_friction[i] = [18e-2, 5.e-3, 1e-4]
20 |
21 | self.metadata = self.fetch_env.metadata
22 | self.hardcoded_controller = None
23 | self.action_space = self.fetch_env.action_space
24 | self.observation_space = self.fetch_env.observation_space
25 |
26 |
27 | def step(self, residual_action):
28 | residual_action = 2. * residual_action
29 |
30 | action = np.add(residual_action, get_push_control(self._last_observation))
31 | action = np.clip(action, -1, 1)
32 | observation, reward, done, debug_info = self.fetch_env.step(action)
33 | self._last_observation = observation
34 |
35 | return observation, reward, done, debug_info
36 |
37 | def reset(self):
38 | observation = self.fetch_env.reset()
39 | self._last_observation = observation
40 | return observation
41 |
42 | def seed(self, seed=0):
43 | return self.fetch_env.seed(seed=seed)
44 |
45 | def render(self, mode="human", *args, **kwargs):
46 | # See https://github.com/openai/gym/issues/1081
47 | self.fetch_env.env._render_callback()
48 | if mode == 'rgb_array':
49 | self.fetch_env.env._get_viewer().render()
50 | width, height = 3350, 1800
51 | data = self.fetch_env.env._get_viewer().read_pixels(width, height, depth=False)
52 | # original image is upside-down, so flip it
53 | return data[::-1, :, :]
54 | elif mode == 'human':
55 | self.fetch_env.env._get_viewer().render()
56 |
57 | return self.fetch_env.render(*args, **kwargs)
58 |
59 | def compute_reward(self, *args, **kwargs):
60 | return self.fetch_env.compute_reward(*args, **kwargs)
61 |
62 | class SlipperyPushEnv(gym.Env):
63 |
64 | def __init__(self, *args, **kwargs):
65 | self.fetch_env = gym.make("FetchPush-v1")
66 |
67 | for i in range(len(self.fetch_env.env.sim.model.geom_friction)):
68 | self.fetch_env.env.sim.model.geom_friction[i] = [18e-2, 5.e-3, 1e-4]
69 |
70 | self.metadata = self.fetch_env.metadata
71 |
72 | self.action_space = self.fetch_env.action_space
73 | self.observation_space = self.fetch_env.observation_space
74 |
75 | def step(self, action):
76 | return self.fetch_env.step(action)
77 |
78 | def reset(self):
79 | return self.fetch_env.reset()
80 |
81 | def seed(self, seed=0):
82 | return self.fetch_env.seed(seed=seed)
83 |
84 | def render(self, mode="human", *args, **kwargs):
85 | # See https://github.com/openai/gym/issues/1081
86 | self.fetch_env.env._render_callback()
87 | if mode == 'rgb_array':
88 | self.fetch_env.env._get_viewer().render()
89 | width, height = 3350, 1800
90 | data = self.fetch_env.env._get_viewer().read_pixels(width, height, depth=False)
91 | # original image is upside-down, so flip it
92 | return data[::-1, :, :]
93 | elif mode == 'human':
94 | self.fetch_env.env._get_viewer().render()
95 |
96 | return self.fetch_env.render(*args, **kwargs)
97 |
98 | def compute_reward(self, *args, **kwargs):
99 | return self.fetch_env.compute_reward(*args, **kwargs)
100 |
101 |
--------------------------------------------------------------------------------
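The residual idea in `ResidualSlipperyPushEnv.step()` reduces to one line of numpy: the learned policy only adds a scaled correction on top of the hard-coded push controller, and the sum is clipped back into the action range. The numbers below are made up:

```
import numpy as np

controller_action = np.array([0.3, -0.1, 0.0, 0.0])   # e.g. get_push_control(obs)
residual_action = np.array([-0.05, 0.2, 0.0, 0.0])    # policy output in [-1, 1]

action = np.clip(controller_action + 2.0 * residual_action, -1, 1)
print(action)   # approx [0.2, 0.3, 0.0, 0.0]
```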
/common/randomizer/controllers/pick_and_place_controller.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | DEBUG = False
4 |
5 | def get_move_action(observation, target_position, atol=1e-3, gain=10., close_gripper=False):
6 | """
7 |     Move the end effector toward a target position.
8 | """
9 |     # Get the current end-effector position
10 | current_position = observation['observation'][:3]
11 |
12 | action = gain * np.subtract(target_position, current_position)
13 | if close_gripper:
14 | gripper_action = -1.
15 | else:
16 | gripper_action = 0.
17 | action = np.hstack((action, gripper_action))
18 |
19 | return action
20 |
21 | def block_is_grasped(obs, relative_grasp_position, atol=1e-3):
22 | return block_inside_grippers(obs, relative_grasp_position, atol=atol) and grippers_are_closed(obs, atol=atol)
23 |
24 | def block_inside_grippers(obs, relative_grasp_position, atol=1e-3):
25 | gripper_position = obs['observation'][:3]
26 | block_position = obs['observation'][3:6]
27 |
28 | relative_position = np.subtract(gripper_position, block_position)
29 |
30 | return np.sum(np.subtract(relative_position, relative_grasp_position)**2) < atol
31 |
32 | def grippers_are_closed(obs, atol=1e-3):
33 | gripper_state = obs['observation'][9:11]
34 | return abs(gripper_state[0] - 0.024) < atol
35 |
36 | def grippers_are_open(obs, atol=1e-3):
37 | gripper_state = obs['observation'][9:11]
38 | return abs(gripper_state[0] - 0.05) < atol
39 |
40 | def get_pick_and_place_control(obs, relative_grasp_position=(0., 0., -0.02), workspace_height=0.1, atol=1e-3, gain=10):
41 | """
42 | Returns
43 | -------
44 | action : [float] * 4
45 | """
46 | gripper_position = obs['observation'][:3]
47 | block_position = obs['observation'][3:6]
48 | place_position = obs['desired_goal']
49 |
50 | # If the gripper is already grasping the block
51 | if block_is_grasped(obs, relative_grasp_position, atol=atol):
52 |
53 | # If the block is already at the place position, do nothing except keep the gripper closed
54 | if np.sum(np.subtract(block_position, place_position)**2) < atol:
55 | if DEBUG:
56 | print("The block is already at the place position; do nothing")
57 | return np.array([0., 0., 0., -1.])
58 |
59 | # Move to the place position while keeping the gripper closed
60 | target_position = np.add(place_position, relative_grasp_position)
61 | target_position[2] += workspace_height/2.
62 | if DEBUG:
63 | print("Move to above the place position")
64 | return get_move_action(obs, target_position, atol=atol, close_gripper=True, gain=gain)
65 |
66 | # If the block is ready to be grasped
67 | if block_inside_grippers(obs, relative_grasp_position, atol=atol):
68 |
69 | # Close the grippers
70 | if DEBUG:
71 | print("Close the grippers")
72 | return np.array([0., 0., 0., -1.])
73 |
74 | # If the gripper is above the block
75 | if (gripper_position[0] - block_position[0])**2 + (gripper_position[1] - block_position[1])**2 < atol:
76 |
77 | # If the grippers are closed, open them
78 | if not grippers_are_open(obs, atol=atol):
79 | if DEBUG:
80 | print("Open the grippers")
81 | return np.array([0., 0., 0., 1.])
82 |
83 | # Move down to grasp
84 | target_position = np.add(block_position, relative_grasp_position)
85 | if DEBUG:
86 | print("Move down to grasp")
87 | return get_move_action(obs, target_position, atol=atol, gain=gain)
88 |
89 |
90 | # Else move the gripper to above the block
91 | target_position = np.add(block_position, relative_grasp_position)
92 | target_position[2] += workspace_height
93 | if DEBUG:
94 | print("Move to above the block")
95 | return get_move_action(obs, target_position, atol=atol, gain=gain)
96 |
97 |
98 |
--------------------------------------------------------------------------------
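`get_move_action()` is a proportional controller on the gripper position. A toy rollout with a fake observation dict (the gain and the pretend integration step `dt` are made up) shows the distance to the target shrinking each step:

```
import numpy as np

def get_move_action(observation, target_position, gain=10., close_gripper=False):
    # Same proportional rule as above, without the unused atol argument.
    current_position = observation['observation'][:3]
    action = gain * np.subtract(target_position, current_position)
    return np.hstack((action, -1. if close_gripper else 0.))

obs = {'observation': np.array([0.0, 0.0, 0.5])}
target = np.array([0.1, 0.0, 0.6])
dt = 0.01   # pretend the simulator integrates action * dt

for _ in range(3):
    action = get_move_action(obs, target)
    obs['observation'][:3] += action[:3] * dt
    print(round(float(np.linalg.norm(target - obs['observation'][:3])), 4))
# 0.1273, 0.1146, 0.1031 -- the error decays geometrically
```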
/common/randomizer/residual_envs/residual_fetch_push_env.py:
--------------------------------------------------------------------------------
1 | from randomizer.controllers.miscalibrated_push_controller import get_push_control
2 |
3 | from gym.envs.robotics import FetchPushEnv
4 |
5 | import gym
6 | from gym import error, spaces, utils
7 | from gym.utils import seeding
8 |
9 | import numpy as np
10 | import time
11 |
12 |
13 | class ResidualSlipperyPushEnv(gym.Env):
14 |
15 | def __init__(self, *args, **kwargs):
16 | self.fetch_env = gym.make("FetchPush-v1")
17 |
18 | for i in range(len(self.fetch_env.env.sim.model.geom_friction)):
19 | self.fetch_env.env.sim.model.geom_friction[i] = [18e-2, 5.e-3, 1e-4]
20 |
21 | self.metadata = self.fetch_env.metadata
22 | self.hardcoded_controller = None
23 | self.action_space = self.fetch_env.action_space
24 | self.observation_space = self.fetch_env.observation_space
25 |
26 |
27 | def step(self, residual_action):
28 | residual_action = 2. * residual_action
29 |
30 | action = np.add(residual_action, get_push_control(self._last_observation))
31 | action = np.clip(action, -1, 1)
32 | observation, reward, done, debug_info = self.fetch_env.step(action)
33 | self._last_observation = observation
34 |
35 | return observation, reward, done, debug_info
36 |
37 | def reset(self):
38 | observation = self.fetch_env.reset()
39 | self._last_observation = observation
40 | return observation
41 |
42 | def seed(self, seed=0):
43 | return self.fetch_env.seed(seed=seed)
44 |
45 | def render(self, mode="human", *args, **kwargs):
46 | # See https://github.com/openai/gym/issues/1081
47 | self.fetch_env.env._render_callback()
48 | if mode == 'rgb_array':
49 | self.fetch_env.env._get_viewer(mode=mode).render()
50 | width, height = 3350, 1800
51 | data = self.fetch_env.env._get_viewer().read_pixels(width, height, depth=False)
52 | # original image is upside-down, so flip it
53 | return data[::-1, :, :]
54 | elif mode == 'human':
55 | self.fetch_env.env._get_viewer().render()
56 |
57 | return self.fetch_env.render(*args, **kwargs)
58 |
59 | def compute_reward(self, *args, **kwargs):
60 | return self.fetch_env.compute_reward(*args, **kwargs)
61 |
62 | class SlipperyPushEnv(gym.Env):
63 |
64 | def __init__(self, *args, **kwargs):
65 | self.fetch_env = gym.make("FetchPush-v1")
66 |
67 | for i in range(len(self.fetch_env.env.sim.model.geom_friction)):
68 | self.fetch_env.env.sim.model.geom_friction[i] = [18e-2, 5.e-3, 1e-4]
69 |
70 | self.metadata = self.fetch_env.metadata
71 |
72 | self.action_space = self.fetch_env.action_space
73 | self.observation_space = self.fetch_env.observation_space
74 |
75 | def step(self, action):
76 | return self.fetch_env.step(action)
77 |
78 | def reset(self):
79 | return self.fetch_env.reset()
80 |
81 | def seed(self, seed=0):
82 | return self.fetch_env.seed(seed=seed)
83 |
84 | def render(self, mode="human", *args, **kwargs):
85 | # See https://github.com/openai/gym/issues/1081
86 | self.fetch_env.env._render_callback()
87 | if mode == 'rgb_array':
88 | self.fetch_env.env._get_viewer().render()
89 | width, height = 3350, 1800
90 | data = self.fetch_env.env._get_viewer().read_pixels(width, height, depth=False)
91 | # original image is upside-down, so flip it
92 | return data[::-1, :, :]
93 | elif mode == 'human':
94 | self.fetch_env.env._get_viewer().render()
95 |
96 | return self.fetch_env.render(*args, **kwargs)
97 |
98 | def compute_reward(self, *args, **kwargs):
99 | return self.fetch_env.compute_reward(*args, **kwargs)
100 |
101 |
--------------------------------------------------------------------------------
/common/adr/adr.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from common.discriminator.discriminator_rewarder import DiscriminatorRewarder
4 | from common.svpg.svpg import SVPG
5 |
6 | class ADR:
7 | def __init__(
8 | self,
9 | nparticles,
10 | nparams,
11 | state_dim,
12 | action_dim,
13 | temperature,
14 | svpg_rollout_length,
15 | svpg_horizon,
16 | max_step_length,
17 | reward_scale,
18 | initial_svpg_steps,
19 | seed,
20 | discriminator_batchsz
21 | ):
22 | assert nparticles > 2
23 |
24 | self.nparticles = nparticles
25 | self.nparams = nparams
26 |
27 | self.svpg_rollout_length = svpg_rollout_length
28 | self.svpg_horizon = svpg_horizon
29 | self.initial_svpg_steps = initial_svpg_steps
30 |
31 | self.seed = seed
32 | self.svpg_timesteps = 0
33 |
34 | self.discriminator_rewarder = DiscriminatorRewarder(
35 | state_dim=state_dim,
36 | action_dim=action_dim,
37 | discriminator_batchsz=discriminator_batchsz,
38 | reward_scale=reward_scale,
39 | )
40 |
41 | self.svpg = SVPG(
42 | nparticles=nparticles,
43 | nparams=self.nparams,
44 | max_step_length=max_step_length,
45 | svpg_rollout_length=svpg_rollout_length,
46 | svpg_horizon=svpg_horizon,
47 | temperature=temperature,
48 | )
49 |
50 | self.parameter_settings = np.ones(
51 | (self.nparticles,
52 | self.svpg_horizon,
53 | self.svpg.svpg_rollout_length,
54 | self.svpg.nparams)
55 | ) * -1
56 |
57 | def score_trajectories(self, randomized_trajectories):
58 | rewards = np.zeros((self.nparticles, self.svpg.svpg_rollout_length))
59 |
60 | for i in range(self.nparticles):
61 | for t in range(self.svpg.svpg_rollout_length):
62 | # flatten and combine all randomized and reference trajectories for discriminator
63 | randomized_discrim_score_mean, _, _ = \
64 | self.discriminator_rewarder.get_score(randomized_trajectories[i][t])
65 |
66 | rewards[i][t] = randomized_discrim_score_mean
67 |
68 | return rewards
69 |
70 | def step_particles(self):
71 | if self.svpg_timesteps >= self.initial_svpg_steps:
72 | # Get sim instances from SVPG policy
73 | simulation_instances = self.svpg.step()
74 |
75 | index = self.svpg_timesteps % self.svpg_horizon
76 | self.parameter_settings[:, index, :, :] = simulation_instances
77 |
78 | else:
79 | # Creates completely randomized environment
80 | simulation_instances = np.ones((self.nparticles,
81 | self.svpg.svpg_rollout_length,
82 | self.svpg.nparams)) * -1
83 |
84 | assert (self.nparticles, self.svpg.svpg_rollout_length, self.svpg.nparams) \
85 | == simulation_instances.shape
86 |
87 | # Reshape to work with vectorized environments
88 | simulation_instances = np.transpose(simulation_instances, (1, 0, 2))
89 |
90 | self.svpg_timesteps += 1
91 | return simulation_instances
92 |
93 | def train(self, reference_trajectories, randomized_trajectories):
94 | rewards = self.score_trajectories(randomized_trajectories)
95 | self._train_particles(rewards)
96 | self._train_discriminator(reference_trajectories, randomized_trajectories)
97 |
98 | def _train_discriminator(self, reference_trajectories, randomized_trajectories):
99 | flattened_randomized = [randomized_trajectories[i][t] for i in range(self.nparticles) for t in range(self.svpg.svpg_rollout_length)]
100 | flattened_randomized = np.concatenate(flattened_randomized)
101 |
102 | flattened_reference = [reference_trajectories[i][t] for i in range(self.nparticles) for t in range(self.svpg.svpg_rollout_length)]
103 | flattened_reference = np.concatenate(flattened_reference)
104 |
105 | self.discriminator_rewarder.train_discriminator(
106 | flattened_reference,
107 | flattened_randomized,
108 | iterations=len(flattened_randomized)
109 | )
110 |
111 | def _train_particles(self, rewards):
112 | self.svpg.train(rewards)
113 |
114 |
--------------------------------------------------------------------------------
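A hedged sketch of driving the `ADR` object (the constructor keywords mirror how `adr.py` itself builds `SVPG` and the discriminator, so this assumes those modules accept them; the dimensions are illustrative). Before `initial_svpg_steps` have elapsed, `step_particles()` returns all `-1` settings, i.e. "fully random" for the environment wrapper:

```
import numpy as np
from common.adr.adr import ADR

adr = ADR(nparticles=3, nparams=1, state_dim=8, action_dim=2,
          temperature=10.0, svpg_rollout_length=5, svpg_horizon=25,
          max_step_length=0.05, reward_scale=1.0, initial_svpg_steps=1,
          seed=123, discriminator_batchsz=64)

settings = adr.step_particles()
print(settings.shape)        # (svpg_rollout_length, nparticles, nparams) = (5, 3, 1)
print(np.unique(settings))   # [-1.] -- fully-random placeholder until SVPG takes over
```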
/common/randomizer/assets/shared.xml:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/common/svpg/svpg_utils.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import os
4 |
5 |
6 | def parameters_to_vector(parameters, grad=False, both=False):
7 | """Convert parameters or/and their gradients to one vector
8 | Arguments:
9 | parameters (Iterable[Variable]): an iterator of Variables that are the
10 | parameters of a model.
11 | grad (bool): Vectorizes gradients if true, otherwise vectorizes params
12 | both (bool): If True, vectorizes both parameters and their gradients,
13 | `grad` has no effect in this case. Otherwise vectorizes parameters
14 | or gradients according to `grad`.
15 | Returns:
16 | The parameters or/and their gradients (each) represented by a single
17 | vector (th.Tensor, not Variable)
18 | """
19 | # Flag for the device where the parameter is located
20 | param_device = None
21 |
22 | if not both:
23 | vec = []
24 | if not grad:
25 | for param in parameters:
26 | # Ensure the parameters are located in the same device
27 | param_device = _check_param_device(param, param_device)
28 | vec.append(param.data.view(-1))
29 | else:
30 | for param in parameters:
31 | param_device = _check_param_device(param, param_device)
32 | vec.append(param.grad.data.view(-1))
33 | return torch.cat(vec)
34 | else:
35 | vec_params, vec_grads = [], []
36 | for param in parameters:
37 | param_device = _check_param_device(param, param_device)
38 | vec_params.append(param.data.view(-1))
39 | vec_grads.append(param.grad.data.view(-1))
40 | return torch.cat(vec_params), torch.cat(vec_grads)
41 |
42 | def vector_to_parameters(vec, parameters, grad=True):
43 | """Convert one vector to the parameters or gradients of the parameters
44 | Arguments:
45 | vec (torch.Tensor): a single vector represents the parameters of a model.
46 | parameters (Iterable[Variable]): an iterator of Variables that are the
47 | parameters of a model.
48 | grad (bool): True for assigning de-vectorized `vec` to gradients
49 | """
50 | # Ensure vec is a torch.Tensor
51 | if not isinstance(vec, torch.Tensor):
52 | raise TypeError('expected torch.Tensor, but got: {}'
53 | .format(torch.typename(vec)))
54 | # Flag for the device where the parameter is located
55 | param_device = None
56 |
57 | # Pointer for slicing the vector for each parameter
58 | pointer = 0
59 | if grad:
60 | for param in parameters:
61 | # Ensure the parameters are located in the same device
62 | param_device = _check_param_device(param, param_device)
63 | # The length of the parameter
64 | num_param = torch.prod(torch.LongTensor(list(param.size())))
65 | param.grad.data = vec[pointer:pointer + num_param].view(
66 | param.size())
67 | # Increment the pointer
68 | pointer += num_param
69 | else:
70 | for param in parameters:
71 | # Ensure the parameters are located in the same device
72 | param_device = _check_param_device(param, param_device)
73 | # The length of the parameter
74 | num_param = torch.prod(torch.LongTensor(list(param.size())))
75 | param.data = vec[pointer:pointer + num_param].view(
76 | param.size())
77 | # Increment the pointer
78 | pointer += num_param
79 |
80 |
81 | def _check_param_device(param, old_param_device):
82 | """This helper function is to check if the parameters are located
83 | in the same device. Currently, the conversion between model parameters
84 | and single vector form is not supported for multiple allocations,
85 | e.g. parameters in different GPUs, or mixture of CPU/GPU.
86 | Arguments:
87 | param ([Variable]): a Variable of a parameter of a model
88 | old_param_device (int): the device where the first parameter of a
89 | model is allocated.
90 | Returns:
91 | old_param_device (int): report device for the first time
92 | """
93 |
94 | # Meet the first parameter
95 | if old_param_device is None:
96 | old_param_device = param.get_device() if param.is_cuda else -1
97 | else:
98 | warn = False
99 | if param.is_cuda: # Check if in same GPU
100 | warn = (param.get_device() != old_param_device)
101 | else: # Check if in CPU
102 | warn = (old_param_device != -1)
103 | if warn:
104 | raise TypeError('Found two parameters on different devices, '
105 | 'this is currently not supported.')
106 | return old_param_device
--------------------------------------------------------------------------------
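A round-trip check for the two helpers above: flatten a model's parameters into one vector, then write a vector back (`grad=False` writes `.data` rather than `.grad`). This assumes the `torch.Tensor` type check above and that the repository root is importable:

```
import torch
import torch.nn as nn
from common.svpg.svpg_utils import parameters_to_vector, vector_to_parameters

model = nn.Linear(3, 2)
vec = parameters_to_vector(model.parameters())
print(vec.shape)   # torch.Size([8]) -- 3*2 weights + 2 biases

vector_to_parameters(torch.zeros_like(vec), model.parameters(), grad=False)
print(model.weight.abs().sum().item())   # 0.0 after writing the zero vector back
```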
/arguments.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | """
4 | Here are the parameters for training
5 |
6 | """
7 | def get_args():
8 | parser = argparse.ArgumentParser()
9 | # the environment setting
10 | parser.add_argument('--n-epochs', type=int, default=50, help='the number of epochs to train the agent')
11 | parser.add_argument('--n-cycles', type=int, default=50, help='the times to collect samples per epoch')
12 | parser.add_argument('--n-batches', type=int, default=40, help='the times to update the network')
13 | parser.add_argument('--save-interval', type=int, default=5, help='the interval at which to save the trajectory')
14 | parser.add_argument('--seed', type=int, default=123, help='random seed')
15 | parser.add_argument('--num-workers', type=int, default=1, help='the number of cpus to collect samples')
16 | parser.add_argument('--replay-strategy', type=str, default='future', help='the HER strategy')
17 | parser.add_argument('--clip-return', type=float, default=50, help='if clip the returns')
18 | parser.add_argument('--save-dir', type=str, default='saved_models/', help='the path to save the models')
19 | parser.add_argument('--noise-eps', type=float, default=0.2, help='noise eps')
20 | parser.add_argument('--random-eps', type=float, default=0.3, help='random eps')
21 | parser.add_argument('--buffer-size', type=int, default=int(1e6), help='the size of the buffer')
22 | parser.add_argument('--replay-k', type=int, default=4, help='ratio to be replaced (HER)')
23 | parser.add_argument('--clip-obs', type=float, default=200, help='the clipping value for observations')
24 | parser.add_argument('--batch-size', type=int, default=256, help='the sample batch size')
25 | parser.add_argument('--gamma', type=float, default=0.98, help='the discount factor')
26 | parser.add_argument('--action-l2', type=float, default=1, help='l2 reg')
27 | parser.add_argument('--lr-actor', type=float, default=0.001, help='the learning rate of the actor')
28 | parser.add_argument('--lr-critic', type=float, default=0.001, help='the learning rate of the critic')
29 | parser.add_argument('--polyak', type=float, default=0.95, help='the average coefficient')
30 | parser.add_argument('--n-test-rollouts', type=int, default=10, help='the number of tests')
31 | parser.add_argument('--clip-range', type=float, default=5, help='the clip range')
32 | parser.add_argument('--demo-length', type=int, default=20, help='the demo length')
33 | parser.add_argument('--cuda', action='store_true', help='use GPU acceleration if set')
34 | parser.add_argument('--num-rollouts-per-mpi', type=int, default=10, help='the rollouts per mpi')
35 | parser.add_argument('--sp-polyak', type=float, default=0.1, help='Polyak Averaging Coefficient')
36 | parser.add_argument('--sp-gamma', type=float, default=0.1, help='Self play gamma')
37 | parser.add_argument('--sp-percent', type=float, default=0.1, help='Self Play Percentage')
38 | parser.add_argument('--friction', type=float, default=0.18, help='friction parameter to set')
39 | parser.add_argument('--approach', type=str, default='baseline', help='Different approaches for experiments')
40 | parser.add_argument('--svpg-rollout-length', type=int, default=5)
41 | parser.add_argument('--nmpi', type=int, default=8) # TODO!
42 | parser.add_argument('--mode', type=str, default='default')
43 | parser.add_argument('--window-size', type=int, default=20)
44 | parser.add_argument("--policy-name", default="OurDDPG") # Policy name
45 | parser.add_argument("--env-name", default="ErgoPushRandomizedEnv-Headless-v0") # OpenAI gym environment name
46 | parser.add_argument("--start-timesteps", default=1e4,
47 | type=int) # How many time steps purely random policy is run for
48 | parser.add_argument("--eval-freq", default=5e3, type=float) # How often (time steps) we evaluate
49 | parser.add_argument("--max-timesteps", default=1e6, type=float) # Max time steps to run environment for
50 | parser.add_argument("--save-models", default=True, action="store_true") # Whether or not models are saved
51 | parser.add_argument("--expl-noise", default=0.1, type=float) # Std of Gaussian exploration noise
52 | parser.add_argument("--discount", default=0.99, type=float) # Discount factor
53 | parser.add_argument("--tau", default=0.005, type=float) # Target network update rate
54 | parser.add_argument("--policy-noise", default=0.2, type=float) # Noise added to target policy during critic update
55 | parser.add_argument("--noise-clip", default=0.5, type=float) # Range to clip target policy noise
56 | parser.add_argument("--policy-freq", default=2, type=int) # Frequency of delayed policy updates
57 | parser.add_argument("--nparticles", default=1, type=int)
58 | parser.add_argument('--n-params', type=int, default=1)
59 | parser.add_argument('--only-sp', type=bool, default=False)
60 |
61 |
62 | # Add more arguments if needed.
63 |
64 | args = parser.parse_args()
65 |
66 | return args
67 |
--------------------------------------------------------------------------------
/common/randomizer/controllers/hook_controller.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | DEBUG = False
4 |
5 | def get_move_action(observation, target_position, atol=1e-3, gain=10., close_gripper=False):
6 | """
7 |     Move the end effector toward a target position.
8 | """
9 |     # Get the current end-effector position
10 | current_position = observation['observation'][:3]
11 |
12 | action = gain * np.subtract(target_position, current_position)
13 | if close_gripper:
14 | gripper_action = -1.
15 | else:
16 | gripper_action = 0.
17 | action = np.hstack((action, gripper_action))
18 |
19 | return action
20 |
21 | def block_is_grasped(obs, gripper_position, block_position, relative_grasp_position, atol=1e-3):
22 | block_inside = block_inside_grippers(gripper_position, block_position, relative_grasp_position, atol=atol)
23 | grippers_closed = grippers_are_closed(obs, atol=atol)
24 |
25 | return block_inside and grippers_closed
26 |
27 | def block_inside_grippers(gripper_position, block_position, relative_grasp_position, atol=1e-3):
28 | relative_position = np.subtract(gripper_position, block_position)
29 |
30 | return np.sum(np.subtract(relative_position, relative_grasp_position)**2) < atol
31 |
32 | def grippers_are_closed(obs, atol=1e-3):
33 | gripper_state = obs['observation'][9:11]
34 | return abs(gripper_state[0]) - 0.024 <= atol
35 |
36 | def grippers_are_open(obs, atol=1e-3):
37 | gripper_state = obs['observation'][9:11]
38 |
39 | return abs(gripper_state[0] - 0.05) <= atol
40 |
41 |
42 | def pick_at_position(obs, block_position, place_position, relative_grasp_position=(0., 0., -0.02), workspace_height=0.1, atol=1e-3):
43 | """
44 | Returns
45 | -------
46 | action : [float] * 4
47 | """
48 | gripper_position = obs['observation'][:3]
49 |
50 | # If the gripper is already grasping the block
51 | if block_is_grasped(obs, gripper_position, block_position, relative_grasp_position, atol=atol):
52 |
53 | # If the block is already at the place position, do nothing except keep the gripper closed
54 | if np.sum(np.subtract(block_position, place_position)**2) < atol:
55 | if DEBUG:
56 | print("The block is already at the place position; do nothing")
57 | return np.array([0., 0., 0., -1.])
58 |
59 | # Move to the place position while keeping the gripper closed
60 | target_position = np.add(place_position, relative_grasp_position)
61 | target_position[2] += workspace_height/2.
62 | if DEBUG:
63 | print("Move to above the place position")
64 | return get_move_action(obs, target_position, atol=atol, close_gripper=True)
65 |
66 | # If the block is ready to be grasped
67 | if block_inside_grippers(gripper_position, block_position, relative_grasp_position, atol=atol):
68 |
69 | # Close the grippers
70 | if DEBUG:
71 | print("Close the grippers")
72 | return np.array([0., 0., 0., -1.])
73 |
74 | # If the gripper is above the block
75 | target_position = np.add(block_position, relative_grasp_position)
76 | if (gripper_position[0] - target_position[0])**2 + (gripper_position[1] - target_position[1])**2 < atol:
77 |
78 | # If the grippers are closed, open them
79 | if not grippers_are_open(obs, atol=atol):
80 | if DEBUG:
81 | print("Open the grippers")
82 | return np.array([0., 0., 0., 1.])
83 |
84 | # Move down to grasp
85 | if DEBUG:
86 | print("Move down to grasp")
87 | return get_move_action(obs, target_position, atol=atol)
88 |
89 |
90 | # Else move the gripper to above the block
91 | target_position[2] += workspace_height
92 | if DEBUG:
93 | print("Move to above the block")
94 | return get_move_action(obs, target_position, atol=atol)
95 |
96 |
97 | def get_hook_control(obs, atol=1e-2):
98 | """
99 | Returns
100 | -------
101 | action : [float] * 4
102 | """
103 | gripper_position = obs['observation'][:3]
104 | block_position = obs['observation'][3:6]
105 | hook_position = obs['observation'][25:28]
106 | place_position = obs['desired_goal']
107 |
108 | # Done
109 | if abs(block_position[0] - place_position[0]) + abs(block_position[1] - place_position[1]) <= 1e-2:
110 | if DEBUG:
111 | print("DONE")
112 | return np.array([0., 0., 0., -1.])
113 |
114 | # Grasp and lift the hook
115 | if not block_is_grasped(obs, gripper_position, hook_position, relative_grasp_position=(0., 0., -0.05), atol=atol):
116 | if DEBUG:
117 | print("Grasping and lifting the hook")
118 | hook_target = hook_position.copy()
119 | hook_target[2] = 0.5
120 | return pick_at_position(obs, hook_position, hook_target, relative_grasp_position=(0., 0., -0.05))
121 |
122 | # Align the hook to sweep
123 | hook_target = np.array([block_position[0] - 0.5, block_position[1] - 0.05, 0.45])
124 |
125 | if hook_position[0] >= hook_target[0] + 0.1 or hook_position[1] + 0.1 <= hook_target[1]:
126 | if DEBUG:
127 | print("Aligning to sweep", hook_position[0] + atol, hook_target[0], hook_position[1] + atol, hook_target[1])
128 | return get_move_action(obs, hook_target, close_gripper=True)
129 |
130 | if DEBUG:
131 | print("Sweeping back")
132 |
133 | direction = np.subtract(place_position, block_position)
134 | direction = direction[:2] / np.linalg.norm(direction[:2])
135 |
136 | return np.array([0.4 * direction[0], 0.4 * direction[1], 0., -1.])
137 |
138 |
139 |
140 |
141 |
142 |
143 |
--------------------------------------------------------------------------------
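A toy check of `get_hook_control()` on a hand-built observation dict. The index layout follows the slicing in the function; the values are made up rather than taken from a real FetchHook rollout, so treat this as a shape/branch sanity check only:

```
import numpy as np
from common.randomizer.controllers.hook_controller import get_hook_control

obs = {'observation': np.zeros(28), 'desired_goal': np.array([1.0, 0.6, 0.4])}
obs['observation'][:3] = [1.0, 0.5, 0.5]      # gripper position
obs['observation'][3:6] = [1.3, 0.8, 0.4]     # block position
obs['observation'][9:11] = [0.05, 0.05]       # finger joints (open)
obs['observation'][25:28] = [1.1, 0.6, 0.4]   # hook position

action = get_hook_control(obs)
print(np.round(action, 2))   # approx [1., 1., -0.5, 0.]: move above the hook, gripper neutral
```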
/realrobot_evals/realrobot_test.py:
--------------------------------------------------------------------------------
1 | import h5py
2 | import torch
3 | import time
4 | import numpy as np
5 | import gym
6 | import argparse
7 | import os
8 | import os.path as osp
9 | from tqdm import tqdm, trange
10 |
11 | from common.agents.ddpg.ddpg import DDPG
12 | import gym_ergojr
13 |
14 | parser = argparse.ArgumentParser(description='Real Robot Experiment Driver')
15 |
16 | parser.add_argument('--nepisodes', type=int, default=25, help='Number of trials per *seed*')
17 | parser.add_argument('--experiment-prefix', type=str, default='real', help='Prefix to append to logs')
18 | parser.add_argument('--log-dir', type=str, default='/data/fgolemo/UADR-results/real-robot', help='Log Directory Prefix')
19 | parser.add_argument('--model-dir', type=str, default='saved_models/', help='Model Directory Prefix') # TODO
20 | # e.g. pass --cont 190329-180631 to resume that run's results file
21 | parser.add_argument('--cont', type=str, default='', help='To continue an existing results file, enter its timestamp here')
22 |
23 | args = parser.parse_args()
24 |
25 | if len(args.cont) == 0:
26 | TIMESTAMP = time.strftime("%y%m%d-%H%M%S")
27 | file_flag = "w"
28 |
29 | else:
30 | TIMESTAMP = args.cont
31 | file_flag = "r+"
32 |
33 | file_path = "{}/{}-{}.hdf5".format(args.log_dir, args.experiment_prefix, TIMESTAMP)
34 |
35 | MAX_EPISODE_STEPS = 100
36 | EPISODES = args.nepisodes
37 | SEED = ["31", "32", "33", "34", "35"]
38 | # Policies to look for
39 | policies = ['udr', 'adr-1.0', 'baseline-sp-1.0']
40 |
41 | env_name = "ErgoReacher-MultiGoal-Live-v1"
42 | npa = np.array
43 |
44 | img_buffer = []
45 |
46 | if not osp.exists(args.log_dir):
47 | os.makedirs(args.log_dir)
48 |
49 |
50 | env = gym.make(env_name)
51 | state_dim = env.observation_space.shape[0]
52 | action_dim = env.action_space.shape[0]
53 |
54 | n_episodes = 3 # num of episodes to run
55 | max_timesteps = 100 # max timesteps in one episode
56 | render = True # render the environment
57 | save_gif = False # png images are saved in gif folder
58 |
59 | state_dim = env.observation_space.shape[0]
60 | action_dim = env.action_space.shape[0]
61 | max_action = float(env.action_space.high[0])
62 | goal_dim = 2
63 |
64 | policy = DDPG(args, state_dim, action_dim, max_action, goal_dim)
65 |
66 | obs = env.reset()
67 |
68 |
69 | with h5py.File(file_path, file_flag) as f:
70 | for policy_type in tqdm(policies, desc="approaches"):
71 | for torque_idx, torque in enumerate(tqdm(SEED, desc="torques...")):
72 |
73 | model_path = osp.join(args.model_dir, policy_type, f"Variant-{torque}")
74 | print(model_path)
75 | no_models = len(os.listdir(model_path))
76 |
77 | if policy_type not in f: # if dataset doesn't have these tables
78 | log_group = f.create_group(policy_type)
79 | rewards = log_group.create_dataset("rewards", (no_models, len(SEED), EPISODES, MAX_EPISODE_STEPS),
80 | dtype=np.float32)
81 | distances = log_group.create_dataset("distances", (no_models, len(SEED), EPISODES, MAX_EPISODE_STEPS),
82 | dtype=np.float32)
83 | trajectories = log_group.create_dataset("trajectories",
84 | (no_models, len(SEED), EPISODES, MAX_EPISODE_STEPS, 24),
85 | dtype=np.float32)
86 | imgs = log_group.create_dataset("images",
87 | (no_models, len(SEED), EPISODES, MAX_EPISODE_STEPS, 480, 640, 3),
88 | dtype=np.uint8, compression="lzf")
89 | else: # if tables are in dataset, grab their pointers
90 | rewards = f.get(f"/{policy_type}/rewards")
91 | distances = f.get(f"/{policy_type}/distances")
92 | trajectories = f.get(f"/{policy_type}/trajectories")
93 | imgs = f.get(f"/{policy_type}/images")
94 |
95 | tqdm.write(f'Starting analysis of {policy_type} - variant {torque}')
96 |
97 | for model_idx, actorpth in enumerate(tqdm(os.listdir(model_path), desc="models....")):
98 | # Load model weights
99 | policy.load(os.path.join(model_path, actorpth))
100 |
101 | for ep_num in trange(EPISODES, desc="episodes.."):
102 | non_zero_steps = np.count_nonzero(trajectories[model_idx, torque_idx, ep_num], axis=1)
103 |
104 | if np.count_nonzero(non_zero_steps) == 0:
105 | obs = env.reset()
106 | done = False
107 | cumulative = 0
108 | counter = 0
109 | img_buffer = []
110 | while counter < MAX_EPISODE_STEPS:
111 | with torch.no_grad():
112 | # print(counter, obs)
113 | action = policy.select_action(np.array(obs))
114 |
115 | nobs, reward, _, misc = env.step(action)
116 | cumulative += reward
117 |
118 | trajectories[model_idx, torque_idx, ep_num, counter, :] = np.concatenate(
119 | [obs, action, nobs])
120 | rewards[model_idx, torque_idx, ep_num, counter] = reward
121 | distances[model_idx, torque_idx, ep_num, counter] = misc["distance"]
122 | img_buffer.append(np.copy(misc["img"]))
123 |
124 | obs = np.copy(nobs)
125 | counter += 1
126 |
127 | imgs[model_idx, torque_idx, ep_num, :counter, :, :, :] = img_buffer
128 |
129 | f.flush()
130 |
131 | env.reset()
132 |
--------------------------------------------------------------------------------
/common/randomizer/__init__.py:
--------------------------------------------------------------------------------
1 | from gym.envs.registration import register
2 | import os.path as osp
3 |
4 |
5 | register(
6 | id='FetchPushRandomizedEnv-v0',
7 | entry_point='common.randomizer.randomized_fetchpush:FetchPushRandomizedEnv',
8 | max_episode_steps=50,
9 | kwargs={
10 | 'config': 'common/randomizer/config/FetchPushRandomized/random.json',
11 | 'xml_name': 'push.xml'
12 | }
13 | )
14 | register(
15 | id='FetchSlideRandomizedEnv-v0',
16 | entry_point='common.randomizer.randomized_fetchslide:FetchSlideRandomizedEnv',
17 | max_episode_steps=50,
18 | kwargs={
19 | 'config': 'common/randomizer/config/FetchSlideRandomized/random.json',
20 | 'xml_name': 'slide.xml'
21 | }
22 | )
23 |
24 | register(
25 | id='FetchHookRandomizedEnv-v0',
26 | entry_point='common.randomizer.randomized_residual_hook:FetchHookEnv',
27 | max_episode_steps=100,
28 | kwargs={
29 | 'config': 'common/randomizer/config/FetchHookRandomized/random.json',
30 | 'xml_name': 'hook.xml'
31 | }
32 | )
33 | register(
34 | id='FetchHookDefaultEnv-v0',
35 | entry_point='common.randomizer.randomized_residual_hook:FetchHookEnv',
36 | max_episode_steps=100,
37 | kwargs={
38 | 'config': 'common/randomizer/config/FetchHookRandomized/default.json',
39 | 'xml_name': 'hook.xml'
40 | }
41 | )
42 | register(
43 | id='NoisyFetchHookRandomizedEnv-v0',
44 | entry_point='common.randomizer.randomized_residual_hook:NoisyResidualFetchHookEnv',
45 | max_episode_steps=100,
46 | kwargs={
47 | 'config': 'common/randomizer/config/FetchHookRandomized/random.json',
48 | 'xml_name': 'hook.xml'
49 | }
50 | )
51 | register(
52 | id='NoisyFetchHookDefaultEnv-v0',
53 | entry_point='common.randomizer.randomized_residual_hook:NoisyResidualFetchHookEnv',
54 | max_episode_steps=100,
55 | kwargs={
56 | 'config': 'common/randomizer/config/FetchHookRandomized/default.json',
57 | 'xml_name': 'hook.xml'
58 | }
59 | )
60 | register(
61 | id='ErgoPushRandomizedEnv-Headless-v0',
62 | entry_point='common.randomizer.randomized_ergo_pusher:ErgoPusherRandomizedEnv',
63 | max_episode_steps=100,
64 | reward_threshold=0,
65 | kwargs={
66 | 'config': 'common/randomizer/config/ErgoPushRandomized/random.json',
67 | 'headless': True
68 | }
69 | )
70 | register(
71 | id='ErgoPushDefaultEnv-Headless-v0',
72 | entry_point='common.randomizer.randomized_ergo_pusher:ErgoPusherRandomizedEnv',
73 | max_episode_steps=100,
74 | reward_threshold=0,
75 | kwargs={
76 | 'config': 'common/randomizer/config/ErgoPushRandomized/default.json',
77 | 'headless': True
78 | }
79 | )
80 | register(
81 | id='ErgoPushHardEnv-Headless-v0',
82 | entry_point='common.randomizer.randomized_ergo_pusher:ErgoPusherRandomizedEnv',
83 | max_episode_steps=100,
84 | reward_threshold=0,
85 | kwargs={
86 | 'config': 'common/randomizer/config/ErgoPushRandomized/hard.json',
87 | 'headless': True
88 | }
89 | )
90 | register(
91 | id='ErgoReacherRandomizedEnv-Headless-v0',
92 | entry_point='common.randomizer.randomized_ergoreacher:ErgoReacherRandomizedEnv',
93 | max_episode_steps=100,
94 | reward_threshold=0,
95 | kwargs={
96 | 'config': 'common/randomizer/config/ErgoReacherRandomized/random.json',
97 | 'headless': True,
98 | 'simple': True,
99 | 'goal_halfsphere': True,
100 | 'multi_goal': True
101 | }
102 | )
103 | register(
104 | id='ErgoReacherDefaultEnv-Headless-v0',
105 | entry_point='common.randomizer.randomized_ergoreacher:ErgoReacherRandomizedEnv',
106 | max_episode_steps=100,
107 | kwargs={
108 | 'config': 'common/randomizer/config/ErgoReacherRandomized/default.json',
109 | 'headless': True,
110 | 'simple': True,
111 | 'goal_halfsphere': True,
112 | 'multi_goal': True
113 | }
114 | )
115 | register(
116 | id='ErgoReacherHardEnv-Headless-v0',
117 | entry_point='common.randomizer.randomized_ergoreacher:ErgoReacherRandomizedEnv',
118 | max_episode_steps=100,
119 | kwargs={
120 | 'config': 'common/randomizer/config/ErgoReacherRandomized/hard.json',
121 | 'headless': True,
122 | 'simple': True,
123 | 'goal_halfsphere': True,
124 | 'multi_goal': True
125 | }
126 | )
127 | register(
128 | id='ErgoReacherDefaultEnv-Graphical-v0',
129 | entry_point='common.randomizer.randomized_ergoreacher:ErgoReacherRandomizedEnv',
130 | max_episode_steps=100,
131 | kwargs={
132 | 'config': 'common/randomizer/config/ErgoReacherRandomized/hard.json',
133 | 'headless': False,
134 | 'simple': True,
135 | 'goal_halfsphere': True,
136 | 'multi_goal': True
137 | }
138 | )
139 | register(
140 | id='ErgoPushRandomizedEnv-Graphical-v0',
141 | entry_point='common.randomizer.randomized_ergo_pusher:ErgoPusherRandomizedEnv',
142 | max_episode_steps=100,
143 | reward_threshold=0,
144 | kwargs={
145 | 'config': 'common/randomizer/config/ErgoPushRandomized/random.json',
146 | 'headless': False
147 | }
148 | )
149 | register(
150 | id='ErgoPushHardEnv-Graphical-v0',
151 | entry_point='common.randomizer.randomized_ergo_pusher:ErgoPusherRandomizedEnv',
152 | max_episode_steps=100,
153 | reward_threshold=0,
154 | kwargs={
155 | 'config': 'common/randomizer/config/ErgoPushRandomized/hard.json',
156 | 'headless': False
157 | }
158 | )
159 |
160 | register(
161 | id='ResFetchPushRandomizedEnv-v0',
162 | entry_point='common.randomizer.randomized_residual_push:ResidualPushRandomizedEnv',
163 | max_episode_steps=50,
164 | kwargs={
165 | 'config': 'common/randomizer/config/FetchPushRandomized/random.json',
166 | 'xml_name': 'push.xml'
167 | }
168 | )
169 | register(
170 | id='ResFetchPushDefaultEnv-v0',
171 | entry_point='common.randomizer.randomized_residual_push:ResidualPushRandomizedEnv',
172 | max_episode_steps=50,
173 | kwargs={
174 | 'config': 'common/randomizer/config/FetchPushRandomized/default.json',
175 | 'xml_name': 'push.xml'
176 | }
177 | )
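
A minimal usage sketch of how these registered IDs are consumed (mirroring experiments/ddpg_train.py). It assumes the repository root is on PYTHONPATH and MuJoCo/pybullet are installed; N_PARAMS and the exact semantics of -1 and "default" are inferred from their use in the training script, so treat the comments as assumptions rather than documented API.

import gym
import common.randomizer  # importing the package executes the register() calls above
from common.randomizer.wrappers import RandomizedEnvWrapper

N_PARAMS = 1  # assumed number of randomized dimensions in the chosen random.json config

env = RandomizedEnvWrapper(gym.make('FetchPushRandomizedEnv-v0'), seed=123)
env.reset()

# As in ddpg_train.py: -1 per dimension appears to request a freshly randomized value,
# "default" pins the dimension to its default.json setting, and values in [0, 1]
# (e.g. produced by the ADR/SVPG particles) pick a point inside the randomization range.
env.randomize([-1] * N_PARAMS)
obs, reward, done, info = env.step(env.action_space.sample())
env.randomize(["default"] * N_PARAMS)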
--------------------------------------------------------------------------------
/common/randomizer/assets/pusher.xml:
--------------------------------------------------------------------------------
[MuJoCo XML model for the randomized pusher environment; the markup is not reproduced in this listing.]
--------------------------------------------------------------------------------
/experiments/ddpg_train.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import gym
4 | import argparse
5 | import os
6 | import gym_ergojr
7 | from common.agents.ddpg.replay_buffer import ReplayBuffer
8 | from common.agents.ddpg.ddpg import DDPG
9 | import common.randomizer
10 | from common.randomizer.wrappers import RandomizedEnvWrapper
11 | from common.adr.adr import ADR
12 | import multiprocessing as mp
13 | from arguments import get_args
14 |
15 |
16 | def get_env_params(env):
17 | observation = env.reset()
18 |     # gather observation, goal and action dimensions plus the episode length
19 | params = {'obs': observation["observation"].shape[0], 'goal': observation["achieved_goal"].shape[0], 'action': env.action_space.shape[0],
20 | 'action_max': env.action_space.high[0], 'max_timesteps': env._max_episode_steps}
21 | return params
22 |
23 |
24 | args = get_args()
25 | args.save_models = True
26 | args.num_workers = 8
27 |
28 | env = gym.make(args.env_name)
29 | env_param = get_env_params(env)
30 |
31 | # jobid = os.environ['SLURM_ARRAY_TASK_ID']
32 | # seed = [40, 41, 42, 43, 44]
33 | # args.seed = seed[int(jobid) - 1] # Set seeds
34 |
35 | env.seed(args.seed)
36 | torch.manual_seed(args.seed)
37 | np.random.seed(args.seed)
38 | args.nparticles = mp.cpu_count() - 1
39 |
40 | state_dim = env_param["obs"]
41 | action_dim = env_param["action"]
42 | max_action = env_param["action_max"]
43 | goal_dim = env_param["goal"]
44 | total_timesteps = 0
45 | timesteps_since_eval = 0
46 | episode_num = 0
47 | done = True
48 | env = RandomizedEnvWrapper(env, seed=args.seed)
49 | env.reset()
50 | svpg_rewards = []
51 |
52 | # Initialize policy
53 | policy = DDPG(args, state_dim, action_dim, max_action, goal_dim)
54 |
55 | replay_buffer = ReplayBuffer(state_dim, action_dim)
56 |
57 | # ADR integration
58 | adr = ADR(
59 | nparticles=args.num_workers,
60 | nparams=args.n_params,
61 | state_dim=1,
62 | action_dim=1,
63 | temperature=10,
64 | svpg_rollout_length=args.svpg_rollout_length,
65 | svpg_horizon=25,
66 | max_step_length=0.05,
67 | reward_scale=1,
68 | initial_svpg_steps=0,
69 | seed=args.seed,
70 | discriminator_batchsz=320,
71 | )
72 | count = 0
73 | args.save_dir = os.getcwd() + '/' + os.path.join(args.save_dir,
74 | 'sp{}polyak{}'.format(args.sp_percent, args.polyak) + '-' + args.approach,
75 | str(args.seed))
76 | model_path = os.path.join(args.save_dir, args.env_name)
77 |
78 | if not os.path.isdir(model_path):
79 | os.makedirs(model_path)
80 | args.save_dir = model_path
81 | alice_envs = []
82 | alice_envs_total = []
83 |
84 | while total_timesteps < args.max_timesteps:
85 | if done:
86 | env.randomize([-1] * args.n_params) # Randomize the environment
87 | if total_timesteps != 0:
88 | print("Total T: {} Episode Num: {} Episode T: {} Reward: {}".format(total_timesteps, episode_num,
89 | episode_timesteps, episode_reward))
90 | # Reset environment
91 | obs = env.reset()
92 | done = False
93 | episode_reward = 0
94 | episode_timesteps = 0
95 | episode_num += 1
96 |
97 | env_settings = adr.step_particles()
98 | env_settings = np.ascontiguousarray(env_settings)
99 | count = 0
100 | svpg_index = total_timesteps % args.svpg_rollout_length
101 |
102 | if np.random.random() < args.sp_percent: # Self-play loop
103 | env.randomize(["default"] * args.n_params)
104 |
105 | bobs_goal_state, alice_time = policy.alice_loop(args, env, env_param) # Alice Loop
106 |
107 | if total_timesteps % int(1e4) == 0:
108 | alice_envs_total.append(alice_envs)
109 | alice_envs = []
110 |
111 | if not args.only_sp:
112 | multiplier = np.clip(env_settings[svpg_index][0][:args.n_params], 0, 1.0)
113 | alice_envs.append(multiplier)
114 | env.randomize(multiplier) # Randomize the env for bob
115 | bob_time, done = policy.bob_loop(env, env_param, bobs_goal_state, alice_time, replay_buffer) # Bob Loop
116 | alice_reward = policy.train_alice(alice_time, bob_time) # Train alice
117 | svpg_rewards.append(alice_reward)
118 | if len(svpg_rewards) == args.num_workers * args.svpg_rollout_length: # ADR training
119 | all_rewards = np.reshape(np.asarray(svpg_rewards), (args.num_workers, args.svpg_rollout_length))
120 | adr._train_particles(all_rewards)
121 | svpg_rewards = []
122 | else:
123 | env.randomize(["default"] * args.n_params)
124 | bob_time, done = policy.bob_loop(env, env_param, bobs_goal_state, alice_time, replay_buffer) # Bob Loop with only_sp=True
125 | alice_reward = policy.train_alice(alice_time, bob_time)
126 | else:
127 | observation = env.reset()
128 | obs = observation["observation"]
129 | # Select action randomly or according to policy
130 | if total_timesteps < args.start_timesteps:
131 | action = env.action_space.sample()
132 | else:
133 | action = policy.select_action(np.array(obs))
134 | if args.expl_noise != 0:
135 | action = (action + np.random.normal(0, args.expl_noise, size=env.action_space.shape[0])).clip(
136 | env.action_space.low, env.action_space.high)
137 |
138 | # Perform action
139 | new_obs, reward, done, _ = env.step(action)
140 | done_bool = 0 if episode_timesteps + 1 == env_param['max_timesteps'] else float(done)
141 | done = done or episode_timesteps + 1 == env_param['max_timesteps']
142 | episode_reward += reward
143 |
144 | # Store data in replay buffer
145 | replay_buffer.add(obs, action, new_obs["observation"], reward, done_bool)
146 | obs = new_obs["observation"]
147 | # Train the policy after collecting sufficient data
148 | if total_timesteps >= args.start_timesteps:
149 | policy.train(replay_buffer, args.batch_size)
150 |
151 | if timesteps_since_eval >= args.eval_freq:
152 | timesteps_since_eval %= args.eval_freq
153 |
154 | if args.save_models: policy.save(f'model_{total_timesteps}',
155 | directory=args.save_dir)
156 | np.save(f"{args.save_dir}/alice_envs.npy", alice_envs_total)
157 | print("---------------------------------------")
158 | print("Env Name: %s | Seed : %s | sp-percent : %s" % (args.env_name, args.seed, args.sp_percent))
159 | print("---------------------------------------")
160 |
161 | episode_timesteps += 1
162 | total_timesteps += 1
163 | timesteps_since_eval += 1
164 |
165 | if args.save_models: policy.save(f'model_{total_timesteps}', directory=args.save_dir)
166 |
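
A side note on the reward bookkeeping above: Alice's rewards are appended one scalar per self-play round and only handed to adr._train_particles once exactly num_workers * svpg_rollout_length of them have accumulated, reshaped row-major into one row per particle-length rollout. Below is a self-contained illustration with made-up sizes and rewards (not the script's defaults).

import numpy as np

num_workers, svpg_rollout_length = 3, 4          # illustrative sizes only
svpg_rewards = []

for step in range(num_workers * svpg_rollout_length):
    svpg_rewards.append(float(step))             # stand-in for policy.train_alice(...)

# Same reshape as in the training loop: row-major, so the first svpg_rollout_length
# entries become row 0, the next svpg_rollout_length entries row 1, and so on.
all_rewards = np.reshape(np.asarray(svpg_rewards), (num_workers, svpg_rollout_length))
print(all_rewards)
# [[ 0.  1.  2.  3.]
#  [ 4.  5.  6.  7.]
#  [ 8.  9. 10. 11.]]
# In ddpg_train.py this matrix is passed to adr._train_particles(all_rewards) and the
# svpg_rewards buffer is cleared before the next batch of self-play rollouts.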
--------------------------------------------------------------------------------
/common/agents/ddpg/ddpg.py:
--------------------------------------------------------------------------------
1 | import copy
2 | import numpy as np
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | from common.agents.selfplay_models import AlicePolicyFetch
7 |
8 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
9 |
10 |
11 | # Re-tuned version of Deep Deterministic Policy Gradients (DDPG)
12 | # Paper: https://arxiv.org/abs/1509.02971
13 |
14 |
15 | class Actor(nn.Module):
16 | def __init__(self, state_dim, action_dim, max_action):
17 | super(Actor, self).__init__()
18 |
19 | self.l1 = nn.Linear(state_dim, 400)
20 | self.l2 = nn.Linear(400, 300)
21 | self.l3 = nn.Linear(300, action_dim)
22 |
23 | self.max_action = max_action
24 |
25 | def forward(self, state):
26 | a = F.relu(self.l1(state))
27 | a = F.relu(self.l2(a))
28 | return self.max_action * torch.tanh(self.l3(a))
29 |
30 |
31 | class Critic(nn.Module):
32 | def __init__(self, state_dim, action_dim):
33 | super(Critic, self).__init__()
34 |
35 | self.l1 = nn.Linear(state_dim + action_dim, 400)
36 | self.l2 = nn.Linear(400, 300)
37 | self.l3 = nn.Linear(300, 1)
38 |
39 | def forward(self, state, action):
40 | q = F.relu(self.l1(torch.cat([state, action], 1)))
41 | q = F.relu(self.l2(q))
42 | return self.l3(q)
43 |
44 |
45 | class DDPG(object):
46 | def __init__(self, args, state_dim, action_dim, max_action, goal_dim, discount=0.99, tau=0.005):
47 | self.args = args
48 | self.goal_dim = goal_dim
49 | self.actor = Actor(state_dim, action_dim, max_action).to(device) # Bob's policy
50 | self.actor_target = copy.deepcopy(self.actor) # Alice acting policy
51 | self.actor_optimizer = torch.optim.Adam(self.actor.parameters())
52 |
53 | self.critic = Critic(state_dim, action_dim).to(device) # Bob's critic network
54 | self.critic_target = copy.deepcopy(self.critic) # Alice's critic network
55 | self.critic_optimizer = torch.optim.Adam(self.critic.parameters())
56 |
57 | self.alice_policy = AlicePolicyFetch(self.args, goal_dim, action_dim=1) # Alice Policy
58 |
59 | self.discount = discount
60 | self.tau = tau
61 |
62 | def select_action(self, state):
63 | state = torch.FloatTensor(state.reshape(1, -1)).to(device)
64 | return self.actor(state).cpu().data.numpy().flatten()
65 |
66 |
67 | def train(self, replay_buffer, batch_size=100):
68 |
69 | # Sample replay buffer
70 | state, action, next_state, reward, not_done = replay_buffer.sample(batch_size)
71 |
72 | # Compute the target Q value
73 | target_Q = self.critic_target(next_state, self.actor_target(next_state))
74 | target_Q = reward + (not_done * self.discount * target_Q).detach()
75 |
76 | # Get current Q estimate
77 | current_Q = self.critic(state, action)
78 |
79 | # Compute critic loss
80 | critic_loss = F.mse_loss(current_Q, target_Q)
81 |
82 | # Optimize the critic
83 | self.critic_optimizer.zero_grad()
84 | critic_loss.backward()
85 | self.critic_optimizer.step()
86 |
87 | # Compute actor loss
88 | actor_loss = -self.critic(state, self.actor(state)).mean()
89 |
90 | # Optimize the actor
91 | self.actor_optimizer.zero_grad()
92 | actor_loss.backward()
93 | self.actor_optimizer.step()
94 |
95 | # Update the frozen target models
96 | for param, target_param in zip(self.critic.parameters(), self.critic_target.parameters()):
97 | target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
98 |
99 | for param, target_param in zip(self.actor.parameters(), self.actor_target.parameters()):
100 | target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
101 |
102 | def alice_loop(self, args, env, env_params):
103 | alice_done = False
104 | alice_time = 0
105 | env.randomize(["default"] * args.n_params)
106 | observation = env.reset()
107 |
108 | obs = observation["observation"]
109 | ag = observation["achieved_goal"]
110 | alice_state = np.concatenate([ag, np.zeros(ag.shape[0])])
111 |
112 | while not alice_done and (alice_time < env_params['max_timesteps']):
113 | observation = torch.FloatTensor(obs.reshape(1, -1)).to(device)
114 | pi = self.actor_target(observation)
115 | action = pi.cpu().data.numpy().flatten()
116 |
117 | observation_new, reward, env_done, _ = env.step(action)
118 | obs_new = observation_new
119 | ag_new = observation_new["achieved_goal"]
120 | alice_signal = self.alice_policy.select_action(alice_state)
121 | bobs_goal_state = ag_new
122 |
123 |             # Stopping criteria: Alice's stochastic stop signal, episode termination, or the time limit
124 |             if alice_signal > np.random.random() and alice_time >= 1: alice_done = True
125 |             alice_done = alice_done or env_done or alice_time + 1 == env_params['max_timesteps']
126 | if not alice_done:
127 | alice_state[env_params["goal"]:] = ag_new
128 | bobs_goal_state = ag_new
129 | alice_time += 1
130 | self.alice_policy.log(0.0)
131 | obs = obs_new["observation"]
132 | ag = ag_new
133 |
134 | return bobs_goal_state, alice_time
135 |
136 | def bob_loop(self, env, env_params, bobs_goal_state, alice_time, replay_buffer):
137 | observation = env.reset()
138 | obs = observation["observation"]
139 | bob_done = False
140 | bob_time = 0
141 | while not bob_done and alice_time + bob_time < env_params['max_timesteps']:
142 | action = self.select_action(obs)
143 | new_obs, reward, env_done, _ = env.step(action)
144 | bob_signal = self._check_closeness(new_obs["achieved_goal"], bobs_goal_state)
145 | bob_done = env_done or bob_signal or bob_done
146 |
147 | if not bob_done:
148 | replay_buffer.add(obs, action, new_obs["observation"], reward, bob_done)
149 | obs = new_obs["observation"]
150 | bob_time += 1
151 |
152 | return bob_time, bob_done
153 |
154 | def train_alice(self, alice_time, bob_time):
155 | reward_alice = self.args.sp_gamma * max(0, bob_time - alice_time)
156 | self.alice_policy.log(reward_alice)
157 | self.alice_policy.finish_episode(gamma=0.99)
158 | return reward_alice
159 |
160 | @staticmethod
161 | def _check_closeness(state, goal):
162 | dist = np.linalg.norm(state - goal)
163 | return dist < 0.025
164 |
165 | def save(self, filename, directory):
166 | torch.save(self.actor.state_dict(), '%s/%s_actor.pth' % (directory, filename))
167 | torch.save(self.critic.state_dict(), '%s/%s_critic.pth' % (directory, filename))
168 |
169 | def load(self, filename, directory):
170 | self.actor.load_state_dict(torch.load('%s/%s' % (directory, filename), map_location=lambda storage, loc: storage))
171 | # self.critic.load_state_dict(torch.load('%s/%s_critic.pth' % (directory, filename), map_location='cpu'))
172 |
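
A short, hedged sketch of how this agent is driven during self-play and of the Alice reward in isolation. The call order follows experiments/ddpg_train.py; sp_gamma and the timing numbers below are placeholders, not values from the repo's configs.

# Call order during a self-play round (see experiments/ddpg_train.py):
#   bobs_goal_state, alice_time = policy.alice_loop(args, env, env_param)
#   bob_time, done = policy.bob_loop(env, env_param, bobs_goal_state, alice_time, replay_buffer)
#   alice_reward = policy.train_alice(alice_time, bob_time)

# Alice is only rewarded when Bob needs longer than Alice took (see train_alice above):
sp_gamma = 0.1                                   # placeholder for args.sp_gamma
alice_time, bob_time = 12, 30
print(sp_gamma * max(0, bob_time - alice_time))  # 1.8 -> Alice proposed a task Bob found hard
alice_time, bob_time = 20, 5
print(sp_gamma * max(0, bob_time - alice_time))  # 0.0 -> Bob finished quickly, no reward for Alice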
--------------------------------------------------------------------------------
/common/randomizer/assets/robot.xml:
--------------------------------------------------------------------------------
[MuJoCo XML model of the Fetch robot; the markup is not reproduced in this listing.]
--------------------------------------------------------------------------------
/common/randomizer/complex_hook_env.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from gym import utils, spaces
4 | from gym.utils import seeding
5 | from gym.envs.robotics import rotations, fetch_env
6 | import gym
7 |
8 |
9 | import glob
10 | import os
11 |
12 |
13 | from .controllers.hook_controller import get_hook_control
14 |
15 | DIR_PATH = os.path.dirname(os.path.abspath(__file__))
16 |
17 |
18 | class ComplexHookSingleObjectEnv(fetch_env.FetchEnv, utils.EzPickle):
19 | def __init__(self, xml_file=None):
20 | initial_qpos = {
21 | 'robot0:slide0': 0.405,
22 | 'robot0:slide1': 0.48,
23 | 'robot0:slide2': 0.0,
24 | 'object0:joint': [1.25, 0.53, 0.6, 1., 0., 0., 0.],
25 | 'hook:joint': [1.35, 0.35, 0.6, 1., 0., 0., 0.],
26 | }
27 |
28 | if xml_file is None:
29 | xml_file = os.path.join(DIR_PATH, 'assets', 'hook.xml')
30 |
31 | self._goal_pos = np.array([1.65, 0.75, 0.42])
32 | self._object_xpos = np.array([1.8, 0.75])
33 |
34 | fetch_env.FetchEnv.__init__(
35 | self, xml_file, has_object=True, block_gripper=False, n_substeps=20,
36 | gripper_extra_height=0.2, target_in_the_air=True, target_offset=0.0,
37 | obj_range=None, target_range=None, distance_threshold=0.05,
38 | initial_qpos=initial_qpos, reward_type='sparse')
39 |
40 | utils.EzPickle.__init__(self)
41 |
42 |
43 |
44 | def render(self, mode="human", *args, **kwargs):
45 | # See https://github.com/openai/gym/issues/1081
46 | self._render_callback()
47 | if mode == 'rgb_array':
48 | self._get_viewer().render()
49 | width, height = 3350, 1800
50 | data = self._get_viewer().read_pixels(width, height, depth=False)
51 | # original image is upside-down, so flip it
52 | return data[::-1, :, :]
53 | elif mode == 'human':
54 | self._get_viewer().render()
55 |
56 | return super(ComplexHookSingleObjectEnv, self).render(*args, **kwargs)
57 |
58 | def _sample_goal(self):
59 | goal_pos = self._goal_pos.copy()
60 | goal_pos[:2] += self.np_random.uniform(-0.05, 0.05)
61 | return goal_pos
62 |
63 | def _viewer_setup(self):
64 | body_id = self.sim.model.body_name2id('robot0:gripper_link')
65 | lookat = self.sim.data.body_xpos[body_id]
66 | for idx, value in enumerate(lookat):
67 | self.viewer.cam.lookat[idx] = value
68 | self.viewer.cam.distance = 2.5
69 | self.viewer.cam.azimuth = 180.
70 | self.viewer.cam.elevation = -24.
71 |
72 | def _reset_sim(self):
73 | self.sim.set_state(self.initial_state)
74 |
75 | object_xpos_x = 1.65 + self.np_random.uniform(-0.05, 0.05)
76 | while True:
77 | object_xpos_x = 1.8 + self.np_random.uniform(-0.05, 0.05)
78 | object_xpos_y = 0.75 + self.np_random.uniform(-0.05, 0.05)
79 | if (object_xpos_x - self._goal_pos[0]) ** 2 + (object_xpos_y - self._goal_pos[1]) ** 2 >= 0.01:
80 | break
81 | self._object_xpos = np.array([object_xpos_x, object_xpos_y])
82 |
83 | object_qpos = self.sim.data.get_joint_qpos('object0:joint')
84 | assert object_qpos.shape == (7,)
85 | object_qpos[:2] = self._object_xpos
86 | self.sim.data.set_joint_qpos('object0:joint', object_qpos)
87 |
88 | self.sim.forward()
89 | return True
90 |
91 | def _get_obs(self):
92 | obs = fetch_env.FetchEnv._get_obs(self)
93 |
94 | grip_pos = self.sim.data.get_site_xpos('robot0:grip')
95 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep
96 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt
97 |
98 | hook_pos = self.sim.data.get_site_xpos('hook')
99 | # rotations
100 | hook_rot = rotations.mat2euler(self.sim.data.get_site_xmat('hook'))
101 | # velocities
102 | hook_velp = self.sim.data.get_site_xvelp('hook') * dt
103 | hook_velr = self.sim.data.get_site_xvelr('hook') * dt
104 | # gripper state
105 | hook_rel_pos = hook_pos - grip_pos
106 | hook_velp -= grip_velp
107 |
108 | hook_observation = np.concatenate([hook_pos, hook_rot, hook_velp, hook_velr, hook_rel_pos])
109 |
110 | obs['observation'] = np.concatenate([obs['observation'], hook_observation])
111 |
112 | return obs
113 |
114 |
115 | class ResidualComplexHookSingleObjectEnv(ComplexHookSingleObjectEnv):
116 | def step(self, residual_action):
117 | residual_action = 2. * residual_action
118 |
119 | action = np.add(residual_action, get_hook_control(self._last_observation))
120 | action = np.clip(action, -1, 1)
121 | observation, reward, done, debug_info = ComplexHookSingleObjectEnv.step(self, action)
122 |
123 | self._last_observation = observation
124 |
125 | return observation, reward, done, debug_info
126 |
127 | def compute_reward(self, *args, **kwargs):
128 | return ComplexHookSingleObjectEnv.compute_reward(self, *args, **kwargs)
129 |
130 | def reset(self):
131 | observation = ComplexHookSingleObjectEnv.reset(self)
132 | self._last_observation = observation
133 |
134 | return observation
135 |
136 |
137 | class TwoFrameResidualComplexHookSingleObjectEnv(ComplexHookSingleObjectEnv):
138 | def __init__(self, xml_file):
139 | super(TwoFrameResidualComplexHookSingleObjectEnv, self).__init__(xml_file)
140 |         obs_space = self.observation_space.spaces['observation']
141 |         stacked_low = np.hstack((obs_space.low, obs_space.low))
142 |         stacked_high = np.hstack((obs_space.high, obs_space.high))
143 |         self.observation_space.spaces['observation'] = spaces.Box(
144 |             low=stacked_low,
145 |             high=stacked_high,
146 |             dtype=np.float32)
147 |
148 | def step(self, residual_action):
149 | residual_action = 2. * residual_action
150 |
151 | action = np.add(residual_action, get_hook_control(self._last_observation))
152 | action = np.clip(action, -1, 1)
153 | observation, reward, done, debug_info = ComplexHookSingleObjectEnv.step(self, action)
154 |
155 | obs_out = observation.copy()
156 | obs_out['observation'] = np.hstack((self._last_observation['observation'], observation['observation']))
157 | self._last_observation = observation
158 |
159 | return obs_out, reward, done, debug_info
160 |
161 | def reset(self):
162 | observation = ComplexHookSingleObjectEnv.reset(self)
163 | self._last_observation = observation.copy()
164 | observation['observation'] = np.hstack((self._last_observation['observation'], observation['observation']))
165 | return observation
166 |
167 |
168 | class ComplexHookEnv(gym.Env):
169 | def __init__(self, train=True, num_scenes=100, scene_offset=0, subenv_type='normal', **kwargs):
170 | if train:
171 | xml_files = list(glob.glob(os.path.join(DIR_PATH, 'assets', 'fetch_complex_objects', 'fetch',
172 | 'train_scene_hook_*.xml')))
173 | else:
174 | xml_files = list(glob.glob(os.path.join(DIR_PATH, 'assets', 'fetch_complex_objects', 'fetch',
175 | 'test_scene_hook_*.xml')))
176 |
177 | xml_files = xml_files[scene_offset:scene_offset + num_scenes]
178 |
179 | self._subenvs = []
180 |
181 | self.seed()
182 |
183 | for xml_file in xml_files:
184 | try:
185 | if subenv_type == 'residual':
186 | subenv = ResidualComplexHookSingleObjectEnv(xml_file)
187 | elif subenv_type == 'twoframe_residual':
188 | subenv = TwoFrameResidualComplexHookSingleObjectEnv(xml_file)
189 | elif subenv_type == 'normal':
190 | subenv = ComplexHookSingleObjectEnv(xml_file)
191 |                 else:
192 |                     raise ValueError('subenv_type not recognized: {}'.format(subenv_type))
193 |                 self._subenvs.append(subenv)
194 |             except Exception:
195 |                 print("FAILED; skipping", xml_file)
196 |
197 | self._num_envs = len(self._subenvs)
198 |
199 | self.action_space = self._subenvs[0].action_space
200 |         self.observation_space = self._subenvs[0].observation_space
201 | self.metadata = self._subenvs[0].metadata
202 | self.config_file = kwargs.get('config')
203 | self.dimensions = []
204 | self.reset()
205 |
206 | def _randomize_block_mass(self):
207 | block_mass = self.dimensions[0].current_value
208 | self._env.sim.model.body_mass[-2] = block_mass
209 |
210 | def _randomize_hook_mass(self):
211 | hook_mass = self.dimensions[1].current_value
212 | self._env.sim.model.body_mass[-1] = hook_mass
213 |
214 | def _randomize_friction(self):
215 | current_friction = self.dimensions[2].current_value
216 | for i in range(len(self._env.sim.model.geom_friction)):
217 | self._env.sim.model.geom_friction[i] = [current_friction, 5.e-3, 1e-4]
218 |
219 | def update_randomized_params(self):
220 | self._randomize_block_mass()
221 | self._randomize_hook_mass()
222 | self._randomize_friction()
223 |
224 | def reset(self):
225 | env_id = self.np_random.randint(self._num_envs)
226 | self._env = self._subenvs[env_id]
227 | return self._env.reset()
228 |
229 | def step(self, action):
230 | return self._env.step(action)
231 |
232 | def render(self, *args, **kwargs):
233 | return self._env.render(*args, **kwargs)
234 |
235 | def seed(self, seed=0):
236 | self.np_random, seed = seeding.np_random(seed)
237 |
238 | for subenv in self._subenvs:
239 | subenv.seed(seed=self.np_random.randint(1e8))
240 |
241 | return [seed]
242 |
243 | def compute_reward(self, *args, **kwargs):
244 | return self._env.compute_reward(*args, **kwargs)
245 |
246 |
247 | class ResidualComplexHookEnv(ComplexHookEnv):
248 | def __init__(self, **kwargs):
249 | ComplexHookEnv.__init__(self, train=True, subenv_type='residual', **kwargs)
250 |
251 |
252 | class TwoFrameResidualComplexHookEnv(ComplexHookEnv):
253 | def __init__(self):
254 | ComplexHookEnv.__init__(self, train=True, subenv_type='twoframe_residual')
255 |
256 |
257 |
--------------------------------------------------------------------------------
/common/randomizer/randomized_fetch.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 |
4 | import xml.etree.ElementTree as et
5 |
6 | from gym.envs.robotics import rotations, robot_env, utils
7 |
8 | import mujoco_py
9 |
10 | def goal_distance(goal_a, goal_b):
11 | assert goal_a.shape == goal_b.shape
12 | return np.linalg.norm(goal_a - goal_b, axis=-1)
13 |
14 |
15 | class RandomizedFetchEnv(robot_env.RobotEnv):
16 | """Superclass for all Fetch environments.
17 | """
18 |
19 | def __init__(
20 | self, model_path, n_substeps, gripper_extra_height, block_gripper,
21 | has_object, target_in_the_air, target_offset, obj_range, target_range,
22 | distance_threshold, initial_qpos, reward_type, **kwargs,
23 | ):
24 | """Initializes a new Fetch environment.
25 |
26 | Args:
27 | model_path (string): path to the environments XML file
28 | n_substeps (int): number of substeps the simulation runs on every call to step
29 | gripper_extra_height (float): additional height above the table when positioning the gripper
30 |             block_gripper (boolean): whether or not the gripper is blocked (i.e. not movable)
31 | has_object (boolean): whether or not the environment has an object
32 | target_in_the_air (boolean): whether or not the target should be in the air above the table or on the table surface
33 | target_offset (float or array with 3 elements): offset of the target
34 | obj_range (float): range of a uniform distribution for sampling initial object positions
35 | target_range (float): range of a uniform distribution for sampling a target
36 | distance_threshold (float): the threshold after which a goal is considered achieved
37 | initial_qpos (dict): a dictionary of joint names and values that define the initial configuration
38 | reward_type ('sparse' or 'dense'): the reward type, i.e. sparse or dense
39 | """
40 | self.gripper_extra_height = gripper_extra_height
41 | self.block_gripper = block_gripper
42 | self.has_object = has_object
43 | self.target_in_the_air = target_in_the_air
44 | self.target_offset = target_offset
45 | self.obj_range = obj_range
46 | self.target_range = target_range
47 | self.distance_threshold = distance_threshold
48 | self.reward_type = reward_type
49 |
50 | super(RandomizedFetchEnv, self).__init__(
51 | model_path=model_path, n_substeps=n_substeps, n_actions=4,
52 | initial_qpos=initial_qpos)
53 |
54 | # randomization
55 | self.xml_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "assets")
56 | self.reference_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "assets", kwargs.get('xml_name'))
57 | self.reference_xml = et.parse(self.reference_path)
58 | self.config_file = kwargs.get('config')
59 | self.dimensions = []
60 | self.dimension_map = []
61 | self.suffixes = []
62 | self._locate_randomize_parameters()
63 |
64 | # Randomization methods
65 | # ----------------------------
66 | def _locate_randomize_parameters(self):
67 | self.root = self.reference_xml.getroot()
68 | self.object_joints = self.root.findall(".//body[@name='object0']/joint")
69 |
70 | def _randomize_damping(self):
71 | damping = self.dimensions[0].current_value
72 | for joint in self.object_joints:
73 |             joint.set('damping', '{:.3f}'.format(damping))
74 |
75 | def _randomize_friction(self):
76 | current_friction = self.dimensions[0].current_value
77 | for i in range(len(self.sim.model.geom_friction)):
78 | self.sim.model.geom_friction[i] = [current_friction, 5.e-3, 1e-4]
79 |
80 | def _create_xml(self):
81 | self._randomize_damping()
82 | return et.tostring(self.root, encoding='unicode', method='xml')
83 |
84 | def update_randomized_params(self):
85 | self._randomize_friction()
86 | xml = self._create_xml()
87 | # self._re_init(xml)
88 |
89 | def _re_init(self, xml):
90 | # TODO: Now, likely needs rank
91 | randomized_path = os.path.join(self.xml_dir, "randomizedfetch.xml")
92 | with open(randomized_path, 'wb') as fp:
93 | fp.write(xml.encode())
94 | fp.flush()
95 |
96 | self.model = mujoco_py.load_model_from_path(randomized_path)
97 | self.sim = mujoco_py.MjSim(self.model)
98 | self.data = self.sim.data
99 | self.init_qpos = self.data.qpos.ravel().copy()
100 | self.init_qvel = self.data.qvel.ravel().copy()
101 | observation, _reward, done, _info = self.step(np.zeros(4))
102 | assert not done
103 | if self.viewer:
104 | self.viewer.update_sim(self.sim)
105 |
106 | # GoalEnv methods
107 | # ----------------------------
108 |
109 | def compute_reward(self, achieved_goal, goal, info):
110 | # Compute distance between goal and the achieved goal.
111 | d = goal_distance(achieved_goal, goal)
112 | if self.reward_type == 'sparse':
113 | return -(d > self.distance_threshold).astype(np.float32)
114 | else:
115 | return -d
116 |
117 | # RobotEnv methods
118 | # ----------------------------
119 |
120 | def _step_callback(self):
121 | if self.block_gripper:
122 | self.sim.data.set_joint_qpos('robot0:l_gripper_finger_joint', 0.)
123 | self.sim.data.set_joint_qpos('robot0:r_gripper_finger_joint', 0.)
124 | self.sim.forward()
125 |
126 | def _set_action(self, action):
127 | assert action.shape == (4,)
128 | action = action.copy() # ensure that we don't change the action outside of this scope
129 | pos_ctrl, gripper_ctrl = action[:3], action[3]
130 |
131 | pos_ctrl *= 0.05 # limit maximum change in position
132 | rot_ctrl = [1., 0., 1., 0.] # fixed rotation of the end effector, expressed as a quaternion
133 | gripper_ctrl = np.array([gripper_ctrl, gripper_ctrl])
134 | assert gripper_ctrl.shape == (2,)
135 | if self.block_gripper:
136 | gripper_ctrl = np.zeros_like(gripper_ctrl)
137 | action = np.concatenate([pos_ctrl, rot_ctrl, gripper_ctrl])
138 |
139 | # Apply action to simulation.
140 | utils.ctrl_set_action(self.sim, action)
141 | utils.mocap_set_action(self.sim, action)
142 |
143 | def _get_obs(self):
144 | # positions
145 | grip_pos = self.sim.data.get_site_xpos('robot0:grip')
146 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep
147 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt
148 | robot_qpos, robot_qvel = utils.robot_get_obs(self.sim)
149 | if self.has_object:
150 | object_pos = self.sim.data.get_site_xpos('object0')
151 | # rotations
152 | object_rot = rotations.mat2euler(self.sim.data.get_site_xmat('object0'))
153 | # velocities
154 | object_velp = self.sim.data.get_site_xvelp('object0') * dt
155 | object_velr = self.sim.data.get_site_xvelr('object0') * dt
156 | # gripper state
157 | object_rel_pos = object_pos - grip_pos
158 | object_velp -= grip_velp
159 | else:
160 | object_pos = object_rot = object_velp = object_velr = object_rel_pos = np.zeros(0)
161 | gripper_state = robot_qpos[-2:]
162 | gripper_vel = robot_qvel[-2:] * dt # change to a scalar if the gripper is made symmetric
163 |
164 | if not self.has_object:
165 | achieved_goal = grip_pos.copy()
166 | else:
167 | achieved_goal = np.squeeze(object_pos.copy())
168 | obs = np.concatenate([
169 | grip_pos, object_pos.ravel(), object_rel_pos.ravel(), gripper_state, object_rot.ravel(),
170 | object_velp.ravel(), object_velr.ravel(), grip_velp, gripper_vel,
171 | ])
172 |
173 | return {
174 | 'observation': obs.copy(),
175 | 'achieved_goal': achieved_goal.copy(),
176 | 'desired_goal': self.goal.copy(),
177 | }
178 |
179 | def _viewer_setup(self):
180 | body_id = self.sim.model.body_name2id('robot0:gripper_link')
181 | lookat = self.sim.data.body_xpos[body_id]
182 | for idx, value in enumerate(lookat):
183 | self.viewer.cam.lookat[idx] = value
184 | self.viewer.cam.distance = 2.5
185 | self.viewer.cam.azimuth = 132.
186 | self.viewer.cam.elevation = -14.
187 |
188 | def _render_callback(self):
189 | # Visualize target.
190 | sites_offset = (self.sim.data.site_xpos - self.sim.model.site_pos).copy()
191 | site_id = self.sim.model.site_name2id('target0')
192 | self.sim.model.site_pos[site_id] = self.goal - sites_offset[0]
193 | self.sim.forward()
194 |
195 | def _reset_sim(self):
196 | self.sim.set_state(self.initial_state)
197 |
198 | # Randomize start position of object.
199 | if self.has_object:
200 | object_xpos = self.initial_gripper_xpos[:2]
201 | while np.linalg.norm(object_xpos - self.initial_gripper_xpos[:2]) < 0.1:
202 | object_xpos = self.initial_gripper_xpos[:2] + self.np_random.uniform(-self.obj_range, self.obj_range, size=2)
203 | object_qpos = self.sim.data.get_joint_qpos('object0:joint')
204 | assert object_qpos.shape == (7,)
205 | object_qpos[:2] = object_xpos
206 | self.sim.data.set_joint_qpos('object0:joint', object_qpos)
207 |
208 | self.sim.forward()
209 | return True
210 |
211 | def _sample_goal(self):
212 | if self.has_object:
213 | goal = self.initial_gripper_xpos[:3] + self.np_random.uniform(-self.target_range, self.target_range, size=3)
214 | goal += self.target_offset
215 | goal[2] = self.height_offset
216 | if self.target_in_the_air and self.np_random.uniform() < 0.5:
217 | goal[2] += self.np_random.uniform(0, 0.45)
218 | else:
219 | goal = self.initial_gripper_xpos[:3] + self.np_random.uniform(-0.15, 0.15, size=3)
220 | return goal.copy()
221 |
222 | def _is_success(self, achieved_goal, desired_goal):
223 | d = goal_distance(achieved_goal, desired_goal)
224 | return (d < self.distance_threshold).astype(np.float32)
225 |
226 | def _env_setup(self, initial_qpos):
227 | for name, value in initial_qpos.items():
228 | self.sim.data.set_joint_qpos(name, value)
229 | utils.reset_mocap_welds(self.sim)
230 | self.sim.forward()
231 |
232 | # Move end effector into position.
233 | gripper_target = np.array([-0.498, 0.005, -0.431 + self.gripper_extra_height]) + self.sim.data.get_site_xpos('robot0:grip')
234 | gripper_rotation = np.array([1., 0., 1., 0.])
235 | self.sim.data.set_mocap_pos('robot0:mocap', gripper_target)
236 | self.sim.data.set_mocap_quat('robot0:mocap', gripper_rotation)
237 | for _ in range(10):
238 | self.sim.step()
239 |
240 | # Extract information for sampling goals.
241 | self.initial_gripper_xpos = self.sim.data.get_site_xpos('robot0:grip').copy()
242 | if self.has_object:
243 | self.height_offset = self.sim.data.get_site_xpos('object0')[2]
244 |
245 | def render(self, mode='human', width=500, height=500):
246 | return super(RandomizedFetchEnv, self).render()
247 |
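
A minimal numpy mirror of _set_action above, useful for sanity-checking the action interface: the 4-D agent action becomes a scaled Cartesian displacement, a fixed end-effector quaternion, and a symmetric two-finger gripper command (a 9-D control vector). The helper name is illustrative; this sketches only the mapping, while applying it still goes through the MuJoCo utils calls shown in the file.

import numpy as np

def map_fetch_action(action, block_gripper=False):
    action = np.asarray(action, dtype=np.float64).copy()
    assert action.shape == (4,)
    pos_ctrl, gripper_ctrl = action[:3], action[3]
    pos_ctrl = pos_ctrl * 0.05                     # limit maximum change in position per step
    rot_ctrl = [1., 0., 1., 0.]                    # fixed end-effector rotation (quaternion)
    gripper_ctrl = np.array([gripper_ctrl, gripper_ctrl])  # both fingers get the same command
    if block_gripper:
        gripper_ctrl = np.zeros_like(gripper_ctrl)
    return np.concatenate([pos_ctrl, rot_ctrl, gripper_ctrl])

print(map_fetch_action([1.0, -0.5, 0.0, 0.8]))
# -> [ 0.05  -0.025  0.     1.     0.     1.     0.     0.8    0.8  ]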
--------------------------------------------------------------------------------
/common/randomizer/residual_push.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 |
4 | import xml.etree.ElementTree as et
5 |
6 | from .residual_envs.residual_fetch_push_env import ResidualSlipperyPushEnv
7 | from gym.envs.robotics import rotations, robot_env, utils
8 | import mujoco_py
9 |
10 | def goal_distance(goal_a, goal_b):
11 | assert goal_a.shape == goal_b.shape
12 | return np.linalg.norm(goal_a - goal_b, axis=-1)
13 |
14 |
15 | class RandomizedResidualPushEnv(ResidualSlipperyPushEnv):
16 | """Superclass for all Fetch environments.
17 | """
18 |
19 | def __init__(
20 | self, model_path, n_substeps, gripper_extra_height, block_gripper,
21 | has_object, target_in_the_air, target_offset, obj_range, target_range,
22 | distance_threshold, initial_qpos, reward_type, **kwargs
23 | ):
24 | """Initializes a new Fetch environment.
25 |
26 | Args:
27 | model_path (string): path to the environments XML file
28 | n_substeps (int): number of substeps the simulation runs on every call to step
29 | gripper_extra_height (float): additional height above the table when positioning the gripper
30 |             block_gripper (boolean): whether or not the gripper is blocked (i.e. not movable)
31 | has_object (boolean): whether or not the environment has an object
32 | target_in_the_air (boolean): whether or not the target should be in the air above the table or on the table surface
33 | target_offset (float or array with 3 elements): offset of the target
34 | obj_range (float): range of a uniform distribution for sampling initial object positions
35 | target_range (float): range of a uniform distribution for sampling a target
36 | distance_threshold (float): the threshold after which a goal is considered achieved
37 | initial_qpos (dict): a dictionary of joint names and values that define the initial configuration
38 | reward_type ('sparse' or 'dense'): the reward type, i.e. sparse or dense
39 | """
40 |
41 | self.gripper_extra_height = gripper_extra_height
42 | self.block_gripper = block_gripper
43 | self.has_object = has_object
44 | self.target_in_the_air = target_in_the_air
45 | self.target_offset = target_offset
46 | self.obj_range = obj_range
47 | self.target_range = target_range
48 | self.distance_threshold = distance_threshold
49 | self.reward_type = reward_type
50 |
51 | super(RandomizedResidualPushEnv, self).__init__(
52 | model_path=model_path, n_substeps=n_substeps, n_actions=4,
53 | initial_qpos=initial_qpos)
54 |
55 | # randomization
56 | self.xml_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "assets")
57 | self.reference_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "assets",
58 | kwargs.get('xml_name'))
59 | self.reference_xml = et.parse(self.reference_path)
60 | self.config_file = kwargs.get('config')
61 | self.dimensions = []
62 | self.dimension_map = []
63 | self.suffixes = []
64 | self.reset()
65 | self._locate_randomize_parameters()
66 |
67 |
68 | # Randomization methods
69 | # ----------------------------
70 | def _locate_randomize_parameters(self):
71 | self.root = self.reference_xml.getroot()
72 | self.object_joints = self.root.findall(".//body[@name='object0']/joint")
73 |
74 | def _randomize_damping(self):
75 | damping = self.dimensions[0].current_value
76 | for joint in self.object_joints:
77 |             joint.set('damping', '{:.3f}'.format(damping))
78 |
79 | def _randomize_friction(self):
80 | current_friction = self.dimensions[0].current_value
81 | for i in range(len(self.fetch_env.env.sim.model.geom_friction)):
82 | self.fetch_env.env.sim.model.geom_friction[i] = [current_friction, 5.e-3, 1e-4]
83 |
84 | def _create_xml(self):
85 |         self._randomize_damping()  # edit the XML tree, matching randomized_fetch.py
86 | return et.tostring(self.root, encoding='unicode', method='xml')
87 |
88 | def update_randomized_params(self):
89 | self._randomize_friction()
90 | xml = self._create_xml()
91 | # # self._re_init(xml)
92 |
93 | def _re_init(self, xml):
94 | # TODO: Now, likely needs rank
95 | randomized_path = os.path.join(self.xml_dir, "pusher.xml")
96 | with open(randomized_path, 'wb') as fp:
97 | fp.write(xml.encode())
98 | fp.flush()
99 |
100 | self.model = mujoco_py.load_model_from_path(randomized_path)
101 | self.sim = mujoco_py.MjSim(self.model)
102 | self.data = self.sim.data
103 | self.init_qpos = self.data.qpos.ravel().copy()
104 | self.init_qvel = self.data.qvel.ravel().copy()
105 |
106 | observation, _reward, done, _info = self.step(np.zeros(4))
107 | assert not done
108 | if self.viewer:
109 | self.viewer.update_sim(self.sim)
110 |
111 | # GoalEnv methods
112 | # ----------------------------
113 |
114 | def compute_reward(self, achieved_goal, goal, info):
115 | # Compute distance between goal and the achieved goal.
116 | d = goal_distance(achieved_goal, goal)
117 | if self.reward_type == 'sparse':
118 | return -(d > self.distance_threshold).astype(np.float32)
119 | else:
120 | return -d
121 |
122 | # RobotEnv methods
123 | # ----------------------------
124 |
125 | def _step_callback(self):
126 | if self.block_gripper:
127 | self.sim.data.set_joint_qpos('robot0:l_gripper_finger_joint', 0.)
128 | self.sim.data.set_joint_qpos('robot0:r_gripper_finger_joint', 0.)
129 | self.sim.forward()
130 |
131 | def _set_action(self, action):
132 | assert action.shape == (4,)
133 | action = action.copy() # ensure that we don't change the action outside of this scope
134 | pos_ctrl, gripper_ctrl = action[:3], action[3]
135 |
136 | pos_ctrl *= 0.05 # limit maximum change in position
137 | rot_ctrl = [1., 0., 1., 0.] # fixed rotation of the end effector, expressed as a quaternion
138 | gripper_ctrl = np.array([gripper_ctrl, gripper_ctrl])
139 | assert gripper_ctrl.shape == (2,)
140 | if self.block_gripper:
141 | gripper_ctrl = np.zeros_like(gripper_ctrl)
142 | action = np.concatenate([pos_ctrl, rot_ctrl, gripper_ctrl])
143 |
144 | # Apply action to simulation.
145 | utils.ctrl_set_action(self.sim, action)
146 | utils.mocap_set_action(self.sim, action)
147 |
148 |
149 | def _get_obs(self):
150 | # positions
151 | grip_pos = self.sim.data.get_site_xpos('robot0:grip')
152 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep
153 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt
154 | robot_qpos, robot_qvel = utils.robot_get_obs(self.sim)
155 | if self.has_object:
156 | object_pos = self.sim.data.get_site_xpos('object0')
157 | # rotations
158 | object_rot = rotations.mat2euler(self.sim.data.get_site_xmat('object0'))
159 | # velocities
160 | object_velp = self.sim.data.get_site_xvelp('object0') * dt
161 | object_velr = self.sim.data.get_site_xvelr('object0') * dt
162 | # gripper state
163 | object_rel_pos = object_pos - grip_pos
164 | object_velp -= grip_velp
165 | else:
166 | object_pos = object_rot = object_velp = object_velr = object_rel_pos = np.zeros(0)
167 | gripper_state = robot_qpos[-2:]
168 | gripper_vel = robot_qvel[-2:] * dt # change to a scalar if the gripper is made symmetric
169 |
170 | if not self.has_object:
171 | achieved_goal = grip_pos.copy()
172 | else:
173 | achieved_goal = np.squeeze(object_pos.copy())
174 | obs = np.concatenate([
175 | grip_pos, object_pos.ravel(), object_rel_pos.ravel(), gripper_state, object_rot.ravel(),
176 | object_velp.ravel(), object_velr.ravel(), grip_velp, gripper_vel,
177 | ])
178 |
179 | return {
180 | 'observation': obs.copy(),
181 | 'achieved_goal': achieved_goal.copy(),
182 | 'desired_goal': self.goal.copy(),
183 | }
184 |
185 | # def _viewer_setup(self):
186 | # body_id = self.sim.model.body_name2id('robot0:gripper_link')
187 | # lookat = self.sim.data.body_xpos[body_id]
188 | # for idx, value in enumerate(lookat):
189 | # self.viewer.cam.lookat[idx] = value
190 | # self.viewer.cam.distance = 2.5
191 | # self.viewer.cam.azimuth = 132.
192 | # self.viewer.cam.elevation = -14.
193 |
194 | def _render_callback(self):
195 | # Visualize target.
196 | sites_offset = (self.sim.data.site_xpos - self.sim.model.site_pos).copy()
197 | site_id = self.sim.model.site_name2id('target0')
198 | self.sim.model.site_pos[site_id] = self.goal - sites_offset[0]
199 | self.sim.forward()
200 |
201 | def _reset_sim(self):
202 | self.sim.set_state(self.initial_state)
203 |
204 | # Randomize start position of object.
205 | if self.has_object:
206 | object_xpos = self.initial_gripper_xpos[:2]
207 | while np.linalg.norm(object_xpos - self.initial_gripper_xpos[:2]) < 0.1:
208 | object_xpos = self.initial_gripper_xpos[:2] + self.np_random.uniform(-self.obj_range, self.obj_range,
209 | size=2)
210 | object_qpos = self.sim.data.get_joint_qpos('object0:joint')
211 | assert object_qpos.shape == (7,)
212 | object_qpos[:2] = object_xpos
213 | self.sim.data.set_joint_qpos('object0:joint', object_qpos)
214 |
215 | self.sim.forward()
216 | return True
217 |
218 | def _sample_goal(self):
219 | if self.has_object:
220 | goal = self.initial_gripper_xpos[:3] + self.np_random.uniform(-self.target_range, self.target_range, size=3)
221 | goal += self.target_offset
222 | goal[2] = self.height_offset
223 | if self.target_in_the_air and self.np_random.uniform() < 0.5:
224 | goal[2] += self.np_random.uniform(0, 0.45)
225 | else:
226 | goal = self.initial_gripper_xpos[:3] + self.np_random.uniform(-0.15, 0.15, size=3)
227 | return goal.copy()
228 |
229 | def _is_success(self, achieved_goal, desired_goal):
230 | d = goal_distance(achieved_goal, desired_goal)
231 | return (d < self.distance_threshold).astype(np.float32)
232 |
233 | def _env_setup(self, initial_qpos):
234 | for name, value in initial_qpos.items():
235 | self.sim.data.set_joint_qpos(name, value)
236 | utils.reset_mocap_welds(self.sim)
237 | self.sim.forward()
238 |
239 | # Move end effector into position.
240 | gripper_target = np.array([-0.498, 0.005, -0.431 + self.gripper_extra_height]) + self.sim.data.get_site_xpos(
241 | 'robot0:grip')
242 | gripper_rotation = np.array([1., 0., 1., 0.])
243 | self.sim.data.set_mocap_pos('robot0:mocap', gripper_target)
244 | self.sim.data.set_mocap_quat('robot0:mocap', gripper_rotation)
245 | for _ in range(10):
246 | self.sim.step()
247 |
248 | # Extract information for sampling goals.
249 | self.initial_gripper_xpos = self.sim.data.get_site_xpos('robot0:grip').copy()
250 | if self.has_object:
251 | self.height_offset = self.sim.data.get_site_xpos('object0')[2]
252 |
253 | def render(self, mode='human', width=500, height=500):
254 |
255 | return super(RandomizedResidualPushEnv, self).render()
--------------------------------------------------------------------------------
/common/svpg/svpg.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 | import torch.optim as optim
4 | from torch.distributions.kl import kl_divergence
5 |
6 | from scipy.spatial.distance import squareform, pdist
7 |
8 | from common.svpg.particles import SVPGParticle
9 | from common.svpg.svpg_utils import *
10 |
11 |
12 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
13 | torch.set_default_tensor_type(torch.FloatTensor)
14 | eps = np.finfo(np.float32).eps.item()
15 |
16 | LEARNING_RATE = 3e-4
17 | HIDDEN_DIMENSIONS = 100
18 |
19 | class SVPG:
20 | """Class that implements Stein Variational Policy Gradient
21 | Input is the current randomization settings, and output is either a
22 | direction to move in (Discrete - for 1D/2D) or a delta across all parameters (Continuous)
23 | """
24 | def __init__(self, nparticles, nparams, max_step_length, svpg_rollout_length, svpg_horizon, temperature, discrete=False, kld_coefficient=0.0):
25 | self.particles = []
26 | self.prior_particles = []
27 | self.optimizers = []
28 | self.max_step_length = max_step_length
29 | self.svpg_rollout_length = svpg_rollout_length
30 | self.svpg_horizon = svpg_horizon
31 | self.temperature = temperature
32 | self.nparticles = nparticles
33 | self.gamma = 0.99
34 |
35 | self.nparams = nparams
36 | self.noutputs = nparams * 2 if discrete else nparams
37 | self.discrete = discrete
38 | self.kld_coefficient = kld_coefficient
39 |
40 | self.last_states = np.random.uniform(0, 1, (self.nparticles, self.nparams))
41 | self.timesteps = np.zeros(self.nparticles)
42 |
43 | for i in range(self.nparticles):
44 | # Initialize each of the individual particles
45 | policy = SVPGParticle(input_dim=self.nparams,
46 | output_dim=self.noutputs,
47 | hidden_dim=HIDDEN_DIMENSIONS,
48 | discrete=discrete).to(device)
49 |
50 | prior_policy = SVPGParticle(input_dim=self.nparams,
51 | output_dim=self.noutputs,
52 | hidden_dim=HIDDEN_DIMENSIONS,
53 | discrete=discrete,
54 | freeze=True).to(device)
55 |
56 | optimizer = optim.Adam(policy.parameters(), lr=LEARNING_RATE)
57 | self.particles.append(policy)
58 | self.prior_particles.append(prior_policy)
59 | self.optimizers.append(optimizer)
60 |
61 | def _squared_dist(self, X):
62 | # Compute squared distance matrix using torch.pdist
63 | dists = torch.pdist(X)
64 | inds_l, inds_r = np.triu_indices(X.shape[-2], 1)
65 | res = torch.zeros(*X.shape[:-2], X.shape[-2], X.shape[-2], dtype=X.dtype, device=X.device)
66 | res[..., inds_l, inds_r] = dists
67 | res[..., inds_r, inds_l] = dists
68 |
69 | return res
70 |
71 | def _Kxx_dxKxx(self, X):
72 | """
73 | Computes covariance matrix K(X,X) and its gradient w.r.t. X
74 | for RBF kernel with design matrix X, as in the second term in eqn (8)
75 | of reference SVGD paper.
76 |
77 | Args:
78 | X (Tensor): (S, P), design matrix of samples, where S is num of
79 | samples, P is the dim of each sample which stacks all params
80 | into a (1, P) row. Thus P could be on the order of one million.
81 | """
82 |
83 | X_np = X.cpu().data.numpy()
84 | pairwise_dists = squareform(pdist(X_np))**2
85 |
86 | # Median trick
87 | h = np.median(pairwise_dists)
88 | h = np.sqrt(0.5 * h / np.log(self.nparticles+1))
89 |
90 | # Compute RBF Kernel
91 | Kxx = torch.exp(-torch.from_numpy(pairwise_dists).to(device).float() / h**2 / 2)
92 |
93 | # Compute kernel gradient
94 | dxKxx = -(Kxx).matmul(X)
95 | sumKxx = Kxx.sum(1)
96 | for i in range(X.shape[1]):
97 |             dxKxx[:, i] = dxKxx[:, i] + X[:, i].mul(sumKxx)  # elementwise product, as in the reference SVGD implementation
98 | dxKxx /= (h ** 2)
99 |
100 | return Kxx, dxKxx
101 |
102 | def select_action(self, policy_idx, state):
103 | state = torch.from_numpy(state).float().unsqueeze(0).to(device)
104 | policy = self.particles[policy_idx]
105 | prior_policy = self.prior_particles[policy_idx]
106 | dist, value = policy(state)
107 | prior_dist, _ = prior_policy(state)
108 |
109 | action = dist.sample()
110 |
111 | policy.saved_log_probs.append(dist.log_prob(action))
112 | policy.saved_klds.append(kl_divergence(dist, prior_dist))
113 |
114 | # TODO: Gross temporary hack
115 | if self.nparams == 1 or self.discrete:
116 | action = action.item()
117 | else:
118 | action = action.squeeze().cpu().detach().numpy()
119 |
120 | return action, value
121 |
122 | def compute_returns(self, next_value, rewards, masks, klds):
123 | R = next_value
124 | returns = []
125 | for step in reversed(range(len(rewards))):
126 | # Eq. 80: https://arxiv.org/abs/1704.06440
127 | R = self.gamma * masks[step] * R + (rewards[step] - self.kld_coefficient * klds[step])
128 | returns.insert(0, R)
129 |
130 | return returns
131 |
132 | def step(self):
133 |         """Roll out trajectories of randomization settings, starting from each
134 |         particle's current state, with each rollout of length svpg_rollout_length.
135 |         The settings are then sent to the agent for training and reward calculation.
136 |         """
137 | self.simulation_instances = np.zeros((self.nparticles, self.svpg_rollout_length, self.nparams))
138 |
139 | # Store the values of each state - for advantage estimation
140 | self.values = torch.zeros((self.nparticles, self.svpg_rollout_length, 1)).float().to(device)
141 |         # Episode-termination masks for each particle (used when computing returns)
142 | self.masks = np.ones((self.nparticles, self.svpg_rollout_length))
143 |
144 | for i in range(self.nparticles):
145 | self.particles[i].reset()
146 | current_sim_params = self.last_states[i]
147 |
148 | for t in range(self.svpg_rollout_length):
149 | self.simulation_instances[i][t] = current_sim_params
150 |
151 | action, value = self.select_action(i, current_sim_params)
152 | self.values[i][t] = value
153 |
154 | action = self._process_action(action)
155 | clipped_action = action * self.max_step_length
156 | next_params = np.clip(current_sim_params + clipped_action, 0, 1)
157 |
158 | if np.array_equal(next_params, current_sim_params) or self.timesteps[i] + 1 == self.svpg_horizon:
159 | next_params = np.random.uniform(0, 1, (self.nparams,))
160 |
161 | self.masks[i][t] = 0 # done = True
162 | self.timesteps[i] = 0
163 |
164 | current_sim_params = next_params
165 | self.timesteps[i] += 1
166 |
167 | self.last_states[i] = current_sim_params
168 |
169 | return np.array(self.simulation_instances)
170 |
171 | def train(self, simulator_rewards):
172 | """Train SVPG agent with rewards from rollouts
173 | """
174 | policy_grads = []
175 | parameters = []
176 |
177 | for i in range(self.nparticles):
178 | policy_grad_particle = []
179 |
180 | # Calculate the value of last state - for Return Computation
181 | _, next_value = self.select_action(i, self.last_states[i])
182 |
183 | particle_rewards = torch.from_numpy(simulator_rewards[i]).float().to(device)
184 | masks = torch.from_numpy(self.masks[i]).float().to(device)
185 |
186 | # Calculate entropy-augmented returns, advantages
187 | returns = self.compute_returns(next_value, particle_rewards, masks, self.particles[i].saved_klds)
188 | returns = torch.cat(returns).detach()
189 | advantages = returns - self.values[i]
190 | # for s, v, r in zip(self.simulation_instances[i], self.values[i], simulator_rewards[i]):
191 | # print('Setting: {}, Reward: {}, Value: {}'.format(s,r,v))
192 |
193 | # logprob * A = policy gradient (before backwards)
194 | for log_prob, advantage in zip(self.particles[i].saved_log_probs, advantages):
195 | policy_grad_particle.append(log_prob * advantage.detach())
196 |
197 | # Compute value loss, update critic
198 | self.optimizers[i].zero_grad()
199 | critic_loss = 0.5 * advantages.pow(2).mean()
200 | critic_loss.backward(retain_graph=True)
201 | self.optimizers[i].step()
202 |
203 | # Store policy gradients for SVPG update
204 | self.optimizers[i].zero_grad()
205 | policy_grad = -torch.cat(policy_grad_particle).mean()
206 | policy_grad.backward()
207 |
208 | # Vectorize parameters and PGs
209 | vec_param, vec_policy_grad = parameters_to_vector(
210 | self.particles[i].parameters(), both=True)
211 |
212 | policy_grads.append(vec_policy_grad.unsqueeze(0))
213 | parameters.append(vec_param.unsqueeze(0))
214 |
215 | # calculating the kernel matrix and its gradients
216 | parameters = torch.cat(parameters)
217 | Kxx, dxKxx = self._Kxx_dxKxx(parameters)
218 |
219 | policy_grads = 1 / self.temperature * torch.cat(policy_grads)
220 | grad_logp = torch.mm(Kxx, policy_grads)
221 |
222 | grad_theta = (grad_logp + dxKxx) / self.nparticles
223 | # explicitly deleting variables does not release memory :(
224 |
225 | # update param gradients
226 | for i in range(self.nparticles):
227 | vector_to_parameters(grad_theta[i],
228 | self.particles[i].parameters(), grad=True)
229 | self.optimizers[i].step()
230 |
231 | def save(self, directory):
232 | for i in range(self.nparticles):
233 | torch.save(self.particles[i].state_dict(), '{}/particle_{}.pth'.format(directory, i))
234 | torch.save(self.particles[i].critic.state_dict(), '{}/particle_critic{}.pth'.format(directory, i))
235 |
236 | def load(self, directory):
237 | return # TODO: Does this solve the value function issue?
238 | for i in range(self.nparticles):
239 | prior = torch.load('{}/particle_{}.pth'.format(directory, i))
240 | particle = self.particles[i].state_dict()
241 | actor = {k: v for k, v in prior.items() if k.find('critic') == -1}
242 |
243 | # Only load the actor!
244 | self.particles[i].load_state_dict(actor, strict=False)
245 |
246 | def load_prior_particles(self, directory):
247 | for i in range(self.nparticles):
248 | self.prior_particles[i].load_state_dict(torch.load('{}/particle_{}.pth'.format(directory, i), map_location=device))
249 | self.prior_particles[i].freeze()
250 |
251 | def _process_action(self, action):
252 |         """Transform the raw policy output into a bounded update direction for the simulation parameters.
253 | """
254 | if self.discrete:
255 | if self.nparams == 1:
256 | if action == 0:
257 | action = [-1.]
258 | elif action == 1:
259 | action = [1.]
260 | elif self.nparams == 2:
261 | if action == 0:
262 | action = [-1., 0]
263 | elif action == 1:
264 | action = [1., 0]
265 | elif action == 2:
266 | action = [0, -1.]
267 | elif action == 3:
268 | action = [0, 1.]
269 | else:
270 | if isinstance(action, float):
271 | action = np.clip(action, -1, 1)
272 | else:
273 |                 action = action / max(np.linalg.norm(action, ord=2), 1e-8)  # guard against a zero-vector action producing NaNs
274 |
275 | return np.array(action)
276 |
--------------------------------------------------------------------------------
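
Note on the Stein variational update in `train` above: the kernel matrix and its gradient come from `self._Kxx_dxKxx(parameters)`, which is defined outside the excerpt shown here. For orientation only, the sketch below shows one common way to compute that pair: an RBF kernel with a median-distance bandwidth over the flattened particle parameters, plus the summed kernel gradient that acts as the repulsive term. The function name `rbf_kernel_and_grad` and the bandwidth heuristic are assumptions, not the repository's implementation.

    import torch

    def rbf_kernel_and_grad(theta):
        """Illustrative SVGD kernel computation (not the repository's _Kxx_dxKxx).

        theta: (n, d) tensor, one row of flattened parameters per particle.
        Returns the kernel matrix Kxx (n, n) and dxKxx (n, d), where row i of
        dxKxx is sum_j grad_{theta_j} k(theta_j, theta_i).
        """
        n = theta.shape[0]
        sq_dist = torch.cdist(theta, theta) ** 2                  # pairwise squared distances
        h = sq_dist.median() / torch.log(torch.tensor(n + 1.0))   # median bandwidth heuristic
        Kxx = torch.exp(-sq_dist / (h + 1e-8))
        # Repulsive term: (2/h) * (theta_i * sum_j K_ij - sum_j K_ij * theta_j)
        dxKxx = (2.0 / (h + 1e-8)) * (theta * Kxx.sum(dim=1, keepdim=True) - Kxx @ theta)
        return Kxx, dxKxx

With these two terms, the update in `train` reduces to grad_theta = (Kxx @ (policy_grads / temperature) + dxKxx) / nparticles: the kernel-weighted policy gradients pull particles toward high simulator reward while dxKxx keeps the particles spread apart.
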
/common/randomizer/fetch_hook_env.py:
--------------------------------------------------------------------------------
1 | from gym.envs.robotics import rotations, fetch_env
2 | from gym import utils, spaces
3 | import numpy as np
4 | import os
5 |
6 | from .controllers.hook_controller import get_hook_control
7 |
8 | import pdb
9 |
10 | DIR_PATH = os.path.dirname(os.path.abspath(__file__))
11 |
12 |
13 | class FetchHookEnv(fetch_env.FetchEnv, utils.EzPickle):
14 | def __init__(self, xml_file=None):
15 | initial_qpos = {
16 | 'robot0:slide0': 0.405,
17 | 'robot0:slide1': 0.48,
18 | 'robot0:slide2': 0.0,
19 | 'object0:joint': [1.25, 0.53, 0.4, 1., 0., 0., 0.],
20 | 'hook:joint': [1.35, 0.35, 0.4, 1., 0., 0., 0.],
21 | }
22 |
23 | if xml_file is None:
24 | xml_file = os.path.join(DIR_PATH, 'assets_residual', 'hook.xml')
25 |
26 | self._goal_pos = np.array([1.65, 0.75, 0.42])
27 | self._object_xpos = np.array([1.8, 0.75])
28 |
29 | fetch_env.FetchEnv.__init__(
30 | self, xml_file, has_object=True, block_gripper=False, n_substeps=20,
31 | gripper_extra_height=0.2, target_in_the_air=True, target_offset=0.0,
32 | obj_range=None, target_range=None, distance_threshold=0.05,
33 | initial_qpos=initial_qpos, reward_type='sparse')
34 |
35 | utils.EzPickle.__init__(self)
36 |
37 | def render(self, mode="human", *args, **kwargs):
38 | # See https://github.com/openai/gym/issues/1081
39 | self._render_callback()
40 | if mode == 'rgb_array':
41 | self._get_viewer().render()
42 | width, height = 3350, 1800
43 | data = self._get_viewer().read_pixels(width, height, depth=False)
44 | # original image is upside-down, so flip it
45 | return data[::-1, :, :]
46 | elif mode == 'human':
47 | self._get_viewer().render()
48 |
49 | return super(FetchHookEnv, self).render(*args, **kwargs)
50 |
51 | def _sample_goal(self):
52 | goal_pos = self._goal_pos.copy()
53 | goal_pos[:2] += self.np_random.uniform(-0.05, 0.05)
54 | return goal_pos
55 |
56 | def _viewer_setup(self):
57 | body_id = self.sim.model.body_name2id('robot0:gripper_link')
58 | lookat = self.sim.data.body_xpos[body_id]
59 | for idx, value in enumerate(lookat):
60 | self.viewer.cam.lookat[idx] = value
61 | self.viewer.cam.distance = 2.5
62 | self.viewer.cam.azimuth = 180.
63 | self.viewer.cam.elevation = -24.
64 |
65 | def _reset_sim(self):
66 | self.sim.set_state(self.initial_state)
67 |
68 |         # Resample the object position until it is at least 0.1 m away from the goal.
69 | while True:
70 | object_xpos_x = 1.8 + self.np_random.uniform(-0.05, 0.10)
71 | object_xpos_y = 0.75 + self.np_random.uniform(-0.05, 0.05)
72 | if (object_xpos_x - self._goal_pos[0]) ** 2 + (object_xpos_y - self._goal_pos[1]) ** 2 >= 0.01:
73 | break
74 | self._object_xpos = np.array([object_xpos_x, object_xpos_y])
75 |
76 | object_qpos = self.sim.data.get_joint_qpos('object0:joint')
77 | assert object_qpos.shape == (7,)
78 | object_qpos[:2] = self._object_xpos
79 | self.sim.data.set_joint_qpos('object0:joint', object_qpos)
80 |
81 | self.sim.forward()
82 | return True
83 |
84 | def _get_obs(self):
85 | obs = fetch_env.FetchEnv._get_obs(self)
86 |
87 | grip_pos = self.sim.data.get_site_xpos('robot0:grip')
88 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep
89 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt
90 |
91 | hook_pos = self.sim.data.get_site_xpos('hook')
92 | # rotations
93 | hook_rot = rotations.mat2euler(self.sim.data.get_site_xmat('hook'))
94 | # velocities
95 | hook_velp = self.sim.data.get_site_xvelp('hook') * dt
96 | hook_velr = self.sim.data.get_site_xvelr('hook') * dt
97 | # gripper state
98 | hook_rel_pos = hook_pos - grip_pos
99 | hook_velp -= grip_velp
100 |
101 | hook_observation = np.concatenate([hook_pos, hook_rot, hook_velp, hook_velr, hook_rel_pos])
102 |
103 | obs['observation'] = np.concatenate([obs['observation'], hook_observation])
104 |
105 | return obs
106 |
107 | def _noisify_obs(self, obs, noise=1.):
108 | return obs + np.random.normal(0, noise, size=obs.shape)
109 |
110 |
111 | class ResidualFetchHookEnv(FetchHookEnv):
112 |
113 | def step(self, residual_action):
114 | residual_action = 2. * residual_action
115 |
116 | action = np.add(residual_action, get_hook_control(self._last_observation))
117 | action = np.clip(action, -1, 1)
118 | observation, reward, done, debug_info = FetchHookEnv.step(self, action)
119 |
120 | self._last_observation = observation
121 |
122 | return observation, reward, done, debug_info
123 |
124 | def compute_reward(self, *args, **kwargs):
125 | return FetchHookEnv.compute_reward(self, *args, **kwargs)
126 |
127 | def reset(self):
128 | observation = FetchHookEnv.reset(self)
129 | self._last_observation = observation
130 |
131 | return observation
132 |
133 |
134 | class NoisyFetchHookEnv(FetchHookEnv):
135 |
136 | def _get_obs(self):
137 | obs = fetch_env.FetchEnv._get_obs(self)
138 |
139 | grip_pos = self.sim.data.get_site_xpos('robot0:grip')
140 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep
141 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt
142 |
143 | hook_pos = self.sim.data.get_site_xpos('hook')
144 | hook_pos = self._noisify_obs(hook_pos, noise=0.025)
145 | # rotations
146 | hook_rot = rotations.mat2euler(self.sim.data.get_site_xmat('hook'))
147 | hook_rot = self._noisify_obs(hook_rot, noise=0.025)
148 | # velocities
149 | hook_velp = self.sim.data.get_site_xvelp('hook') * dt
150 | hook_velr = self.sim.data.get_site_xvelr('hook') * dt
151 | # gripper state
152 | hook_rel_pos = hook_pos - grip_pos
153 | hook_velp -= grip_velp
154 |
155 | hook_observation = np.concatenate([hook_pos, hook_rot, hook_velp, hook_velr, hook_rel_pos])
156 |
157 | obs['observation'] = np.concatenate([obs['observation'], hook_observation])
158 | obs['observation'][3:5] = self._noisify_obs(obs['observation'][3:5], noise=0.025)
159 | obs['observation'][6:9] = obs['observation'][3:6] - obs['observation'][:3] # object_pos - grip_pos
160 | obs['observation'][12:15] = self._noisify_obs(obs['observation'][6:9], noise=0.025)
161 | return obs
162 |
163 | def _noisify_obs(self, obs, noise=1.):
164 | return obs + np.random.normal(0, noise, size=obs.shape)
165 |
166 | def compute_reward(self, *args, **kwargs):
167 | return FetchHookEnv.compute_reward(self, *args, **kwargs)
168 |
169 | def reset(self):
170 | observation = FetchHookEnv.reset(self)
171 | self._last_observation = observation
172 |
173 | return observation
174 |
175 |
176 | class TwoFrameResidualHookNoisyEnv(FetchHookEnv):
177 | def __init__(self, xml_file=None):
178 |         super(TwoFrameResidualHookNoisyEnv, self).__init__(xml_file)
179 |         # Double the observation bounds so that two stacked frames fit.
180 |         obs_space = self.observation_space.spaces['observation']
181 |         self.observation_space.spaces['observation'] = spaces.Box(
182 |             low=np.hstack((obs_space.low, obs_space.low)),
183 |             high=np.hstack((obs_space.high, obs_space.high)),
184 |             dtype=np.float32,
185 |         )
186 |
187 | def _get_obs(self):
188 | obs = fetch_env.FetchEnv._get_obs(self)
189 |
190 | grip_pos = self.sim.data.get_site_xpos('robot0:grip')
191 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep
192 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt
193 |
194 | hook_pos = self.sim.data.get_site_xpos('hook')
195 | hook_pos = self._noisify_obs(hook_pos, noise=0.025)
196 | # rotations
197 | hook_rot = rotations.mat2euler(self.sim.data.get_site_xmat('hook'))
198 | hook_rot = self._noisify_obs(hook_rot, noise=0.025)
199 | # velocities
200 | hook_velp = self.sim.data.get_site_xvelp('hook') * dt
201 | hook_velr = self.sim.data.get_site_xvelr('hook') * dt
202 | # gripper state
203 | hook_rel_pos = hook_pos - grip_pos
204 | hook_velp -= grip_velp
205 |
206 | hook_observation = np.concatenate([hook_pos, hook_rot, hook_velp, hook_velr, hook_rel_pos])
207 |
208 | obs['observation'] = np.concatenate([obs['observation'], hook_observation])
209 | obs['observation'][3:5] = self._noisify_obs(obs['observation'][3:5], noise=0.025)
210 | obs['observation'][6:9] = obs['observation'][3:6] - obs['observation'][:3] # object_pos - grip_pos
211 | obs['observation'][12:15] = self._noisify_obs(obs['observation'][6:9], noise=0.025)
212 | return obs
213 |
214 | def step(self, residual_action):
215 | residual_action = 2. * residual_action
216 |
217 | action = np.add(residual_action, get_hook_control(self._last_observation))
218 | action = np.clip(action, -1, 1)
219 | observation, reward, done, debug_info = FetchHookEnv.step(self, action)
220 |
221 | obs_out = observation.copy()
222 | obs_out['observation'] = np.hstack((self._last_observation['observation'], observation['observation']))
223 | self._last_observation = observation
224 |
225 | return obs_out, reward, done, debug_info
226 |
227 | def reset(self):
228 | observation = FetchHookEnv.reset(self)
229 | self._last_observation = observation.copy()
230 | observation['observation'] = np.hstack((self._last_observation['observation'], observation['observation']))
231 | return observation
232 |
233 | def _noisify_obs(self, obs, noise=1.):
234 | return obs + np.random.normal(0, noise, size=obs.shape)
235 |
236 |
237 | class TwoFrameHookNoisyEnv(FetchHookEnv):
238 | def __init__(self, xml_file=None):
239 |         super(TwoFrameHookNoisyEnv, self).__init__(xml_file)
240 |         # Double the observation bounds so that two stacked frames fit.
241 |         obs_space = self.observation_space.spaces['observation']
242 |         self.observation_space.spaces['observation'] = spaces.Box(
243 |             low=np.hstack((obs_space.low, obs_space.low)),
244 |             high=np.hstack((obs_space.high, obs_space.high)),
245 |             dtype=np.float32,
246 |         )
247 |
248 | def _get_obs(self):
249 | obs = fetch_env.FetchEnv._get_obs(self)
250 |
251 | grip_pos = self.sim.data.get_site_xpos('robot0:grip')
252 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep
253 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt
254 |
255 | hook_pos = self.sim.data.get_site_xpos('hook')
256 | hook_pos = self._noisify_obs(hook_pos, noise=0.025)
257 | # rotations
258 | hook_rot = rotations.mat2euler(self.sim.data.get_site_xmat('hook'))
259 | hook_rot = self._noisify_obs(hook_rot, noise=0.025)
260 | # velocities
261 | hook_velp = self.sim.data.get_site_xvelp('hook') * dt
262 | hook_velr = self.sim.data.get_site_xvelr('hook') * dt
263 | # gripper state
264 | hook_rel_pos = hook_pos - grip_pos
265 | hook_velp -= grip_velp
266 |
267 | hook_observation = np.concatenate([hook_pos, hook_rot, hook_velp, hook_velr, hook_rel_pos])
268 |
269 | obs['observation'] = np.concatenate([obs['observation'], hook_observation])
270 | obs['observation'][3:5] = self._noisify_obs(obs['observation'][3:5], noise=0.025)
271 | obs['observation'][6:9] = obs['observation'][3:6] - obs['observation'][:3] # object_pos - grip_pos
272 | obs['observation'][12:15] = self._noisify_obs(obs['observation'][6:9], noise=0.025)
273 | return obs
274 |
275 | def step(self, action):
276 | observation, reward, done, debug_info = FetchHookEnv.step(self, action)
277 |
278 | obs_out = observation.copy()
279 | obs_out['observation'] = np.hstack((self._last_observation['observation'], observation['observation']))
280 | self._last_observation = observation
281 |
282 | return obs_out, reward, done, debug_info
283 |
284 | def reset(self):
285 | observation = FetchHookEnv.reset(self)
286 | self._last_observation = observation.copy()
287 | observation['observation'] = np.hstack((self._last_observation['observation'], observation['observation']))
288 | return observation
289 |
290 | def _noisify_obs(self, obs, noise=1.):
291 | return obs + np.random.normal(0, noise, size=obs.shape)
292 |
293 |
294 | class NoisyResidualFetchHookEnv(FetchHookEnv):
295 |
296 | def step(self, residual_action):
297 | residual_action = 2. * residual_action
298 |
299 | action = np.add(residual_action, get_hook_control(self._last_observation))
300 | action = np.clip(action, -1, 1)
301 | observation, reward, done, debug_info = FetchHookEnv.step(self, action)
302 |
303 | self._last_observation = observation
304 |
305 | return observation, reward, done, debug_info
306 |
307 | def _get_obs(self):
308 | obs = fetch_env.FetchEnv._get_obs(self)
309 |
310 | grip_pos = self.sim.data.get_site_xpos('robot0:grip')
311 | dt = self.sim.nsubsteps * self.sim.model.opt.timestep
312 | grip_velp = self.sim.data.get_site_xvelp('robot0:grip') * dt
313 |
314 | hook_pos = self.sim.data.get_site_xpos('hook')
315 | hook_pos = self._noisify_obs(hook_pos, noise=0.025)
316 | # rotations
317 | hook_rot = rotations.mat2euler(self.sim.data.get_site_xmat('hook'))
318 | hook_rot = self._noisify_obs(hook_rot, noise=0.025)
319 | # velocities
320 | hook_velp = self.sim.data.get_site_xvelp('hook') * dt
321 | hook_velr = self.sim.data.get_site_xvelr('hook') * dt
322 | # gripper state
323 | hook_rel_pos = hook_pos - grip_pos
324 | hook_velp -= grip_velp
325 |
326 | hook_observation = np.concatenate([hook_pos, hook_rot, hook_velp, hook_velr, hook_rel_pos])
327 |
328 | obs['observation'] = np.concatenate([obs['observation'], hook_observation])
329 | obs['observation'][3:5] = self._noisify_obs(obs['observation'][3:5], noise=0.025)
330 | obs['observation'][6:9] = obs['observation'][3:6] - obs['observation'][:3] # object_pos - grip_pos
331 | obs['observation'][12:15] = self._noisify_obs(obs['observation'][6:9], noise=0.025)
332 | return obs
333 |
334 | def _noisify_obs(self, obs, noise=1.):
335 | return obs + np.random.normal(0, noise, size=obs.shape)
336 |
337 | def compute_reward(self, *args, **kwargs):
338 | return FetchHookEnv.compute_reward(self, *args, **kwargs)
339 |
340 | def reset(self):
341 | observation = FetchHookEnv.reset(self)
342 | self._last_observation = observation
343 |
344 | return observation
345 |
--------------------------------------------------------------------------------
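
Usage note for the residual variants above: `step` scales the residual action by 2, adds `get_hook_control(self._last_observation)`, and clips the sum to [-1, 1], so a zero residual falls back to the scripted hook controller, while the two-frame variants additionally stack the previous and current observation vectors. A minimal rollout sketch, assuming the module is importable as `common.randomizer.fetch_hook_env`, that MuJoCo and the hook assets are installed, and using a random placeholder in place of a trained residual policy:

    import numpy as np
    from common.randomizer.fetch_hook_env import ResidualFetchHookEnv

    env = ResidualFetchHookEnv()
    obs = env.reset()

    for _ in range(100):
        # Placeholder residual policy: small corrections on top of the
        # scripted hook controller that the environment adds internally.
        residual = np.random.uniform(-0.1, 0.1, size=env.action_space.shape)
        obs, reward, done, info = env.step(residual)
        if done:
            obs = env.reset()

Because the environment, not the agent, owns the scripted controller, the same residual-policy interface applies unchanged to `ResidualFetchHookEnv`, `NoisyResidualFetchHookEnv`, and `TwoFrameResidualHookNoisyEnv`.
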