├── requirements.txt
├── media
    └── teaser.gif
├── src
    ├── envs
    │   ├── task_object_names.json
    │   └── rlbench_env.py
    ├── prompts
    │   └── rlbench
    │   │   ├── get_velocity_map_prompt.txt
    │   │   ├── get_avoidance_map_prompt.txt
    │   │   ├── get_gripper_map_prompt.txt
    │   │   ├── parse_query_obj_prompt.txt
    │   │   ├── get_rotation_map_prompt.txt
    │   │   ├── planner_prompt.txt
    │   │   ├── get_affordance_map_prompt.txt
    │   │   └── composer_prompt.txt
    ├── dynamics_models.py
    ├── arguments.py
    ├── LLM_cache.py
    ├── configs
    │   └── rlbench_config.yaml
    ├── playground.ipynb
    ├── LMP.py
    ├── visualizers.py
    ├── planners.py
    ├── controllers.py
    ├── utils.py
    └── interfaces.py
├── LICENSE
└── README.md


/requirements.txt:
--------------------------------------------------------------------------------
1 | jupyter
2 | openai
3 | plotly
4 | transforms3d
5 | open3d


--------------------------------------------------------------------------------
/media/teaser.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huangwl18/VoxPoser/HEAD/media/teaser.gif


--------------------------------------------------------------------------------
/src/envs/task_object_names.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     "put_rubbish_in_bin": [["bin", "bin"], ["rubbish", "rubbish"], ["tomato1", "tomato1"], ["tomato2", "tomato2"]],
 3 |     "lamp_off": [["lamp", "lamp_base"], ["button", "push_button_target"]],
 4 |     "open_wine_bottle": [["bottle", "bottle"], ["cap", "cap"]],
 5 |     "push_button": [["button", "push_button_target"]],
 6 |     "take_off_weighing_scales": [["scale", "scales_tray_physical"], ["green pepper", "pepper0"], ["red pepper", "pepper1"], ["yellow pepper", "pepper2"]],
 7 |     "meat_off_grill": [["grill", "grill"], ["meat", "chicken"]],
 8 |     "slide_block_to_target": [["block", "block"], ["target", "target"]],
 9 |     "take_lid_off_saucepan": [["saucepan", "saucepan"], ["saucepan_lid", "saucepan_lid"]],
10 |     "take_umbrella_out_of_umbrella_stand": [["umbrella", "umbrella"], ["stand", "stand"]]
11 | }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2023 Wenlong Huang
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/prompts/rlbench/get_velocity_map_prompt.txt:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from plan_utils import get_empty_velocity_map, set_voxel_by_radius, cm2index
 3 | from perception_utils import parse_query_obj
 4 | 
 5 | # Query: faster when on the right side of the table and slower when on the left side of the table.
 6 | velocity_map = get_empty_velocity_map()
 7 | table = parse_query_obj('table')
 8 | center_x, center_y, center_z = table.position
 9 | # faster on right side so 1.5 when y > center_y, slower on left side so 0.5 when y < center_y
10 | velocity_map[:, center_y:, :] = 1.5
11 | velocity_map[:, :center_y, :] = 0.5
12 | ret_val = velocity_map
13 | 
14 | # Query: slow down by a quarter.
15 | velocity_map = get_empty_velocity_map()
16 | velocity_map[:] = 0.75
17 | ret_val = velocity_map
18 | 
19 | # Query: slow down by a half when you're near anything fragile (objects: ['block', 'fork', 'mug', 'bowl', 'chips']).
20 | velocity_map = get_empty_velocity_map()
21 | mug = parse_query_obj('mug')
22 | set_voxel_by_radius(velocity_map, mug.position, radius_cm=10, value=0.5)
23 | bowl = parse_query_obj('bowl')
24 | set_voxel_by_radius(velocity_map, bowl.position, radius_cm=10, value=0.5)
25 | ret_val = velocity_map
26 | 
27 | # Query: quarter of the speed when within 9cm from the yellow line.
28 | velocity_map = get_empty_velocity_map()
29 | yellow_line = parse_query_obj('yellow_line')
30 | set_voxel_by_radius(velocity_map, yellow_line.position, radius_cm=9, value=0.25)
31 | ret_val = velocity_map


--------------------------------------------------------------------------------
/src/prompts/rlbench/get_avoidance_map_prompt.txt:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from perception_utils import parse_query_obj
 3 | from plan_utils import get_empty_avoidance_map, set_voxel_by_radius, cm2index
 4 | 
 5 | # Query: 10cm from the bowl.
 6 | avoidance_map = get_empty_avoidance_map()
 7 | bowl = parse_query_obj('bowl')
 8 | set_voxel_by_radius(avoidance_map, bowl.position, radius_cm=10, value=1)
 9 | ret_val = avoidance_map
10 | 
11 | # Query: 20cm near the mug.
12 | avoidance_map = get_empty_avoidance_map()
13 | mug = parse_query_obj('mug')
14 | set_voxel_by_radius(avoidance_map, mug.position, radius_cm=20, value=1)
15 | ret_val = avoidance_map
16 | 
17 | # Query: 20cm around the mug and 10cm around the bowl.
18 | avoidance_map = get_empty_avoidance_map()
19 | mug = parse_query_obj('mug')
20 | set_voxel_by_radius(avoidance_map, mug.position, radius_cm=20, value=1)
21 | bowl = parse_query_obj('bowl')
22 | set_voxel_by_radius(avoidance_map, bowl.position, radius_cm=10, value=1)
23 | ret_val = avoidance_map
24 | 
25 | # Query: 10cm from anything fragile.
26 | avoidance_map = get_empty_avoidance_map()
27 | fragile_objects = parse_query_obj('anything fragile')
28 | for obj in fragile_objects:
29 |     set_voxel_by_radius(avoidance_map, obj.position, radius_cm=10, value=1)
30 | ret_val = avoidance_map
31 | 
32 | # Query: 10cm from the blue circle.
33 | avoidance_map = get_empty_avoidance_map()
34 | blue_circle = parse_query_obj('blue circle')
35 | set_voxel_by_radius(avoidance_map, blue_circle.position, radius_cm=10, value=1)
36 | ret_val = avoidance_map


--------------------------------------------------------------------------------
/src/dynamics_models.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | class PushingDynamicsModel:
 4 |     """
 5 |     Heuristics-based pushing dynamics model.
 6 |     Translates the object by gripper_moving_distance in gripper_direction.
 7 |     """
 8 |     def __init__(self):
 9 |         pass
10 | 
11 |     def forward(self, inputs, max_per_batch=2000):
12 |         """split inputs into multiple batches if exceeds max_per_batch"""
13 |         num_batch = int(np.ceil(inputs[0].shape[0] / max_per_batch))
14 |         output = []
15 |         for i in range(num_batch):
16 |             start = i * max_per_batch
17 |             end = (i + 1) * max_per_batch
18 |             output.append(self._forward_batched([x[start:end] for x in inputs]))
19 |         output = np.concatenate(output, axis=0)
20 |         return output
21 | 
22 |     def _forward_batched(self, inputs):
23 |         (pcs, contact_position, gripper_direction, gripper_moving_distance) = inputs
24 |         # to float16
25 |         pcs = pcs.astype(np.float16)
26 |         contact_position = contact_position.astype(np.float16)
27 |         gripper_direction = gripper_direction.astype(np.float16)
28 |         gripper_moving_distance = gripper_moving_distance.astype(np.float16)
29 |         # find invalid push (i.e., outward push)
30 |         obj_center = np.mean(pcs, axis=1)  # B x 3
31 |         is_outward = np.sum((obj_center - contact_position) * gripper_direction, axis=1) < 0  # B
32 |         moving_dist = gripper_moving_distance.copy()
33 |         moving_dist[is_outward] = 0
34 |         # translate pc by gripper_moving_distance in gripper_direction
35 |         output = pcs + moving_dist[:, np.newaxis, :] * gripper_direction[:, np.newaxis, :]  # B x N x 3
36 |         return output
37 | 


--------------------------------------------------------------------------------
/src/arguments.py:
--------------------------------------------------------------------------------
 1 | """load YAML config file"""
 2 | import os
 3 | import yaml
 4 | 
 5 | def load_config(config_path):
 6 |     with open(config_path, 'r') as f:
 7 |         config = yaml.load(f, Loader=yaml.FullLoader)
 8 |     return config
 9 | 
10 | def get_config(env=None, config_path=None):
11 |     assert env is None or config_path is None, 'env and config_path cannot be both specified'
12 |     if config_path is None:
13 |         assert env.lower() == 'rlbench'
14 |         config_path = './configs/rlbench_config.yaml'
15 |     assert config_path and os.path.exists(config_path), f'config file does not exist ({config_path})'
16 |     config = load_config(config_path)
17 |     # wrap dict such that we can access config through attribute
18 |     class ConfigDict(dict):
19 |         def __init__(self, config):
20 |             """recursively build config"""
21 |             self.config = config
22 |             for key, value in config.items():
23 |                 if isinstance(value, str) and value.lower() == 'none':
24 |                     value = None
25 |                 if isinstance(value, dict):
26 |                     self[key] = ConfigDict(value)
27 |                 else:
28 |                     self[key] = value
29 |         def __getattr__(self, key):
30 |             return self[key]
31 |         def __setattr__(self, key, value):
32 |             self[key] = value
33 |         def __delattr__(self, key):
34 |             del self[key]
35 |         def __getstate__(self):
36 |             return self.config
37 |         def __setstate__(self, state):
38 |             self.config = state
39 |             self.__init__(state)
40 |     config = ConfigDict(config)
41 |     return config
42 | 
43 | def main():
44 |     config = get_config(config_path='./configs/rlbench_config.yaml')
45 |     print(config)
46 | 
47 | 


--------------------------------------------------------------------------------
/src/prompts/rlbench/get_gripper_map_prompt.txt:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from perception_utils import parse_query_obj
 3 | from plan_utils import get_empty_gripper_map, set_voxel_by_radius, cm2index
 4 | 
 5 | # Query: open everywhere except 1cm around the green block.
 6 | gripper_map = get_empty_gripper_map()
 7 | # open everywhere
 8 | gripper_map[:, :, :] = 1
 9 | # close when 1cm around the green block
10 | green_block = parse_query_obj('green block')
11 | set_voxel_by_radius(gripper_map, green_block.position, radius_cm=1, value=0)
12 | ret_val = gripper_map
13 | 
14 | # Query: close everywhere but open when on top of the back left corner of the table.
15 | gripper_map = get_empty_gripper_map()
16 | # close everywhere
17 | gripper_map[:, :, :] = 0
18 | # open when on top of the back left corner of the table
19 | table = parse_query_obj('table')
20 | (min_x, min_y, min_z), (max_x, max_y, max_z) = table.aabb
21 | center_x, center_y, center_z = table.position
22 | # back so x = min_x, left so y = min_y, top so we add to z
23 | x = min_x
24 | y = min_y
25 | z = max_z + cm2index(15, 'z')
26 | set_voxel_by_radius(gripper_map, (x, y, z), radius_cm=10, value=1)
27 | ret_val = gripper_map
28 | 
29 | # Query: always open except when you are on the right side of the table.
30 | gripper_map = get_empty_gripper_map()
31 | # always open
32 | gripper_map[:, :, :] = 1
33 | # close when you are on the right side of the table
34 | table = parse_query_obj('table')
35 | center_x, center_y, center_z = table.position
36 | # right side so y is greater than center_y
37 | gripper_map[:, center_y:, :] = 0
38 | 
39 | # Query: always close except when you are on the back side of the table.
40 | gripper_map = get_empty_gripper_map()
41 | # always close
42 | gripper_map[:, :, :] = 0
43 | # open when you are on the back side of the table
44 | table = parse_query_obj('table')
45 | center_x, center_y, center_z = table.position
46 | # back side so x is less than center_x
47 | gripper_map[:center_x, :, :] = 1
48 | ret_val = gripper_map


--------------------------------------------------------------------------------
/src/LLM_cache.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pickle
 3 | import json
 4 | import hashlib
 5 | 
 6 | class DiskCache:
 7 |     """
 8 |     A convenient disk cache that stores key-value pairs on disk.
 9 |     Useful for querying LLM API.
10 |     """
11 |     def __init__(self, cache_dir='cache', load_cache=True):
12 |         self.cache_dir = cache_dir
13 |         self.data = {}
14 |         
15 |         if not os.path.exists(self.cache_dir):
16 |             os.makedirs(self.cache_dir)
17 |         else:
18 |             if load_cache:
19 |                 self._load_cache()
20 | 
21 |     def _generate_filename(self, key):
22 |         key_str = json.dumps(key)
23 |         key_hash = hashlib.sha1(key_str.encode('utf-8')).hexdigest()
24 |         return f"{key_hash}.pkl"
25 | 
26 |     def _load_cache(self):
27 |         for filename in os.listdir(self.cache_dir):
28 |             with open(os.path.join(self.cache_dir, filename), 'rb') as file:
29 |                 key, value = pickle.load(file)
30 |                 self.data[json.dumps(key)] = value
31 | 
32 |     def _save_to_disk(self, key, value):
33 |         filename = self._generate_filename(key)
34 |         with open(os.path.join(self.cache_dir, filename), 'wb') as file:
35 |             pickle.dump((key, value), file)
36 | 
37 |     def __setitem__(self, key, value):
38 |         str_key = json.dumps(key)
39 |         self.data[str_key] = value
40 |         self._save_to_disk(key, value)
41 | 
42 |     def __getitem__(self, key):
43 |         str_key = json.dumps(key)
44 |         return self.data[str_key]
45 |     
46 |     def __contains__(self, key):
47 |         str_key = json.dumps(key)
48 |         return str_key in self.data
49 | 
50 |     def __repr__(self):
51 |         return repr(self.data)
52 | 
53 | 
54 | if __name__ == '__main__':
55 |     cache = DiskCache()
56 |     cache_key = {'id': 1, 'name': 'John'}
57 |     cache[cache_key] = 'value1'
58 | 
59 |     print(cache)  # {"{'id': 1, 'name': 'John'}": 'value1'}
60 | 
61 |     # When you restart your program, the cache will load the previous key-value pairs
62 |     cache2 = DiskCache()
63 |     print(cache2)  # {"{'id': 1, 'name': 'John'}": 'value1'}
64 | 
65 | 
66 | 


--------------------------------------------------------------------------------
/src/prompts/rlbench/parse_query_obj_prompt.txt:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from perception_utils import detect
 3 | 
 4 | objects = ['green block', 'cardboard box']
 5 | # Query: gripper.
 6 | gripper = detect('gripper')
 7 | ret_val = gripper
 8 | 
 9 | objects = ['handle1', 'handle2', 'egg1', 'egg2', 'plate']
10 | # Query: topmost handle.
11 | handle1 = detect('handle1')
12 | handle2 = detect('handle2')
13 | if handle1.position[2] > handle2.position[2]:
14 |     top_handle = handle1
15 | else:
16 |     top_handle = handle2
17 | ret_val = top_handle
18 | 
19 | objects = ['vase', 'napkin box', 'mask']
20 | # Query: table.
21 | table = detect('table')
22 | ret_val = table
23 | 
24 | objects = ['brown line', 'red block', 'monitor']
25 | # Query: brown line.
26 | brown_line = detect('brown line')
27 | ret_val = brown_line
28 | 
29 | objects = ['green block', 'cup holder', 'black block']
30 | # Query: any block.
31 | block = detect('green block')
32 | ret_val = block
33 | 
34 | objects = ['mouse', 'yellow bowl', 'brown bowl', 'sticker']
35 | # Query: bowl closest to the sticker.
36 | yellow_bowl = detect('yellow bowl')
37 | brown_bowl = detect('brown bowl')
38 | sticker = detect('sticker')
39 | if np.linalg.norm(yellow_bowl.position - sticker.position) < np.linalg.norm(brown_bowl.position - sticker.position):
40 |     closest_bowl = yellow_bowl
41 | else:
42 |     closest_bowl = brown_bowl
43 | ret_val = closest_bowl
44 | 
45 | objects = ['grape', 'wood tray', 'strawberry', 'white tray', 'blue tray', 'bread']
46 | # Query: tray that contains the bread.
47 | wood_tray = detect('wood tray')
48 | white_tray = detect('white tray')
49 | bread = detect('bread')
50 | if np.linalg.norm(wood_tray.position - bread.position) < np.linalg.norm(white_tray.position - bread.position):
51 |     tray_with_bread = wood_tray
52 | else:
53 |     tray_with_bread = white_tray
54 | ret_val = tray_with_bread
55 | 
56 | objects = ['glass', 'vase', 'plastic bottle', 'block', 'phone case']
57 | # Query: anything fragile.
58 | fragile_items = []
59 | for obj in ['glass', 'vase']:
60 |     item = detect(obj)
61 |     fragile_items.append(item)
62 | ret_val = fragile_items
63 | 
64 | objects = ['blue block', 'red block']
65 | # Query: green block.
66 | ret_val = None


--------------------------------------------------------------------------------
/src/prompts/rlbench/get_rotation_map_prompt.txt:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from plan_utils import get_empty_rotation_map, set_voxel_by_radius, cm2index, vec2quat
 3 | from perception_utils import parse_query_obj
 4 | from transforms3d.euler import euler2quat, quat2euler
 5 | from transforms3d.quaternions import qmult, qinverse
 6 | 
 7 | # Query: face the support surface of the bowl.
 8 | rotation_map = get_empty_rotation_map()
 9 | bowl = parse_query_obj('bowl')
10 | target_rotation = vec2quat(-bowl.normal)
11 | rotation_map[:, :, :] = target_rotation
12 | ret_val = rotation_map
13 | 
14 | # Query: face the table when within 30cm from table center.
15 | rotation_map = get_empty_rotation_map()
16 | table = parse_query_obj('table')
17 | table_center = table.position
18 | target_rotation = vec2quat(-table.normal)
19 | set_voxel_by_radius(rotation_map, table_center, radius_cm=30, value=target_rotation)
20 | ret_val = rotation_map
21 | 
22 | # Query: face the blue bowl.
23 | rotation_map = get_empty_rotation_map()
24 | blue_bowl = parse_query_obj('brown block')
25 | target_rotation = vec2quat(-blue_bowl.normal)
26 | rotation_map[:, :, :] = target_rotation
27 | ret_val = rotation_map
28 | 
29 | # Query: turn clockwise by 45 degrees when at the center of the beer cap.
30 | rotation_map = get_empty_rotation_map()
31 | beer_cap = parse_query_obj('beer cap')
32 | (x, y, z) = beer_cap.position
33 | curr_rotation = rotation_map[x, y, z]
34 | rotation_delta = euler2quat(0, 0, np.pi / 4)
35 | rotation_map[x, y, z] = qmult(curr_rotation, rotation_delta)
36 | ret_val = rotation_map
37 | 
38 | # Query: turn counter-clockwise by 30 degrees.
39 | rotation_map = get_empty_rotation_map()
40 | curr_rotation = rotation_map[0, 0, 0]
41 | rotation_delta = euler2quat(0, 0, -np.pi / 6)
42 | rotation_map[:, :, :] = qmult(curr_rotation, rotation_delta)
43 | ret_val = rotation_map
44 | 
45 | # Query: rotate the gripper to be 45 degrees slanted relative to the plate.
46 | rotation_map = get_empty_rotation_map()
47 | plate = parse_query_obj('plate')
48 | face_plate_quat = vec2quat(-plate.normal)
49 | # rotate 45 degrees around the x-axis
50 | rotation_delta = euler2quat(-np.pi / 4, 0, 0)
51 | target_rotation = qmult(face_plate_quat, rotation_delta)
52 | rotation_map[:, :, :] = target_rotation
53 | ret_val = rotation_map


--------------------------------------------------------------------------------
/src/prompts/rlbench/planner_prompt.txt:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from env_utils import execute
  3 | from perception_utils import parse_query_obj
  4 | import action_utils import composer
  5 | 
  6 | objects = ['blue block', 'yellow block', 'mug']
  7 | # Query: place the blue block on the yellow block, and avoid the mug at all time.
  8 | composer("grasp the blue block while keeping at least 15cm away from the mug")
  9 | composer("back to default pose")
 10 | composer("move to 5cm on top of the yellow block while keeping at least 15cm away from the mug")
 11 | composer("open gripper")
 12 | # done
 13 | 
 14 | objects = ['airpods', 'drawer']
 15 | # Query: Open the drawer slowly.
 16 | composer("grasp the drawer handle, at 0.5x speed")
 17 | composer("move away from the drawer handle by 25cm, at 0.5x speed")
 18 | composer("open gripper, at 0.5x speed")
 19 | # done
 20 | 
 21 | objects = ['tissue box', 'tissue', 'bowl']
 22 | # Query: Can you pass me a tissue and place it next to the bowl?
 23 | composer("grasp the tissue")
 24 | composer("back to default pose")
 25 | composer("move to 10cm to the right of the bowl")
 26 | composer("open gripper")
 27 | composer("back to default pose")
 28 | # done
 29 | 
 30 | objects = ['charger', 'outlet']
 31 | # Query: unplug the charger from the wall.
 32 | composer("grasp the charger")
 33 | composer("back to default pose")
 34 | # done
 35 | 
 36 | objects = ['grape', 'lemon', 'drill', 'router', 'bread', 'tray']
 37 | # Query: put the sweeter fruit in the tray that contains the bread.
 38 | composer("grasp the grape")
 39 | composer("back to default pose")
 40 | composer("move to the top of the tray that contains the bread")
 41 | composer("open gripper")
 42 | # done
 43 | 
 44 | objects = ['marbles', 'tray', 'broom']
 45 | # Query: Can you sweep the marbles into the tray?
 46 | composer("grasp the broom")
 47 | composer("back to default pose")
 48 | composer("push the marbles into the tray")
 49 | # done
 50 | 
 51 | objects = ['orange', 'QR code', 'lemon', 'drawer']
 52 | # Query: put the sour fruit into the top drawer.
 53 | composer("grasp the top drawer handle")
 54 | composer("move away from the top drawer handle by 25cm")
 55 | composer("open gripper")
 56 | composer("back to default pose")
 57 | composer("grasp the lemon")
 58 | composer("move to 10cm on top of the top drawer")
 59 | composer("open gripper")
 60 | # done
 61 | 
 62 | objects = ['fridge', 'hot soup']
 63 | # Query: Open the fridge door and be careful around the hot soup.
 64 | composer("grasp the fridge handle and keep at least 15cm away from the hot soup")
 65 | composer("move away from the fridge handle by 25cm and keep at least 15cm away from the hot soup")
 66 | composer("open gripper")
 67 | # done
 68 | 
 69 | objects = ['cyan bowl', 'yellow bowl', 'box', 'ice cream']
 70 | # Query: move to the top of the cyan bowl.
 71 | composer("move to the top of the cyan bowl")
 72 | # done
 73 | 
 74 | objects = ['drawer', 'umbrella']
 75 | # Query: close the drawer.
 76 | composer("push close the drawer handle by 25cm")
 77 | # done
 78 | 
 79 | objects = ['iPhone', 'airpods']
 80 | # Query: slide the iPhone towards the airpods.
 81 | composer("push the iPhone towards the airpods")
 82 | # done
 83 | 
 84 | objects = ['plate', 'steak', 'fork', 'knife', 'spoon']
 85 | # Query: Could you please set up the fork for the steak for me?
 86 | composer("grasp the fork")
 87 | composer("back to default pose")
 88 | composer("move to 10cm to the right of the plate")
 89 | composer("open gripper")
 90 | composer("back to default pose")
 91 | # done
 92 | 
 93 | objects = ['lamp', 'switch']
 94 | # Query: Turn off the lamp.
 95 | composer("close the gripper")
 96 | composer("move to the center of the switch")
 97 | composer("back to default pose")
 98 | # done
 99 | 
100 | objects = ['beer']
101 | # Query: turn close the beer.
102 | composer("grasp the beer cap")
103 | composer("turn clockwise by 180 degrees")
104 | composer("back to default pose")
105 | # done
106 | 
107 | objects = ['steak', 'grill', 'plate']
108 | # Query: Take the steak out of the grill and put it flat on the plate.
109 | composer("grasp the steak")
110 | composer("back to default pose")
111 | composer("rotate the gripper to be 45 degrees slanted relative to the plate")
112 | composer("move to 10cm on top of the plate")
113 | composer("open gripper")
114 | composer("back to default pose")
115 | # done


--------------------------------------------------------------------------------
/src/configs/rlbench_config.yaml:
--------------------------------------------------------------------------------
  1 | env_name: rlbench
  2 | 
  3 | planner:
  4 |   stop_threshold: 0.001
  5 |   savgol_polyorder: 3
  6 |   savgol_window_size: 20
  7 |   obstacle_map_weight: 1
  8 |   max_steps: 300
  9 |   obstacle_map_gaussian_sigma: 10
 10 |   target_map_weight: 2
 11 |   stop_criteria: no_nearby_equal
 12 |   target_spacing: 1
 13 |   max_curvature: 3
 14 |   pushing_skip_per_k: 5
 15 | 
 16 | controller:
 17 |   horizon_length: 1
 18 |   num_samples: 10000
 19 |   ee_local: temperature
 20 |   ee_local_radius: 0.15
 21 | 
 22 | visualizer:
 23 |   save_dir: ./visualizations
 24 |   quality: low
 25 |   map_size: 100
 26 | 
 27 | lmp_config:
 28 |   env:
 29 |     map_size: 100
 30 |     num_waypoints_per_plan: 10000  # set to a large number since we only do open loop for sim
 31 |     max_plan_iter: 1
 32 |     visualize: True
 33 |   lmps:
 34 |     planner:
 35 |       prompt_fname: planner_prompt
 36 |       model: gpt-4
 37 |       max_tokens: 512
 38 |       temperature: 0
 39 |       query_prefix: '# Query: '
 40 |       query_suffix: '.'
 41 |       stop:
 42 |         - '# Query: '
 43 |         - 'objects = '
 44 |       maintain_session: False
 45 |       include_context: True
 46 |       has_return: False
 47 |       return_val_name: ret_val
 48 |       load_cache: True
 49 |     composer:
 50 |       prompt_fname: composer_prompt
 51 |       model: gpt-4
 52 |       max_tokens: 512
 53 |       temperature: 0
 54 |       query_prefix: '# Query: '
 55 |       query_suffix: '.'
 56 |       stop:
 57 |         - '# Query: '
 58 |         - 'objects ='
 59 |       maintain_session: False
 60 |       include_context: False
 61 |       has_return: False
 62 |       return_val_name: ret_val
 63 |       load_cache: True
 64 |     parse_query_obj:
 65 |       prompt_fname: parse_query_obj_prompt
 66 |       model: gpt-4
 67 |       max_tokens: 512
 68 |       temperature: 0
 69 |       query_prefix: '# Query: '
 70 |       query_suffix: '.'
 71 |       stop:
 72 |         - '# Query: '
 73 |         - 'objects ='
 74 |       maintain_session: False
 75 |       include_context: True
 76 |       has_return: True
 77 |       return_val_name: ret_val
 78 |       load_cache: True
 79 |     get_affordance_map:
 80 |       prompt_fname: get_affordance_map_prompt
 81 |       model: gpt-4
 82 |       max_tokens: 512
 83 |       temperature: 0
 84 |       query_prefix: '# Query: '
 85 |       query_suffix: '.'
 86 |       stop:
 87 |         - '# Query: '
 88 |         - 'objects ='
 89 |       maintain_session: False
 90 |       include_context: False
 91 |       has_return: True
 92 |       return_val_name: ret_val
 93 |       load_cache: True
 94 |     get_avoidance_map:
 95 |       prompt_fname: get_avoidance_map_prompt
 96 |       model: gpt-4
 97 |       max_tokens: 512
 98 |       temperature: 0
 99 |       query_prefix: '# Query: '
100 |       query_suffix: '.'
101 |       stop:
102 |         - '# Query: '
103 |         - 'objects ='
104 |       maintain_session: False
105 |       include_context: False
106 |       has_return: True
107 |       return_val_name: ret_val
108 |       load_cache: True
109 |     get_velocity_map:
110 |       prompt_fname: get_velocity_map_prompt
111 |       model: gpt-4
112 |       max_tokens: 512
113 |       temperature: 0
114 |       query_prefix: '# Query: '
115 |       query_suffix: '.'
116 |       stop:
117 |         - '# Query: '
118 |         - 'objects ='
119 |       maintain_session: False
120 |       include_context: False
121 |       has_return: True
122 |       return_val_name: ret_val
123 |       load_cache: True
124 |     get_rotation_map:
125 |       prompt_fname: get_rotation_map_prompt
126 |       model: gpt-4
127 |       max_tokens: 512
128 |       temperature: 0
129 |       query_prefix: '# Query: '
130 |       query_suffix: '.'
131 |       stop:
132 |         - '# Query: '
133 |         - 'objects ='
134 |       maintain_session: False
135 |       include_context: False
136 |       has_return: True
137 |       return_val_name: ret_val
138 |       load_cache: True
139 |     get_gripper_map:
140 |       prompt_fname: get_gripper_map_prompt
141 |       model: gpt-4
142 |       max_tokens: 512
143 |       temperature: 0
144 |       query_prefix: '# Query: '
145 |       query_suffix: '.'
146 |       stop:
147 |         - '# Query: '
148 |         - 'objects ='
149 |       maintain_session: False
150 |       include_context: False
151 |       has_return: True
152 |       return_val_name: ret_val
153 |       load_cache: True


--------------------------------------------------------------------------------
/src/playground.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import openai\n",
 10 |     "from arguments import get_config\n",
 11 |     "from interfaces import setup_LMP\n",
 12 |     "from visualizers import ValueMapVisualizer\n",
 13 |     "from envs.rlbench_env import VoxPoserRLBench\n",
 14 |     "from utils import set_lmp_objects\n",
 15 |     "import numpy as np\n",
 16 |     "from rlbench import tasks\n",
 17 |     "\n",
 18 |     "openai.api_key = None  # set your API key here"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "attachments": {},
 23 |    "cell_type": "markdown",
 24 |    "metadata": {},
 25 |    "source": [
 26 |     "## Setup"
 27 |    ]
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": null,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": [
 35 |     "config = get_config('rlbench')\n",
 36 |     "# uncomment this if you'd like to change the language model (e.g., for faster speed or lower cost)\n",
 37 |     "# for lmp_name, cfg in config['lmp_config']['lmps'].items():\n",
 38 |     "#     cfg['model'] = 'gpt-3.5-turbo'\n",
 39 |     "\n",
 40 |     "# initialize env and voxposer ui\n",
 41 |     "visualizer = ValueMapVisualizer(config['visualizer'])\n",
 42 |     "env = VoxPoserRLBench(visualizer=visualizer)\n",
 43 |     "lmps, lmp_env = setup_LMP(env, config, debug=False)\n",
 44 |     "voxposer_ui = lmps['plan_ui']"
 45 |    ]
 46 |   },
 47 |   {
 48 |    "attachments": {},
 49 |    "cell_type": "markdown",
 50 |    "metadata": {},
 51 |    "source": [
 52 |     "## Playground\n",
 53 |     "\n",
 54 |     "By default we use one of the instructions that come with each task. However, you may treat each task as simply a scene initialization from RLBench, and feel free to try any task you can come up with for the scene.\n",
 55 |     "\n",
 56 |     "Note:\n",
 57 |     "- Whether an instruction can be executed or not depends on 1) whether relevant objects are available, and 2) capabilities of the overall algorithm.\n",
 58 |     "- Each execution may produce one or more visualizations. You may view them in \"./visualizations/\" folder.\n",
 59 |     "- The prompts are adapted with minimal change from the real-world environment in the VoxPoser paper. If a task failure is due to incorrect code generated by the LLM, feel free to modify the relevant prompt in \"./prompts/\" folder.\n",
 60 |     "- You may view the reward by printing \"env.latest_reward\". These are computed by RLBench for each task.\n",
 61 |     "- To inspect in viewer without performing any action, you may call \"env.rlbench_env._scene.step()\" in a loop."
 62 |    ]
 63 |   },
 64 |   {
 65 |    "cell_type": "code",
 66 |    "execution_count": null,
 67 |    "metadata": {},
 68 |    "outputs": [],
 69 |    "source": [
 70 |     "# # uncomment this to show all available tasks in rlbench\n",
 71 |     "# # NOTE: in order to run a new task, you need to add the list of objects (and their corresponding env names) to the \"task_object_names.json\" file. See README for more details.\n",
 72 |     "# print([task for task in dir(tasks) if task[0].isupper() and not '_' in task])"
 73 |    ]
 74 |   },
 75 |   {
 76 |    "cell_type": "code",
 77 |    "execution_count": null,
 78 |    "metadata": {},
 79 |    "outputs": [],
 80 |    "source": [
 81 |     "# below are the tasks that have object names added to the \"task_object_names.json\" file\n",
 82 |     "# uncomment one to use\n",
 83 |     "env.load_task(tasks.PutRubbishInBin)\n",
 84 |     "# env.load_task(tasks.LampOff)\n",
 85 |     "# env.load_task(tasks.OpenWineBottle)\n",
 86 |     "# env.load_task(tasks.PushButton)\n",
 87 |     "# env.load_task(tasks.TakeOffWeighingScales)\n",
 88 |     "# env.load_task(tasks.MeatOffGrill)\n",
 89 |     "# env.load_task(tasks.SlideBlockToTarget)\n",
 90 |     "# env.load_task(tasks.TakeLidOffSaucepan)\n",
 91 |     "# env.load_task(tasks.TakeUmbrellaOutOfUmbrellaStand)\n",
 92 |     "\n",
 93 |     "descriptions, obs = env.reset()\n",
 94 |     "set_lmp_objects(lmps, env.get_object_names())  # set the object names to be used by voxposer"
 95 |    ]
 96 |   },
 97 |   {
 98 |    "cell_type": "code",
 99 |    "execution_count": null,
100 |    "metadata": {},
101 |    "outputs": [],
102 |    "source": [
103 |     "instruction = np.random.choice(descriptions)\n",
104 |     "voxposer_ui(instruction)"
105 |    ]
106 |   }
107 |  ],
108 |  "metadata": {
109 |   "kernelspec": {
110 |    "display_name": "voxposer-rlbench",
111 |    "language": "python",
112 |    "name": "python3"
113 |   },
114 |   "language_info": {
115 |    "codemirror_mode": {
116 |     "name": "ipython",
117 |     "version": 3
118 |    },
119 |    "file_extension": ".py",
120 |    "mimetype": "text/x-python",
121 |    "name": "python",
122 |    "nbconvert_exporter": "python",
123 |    "pygments_lexer": "ipython3",
124 |    "version": "3.9.18"
125 |   },
126 |   "orig_nbformat": 4,
127 |   "vscode": {
128 |    "interpreter": {
129 |     "hash": "d3e8e1ce6146f4dbb0e1ef1d424d742293054e718434205b504bd28714852756"
130 |    }
131 |   }
132 |  },
133 |  "nbformat": 4,
134 |  "nbformat_minor": 2
135 | }
136 | 


--------------------------------------------------------------------------------
/src/prompts/rlbench/get_affordance_map_prompt.txt:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from perception_utils import parse_query_obj
  3 | from plan_utils import get_empty_affordance_map, set_voxel_by_radius, cm2index
  4 | 
  5 | # Query: a point 10cm in front of [10, 15, 60].
  6 | affordance_map = get_empty_affordance_map()
  7 | # 10cm in front of so we add to x-axis
  8 | x = 10 + cm2index(10, 'x')
  9 | y = 15
 10 | z = 60
 11 | affordance_map[x, y, z] = 1
 12 | ret_val = affordance_map
 13 | 
 14 | # Query: a point on the right side of the table.
 15 | affordance_map = get_empty_affordance_map()
 16 | table = parse_query_obj('table')
 17 | (min_x, min_y, min_z), (max_x, max_y, max_z) = table.aabb
 18 | center_x, center_y, center_z = table.position
 19 | # right side so y = max_y
 20 | x = center_x
 21 | y = max_y
 22 | z = center_z
 23 | affordance_map[x, y, z] = 1
 24 | ret_val = affordance_map
 25 | 
 26 | # Query: a point 20cm on top of the container.
 27 | affordance_map = get_empty_affordance_map()
 28 | container = parse_query_obj('container')
 29 | (min_x, min_y, min_z), (max_x, max_y, max_z) = container.aabb
 30 | center_x, center_y, center_z = container.position
 31 | # 20cm on top of so we add to z-axis
 32 | x = center_x
 33 | y = center_y
 34 | z = max_z + cm2index(20, 'z')
 35 | affordance_map[x, y, z] = 1
 36 | ret_val = affordance_map
 37 | 
 38 | # Query: a point 1cm to the left of the brown block.
 39 | affordance_map = get_empty_affordance_map()
 40 | brown_block = parse_query_obj('brown block')
 41 | (min_x, min_y, min_z), (max_x, max_y, max_z) = brown_block.aabb
 42 | center_x, center_y, center_z = brown_block.position
 43 | # 1cm to the left of so we subtract from y-axis
 44 | x = center_x
 45 | y = min_y - cm2index(1, 'y')
 46 | z = center_z
 47 | affordance_map[x, y, z] = 1
 48 | ret_val = affordance_map
 49 | 
 50 | # Query: anywhere within 20cm of the right most block.
 51 | affordance_map = get_empty_affordance_map()
 52 | right_most_block = parse_query_obj('the right most block')
 53 | set_voxel_by_radius(affordance_map, right_most_block.position, radius_cm=20, value=1)
 54 | 
 55 | # Query: a point on the back side of the table.
 56 | affordance_map = get_empty_affordance_map()
 57 | table = parse_query_obj('table')
 58 | (min_x, min_y, min_z), (max_x, max_y, max_z) = table.aabb
 59 | center_x, center_y, center_z = table.position
 60 | # back side so x = min_x
 61 | x = min_x
 62 | y = center_y
 63 | z = center_z
 64 | affordance_map[x, y, z] = 1
 65 | ret_val = affordance_map
 66 | 
 67 | # Query: a point on the front right corner of the table.
 68 | affordance_map = get_empty_affordance_map()
 69 | table = parse_query_obj('table')
 70 | (min_x, min_y, min_z), (max_x, max_y, max_z) = table.aabb
 71 | center_x, center_y, center_z = table.position
 72 | # front right corner so x = max_x and y = max_y
 73 | x = max_x
 74 | y = max_y
 75 | z = center_z
 76 | affordance_map[x, y, z] = 1
 77 | ret_val = affordance_map
 78 | 
 79 | # Query: a point 30cm into the topmost drawer handle.
 80 | affordance_map = get_empty_affordance_map()
 81 | top_handle = parse_query_obj('topmost drawer handle')
 82 | # negative normal because we are moving into the handle.
 83 | moving_dir = -top_handle.normal
 84 | affordance_xyz = top_handle.position + cm2index(30, moving_dir)
 85 | affordance_map[affordance_xyz[0], affordance_xyz[1], affordance_xyz[2]] = 1
 86 | ret_val = affordance_map
 87 | 
 88 | # Query: a point 5cm above the blue block.
 89 | affordance_map = get_empty_affordance_map()
 90 | blue_block = parse_query_obj('blue block')
 91 | (min_x, min_y, min_z), (max_x, max_y, max_z) = blue_block.aabb
 92 | center_x, center_y, center_z = blue_block.position
 93 | # 5cm above so we add to z-axis
 94 | x = center_x
 95 | y = center_y
 96 | z = max_z + cm2index(5, 'z')
 97 | affordance_map[x, y, z] = 1
 98 | ret_val = affordance_map
 99 | 
100 | # Query: a point 20cm away from the leftmost block.
101 | affordance_map = get_empty_affordance_map()
102 | leftmost_block = parse_query_obj('leftmost block')
103 | # positive normal because we are moving away from the block.
104 | moving_dir = leftmost_block.normal
105 | affordance_xyz = leftmost_block.position + cm2index(20, moving_dir)
106 | affordance_map[affordance_xyz[0], affordance_xyz[1], affordance_xyz[2]] = 1
107 | ret_val = affordance_map
108 | 
109 | # Query: a point 4cm to the left of and 10cm on top of the tray that contains the lemon.
110 | affordance_map = get_empty_affordance_map()
111 | tray_with_lemon = parse_query_obj('tray that contains the lemon')
112 | (min_x, min_y, min_z), (max_x, max_y, max_z) = tray_with_lemon.aabb
113 | center_x, center_y, center_z = tray_with_lemon.position
114 | # 4cm to the left of so we subtract from y-axis, and 10cm on top of so we add to z-axis
115 | x = center_x
116 | y = min_y - cm2index(4, 'y')
117 | z = max_z + cm2index(10, 'z')
118 | affordance_map[x, y, z] = 1
119 | ret_val = affordance_map
120 | 
121 | # Query: a point 10cm to the right of [45 49 66], and 5cm above it.
122 | affordance_map = get_empty_affordance_map()
123 | # 10cm to the right of so we add to y-axis, and 5cm above it so we add to z-axis
124 | x = 45
125 | y = 49 + cm2index(10, 'y')
126 | z = 66 + cm2index(5, 'z')
127 | affordance_map[x, y, z] = 1
128 | ret_val = affordance_map
129 | 
130 | # Query: the blue circle.
131 | affordance_map = get_empty_affordance_map()
132 | blue_circle = parse_query_obj('blue circle')
133 | affordance_map = blue_circle.occupancy_map
134 | ret_val = affordance_map
135 | 
136 | # Query: a point at the center of the blue circle.
137 | affordance_map = get_empty_affordance_map()
138 | blue_circle = parse_query_obj('blue circle')
139 | x, y, z = blue_block.position
140 | affordance_map[x, y, z] = 1
141 | ret_val = affordance_map
142 | 
143 | # Query: a point 10cm above and 5cm to the left of the yellow bowl.
144 | affordance_map = get_empty_affordance_map()
145 | yellow_bowl = parse_query_obj('yellow bowl')
146 | (min_x, min_y, min_z), (max_x, max_y, max_z) = yellow_bowl.aabb
147 | center_x, center_y, center_z = yellow_bowl.position
148 | # 10cm above so we add to z-axis, and 5cm to the left of so we subtract from y-axis
149 | x = center_x
150 | y = min_y - cm2index(5, 'y')
151 | z = max_z + cm2index(10, 'z')
152 | affordance_map[x, y, z] = 1
153 | ret_val = affordance_mapS


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## VoxPoser: Composable 3D Value Maps for Robotic Manipulation with Language Models
 2 | 
 3 | #### [[Project Page]](https://voxposer.github.io/) [[Paper]](https://voxposer.github.io/voxposer.pdf) [[Video]](https://www.youtube.com/watch?v=Yvn4eR05A3M)
 4 | 
 5 | [Wenlong Huang](https://wenlong.page)<sup>1</sup>, [Chen Wang](https://www.chenwangjeremy.net/)<sup>1</sup>, [Ruohan Zhang](https://ai.stanford.edu/~zharu/)<sup>1</sup>, [Yunzhu Li](https://yunzhuli.github.io/)<sup>1,2</sup>, [Jiajun Wu](https://jiajunwu.com/)<sup>1</sup>, [Li Fei-Fei](https://profiles.stanford.edu/fei-fei-li)<sup>1</sup>
 6 | 
 7 | <sup>1</sup>Stanford University, <sup>2</sup>University of Illinois Urbana-Champaign
 8 | 
 9 | <img  src="media/teaser.gif" width="550">
10 | 
11 | This is the official demo code for [VoxPoser](https://voxposer.github.io/), a method that uses large language models and vision-language models to zero-shot synthesize trajectories for manipulation tasks.
12 | 
13 | In this repo, we provide the implementation of VoxPoser in [RLBench](https://sites.google.com/view/rlbench) as its task diversity best resembles our real-world setup. Note that VoxPoser is a zero-shot method that does not require any training data. Therefore, the main purpose of this repo is to provide a demo implementation rather than an evaluation benchmark.
14 | 
15 | **Note: This codebase currently does not contain the perception pipeline used in our real-world experiments, which produces a real-time mapping from object names to object masks. Instead, it uses the object masks provided as part of RLBench's `get_observation` function. If you are interested in deploying the code on a real robot, you may find more information in the section [Real World Deployment](#real-world-deployment).**
16 | 
17 | If you find this work useful in your research, please cite using the following BibTeX:
18 | 
19 | ```bibtex
20 | @article{huang2023voxposer,
21 |       title={VoxPoser: Composable 3D Value Maps for Robotic Manipulation with Language Models},
22 |       author={Huang, Wenlong and Wang, Chen and Zhang, Ruohan and Li, Yunzhu and Wu, Jiajun and Fei-Fei, Li},
23 |       journal={arXiv preprint arXiv:2307.05973},
24 |       year={2023}
25 |     }
26 | ```
27 | 
28 | ## Setup Instructions
29 | 
30 | Note that this codebase is best run with a display. For running in headless mode, refer to the [instructions in RLBench](https://github.com/stepjam/RLBench#running-headless).
31 | 
32 | - Create a conda environment:
33 | ```Shell
34 | conda create -n voxposer-env python=3.9
35 | conda activate voxposer-env
36 | ```
37 | 
38 | - See [Instructions](https://github.com/stepjam/RLBench#install) to install PyRep and RLBench (Note: install these inside the created conda environment).
39 | 
40 | - Install other dependencies:
41 | ```Shell
42 | pip install -r requirements.txt
43 | ```
44 | 
45 | - Obtain an [OpenAI API](https://openai.com/blog/openai-api) key, and put it inside the first cell of the demo notebook.
46 | 
47 | ## Running Demo
48 | 
49 | Demo code is at `src/playground.ipynb`. Instructions can be found in the notebook.
50 | 
51 | ## Code Structure
52 | 
53 | Core to VoxPoser:
54 | 
55 | - **`playground.ipynb`**: Playground for VoxPoser.
56 | - **`LMP.py`**: Implementation of Language Model Programs (LMPs) that recursively generates code to decompose instructions and compose value maps for each sub-task.
57 | - **`interfaces.py`**: Interface that provides necessary APIs for language models (i.e., LMPs) to operate in voxel space and to invoke motion planner.
58 | - **`planners.py`**: Implementation of a greedy planner that plans a trajectory (represented as a series of waypoints) for an entity/movable given a value map.
59 | - **`controllers.py`**: Given a waypoint for an entity/movable, the controller applies (a series of) robot actions to achieve the waypoint.
60 | - **`dynamics_models.py`**: Environment dynamics model for the case where entity/movable is an object or object part. This is used in `controllers.py` to perform MPC.
61 | - **`prompts/rlbench`**: Prompts used by the different Language Model Programs (LMPs) in VoxPoser.
62 | 
63 | Environment and utilities:
64 | 
65 | - **`envs`**:
66 |   - **`rlbench_env.py`**: Wrapper of RLBench env to expose useful functions for VoxPoser.
67 |   - **`task_object_names.json`**: Mapping of object names exposed to VoxPoser and their corresponding scene object names for each individual task.
68 | - **`configs/rlbench_config.yaml`**: Config file for all the involved modules in RLBench environment.
69 | - **`arguments.py`**: Argument parser for the config file.
70 | - **`LLM_cache.py`**: Caching of language model outputs that writes to disk to save cost and time.
71 | - **`utils.py`**: Utility functions.
72 | - **`visualizers.py`**: A Plotly-based visualizer for value maps and planned trajectories.
73 | 
74 | ## Real-World Deployment
75 | To adapt the code to deploy on a real robot, most changes should only happen in the environment file (e.g., you can consider making a copy of `rlbench_env.py` and implementing the same APIs based on your perception and controller modules).
76 | 
77 | Our perception pipeline consists of the following modules: [OWL-ViT](https://huggingface.co/docs/transformers/en/model_doc/owlvit) for open-vocabulary detection in the first frame, [SAM](https://github.com/facebookresearch/segment-anything?tab=readme-ov-file#segment-anything) for converting the produced bounding boxes to masks in the first frame, and [XMEM](https://github.com/hkchengrex/XMem) for tracking the masks over time for the subsequent frames. Now you may consider simplifying the pipeline using only an open-vocabulary detector and [SAM 2](https://github.com/facebookresearch/segment-anything?tab=readme-ov-file#latest-updates----sam-2-segment-anything-in-images-and-videos) for segmentation and tracking. Our controller is based on the OSC implementation from [Deoxys](https://github.com/UT-Austin-RPL/deoxys_control). More details can be found in the [paper](https://voxposer.github.io/voxposer.pdf).
78 | 
79 | To avoid compounded latency introduced by different modules (especially the perception pipeline), you may also consider running a concurrent process that only performs tracking.
80 | 
81 | ## Acknowledgments
82 | - Environment is based on [RLBench](https://sites.google.com/view/rlbench).
83 | - Implementation of Language Model Programs (LMPs) is based on [Code as Policies](https://code-as-policies.github.io/).
84 | - Some code snippets are from [Where2Act](https://cs.stanford.edu/~kaichun/where2act/).
85 | 


--------------------------------------------------------------------------------
/src/prompts/rlbench/composer_prompt.txt:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from env_utils import execute, reset_to_default_pose
  3 | from perception_utils import parse_query_obj
  4 | from plan_utils import get_affordance_map, get_avoidance_map, get_velocity_map, get_rotation_map, get_gripper_map
  5 | 
  6 | # Query: move ee forward for 10cm.
  7 | movable = parse_query_obj('gripper')
  8 | affordance_map = get_affordance_map(f'a point 10cm in front of {movable.position}')
  9 | execute(movable, affordance_map)
 10 | 
 11 | # Query: go back to default.
 12 | reset_to_default_pose()
 13 | 
 14 | # Query: move the gripper behind the bowl, and slow down when near the bowl.
 15 | movable = parse_query_obj('gripper')
 16 | affordance_map = get_affordance_map('a point 15cm behind the bowl')
 17 | avoidance_map = get_avoidance_map('10cm near the bowl')
 18 | velocity_map = get_velocity_map('slow down when near the bowl')
 19 | execute(movable, affordance_map=affordance_map, avoidance_map=avoidance_map, velocity_map=velocity_map)
 20 | 
 21 | # Query: move to the back side of the table while staying at least 5cm from the blue block.
 22 | movable = parse_query_obj('gripper')
 23 | affordance_map = get_affordance_map('a point on the back side of the table')
 24 | avoidance_map = get_avoidance_map('5cm from the blue block')
 25 | execute(movable, affordance_map=affordance_map, avoidance_map=avoidance_map)
 26 | 
 27 | # Query: move to the top of the plate and face the plate.
 28 | movable = parse_query_obj('gripper')
 29 | affordance_map = get_affordance_map('a point 10cm above the plate')
 30 | rotation_map = get_rotation_map('face the plate')
 31 | execute(movable, affordance_map=affordance_map, rotation_map=rotation_map)
 32 | 
 33 | # Query: drop the toy inside container.
 34 | movable = parse_query_obj('gripper')
 35 | affordance_map = get_affordance_map('a point 15cm above the container')
 36 | gripper_map = get_gripper_map('close everywhere but open when on top of the container')
 37 | execute(movable, affordance_map=affordance_map, gripper_map=gripper_map)
 38 | 
 39 | # Query: push close the topmost drawer.
 40 | movable = parse_query_obj('topmost drawer handle')
 41 | affordance_map = get_affordance_map('a point 30cm into the topmost drawer handle')
 42 | execute(movable, affordance_map=affordance_map)
 43 | 
 44 | # Query: push the second to the left block along the red line.
 45 | movable = parse_query_obj('second to the left block')
 46 | affordance_map = get_affordance_map('the red line')
 47 | execute(movable, affordance_map=affordance_map)
 48 | 
 49 | # Query: grasp the blue block from the table at a quarter of the speed.
 50 | movable = parse_query_obj('gripper')
 51 | affordance_map = get_affordance_map('a point at the center of blue block')
 52 | velocity_map = get_velocity_map('quarter of the speed')
 53 | gripper_map = get_gripper_map('open everywhere except 1cm around the blue block')
 54 | execute(movable, affordance_map=affordance_map, velocity_map=velocity_map, gripper_map=gripper_map)
 55 | 
 56 | # Query: move to the left of the brown block.
 57 | movable = parse_query_obj('gripper')
 58 | affordance_map = get_affordance_map('a point 10cm to the left of the brown block')
 59 | execute(movable, affordance_map=affordance_map)
 60 | 
 61 | # Query: move to the top of the tray that contains the lemon.
 62 | movable = parse_query_obj('gripper')
 63 | affordance_map = get_affordance_map('a point 10cm above the tray that contains the lemon')
 64 | execute(movable, affordance_map=affordance_map)
 65 | 
 66 | # Query: close drawer by 5cm.
 67 | movable = parse_query_obj('drawer handle')
 68 | affordance_map = get_affordance_map('a point 5cm into the drawer handle')
 69 | execute(movable, affordance_map=affordance_map)
 70 | 
 71 | # Query: move to 5cm on top of the soda can, at 0.5x speed when within 20cm of the wooden mug, and keep at least 15cm away from the wooden mug.
 72 | movable = parse_query_obj('gripper')
 73 | affordance_map = get_affordance_map('a point 5cm above the soda can')
 74 | avoidance_map = get_avoidance_map('15cm from the wooden mug')
 75 | velocity_map = get_velocity_map('0.5x speed when within 20cm of the wooden mug')
 76 | execute(movable, affordance_map=affordance_map, avoidance_map=avoidance_map, velocity_map=velocity_map)
 77 | 
 78 | # Query: wipe the red dot but avoid the blue block.
 79 | movable = parse_query_obj('gripper')
 80 | affordance_map = get_affordance_map('the red dot')
 81 | avoidance_map = get_avoidance_map('10cm from the blue block')
 82 | execute(movable, affordance_map=affordance_map, avoidance_map=avoidance_map)
 83 | 
 84 | # Query: grasp the mug from the shelf.
 85 | movable = parse_query_obj('gripper')
 86 | affordance_map = get_affordance_map('a point at the center of the mug handle')
 87 | gripper_map = get_gripper_map('open everywhere except 1cm around the mug handle')
 88 | execute(movable, affordance_map=affordance_map, gripper_map=gripper_map)
 89 | 
 90 | # Query: move to 10cm on top of the soup bowl, and 5cm to the left of the soup bowl, while away from the glass, at 0.75x speed.
 91 | movable = parse_query_obj('gripper')
 92 | affordance_map = get_affordance_map('a point 10cm above and 5cm to the left of the soup bowl')
 93 | avoidance_map = get_avoidance_map('10cm from the glass')
 94 | velocity_map = get_velocity_map('0.75x speed')
 95 | execute(movable, affordance_map=affordance_map, avoidance_map=avoidance_map, velocity_map=velocity_map)
 96 | 
 97 | # Query: open gripper.
 98 | movable = parse_query_obj('gripper')
 99 | gripper_map = get_gripper_map('open everywhere')
100 | execute(movable, gripper_map=gripper_map)
101 | 
102 | # Query: turn counter-clockwise by 180 degrees.
103 | movable = parse_query_obj('gripper')
104 | rotation_map = get_rotation_map('turn counter-clockwise by 180 degrees')
105 | execute(movable, rotation_map=rotation_map)
106 | 
107 | # Query: sweep all particles to the left side of the table.
108 | particles = parse_query_obj('particles')
109 | for particle in particles:
110 |     movable = particle
111 |     affordance_map = get_affordance_map('a point on the left side of the table')
112 |     execute(particle, affordance_map=affordance_map)
113 | 
114 | # Query: grasp the bottom drawer handle while moving at 0.5x speed.
115 | movable = parse_query_obj('gripper')
116 | affordance_map = get_affordance_map('a point at the center of the bottom drawer handle')
117 | velocity_map = get_velocity_map('0.5x speed')
118 | rotation_map = get_rotation_map('face the bottom drawer handle')
119 | gripper_map = get_gripper_map('open everywhere except 1cm around the bottom drawer handle')
120 | execute(movable, affordance_map=affordance_map, velocity_map=velocity_map, rotation_map=rotation_map, gripper_map=gripper_map)


--------------------------------------------------------------------------------
/src/LMP.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import openai
  3 | from time import sleep
  4 | from openai.error import RateLimitError, APIConnectionError
  5 | from pygments import highlight
  6 | from pygments.lexers import PythonLexer
  7 | from pygments.formatters import TerminalFormatter
  8 | from utils import load_prompt, DynamicObservation, IterableDynamicObservation
  9 | import time
 10 | from LLM_cache import DiskCache
 11 | 
 12 | class LMP:
 13 |     """Language Model Program (LMP), adopted from Code as Policies."""
 14 |     def __init__(self, name, cfg, fixed_vars, variable_vars, debug=False, env='rlbench'):
 15 |         self._name = name
 16 |         self._cfg = cfg
 17 |         self._debug = debug
 18 |         self._base_prompt = load_prompt(f"{env}/{self._cfg['prompt_fname']}.txt")
 19 |         self._stop_tokens = list(self._cfg['stop'])
 20 |         self._fixed_vars = fixed_vars
 21 |         self._variable_vars = variable_vars
 22 |         self.exec_hist = ''
 23 |         self._context = None
 24 |         self._cache = DiskCache(load_cache=self._cfg['load_cache'])
 25 | 
 26 |     def clear_exec_hist(self):
 27 |         self.exec_hist = ''
 28 | 
 29 |     def build_prompt(self, query):
 30 |         if len(self._variable_vars) > 0:
 31 |             variable_vars_imports_str = f"from utils import {', '.join(self._variable_vars.keys())}"
 32 |         else:
 33 |             variable_vars_imports_str = ''
 34 |         prompt = self._base_prompt.replace('{variable_vars_imports}', variable_vars_imports_str)
 35 | 
 36 |         if self._cfg['maintain_session'] and self.exec_hist != '':
 37 |             prompt += f'\n{self.exec_hist}'
 38 |         
 39 |         prompt += '\n'  # separate prompted examples with the query part
 40 | 
 41 |         if self._cfg['include_context']:
 42 |             assert self._context is not None, 'context is None'
 43 |             prompt += f'\n{self._context}'
 44 | 
 45 |         user_query = f'{self._cfg["query_prefix"]}{query}{self._cfg["query_suffix"]}'
 46 |         prompt += f'\n{user_query}'
 47 | 
 48 |         return prompt, user_query
 49 |     
 50 |     def _cached_api_call(self, **kwargs):
 51 |         # check whether completion endpoint or chat endpoint is used
 52 |         if kwargs['model'] != 'gpt-3.5-turbo-instruct' and \
 53 |             any([chat_model in kwargs['model'] for chat_model in ['gpt-3.5', 'gpt-4']]):
 54 |             # add special prompt for chat endpoint
 55 |             user1 = kwargs.pop('prompt')
 56 |             new_query = '# Query:' + user1.split('# Query:')[-1]
 57 |             user1 = ''.join(user1.split('# Query:')[:-1]).strip()
 58 |             user1 = f"I would like you to help me write Python code to control a robot arm operating in a tabletop environment. Please complete the code every time when I give you new query. Pay attention to appeared patterns in the given context code. Be thorough and thoughtful in your code. Do not include any import statement. Do not repeat my question. Do not provide any text explanation (comment in code is okay). I will first give you the context of the code below:\n\n```\n{user1}\n```\n\nNote that x is back to front, y is left to right, and z is bottom to up."
 59 |             assistant1 = f'Got it. I will complete what you give me next.'
 60 |             user2 = new_query
 61 |             # handle given context (this was written originally for completion endpoint)
 62 |             if user1.split('\n')[-4].startswith('objects = ['):
 63 |                 obj_context = user1.split('\n')[-4]
 64 |                 # remove obj_context from user1
 65 |                 user1 = '\n'.join(user1.split('\n')[:-4]) + '\n' + '\n'.join(user1.split('\n')[-3:])
 66 |                 # add obj_context to user2
 67 |                 user2 = obj_context.strip() + '\n' + user2
 68 |             messages=[
 69 |                 {"role": "system", "content": "You are a helpful assistant that pays attention to the user's instructions and writes good python code for operating a robot arm in a tabletop environment."},
 70 |                 {"role": "user", "content": user1},
 71 |                 {"role": "assistant", "content": assistant1},
 72 |                 {"role": "user", "content": user2},
 73 |             ]
 74 |             kwargs['messages'] = messages
 75 |             if kwargs in self._cache:
 76 |                 print('(using cache)', end=' ')
 77 |                 return self._cache[kwargs]
 78 |             else:
 79 |                 ret = openai.ChatCompletion.create(**kwargs)['choices'][0]['message']['content']
 80 |                 # post processing
 81 |                 ret = ret.replace('```', '').replace('python', '').strip()
 82 |                 self._cache[kwargs] = ret
 83 |                 return ret
 84 |         else:
 85 |             if kwargs in self._cache:
 86 |                 print('(using cache)', end=' ')
 87 |                 return self._cache[kwargs]
 88 |             else:
 89 |                 ret = openai.Completion.create(**kwargs)['choices'][0]['text'].strip()
 90 |                 self._cache[kwargs] = ret
 91 |                 return ret
 92 | 
 93 |     def __call__(self, query, **kwargs):
 94 |         prompt, user_query = self.build_prompt(query)
 95 | 
 96 |         start_time = time.time()
 97 |         while True:
 98 |             try:
 99 |                 code_str = self._cached_api_call(
100 |                     prompt=prompt,
101 |                     stop=self._stop_tokens,
102 |                     temperature=self._cfg['temperature'],
103 |                     model=self._cfg['model'],
104 |                     max_tokens=self._cfg['max_tokens']
105 |                 )
106 |                 break
107 |             except (RateLimitError, APIConnectionError) as e:
108 |                 print(f'OpenAI API got err {e}')
109 |                 print('Retrying after 3s.')
110 |                 sleep(3)
111 |         print(f'*** OpenAI API call took {time.time() - start_time:.2f}s ***')
112 | 
113 |         if self._cfg['include_context']:
114 |             assert self._context is not None, 'context is None'
115 |             to_exec = f'{self._context}\n{code_str}'
116 |             to_log = f'{self._context}\n{user_query}\n{code_str}'
117 |         else:
118 |             to_exec = code_str
119 |             to_log = f'{user_query}\n{to_exec}'
120 | 
121 |         to_log_pretty = highlight(to_log, PythonLexer(), TerminalFormatter())
122 | 
123 |         if self._cfg['include_context']:
124 |             print('#'*40 + f'\n## "{self._name}" generated code\n' + f'## context: "{self._context}"\n' + '#'*40 + f'\n{to_log_pretty}\n')
125 |         else:
126 |             print('#'*40 + f'\n## "{self._name}" generated code\n' + '#'*40 + f'\n{to_log_pretty}\n')
127 | 
128 |         gvars = merge_dicts([self._fixed_vars, self._variable_vars])
129 |         lvars = kwargs
130 | 
131 |         # return function instead of executing it so we can replan using latest obs（do not do this for high-level UIs)
132 |         if not self._name in ['composer', 'planner']:
133 |             to_exec = 'def ret_val():\n' + to_exec.replace('ret_val = ', 'return ')
134 |             to_exec = to_exec.replace('\n', '\n    ')
135 | 
136 |         if self._debug:
137 |             # only "execute" function performs actions in environment, so we comment it out
138 |             action_str = ['execute(']
139 |             try:
140 |                 for s in action_str:
141 |                     exec_safe(to_exec.replace(s, f'# {s}'), gvars, lvars)
142 |             except Exception as e:
143 |                 print(f'Error: {e}')
144 |                 import pdb ; pdb.set_trace()
145 |         else:
146 |             exec_safe(to_exec, gvars, lvars)
147 | 
148 |         self.exec_hist += f'\n{to_log.strip()}'
149 | 
150 |         if self._cfg['maintain_session']:
151 |             self._variable_vars.update(lvars)
152 | 
153 |         if self._cfg['has_return']:
154 |             if self._name == 'parse_query_obj':
155 |                 try:
156 |                     # there may be multiple objects returned, but we also want them to be unevaluated functions so that we can access latest obs
157 |                     return IterableDynamicObservation(lvars[self._cfg['return_val_name']])
158 |                 except AssertionError:
159 |                     return DynamicObservation(lvars[self._cfg['return_val_name']])
160 |             return lvars[self._cfg['return_val_name']]
161 | 
162 | 
163 | def merge_dicts(dicts):
164 |     return {
165 |         k : v 
166 |         for d in dicts
167 |         for k, v in d.items()
168 |     }
169 |     
170 | 
171 | def exec_safe(code_str, gvars=None, lvars=None):
172 |     banned_phrases = ['import', '__']
173 |     for phrase in banned_phrases:
174 |         assert phrase not in code_str
175 |   
176 |     if gvars is None:
177 |         gvars = {}
178 |     if lvars is None:
179 |         lvars = {}
180 |     empty_fn = lambda *args, **kwargs: None
181 |     custom_gvars = merge_dicts([
182 |         gvars,
183 |         {'exec': empty_fn, 'eval': empty_fn}
184 |     ])
185 |     try:
186 |         exec(code_str, custom_gvars, lvars)
187 |     except Exception as e:
188 |         print(f'Error executing code:\n{code_str}')
189 |         raise e


--------------------------------------------------------------------------------
/src/visualizers.py:
--------------------------------------------------------------------------------
  1 | """Plotly-Based Visualizer"""
  2 | import plotly.graph_objects as go
  3 | import numpy as np
  4 | import os
  5 | import datetime
  6 | 
  7 | 
  8 | class ValueMapVisualizer:
  9 |     """
 10 |     A Plotly-based visualizer for 3D value map and planned path.
 11 |     """
 12 |     def __init__(self, config):
 13 |         self.scene_points = None
 14 |         self.save_dir = config['save_dir']
 15 |         if self.save_dir is not None:
 16 |             os.makedirs(self.save_dir, exist_ok=True)
 17 |         self.quality = config['quality']
 18 |         self.update_quality(self.quality)
 19 |         self.map_size = config['map_size']
 20 |     
 21 |     def update_bounds(self, lower, upper):
 22 |         self.workspace_bounds_min = lower
 23 |         self.workspace_bounds_max = upper
 24 |         self.plot_bounds_min = lower - 0.15 * (upper - lower)
 25 |         self.plot_bounds_max = upper + 0.15 * (upper - lower)
 26 |         xyz_ratio = 1 / (self.workspace_bounds_max - self.workspace_bounds_min)
 27 |         scene_scale = np.max(xyz_ratio) / xyz_ratio
 28 |         self.scene_scale = scene_scale
 29 | 
 30 |     def update_quality(self, quality):
 31 |         self.quality = quality
 32 |         if self.quality == 'low':
 33 |             self.downsample_ratio = 4
 34 |             self.max_scene_points = 150000
 35 |             self.costmap_opacity = 0.2 * 0.6
 36 |             self.costmap_surface_count = 10
 37 |         elif self.quality == 'low-full-scene':
 38 |             self.downsample_ratio = 4
 39 |             self.max_scene_points = 1000000
 40 |             self.costmap_opacity = 0.2 * 0.6
 41 |             self.costmap_surface_count = 10
 42 |         elif self.quality == 'low-half-scene':
 43 |             self.downsample_ratio = 4
 44 |             self.max_scene_points = 250000
 45 |             self.costmap_opacity = 0.2 * 0.6
 46 |             self.costmap_surface_count = 10
 47 |         elif self.quality == 'medium':
 48 |             self.downsample_ratio = 2
 49 |             self.max_scene_points = 300000
 50 |             self.costmap_opacity = 0.1 * 0.6
 51 |             self.costmap_surface_count = 30
 52 |         elif self.quality == 'medium-full-scene':
 53 |             self.downsample_ratio = 2
 54 |             self.max_scene_points = 1000000
 55 |             self.costmap_opacity = 0.1 * 0.6
 56 |             self.costmap_surface_count = 30
 57 |         elif self.quality == 'medium-half-scene':
 58 |             self.downsample_ratio = 2
 59 |             self.max_scene_points = 500000
 60 |             self.costmap_opacity = 0.1 * 0.6
 61 |             self.costmap_surface_count = 30
 62 |         elif self.quality == 'high':
 63 |             self.downsample_ratio = 1
 64 |             self.max_scene_points = 500000
 65 |             self.costmap_opacity = 0.07 * 0.6
 66 |             self.costmap_surface_count = 50
 67 |         elif self.quality == 'best':
 68 |             self.downsample_ratio = 1
 69 |             self.max_scene_points = 500000
 70 |             self.costmap_opacity = 0.05 * 0.6
 71 |             self.costmap_surface_count = 100
 72 |         else:
 73 |             raise ValueError(f'Unknown quality: {self.quality}; should be one of [low, medium, high]')
 74 | 
 75 |     def update_scene_points(self, points, colors=None):
 76 |         points = points.astype(np.float16)
 77 |         assert colors.dtype == np.uint8
 78 |         self.scene_points = (points, colors)
 79 | 
 80 |     def visualize(self, info, show=False, save=True):
 81 |         """visualize the path and relevant info using plotly"""
 82 |         planner_info = info['planner_info']
 83 |         waypoints_world = np.array([p[0] for p in info['traj_world']])
 84 |         start_pos_world = info['start_pos_world']
 85 |         assert len(start_pos_world.shape) == 1
 86 |         waypoints_world = np.concatenate([start_pos_world[None, ...], waypoints_world], axis=0)
 87 |         
 88 |         fig_data = []
 89 |         # plot path
 90 |         # add marker to path waypoints
 91 |         fig_data.append(go.Scatter3d(x=waypoints_world[:, 0], y=waypoints_world[:, 1], z=waypoints_world[:, 2], mode='markers', name='waypoints', marker=dict(size=4, color='red')))
 92 |         # add lines between waypoints
 93 |         for i in range(waypoints_world.shape[0] - 1):
 94 |             fig_data.append(go.Scatter3d(x=waypoints_world[i:i+2, 0], y=waypoints_world[i:i+2, 1], z=waypoints_world[i:i+2, 2], mode='lines', name='path', line=dict(width=10, color='orange')))
 95 |         if planner_info is not None:
 96 |             # plot costmap
 97 |             if 'costmap' in planner_info:
 98 |                 costmap = planner_info['costmap'][::self.downsample_ratio, ::self.downsample_ratio, ::self.downsample_ratio]
 99 |                 skip_ratio = (self.workspace_bounds_max - self.workspace_bounds_min) / (self.map_size / self.downsample_ratio)
100 |                 x, y, z = np.mgrid[self.workspace_bounds_min[0]:self.workspace_bounds_max[0]:skip_ratio[0],
101 |                                 self.workspace_bounds_min[1]:self.workspace_bounds_max[1]:skip_ratio[1],
102 |                                 self.workspace_bounds_min[2]:self.workspace_bounds_max[2]:skip_ratio[2]]
103 |                 grid_shape = costmap.shape
104 |                 x = x[:grid_shape[0], :grid_shape[1], :grid_shape[2]]
105 |                 y = y[:grid_shape[0], :grid_shape[1], :grid_shape[2]]
106 |                 z = z[:grid_shape[0], :grid_shape[1], :grid_shape[2]]
107 |                 fig_data.append(go.Volume(x=x.flatten(), y=y.flatten(), z=z.flatten(), value=costmap.flatten(), isomin=0, isomax=1, opacity=self.costmap_opacity, surface_count=self.costmap_surface_count, colorscale='Jet', showlegend=True, showscale=False))
108 |             # plot start position
109 |             if 'start_pos' in planner_info:
110 |                 fig_data.append(go.Scatter3d(x=[start_pos_world[0]], y=[start_pos_world[1]], z=[start_pos_world[2]], mode='markers', name='start', marker=dict(size=6, color='blue')))
111 |             # plot target as dots extracted from target_map
112 |             if 'raw_target_map' in planner_info:
113 |                 targets_world = info['targets_world']
114 |                 fig_data.append(go.Scatter3d(x=targets_world[:, 0], y=targets_world[:, 1], z=targets_world[:, 2], mode='markers', name='target', marker=dict(size=6, color='green', opacity=0.7)))
115 | 
116 |         # visualize scene points
117 |         if self.scene_points is None:
118 |             print('no scene points to overlay, skipping...')
119 |             scene_points = None
120 |         else:
121 |             scene_points, scene_point_colors = self.scene_points
122 |             # resample to reduce the number of points
123 |             if scene_points.shape[0] > self.max_scene_points:
124 |                 resample_idx = np.random.choice(scene_points.shape[0], min(scene_points.shape[0], self.max_scene_points), replace=False)
125 |                 scene_points = scene_points[resample_idx]
126 |                 if scene_point_colors is not None:
127 |                     scene_point_colors = scene_point_colors[resample_idx]
128 |             if scene_point_colors is None:
129 |                 scene_point_colors = scene_points[:, 2]
130 |             else:
131 |                 scene_point_colors = scene_point_colors / 255.0
132 |             # add scene points
133 |             fig_data.append(go.Scatter3d(x=scene_points[:, 0], y=scene_points[:, 1], z=scene_points[:, 2],
134 |                                         mode='markers', marker=dict(size=3, color=scene_point_colors, opacity=1.0)))
135 |         
136 |         fig = go.Figure(data=fig_data)
137 |  
138 |         # set bounds and ratio
139 |         fig.update_layout(scene=dict(xaxis=dict(range=[self.plot_bounds_min[0], self.plot_bounds_max[0]], autorange=False),
140 |                                     yaxis=dict(range=[self.plot_bounds_min[1], self.plot_bounds_max[1]], autorange=False),
141 |                                     zaxis=dict(range=[self.plot_bounds_min[2], self.plot_bounds_max[2]], autorange=False)),
142 |                         scene_aspectmode='manual',
143 |                         scene_aspectratio=dict(x=self.scene_scale[0], y=self.scene_scale[1], z=self.scene_scale[2]))
144 | 
145 |         # do not show grid and axes
146 |         fig.update_layout(scene=dict(xaxis=dict(showgrid=False, showticklabels=False, title='', visible=False),
147 |                                     yaxis=dict(showgrid=False, showticklabels=False, title='', visible=False),
148 |                                     zaxis=dict(showgrid=False, showticklabels=False, title='', visible=False)))
149 | 
150 |         # set background color as white
151 |         fig.update_layout(template='none')
152 | 
153 |         # save and show
154 |         if save and self.save_dir is not None:
155 |             curr_time = datetime.datetime.now()
156 |             log_id = f'{curr_time.hour}:{curr_time.minute}:{curr_time.second}'
157 |             save_path = os.path.join(self.save_dir, log_id + '.html')
158 |             latest_save_path = os.path.join(self.save_dir, 'latest.html')
159 |             print('** saving visualization to', save_path, '...')
160 |             fig.write_html(save_path)
161 |             print('** saving visualization to', latest_save_path, '...')
162 |             fig.write_html(latest_save_path)
163 |             print(f'** save to {save_path}')
164 |         if show:
165 |             fig.show()
166 | 
167 |         return fig


--------------------------------------------------------------------------------
/src/planners.py:
--------------------------------------------------------------------------------
  1 | """Greedy path planner."""
  2 | import numpy as np
  3 | from scipy.ndimage import gaussian_filter
  4 | from scipy.ndimage import distance_transform_edt
  5 | from scipy.signal import savgol_filter
  6 | from utils import get_clock_time, normalize_map, calc_curvature
  7 | 
  8 | 
  9 | class PathPlanner:
 10 |     """
 11 |     A greedy path planner that greedily chooses the next voxel with the lowest cost.
 12 |     Then apply several postprocessing steps to the path.
 13 |     (TODO: can be improved using more principled methods, including extension to whole-arm planning)
 14 |     """
 15 |     def __init__(self, planner_config, map_size):
 16 |         self.config = planner_config
 17 |         self.map_size = map_size
 18 | 
 19 |     def optimize(self, start_pos: np.ndarray, target_map: np.ndarray, obstacle_map: np.ndarray, object_centric=False):
 20 |         """
 21 |         config:
 22 |             start_pos: (3,) np.ndarray, start position
 23 |             target_map: (map_size, map_size, map_size) np.ndarray, target_map
 24 |             obstacle_map: (map_size, map_size, map_size) np.ndarray, obstacle_map
 25 |             object_centric: bool, whether the task is object centric (entity of interest is an object/part instead of robot)
 26 |         Returns:
 27 |             path: (n, 3) np.ndarray, path
 28 |             info: dict, info
 29 |         """
 30 |         print(f'[planners.py | {get_clock_time(milliseconds=True)}] start')
 31 |         info = dict()
 32 |         # make copies
 33 |         start_pos, raw_start_pos = start_pos.copy(), start_pos
 34 |         target_map, raw_target_map = target_map.copy(), target_map
 35 |         obstacle_map, raw_obstacle_map = obstacle_map.copy(), obstacle_map
 36 |         # smoothing
 37 |         target_map = distance_transform_edt(1 - target_map)
 38 |         target_map = normalize_map(target_map)
 39 |         obstacle_map = gaussian_filter(obstacle_map, sigma=self.config.obstacle_map_gaussian_sigma)
 40 |         obstacle_map = normalize_map(obstacle_map)
 41 |         # combine target_map and obstacle_map
 42 |         costmap = target_map * self.config.target_map_weight + obstacle_map * self.config.obstacle_map_weight
 43 |         costmap = normalize_map(costmap)
 44 |         _costmap = costmap.copy()
 45 |         # get stop criteria
 46 |         stop_criteria = self._get_stop_criteria()
 47 |         # initialize path
 48 |         path, current_pos = [start_pos], start_pos
 49 |         # optimize
 50 |         print(f'[planners.py | {get_clock_time(milliseconds=True)}] start optimizing, start_pos: {start_pos}')
 51 |         for i in range(self.config.max_steps):
 52 |             # calculate all nearby voxels around current position
 53 |             all_nearby_voxels = self._calculate_nearby_voxel(current_pos, object_centric=object_centric)
 54 |             # calculate the score of all nearby voxels
 55 |             nearby_score = _costmap[all_nearby_voxels[:, 0], all_nearby_voxels[:, 1], all_nearby_voxels[:, 2]]
 56 |             # Find the minimum cost voxel
 57 |             steepest_idx = np.argmin(nearby_score)
 58 |             next_pos = all_nearby_voxels[steepest_idx]
 59 |             # increase cost at current position to avoid going back
 60 |             _costmap[current_pos[0].round().astype(int),
 61 |                      current_pos[1].round().astype(int),
 62 |                      current_pos[2].round().astype(int)] += 1
 63 |             # update path and current position
 64 |             path.append(next_pos)
 65 |             current_pos = next_pos
 66 |             # check stop criteria
 67 |             if stop_criteria(current_pos, _costmap, self.config.stop_threshold):
 68 |                 break
 69 |         raw_path = np.array(path)
 70 |         print(f'[planners.py | {get_clock_time(milliseconds=True)}] optimization finished; path length: {len(raw_path)}')
 71 |         # postprocess path
 72 |         processed_path = self._postprocess_path(raw_path, raw_target_map, object_centric=object_centric)
 73 |         print(f'[planners.py | {get_clock_time(milliseconds=True)}] after postprocessing, path length: {len(processed_path)}')
 74 |         print(f'[planners.py | {get_clock_time(milliseconds=True)}] last waypoint: {processed_path[-1]}')
 75 |         # save info
 76 |         info['start_pos'] = start_pos
 77 |         info['target_map'] = target_map
 78 |         info['obstacle_map'] = obstacle_map
 79 |         info['costmap'] = costmap
 80 |         info['costmap_altered'] = _costmap
 81 |         info['raw_start_pos'] = raw_start_pos
 82 |         info['raw_target_map'] = raw_target_map
 83 |         info['raw_obstacle_map'] = raw_obstacle_map
 84 |         info['planner_raw_path'] = raw_path.copy()
 85 |         info['planner_postprocessed_path'] = processed_path.copy()
 86 |         info['targets_voxel'] = np.argwhere(raw_target_map == 1)
 87 |         return processed_path, info
 88 |     
 89 |     def _get_stop_criteria(self):
 90 |         def no_nearby_equal_criteria(current_pos, costmap, stop_threshold):
 91 |             """
 92 |             Do not stop if there is a nearby voxel with cost less than current cost + stop_threshold.
 93 |             """
 94 |             assert np.isnan(costmap).sum() == 0, 'costmap contains nan'
 95 |             current_pos_discrete = current_pos.round().clip(0, self.map_size - 1).astype(int)
 96 |             current_cost = costmap[current_pos_discrete[0], current_pos_discrete[1], current_pos_discrete[2]]
 97 |             nearby_locs = self._calculate_nearby_voxel(current_pos, object_centric=False)
 98 |             nearby_equal = np.any(costmap[nearby_locs[:, 0], nearby_locs[:, 1], nearby_locs[:, 2]] < current_cost + stop_threshold)
 99 |             if nearby_equal:
100 |                 return False
101 |             return True
102 |         return no_nearby_equal_criteria
103 | 
104 |     def _calculate_nearby_voxel(self, current_pos, object_centric=False):
105 |         # create a grid of nearby voxels
106 |         half_size = int(2 * self.map_size / 100)
107 |         offsets = np.arange(-half_size, half_size + 1)
108 |         # our heuristics-based dynamics model only supports planar pushing -> only xy path is considered
109 |         if object_centric:
110 |             offsets_grid = np.array(np.meshgrid(offsets, offsets, [0])).T.reshape(-1, 3)
111 |             # Remove the [0, 0, 0] offset, which corresponds to the current position
112 |             offsets_grid = offsets_grid[np.any(offsets_grid != [0, 0, 0], axis=1)]
113 |         else:
114 |             offsets_grid = np.array(np.meshgrid(offsets, offsets, offsets)).T.reshape(-1, 3)
115 |             # Remove the [0, 0, 0] offset, which corresponds to the current position
116 |             offsets_grid = offsets_grid[np.any(offsets_grid != [0, 0, 0], axis=1)]
117 |         # Calculate all nearby voxel coordinates
118 |         all_nearby_voxels = np.clip(current_pos + offsets_grid, 0, self.map_size - 1)
119 |         # Remove duplicates, if any, caused by clipping
120 |         all_nearby_voxels = np.unique(all_nearby_voxels, axis=0)
121 |         return all_nearby_voxels
122 |     
123 |     def _postprocess_path(self, path, raw_target_map, object_centric=False):
124 |         """
125 |         Apply various postprocessing steps to the path.
126 |         """
127 |         # smooth the path
128 |         savgol_window_size = min(len(path), self.config.savgol_window_size)
129 |         savgol_polyorder = min(self.config.savgol_polyorder, savgol_window_size - 1)
130 |         path = savgol_filter(path, savgol_window_size, savgol_polyorder, axis=0)
131 |         # early cutoff if curvature is too high
132 |         curvature = calc_curvature(path)
133 |         if len(curvature) > 5:
134 |             high_curvature_idx = np.where(curvature[5:] > self.config.max_curvature)[0]
135 |             if len(high_curvature_idx) > 0:
136 |                 high_curvature_idx += 5
137 |                 path = path[:int(0.9 * high_curvature_idx[0])]  
138 |         # skip waypoints such that they reach target spacing
139 |         path_trimmed = path[1:-1]
140 |         skip_ratio = None
141 |         if len(path_trimmed) > 1:
142 |             target_spacing = int(self.config['target_spacing'] * self.map_size / 100)
143 |             length = np.linalg.norm(path_trimmed[1:] - path_trimmed[:-1], axis=1).sum()
144 |             if length > target_spacing:
145 |                 curr_spacing = np.linalg.norm(path_trimmed[1:] - path_trimmed[:-1], axis=1).mean()
146 |                 skip_ratio = np.round(target_spacing / curr_spacing).astype(int)
147 |                 if skip_ratio > 1:
148 |                     path_trimmed = path_trimmed[::skip_ratio]
149 |         path = np.concatenate([path[0:1], path_trimmed, path[-1:]])
150 |         # force last position to be one of the target positions
151 |         last_waypoint = path[-1].round().clip(0, self.map_size - 1).astype(int)
152 |         if raw_target_map[last_waypoint[0], last_waypoint[1], last_waypoint[2]] == 0:
153 |             # find the closest target position
154 |             target_pos = np.argwhere(raw_target_map == 1)
155 |             closest_target_idx = np.argmin(np.linalg.norm(target_pos - last_waypoint, axis=1))
156 |             closest_target = target_pos[closest_target_idx]
157 |             # for object centric motion, we assume we can only push in the xy plane
158 |             if object_centric:
159 |                 closest_target[2] = last_waypoint[2]
160 |             path = np.append(path, [closest_target], axis=0)
161 |         # space out path more if task is object centric (so that we can push faster)
162 |         if object_centric:
163 |             k = self.config['pushing_skip_per_k']
164 |             path = np.concatenate([path[k:-1:k], path[-1:]])
165 |         path = path.clip(0, self.map_size-1)
166 |         return path
167 | 


--------------------------------------------------------------------------------
/src/controllers.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | from transforms3d.quaternions import mat2quat
  3 | from utils import normalize_vector
  4 | import copy
  5 | import time
  6 | from dynamics_models import PushingDynamicsModel
  7 | 
  8 | # creating some aliases for end effector and table in case LLMs refer to them differently
  9 | EE_ALIAS = ['ee', 'endeffector', 'end_effector', 'end effector', 'gripper', 'hand']
 10 | 
 11 | class Controller:
 12 |     def __init__(self, env, config):
 13 |         self.config = config
 14 |         self.env = env
 15 |         self.dynamics_model = PushingDynamicsModel()
 16 |     
 17 |     def _calculate_ee_rot(self, pushing_dir):
 18 |         """
 19 |         Given a pushing direction, calculate the rotation matrix for the end effector
 20 |         It is offsetted such that it doesn't exactly point towards the direction but slanted towards table, so it's safer
 21 |         """
 22 |         pushing_dir = normalize_vector(pushing_dir)
 23 |         desired_dir = pushing_dir + np.array([0, 0, -np.linalg.norm(pushing_dir)])
 24 |         desired_dir = normalize_vector(desired_dir)
 25 |         left = np.cross(pushing_dir, desired_dir)
 26 |         left = normalize_vector(left)
 27 |         up = np.array(desired_dir, dtype=np.float32)
 28 |         up = normalize_vector(up)
 29 |         forward = np.cross(left, up)
 30 |         forward = normalize_vector(forward)
 31 |         rotmat = np.eye(3).astype(np.float32)
 32 |         rotmat[:3, 0] = forward
 33 |         rotmat[:3, 1] = left
 34 |         rotmat[:3, 2] = up
 35 |         quat_wxyz = mat2quat(rotmat)
 36 |         return quat_wxyz
 37 |     
 38 |     def _apply_mpc_control(self, control, target_velocity=1):
 39 |         """
 40 |         apply control to the object; depending on different control type
 41 |         """
 42 |         # calculate start and final ee pose
 43 |         contact_position = control[:3]  # [3]
 44 |         pushing_dir = control[3:6]  # [3]
 45 |         pushing_dist = control[6]  # [1]
 46 |         # calculate a safe end effector rotation
 47 |         ee_quat = self._calculate_ee_rot(pushing_dir)
 48 |         # calculate translation
 49 |         start_dist = 0.08
 50 |         t_start = contact_position - pushing_dir * start_dist
 51 |         t_interact = contact_position + pushing_dir * pushing_dist
 52 |         t_rest = contact_position - pushing_dir * start_dist * 0.8
 53 | 
 54 |         # apply control
 55 |         self.env.close_gripper()
 56 |         # move to start pose
 57 |         self.env.move_to_pose(np.concatenate([t_start, ee_quat]), speed=target_velocity)
 58 |         print('[controllers.py] moved to start pose', end='; ')
 59 |         # move to interact pose
 60 |         self.env.move_to_pose(np.concatenate([t_interact, ee_quat]), speed=target_velocity * 0.2)
 61 |         print('[controllers.py] moved to final pose', end='; ')
 62 |         # back to rest pose
 63 |         self.env.move_to_pose(np.concatenate([t_rest, ee_quat]), speed=target_velocity * 0.33)
 64 |         print('[controllers.py] back to release pose', end='; ')
 65 |         self.env.reset_to_default_pose()
 66 |         print('[controllers.py] back togenerate_random_control default pose', end='')
 67 |         print()
 68 | 
 69 |     def execute(self, movable_obs, waypoint):
 70 |         """
 71 |         execute a waypoint
 72 |         If movable is "end effector", then do not consider object interaction (no dynamics considered)
 73 |         If movable is "object", then consider object interaction (use heuristics-based dynamics model)
 74 | 
 75 |         :param movable_obs: observation dict of the object to be moved
 76 |         :param waypoint: list, [target_xyz, target_rotation, target_velocity, target_gripper], target_xyz is for movable in world frame
 77 |         :return: None
 78 |         """
 79 |         info = dict()
 80 |         target_xyz, target_rotation, target_velocity, target_gripper = waypoint
 81 |         object_centric = (not movable_obs['name'].lower() in EE_ALIAS)
 82 |         # move to target pose directly
 83 |         if not object_centric:
 84 |             target_pose = np.concatenate([target_xyz, target_rotation])
 85 |             result = self.env.apply_action(np.concatenate([target_pose, [target_gripper]]))
 86 |             info['mp_info'] = result
 87 |         # optimize through dynamics model to obtain robot actions
 88 |         else:
 89 |             start = time.time()
 90 |             # sample control sequence using MPC
 91 |             movable_obs = {key: value for key, value in movable_obs.items() if key in ['_point_cloud_world']}
 92 |             best_control, self.mpc_info = self.random_shooting_MPC(movable_obs, target_xyz)  # [7]
 93 |             print('[controllers.py] mpc search completed in {} seconds with {} samples'.format(time.time() - start, self.config.num_samples))
 94 |             # apply first control in the sequence
 95 |             self.mpc_velocity = target_velocity
 96 |             self._apply_mpc_control(best_control[0])
 97 |             print(f'[controllers.py] applied control (pos: {best_control[0][:3].round(4)}, dir: {best_control[0][3:6].round(4)}, dist: {best_control[0][6:].round(4)})')
 98 |             info['mpc_info'] = self.mpc_info
 99 |             info['mpc_control'] = best_control[0]
100 |         return info
101 | 
102 |     def random_shooting_MPC(self, start_obs, target):
103 |         # Initialize empty list to store the control sequence and corresponding cost
104 |         obs_sequences = []
105 |         controls_sequences = []
106 |         costs = []
107 |         info = dict()
108 |         # repeat the observation for the number of samples (non-batched -> batched)
109 |         batched_start_obs = {}
110 |         for key, value in start_obs.items():
111 |             batched_start_obs[key] = np.repeat(value[None, ...], self.config.num_samples, axis=0)
112 |         obs_sequences.append(batched_start_obs)
113 |         # Generate random control sequences
114 |         for t in range(self.config.horizon_length):
115 |             curr_obs = copy.deepcopy(obs_sequences[-1])
116 |             controls = self.generate_random_control(curr_obs, target)
117 |             # Simulate the system using the model and the generated control sequence
118 |             pred_next_obs = self.forward_step(curr_obs, controls)
119 |             # record the control sequence and the resulting observation
120 |             obs_sequences.append(pred_next_obs)  # obs_sequences: [T, N, obs_dim]
121 |             controls_sequences.append(controls)  # controls_sequences: [T, N, control_dim]
122 |         # Calculate the cost of the generated control sequence
123 |         costs = self.calculate_cost(obs_sequences, controls_sequences, target)  # [N]
124 |         # Find the control sequence with the lowest cost
125 |         best_traj_idx = np.argmin(costs)
126 |         best_controls_sequence = np.array([control_per_step[best_traj_idx] for control_per_step in controls_sequences])  # [T, control_dim]
127 |         # log info
128 |         info['best_controls_sequence'] = best_controls_sequence
129 |         info['best_cost'] = costs[best_traj_idx]
130 |         info['costs'] = costs
131 |         info['controls_sequences'] = controls_sequences
132 |         info['obs_sequences'] = obs_sequences
133 |         return best_controls_sequence, info
134 | 
135 |     def forward_step(self, obs, controls):
136 |         """
137 |         obs: dict including point cloud [B, N, obs_dim]
138 |         controls: batched control sequences [B, control_dim]
139 |         returns: resulting point cloud [B, N, obs_dim]
140 |         """
141 |         # forward dynamics
142 |         pcs = obs['_point_cloud_world']  # [B, N, 3]
143 |         contact_position = controls[:, :3]  # [B, 3]
144 |         pushing_dir = controls[:, 3:6]  # [B, 3]
145 |         pushing_dist = controls[:, 6:]  # [B, 1]
146 |         inputs = (pcs, contact_position, pushing_dir, pushing_dist)
147 |         next_pcs = self.dynamics_model.forward(inputs)  # [B, N, 3]
148 |         # assemble next_obs
149 |         next_obs = copy.deepcopy(obs)
150 |         next_obs['_point_cloud_world'] = next_pcs
151 |         return next_obs
152 | 
153 |     def generate_random_control(self, obs, target):
154 |         """
155 |         the function samples the following:
156 |         1) contact_position [B, 3]: uniform sample randomly from object point cloud
157 |         2) pushing_dir [B, 3]: fixed to be the direction from contact_position to target
158 |         3) pushing_dist [B, 1]: uniform sampling from some range
159 | 
160 |         returns: batched control sequences [B, 7] (3 for contact position, 3 for gripper direction, 1 for gripper moving distance)
161 |         """
162 |         pcs = obs['_point_cloud_world']  # [B, N, 3]
163 |         num_samples, num_points, _ = pcs.shape
164 |         # sample contact position randomly on point cloud
165 |         points_idx = np.random.randint(0, num_points, num_samples)
166 |         contact_positions = pcs[np.arange(num_samples), points_idx]  # [B, 3]
167 |         # sample pushing_dir
168 |         pushing_dirs = target - contact_positions  # [B, 3]
169 |         pushing_dirs = normalize_vector(pushing_dirs)
170 |         # sample pushing_dist
171 |         pushing_dist = np.random.uniform(-0.02, 0.09, size=(num_samples, 1))  # [B, 1]
172 |         # assemble control sequences
173 |         controls = np.concatenate([contact_positions, pushing_dirs, pushing_dist], axis=1)  # [B, 7]
174 |         return controls
175 | 
176 |     def calculate_cost(self, obs_sequences, controls_sequences, target_xyz):
177 |         """
178 |         Calculate the cost of the generated control sequence
179 | 
180 |         inputs:
181 |         obs_sequences: batched observation sequences [T, B, N, 3]
182 |         controls_sequences: batched control sequences [T, B, 7]
183 | 
184 |         returns: cost [B, 1]
185 |         """
186 |         num_samples, _, _ = obs_sequences[0]['_point_cloud_world'].shape
187 |         last_obs = obs_sequences[-1]
188 |         costs = []
189 |         for i in range(num_samples):
190 |             last_pc = last_obs['_point_cloud_world'][i]  # [N, 3]
191 |             last_position = np.mean(last_pc, axis=0)  # [3]
192 |             cost = np.linalg.norm(last_position - target_xyz)
193 |             costs.append(cost)
194 |         costs = np.array(costs)  # [B]
195 |         return costs


--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import numpy as np
  3 | import numpy as np
  4 | import plotly.graph_objects as go
  5 | import datetime
  6 | from transforms3d.quaternions import mat2quat
  7 | 
  8 | def set_lmp_objects(lmps, objects):
  9 |     if isinstance(lmps, dict):
 10 |         lmps = lmps.values()
 11 |     for lmp in lmps:
 12 |         lmp._context = f'objects = {objects}'
 13 | 
 14 | def get_clock_time(milliseconds=False):
 15 |     curr_time = datetime.datetime.now()
 16 |     if milliseconds:
 17 |         return f'{curr_time.hour}:{curr_time.minute}:{curr_time.second}.{curr_time.microsecond // 1000}'
 18 |     else:
 19 |         return f'{curr_time.hour}:{curr_time.minute}:{curr_time.second}'
 20 | 
 21 | class bcolors:
 22 |     HEADER = '\033[95m'
 23 |     OKBLUE = '\033[94m'
 24 |     OKCYAN = '\033[96m'
 25 |     OKGREEN = '\033[92m'
 26 |     WARNING = '\033[93m'
 27 |     FAIL = '\033[91m'
 28 |     ENDC = '\033[0m'
 29 |     BOLD = '\033[1m'
 30 |     UNDERLINE = '\033[4m'
 31 | 
 32 | def load_prompt(prompt_fname):
 33 |     # get current directory
 34 |     curr_dir = os.path.dirname(os.path.abspath(__file__))
 35 |     # get full path to file
 36 |     if '/' in prompt_fname:
 37 |         prompt_fname = prompt_fname.split('/')
 38 |         full_path = os.path.join(curr_dir, 'prompts', *prompt_fname)
 39 |     else:
 40 |         full_path = os.path.join(curr_dir, 'prompts', prompt_fname)
 41 |     # read file
 42 |     with open(full_path, 'r') as f:
 43 |         contents = f.read().strip()
 44 |     return contents
 45 | 
 46 | def normalize_vector(x, eps=1e-6):
 47 |     """normalize a vector to unit length"""
 48 |     x = np.asarray(x)
 49 |     if x.ndim == 1:
 50 |         norm = np.linalg.norm(x)
 51 |         return np.zeros_like(x) if norm < eps else (x / norm)
 52 |     elif x.ndim == 2:
 53 |         norm = np.linalg.norm(x, axis=1)  # (N,)
 54 |         normalized = np.zeros_like(x)
 55 |         normalized[norm > eps] = x[norm > eps] / norm[norm > eps][:, None]
 56 |         return normalized
 57 | 
 58 | def normalize_map(map):
 59 |     """normalization voxel maps to [0, 1] without producing nan"""
 60 |     denom = map.max() - map.min()
 61 |     if denom == 0:
 62 |         return map
 63 |     return (map - map.min()) / denom
 64 | 
 65 | def calc_curvature(path):
 66 |     dx = np.gradient(path[:, 0])
 67 |     dy = np.gradient(path[:, 1])
 68 |     dz = np.gradient(path[:, 2])
 69 |     ddx = np.gradient(dx)
 70 |     ddy = np.gradient(dy)
 71 |     ddz = np.gradient(dz)
 72 |     curvature = np.sqrt((ddy * dx - ddx * dy)**2 + (ddz * dx - ddx * dz)**2 + (ddz * dy - ddy * dz)**2) / np.power(dx**2 + dy**2 + dz**2, 3/2)
 73 |     # convert any nan to 0
 74 |     curvature[np.isnan(curvature)] = 0
 75 |     return curvature
 76 | 
 77 | class IterableDynamicObservation:
 78 |     """acts like a list of DynamicObservation objects, initialized with a function that evaluates to a list"""
 79 |     def __init__(self, func):
 80 |         assert callable(func), 'func must be callable'
 81 |         self.func = func
 82 |         self._validate_func_output()
 83 | 
 84 |     def _validate_func_output(self):
 85 |         evaluated = self.func()
 86 |         assert isinstance(evaluated, list), 'func must evaluate to a list'
 87 | 
 88 |     def __getitem__(self, index):
 89 |         def helper():
 90 |             evaluated = self.func()
 91 |             item = evaluated[index]
 92 |             # assert isinstance(item, Observation), f'got type {type(item)} instead of Observation'
 93 |             return item
 94 |         return helper
 95 | 
 96 |     def __len__(self):
 97 |         return len(self.func())
 98 | 
 99 |     def __iter__(self):
100 |         for i in range(len(self)):
101 |             yield self.__getitem__(i)
102 | 
103 |     def __call__(self):
104 |         static_list = self.func()
105 |         return static_list
106 | 
107 | class DynamicObservation:
108 |     """acts like dict observation but initialized with a function such that it uses the latest info"""
109 |     def __init__(self, func):
110 |         try:
111 |             assert callable(func) and not isinstance(func, dict), 'func must be callable or cannot be a dict'
112 |         except AssertionError as e:
113 |             print(e)
114 |             import pdb; pdb.set_trace()
115 |         self.func = func
116 |     
117 |     def __get__(self, key):
118 |         evaluated = self.func()
119 |         if isinstance(evaluated[key], np.ndarray):
120 |             return evaluated[key].copy()
121 |         return evaluated[key]
122 |     
123 |     def __getattr__(self, key):
124 |         return self.__get__(key)
125 |     
126 |     def __getitem__(self, key):
127 |         return self.__get__(key)
128 | 
129 |     def __call__(self):
130 |         static_obs = self.func()
131 |         if not isinstance(static_obs, Observation):
132 |             static_obs = Observation(static_obs)
133 |         return static_obs
134 | 
135 | class Observation(dict):
136 |     def __init__(self, obs_dict):
137 |         super().__init__(obs_dict)
138 |         self.obs_dict = obs_dict
139 |     
140 |     def __getattr__(self, key):
141 |         return self.obs_dict[key]
142 |     
143 |     def __getitem__(self, key):
144 |         return self.obs_dict[key]
145 | 
146 |     def __getstate__(self):
147 |         return self.obs_dict
148 |     
149 |     def __setstate__(self, state):
150 |         self.obs_dict = state
151 | 
152 | def pointat2quat(pointat):
153 |     """
154 |     calculate quaternion from pointat vector
155 |     """
156 |     up = np.array(pointat, dtype=np.float32)
157 |     up = normalize_vector(up)
158 |     rand_vec = np.array([1, 0, 0], dtype=np.float32)
159 |     rand_vec = normalize_vector(rand_vec)
160 |     # make sure that the random vector is close to desired direction
161 |     if np.abs(np.dot(rand_vec, up)) > 0.99:
162 |         rand_vec = np.array([0, 1, 0], dtype=np.float32)
163 |         rand_vec = normalize_vector(rand_vec)
164 |     left = np.cross(up, rand_vec)
165 |     left = normalize_vector(left)
166 |     forward = np.cross(left, up)
167 |     forward = normalize_vector(forward)
168 |     rotmat = np.eye(3).astype(np.float32)
169 |     rotmat[:3, 0] = forward
170 |     rotmat[:3, 1] = left
171 |     rotmat[:3, 2] = up
172 |     quat_wxyz = mat2quat(rotmat)
173 |     return quat_wxyz
174 | 
175 | def visualize_points(point_cloud, point_colors=None, show=True):
176 |     """visualize point clouds using plotly"""
177 |     if point_colors is None:
178 |         point_colors = point_cloud[:, 2]
179 |     fig = go.Figure(data=[go.Scatter3d(x=point_cloud[:, 0], y=point_cloud[:, 1], z=point_cloud[:, 2],
180 |                                     mode='markers', marker=dict(size=3, color=point_colors, opacity=1.0))])
181 |     if show:
182 |         fig.show()
183 |     else:
184 |         # save to html
185 |         fig.write_html('temp_pc.html')
186 |         print(f'Point cloud saved to temp_pc.html')
187 | 
188 | def _process_llm_index(indices, array_shape):
189 |     """
190 |     processing function for returned voxel maps (which are to be manipulated by LLMs)
191 |     handles non-integer indexing
192 |     handles negative indexing with manually designed special cases
193 |     """
194 |     if isinstance(indices, int) or isinstance(indices, np.int64) or isinstance(indices, np.int32) or isinstance(indices, np.int16) or isinstance(indices, np.int8):
195 |         processed = indices if indices >= 0 or indices == -1 else 0
196 |         assert len(array_shape) == 1, "1D array expected"
197 |         processed = min(processed, array_shape[0] - 1)
198 |     elif isinstance(indices, float) or isinstance(indices, np.float64) or isinstance(indices, np.float32) or isinstance(indices, np.float16):
199 |         processed = np.round(indices).astype(int) if indices >= 0 or indices == -1 else 0
200 |         assert len(array_shape) == 1, "1D array expected"
201 |         processed = min(processed, array_shape[0] - 1)
202 |     elif isinstance(indices, slice):
203 |         start, stop, step = indices.start, indices.stop, indices.step
204 |         if start is not None:
205 |             start = np.round(start).astype(int)
206 |         if stop is not None:
207 |             stop = np.round(stop).astype(int)
208 |         if step is not None:
209 |             step = np.round(step).astype(int)
210 |         # only convert the case where the start is negative and the stop is positive/negative
211 |         if (start is not None and start < 0) and (stop is not None):
212 |             if stop >= 0:
213 |                 processed = slice(0, stop, step)
214 |             else:
215 |                 processed = slice(0, 0, step)
216 |         else:
217 |             processed = slice(start, stop, step)
218 |     elif isinstance(indices, tuple) or isinstance(indices, list):
219 |         processed = tuple(
220 |             _process_llm_index(idx, (array_shape[i],)) for i, idx in enumerate(indices)
221 |         )
222 |     elif isinstance(indices, np.ndarray):
223 |         print("[IndexingWrapper] Warning: numpy array indexing was converted to list")
224 |         processed = _process_llm_index(indices.tolist(), array_shape)
225 |     else:
226 |         print(f"[IndexingWrapper] {indices} (type: {type(indices)}) not supported")
227 |         raise TypeError("Indexing type not supported")
228 |     # give warning if index was negative
229 |     if processed != indices:
230 |         print(f"[IndexingWrapper] Warning: index was changed from {indices} to {processed}")
231 |     # print(f"[IndexingWrapper] {idx} -> {processed}")
232 |     return processed
233 | 
234 | class VoxelIndexingWrapper:
235 |     """
236 |     LLM indexing wrapper that uses _process_llm_index to process indexing
237 |     behaves like a numpy array
238 |     """
239 |     def __init__(self, array):
240 |         self.array = array
241 | 
242 |     def __getitem__(self, idx):
243 |         return self.array[_process_llm_index(idx, tuple(self.array.shape))]
244 |     
245 |     def __setitem__(self, idx, value):
246 |         self.array[_process_llm_index(idx, tuple(self.array.shape))] = value
247 |     
248 |     def __repr__(self) -> str:
249 |         return self.array.__repr__()
250 |     
251 |     def __str__(self) -> str:
252 |         return self.array.__str__()
253 |     
254 |     def __eq__(self, other):
255 |         return self.array == other
256 |     
257 |     def __ne__(self, other):
258 |         return self.array != other
259 |     
260 |     def __lt__(self, other):
261 |         return self.array < other
262 |     
263 |     def __le__(self, other):
264 |         return self.array <= other
265 |     
266 |     def __gt__(self, other):
267 |         return self.array > other
268 |     
269 |     def __ge__(self, other):
270 |         return self.array >= other
271 |     
272 |     def __add__(self, other):
273 |         return self.array + other
274 |     
275 |     def __sub__(self, other):
276 |         return self.array - other
277 |     
278 |     def __mul__(self, other):
279 |         return self.array * other
280 |     
281 |     def __truediv__(self, other):
282 |         return self.array / other
283 |     
284 |     def __floordiv__(self, other):
285 |         return self.array // other
286 |     
287 |     def __mod__(self, other):
288 |         return self.array % other
289 |     
290 |     def __divmod__(self, other):
291 |         return self.array.__divmod__(other)
292 |     
293 |     def __pow__(self, other):
294 |         return self.array ** other
295 |     
296 |     def __lshift__(self, other):
297 |         return self.array << other
298 |     
299 |     def __rshift__(self, other):
300 |         return self.array >> other
301 |     
302 |     def __and__(self, other):
303 |         return self.array & other
304 |     
305 |     def __xor__(self, other):
306 |         return self.array ^ other
307 |     
308 |     def __or__(self, other):
309 |         return self.array | other
310 |     
311 |     def __radd__(self, other):
312 |         return other + self.array
313 |     
314 |     def __rsub__(self, other):
315 |         return other - self.array
316 |     
317 |     def __rmul__(self, other):
318 |         return other * self.array
319 |     
320 |     def __rtruediv__(self, other):
321 |         return other / self.array
322 |     
323 |     def __rfloordiv__(self, other):
324 |         return other // self.array
325 |     
326 |     def __rmod__(self, other):
327 |         return other % self.array
328 |     
329 |     def __rdivmod__(self, other):
330 |         return other.__divmod__(self.array)
331 |     
332 |     def __rpow__(self, other):
333 |         return other ** self.array
334 |     
335 |     def __rlshift__(self, other):
336 |         return other << self.array
337 |     
338 |     def __rrshift__(self, other):
339 |         return other >> self.array
340 |     
341 |     def __rand__(self, other):
342 |         return other & self.array
343 |     
344 |     def __rxor__(self, other):
345 |         return other ^ self.array
346 |     
347 |     def __ror__(self, other):
348 |         return other | self.array
349 |     
350 |     def __getattribute__(self, name):
351 |         if name == "array":
352 |             return super().__getattribute__(name)
353 |         elif name == "__getitem__":
354 |             return super().__getitem__
355 |         elif name == "__setitem__":
356 |             return super().__setitem__
357 |         else:
358 |             # print(name)
359 |             return super().array.__getattribute__(name)
360 |     
361 |     def __getattr__(self, name):
362 |         return self.array.__getattribute__(name)


--------------------------------------------------------------------------------
/src/envs/rlbench_env.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import numpy as np
  3 | import open3d as o3d
  4 | import json
  5 | from rlbench.action_modes.action_mode import MoveArmThenGripper
  6 | from rlbench.action_modes.arm_action_modes import ArmActionMode, EndEffectorPoseViaPlanning
  7 | from rlbench.action_modes.gripper_action_modes import Discrete, GripperActionMode
  8 | from rlbench.environment import Environment
  9 | import rlbench.tasks as tasks
 10 | from pyrep.const import ObjectType
 11 | from utils import normalize_vector, bcolors
 12 | 
 13 | class CustomMoveArmThenGripper(MoveArmThenGripper):
 14 |     """
 15 |     A potential workaround for the default MoveArmThenGripper as we frequently run into zero division errors and failed path.
 16 |     TODO: check the root cause of it.
 17 |     Ignore arm action if it fails.
 18 | 
 19 |     Attributes:
 20 |         _prev_arm_action (numpy.ndarray): Stores the previous arm action.
 21 |     """
 22 |     def __init__(self, **kwargs):
 23 |         super().__init__(**kwargs)
 24 |         self._prev_arm_action = None
 25 | 
 26 |     def action(self, scene, action):
 27 |         arm_act_size = np.prod(self.arm_action_mode.action_shape(scene))
 28 |         arm_action = np.array(action[:arm_act_size])
 29 |         ee_action = np.array(action[arm_act_size:])
 30 |         # if the arm action is the same as the previous action, skip it
 31 |         if self._prev_arm_action is not None and np.allclose(arm_action, self._prev_arm_action):
 32 |             self.gripper_action_mode.action(scene, ee_action)
 33 |         else:
 34 |             try:
 35 |                 self.arm_action_mode.action(scene, arm_action)
 36 |             except Exception as e:
 37 |                 print(f'{bcolors.FAIL}[rlbench_env.py] Ignoring failed arm action; Exception: "{str(e)}"{bcolors.ENDC}')
 38 |             self.gripper_action_mode.action(scene, ee_action)
 39 |         self._prev_arm_action = arm_action.copy()
 40 | 
 41 | class VoxPoserRLBench():
 42 |     def __init__(self, visualizer=None):
 43 |         """
 44 |         Initializes the VoxPoserRLBench environment.
 45 | 
 46 |         Args:
 47 |             visualizer: Visualization interface, optional.
 48 |         """
 49 |         action_mode = CustomMoveArmThenGripper(arm_action_mode=EndEffectorPoseViaPlanning(),
 50 |                                         gripper_action_mode=Discrete())
 51 |         self.rlbench_env = Environment(action_mode)
 52 |         self.rlbench_env.launch()
 53 |         self.task = None
 54 | 
 55 |         self.workspace_bounds_min = np.array([self.rlbench_env._scene._workspace_minx, self.rlbench_env._scene._workspace_miny, self.rlbench_env._scene._workspace_minz])
 56 |         self.workspace_bounds_max = np.array([self.rlbench_env._scene._workspace_maxx, self.rlbench_env._scene._workspace_maxy, self.rlbench_env._scene._workspace_maxz])
 57 |         self.visualizer = visualizer
 58 |         if self.visualizer is not None:
 59 |             self.visualizer.update_bounds(self.workspace_bounds_min, self.workspace_bounds_max)
 60 |         self.camera_names = ['front', 'left_shoulder', 'right_shoulder', 'overhead', 'wrist']
 61 |         # calculate lookat vector for all cameras (for normal estimation)
 62 |         name2cam = {
 63 |             'front': self.rlbench_env._scene._cam_front,
 64 |             'left_shoulder': self.rlbench_env._scene._cam_over_shoulder_left,
 65 |             'right_shoulder': self.rlbench_env._scene._cam_over_shoulder_right,
 66 |             'overhead': self.rlbench_env._scene._cam_overhead,
 67 |             'wrist': self.rlbench_env._scene._cam_wrist,
 68 |         }
 69 |         forward_vector = np.array([0, 0, 1])
 70 |         self.lookat_vectors = {}
 71 |         for cam_name in self.camera_names:
 72 |             extrinsics = name2cam[cam_name].get_matrix()
 73 |             lookat = extrinsics[:3, :3] @ forward_vector
 74 |             self.lookat_vectors[cam_name] = normalize_vector(lookat)
 75 |         # load file containing object names for each task
 76 |         path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'task_object_names.json')
 77 |         with open(path, 'r') as f:
 78 |             self.task_object_names = json.load(f)
 79 | 
 80 |         self._reset_task_variables()
 81 | 
 82 |     def get_object_names(self):
 83 |         """
 84 |         Returns the names of all objects in the current task environment.
 85 | 
 86 |         Returns:
 87 |             list: A list of object names.
 88 |         """
 89 |         name_mapping = self.task_object_names[self.task.get_name()]
 90 |         exposed_names = [names[0] for names in name_mapping]
 91 |         return exposed_names
 92 | 
 93 |     def load_task(self, task):
 94 |         """
 95 |         Loads a new task into the environment and resets task-related variables.
 96 |         Records the mask IDs of the robot, gripper, and objects in the scene.
 97 | 
 98 |         Args:
 99 |             task (str or rlbench.tasks.Task): Name of the task class or a task object.
100 |         """
101 |         self._reset_task_variables()
102 |         if isinstance(task, str):
103 |             task = getattr(tasks, task)
104 |         self.task = self.rlbench_env.get_task(task)
105 |         self.arm_mask_ids = [obj.get_handle() for obj in self.task._robot.arm.get_objects_in_tree(exclude_base=False)]
106 |         self.gripper_mask_ids = [obj.get_handle() for obj in self.task._robot.gripper.get_objects_in_tree(exclude_base=False)]
107 |         self.robot_mask_ids = self.arm_mask_ids + self.gripper_mask_ids
108 |         self.obj_mask_ids = [obj.get_handle() for obj in self.task._task.get_base().get_objects_in_tree(exclude_base=False)]
109 |         # store (object name <-> object id) mapping for relevant task objects
110 |         try:
111 |             name_mapping = self.task_object_names[self.task.get_name()]
112 |         except KeyError:
113 |             raise KeyError(f'Task {self.task.get_name()} not found in "envs/task_object_names.json" (hint: make sure the task and the corresponding object names are added to the file)')
114 |         exposed_names = [names[0] for names in name_mapping]
115 |         internal_names = [names[1] for names in name_mapping]
116 |         scene_objs = self.task._task.get_base().get_objects_in_tree(object_type=ObjectType.SHAPE,
117 |                                                                       exclude_base=False,
118 |                                                                       first_generation_only=False)
119 |         for scene_obj in scene_objs:
120 |             if scene_obj.get_name() in internal_names:
121 |                 exposed_name = exposed_names[internal_names.index(scene_obj.get_name())]
122 |                 self.name2ids[exposed_name] = [scene_obj.get_handle()]
123 |                 self.id2name[scene_obj.get_handle()] = exposed_name
124 |                 for child in scene_obj.get_objects_in_tree():
125 |                     self.name2ids[exposed_name].append(child.get_handle())
126 |                     self.id2name[child.get_handle()] = exposed_name
127 | 
128 |     def get_3d_obs_by_name(self, query_name):
129 |         """
130 |         Retrieves 3D point cloud observations and normals of an object by its name.
131 | 
132 |         Args:
133 |             query_name (str): The name of the object to query.
134 | 
135 |         Returns:
136 |             tuple: A tuple containing object points and object normals.
137 |         """
138 |         assert query_name in self.name2ids, f"Unknown object name: {query_name}"
139 |         obj_ids = self.name2ids[query_name]
140 |         # gather points and masks from all cameras
141 |         points, masks, normals = [], [], []
142 |         for cam in self.camera_names:
143 |             points.append(getattr(self.latest_obs, f"{cam}_point_cloud").reshape(-1, 3))
144 |             masks.append(getattr(self.latest_obs, f"{cam}_mask").reshape(-1))
145 |             # estimate normals using o3d
146 |             pcd = o3d.geometry.PointCloud()
147 |             pcd.points = o3d.utility.Vector3dVector(points[-1])
148 |             pcd.estimate_normals()
149 |             cam_normals = np.asarray(pcd.normals)
150 |             # use lookat vector to adjust normal vectors
151 |             flip_indices = np.dot(cam_normals, self.lookat_vectors[cam]) > 0
152 |             cam_normals[flip_indices] *= -1
153 |             normals.append(cam_normals)
154 |         points = np.concatenate(points, axis=0)
155 |         masks = np.concatenate(masks, axis=0)
156 |         normals = np.concatenate(normals, axis=0)
157 |         # get object points
158 |         obj_points = points[np.isin(masks, obj_ids)]
159 |         if len(obj_points) == 0:
160 |             raise ValueError(f"Object {query_name} not found in the scene")
161 |         obj_normals = normals[np.isin(masks, obj_ids)]
162 |         # voxel downsample using o3d
163 |         pcd = o3d.geometry.PointCloud()
164 |         pcd.points = o3d.utility.Vector3dVector(obj_points)
165 |         pcd.normals = o3d.utility.Vector3dVector(obj_normals)
166 |         pcd_downsampled = pcd.voxel_down_sample(voxel_size=0.001)
167 |         obj_points = np.asarray(pcd_downsampled.points)
168 |         obj_normals = np.asarray(pcd_downsampled.normals)
169 |         return obj_points, obj_normals
170 | 
171 |     def get_scene_3d_obs(self, ignore_robot=False, ignore_grasped_obj=False):
172 |         """
173 |         Retrieves the entire scene's 3D point cloud observations and colors.
174 | 
175 |         Args:
176 |             ignore_robot (bool): Whether to ignore points corresponding to the robot.
177 |             ignore_grasped_obj (bool): Whether to ignore points corresponding to grasped objects.
178 | 
179 |         Returns:
180 |             tuple: A tuple containing scene points and colors.
181 |         """
182 |         points, colors, masks = [], [], []
183 |         for cam in self.camera_names:
184 |             points.append(getattr(self.latest_obs, f"{cam}_point_cloud").reshape(-1, 3))
185 |             colors.append(getattr(self.latest_obs, f"{cam}_rgb").reshape(-1, 3))
186 |             masks.append(getattr(self.latest_obs, f"{cam}_mask").reshape(-1))
187 |         points = np.concatenate(points, axis=0)
188 |         colors = np.concatenate(colors, axis=0)
189 |         masks = np.concatenate(masks, axis=0)
190 | 
191 |         # only keep points within workspace
192 |         chosen_idx_x = (points[:, 0] > self.workspace_bounds_min[0]) & (points[:, 0] < self.workspace_bounds_max[0])
193 |         chosen_idx_y = (points[:, 1] > self.workspace_bounds_min[1]) & (points[:, 1] < self.workspace_bounds_max[1])
194 |         chosen_idx_z = (points[:, 2] > self.workspace_bounds_min[2]) & (points[:, 2] < self.workspace_bounds_max[2])
195 |         points = points[(chosen_idx_x & chosen_idx_y & chosen_idx_z)]
196 |         colors = colors[(chosen_idx_x & chosen_idx_y & chosen_idx_z)]
197 |         masks = masks[(chosen_idx_x & chosen_idx_y & chosen_idx_z)]
198 | 
199 |         if ignore_robot:
200 |             robot_mask = np.isin(masks, self.robot_mask_ids)
201 |             points = points[~robot_mask]
202 |             colors = colors[~robot_mask]
203 |             masks = masks[~robot_mask]
204 |         if self.grasped_obj_ids and ignore_grasped_obj:
205 |             grasped_mask = np.isin(masks, self.grasped_obj_ids)
206 |             points = points[~grasped_mask]
207 |             colors = colors[~grasped_mask]
208 |             masks = masks[~grasped_mask]
209 | 
210 |         # voxel downsample using o3d
211 |         pcd = o3d.geometry.PointCloud()
212 |         pcd.points = o3d.utility.Vector3dVector(points)
213 |         pcd.colors = o3d.utility.Vector3dVector(colors)
214 |         pcd_downsampled = pcd.voxel_down_sample(voxel_size=0.001)
215 |         points = np.asarray(pcd_downsampled.points)
216 |         colors = np.asarray(pcd_downsampled.colors).astype(np.uint8)
217 | 
218 |         return points, colors
219 | 
220 |     def reset(self):
221 |         """
222 |         Resets the environment and the task. Also updates the visualizer.
223 | 
224 |         Returns:
225 |             tuple: A tuple containing task descriptions and initial observations.
226 |         """
227 |         assert self.task is not None, "Please load a task first"
228 |         self.task.sample_variation()
229 |         descriptions, obs = self.task.reset()
230 |         obs = self._process_obs(obs)
231 |         self.init_obs = obs
232 |         self.latest_obs = obs
233 |         self._update_visualizer()
234 |         return descriptions, obs
235 | 
236 |     def apply_action(self, action):
237 |         """
238 |         Applies an action in the environment and updates the state.
239 | 
240 |         Args:
241 |             action: The action to apply.
242 | 
243 |         Returns:
244 |             tuple: A tuple containing the latest observations, reward, and termination flag.
245 |         """
246 |         assert self.task is not None, "Please load a task first"
247 |         action = self._process_action(action)
248 |         obs, reward, terminate = self.task.step(action)
249 |         obs = self._process_obs(obs)
250 |         self.latest_obs = obs
251 |         self.latest_reward = reward
252 |         self.latest_terminate = terminate
253 |         self.latest_action = action
254 |         self._update_visualizer()
255 |         grasped_objects = self.rlbench_env._scene.robot.gripper.get_grasped_objects()
256 |         if len(grasped_objects) > 0:
257 |             self.grasped_obj_ids = [obj.get_handle() for obj in grasped_objects]
258 |         return obs, reward, terminate
259 | 
260 |     def move_to_pose(self, pose, speed=None):
261 |         """
262 |         Moves the robot arm to a specific pose.
263 | 
264 |         Args:
265 |             pose: The target pose.
266 |             speed: The speed at which to move the arm. Currently not implemented.
267 | 
268 |         Returns:
269 |             tuple: A tuple containing the latest observations, reward, and termination flag.
270 |         """
271 |         if self.latest_action is None:
272 |             action = np.concatenate([pose, [self.init_obs.gripper_open]])
273 |         else:
274 |             action = np.concatenate([pose, [self.latest_action[-1]]])
275 |         return self.apply_action(action)
276 |     
277 |     def open_gripper(self):
278 |         """
279 |         Opens the gripper of the robot.
280 |         """
281 |         action = np.concatenate([self.latest_obs.gripper_pose, [1.0]])
282 |         return self.apply_action(action)
283 | 
284 |     def close_gripper(self):
285 |         """
286 |         Closes the gripper of the robot.
287 |         """
288 |         action = np.concatenate([self.latest_obs.gripper_pose, [0.0]])
289 |         return self.apply_action(action)
290 | 
291 |     def set_gripper_state(self, gripper_state):
292 |         """
293 |         Sets the state of the gripper.
294 | 
295 |         Args:
296 |             gripper_state: The target state for the gripper.
297 | 
298 |         Returns:
299 |             tuple: A tuple containing the latest observations, reward, and termination flag.
300 |         """
301 |         action = np.concatenate([self.latest_obs.gripper_pose, [gripper_state]])
302 |         return self.apply_action(action)
303 | 
304 |     def reset_to_default_pose(self):
305 |         """
306 |         Resets the robot arm to its default pose.
307 | 
308 |         Returns:
309 |             tuple: A tuple containing the latest observations, reward, and termination flag.
310 |         """
311 |         if self.latest_action is None:
312 |             action = np.concatenate([self.init_obs.gripper_pose, [self.init_obs.gripper_open]])
313 |         else:
314 |             action = np.concatenate([self.init_obs.gripper_pose, [self.latest_action[-1]]])
315 |         return self.apply_action(action)
316 | 
317 |     def get_ee_pose(self):
318 |         assert self.latest_obs is not None, "Please reset the environment first"
319 |         return self.latest_obs.gripper_pose
320 | 
321 |     def get_ee_pos(self):
322 |         return self.get_ee_pose()[:3]
323 | 
324 |     def get_ee_quat(self):
325 |         return self.get_ee_pose()[3:]
326 | 
327 |     def get_last_gripper_action(self):
328 |         """
329 |         Returns the last gripper action.
330 | 
331 |         Returns:
332 |             float: The last gripper action.
333 |         """
334 |         if self.latest_action is not None:
335 |             return self.latest_action[-1]
336 |         else:
337 |             return self.init_obs.gripper_open
338 | 
339 |     def _reset_task_variables(self):
340 |         """
341 |         Resets variables related to the current task in the environment.
342 | 
343 |         Note: This function is generally called internally.
344 |         """
345 |         self.init_obs = None
346 |         self.latest_obs = None
347 |         self.latest_reward = None
348 |         self.latest_terminate = None
349 |         self.latest_action = None
350 |         self.grasped_obj_ids = None
351 |         # scene-specific helper variables
352 |         self.arm_mask_ids = None
353 |         self.gripper_mask_ids = None
354 |         self.robot_mask_ids = None
355 |         self.obj_mask_ids = None
356 |         self.name2ids = {}  # first_generation name -> list of ids of the tree
357 |         self.id2name = {}  # any node id -> first_generation name
358 |    
359 |     def _update_visualizer(self):
360 |         """
361 |         Updates the scene in the visualizer with the latest observations.
362 | 
363 |         Note: This function is generally called internally.
364 |         """
365 |         if self.visualizer is not None:
366 |             points, colors = self.get_scene_3d_obs(ignore_robot=False, ignore_grasped_obj=False)
367 |             self.visualizer.update_scene_points(points, colors)
368 |     
369 |     def _process_obs(self, obs):
370 |         """
371 |         Processes the observations, specifically converts quaternion format from xyzw to wxyz.
372 | 
373 |         Args:
374 |             obs: The observation to process.
375 | 
376 |         Returns:
377 |             The processed observation.
378 |         """
379 |         quat_xyzw = obs.gripper_pose[3:]
380 |         quat_wxyz = np.concatenate([quat_xyzw[-1:], quat_xyzw[:-1]])
381 |         obs.gripper_pose[3:] = quat_wxyz
382 |         return obs
383 | 
384 |     def _process_action(self, action):
385 |         """
386 |         Processes the action, specifically converts quaternion format from wxyz to xyzw.
387 | 
388 |         Args:
389 |             action: The action to process.
390 | 
391 |         Returns:
392 |             The processed action.
393 |         """
394 |         quat_wxyz = action[3:7]
395 |         quat_xyzw = np.concatenate([quat_wxyz[1:], quat_wxyz[:1]])
396 |         action[3:7] = quat_xyzw
397 |         return action


--------------------------------------------------------------------------------
/src/interfaces.py:
--------------------------------------------------------------------------------
  1 | from LMP import LMP
  2 | from utils import get_clock_time, normalize_vector, pointat2quat, bcolors, Observation, VoxelIndexingWrapper
  3 | import numpy as np
  4 | from planners import PathPlanner
  5 | import time
  6 | from scipy.ndimage import distance_transform_edt
  7 | import transforms3d
  8 | from controllers import Controller
  9 | 
 10 | # creating some aliases for end effector and table in case LLMs refer to them differently (but rarely this happens)
 11 | EE_ALIAS = ['ee', 'endeffector', 'end_effector', 'end effector', 'gripper', 'hand']
 12 | TABLE_ALIAS = ['table', 'desk', 'workstation', 'work_station', 'work station', 'workspace', 'work_space', 'work space']
 13 | 
 14 | class LMP_interface():
 15 | 
 16 |   def __init__(self, env, lmp_config, controller_config, planner_config, env_name='rlbench'):
 17 |     self._env = env
 18 |     self._env_name = env_name
 19 |     self._cfg = lmp_config
 20 |     self._map_size = self._cfg['map_size']
 21 |     self._planner = PathPlanner(planner_config, map_size=self._map_size)
 22 |     self._controller = Controller(self._env, controller_config)
 23 | 
 24 |     # calculate size of each voxel (resolution)
 25 |     self._resolution = (self._env.workspace_bounds_max - self._env.workspace_bounds_min) / self._map_size
 26 |     print('#' * 50)
 27 |     print(f'## voxel resolution: {self._resolution}')
 28 |     print('#' * 50)
 29 |     print()
 30 |     print()
 31 |   
 32 |   # ======================================================
 33 |   # == functions exposed to LLM
 34 |   # ======================================================
 35 |   def get_ee_pos(self):
 36 |     return self._world_to_voxel(self._env.get_ee_pos())
 37 |   
 38 |   def detect(self, obj_name):
 39 |     """return an observation dict containing useful information about the object"""
 40 |     if obj_name.lower() in EE_ALIAS:
 41 |       obs_dict = dict()
 42 |       obs_dict['name'] = obj_name
 43 |       obs_dict['position'] = self.get_ee_pos()
 44 |       obs_dict['aabb'] = np.array([self.get_ee_pos(), self.get_ee_pos()])
 45 |       obs_dict['_position_world'] = self._env.get_ee_pos()
 46 |     elif obj_name.lower() in TABLE_ALIAS:
 47 |       offset_percentage = 0.1
 48 |       x_min = self._env.workspace_bounds_min[0] + offset_percentage * (self._env.workspace_bounds_max[0] - self._env.workspace_bounds_min[0])
 49 |       x_max = self._env.workspace_bounds_max[0] - offset_percentage * (self._env.workspace_bounds_max[0] - self._env.workspace_bounds_min[0])
 50 |       y_min = self._env.workspace_bounds_min[1] + offset_percentage * (self._env.workspace_bounds_max[1] - self._env.workspace_bounds_min[1])
 51 |       y_max = self._env.workspace_bounds_max[1] - offset_percentage * (self._env.workspace_bounds_max[1] - self._env.workspace_bounds_min[1])
 52 |       table_max_world = np.array([x_max, y_max, 0])
 53 |       table_min_world = np.array([x_min, y_min, 0])
 54 |       table_center = (table_max_world + table_min_world) / 2
 55 |       obs_dict = dict()
 56 |       obs_dict['name'] = obj_name
 57 |       obs_dict['position'] = self._world_to_voxel(table_center)
 58 |       obs_dict['_position_world'] = table_center
 59 |       obs_dict['normal'] = np.array([0, 0, 1])
 60 |       obs_dict['aabb'] = np.array([self._world_to_voxel(table_min_world), self._world_to_voxel(table_max_world)])
 61 |     else:
 62 |       obs_dict = dict()
 63 |       obj_pc, obj_normal = self._env.get_3d_obs_by_name(obj_name)
 64 |       voxel_map = self._points_to_voxel_map(obj_pc)
 65 |       aabb_min = self._world_to_voxel(np.min(obj_pc, axis=0))
 66 |       aabb_max = self._world_to_voxel(np.max(obj_pc, axis=0))
 67 |       obs_dict['occupancy_map'] = voxel_map  # in voxel frame
 68 |       obs_dict['name'] = obj_name
 69 |       obs_dict['position'] = self._world_to_voxel(np.mean(obj_pc, axis=0))  # in voxel frame
 70 |       obs_dict['aabb'] = np.array([aabb_min, aabb_max])  # in voxel frame
 71 |       obs_dict['_position_world'] = np.mean(obj_pc, axis=0)  # in world frame
 72 |       obs_dict['_point_cloud_world'] = obj_pc  # in world frame
 73 |       obs_dict['normal'] = normalize_vector(obj_normal.mean(axis=0))
 74 | 
 75 |     object_obs = Observation(obs_dict)
 76 |     return object_obs
 77 |   
 78 |   def execute(self, movable_obs_func, affordance_map=None, avoidance_map=None, rotation_map=None,
 79 |               velocity_map=None, gripper_map=None):
 80 |     """
 81 |     First use planner to generate waypoint path, then use controller to follow the waypoints.
 82 | 
 83 |     Args:
 84 |       movable_obs_func: callable function to get observation of the body to be moved
 85 |       affordance_map: callable function that generates a 3D numpy array, the target voxel map
 86 |       avoidance_map: callable function that generates a 3D numpy array, the obstacle voxel map
 87 |       rotation_map: callable function that generates a 4D numpy array, the rotation voxel map (rotation is represented by a quaternion *in world frame*)
 88 |       velocity_map: callable function that generates a 3D numpy array, the velocity voxel map
 89 |       gripper_map: callable function that generates a 3D numpy array, the gripper voxel map
 90 |     """
 91 |     # initialize default voxel maps if not specified
 92 |     if rotation_map is None:
 93 |       rotation_map = self._get_default_voxel_map('rotation')
 94 |     if velocity_map is None:
 95 |       velocity_map = self._get_default_voxel_map('velocity')
 96 |     if gripper_map is None:
 97 |       gripper_map = self._get_default_voxel_map('gripper')
 98 |     if avoidance_map is None:
 99 |       avoidance_map = self._get_default_voxel_map('obstacle')
100 |     object_centric = (not movable_obs_func()['name'] in EE_ALIAS)
101 |     execute_info = []
102 |     if affordance_map is not None:
103 |       # execute path in closed-loop
104 |       for plan_iter in range(self._cfg['max_plan_iter']):
105 |         step_info = dict()
106 |         # evaluate voxel maps such that we use latest information
107 |         movable_obs = movable_obs_func()
108 |         _affordance_map = affordance_map()
109 |         _avoidance_map = avoidance_map()
110 |         _rotation_map = rotation_map()
111 |         _velocity_map = velocity_map()
112 |         _gripper_map = gripper_map()
113 |         # preprocess avoidance map
114 |         _avoidance_map = self._preprocess_avoidance_map(_avoidance_map, _affordance_map, movable_obs)
115 |         # start planning
116 |         start_pos = movable_obs['position']
117 |         start_time = time.time()
118 |         # optimize path and log
119 |         path_voxel, planner_info = self._planner.optimize(start_pos, _affordance_map, _avoidance_map,
120 |                                                         object_centric=object_centric)
121 |         print(f'{bcolors.OKBLUE}[interfaces.py | {get_clock_time()}] planner time: {time.time() - start_time:.3f}s{bcolors.ENDC}')
122 |         assert len(path_voxel) > 0, 'path_voxel is empty'
123 |         step_info['path_voxel'] = path_voxel
124 |         step_info['planner_info'] = planner_info
125 |         # convert voxel path to world trajectory, and include rotation, velocity, and gripper information
126 |         traj_world = self._path2traj(path_voxel, _rotation_map, _velocity_map, _gripper_map)
127 |         traj_world = traj_world[:self._cfg['num_waypoints_per_plan']]
128 |         step_info['start_pos'] = start_pos
129 |         step_info['plan_iter'] = plan_iter
130 |         step_info['movable_obs'] = movable_obs
131 |         step_info['traj_world'] = traj_world
132 |         step_info['affordance_map'] = _affordance_map
133 |         step_info['rotation_map'] = _rotation_map
134 |         step_info['velocity_map'] = _velocity_map
135 |         step_info['gripper_map'] = _gripper_map
136 |         step_info['avoidance_map'] = _avoidance_map
137 | 
138 |         # visualize
139 |         if self._cfg['visualize']:
140 |           assert self._env.visualizer is not None
141 |           step_info['start_pos_world'] = self._voxel_to_world(start_pos)
142 |           step_info['targets_world'] = self._voxel_to_world(planner_info['targets_voxel'])
143 |           self._env.visualizer.visualize(step_info)
144 | 
145 |         # execute path
146 |         print(f'{bcolors.OKBLUE}[interfaces.py | {get_clock_time()}] start executing path via controller ({len(traj_world)} waypoints){bcolors.ENDC}')
147 |         controller_infos = dict()
148 |         for i, waypoint in enumerate(traj_world):
149 |           # check if the movement is finished
150 |           if np.linalg.norm(movable_obs['_position_world'] - traj_world[-1][0]) <= 0.01:
151 |             print(f"{bcolors.OKBLUE}[interfaces.py | {get_clock_time()}] reached last waypoint; curr_xyz={movable_obs['_position_world']}, target={traj_world[-1][0]} (distance: {np.linalg.norm(movable_obs['_position_world'] - traj_world[-1][0]):.3f})){bcolors.ENDC}")
152 |             break
153 |           # skip waypoint if moving to this point is going in opposite direction of the final target point
154 |           # (for example, if you have over-pushed an object, no need to move back)
155 |           if i != 0 and i != len(traj_world) - 1:
156 |             movable2target = traj_world[-1][0] - movable_obs['_position_world']
157 |             movable2waypoint = waypoint[0] - movable_obs['_position_world']
158 |             if np.dot(movable2target, movable2waypoint).round(3) <= 0:
159 |               print(f'{bcolors.OKBLUE}[interfaces.py | {get_clock_time()}] skip waypoint {i+1} because it is moving in opposite direction of the final target{bcolors.ENDC}')
160 |               continue
161 |           # execute waypoint
162 |           controller_info = self._controller.execute(movable_obs, waypoint)
163 |           # loggging
164 |           movable_obs = movable_obs_func()
165 |           dist2target = np.linalg.norm(movable_obs['_position_world'] - traj_world[-1][0])
166 |           if not object_centric and controller_info['mp_info'] == -1:
167 |             print(f'{bcolors.OKBLUE}[interfaces.py | {get_clock_time()}] failed waypoint {i+1} (wp: {waypoint[0].round(3)}, actual: {movable_obs["_position_world"].round(3)}, target: {traj_world[-1][0].round(3)}, start: {traj_world[0][0].round(3)}, dist2target: {dist2target.round(3)}); mp info: {controller_info["mp_info"]}{bcolors.ENDC}')
168 |           else:
169 |             print(f'{bcolors.OKBLUE}[interfaces.py | {get_clock_time()}] completed waypoint {i+1} (wp: {waypoint[0].round(3)}, actual: {movable_obs["_position_world"].round(3)}, target: {traj_world[-1][0].round(3)}, start: {traj_world[0][0].round(3)}, dist2target: {dist2target.round(3)}){bcolors.ENDC}')
170 |           controller_info['controller_step'] = i
171 |           controller_info['target_waypoint'] = waypoint
172 |           controller_infos[i] = controller_info
173 |         step_info['controller_infos'] = controller_infos
174 |         execute_info.append(step_info)
175 |         # check whether we need to replan
176 |         curr_pos = movable_obs['position']
177 |         if distance_transform_edt(1 - _affordance_map)[tuple(curr_pos)] <= 2:
178 |           print(f'{bcolors.OKBLUE}[interfaces.py | {get_clock_time()}] reached target; terminating {bcolors.ENDC}')
179 |           break
180 |     print(f'{bcolors.OKBLUE}[interfaces.py | {get_clock_time()}] finished executing path via controller{bcolors.ENDC}')
181 | 
182 |     # make sure we are at the final target position and satisfy any additional parametrization
183 |     # (skip if we are specifying object-centric motion)
184 |     if not object_centric:
185 |       try:
186 |         # traj_world: world_xyz, rotation, velocity, gripper
187 |         ee_pos_world = traj_world[-1][0]
188 |         ee_rot_world = traj_world[-1][1]
189 |         ee_pose_world = np.concatenate([ee_pos_world, ee_rot_world])
190 |         ee_speed = traj_world[-1][2]
191 |         gripper_state = traj_world[-1][3]
192 |       except:
193 |         # evaluate latest voxel map
194 |         _rotation_map = rotation_map()
195 |         _velocity_map = velocity_map()
196 |         _gripper_map = gripper_map()
197 |         # get last ee pose
198 |         ee_pos_world = self._env.get_ee_pos()
199 |         ee_pos_voxel = self.get_ee_pos()
200 |         ee_rot_world = _rotation_map[ee_pos_voxel[0], ee_pos_voxel[1], ee_pos_voxel[2]]
201 |         ee_pose_world = np.concatenate([ee_pos_world, ee_rot_world])
202 |         ee_speed = _velocity_map[ee_pos_voxel[0], ee_pos_voxel[1], ee_pos_voxel[2]]
203 |         gripper_state = _gripper_map[ee_pos_voxel[0], ee_pos_voxel[1], ee_pos_voxel[2]]
204 |       # move to the final target
205 |       self._env.apply_action(np.concatenate([ee_pose_world, [gripper_state]]))
206 | 
207 |     return execute_info
208 |   
209 |   def cm2index(self, cm, direction):
210 |     if isinstance(direction, str) and direction == 'x':
211 |       x_resolution = self._resolution[0] * 100  # resolution is in m, we need cm
212 |       return int(cm / x_resolution)
213 |     elif isinstance(direction, str) and direction == 'y':
214 |       y_resolution = self._resolution[1] * 100
215 |       return int(cm / y_resolution)
216 |     elif isinstance(direction, str) and direction == 'z':
217 |       z_resolution = self._resolution[2] * 100
218 |       return int(cm / z_resolution)
219 |     else:
220 |       # calculate index along the direction
221 |       assert isinstance(direction, np.ndarray) and direction.shape == (3,)
222 |       direction = normalize_vector(direction)
223 |       x_cm = cm * direction[0]
224 |       y_cm = cm * direction[1]
225 |       z_cm = cm * direction[2]
226 |       x_index = self.cm2index(x_cm, 'x')
227 |       y_index = self.cm2index(y_cm, 'y')
228 |       z_index = self.cm2index(z_cm, 'z')
229 |       return np.array([x_index, y_index, z_index])
230 |   
231 |   def index2cm(self, index, direction=None):
232 |     if direction is None:
233 |       average_resolution = np.mean(self._resolution)
234 |       return index * average_resolution * 100  # resolution is in m, we need cm
235 |     elif direction == 'x':
236 |       x_resolution = self._resolution[0] * 100
237 |       return index * x_resolution
238 |     elif direction == 'y':
239 |       y_resolution = self._resolution[1] * 100
240 |       return index * y_resolution
241 |     elif direction == 'z':
242 |       z_resolution = self._resolution[2] * 100
243 |       return index * z_resolution
244 |     else:
245 |       raise NotImplementedError
246 |     
247 |   def pointat2quat(self, vector):
248 |     assert isinstance(vector, np.ndarray) and vector.shape == (3,), f'vector: {vector}'
249 |     return pointat2quat(vector)
250 | 
251 |   def set_voxel_by_radius(self, voxel_map, voxel_xyz, radius_cm=0, value=1):
252 |     """given a 3D np array, set the value of the voxel at voxel_xyz to value. If radius is specified, set the value of all voxels within the radius to value."""
253 |     voxel_map[voxel_xyz[0], voxel_xyz[1], voxel_xyz[2]] = value
254 |     if radius_cm > 0:
255 |       radius_x = self.cm2index(radius_cm, 'x')
256 |       radius_y = self.cm2index(radius_cm, 'y')
257 |       radius_z = self.cm2index(radius_cm, 'z')
258 |       # simplified version - use rectangle instead of circle (because it is faster)
259 |       min_x = max(0, voxel_xyz[0] - radius_x)
260 |       max_x = min(self._map_size, voxel_xyz[0] + radius_x + 1)
261 |       min_y = max(0, voxel_xyz[1] - radius_y)
262 |       max_y = min(self._map_size, voxel_xyz[1] + radius_y + 1)
263 |       min_z = max(0, voxel_xyz[2] - radius_z)
264 |       max_z = min(self._map_size, voxel_xyz[2] + radius_z + 1)
265 |       voxel_map[min_x:max_x, min_y:max_y, min_z:max_z] = value
266 |     return voxel_map
267 |   
268 |   def get_empty_affordance_map(self):
269 |     return self._get_default_voxel_map('target')()  # return evaluated voxel map instead of functions (such that LLM can manipulate it)
270 | 
271 |   def get_empty_avoidance_map(self):
272 |     return self._get_default_voxel_map('obstacle')()  # return evaluated voxel map instead of functions (such that LLM can manipulate it)
273 |   
274 |   def get_empty_rotation_map(self):
275 |     return self._get_default_voxel_map('rotation')()  # return evaluated voxel map instead of functions (such that LLM can manipulate it)
276 |   
277 |   def get_empty_velocity_map(self):
278 |     return self._get_default_voxel_map('velocity')()  # return evaluated voxel map instead of functions (such that LLM can manipulate it)
279 |   
280 |   def get_empty_gripper_map(self):
281 |     return self._get_default_voxel_map('gripper')()  # return evaluated voxel map instead of functions (such that LLM can manipulate it)
282 |   
283 |   def reset_to_default_pose(self):
284 |      self._env.reset_to_default_pose()
285 |   
286 |   # ======================================================
287 |   # == helper functions
288 |   # ======================================================
289 |   def _world_to_voxel(self, world_xyz):
290 |     _world_xyz = world_xyz.astype(np.float32)
291 |     _voxels_bounds_robot_min = self._env.workspace_bounds_min.astype(np.float32)
292 |     _voxels_bounds_robot_max = self._env.workspace_bounds_max.astype(np.float32)
293 |     _map_size = self._map_size
294 |     voxel_xyz = pc2voxel(_world_xyz, _voxels_bounds_robot_min, _voxels_bounds_robot_max, _map_size)
295 |     return voxel_xyz
296 | 
297 |   def _voxel_to_world(self, voxel_xyz):
298 |     _voxels_bounds_robot_min = self._env.workspace_bounds_min.astype(np.float32)
299 |     _voxels_bounds_robot_max = self._env.workspace_bounds_max.astype(np.float32)
300 |     _map_size = self._map_size
301 |     world_xyz = voxel2pc(voxel_xyz, _voxels_bounds_robot_min, _voxels_bounds_robot_max, _map_size)
302 |     return world_xyz
303 | 
304 |   def _points_to_voxel_map(self, points):
305 |     """convert points in world frame to voxel frame, voxelize, and return the voxelized points"""
306 |     _points = points.astype(np.float32)
307 |     _voxels_bounds_robot_min = self._env.workspace_bounds_min.astype(np.float32)
308 |     _voxels_bounds_robot_max = self._env.workspace_bounds_max.astype(np.float32)
309 |     _map_size = self._map_size
310 |     return pc2voxel_map(_points, _voxels_bounds_robot_min, _voxels_bounds_robot_max, _map_size)
311 | 
312 |   def _get_voxel_center(self, voxel_map):
313 |     """calculte the center of the voxel map where value is 1"""
314 |     voxel_center = np.array(np.where(voxel_map == 1)).mean(axis=1)
315 |     return voxel_center
316 | 
317 |   def _get_scene_collision_voxel_map(self):
318 |     collision_points_world, _ = self._env.get_scene_3d_obs(ignore_robot=True)
319 |     collision_voxel = self._points_to_voxel_map(collision_points_world)
320 |     return collision_voxel
321 | 
322 |   def _get_default_voxel_map(self, type='target'):
323 |     """returns default voxel map (defaults to current state)"""
324 |     def fn_wrapper():
325 |       if type == 'target':
326 |         voxel_map = np.zeros((self._map_size, self._map_size, self._map_size))
327 |       elif type == 'obstacle':  # for LLM to do customization
328 |         voxel_map = np.zeros((self._map_size, self._map_size, self._map_size))
329 |       elif type == 'velocity':
330 |         voxel_map = np.ones((self._map_size, self._map_size, self._map_size))
331 |       elif type == 'gripper':
332 |         voxel_map = np.ones((self._map_size, self._map_size, self._map_size)) * self._env.get_last_gripper_action()
333 |       elif type == 'rotation':
334 |         voxel_map = np.zeros((self._map_size, self._map_size, self._map_size, 4))
335 |         voxel_map[:, :, :] = self._env.get_ee_quat()
336 |       else:
337 |         raise ValueError('Unknown voxel map type: {}'.format(type))
338 |       voxel_map = VoxelIndexingWrapper(voxel_map)
339 |       return voxel_map
340 |     return fn_wrapper
341 |   
342 |   def _path2traj(self, path, rotation_map, velocity_map, gripper_map):
343 |     """
344 |     convert path (generated by planner) to trajectory (used by controller)
345 |     path only contains a sequence of voxel coordinates, while trajectory parametrize the motion of the end-effector with rotation, velocity, and gripper on/off command
346 |     """
347 |     # convert path to trajectory
348 |     traj = []
349 |     for i in range(len(path)):
350 |       # get the current voxel position
351 |       voxel_xyz = path[i]
352 |       # get the current world position
353 |       world_xyz = self._voxel_to_world(voxel_xyz)
354 |       voxel_xyz = np.round(voxel_xyz).astype(int)
355 |       # get the current rotation (in world frame)
356 |       rotation = rotation_map[voxel_xyz[0], voxel_xyz[1], voxel_xyz[2]]
357 |       # get the current velocity
358 |       velocity = velocity_map[voxel_xyz[0], voxel_xyz[1], voxel_xyz[2]]
359 |       # get the current on/off
360 |       gripper = gripper_map[voxel_xyz[0], voxel_xyz[1], voxel_xyz[2]]
361 |       # LLM might specify a gripper value change, but sometimes EE may not be able to reach the exact voxel, so we overwrite the gripper value if it's close enough (TODO: better way to do this?)
362 |       if (i == len(path) - 1) and not (np.all(gripper_map == 1) or np.all(gripper_map == 0)):
363 |         # get indices of the less common values
364 |         less_common_value = 1 if np.sum(gripper_map == 1) < np.sum(gripper_map == 0) else 0
365 |         less_common_indices = np.where(gripper_map == less_common_value)
366 |         less_common_indices = np.array(less_common_indices).T
367 |         # get closest distance from voxel_xyz to any of the indices that have less common value
368 |         closest_distance = np.min(np.linalg.norm(less_common_indices - voxel_xyz[None, :], axis=0))
369 |         # if the closest distance is less than threshold, then set gripper to less common value
370 |         if closest_distance <= 3:
371 |           gripper = less_common_value
372 |           print(f'{bcolors.OKBLUE}[interfaces.py | {get_clock_time()}] overwriting gripper to less common value for the last waypoint{bcolors.ENDC}')
373 |       # add to trajectory
374 |       traj.append((world_xyz, rotation, velocity, gripper))
375 |     # append the last waypoint a few more times for the robot to stabilize
376 |     for _ in range(2):
377 |       traj.append((world_xyz, rotation, velocity, gripper))
378 |     return traj
379 |   
380 |   def _preprocess_avoidance_map(self, avoidance_map, affordance_map, movable_obs):
381 |     # collision avoidance
382 |     scene_collision_map = self._get_scene_collision_voxel_map()
383 |     # anywhere within 15/100 indices of the target is ignored (to guarantee that we can reach the target)
384 |     ignore_mask = distance_transform_edt(1 - affordance_map)
385 |     scene_collision_map[ignore_mask < int(0.15 * self._map_size)] = 0
386 |     # anywhere within 15/100 indices of the start is ignored
387 |     try:
388 |       ignore_mask = distance_transform_edt(1 - movable_obs['occupancy_map'])
389 |       scene_collision_map[ignore_mask < int(0.15 * self._map_size)] = 0
390 |     except KeyError:
391 |       start_pos = movable_obs['position']
392 |       ignore_mask = np.ones_like(avoidance_map)
393 |       ignore_mask[start_pos[0] - int(0.1 * self._map_size):start_pos[0] + int(0.1 * self._map_size),
394 |                   start_pos[1] - int(0.1 * self._map_size):start_pos[1] + int(0.1 * self._map_size),
395 |                   start_pos[2] - int(0.1 * self._map_size):start_pos[2] + int(0.1 * self._map_size)] = 0
396 |       scene_collision_map *= ignore_mask
397 |     avoidance_map += scene_collision_map
398 |     avoidance_map = np.clip(avoidance_map, 0, 1)
399 |     return avoidance_map
400 | 
401 | def setup_LMP(env, general_config, debug=False):
402 |   controller_config = general_config['controller']
403 |   planner_config = general_config['planner']
404 |   lmp_env_config = general_config['lmp_config']['env']
405 |   lmps_config = general_config['lmp_config']['lmps']
406 |   env_name = general_config['env_name']
407 |   # LMP env wrapper
408 |   lmp_env = LMP_interface(env, lmp_env_config, controller_config, planner_config, env_name=env_name)
409 |   # creating APIs that the LMPs can interact with
410 |   fixed_vars = {
411 |       'np': np,
412 |       'euler2quat': transforms3d.euler.euler2quat,
413 |       'quat2euler': transforms3d.euler.quat2euler,
414 |       'qinverse': transforms3d.quaternions.qinverse,
415 |       'qmult': transforms3d.quaternions.qmult,
416 |   }  # external library APIs
417 |   variable_vars = {
418 |       k: getattr(lmp_env, k)
419 |       for k in dir(lmp_env) if callable(getattr(lmp_env, k)) and not k.startswith("_")
420 |   }  # our custom APIs exposed to LMPs
421 | 
422 |   # allow LMPs to access other LMPs
423 |   lmp_names = [name for name in lmps_config.keys() if not name in ['composer', 'planner', 'config']]
424 |   low_level_lmps = {
425 |       k: LMP(k, lmps_config[k], fixed_vars, variable_vars, debug, env_name)
426 |       for k in lmp_names
427 |   }
428 |   variable_vars.update(low_level_lmps)
429 | 
430 |   # creating the LMP for skill-level composition
431 |   composer = LMP(
432 |       'composer', lmps_config['composer'], fixed_vars, variable_vars, debug, env_name
433 |   )
434 |   variable_vars['composer'] = composer
435 | 
436 |   # creating the LMP that deals w/ high-level language commands
437 |   task_planner = LMP(
438 |       'planner', lmps_config['planner'], fixed_vars, variable_vars, debug, env_name
439 |   )
440 | 
441 |   lmps = {
442 |       'plan_ui': task_planner,
443 |       'composer_ui': composer,
444 |   }
445 |   lmps.update(low_level_lmps)
446 | 
447 |   return lmps, lmp_env
448 | 
449 | 
450 | # ======================================================
451 | # jit-ready functions (for faster replanning time, need to install numba and add "@njit")
452 | # ======================================================
453 | def pc2voxel(pc, voxel_bounds_robot_min, voxel_bounds_robot_max, map_size):
454 |   """voxelize a point cloud"""
455 |   pc = pc.astype(np.float32)
456 |   # make sure the point is within the voxel bounds
457 |   pc = np.clip(pc, voxel_bounds_robot_min, voxel_bounds_robot_max)
458 |   # voxelize
459 |   voxels = (pc - voxel_bounds_robot_min) / (voxel_bounds_robot_max - voxel_bounds_robot_min) * (map_size - 1)
460 |   # to integer
461 |   _out = np.empty_like(voxels)
462 |   voxels = np.round(voxels, 0, _out).astype(np.int32)
463 |   assert np.all(voxels >= 0), f'voxel min: {voxels.min()}'
464 |   assert np.all(voxels < map_size), f'voxel max: {voxels.max()}'
465 |   return voxels
466 | 
467 | def voxel2pc(voxels, voxel_bounds_robot_min, voxel_bounds_robot_max, map_size):
468 |   """de-voxelize a voxel"""
469 |   # check voxel coordinates are non-negative
470 |   assert np.all(voxels >= 0), f'voxel min: {voxels.min()}'
471 |   assert np.all(voxels < map_size), f'voxel max: {voxels.max()}'
472 |   voxels = voxels.astype(np.float32)
473 |   # de-voxelize
474 |   pc = voxels / (map_size - 1) * (voxel_bounds_robot_max - voxel_bounds_robot_min) + voxel_bounds_robot_min
475 |   return pc
476 | 
477 | def pc2voxel_map(points, voxel_bounds_robot_min, voxel_bounds_robot_max, map_size):
478 |   """given point cloud, create a fixed size voxel map, and fill in the voxels"""
479 |   points = points.astype(np.float32)
480 |   voxel_bounds_robot_min = voxel_bounds_robot_min.astype(np.float32)
481 |   voxel_bounds_robot_max = voxel_bounds_robot_max.astype(np.float32)
482 |   # make sure the point is within the voxel bounds
483 |   points = np.clip(points, voxel_bounds_robot_min, voxel_bounds_robot_max)
484 |   # voxelize
485 |   voxel_xyz = (points - voxel_bounds_robot_min) / (voxel_bounds_robot_max - voxel_bounds_robot_min) * (map_size - 1)
486 |   # to integer
487 |   _out = np.empty_like(voxel_xyz)
488 |   points_vox = np.round(voxel_xyz, 0, _out).astype(np.int32)
489 |   voxel_map = np.zeros((map_size, map_size, map_size))
490 |   for i in range(points_vox.shape[0]):
491 |       voxel_map[points_vox[i, 0], points_vox[i, 1], points_vox[i, 2]] = 1
492 |   return voxel_map


--------------------------------------------------------------------------------