├── tabletop_handybot ├── resource │ └── tabletop_handybot ├── tabletop_handybot │ ├── __init__.py │ ├── audio_prompt_node.py │ ├── point_cloud_conversion.py │ ├── openai_assistant.py │ └── tabletop_handybot_node.py ├── setup.cfg ├── package.xml ├── setup.py └── launch │ └── run.launch.py ├── requirements.txt ├── assets ├── roll_dice_thumbnail.png ├── architecture_diagram.png └── pick_markers_thumbnail.png ├── .vscode └── settings.json ├── tabletop-handybot.repos ├── .gitignore └── README.md /tabletop_handybot/resource/tabletop_handybot: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tabletop_handybot/tabletop_handybot/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | open3d 2 | whisper-mic 3 | openai -------------------------------------------------------------------------------- /assets/roll_dice_thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ycheng517/tabletop-handybot/HEAD/assets/roll_dice_thumbnail.png -------------------------------------------------------------------------------- /assets/architecture_diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ycheng517/tabletop-handybot/HEAD/assets/architecture_diagram.png -------------------------------------------------------------------------------- /assets/pick_markers_thumbnail.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ycheng517/tabletop-handybot/HEAD/assets/pick_markers_thumbnail.png -------------------------------------------------------------------------------- /tabletop_handybot/setup.cfg: -------------------------------------------------------------------------------- 1 | [develop] 2 | script_dir=$base/lib/tabletop_handybot 3 | [install] 4 | install_scripts=$base/lib/tabletop_handybot 5 | 6 | [build_scripts] 7 | executable = /usr/bin/env python3 8 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.analysis.extraPaths": [ 3 | "/home/yifei/src/Grounded-Segment-Anything/GroundingDINO" 4 | ], 5 | "files.exclude": { 6 | "build/": true, 7 | "install/": true, 8 | "log/": true 9 | } 10 | } -------------------------------------------------------------------------------- /tabletop-handybot.repos: -------------------------------------------------------------------------------- 1 | repositories: 2 | pymoveit2: 3 | type: git 4 | url: https://github.com/ycheng517/pymoveit2.git 5 | version: devel 6 | ar4_ros_driver: 7 | type: git 8 | url: https://github.com/ycheng517/ar4_ros_driver.git 9 | Grounded-Segment-Anything/Grounded-Segment-Anything: 10 | type: git 11 | url: https://github.com/IDEA-Research/Grounded-Segment-Anything 12 | 13 | # dependencies of ar4_ros_driver 14 | easy_handeye2: 15 | type: git 16 | url: https://github.com/ycheng517/easy_handeye2.git 17 | ros2_aruco: 18 | type: git 19 | url: https://github.com/JMU-ROBOTICS-VIVA/ros2_aruco.git 20 | 
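The `tabletop-handybot.repos` file above is a standard vcstool manifest: each entry maps a checkout path in the workspace to a git URL and, optionally, a branch. The README's `vcs import . --input tabletop-handybot.repos` consumes it directly; purely as an illustration of the format (this script is not part of the package, and it assumes PyYAML is installed), the same information could be turned into plain `git clone` commands like this:

```python
# repos_to_clones.py -- illustration of the .repos format only; use `vcs import` in practice.
import yaml  # assumes PyYAML is available (pip install pyyaml)

with open("tabletop-handybot.repos", "r", encoding="utf-8") as f:
    repositories = yaml.safe_load(f)["repositories"]

for path, spec in repositories.items():
    # `version` is an optional branch/tag; entries without it use the default branch.
    command = f"git clone {spec['url']} {path}"
    if "version" in spec:
        command += f" --branch {spec['version']}"
    print(command)
```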
-------------------------------------------------------------------------------- /tabletop_handybot/tabletop_handybot/audio_prompt_node.py: -------------------------------------------------------------------------------- 1 | import rclpy 2 | import rclpy.node 3 | from std_msgs.msg import Empty, String 4 | from whisper_mic import WhisperMic 5 | 6 | 7 | class AudioPromptNode(rclpy.node.Node): 8 | """ROS2 Node that listens for a /listen message and publishes the 9 | result of the audio prompt to /prompt.""" 10 | 11 | def __init__(self): 12 | super().__init__("audio_prompt_node") 13 | 14 | self.whisper_mic = WhisperMic() 15 | 16 | self.prompt_pub = self.create_publisher(String, "/prompt", 10) 17 | self.listen_sub = self.create_subscription(Empty, "/listen", 18 | self.listen, 10) 19 | self._logger.info("Audio Prompt Node Initialized") 20 | 21 | def listen(self, _: Empty): 22 | result = self.whisper_mic.listen(timeout=10.0, phrase_time_limit=10.0) 23 | self._logger.info(f"Prompt: {result}") 24 | self.prompt_pub.publish(String(data=result)) 25 | 26 | 27 | def main(): 28 | rclpy.init() 29 | node = AudioPromptNode() 30 | try: 31 | rclpy.spin(node) 32 | except KeyboardInterrupt: 33 | pass 34 | 35 | rclpy.shutdown() 36 | 37 | 38 | if __name__ == "__main__": 39 | main() 40 | -------------------------------------------------------------------------------- /tabletop_handybot/package.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | tabletop_handybot 5 | 0.1.0 6 | A handy robot arm for performing tabletop tasks. 7 | 8 | Apache License 2.0 9 | Yifei Cheng 10 | Yifei Cheng 11 | 12 | ament_python 13 | 14 | ar_moveit_config 15 | cv_bridge 16 | easy_handeye2 17 | geometry_msgs 18 | rclpy 19 | realsense2_camera 20 | ros2_aruco 21 | tf2_ros 22 | sensor_msgs 23 | std_msgs 24 | 25 | 27 | ament_copyright 28 | ament_flake8 29 | ament_pep257 30 | python3-pytest 31 | 32 | 33 | ament_python 34 | 35 | 36 | -------------------------------------------------------------------------------- /tabletop_handybot/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | from glob import glob 3 | from setuptools import setup 4 | 5 | package_name = 'tabletop_handybot' 6 | 7 | setup( 8 | name=package_name, 9 | version='0.1.0', 10 | packages=[package_name], 11 | data_files=[('share/ament_index/resource_index/packages', 12 | ['resource/' + package_name]), 13 | ('share/' + package_name, ['package.xml']), 14 | (os.path.join('share', package_name, 'launch'), 15 | glob(os.path.join('launch', '*launch.[pxy][yma]*')))], 16 | install_requires=['setuptools'], 17 | zip_safe=True, 18 | author='Yifei Cheng', 19 | author_email='ycheng517@gmail.com', 20 | maintainer='Yifei Cheng', 21 | maintainer_email='ycheng517@gmail.com', 22 | keywords=['ROS'], 23 | classifiers=[ 24 | 'Intended Audience :: Developers', 25 | 'License :: OSI Approved :: Apache Software License', 26 | 'Programming Language :: Python', 27 | 'Topic :: Software Development', 28 | ], 29 | description='A handy robot arm for performing tabletop tasks.', 30 | license='Apache License, Version 2.0', 31 | tests_require=['pytest'], 32 | entry_points={ 33 | 'console_scripts': [ 34 | 'tabletop_handybot_node = tabletop_handybot.tabletop_handybot_node:main', 35 | 'audio_prompt_node = tabletop_handybot.audio_prompt_node:main', 36 | ], 37 | }, 38 | ) 39 | -------------------------------------------------------------------------------- 
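The next file, `point_cloud_conversion.py`, packs an Nx3 NumPy array of XYZ points into a `sensor_msgs/PointCloud2` message. As a usage sketch (a hypothetical standalone node, not part of this package), it can be exercised like so:

```python
import numpy as np
import rclpy
from rclpy.node import Node
from sensor_msgs.msg import PointCloud2

from tabletop_handybot.point_cloud_conversion import point_cloud_to_msg


class CloudDemo(Node):
    """Publishes a small synthetic point cloud once per second."""

    def __init__(self):
        super().__init__("cloud_demo")
        self.pub = self.create_publisher(PointCloud2, "/point_cloud", 2)
        self.timer = self.create_timer(1.0, self.tick)

    def tick(self):
        # 100 random xyz points (in meters) expressed in the camera frame.
        points = np.random.uniform(-0.1, 0.1, size=(100, 3))
        self.pub.publish(point_cloud_to_msg(points, "camera_color_frame"))


def main():
    rclpy.init()
    rclpy.spin(CloudDemo())


if __name__ == "__main__":
    main()
```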
/tabletop_handybot/tabletop_handybot/point_cloud_conversion.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sensor_msgs.msg import PointCloud2, PointField 3 | from std_msgs.msg import Header 4 | 5 | 6 | def point_cloud_to_msg(points, parent_frame): 7 | """Creates a point cloud message. 8 | Args: 9 | points: Nx3 array of xyz positions. 10 | parent_frame: frame in which the point cloud is defined 11 | Returns: 12 | sensor_msgs/PointCloud2 message 13 | 14 | Code source: 15 | https://gist.github.com/pgorczak/5c717baa44479fa064eb8d33ea4587e0 16 | 17 | References: 18 | http://docs.ros.org/melodic/api/sensor_msgs/html/msg/PointCloud2.html 19 | http://docs.ros.org/melodic/api/sensor_msgs/html/msg/PointField.html 20 | http://docs.ros.org/melodic/api/std_msgs/html/msg/Header.html 21 | 22 | """ 23 | # In a PointCloud2 message, the point cloud is stored as a byte 24 | # array. In order to unpack it, we also include some parameters 25 | # which describe the size of each individual point. 26 | ros_dtype = PointField.FLOAT32 27 | dtype = np.float32 28 | itemsize = np.dtype(dtype).itemsize # A 32-bit float takes 4 bytes. 29 | 30 | data = points.astype(dtype).tobytes() 31 | 32 | # The fields specify what the bytes represent. The first 4 bytes 33 | # represent the x-coordinate, the next 4 the y-coordinate, etc. 34 | fields = [ 35 | PointField(name=n, offset=i * itemsize, datatype=ros_dtype, count=1) 36 | for i, n in enumerate("xyz") 37 | ] 38 | 39 | # The PointCloud2 message also has a header which specifies which 40 | # coordinate frame it is represented in. 41 | header = Header(frame_id=parent_frame) 42 | 43 | return PointCloud2( 44 | header=header, 45 | height=1, 46 | width=points.shape[0], 47 | is_dense=False, 48 | is_bigendian=False, 49 | fields=fields, 50 | point_step=(itemsize * 3), # Every point consists of three float32s.
51 | row_step=(itemsize * 3 * points.shape[0]), 52 | data=data, 53 | ) 54 | -------------------------------------------------------------------------------- /tabletop_handybot/launch/run.launch.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import yaml 4 | from ament_index_python.packages import get_package_share_directory 5 | from launch import LaunchDescription 6 | from launch.actions import IncludeLaunchDescription, TimerAction 7 | from launch.launch_description_sources import PythonLaunchDescriptionSource 8 | from launch_ros.actions import Node 9 | 10 | 11 | def load_yaml(package_name, file_name): 12 | package_path = get_package_share_directory(package_name) 13 | absolute_file_path = os.path.join(package_path, file_name) 14 | with open(absolute_file_path, "r", encoding="utf-8") as file: 15 | return yaml.safe_load(file) 16 | 17 | 18 | def generate_launch_description(): 19 | realsense_args = { 20 | "enable_rgbd": "true", 21 | "enable_sync": "true", 22 | "align_depth.enable": "true", 23 | "enable_color": "true", 24 | "enable_depth": "true", 25 | "depth_module.depth_profile": "640x480x30", 26 | "depth_module.infra_profile": "640x480x30", 27 | } 28 | realsense = IncludeLaunchDescription( 29 | PythonLaunchDescriptionSource([ 30 | os.path.join(get_package_share_directory("realsense2_camera"), 31 | "launch", "rs_launch.py") 32 | ]), 33 | launch_arguments=realsense_args.items()) 34 | 35 | calibration_tf_publisher = IncludeLaunchDescription( 36 | PythonLaunchDescriptionSource([ 37 | os.path.join(get_package_share_directory("easy_handeye2"), 38 | "launch", "publish.launch.py") 39 | ]), 40 | launch_arguments={"name": "ar4_calibration"}.items()) 41 | 42 | delay_calibration_tf_publisher = TimerAction( 43 | actions=[calibration_tf_publisher], 44 | period=2.0, 45 | ) 46 | 47 | ar_moveit_launch = PythonLaunchDescriptionSource([ 48 | os.path.join(get_package_share_directory("ar_moveit_config"), "launch", 49 | "ar_moveit.launch.py") 50 | ]) 51 | ar_moveit_args = { 52 | "include_gripper": "True", 53 | "rviz_config_file": "moveit_with_camera.rviz" 54 | }.items() 55 | ar_moveit = IncludeLaunchDescription(ar_moveit_launch, 56 | launch_arguments=ar_moveit_args) 57 | 58 | tabletop_handybot_node = Node( 59 | package="tabletop_handybot", 60 | executable="tabletop_handybot_node", 61 | name="tabletop_handybot_node", 62 | output="screen", 63 | ) 64 | 65 | audio_prompt_node = Node( 66 | package="tabletop_handybot", 67 | executable="audio_prompt_node", 68 | name="audio_prompt_node", 69 | output="screen", 70 | ) 71 | 72 | return LaunchDescription([ 73 | realsense, delay_calibration_tf_publisher, ar_moveit, 74 | audio_prompt_node, tabletop_handybot_node 75 | ]) 76 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # ROS 2 | install/ 3 | log/ 4 | 5 | # imported repositories 6 | pymoveit2/ 7 | ar4_ros_driver/ 8 | Grounded-Segment-Anything/ 9 | easy_handeye2/ 10 | ros2_aruco/ 11 | 12 | # Byte-compiled / optimized / DLL files 13 | __pycache__/ 14 | *.py[cod] 15 | *$py.class 16 | 17 | # C extensions 18 | *.so 19 | 20 | # Distribution / packaging 21 | .Python 22 | build/ 23 | develop-eggs/ 24 | dist/ 25 | downloads/ 26 | eggs/ 27 | .eggs/ 28 | lib/ 29 | lib64/ 30 | parts/ 31 | sdist/ 32 | var/ 33 | wheels/ 34 | share/python-wheels/ 35 | *.egg-info/ 36 | .installed.cfg 37 | *.egg 38 | MANIFEST 39 | 40 | # PyInstaller 41 | # Usually these 
files are written by a python script from a template 42 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 43 | *.manifest 44 | *.spec 45 | 46 | # Installer logs 47 | pip-log.txt 48 | pip-delete-this-directory.txt 49 | 50 | # Unit test / coverage reports 51 | htmlcov/ 52 | .tox/ 53 | .nox/ 54 | .coverage 55 | .coverage.* 56 | .cache 57 | nosetests.xml 58 | coverage.xml 59 | *.cover 60 | *.py,cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | cover/ 64 | 65 | # Translations 66 | *.mo 67 | *.pot 68 | 69 | # Django stuff: 70 | *.log 71 | local_settings.py 72 | db.sqlite3 73 | db.sqlite3-journal 74 | 75 | # Flask stuff: 76 | instance/ 77 | .webassets-cache 78 | 79 | # Scrapy stuff: 80 | .scrapy 81 | 82 | # Sphinx documentation 83 | docs/_build/ 84 | 85 | # PyBuilder 86 | .pybuilder/ 87 | target/ 88 | 89 | # Jupyter Notebook 90 | .ipynb_checkpoints 91 | 92 | # IPython 93 | profile_default/ 94 | ipython_config.py 95 | 96 | # pyenv 97 | # For a library or package, you might want to ignore these files since the code is 98 | # intended to run in multiple environments; otherwise, check them in: 99 | # .python-version 100 | 101 | # pipenv 102 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 103 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 104 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 105 | # install all needed dependencies. 106 | #Pipfile.lock 107 | 108 | # poetry 109 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 110 | # This is especially recommended for binary packages to ensure reproducibility, and is more 111 | # commonly ignored for libraries. 112 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 113 | #poetry.lock 114 | 115 | # pdm 116 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 117 | #pdm.lock 118 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 119 | # in version control. 120 | # https://pdm.fming.dev/#use-with-ide 121 | .pdm.toml 122 | 123 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 124 | __pypackages__/ 125 | 126 | # Celery stuff 127 | celerybeat-schedule 128 | celerybeat.pid 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | # pytype static type analyzer 161 | .pytype/ 162 | 163 | # Cython debug symbols 164 | cython_debug/ 165 | 166 | # PyCharm 167 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 168 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 169 | # and can be added to the global gitignore or merged into this file. For a more nuclear 170 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
171 | #.idea/ 172 | -------------------------------------------------------------------------------- /tabletop_handybot/tabletop_handybot/openai_assistant.py: -------------------------------------------------------------------------------- 1 | from openai import OpenAI # pylint: disable=import-self 2 | from openai.types.beta import Assistant 3 | 4 | # pylint: disable=line-too-long 5 | 6 | 7 | def get_or_create_assistant(client: OpenAI, 8 | assistant_id: str = "") -> Assistant: 9 | if assistant_id: 10 | return client.beta.assistants.retrieve(assistant_id) 11 | 12 | return client.beta.assistants.create( 13 | name="Tabletop Assistant", 14 | instructions=( 15 | "You are a robot arm mounted on a table. Write and run code to " 16 | "do tasks on the table. You can only pick up one object at a time." 17 | ), 18 | model="gpt-4o", 19 | temperature=0.01, 20 | tools=[{ 21 | "type": "function", 22 | "function": { 23 | "name": "detect_objects", 24 | "description": 25 | "Detect objects in the field of view of the camera", 26 | "parameters": { 27 | "type": "object", 28 | "properties": { 29 | "object_classes": { 30 | "type": 31 | "string", 32 | "description": 33 | ("Object classes to detect, comma separated" 34 | "For example: horses,rivers,plain"), 35 | } 36 | }, 37 | "required": ["object_classes"], 38 | }, 39 | }, 40 | }, { 41 | "type": "function", 42 | "function": { 43 | "name": "pick_object", 44 | "description": 45 | "Pick up an object from the output of get_objects.", 46 | "parameters": { 47 | "type": "object", 48 | "properties": { 49 | "object_index": { 50 | "type": 51 | "integer", 52 | "description": 53 | "index of target object in the detected objects list to execute the action for." 54 | } 55 | }, 56 | "required": ["object_index"] 57 | } 58 | } 59 | }, { 60 | "type": "function", 61 | "function": { 62 | "name": "move_above_object_and_release", 63 | "description": 64 | "move the end effector above the object and release the gripper. The object is from the output of get_objects", 65 | "parameters": { 66 | "type": "object", 67 | "properties": { 68 | "object_index": { 69 | "type": 70 | "integer", 71 | "description": 72 | "index of target object in the detected objects list to execute the action for." 73 | } 74 | }, 75 | "required": ["object_index"] 76 | } 77 | } 78 | }, { 79 | "type": "function", 80 | "function": { 81 | "name": "release_gripper", 82 | "description": "Open up the gripper", 83 | "parameters": { 84 | "type": "object", 85 | "properties": {}, 86 | "required": [] 87 | } 88 | } 89 | }, { 90 | "type": "function", 91 | "function": { 92 | "name": "flick_wrist_while_release", 93 | "description": 94 | "Flick the wrist while releasing the gripper, basically tossing the object.", 95 | "parameters": { 96 | "type": "object", 97 | "properties": {}, 98 | "required": [] 99 | } 100 | } 101 | }], 102 | ) 103 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tabletop HandyBot 2 | 3 | A low-cost AI powered robotic arm assistant that listens to your voice commands 4 | and can carry out a variety of tabletop tasks. Currently it can pick and place 5 | arbitrary objects presented in front of it. The total BOM is around $2300 USD. 6 | 7 | ## Video Demo 8 | 9 |
10 | 11 | | Rolling Dice | Picking Markers | 12 | | :--------------------------------------------------------------------------------------: | :--------------------------------------------------------------------------------------------: | 13 | | [![Roll Dice](./assets/roll_dice_thumbnail.png)](https://youtube.com/shorts/xAmPpvCZZU4) | [![Pick Markers](./assets/pick_markers_thumbnail.png)](https://youtube.com/shorts/fsCkS7HvgGo) | 14 | 15 |
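Both demos are driven purely by natural-language prompts. A prompt normally arrives through the microphone (the `/listen` topic) or via `ros2 topic pub` as shown in the Run section below, but it can also be sent from a short Python script; a minimal sketch using the same `/prompt` topic and `std_msgs/String` message used by the nodes in this repo:

```python
import rclpy
from rclpy.node import Node
from std_msgs.msg import String


def main():
    rclpy.init()
    node = Node("prompt_sender")
    pub = node.create_publisher(String, "/prompt", 10)
    # Spin briefly so the publisher can match with the subscriber before sending.
    rclpy.spin_once(node, timeout_sec=1.0)
    pub.publish(String(data="put the marker in the container"))
    rclpy.spin_once(node, timeout_sec=1.0)
    node.destroy_node()
    rclpy.shutdown()


if __name__ == "__main__":
    main()
```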
16 |
17 | ## System Architecture
18 |
19 | ![System Architecture](./assets/architecture_diagram.png)
20 |
21 | ### Technology Used
22 |
23 | - [AR4 Robot](https://www.anninrobotics.com/): a $2000 robot arm that can lift 5lb dumbbells
24 | - [ar4_ros_driver](https://github.com/ycheng517/ar4_ros_driver): ROS driver for the arm
25 | - [Realsense D435](https://store.intelrealsense.com/buy-intel-realsense-depth-camera-d435.html?_ga=2.156205052.639062113.1715864879-1357777019.1715864879): a reliable $300 RGBD camera
26 | - [realsense-ros](https://github.com/IntelRealSense/realsense-ros): ROS wrapper for the camera
27 | - [Whisper Mic](https://github.com/mallorbc/whisper_mic): a convenient library for using OpenAI Whisper with a microphone
28 | - [Grounding DINO](https://github.com/IDEA-Research/Grounded-Segment-Anything/tree/main/GroundingDINO): a SOTA zero-shot object detector that can detect any object
29 | - [Segment Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything/tree/main/segment_anything): a SOTA zero-shot object segmentation model that can segment any object
30 | - [OpenAI Assistants](https://platform.openai.com/docs/assistants/overview): an API that calls ChatGPT with specific instructions and function calls
31 |
32 | ## Setup
33 |
34 | ### Pre-requisites
35 |
36 | You should have [ROS 2 Iron](https://docs.ros.org/en/iron/index.html) installed on Ubuntu 22.04.
37 |
38 | The following hardware is used. Using other hardware will require adapting the codebase.
39 |
40 | - [AR4 robot arm](https://www.anninrobotics.com/) with [AR servo gripper](https://www.anninrobotics.com/product-page/servo-gripper-parts-kit)
41 | - [Intel RealSense D435 Depth Camera](https://www.intelrealsense.com/depth-camera-d435/)
42 |
43 | You should be able to run [ar4_ros_driver](https://github.com/ycheng517/ar4_ros_driver)
44 | with the gripper, and successfully perform hand-eye calibration.
45 |
46 | ### Install
47 |
48 | Import the dependent repositories:
49 |
50 | ```bash
51 | vcs import . --input tabletop-handybot.repos
52 | ```
53 |
54 | Create a virtual environment, e.g.:
55 |
56 | ```bash
57 | pyenv virtualenv 3.10.12 handybot
58 | ```
59 |
60 | Go to the `./Grounded-Segment-Anything/Grounded-Segment-Anything/` sub-directory
61 | and set up [Grounded-Segment-Anything](https://github.com/IDEA-Research/Grounded-Segment-Anything)
62 | in the virtualenv. Ensure you can run the [grounded_sam.ipynb](https://github.com/IDEA-Research/Grounded-Segment-Anything/blob/main/grounded_sam.ipynb)
63 | notebook.
64 |
65 | Install the Python dependencies that are not in the ROS Index:
66 |
67 | ```bash
68 | pip install -r requirements.txt
69 | ```
70 |
71 | Build and source the project:
72 |
73 | ```bash
74 | colcon build
75 | source install/setup.bash
76 | ```
77 |
78 | ### Run
79 |
80 | Launch the AR4:
81 |
82 | ```bash
83 | ros2 launch ar_hardware_interface ar_hardware.launch.py calibrate:=True include_gripper:=True
84 | ```
85 |
86 | Launch all other programs:
87 |
88 | ```bash
89 | ros2 launch tabletop_handybot run.launch.py
90 | ```
91 |
92 | If everything looks good, you can try publishing a prompt to the
93 | `/prompt` topic for the robot to execute, e.g.:
94 |
95 | ```bash
96 | ros2 topic pub --once /prompt std_msgs/msg/String "data: 'put the marker in the container'"
97 | ```
98 |
99 | If you have a microphone attached to your computer, you can publish a message
100 | to the `/listen` topic and then say your prompt.
101 | 102 | ```bash 103 | ros2 topic pub --once /listen std_msgs/msg/Empty "{}" 104 | ``` 105 | -------------------------------------------------------------------------------- /tabletop_handybot/tabletop_handybot/tabletop_handybot_node.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import time 4 | from functools import cached_property 5 | from typing import List 6 | 7 | import cv2 8 | import numpy as np 9 | import open3d as o3d 10 | import openai 11 | import rclpy 12 | import supervision as sv 13 | import tf2_ros 14 | import torch 15 | import torchvision 16 | from cv_bridge import CvBridge 17 | from geometry_msgs.msg import Pose, PoseStamped 18 | from groundingdino.util.inference import Model 19 | from openai.types.beta import Assistant 20 | from openai.types.beta.threads import RequiredActionFunctionToolCall 21 | from rclpy.callback_groups import ReentrantCallbackGroup 22 | from rclpy.duration import Duration 23 | from rclpy.executors import MultiThreadedExecutor 24 | from rclpy.callback_groups import MutuallyExclusiveCallbackGroup 25 | from rclpy.node import Node 26 | from rclpy.time import Time 27 | from scipy.spatial.transform import Rotation 28 | from segment_anything import SamPredictor, sam_model_registry 29 | from sensor_msgs.msg import Image, PointCloud2 30 | from std_msgs.msg import Int64, String 31 | from sensor_msgs.msg import JointState 32 | 33 | from pymoveit2 import GripperInterface, MoveIt2 34 | 35 | from .openai_assistant import get_or_create_assistant 36 | from .point_cloud_conversion import point_cloud_to_msg 37 | 38 | DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu") 39 | 40 | # GroundingDINO config and checkpoint 41 | GSA_PATH = "./Grounded-Segment-Anything/Grounded-Segment-Anything" 42 | GROUNDING_DINO_CONFIG_PATH = os.path.join( 43 | GSA_PATH, "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py") 44 | GROUNDING_DINO_CHECKPOINT_PATH = os.path.join(GSA_PATH, 45 | "groundingdino_swint_ogc.pth") 46 | 47 | # Segment-Anything checkpoint 48 | SAM_ENCODER_VERSION = "vit_h" 49 | SAM_CHECKPOINT_PATH = os.path.join(GSA_PATH, "sam_vit_h_4b8939.pth") 50 | 51 | # Predict classes and hyper-param for GroundingDINO 52 | BOX_THRESHOLD = 0.4 53 | TEXT_THRESHOLD = 0.25 54 | NMS_THRESHOLD = 0.8 55 | 56 | 57 | # Prompting SAM with detected boxes 58 | def segment(sam_predictor: SamPredictor, image: np.ndarray, 59 | xyxy: np.ndarray) -> np.ndarray: 60 | sam_predictor.set_image(image) 61 | result_masks = [] 62 | for box in xyxy: 63 | masks, scores, _ = sam_predictor.predict(box=box, 64 | multimask_output=True) 65 | index = np.argmax(scores) 66 | result_masks.append(masks[index]) 67 | return np.array(result_masks) 68 | 69 | 70 | class TabletopHandyBotNode(Node): 71 | """Main ROS 2 node for Tabletop HandyBot.""" 72 | 73 | # TODO: make args rosparams 74 | def __init__( 75 | self, 76 | annotate: bool = False, 77 | publish_point_cloud: bool = False, 78 | assistant_id: str = "", 79 | # Adjust these offsets to your needs: 80 | offset_x: float = 0.015, 81 | offset_y: float = -0.015, 82 | offset_z: float = 0.08, # accounts for the height of the gripper 83 | ): 84 | super().__init__("tabletop_handybot_node") 85 | 86 | self.logger = self.get_logger() 87 | 88 | self.cv_bridge = CvBridge() 89 | self.gripper_joint_name = "gripper_joint" 90 | callback_group = ReentrantCallbackGroup() 91 | # Create MoveIt 2 interface 92 | self.arm_joint_names = [ 93 | "joint_1", "joint_2", "joint_3", "joint_4", "joint_5", 
"joint_6" 94 | ] 95 | self.moveit2 = MoveIt2( 96 | node=self, 97 | joint_names=self.arm_joint_names, 98 | base_link_name="base_link", 99 | end_effector_name="link_6", 100 | group_name="ar_manipulator", 101 | callback_group=callback_group, 102 | ) 103 | self.moveit2.planner_id = "RRTConnectkConfigDefault" 104 | self.gripper_interface = GripperInterface( 105 | node=self, 106 | gripper_joint_names=["gripper_jaw1_joint"], 107 | open_gripper_joint_positions=[-0.012], 108 | closed_gripper_joint_positions=[0.0], 109 | gripper_group_name="ar_gripper", 110 | callback_group=callback_group, 111 | gripper_command_action_name="/gripper_controller/gripper_cmd", 112 | ) 113 | self.tf_buffer = tf2_ros.Buffer() 114 | self.tf_listener = tf2_ros.TransformListener(self.tf_buffer, self) 115 | 116 | self.grounding_dino_model = Model( 117 | model_config_path=GROUNDING_DINO_CONFIG_PATH, 118 | model_checkpoint_path=GROUNDING_DINO_CHECKPOINT_PATH, 119 | ) 120 | self.sam = sam_model_registry[SAM_ENCODER_VERSION]( 121 | checkpoint=SAM_CHECKPOINT_PATH) 122 | self.sam.to(device=DEVICE) 123 | self.sam_predictor = SamPredictor(self.sam) 124 | self.openai = openai.OpenAI() 125 | self.assistant: Assistant = get_or_create_assistant( 126 | self.openai, assistant_id) 127 | self.logger.info(f"Loaded assistant with ID: {self.assistant.id}") 128 | 129 | self.annotate = annotate 130 | self.publish_point_cloud = publish_point_cloud 131 | self.n_frames_processed = 0 132 | self._last_depth_msg = None 133 | self._last_rgb_msg = None 134 | self.arm_joint_state: JointState | None = None 135 | self._last_detections: sv.Detections | None = None 136 | self._object_in_gripper: bool = False 137 | self.gripper_squeeze_factor = 0.5 138 | self.offset_x = offset_x 139 | self.offset_y = offset_y 140 | self.offset_z = offset_z 141 | 142 | self.image_sub = self.create_subscription(Image, 143 | "/camera/color/image_raw", 144 | self.image_callback, 10) 145 | self.depth_sub = self.create_subscription( 146 | Image, "/camera/aligned_depth_to_color/image_raw", 147 | self.depth_callback, 10) 148 | self.joint_states_sub = self.create_subscription( 149 | JointState, "/joint_states", self.joint_states_callback, 10) 150 | 151 | if self.publish_point_cloud: 152 | self.point_cloud_pub = self.create_publisher( 153 | PointCloud2, "/point_cloud", 2) 154 | self.prompt_sub = self.create_subscription( 155 | String, 156 | "/prompt", 157 | self.start, 158 | 10, 159 | callback_group=MutuallyExclusiveCallbackGroup()) 160 | self.save_images_sub = self.create_subscription( 161 | String, "/save_images", self.save_images, 10) 162 | self.detect_objects_sub = self.create_subscription( 163 | String, "/detect_objects", self.detect_objects_cb, 10) 164 | self.release_at_sub = self.create_subscription(Int64, "/release_above", 165 | self.release_above_cb, 166 | 10) 167 | self.pick_object_sub = self.create_subscription( 168 | Int64, "/pick_object", self.pick_object_cb, 10) 169 | 170 | self.logger.info("Tabletop HandyBot node initialized.") 171 | 172 | def execution_success(self, tool_call: RequiredActionFunctionToolCall, 173 | tool_outputs: List[dict]) -> None: 174 | output_dict = {"success": True} 175 | tool_outputs.append({ 176 | "tool_call_id": tool_call.id, 177 | "output": json.dumps(output_dict), 178 | }) 179 | 180 | def execution_failure(self, tool_call: RequiredActionFunctionToolCall, 181 | tool_outputs: List[dict], failure_reason: str): 182 | output_dict = { 183 | "success": False, 184 | "failure_reason": failure_reason, 185 | } 186 | tool_outputs.append({ 187 | 
"tool_call_id": tool_call.id, 188 | "output": json.dumps(output_dict), 189 | }) 190 | 191 | def handle_tool_call(self, tool_call: RequiredActionFunctionToolCall, 192 | rgb_image: np.ndarray, depth_image: np.ndarray, 193 | tool_outputs: List[dict]): 194 | if tool_call.function.name == "detect_objects": 195 | args = json.loads(tool_call.function.arguments) 196 | classes_str = args["object_classes"] 197 | classes = classes_str.split(",") 198 | self._last_detections = self.detect_objects(rgb_image, classes) 199 | detected_classes = [ 200 | classes[class_id] 201 | for class_id in self._last_detections.class_id 202 | ] 203 | self.logger.info(f"Detected {detected_classes}.") 204 | self.logger.info( 205 | f"detection confidence: {self._last_detections.confidence}") 206 | output_dict = { 207 | "object_classes": detected_classes, 208 | } 209 | tool_outputs.append({ 210 | "tool_call_id": tool_call.id, 211 | "output": json.dumps(output_dict), 212 | }) 213 | elif tool_call.function.name == "pick_object": 214 | args = json.loads(tool_call.function.arguments) 215 | if self._last_detections is None: 216 | self.execution_failure(tool_call, tool_outputs, 217 | "No detections available.") 218 | return 219 | 220 | if self._object_in_gripper: 221 | self.execution_failure( 222 | tool_call, tool_outputs, 223 | "Gripper is already holding an object.") 224 | return 225 | 226 | if args["object_index"] >= len(self._last_detections.mask): 227 | self.execution_failure( 228 | tool_call, tool_outputs, 229 | ("Invalid object index. Only " 230 | f"{len(self._last_detections.mask)} objects detected.")) 231 | return 232 | 233 | self.pick_object(args["object_index"], self._last_detections, 234 | depth_image) 235 | self.logger.info( 236 | f"done picking object. Joint states: {self.arm_joint_state.position}" 237 | ) 238 | self._object_in_gripper = True 239 | self.execution_success(tool_call, tool_outputs) 240 | elif tool_call.function.name == "move_above_object_and_release": 241 | args = json.loads(tool_call.function.arguments) 242 | if self._last_detections is None: 243 | self.execution_failure(tool_call, tool_outputs, 244 | "No detections available.") 245 | return 246 | 247 | if args["object_index"] >= len(self._last_detections.mask): 248 | self.execution_failure( 249 | tool_call, tool_outputs, 250 | ("Invalid object index. 
Only " 251 | f"{len(self._last_detections.mask)} objects detected.")) 252 | return 253 | 254 | self.release_above(args["object_index"], self._last_detections, 255 | depth_image) 256 | self._object_in_gripper = False 257 | self.execution_success(tool_call, tool_outputs) 258 | elif tool_call.function.name == "release_gripper": 259 | self.release_gripper() 260 | self.execution_success(tool_call, tool_outputs) 261 | self._object_in_gripper = False 262 | elif tool_call.function.name == "flick_wrist_while_release": 263 | self.flick_wrist_while_release() 264 | self.execution_success(tool_call, tool_outputs) 265 | self._object_in_gripper = False 266 | 267 | def start(self, msg: String): 268 | if not self._last_rgb_msg or not self._last_depth_msg: 269 | return 270 | 271 | rgb_image = self.cv_bridge.imgmsg_to_cv2(self._last_rgb_msg) 272 | depth_image = self.cv_bridge.imgmsg_to_cv2(self._last_depth_msg) 273 | self._last_detections = None 274 | 275 | self.logger.info(f"Processing: {msg.data}") 276 | self.logger.info( 277 | f"Initial Joint states: {self.arm_joint_state.position}") 278 | thread = self.openai.beta.threads.create() 279 | message = self.openai.beta.threads.messages.create( 280 | thread_id=thread.id, 281 | role="user", 282 | content=msg.data, 283 | ) 284 | self.logger.info(str(message)) 285 | 286 | run = self.openai.beta.threads.runs.create_and_poll( 287 | thread_id=thread.id, 288 | assistant_id=self.assistant.id, 289 | ) 290 | 291 | done = False 292 | while not done: 293 | if run.status == "completed": 294 | messages = self.openai.beta.threads.messages.list( 295 | thread_id=thread.id) 296 | self.logger.info(str(messages)) 297 | done = True 298 | break 299 | else: 300 | self.logger.info(run.status) 301 | 302 | tool_outputs = [] 303 | for tool_call in run.required_action.submit_tool_outputs.tool_calls: 304 | self.logger.info(f"tool_call: {tool_call}") 305 | if tool_call.type == "function": 306 | self.handle_tool_call(tool_call, rgb_image, depth_image, 307 | tool_outputs) 308 | 309 | self.logger.info(f"tool_outputs: {tool_outputs}") 310 | if tool_outputs: 311 | try: 312 | run = self.openai.beta.threads.runs.submit_tool_outputs_and_poll( 313 | thread_id=thread.id, 314 | run_id=run.id, 315 | tool_outputs=tool_outputs) 316 | self.logger.info("Tool outputs submitted successfully.") 317 | except Exception as e: # pylint: disable=broad-except 318 | self.logger.error(f"Failed to submit tool outputs: {e}") 319 | else: 320 | self.logger.info("No tool outputs to submit.") 321 | 322 | self.go_home() 323 | self.logger.info("Task completed.") 324 | 325 | def detect_objects(self, image: np.ndarray, object_classes: List[str]): 326 | self.logger.info(f"Detecting objects of classes: {object_classes}") 327 | detections: sv.Detections = self.grounding_dino_model.predict_with_classes( 328 | image=image, 329 | classes=object_classes, 330 | box_threshold=BOX_THRESHOLD, 331 | text_threshold=TEXT_THRESHOLD, 332 | ) 333 | 334 | # NMS post process 335 | nms_idx = (torchvision.ops.nms( 336 | torch.from_numpy(detections.xyxy), 337 | torch.from_numpy(detections.confidence), 338 | NMS_THRESHOLD, 339 | ).numpy().tolist()) 340 | 341 | detections.xyxy = detections.xyxy[nms_idx] 342 | detections.confidence = detections.confidence[nms_idx] 343 | detections.class_id = detections.class_id[nms_idx] 344 | 345 | detections.mask = segment( 346 | sam_predictor=self.sam_predictor, 347 | image=cv2.cvtColor(image, cv2.COLOR_BGR2RGB), 348 | xyxy=detections.xyxy, 349 | ) 350 | 351 | if self.annotate: 352 | box_annotator = 
sv.BoxAnnotator() 353 | mask_annotator = sv.MaskAnnotator() 354 | labels = [ 355 | f"{object_classes[class_id]} {confidence:0.2f}" 356 | for _, _, confidence, class_id, _, _ in detections 357 | ] 358 | annotated_frame = box_annotator.annotate(scene=image.copy(), 359 | detections=detections, 360 | labels=labels) 361 | cv2.imwrite( 362 | f"annotated_image_detections_{self.n_frames_processed}.jpg", 363 | annotated_frame) 364 | 365 | annotated_frame = mask_annotator.annotate(scene=image.copy(), 366 | detections=detections) 367 | cv2.imwrite(f"annotated_image_masks_{self.n_frames_processed}.jpg", 368 | annotated_frame) 369 | 370 | if self.publish_point_cloud: 371 | depth_image = self.cv_bridge.imgmsg_to_cv2(self._last_depth_msg) 372 | # mask out the depth image except for the detected objects 373 | masked_depth_image = np.zeros_like(depth_image, dtype=np.float32) 374 | for mask in detections.mask: 375 | masked_depth_image[mask] = depth_image[mask] 376 | masked_depth_image /= 1000.0 377 | 378 | # convert the masked depth image to a point cloud 379 | pcd = o3d.geometry.PointCloud.create_from_depth_image( 380 | o3d.geometry.Image(masked_depth_image), 381 | o3d.camera.PinholeCameraIntrinsic( 382 | o3d.camera.PinholeCameraIntrinsicParameters. 383 | PrimeSenseDefault), 384 | ) 385 | 386 | # convert it to a ROS PointCloud2 message 387 | points = np.asarray(pcd.points) 388 | pc_msg = point_cloud_to_msg(points, "/camera_color_frame") 389 | self.point_cloud_pub.publish(pc_msg) 390 | 391 | self.n_frames_processed += 1 392 | return detections 393 | 394 | def pick_object(self, object_index: int, detections: sv.Detections, 395 | depth_image: np.ndarray): 396 | """Perform a top-down grasp on the object.""" 397 | # mask out the depth image except for the detected objects 398 | masked_depth_image = np.zeros_like(depth_image, dtype=np.float32) 399 | mask = detections.mask[object_index] 400 | masked_depth_image[mask] = depth_image[mask] 401 | masked_depth_image /= 1000.0 402 | 403 | # convert the masked depth image to a point cloud 404 | pcd = o3d.geometry.PointCloud.create_from_depth_image( 405 | o3d.geometry.Image(masked_depth_image), 406 | o3d.camera.PinholeCameraIntrinsic( 407 | o3d.camera.PinholeCameraIntrinsicParameters.PrimeSenseDefault), 408 | ) 409 | pcd.transform(self.cam_to_base_affine) 410 | points = np.asarray(pcd.points) 411 | grasp_z = points[:, 2].max() 412 | 413 | near_grasp_z_points = points[points[:, 2] > grasp_z - 0.008] 414 | xy_points = near_grasp_z_points[:, :2] 415 | xy_points = xy_points.astype(np.float32) 416 | center, dimensions, theta = cv2.minAreaRect(xy_points) 417 | 418 | gripper_rotation = theta 419 | if dimensions[0] > dimensions[1]: 420 | gripper_rotation -= 90 421 | if gripper_rotation < -90: 422 | gripper_rotation += 180 423 | elif gripper_rotation > 90: 424 | gripper_rotation -= 180 425 | 426 | gripper_opening = min(dimensions) 427 | grasp_pose = Pose() 428 | grasp_pose.position.x = center[0] + self.offset_x 429 | grasp_pose.position.y = center[1] + self.offset_y 430 | grasp_pose.position.z = grasp_z + self.offset_z 431 | top_down_rot = Rotation.from_quat([0, 1, 0, 0]) 432 | extra_rot = Rotation.from_euler("z", gripper_rotation, degrees=True) 433 | grasp_quat = (extra_rot * top_down_rot).as_quat() 434 | grasp_pose.orientation.x = grasp_quat[0] 435 | grasp_pose.orientation.y = grasp_quat[1] 436 | grasp_pose.orientation.z = grasp_quat[2] 437 | grasp_pose.orientation.w = grasp_quat[3] 438 | self.grasp_at(grasp_pose, gripper_opening) 439 | 440 | def grasp_at(self, msg: Pose, 
gripper_opening: float): 441 | self.logger.info(f"Grasp at: {msg} with opening: {gripper_opening}") 442 | 443 | self.gripper_interface.open() 444 | self.gripper_interface.wait_until_executed() 445 | 446 | # move 5cm above the item first 447 | msg.position.z += 0.05 448 | self.move_to(msg) 449 | time.sleep(0.05) 450 | 451 | # grasp the item 452 | msg.position.z -= 0.05 453 | self.move_to(msg) 454 | time.sleep(0.05) 455 | 456 | gripper_pos = -gripper_opening / 2. * self.gripper_squeeze_factor 457 | gripper_pos = min(gripper_pos, 0.0) 458 | self.gripper_interface.move_to_position(gripper_pos) 459 | self.gripper_interface.wait_until_executed() 460 | 461 | # lift the item 462 | msg.position.z += 0.12 463 | self.move_to(msg) 464 | time.sleep(0.05) 465 | 466 | def release_above(self, object_index: int, detections: sv.Detections, 467 | depth_image: np.ndarray): 468 | """Move the robot arm above the object and release the gripper.""" 469 | masked_depth_image = np.zeros_like(depth_image, dtype=np.float32) 470 | mask = detections.mask[object_index] 471 | masked_depth_image[mask] = depth_image[mask] 472 | masked_depth_image /= 1000.0 473 | 474 | # convert the masked depth image to a point cloud 475 | pcd = o3d.geometry.PointCloud.create_from_depth_image( 476 | o3d.geometry.Image(masked_depth_image), 477 | o3d.camera.PinholeCameraIntrinsic( 478 | o3d.camera.PinholeCameraIntrinsicParameters.PrimeSenseDefault), 479 | ) 480 | pcd.transform(self.cam_to_base_affine) 481 | 482 | points = np.asarray(pcd.points).astype(np.float32) 483 | # release 5cm above the object 484 | drop_z = np.percentile(points[:, 2], 95) + 0.05 485 | median_z = np.median(points[:, 2]) 486 | 487 | xy_points = points[points[:, 2] > median_z, :2] 488 | xy_points = xy_points.astype(np.float32) 489 | center, _, _ = cv2.minAreaRect(xy_points) 490 | 491 | drop_pose = Pose() 492 | drop_pose.position.x = center[0] + self.offset_x 493 | drop_pose.position.y = center[1] + self.offset_y 494 | drop_pose.position.z = drop_z + self.offset_z 495 | # Straight down pose 496 | drop_pose.orientation.x = 0.0 497 | drop_pose.orientation.y = 1.0 498 | drop_pose.orientation.z = 0.0 499 | drop_pose.orientation.w = 0.0 500 | 501 | self.release_at(drop_pose) 502 | 503 | def release_gripper(self): 504 | self.gripper_interface.open() 505 | self.gripper_interface.wait_until_executed() 506 | 507 | def flick_wrist_while_release(self): 508 | joint_positions = self.arm_joint_state.position 509 | joint_positions[4] -= np.deg2rad(25) 510 | self.moveit2.move_to_configuration(joint_positions, 511 | self.arm_joint_names, 512 | tolerance=0.005) 513 | time.sleep(3) 514 | 515 | self.gripper_interface.open() 516 | self.gripper_interface.wait_until_executed() 517 | self.moveit2.wait_until_executed() 518 | 519 | def go_home(self): 520 | joint_positions = [0., 0., 0., 0., 0., 0.] 
521 | self.moveit2.move_to_configuration(joint_positions, 522 | self.arm_joint_names, 523 | tolerance=0.005) 524 | self.moveit2.wait_until_executed() 525 | 526 | @cached_property 527 | def cam_to_base_affine(self): 528 | cam_to_base_link_tf = self.tf_buffer.lookup_transform( 529 | target_frame="base_link", 530 | source_frame="camera_color_frame", 531 | time=Time(), 532 | timeout=Duration(seconds=5)) 533 | cam_to_base_rot = Rotation.from_quat([ 534 | cam_to_base_link_tf.transform.rotation.x, 535 | cam_to_base_link_tf.transform.rotation.y, 536 | cam_to_base_link_tf.transform.rotation.z, 537 | cam_to_base_link_tf.transform.rotation.w, 538 | ]) 539 | cam_to_base_pos = np.array([ 540 | cam_to_base_link_tf.transform.translation.x, 541 | cam_to_base_link_tf.transform.translation.y, 542 | cam_to_base_link_tf.transform.translation.z, 543 | ]) 544 | affine = np.eye(4) 545 | affine[:3, :3] = cam_to_base_rot.as_matrix() 546 | affine[:3, 3] = cam_to_base_pos 547 | return affine 548 | 549 | def move_to(self, msg: Pose): 550 | pose_goal = PoseStamped() 551 | pose_goal.header.frame_id = "base_link" 552 | pose_goal.pose = msg 553 | 554 | self.moveit2.move_to_pose(pose=pose_goal) 555 | self.moveit2.wait_until_executed() 556 | 557 | def release_at(self, msg: Pose): 558 | # NOTE: straight down is wxyz 0, 0, 1, 0 559 | # good pose is 0, -0.3, 0.35 560 | self.logger.info(f"Releasing at: {msg}") 561 | self.move_to(msg) 562 | 563 | self.gripper_interface.open() 564 | self.gripper_interface.wait_until_executed() 565 | 566 | def detect_objects_cb(self, msg: String): 567 | if self._last_rgb_msg is None: 568 | self.logger.warning("No RGB image available.") 569 | return 570 | 571 | class_names = msg.data.split(",") 572 | rgb_image = self.cv_bridge.imgmsg_to_cv2(self._last_rgb_msg) 573 | self._last_detections = self.detect_objects(rgb_image, class_names) 574 | detected_classes = [ 575 | class_names[class_id] 576 | for class_id in self._last_detections.class_id 577 | ] 578 | self.logger.info(f"Detected objects: {detected_classes}") 579 | 580 | def pick_object_cb(self, msg: Int64): 581 | if self._last_detections is None or self._last_depth_msg is None: 582 | self.logger.warning("No detections or depth image available.") 583 | return 584 | 585 | depth_image = self.cv_bridge.imgmsg_to_cv2(self._last_depth_msg) 586 | self.pick_object(msg.data, self._last_detections, depth_image) 587 | 588 | def release_above_cb(self, msg: Int64): 589 | if self._last_detections is None or self._last_depth_msg is None: 590 | self.logger.warning("No detections or depth image available.") 591 | return 592 | 593 | depth_image = self.cv_bridge.imgmsg_to_cv2(self._last_depth_msg) 594 | self.release_above(msg.data, self._last_detections, depth_image) 595 | 596 | def depth_callback(self, msg): 597 | self._last_depth_msg = msg 598 | 599 | def image_callback(self, msg): 600 | self._last_rgb_msg = msg 601 | 602 | def joint_states_callback(self, msg: JointState): 603 | joint_state = JointState() 604 | joint_state.header = msg.header 605 | for name in self.arm_joint_names: 606 | for i, joint_state_joint_name in enumerate(msg.name): 607 | if name == joint_state_joint_name: 608 | joint_state.name.append(name) 609 | joint_state.position.append(msg.position[i]) 610 | joint_state.velocity.append(msg.velocity[i]) 611 | joint_state.effort.append(msg.effort[i]) 612 | 613 | self.arm_joint_state = joint_state 614 | # self.logger.info(f"joint_state in: {msg}") 615 | # self.logger.info(f"joint_state out: {self.arm_joint_state}") 616 | 617 | def save_images(self, 
msg): 618 | if not self._last_rgb_msg or not self._last_depth_msg: 619 | return 620 | 621 | rgb_image = self.cv_bridge.imgmsg_to_cv2(self._last_rgb_msg) 622 | depth_image = self.cv_bridge.imgmsg_to_cv2(self._last_depth_msg) 623 | 624 | save_dir = msg.data 625 | cv2.imwrite( 626 | os.path.join(save_dir, f"rgb_image_{self.n_frames_processed}.png"), 627 | rgb_image) 628 | np.save( 629 | os.path.join(save_dir, f"depth_image_{self.n_frames_processed}"), 630 | depth_image) 631 | 632 | 633 | def main(): 634 | rclpy.init() 635 | node = TabletopHandyBotNode() 636 | executor = MultiThreadedExecutor(4) 637 | executor.add_node(node) 638 | try: 639 | executor.spin() 640 | except KeyboardInterrupt: 641 | pass 642 | 643 | rclpy.shutdown() 644 | 645 | 646 | if __name__ == "__main__": 647 | main() 648 | --------------------------------------------------------------------------------
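Beyond the assistant-driven `/prompt` flow, `tabletop_handybot_node` also exposes the individual steps on plain topics (`/detect_objects`, `/pick_object`, `/release_above`, `/save_images`, as subscribed above), which is useful for debugging without the LLM in the loop. A rough sketch of stepping through a pick-and-place by hand (topic names and message types are taken from the node's subscriptions; the sleep durations are arbitrary guesses, not tuned values):

```python
import time

import rclpy
from rclpy.node import Node
from std_msgs.msg import Int64, String


def main():
    rclpy.init()
    node = Node("handybot_debug_driver")
    detect_pub = node.create_publisher(String, "/detect_objects", 10)
    pick_pub = node.create_publisher(Int64, "/pick_object", 10)
    release_pub = node.create_publisher(Int64, "/release_above", 10)

    # Spin briefly so discovery can finish before publishing.
    rclpy.spin_once(node, timeout_sec=1.0)

    # 1. Detect markers and containers in the current camera frame.
    detect_pub.publish(String(data="marker,container"))
    time.sleep(10.0)  # crude wait for detection to finish (arbitrary)

    # 2. Pick detection index 0, then release above detection index 1.
    pick_pub.publish(Int64(data=0))
    time.sleep(30.0)  # crude wait for the grasp to complete (arbitrary)
    release_pub.publish(Int64(data=1))

    rclpy.spin_once(node, timeout_sec=1.0)
    node.destroy_node()
    rclpy.shutdown()


if __name__ == "__main__":
    main()
```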
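For reference, the grasp orientation in `pick_object()` comes from `cv2.minAreaRect` on the object's top-surface points: the wrist is rotated so the jaws close across the rectangle's short side, and the angle is folded into the [-90, 90] degree range. The same normalization, extracted into a standalone helper purely for illustration (the function name is hypothetical):

```python
def grasp_rotation_deg(rect_angle_deg: float, width: float, height: float) -> float:
    """Fold a cv2.minAreaRect angle into a wrist rotation in [-90, 90] degrees,
    shifting by 90 degrees when the rectangle's first dimension is the longer one,
    so the gripper jaws close across the object's narrow side."""
    rotation = rect_angle_deg
    if width > height:
        rotation -= 90
    if rotation < -90:
        rotation += 180
    elif rotation > 90:
        rotation -= 180
    return rotation


# Example: a marker whose top surface is roughly 0.12 m x 0.015 m, reported at
# 30 degrees, is grasped across its 0.015 m side with a -60 degree wrist rotation.
print(grasp_rotation_deg(30.0, 0.12, 0.015))  # -60.0
```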