├── .github └── workflows │ └── documentation.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── agents ├── CHANGELOG.rst ├── CMakeLists.txt ├── agents │ ├── __init__.py │ ├── callbacks.py │ ├── clients │ │ ├── __init__.py │ │ ├── db_base.py │ │ ├── model_base.py │ │ ├── ollama.py │ │ └── roboml.py │ ├── components │ │ ├── __init__.py │ │ ├── component_base.py │ │ ├── imagestovideo.py │ │ ├── llm.py │ │ ├── map_encoding.py │ │ ├── mllm.py │ │ ├── model_component.py │ │ ├── semantic_router.py │ │ ├── speechtotext.py │ │ ├── texttospeech.py │ │ └── vision.py │ ├── config.py │ ├── models.py │ ├── publisher.py │ ├── resources │ │ ├── test.jpeg │ │ └── test.wav │ ├── ros.py │ ├── utils │ │ ├── __init__.py │ │ ├── pluralize.py │ │ ├── utils.py │ │ └── voice.py │ └── vectordbs.py ├── msg │ ├── Bbox2D.msg │ ├── Detection2D.msg │ ├── Detections2D.msg │ ├── Point2D.msg │ ├── Tracking.msg │ ├── Trackings.msg │ └── Video.msg ├── package.xml ├── scripts │ ├── chainlit_client │ │ ├── app.py │ │ ├── chainlit.md │ │ └── tiny_web_client │ └── executable └── tests │ └── test_clients.py ├── docs ├── _static │ ├── ROS_AGENTS.png │ ├── ROS_AGENTS_DARK.png │ ├── automatika-logo.png │ ├── complete_dark.png │ └── complete_light.png ├── basics.md ├── conf.py ├── examples │ ├── complete.md │ ├── conversational.md │ ├── goto.md │ ├── index.md │ ├── multiprocessing.md │ ├── prompt_engineering.md │ ├── semantic_map.md │ ├── semantic_router.md │ └── tool_calling.md ├── index.md ├── installation.md ├── intro.md └── quickstart.md ├── examples ├── complete_agent.py ├── complete_agent_multiprocessing.py ├── conversational_agent_with_audio.py ├── go_to_x.py ├── prompt_engineering.py ├── semantic_map.py ├── semantic_router.py └── tool_calling.py ├── interrogate_badge.svg └── pyproject.toml /.github/workflows/documentation.yml: -------------------------------------------------------------------------------- 1 | name: documentation 2 | 3 | on: [push, pull_request, workflow_dispatch] 4 | 5 | permissions: 6 | contents: write 7 | 8 | jobs: 9 | docs: 10 | runs-on: ubuntu-24.04 11 | steps: 12 | - uses: actions/checkout@v4 13 | - uses: actions/setup-python@v5 14 | - name: Install dependencies 15 | run: | 16 | pip install --break-system-packages sphinx myst_parser sphinx-copybutton sphinx-autodoc2 sphinx-book-theme linkify-it-py 17 | - name: Sphinx build 18 | run: | 19 | sphinx-build docs _build 20 | - name: Deploy to GitHub Pages 21 | uses: peaceiris/actions-gh-pages@v3 22 | if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }} 23 | with: 24 | publish_branch: gh-pages 25 | github_token: ${{ secrets.GITHUB_TOKEN }} 26 | publish_dir: _build/ 27 | force_orphan: true 28 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *,cover 47 | .hypothesis/ 48 | 49 | # Translations 50 | *.mo 51 | *.pot 52 | 53 | # Django stuff: 54 | *.log 55 | local_settings.py 56 | 57 | # Flask stuff: 58 | instance/ 59 | .webassets-cache 60 | 61 | # Scrapy stuff: 62 | .scrapy 63 | 64 | # Sphinx documentation 65 | docs/_build/ 66 | 67 | # PyBuilder 68 | target/ 69 | 70 | # Jupyter Notebook 71 | .ipynb_checkpoints 72 | 73 | # pyenv 74 | .python-version 75 | 76 | # celery beat schedule file 77 | celerybeat-schedule 78 | 79 | # dotenv 80 | .env 81 | 82 | # virtualenv 83 | .venv/ 84 | venv/ 85 | ENV/ 86 | 87 | # Spyder project settings 88 | .spyderproject 89 | 90 | # Rope project settings 91 | .ropeproject 92 | 93 | # MAC 94 | .DS_Store 95 | 96 | # VSCode 97 | .vscode 98 | 99 | # ROS 100 | log/ 101 | install/ 102 | src/ 103 | 104 | # custom 105 | shared/ 106 | logdir/ 107 | data/ 108 | logs/ 109 | tmp/ 110 | *.csv 111 | *.h5 112 | *.npz 113 | *.zip 114 | *.ods 115 | *.xyz 116 | *.off 117 | *.obj 118 | 119 | # Ignores for web client 120 | .chainlit/ 121 | 122 | # Ignores for Docs 123 | docs/Makefile 124 | docs/make.bat 125 | docs/apidocs/ 126 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: check-yaml 6 | - id: end-of-file-fixer 7 | - id: trailing-whitespace 8 | - id: check-docstring-first 9 | - id: check-toml 10 | - repo: https://github.com/astral-sh/ruff-pre-commit 11 | # Ruff version. 12 | rev: v0.5.4 13 | hooks: 14 | # linter. 15 | - id: ruff 16 | types_or: [ python, pyi, jupyter ] 17 | # formatter. 
18 | - id: ruff-format 19 | types_or: [ python, pyi, jupyter ] 20 | - repo: https://github.com/econchick/interrogate 21 | rev: 1.7.0 22 | hooks: 23 | # docstring coverage 24 | - id: interrogate 25 | args: [-vv, --fail-under=80, -c, pyproject.toml] 26 | pass_filenames: false 27 | 28 | 29 | ## Uncomment mypy for type-checking errors in pre-commit 30 | 31 | # - repo: https://github.com/pre-commit/mirrors-mypy 32 | # rev: v1.5.0 33 | # hooks: 34 | # - id: mypy 35 | # additional_dependencies: [tokenize-rt==3.2.0, 'types-PyYAML'] 36 | # exclude: ^tests/ 37 | # args: 38 | # [ 39 | # "--ignore-missing-imports", 40 | # "--check-untyped-defs", 41 | # "--warn-redundant-casts", 42 | # "--no-implicit-optional", 43 | # "--warn-return-any" 44 | # ] 45 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Automatika Robotics 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | ROS Agents Logo. 5 | 6 | 7 | ROS Agents is a fully-loaded framework for creating interactive embodied agents that can understand, remember, and act upon contextual information from their environment. 8 | 9 | - **Agents in the real world:** Designed to be used with autonomous robot systems that operate in dynamic environments, specifically AMRs. 10 | - **Intuitive API**: Simple pythonic API to utilize local or cloud based ML models (specifically **Multimodal LLMs** and other **Transformer Architectures**) on robots. 11 | - **Semantic Memory**: Integrates vector databases, semantic routing and other supporting components to quickly build arbitrarily complex graphs for agentic information flow. No need to utilize bloated "GenAI" frameworks on your robot. 12 | - **Made in ROS2**: Utilizes ROS2 as the underlying middleware. Theoretically, all devices that provide a ROS2 package can be utilized to send data to ML models, as long as the datatype callback has been implemented. 
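The last point is easier to see with a concrete example: supporting a new sensor type mostly amounts to implementing a small callback that converts its ROS message into something a model can consume. The sketch below follows the pattern of the callbacks shipped in `agents/callbacks.py`; the `LaserScanCallback` name and its text summary are illustrative assumptions, not part of the released API.

```python
# A sketch of a custom datatype callback, modeled on agents/callbacks.py.
# Assumption: the LaserScan example and its summary format are illustrative only.
import math
from typing import Optional

from ros_sugar.io import GenericCallback


class LaserScanCallback(GenericCallback):
    """Summarizes sensor_msgs/LaserScan messages as text usable by an LLM/MLLM."""

    def _get_output(self, **_) -> Optional[str]:
        # self.msg holds the latest ROS message received on the subscribed topic
        if not self.msg:
            return None
        readings = [r for r in self.msg.ranges if math.isfinite(r)]
        if not readings:
            return None
        return (
            f"Nearest obstacle at {min(readings):.2f} m, "
            f"farthest valid reading at {max(readings):.2f} m."
        )
```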
13 |
14 | Check out the [Installation Instructions](https://automatika-robotics.github.io/ros-agents/installation.html) 🛠️
15 |
16 | Get started with the [Quickstart Guide](https://automatika-robotics.github.io/ros-agents/quickstart.html) 🚀
17 |
18 | Get familiar with [Basic Concepts](https://automatika-robotics.github.io/ros-agents/basics.html) 📚
19 |
20 | Dive right in with [Examples](https://automatika-robotics.github.io/ros-agents/examples/index.html) ✨
21 |
22 | ## Installation 🛠️
23 |
24 | ### Pre-Requisites
25 |
26 | #### Install ROS
27 |
28 | ROS Agents is built to be used with ROS2. All ROS distributions starting from _Iron_ are supported. Install ROS2 by following the instructions on the [official site](https://docs.ros.org/en/iron/Installation.html).
29 |
30 | #### Install a model serving platform
31 |
32 | The core of ROS Agents is agnostic to model serving platforms. It currently supports [Ollama](https://ollama.com) and [RoboML](https://github.com/automatika-robotics/robo-ml). Please install either of these by following the instructions provided by the respective projects. Support for new platforms will be added continuously. If you would like support for a particular platform to be added, please open an issue/PR.
33 |
34 | ### Install ROS Agents (Ubuntu)
35 |
36 | **Binary packages for Ubuntu will be released soon. Check this space.**
37 |
38 | ### Install ROS Agents from source
39 |
40 | #### Get Dependencies
41 |
42 | Install Python dependencies:
43 |
44 | ```shell
45 | pip install numpy opencv-python-headless 'attrs>=23.2.0' jinja2 httpx setproctitle msgpack msgpack-numpy numpy-quaternion platformdirs
46 | ```
47 |
48 | Download ROS Sugar:
49 |
50 | ```shell
51 | git clone https://github.com/automatika-robotics/ros-sugar
52 | ```
53 |
54 | #### Install ROS Agents
55 |
56 | ```shell
57 | git clone https://github.com/automatika-robotics/ros-agents.git
58 | cd ..
59 | colcon build
60 | source install/setup.bash
61 | python your_script.py
62 | ```
63 |
64 | ## Quick Start 🚀
65 |
66 | Unlike other ROS packages, ROS Agents provides a purely pythonic way of describing the node graph using [ROS Sugar](https://www.github.com/automatika-robotics/ros-sugar). Copy the following code into a Python script and run it.
67 |
68 | ```python
69 | from agents.clients.ollama import OllamaClient
70 | from agents.components import MLLM
71 | from agents.models import Llava
72 | from agents.ros import Topic, Launcher
73 |
74 | # Define input and output topics (pay attention to msg_type)
75 | text0 = Topic(name="text0", msg_type="String")
76 | image0 = Topic(name="image_raw", msg_type="Image")
77 | text1 = Topic(name="text1", msg_type="String")
78 |
79 | # Define a model client (working with Ollama in this case)
80 | llava = Llava(name="llava")
81 | llava_client = OllamaClient(llava)
82 |
83 | # Define an MLLM component (a component represents a node with a particular functionality)
84 | mllm = MLLM(
85 |     inputs=[text0, image0],
86 |     outputs=[text1],
87 |     model_client=llava_client,
88 |     trigger=[text0],
89 |     component_name="vqa"
90 | )
91 | # Additional prompt settings
92 | mllm.set_topic_prompt(text0, template="""You are an amazing and funny robot.
93 | Answer the following about this image: {{ text0 }}"""
94 | )
95 | # Launch the component
96 | launcher = Launcher()
97 | launcher.add_pkg(components=[mllm])
98 | launcher.bringup()
99 | ```
100 |
101 | And just like that, we have an agent that can answer questions like **'What do you see?'**. To interact with this agent, ROS Agents includes a tiny web client.
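If you would rather poke at the agent from code, the following minimal sketch (not part of the library) publishes a question on the `text0` topic defined above and prints whatever the agent answers on `text1`. It assumes the quickstart script is already running, that a camera driver is publishing on `image_raw`, and that the String topics map to `std_msgs/String`.

```python
# Minimal sketch: query the quickstart agent over its ROS topics.
# Assumptions: the agent above is running, a camera publishes on "image_raw",
# and "text0"/"text1" carry std_msgs/String messages.
import rclpy
from rclpy.node import Node
from std_msgs.msg import String


class AgentQuery(Node):
    def __init__(self) -> None:
        super().__init__("agent_query")
        # "text0" is the trigger topic of the MLLM component, "text1" its output
        self.pub = self.create_publisher(String, "text0", 10)
        self.sub = self.create_subscription(String, "text1", self.on_answer, 10)
        # publish the question once, after giving discovery a moment
        self.timer = self.create_timer(1.0, self.ask)

    def ask(self) -> None:
        self.pub.publish(String(data="What do you see?"))
        self.timer.cancel()

    def on_answer(self, msg: String) -> None:
        self.get_logger().info(f"Agent: {msg.data}")


def main() -> None:
    rclpy.init()
    rclpy.spin(AgentQuery())
    rclpy.shutdown()


if __name__ == "__main__":
    main()
```

The same round trip can also be done from the command line with `ros2 topic pub` and `ros2 topic echo`.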
Checkout the [Quick Start Guide](https://automatika-robotics.github.io/ros-agents/quickstart.html) to learn more about how components and models work together. 102 | 103 | ## Elaborate Embodied Agents 104 | The quickstart example above is just an amuse-bouche of what is possible with ROS Agents. In ROS Agents we can create arbitrarily sophisticated component graphs. And furthermore our system can be configured to even change or reconfigure itself based on events internal or external to the system. Check out the code for the following agent [here](https://automatika-robotics.github.io/ros-agents/examples/complete.html). 105 | 106 | 107 | 108 | 109 | Elaborate Agent 110 | 111 | 112 | ## Copyright 113 | 114 | The code in this distribution is Copyright (c) 2024 Automatika Robotics unless explicitly indicated otherwise. 115 | 116 | ROS Agents is made available under the MIT license. Details can be found in the [LICENSE](LICENSE) file. 117 | 118 | ## Contributions 119 | 120 | ROS Agents has been developed in collaboration betweeen [Automatika Robotics](https://automatikarobotics.com/) and [Inria](https://inria.fr/). Contributions from the community are most welcome. 121 | -------------------------------------------------------------------------------- /agents/CHANGELOG.rst: -------------------------------------------------------------------------------- 1 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2 | Changelog for package automatika_embodied_agents 3 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 4 | 5 | 0.3.3 (2025-01-28) 6 | ------------------ 7 | * (fix) Removes python dependencies from package manifest until package names merged in rosdistro 8 | * Contributors: ahr 9 | 10 | 0.3.2 (2025-01-28) 11 | ------------------ 12 | * (docs) Updates docs for conversational agent and SpeechToTextConfig 13 | * (feature) Adds vad, audio feautres and wakeword classification classes based local onnx models 14 | * (feature) Adds utility function for downloading models and status classes for speech processing 15 | * (feature) Adds configuration for wakeword detections in speechtotext component 16 | * (fix) Fixes error in ollama client where tool calls are received without output content 17 | * (fix) Adds a fix to map encoding where it can start with a single detections layer 18 | * (refactor) Makes component name non-optional in components to avoid name conflicts 19 | * (fix) Fixes error for long prompts when checking if prompt is a filename 20 | * (refactor) Removes pytorch as a dependency and runs VAD model with onnxruntime 21 | * (refactor) Makes warmup a property of model components that defaults to false 22 | * (feature) Adds utility method to download onnx model files 23 | * (refactor) Replaces info with debug to reduce logging spam 24 | * (fix) Fixes getting logging severity level for jazzy onwards 25 | * (fix) Adds minor improvements to branching for llm and mllm components 26 | * (chore) Cleansup dependencies for packaging 27 | * (chore) Adds dependency for sugar and removes unnecessary python dependencies from packaging 28 | * (fix) Corrects import of Topic class 29 | * (docs) Removes redefinition of Topic and corrects links to ROS Sugar 30 | * (fix) Changes topic in base component to be directly inherited from ROS Sugar for consistency accross packages 31 | * (feature) Adds warmup functions to all model based components 32 | * (refactor) Removes pillow as a dependancy 33 | * (refactor) Removes overrrides from components and adds custom meathods instead 34 | * (feature) Adds warmup to 
vision component for displaying stats on init 35 | * (fix) Adds fix for correct colors in cv2 visualization 36 | * (fix) Adds node name as window name for visualization in vision component 37 | * (feature) Adds cv2 based visualization option to vision component 38 | * (refactor) Reduces branching in execution step for components 39 | * (chore) Combines agents and agents_interfaces to one package 40 | * (chore) Changes deb package name 41 | * (fix) Fixes raising error in model initialization for roboml clients 42 | * (refactor) Adds passing additional agent types to ros sugar 43 | * (fix) Fixes error messages when wrong component inputs/outputs are passed 44 | * (feature) Adds support for CompressedImage msg type in components 45 | * (feature) Adds option to deploy vision models using tensorrt 46 | Works with roboml 47 | * (fix) Fixes check on sufficient topics in component validation 48 | * (fix) Fixes a bug in topic validation 49 | * (fix) Fixes validation of topics in components 50 | * (refactor) Changes handling of image messages for publication 51 | - Adds support for CompressedImage messages 52 | - Gathers image messages directly in vision component instead of getting them back from clients 53 | * (feature) Adds frame_id to trackings publisher and updates msg and callback 54 | * (feature) Adds boxes to vision tracking message 55 | * Contributors: ahr, mkabtoul 56 | 57 | 0.3.1 (2024-10-29) 58 | ------------------ 59 | * (chore) bump version 0.3.0 -> 0.3.1 60 | * (feature) Adds support for using tool calling in LLM components in multiprocess execution 61 | * Contributors: ahr 62 | 63 | 0.3.0 (2024-10-28) 64 | ------------------ 65 | * (chore) bump version 0.2.0 -> 0.3.0 66 | * (chore) Adds bumpver config 67 | * Merge pull request `#14 `_ from automatika-robotics/feature/external_processors 68 | Adds support for running components as separate processes 69 | * (docs) Updates docs based on ROS Sugar version update 70 | * (fix) Fixes bug in registering triggers with components 71 | * (refactor) Simplifies by adding direct serialization of clients and triggers 72 | * (refactor) Removes gratuitous logging from utils 73 | * (fix) Minor bug fixes for components to run in multiprocessing 74 | - Fixes trigger assignment for components 75 | - Handles private attributes of attrs classes 76 | - Fixes component and config init in common executable 77 | * (fix) Fixes serializing log level in clients 78 | * (fix) Fixes minor bugs in utils, components, configs and models 79 | * (feature) Adds support for running components in multiple processes 80 | - Adds common executable to the package for ROS Sugar launcher 81 | - Refactors components to be serializable 82 | - Adds serialization to clients 83 | - Minor type hint changes for compatibility with older versions of ROS 84 | * (fix) Adds the correct check for external processors given new ros-sugar implementation 85 | * Contributors: ahr 86 | 87 | 0.2.0 (2024-09-28) 88 | ------------------ 89 | * (chore) Bump up the version 90 | * Merge pull request `#13 `_ from automatika-robotics/feature/better_clients 91 | Adds enhanced functionality in clients specifically for LLM and MLLM components 92 | * (feature) Adds tool calling for LLM component using the OllamaClient 93 | * (fix) Fixes rag results in templated inputs to LLMs which do not contain input 94 | * (refactor) Makes named models subclasses of TransformersLLM and TransformersMLLM for easier handling in roboml client 95 | * (fix) Fixes key error in ollama client response retreival 96 | * (fix) Adds flag 
for chat history for chat history reset and fixes logging 97 | * (feature) Adds TransformersLLM and TransformersMLLM models for roboml clients 98 | * (fix) Removes history reset phrase from model definitions and add system prompt for LLMs and derivates 99 | * (refactor) Changes model component to have execution step as an abstract method implemented by child components 100 | * (fix) Changes ollama client inference call to use chat endpoint 101 | * (feature) Adds chat history management to llm and mllm components 102 | * (docs) Clarifies handling of RAG results for llm component 103 | * (fix) Fixes bug in rag result handling for llm component 104 | * (fix) Removes default init_timeout from models 105 | * (refactor) Moves roboml resp client dependancies inside the client initialization 106 | * (fix) Explicity exposes QoSConfig in ros module 107 | * (refactor) Replaces map_meta_data parameter with map_topic for MapEncoding component 108 | * (refactor) Removes direct dependancy on pypdf 109 | * (fix) Changes map meta data topic to type OccupancyGrid 110 | * (feature) Adds audio options to chainlit client 111 | * (fix) Removes unused imports 112 | * (fix) Fixes the initialization of map encoding and semantic router components 113 | * (refactor) Fixes imports and refactors code according to latest version of ROS sugar 114 | * (fix) Fixes passing the config in components to parent base component 115 | * (fix) Fixes ROS sugar import for BaseTopic 116 | * (refactor) Removes auto_ros as a dependency 117 | * (feature) Adds init_on_activation flag to all implemented clientsc 118 | * (feature) Seperates abstract methods from callable methods in db client base 119 | * (feature) Seperates callable methods, from abstract methods in client base class 120 | * Contributors: ahr 121 | 122 | 0.1.1 (2024-09-05) 123 | ------------------ 124 | * (feature) Adds component action for adding points to map collection (`#12 `_) 125 | * Makes version compliant with ROS convention 126 | * (chore) Adds license declaration in setup.py 127 | * Bumps version number and adds license information 128 | * Initial release 0.1.1a 129 | * Contributors: ahr, mkabtoul 130 | -------------------------------------------------------------------------------- /agents/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.8) 2 | project(automatika_embodied_agents) 3 | 4 | if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") 5 | add_compile_options(-Wall -Wextra -Wpedantic) 6 | endif() 7 | 8 | # find dependencies 9 | find_package(ament_cmake REQUIRED) 10 | find_package(ament_cmake_python REQUIRED) 11 | 12 | find_package(rclcpp REQUIRED) 13 | find_package(rclpy REQUIRED) 14 | find_package(rosidl_default_generators REQUIRED) 15 | find_package(builtin_interfaces REQUIRED) 16 | find_package(std_msgs REQUIRED) 17 | find_package(sensor_msgs REQUIRED) 18 | 19 | file(GLOB_RECURSE MSG_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "msg/*.msg" ) 20 | 21 | rosidl_generate_interfaces(${PROJECT_NAME} 22 | ${MSG_FILES} 23 | DEPENDENCIES builtin_interfaces std_msgs sensor_msgs 24 | ) 25 | 26 | ament_export_dependencies(rosidl_default_runtime) 27 | 28 | # Install Python module 29 | ament_python_install_package(agents) 30 | # Add executables 31 | install(PROGRAMS 32 | scripts/executable 33 | scripts/chainlit_client/tiny_web_client 34 | scripts/chainlit_client/app.py # chainlit app definition 35 | scripts/chainlit_client/chainlit.md # readme picked by chainlit client 
36 | DESTINATION lib/${PROJECT_NAME} 37 | ) 38 | 39 | ament_package() 40 | -------------------------------------------------------------------------------- /agents/agents/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/agents/agents/__init__.py -------------------------------------------------------------------------------- /agents/agents/callbacks.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import os 3 | import cv2 4 | import numpy as np 5 | from ros_sugar.io import ( 6 | GenericCallback, 7 | TextCallback, 8 | get_logger, 9 | ) 10 | 11 | from ros_sugar.io.utils import image_pre_processing, read_compressed_image 12 | 13 | from .utils import create_detection_context 14 | 15 | __all__ = ["GenericCallback", "TextCallback"] 16 | 17 | 18 | class VideoCallback(GenericCallback): 19 | """ 20 | Video Callback class. Its get method saves a video as list of bytes 21 | """ 22 | 23 | def __init__(self, input_topic, node_name: Optional[str] = None) -> None: 24 | """ 25 | Constructs a new instance. 26 | :param input_topic: Subscription topic 27 | :type input_topic: Input 28 | """ 29 | super().__init__(input_topic, node_name) 30 | # fixed video needs to be a path to cv2 readable video 31 | if hasattr(input_topic, "fixed"): 32 | if os.path.isfile(input_topic.fixed): 33 | try: 34 | # read all video frames 35 | video = [] 36 | cap = cv2.VideoCapture(input_topic.fixed) 37 | if not cap.isOpened(): 38 | raise TypeError() 39 | while cap.isOpened(): 40 | ret, frame = cap.read() 41 | if ret: 42 | video.append(frame) 43 | else: 44 | break 45 | # Convert frame list to ndarray 46 | self.msg = np.array(video) 47 | except Exception: 48 | get_logger(self.node_name).error( 49 | f"Fixed path {self.msg} provided for Vidoe topic is not readable Video file" 50 | ) 51 | else: 52 | get_logger(self.node_name).error( 53 | f"Fixed path {self.msg} provided for Video topic is not a valid file path" 54 | ) 55 | 56 | def _get_output(self, **_) -> Optional[np.ndarray]: 57 | """ 58 | Gets video as a numpy array. 59 | :returns: Video as nd_array 60 | :rtype: np.ndarray 61 | """ 62 | if not self.msg: 63 | return None 64 | 65 | # return np.ndarray if fixed video has been read 66 | if isinstance(self.msg, np.ndarray): 67 | return self.msg 68 | else: 69 | # pre-process in case of weird encodings and reshape ROS topic 70 | video = [] 71 | for img in self.msg.frames: 72 | video.append(image_pre_processing(img)) 73 | for img in self.msg.compressed_frames: 74 | video.append(read_compressed_image(img)) 75 | return np.array(video) 76 | 77 | 78 | class ObjectDetectionCallback(GenericCallback): 79 | """ 80 | Object detection Callback class. 81 | Its get method returns the bounding box data 82 | """ 83 | 84 | def __init__(self, input_topic, node_name: Optional[str] = None) -> None: 85 | """ 86 | Constructs a new instance. 
87 | 88 | :param input_topic: Subscription topic 89 | :type input_topic: str 90 | """ 91 | super().__init__(input_topic, node_name) 92 | self.msg = input_topic.fixed if hasattr(input_topic, "fixed") else None 93 | 94 | def _get_output(self, **_) -> Optional[str]: 95 | """ 96 | Processes labels and returns a context string for 97 | prompt engineering 98 | 99 | :returns: Comma separated classnames 100 | :rtype: str 101 | """ 102 | if not self.msg: 103 | return None 104 | # send fixed list of labels if it exists 105 | if isinstance(self.msg, list): 106 | return create_detection_context(self.msg) 107 | # send labels from ROS message 108 | else: 109 | label_list = [ 110 | label for detection in self.msg.detections for label in detection.labels 111 | ] 112 | detections_string = create_detection_context(label_list) 113 | return detections_string 114 | -------------------------------------------------------------------------------- /agents/agents/clients/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Clients are standard interfaces for components to interact with ML models or vector DBs served by various platforms. Currently ROS Agents provides the following clients, which cover the most popular open source model deployment platforms. Simple clients can be easily implemented for other platforms and the use of heavy duct-tape "AI" frameworks on the robot is discouraged 😅. 3 | 4 | ```{note} 5 | Some clients might need additional dependacies, which are provided in the following table. If missing the user will also be prompted for them at runtime. 6 | ``` 7 | 8 | ```{list-table} 9 | :widths: 20 20 60 10 | :header-rows: 1 11 | * - Platform 12 | - Client 13 | - Description 14 | 15 | * - **RoboML** 16 | - [HTTPModelClient](agents.clients.roboml.HTTPModelClient) 17 | - An HTTP client for interaction with ML models served on RoboML. 18 | 19 | * - **RoboML** 20 | - [HTTPDBClient](agents.clients.roboml.HTTPDBClient) 21 | - An HTTP client for interaction with vector DBs served on RoboML. 22 | 23 | * - **RoboML** 24 | - [RESPModelClient](agents.clients.roboml.RESPModelClient) 25 | - A Redis Serialization Protocol (RESP) based client for interaction with ML models served on RoboML. **Note:** In order to use this client, please install dependancies with `pip install redis[hiredis] msgpack msgpack-numpy` 26 | 27 | * - **RoboML** 28 | - [RESPDBClient](agents.clients.roboml.RESPDBClient) 29 | - A Redis Serialization Protocol (RESP) based client for interaction with vector DBs served on RoboML. **Note:** In order to use this client, please install dependancies with `pip install redis[hiredis] msgpack msgpack-numpy` 30 | 31 | * - **Ollama** 32 | - [OllamaClient](agents.clients.ollama.OllamaClient) 33 | - An HTTP client for interaction with ML models served on Ollama. 
**Note:** In order to use this client, please install dependancies with `pip install ollama` 34 | 35 | """ 36 | 37 | from .ollama import OllamaClient 38 | from .roboml import HTTPDBClient, HTTPModelClient, RESPDBClient, RESPModelClient 39 | 40 | 41 | __all__ = [ 42 | "OllamaClient", 43 | "HTTPDBClient", 44 | "HTTPModelClient", 45 | "RESPDBClient", 46 | "RESPModelClient", 47 | ] 48 | -------------------------------------------------------------------------------- /agents/agents/clients/db_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Optional, Dict, Union 3 | 4 | from rclpy import logging 5 | 6 | from ..vectordbs import DB 7 | from ..utils import validate_func_args 8 | 9 | 10 | class DBClient(ABC): 11 | """DBClient.""" 12 | 13 | @validate_func_args 14 | def __init__( 15 | self, 16 | db: Union[DB, Dict], 17 | host: Optional[str] = None, 18 | port: Optional[int] = None, 19 | response_timeout: int = 30, 20 | init_on_activation: bool = True, 21 | logging_level: str = "info", 22 | **_, 23 | ): 24 | """__init__. 25 | :param db: 26 | :type db: DB 27 | :param host: 28 | :type host: Optional[str] 29 | :param port: 30 | :type port: Optional[int] 31 | :param init_on_activation: 32 | :type init_on_activation: bool 33 | :param logging_level: 34 | :type logging_level: str 35 | """ 36 | if isinstance(db, DB): 37 | self.db_type = db.__class__.__name__ 38 | self.db_name = db.name 39 | self.init_timeout = db.init_timeout 40 | self.db_init_params = db._get_init_params() 41 | 42 | else: 43 | self.db_type = db["db_type"] 44 | self.db_name = db["db_name"] 45 | self.init_timeout = db["init_timeout"] 46 | self.db_init_params = db["db_init_params"] 47 | 48 | self.host = host 49 | self.port = port 50 | self.init_on_activation = init_on_activation 51 | self.logger = logging.get_logger(self.db_name) 52 | logging.set_logger_level( 53 | self.db_name, logging.get_logging_severity_from_string(logging_level) 54 | ) 55 | self.response_timeout = response_timeout 56 | 57 | def serialize(self) -> Dict: 58 | """Get client json 59 | :rtype: Dict 60 | """ 61 | db = { 62 | "db_name": self.db_name, 63 | "db_type": self.db_type, 64 | "init_timeout": self.init_timeout, 65 | "db_init_params": self.db_init_params, 66 | } 67 | 68 | return { 69 | "client_type": self.__class__.__name__, 70 | "db": db, 71 | "host": self.host, 72 | "port": self.port, 73 | "init_on_activation": self.init_on_activation, 74 | "logging_level": self.logger.get_effective_level().name, 75 | "response_timeout": self.response_timeout, 76 | } 77 | 78 | def check_connection(self) -> None: 79 | """initialize. 80 | :rtype: None 81 | """ 82 | self._check_connection() 83 | 84 | def initialize(self) -> None: 85 | """initialize. 86 | :rtype: None 87 | """ 88 | if self.init_on_activation: 89 | self._initialize() 90 | 91 | def add(self, db_input: Dict[str, Any]) -> Optional[Dict]: 92 | """add data. 93 | :param db_input: 94 | :type db_input: dict[str, Any] 95 | :rtype: dict | None 96 | """ 97 | return self._add(db_input) 98 | 99 | def conditional_add(self, db_input: Dict[str, Any]) -> Optional[Dict]: 100 | """add data if given ids dont exist. Update metadatas of the ids that exist 101 | :param db_input: 102 | :type db_input: dict[str, Any] 103 | :rtype: dict | None 104 | """ 105 | return self._conditional_add(db_input) 106 | 107 | def metadata_query(self, db_input: Dict[str, Any]) -> Optional[Dict]: 108 | """Query based on given metadata. 
109 | :param db_input: 110 | :type db_input: dict[str, Any] 111 | :rtype: dict | None 112 | """ 113 | return self._metadata_query(db_input) 114 | 115 | def query(self, db_input: Dict[str, Any]) -> Optional[Dict]: 116 | """Query based on query string. 117 | :param db_input: 118 | :type db_input: dict[str, Any] 119 | :rtype: dict | None 120 | """ 121 | return self._query(db_input) 122 | 123 | def deinitialize(self) -> None: 124 | """deinitialize.""" 125 | # TODO: Add check for db initialization by keeping db 126 | # state in client 127 | if self.init_on_activation: 128 | self._deinitialize() 129 | 130 | @abstractmethod 131 | def _check_connection(self) -> None: 132 | """check_connection. 133 | :rtype: None 134 | """ 135 | raise NotImplementedError( 136 | "This method needs to be implemented in a child class" 137 | ) 138 | 139 | @abstractmethod 140 | def _initialize(self) -> None: 141 | """initialize. 142 | :rtype: None 143 | """ 144 | raise NotImplementedError( 145 | "This method needs to be implemented in a child class" 146 | ) 147 | 148 | @abstractmethod 149 | def _add(self, db_input: Dict[str, Any]) -> Optional[Dict]: 150 | """add data. 151 | :param db_input: 152 | :type db_input: dict[str, Any] 153 | :rtype: dict | None 154 | """ 155 | raise NotImplementedError( 156 | "This method needs to be implemented in a child class" 157 | ) 158 | 159 | @abstractmethod 160 | def _conditional_add(self, db_input: Dict[str, Any]) -> Optional[Dict]: 161 | """add data if given ids dont exist. Update metadatas of the ids that exist 162 | :param db_input: 163 | :type db_input: dict[str, Any] 164 | :rtype: dict | None 165 | """ 166 | raise NotImplementedError( 167 | "This method needs to be implemented in a child class" 168 | ) 169 | 170 | @abstractmethod 171 | def _metadata_query(self, db_input: Dict[str, Any]) -> Optional[Dict]: 172 | """Query based on given metadata. 173 | :param db_input: 174 | :type db_input: dict[str, Any] 175 | :rtype: dict | None 176 | """ 177 | raise NotImplementedError( 178 | "This method needs to be implemented in a child class" 179 | ) 180 | 181 | @abstractmethod 182 | def _query(self, db_input: Dict[str, Any]) -> Optional[Dict]: 183 | """Query based on query string. 184 | :param db_input: 185 | :type db_input: dict[str, Any] 186 | :rtype: dict | None 187 | """ 188 | raise NotImplementedError( 189 | "This method needs to be implemented in a child class" 190 | ) 191 | 192 | @abstractmethod 193 | def _deinitialize(self) -> None: 194 | """deinitialize.""" 195 | raise NotImplementedError( 196 | "This method needs to be implemented in a child class" 197 | ) 198 | -------------------------------------------------------------------------------- /agents/agents/clients/model_base.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Optional, Dict, Union 3 | 4 | from rclpy import logging 5 | 6 | from ..models import Model 7 | from ..utils import validate_func_args 8 | 9 | 10 | class ModelClient(ABC): 11 | """MLClient.""" 12 | 13 | @validate_func_args 14 | def __init__( 15 | self, 16 | model: Union[Model, Dict], 17 | host: Optional[str] = None, 18 | port: Optional[int] = None, 19 | inference_timeout: int = 30, 20 | init_on_activation: bool = True, 21 | logging_level: str = "info", 22 | **_, 23 | ): 24 | """__init__. 
25 | :param model: 26 | :type model: Model 27 | :param host: 28 | :type host: Optional[str] 29 | :param port: 30 | :type port: Optional[int] 31 | :param inference_timeout: 32 | :type inference_timeout: int 33 | :param logging_level: 34 | :type logging_level: str 35 | """ 36 | if isinstance(model, Model): 37 | self._model = model 38 | self.model_type = model.__class__.__name__ 39 | self.model_name = model.name 40 | self.init_timeout = model.init_timeout 41 | self.model_init_params = model._get_init_params() 42 | 43 | else: 44 | self.model_type = model["model_type"] 45 | self.model_name = model["model_name"] 46 | self.init_timeout = model["init_timeout"] 47 | self.model_init_params = model["model_init_params"] 48 | 49 | self.host = host 50 | self.port = port 51 | self.init_on_activation = init_on_activation 52 | self.logger = logging.get_logger(self.model_name) 53 | logging.set_logger_level( 54 | self.model_name, logging.get_logging_severity_from_string(logging_level) 55 | ) 56 | self.inference_timeout = inference_timeout 57 | 58 | def serialize(self) -> Dict: 59 | """Get client json 60 | :rtype: Dict 61 | """ 62 | model = { 63 | "model_name": self.model_name, 64 | "model_type": self.model_type, 65 | "init_timeout": self.init_timeout, 66 | "model_init_params": self.model_init_params, 67 | } 68 | 69 | return { 70 | "client_type": self.__class__.__name__, 71 | "model": model, 72 | "host": self.host, 73 | "port": self.port, 74 | "init_on_activation": self.init_on_activation, 75 | "logging_level": self.logger.get_effective_level().name, 76 | "inference_timeout": self.inference_timeout, 77 | } 78 | 79 | def check_connection(self) -> None: 80 | """initialize. 81 | :rtype: None 82 | """ 83 | self._check_connection() 84 | 85 | def initialize(self) -> None: 86 | """initialize. 87 | :rtype: None 88 | """ 89 | if self.init_on_activation: 90 | self._initialize() 91 | 92 | def inference(self, inference_input: Dict[str, Any]) -> Optional[Dict]: 93 | """inference. 94 | :param inference_input: 95 | :type inference_input: dict[str, Any] 96 | :rtype: dict | None 97 | """ 98 | return self._inference(inference_input) 99 | 100 | def deinitialize(self): 101 | """deinitialize.""" 102 | # TODO: Add check for model initialization by keeping model 103 | # state in client 104 | if self.init_on_activation: 105 | self._deinitialize() 106 | 107 | @abstractmethod 108 | def _check_connection(self) -> None: 109 | """check_connection. 110 | :rtype: None 111 | """ 112 | raise NotImplementedError( 113 | "This method needs to be implemented in a child class" 114 | ) 115 | 116 | @abstractmethod 117 | def _initialize(self) -> None: 118 | """initialize. 119 | :rtype: None 120 | """ 121 | raise NotImplementedError( 122 | "This method needs to be implemented in a child class" 123 | ) 124 | 125 | @abstractmethod 126 | def _inference(self, inference_input: Dict[str, Any]) -> Optional[Dict]: 127 | """inference. 
128 | :param inference_input: 129 | :type inference_input: dict[str, Any] 130 | :rtype: dict | None 131 | """ 132 | raise NotImplementedError( 133 | "This method needs to be implemented in a child class" 134 | ) 135 | 136 | @abstractmethod 137 | def _deinitialize(self): 138 | """deinitialize.""" 139 | raise NotImplementedError( 140 | "This method needs to be implemented in a child class" 141 | ) 142 | -------------------------------------------------------------------------------- /agents/agents/clients/ollama.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Dict, Union 2 | 3 | import httpx 4 | 5 | from ..models import LLM 6 | from ..utils import encode_arr_base64 7 | from .model_base import ModelClient 8 | 9 | __all__ = ["OllamaClient"] 10 | 11 | 12 | class OllamaClient(ModelClient): 13 | """An HTTP client for interaction with ML models served on ollama""" 14 | 15 | def __init__( 16 | self, 17 | model: Union[LLM, Dict], 18 | host: str = "127.0.0.1", 19 | port: int = 11434, 20 | inference_timeout: int = 30, 21 | init_on_activation: bool = True, 22 | logging_level: str = "info", 23 | **kwargs, 24 | ): 25 | if isinstance(model, LLM): 26 | model._set_ollama_checkpoint() 27 | try: 28 | from ollama import Client 29 | 30 | self.client = Client(host=f"{host}:{port}") 31 | except ModuleNotFoundError as e: 32 | raise ModuleNotFoundError( 33 | "In order to use the OllamaClient, you need ollama-python package installed. You can install it with 'pip install ollama'" 34 | ) from e 35 | super().__init__( 36 | model=model, 37 | host=host, 38 | port=port, 39 | inference_timeout=inference_timeout, 40 | init_on_activation=init_on_activation, 41 | logging_level=logging_level, 42 | **kwargs, 43 | ) 44 | self._check_connection() 45 | 46 | def _check_connection(self) -> None: 47 | """Check if the platfrom is being served on specified IP and port""" 48 | # Ping remote server to check connection 49 | self.logger.info("Checking connection with remote_host Ollama") 50 | try: 51 | httpx.get(f"http://{self.host}:{self.port}").raise_for_status() 52 | except Exception as e: 53 | self.logger.error(str(e)) 54 | raise 55 | 56 | def _initialize(self) -> None: 57 | """ 58 | Initialize the model on platform using the paramters provided in the model specification class 59 | """ 60 | self.logger.info(f"Initializing {self.model_name} on ollama") 61 | try: 62 | # set timeout on underlying httpx client 63 | self.client._client.timeout = self.init_timeout 64 | r = self.client.pull(self.model_init_params["checkpoint"]) 65 | if r.get("status") != "success": # type: ignore 66 | raise Exception( 67 | f"Could not pull model {self.model_init_params['checkpoint']}" 68 | ) 69 | # load model in memory with empty request 70 | self.client.generate( 71 | model=self.model_init_params["checkpoint"], keep_alive=10 72 | ) 73 | self.logger.info(f"{self.model_name} model initialized") 74 | except Exception as e: 75 | self.logger.error(str(e)) 76 | return None 77 | 78 | def _inference(self, inference_input: Dict[str, Any]) -> Optional[Dict]: 79 | """Call inference on the model using data and inference parameters from the component""" 80 | if not (query := inference_input.get("query")): 81 | raise TypeError( 82 | "OllamaClient can only be used with LLM and MLLM components" 83 | ) 84 | # create input 85 | input = { 86 | "model": self.model_init_params["checkpoint"], 87 | "messages": query, 88 | } 89 | inference_input.pop("query") 90 | 91 | # make images part of the latest 
message in message list 92 | if images := inference_input.get("images"): 93 | input["messages"][-1]["images"] = [encode_arr_base64(img) for img in images] 94 | inference_input.pop("images") 95 | 96 | # Add tools as part of input, if available 97 | if tools := inference_input.get("tools"): 98 | input["tools"] = tools 99 | inference_input.pop("tools") 100 | 101 | # ollama uses num_predict for max_new_tokens 102 | if inference_input.get("max_new_tokens"): 103 | inference_input["num_predict"] = inference_input["max_new_tokens"] 104 | inference_input.pop("max_new_tokens") 105 | input["options"] = inference_input 106 | 107 | # call inference method 108 | try: 109 | # set timeout on underlying httpx client 110 | self.client._client.timeout = self.inference_timeout 111 | ollama_result = self.client.chat(**input) 112 | except Exception as e: 113 | self.logger.error(str(e)) 114 | return None 115 | 116 | self.logger.debug(str(ollama_result)) 117 | 118 | # make result part of the input 119 | if output := ollama_result["message"].get("content"): 120 | input["output"] = output # type: ignore 121 | # if tool calls exist 122 | if tool_calls := ollama_result["message"].get("tool_calls"): # type: ignore 123 | input["tool_calls"] = tool_calls 124 | return input 125 | else: 126 | # if tool calls exist 127 | if tool_calls := ollama_result["message"].get("tool_calls"): # type: ignore 128 | input["output"] = "" # Add empty output for tool calls 129 | input["tool_calls"] = tool_calls 130 | return input 131 | 132 | # no output or tool calls 133 | self.logger.debug("Output not received") 134 | return 135 | 136 | def _deinitialize(self): 137 | """Deinitialize the model on the platform""" 138 | 139 | self.logger.error(f"Deinitializing {self.model_name} model on ollama") 140 | try: 141 | self.client.generate( 142 | model=self.model_init_params["checkpoint"], keep_alive=0 143 | ) 144 | except Exception as e: 145 | self.logger.error(str(e)) 146 | return None 147 | -------------------------------------------------------------------------------- /agents/agents/components/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | A Component is the main execution unit in ROS Agents and in essence each component is synctactic sugar over a ROS2 Lifecycle Node. ROS Agents provides the following components. These components can be arbitrarily combined to form an embodied agent graph. 3 | 4 | ```{list-table} 5 | :widths: 20 80 6 | :header-rows: 1 7 | * - Component Name 8 | - Description 9 | 10 | * - **[LLM](agents.components.llm.md)** 11 | - This component utilizes large language models (e.g LLama) that can be used to process text data. 12 | 13 | * - **[MLLM](agents.components.mllm.md)** 14 | - This component utilizes multi-modal large language models (e.g. Llava) that can be used to process text and image data. 15 | 16 | * - **[SpeechToText](agents.components.speechtotext.md)** 17 | - This component takes in audio input and outputs a text representation of the audio using Speech-to-Text models (e.g. Whisper). 18 | 19 | * - **[TextToSpeech](agents.components.texttospeech.md)** 20 | - This component takes in text input and outputs an audio representation of the text using TTS models (e.g. SpeechT5). The generated audio can be played using any audio playback device available on the agent. 21 | 22 | * - **[MapEncoding](agents.components.map_encoding.md)** 23 | - Map encoding component that encodes text information as a semantic map based on the robots localization. 
It takes in map layers, position topic, map meta data topic, and a vector database client. Map layers can be arbitrary text based outputs from other components such as MLLMs or Vision. 24 | 25 | * - **[SemanticRouter](agents.components.semantic_router.md)** 26 | - A component that routes semantic information from input topics to output topics based on pre-defined routes. The Semantic Router takes in a list of input topics, a list of routes, an optional default route, and a configuration object. It uses the database client to store and retrieve routing information. 27 | 28 | * - **[Vision](agents.components.vision.md)** 29 | - This component performs object detection and tracking on input images and outputs a list of detected objects, along with their bounding boxes and confidence scores. 30 | 31 | * - **[VideoMessageMaker](agents.components.imagestovideo.md)** 32 | - This component generates ROS video messages from input image messages. A video message is a collection of image messages that have a perceivable motion. I.e. the primary task of this component is to make intentionality decisions about what sequence of consecutive images should be treated as one coherent temporal sequence. The motion estimation method used for selecting images for a video can be configured in component config. 33 | ``` 34 | """ 35 | 36 | from .component_base import Component 37 | from .imagestovideo import VideoMessageMaker 38 | from .llm import LLM 39 | from .map_encoding import MapEncoding 40 | from .mllm import MLLM 41 | from .model_component import ModelComponent 42 | from .semantic_router import SemanticRouter 43 | from .speechtotext import SpeechToText 44 | from .texttospeech import TextToSpeech 45 | from .vision import Vision 46 | 47 | __all__ = [ 48 | "Component", 49 | "ModelComponent", 50 | "MapEncoding", 51 | "MLLM", 52 | "LLM", 53 | "SpeechToText", 54 | "TextToSpeech", 55 | "Vision", 56 | "VideoMessageMaker", 57 | "SemanticRouter", 58 | ] 59 | -------------------------------------------------------------------------------- /agents/agents/components/component_base.py: -------------------------------------------------------------------------------- 1 | import json 2 | from abc import abstractmethod 3 | from copy import deepcopy 4 | from typing import Optional, Sequence, Union, List, Dict, Type 5 | 6 | from ..ros import BaseComponent, ComponentRunType, FixedInput, SupportedType, Topic 7 | from ..config import BaseComponentConfig 8 | 9 | 10 | class Component(BaseComponent): 11 | """Component.""" 12 | 13 | def __init__( 14 | self, 15 | inputs: Optional[Sequence[Union[Topic, FixedInput]]] = None, 16 | outputs: Optional[Sequence[Topic]] = None, 17 | config: Optional[BaseComponentConfig] = None, 18 | trigger: Union[Topic, List[Topic], float] = 1.0, 19 | callback_group=None, 20 | component_name: str = "agents_component", 21 | **kwargs, 22 | ): 23 | self.config: BaseComponentConfig = ( 24 | deepcopy(config) if config else BaseComponentConfig() 25 | ) 26 | self.allowed_inputs: Dict[str, List[Type[SupportedType]]] 27 | self.allowed_outputs: Dict[str, List[Type[SupportedType]]] 28 | 29 | # setup inputs and outputs 30 | if inputs: 31 | self.validate_topics( 32 | inputs, 33 | allowed_topic_types=self.allowed_inputs, 34 | topics_direction="Inputs", 35 | ) 36 | 37 | if outputs: 38 | if hasattr(self, "allowed_outputs"): 39 | self.validate_topics( 40 | outputs, 41 | allowed_topic_types=self.allowed_outputs, 42 | topics_direction="Outputs", 43 | ) 44 | 45 | # Initialize Parent Component 46 | super().__init__( 47 | 
component_name=component_name, 48 | inputs=inputs, 49 | outputs=outputs, 50 | config=self.config, 51 | callback_group=callback_group, 52 | enable_health_broadcast=False, 53 | **kwargs, 54 | ) 55 | 56 | # setup component run type and triggers 57 | self.trigger(trigger) 58 | 59 | def custom_on_activate(self): 60 | """ 61 | Custom configuration for creating triggers. 62 | """ 63 | # Setup trigger based callback or frequency based timer 64 | if self.run_type is ComponentRunType.EVENT: 65 | self.activate_all_triggers() 66 | 67 | def create_all_subscribers(self): 68 | """ 69 | Override to handle trigger topics and fixed inputs. 70 | Called by parent BaseComponent 71 | """ 72 | self.get_logger().info("STARTING ALL SUBSCRIBERS") 73 | all_callbacks = ( 74 | list(self.callbacks.values()) + list(self.trig_callbacks.values()) 75 | if self.run_type is ComponentRunType.EVENT 76 | else self.callbacks.values() 77 | ) 78 | for callback in all_callbacks: 79 | callback.set_node_name(self.node_name) 80 | if hasattr(callback.input_topic, "fixed"): 81 | self.get_logger().debug( 82 | f"Fixed input specified for topic: {callback.input_topic} of type {callback.input_topic.msg_type}" 83 | ) 84 | else: 85 | callback.set_subscriber(self._add_ros_subscriber(callback)) 86 | 87 | def activate_all_triggers(self) -> None: 88 | """ 89 | Activates component triggers by attaching execution step to callbacks 90 | """ 91 | self.get_logger().info("ACTIVATING TRIGGER TOPICS") 92 | if hasattr(self, "trig_callbacks"): 93 | for callback in self.trig_callbacks.values(): 94 | # Add execution step of the node as a post callback function 95 | callback.on_callback_execute(self._execution_step) 96 | 97 | def destroy_all_subscribers(self) -> None: 98 | """ 99 | Destroys all node subscribers 100 | """ 101 | self.get_logger().info("DESTROYING ALL SUBSCRIBERS") 102 | all_callbacks = ( 103 | list(self.callbacks.values()) + list(self.trig_callbacks.values()) 104 | if self.run_type is ComponentRunType.EVENT 105 | else self.callbacks.values() 106 | ) 107 | for callback in all_callbacks: 108 | if callback._subscriber: 109 | self.destroy_subscription(callback._subscriber) 110 | 111 | def trigger(self, trigger: Union[Topic, List[Topic], float]) -> None: 112 | """ 113 | Set component trigger 114 | """ 115 | if isinstance(trigger, list): 116 | for t in trigger: 117 | if t.name not in self.callbacks: 118 | raise TypeError( 119 | f"Invalid configuration for component trigger {t.name} - A trigger needs to be one of the inputs already defined in component inputs." 120 | ) 121 | self.run_type = ComponentRunType.EVENT 122 | self.trig_callbacks = {} 123 | for t in trigger: 124 | self.trig_callbacks[t.name] = self.callbacks[t.name] 125 | # remove trigger inputs from self.callbacks 126 | del self.callbacks[t.name] 127 | 128 | elif isinstance(trigger, Topic): 129 | if trigger.name not in self.callbacks: 130 | raise TypeError( 131 | f"Invalid configuration for component trigger {trigger.name} - A trigger needs to be one of the inputs already defined in component inputs." 
132 | ) 133 | self.run_type = ComponentRunType.EVENT 134 | self.trig_callbacks = {trigger.name: self.callbacks[trigger.name]} 135 | del self.callbacks[trigger.name] 136 | 137 | else: 138 | self.run_type = ComponentRunType.TIMED 139 | # Set component loop_rate (Hz) 140 | self.config.loop_rate = 1 / trigger 141 | 142 | self.trig_topic: Union[Topic, list[Topic], float] = trigger 143 | 144 | def validate_topics( 145 | self, 146 | topics: Sequence[Union[Topic, FixedInput]], 147 | allowed_topic_types: Optional[Dict[str, List[Type[SupportedType]]]] = None, 148 | topics_direction: str = "Topics", 149 | ): 150 | """ 151 | Verify component specific inputs or outputs using allowed topics if provided 152 | """ 153 | # type validation 154 | correct_type = all(isinstance(i, (Topic, FixedInput)) for i in topics) 155 | if not correct_type: 156 | raise TypeError( 157 | f"{topics_direction} to a component can only be of type Topic" 158 | ) 159 | 160 | # Check that only the allowed topics (or their subtypes) have been given 161 | if not allowed_topic_types: 162 | return 163 | 164 | all_msg_types = {topic.msg_type for topic in topics} 165 | all_topic_types = allowed_topic_types["Required"] + ( 166 | allowed_topic_types.get("Optional") or [] 167 | ) 168 | 169 | if msg_type := next( 170 | ( 171 | topic 172 | for topic in all_msg_types 173 | if not any( 174 | issubclass(topic, allowed_t) for allowed_t in all_topic_types 175 | ) 176 | ), 177 | None, 178 | ): 179 | raise TypeError( 180 | f"{topics_direction} to the component of type {self.__class__.__name__} can only be of the allowed datatypes: {[topic.__name__ for topic in all_topic_types]} or their subclasses. A topic of type {msg_type.__name__} cannot be given to this component." 181 | ) 182 | 183 | # Check that all required topics (or subtypes) have been given 184 | sufficient_topics = all( 185 | any(issubclass(m_type, allowed_type) for m_type in all_msg_types) 186 | for allowed_type in allowed_topic_types["Required"] 187 | ) 188 | 189 | if not sufficient_topics: 190 | raise TypeError( 191 | f"{self.__class__.__name__} component {topics_direction} should have at least one topic of each datatype in the following list: {[topic.__name__ for topic in allowed_topic_types['Required']]}" 192 | ) 193 | 194 | @abstractmethod 195 | def _execution_step(self, **kwargs): 196 | """_execution_step. 197 | 198 | :param args: 199 | :param kwargs: 200 | """ 201 | raise NotImplementedError( 202 | "This method needs to be implemented by child components." 
203 | ) 204 | 205 | def _update_cmd_args_list(self): 206 | """ 207 | Update launch command arguments 208 | """ 209 | super()._update_cmd_args_list() 210 | 211 | self.launch_cmd_args = [ 212 | "--trigger", 213 | self._get_trigger_json(), 214 | ] 215 | 216 | def _get_trigger_json(self) -> Union[str, bytes, bytearray]: 217 | """ 218 | Serialize component routes to json 219 | 220 | :return: Serialized inputs 221 | :rtype: str | bytes | bytearray 222 | """ 223 | if isinstance(self.trig_topic, Topic): 224 | return self.trig_topic.to_json() 225 | elif isinstance(self.trig_topic, List): 226 | return json.dumps([t.to_json() for t in self.trig_topic]) 227 | else: 228 | return json.dumps(self.trig_topic) 229 | -------------------------------------------------------------------------------- /agents/agents/components/imagestovideo.py: -------------------------------------------------------------------------------- 1 | import math 2 | from typing import Optional, Union, List 3 | 4 | import cv2 5 | import numpy as np 6 | 7 | from ..config import VideoMessageMakerConfig 8 | from ..ros import Image, Topic, Video, ROSImage, ROSCompressedImage 9 | from ..utils import validate_func_args 10 | from .component_base import Component 11 | 12 | 13 | class VideoMessageMaker(Component): 14 | """ 15 | This component generates ROS video messages from input image messages. A video message is a collection of image messages that have a perceivable motion. 16 | I.e. the primary task of this component is to make intentionality decisions about what sequence of consecutive images should be treated as one coherent temporal sequence. 17 | The motion estimation method used for selecting images for a video can be configured in component config. 18 | 19 | :param inputs: The input topics for the object detection. 20 | This should be a list of Topic objects or FixedInput objects, limited to Image type. 21 | :type inputs: list[Topic] 22 | :param outputs: The output topics for the object detection. 23 | This should be a list of Topic objects, Video type. 24 | :type outputs: list[Topic] 25 | :param config: The configuration for the video message generation. 26 | This should be an instance of VideoMessageMakerConfig. 27 | :type config: VideoMessageMakerConfig 28 | :param trigger: The trigger value or topic for the object detection. 29 | This can be a single Topic object or a list of Topic objects. 30 | :type trigger: Union[Topic, list[Topic]] 31 | :param callback_group: An optional callback group for the video message generation. 32 | If provided, this should be a string. Otherwise, it defaults to None. 33 | :type callback_group: str 34 | :param component_name: The name of the video message generation component. 35 | This should be a string and defaults to "video_maker_component". 
36 | :type component_name: str 37 | 38 | Example usage: 39 | ```python 40 | image_topic = Topic(name="image", msg_type="Image") 41 | video_topic = Topic(name="video", msg_type="Video") 42 | config = VideoMessageMakerConfig() 43 | video_message_maker = VideoMessageMaker( 44 | inputs=[image_topic], 45 | outputs=[video_topic], 46 | config=config, 47 | component_name="video_message_maker", 48 | ) 49 | ``` 50 | """ 51 | 52 | @validate_func_args 53 | def __init__( 54 | self, 55 | *, 56 | inputs: List[Topic], 57 | outputs: List[Topic], 58 | config: Optional[VideoMessageMakerConfig] = None, 59 | trigger: Union[Topic, List[Topic]], 60 | component_name: str, 61 | callback_group=None, 62 | **kwargs, 63 | ): 64 | if isinstance(trigger, float): 65 | raise TypeError( 66 | "VideoMessageMaker component needs to be given a valid trigger topic. It cannot be started as a timed component." 67 | ) 68 | 69 | self.config: VideoMessageMakerConfig = config or VideoMessageMakerConfig() 70 | self.allowed_inputs = {"Required": [Image]} 71 | self.allowed_outputs = {"Required": [Video]} 72 | 73 | super().__init__( 74 | inputs, 75 | outputs, 76 | self.config, 77 | trigger, 78 | callback_group, 79 | component_name, 80 | **kwargs, 81 | ) 82 | 83 | self._frames: Union[List[ROSImage], List[ROSCompressedImage]] = [] 84 | self._last_frame: Optional[np.ndarray] = None 85 | self._capture: bool = False 86 | 87 | def _motion_estimation(self, current_frame: np.ndarray) -> bool: 88 | """Motion estimation methods between two frames. 89 | :param current_frame: 90 | :type current_frame: np.ndarray 91 | :rtype: bool 92 | """ 93 | # get gray scale image 94 | gray = cv2.cvtColor(current_frame, cv2.COLOR_RGB2GRAY) 95 | if self.config.motion_estimation_func == "frame_difference": 96 | return self._frame_difference(gray, self.config.threshold) 97 | elif self.config.motion_estimation_func == "optical_flow": 98 | return self._optical_flow( 99 | gray, self.config.threshold, **self.config.flow_kwargs 100 | ) 101 | else: 102 | return True 103 | 104 | def _frame_difference(self, img: np.ndarray, threshold: float) -> bool: 105 | """Calculates difference between two frames and returns true 106 | if difference is greater than defined threshold. 107 | :param img: 108 | :type img: np.ndarray 109 | :param threshold: 110 | :type threshold: int 111 | :rtype: bool 112 | """ 113 | # calculate frame difference 114 | diff = cv2.subtract(img, self._last_frame) 115 | # apply blur to improve thresholding 116 | diff = cv2.medianBlur(diff, 3) 117 | # apply adaptive thresholding 118 | mask = cv2.adaptiveThreshold( 119 | diff, 1, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2 120 | ) 121 | return True if mask.sum() > (threshold * math.prod(img.shape) / 100) else False 122 | 123 | def _optical_flow(self, img: np.ndarray, threshold: float, **kwargs) -> bool: 124 | """Calculates optical flow between two frames and returns true 125 | if flow is greater than defined threshold. 
126 | :param img: 127 | :type img: np.ndarray 128 | :param threshold: 129 | :type threshold: int 130 | :rtype: bool 131 | """ 132 | # calculate optical flow 133 | flow = cv2.calcOpticalFlowFarneback(self._last_frame, img, None, **kwargs) 134 | mask = np.uint8(flow > 1) / 10 135 | return True if mask.sum() > (threshold * math.prod(img.shape) / 100) else False 136 | 137 | def _execution_step(self, *_, **kwargs) -> None: 138 | """Collects incoming image messages until a criteria is met 139 | When met, publishes image messages as video 140 | :param args: 141 | :param kwargs: 142 | """ 143 | msg = kwargs.get("msg") 144 | topic = kwargs.get("topic") 145 | if msg and topic: 146 | output = self.trig_callbacks[topic.name].get_output() 147 | if self._last_frame is not None: 148 | # calculate motion estimation for start and stop 149 | self._capture = ( 150 | True 151 | if self._motion_estimation(output) 152 | and len(self._frames) < self.config.max_video_frames 153 | else False 154 | ) 155 | if self._capture: 156 | self._frames.append(msg) 157 | self._last_frame = cv2.cvtColor(output, cv2.COLOR_RGB2GRAY) 158 | 159 | # publish if video capture finished 160 | if ( 161 | self.publishers_dict 162 | and (not self._capture) 163 | and len(self._frames) >= self.config.min_video_frames 164 | ): 165 | self.get_logger().debug(f"Sending out video of {len(self._frames)} frames") 166 | for publisher in self.publishers_dict.values(): 167 | publisher.publish(output=self._frames) 168 | self._frames = [] 169 | -------------------------------------------------------------------------------- /agents/agents/components/mllm.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Union, Optional, List, Dict 2 | 3 | from ..clients.db_base import DBClient 4 | from ..clients.model_base import ModelClient 5 | from ..config import MLLMConfig 6 | from ..ros import FixedInput, Image, String, Topic, Detections 7 | from ..utils import validate_func_args 8 | from .llm import LLM 9 | 10 | 11 | class MLLM(LLM): 12 | """ 13 | This component utilizes multi-modal large language models (e.g. Llava) that can be used to process text and image data. 14 | 15 | :param inputs: The input topics or fixed inputs for the MLLM component. 16 | This should be a list of Topic objects or FixedInput instances, limited to String and Image types. 17 | :type inputs: list[Topic | FixedInput] 18 | :param outputs: The output topics for the MLLM component. 19 | This should be a list of Topic objects. String type is handled automatically. 20 | :type outputs: list[Topic] 21 | :param model_client: The model client for the MLLM component. 22 | This should be an instance of ModelClient. 23 | :type model_client: ModelClient 24 | :param config: Optional configuration for the MLLM component. 25 | This should be an instance of MLLMConfig. If not provided, defaults to MLLMConfig(). 26 | :type config: MLLMConfig 27 | :param trigger: The trigger value or topic for the MLLM component. 28 | This can be a single Topic object, a list of Topic objects, or a float value for a timed component. Defaults to 1. 29 | :type trigger: Union[Topic, list[Topic], float] 30 | :param callback_group: An optional callback group for the MLLM component. 31 | If provided, this should be a string. Otherwise, it defaults to None. 32 | :type callback_group: str 33 | :param component_name: The name of the MLLM component. 34 | This should be a string and defaults to "mllm_component". 
35 | :type component_name: str 36 | 37 | Example usage: 38 | ```python 39 | text0 = Topic(name="text0", msg_type="String") 40 | image0 = Topic(name="image0", msg_type="Image") 41 | text0 = Topic(name="text1", msg_type="String") 42 | config = MLLMConfig() 43 | model = TransformersMLLM(name='idefics') 44 | model_client = ModelClient(model=model) 45 | mllm_component = MLLM(inputs=[text0, image0], 46 | outputs=[text1], 47 | model_client=model_client, 48 | config=config, 49 | component_name='mllm_component') 50 | ``` 51 | """ 52 | 53 | @validate_func_args 54 | def __init__( 55 | self, 56 | *, 57 | inputs: List[Union[Topic, FixedInput]], 58 | outputs: List[Topic], 59 | model_client: ModelClient, 60 | config: Optional[MLLMConfig] = None, 61 | db_client: Optional[DBClient] = None, 62 | trigger: Union[Topic, List[Topic], float] = 1.0, 63 | component_name: str, 64 | callback_group=None, 65 | **kwargs, 66 | ): 67 | self.allowed_inputs = {"Required": [String, Image], "Optional": [Detections]} 68 | 69 | config = config or MLLMConfig() 70 | 71 | super().__init__( 72 | inputs=inputs, 73 | outputs=outputs, 74 | model_client=model_client, 75 | config=config, 76 | db_client=db_client, 77 | trigger=trigger, 78 | callback_group=callback_group, 79 | component_name=component_name, 80 | allowed_inputs=self.allowed_inputs, 81 | **kwargs, 82 | ) 83 | 84 | def _create_input(self, *_, **kwargs) -> Optional[Dict[str, Any]]: 85 | """Create inference input for MLLM models 86 | :param args: 87 | :param kwargs: 88 | :rtype: dict[str, Any] 89 | """ 90 | images = [] 91 | # context dict to gather all String inputs for use in system prompt 92 | context = {} 93 | # set mllm query as trigger 94 | if trigger := kwargs.get("topic"): 95 | query = self.trig_callbacks[trigger.name].get_output() 96 | context[trigger.name] = query 97 | 98 | # handle chat reset 99 | if ( 100 | self.config.chat_history 101 | and query.strip().lower() == self.config.history_reset_phrase 102 | ): 103 | self.messages = [] 104 | return None 105 | 106 | else: 107 | query = None 108 | 109 | # aggregate all inputs that are available 110 | for i in self.callbacks.values(): 111 | if (item := i.get_output()) is not None: 112 | # set trigger equal to a topic with type String if trigger not found 113 | if i.input_topic.msg_type is String: 114 | if not query: 115 | query = item 116 | context[i.input_topic.name] = item 117 | elif i.input_topic.msg_type is Detections: 118 | context[i.input_topic.name] = item 119 | # get images from image topics 120 | if issubclass(i.input_topic.msg_type, Image): 121 | images.append(item) 122 | 123 | if not query or not images: 124 | return None 125 | 126 | # get RAG results if enabled in config and if docs retreived 127 | rag_result = self._handle_rag_query(query) if self.config.enable_rag else None 128 | 129 | # set system prompt template 130 | query = ( 131 | self.component_prompt.render(context) if self.component_prompt else query 132 | ) 133 | 134 | # get RAG results if enabled in config and if docs retreived 135 | query = f"{rag_result}\n{query}" if rag_result else query 136 | 137 | message = {"role": "user", "content": query} 138 | self._handle_chat_history(message) 139 | 140 | self.get_logger().debug(f"Input from component: {self.messages}") 141 | 142 | input = { 143 | "query": self.messages, 144 | "images": images, 145 | **self.config._get_inference_params(), 146 | } 147 | 148 | # Add any tools, if registered 149 | if self.config._tool_descriptions: 150 | input["tools"] = self.config._tool_descriptions 151 | 152 | return 
input 153 | 154 | def _warmup(self): 155 | """Warm up and stat check""" 156 | import time 157 | from pathlib import Path 158 | import cv2 159 | 160 | image = cv2.imread(str(Path(__file__).parents[1] / Path("resources/test.jpeg"))) 161 | 162 | message = {"role": "user", "content": "What do you see?"} 163 | inference_input = { 164 | "query": [message], 165 | "images": [image], 166 | **self.config._get_inference_params(), 167 | } 168 | 169 | # Run inference once to warm up and once to measure time 170 | self.model_client.inference(inference_input) 171 | 172 | inference_input = { 173 | "query": [message], 174 | "images": [image], 175 | **self.config._get_inference_params(), 176 | } 177 | start_time = time.time() 178 | result = self.model_client.inference(inference_input) 179 | elapsed_time = time.time() - start_time 180 | 181 | self.get_logger().warning(f"Model Output: {result['output']}") 182 | self.get_logger().warning(f"Approximate Inference time: {elapsed_time} seconds") 183 | -------------------------------------------------------------------------------- /agents/agents/components/model_component.py: -------------------------------------------------------------------------------- 1 | from abc import abstractmethod 2 | import inspect 3 | import json 4 | from typing import Any, Optional, Sequence, Union, List, Dict, Type 5 | 6 | from ..clients.model_base import ModelClient 7 | from ..config import ModelComponentConfig 8 | from ..ros import FixedInput, Topic, SupportedType 9 | from .component_base import Component 10 | 11 | 12 | class ModelComponent(Component): 13 | """ModelComponent.""" 14 | 15 | def __init__( 16 | self, 17 | inputs: Optional[Sequence[Union[Topic, FixedInput]]] = None, 18 | outputs: Optional[Sequence[Topic]] = None, 19 | model_client: Optional[ModelClient] = None, 20 | config: Optional[ModelComponentConfig] = None, 21 | trigger: Union[Topic, List[Topic], float] = 1.0, 22 | callback_group=None, 23 | component_name: str = "model_component", 24 | **kwargs, 25 | ): 26 | # setup model client 27 | self.model_client = model_client if model_client else None 28 | 29 | self.handled_outputs: List[Type[SupportedType]] 30 | 31 | if not config: 32 | self.config = ModelComponentConfig() 33 | 34 | # Initialize Component 35 | super().__init__( 36 | inputs, 37 | outputs, 38 | config, 39 | trigger, 40 | callback_group, 41 | component_name, 42 | **kwargs, 43 | ) 44 | 45 | def custom_on_configure(self): 46 | """ 47 | Create model client if provided and initialize model. 48 | """ 49 | self.get_logger().debug(f"Current Status: {self.health_status.value}") 50 | 51 | # validate output topics if handled_outputs exist 52 | self.get_logger().info("Validating Model Component Output Topics") 53 | self._validate_output_topics() 54 | 55 | # Initialize model 56 | if self.model_client: 57 | self.model_client.check_connection() 58 | self.model_client.initialize() 59 | if self.config.warmup: 60 | try: 61 | self._warmup() 62 | except Exception as e: 63 | self.get_logger().error(f"Error encountered in warmup: {e}") 64 | 65 | def custom_on_deactivate(self): 66 | """ 67 | Destroy model client if it exists 68 | """ 69 | # Deinitialize model 70 | if self.model_client: 71 | self.model_client.check_connection() 72 | self.model_client.deinitialize() 73 | 74 | def _validate_output_topics(self) -> None: 75 | """ 76 | Verify that output topics that are not handled, have pre-processing functions provided. 
We just check that there is a pre-processing function and do not check whether the functions have output of the corresponding type. 77 | """ 78 | 79 | if hasattr(self, "publishers_dict") and hasattr(self, "handled_outputs"): 80 | for name, pub in self.publishers_dict.items(): 81 | if pub.output_topic.msg_type not in self.handled_outputs and ( 82 | not self._external_processors 83 | ): 84 | func_body = inspect.getsource(pub.output_topic.msg_type.convert) 85 | raise TypeError(f"""{type(self).__name__} components can only handle output topics of type(s) {self.handled_outputs} automatically. Topic {name} is of type {pub.output_topic.msg_type}. EITHER provide a pre-processing function for this topic and attach it to the topic by calling the `add_publisher_preprocessor` on the component {self.node_name} OR provide a tool call that can provide structured inference output and attach it by calling `register_tool` on {self.node_name}. Make sure the output can be passed as parameter `output` to the following function: 86 | {func_body}""") 87 | 88 | @property 89 | def warmup(self) -> bool: 90 | """Enable warmup of the model.""" 91 | return self.config.warmup 92 | 93 | @warmup.setter 94 | def warmup(self, value: bool) -> None: 95 | """Enable warmup of the model.""" 96 | self.config.warmup = value 97 | 98 | @abstractmethod 99 | def _create_input(self, *args, **kwargs) -> Union[Dict[str, Any], None]: 100 | """_create_input. 101 | 102 | :param args: 103 | :param kwargs: 104 | :rtype: dict[str, Any] | None 105 | """ 106 | raise NotImplementedError( 107 | "_create_input method needs to be implemented by child components." 108 | ) 109 | 110 | @abstractmethod 111 | def _execution_step(self, *args, **kwargs): 112 | """_execution_step. 113 | 114 | :param args: 115 | :param kwargs: 116 | """ 117 | raise NotImplementedError( 118 | "_execution_step method needs to be implemented by child components." 119 | ) 120 | 121 | @abstractmethod 122 | def _warmup(self, *args, **kwargs): 123 | """_warmup. 124 | 125 | :param args: 126 | :param kwargs: 127 | """ 128 | raise NotImplementedError( 129 | "_warmup method needs to be implemented by child components." 130 | ) 131 | 132 | def _update_cmd_args_list(self): 133 | """ 134 | Update launch command arguments 135 | """ 136 | super()._update_cmd_args_list() 137 | 138 | self.launch_cmd_args = [ 139 | "--model_client", 140 | self._get_model_client_json(), 141 | ] 142 | 143 | def _get_model_client_json(self) -> Union[str, bytes, bytearray]: 144 | """ 145 | Serialize component routes to json 146 | 147 | :return: Serialized inputs 148 | :rtype: str | bytes | bytearray 149 | """ 150 | if not self.model_client: 151 | return "" 152 | return json.dumps(self.model_client.serialize()) 153 | -------------------------------------------------------------------------------- /agents/agents/components/semantic_router.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, List, Union 2 | import json 3 | 4 | from ..clients.db_base import DBClient 5 | from ..config import SemanticRouterConfig 6 | from ..publisher import Publisher 7 | from ..ros import String, Topic, Route 8 | from ..utils import validate_func_args 9 | from .component_base import Component 10 | 11 | 12 | class SemanticRouter(Component): 13 | """A component that routes semantic information from input topics to output topics based on pre-defined routes. 
The Semantic Router takes in a list of input topics, a list of routes, an optional default route, and a configuration object. It uses the database client to store and retrieve routing information. 14 | 15 | :param inputs: 16 | A list of input text topics that this component will subscribe to. 17 | :type inputs: list[Topic] 18 | :param routes: 19 | A list of pre-defined routes that publish incoming input to the routed output topics. 20 | :type routes: list[Route] 21 | :param default_route: 22 | An optional route that specifies the default behavior when no specific route matches up to a threshold. If not provided, the component will use the first route in the list. 23 | :type default_route: Optional[Route] 24 | :param config: 25 | The configuration object for this Semantic Router component. 26 | :type config: SemanticRouterConfig 27 | :param db_client: 28 | A database client that is used to store and retrieve routing information. 29 | :type db_client: DBClient 30 | :param callback_group: 31 | An optional callback group for this component. 32 | :param component_name: 33 | The name of this Semantic Router component (default: "router_component"). 34 | :type component_name: str 35 | :param kwargs: 36 | Additional keyword arguments. 37 | 38 | Example usage: 39 | ```python 40 | input_text = Topic(name="text0", msg_type="String") 41 | goto_route = Route( 42 | routes_to=goto, # where goto is an input topic to another component 43 | samples=[ 44 | "Go to the door", 45 | "Go to the kitchen", 46 | "Get me a glass", 47 | "Fetch a ball", 48 | "Go to hallway", 49 | "Go over there", 50 | ], 51 | ) 52 | mllm_route = Route( 53 | routes_to=mllm_input, # where mllm_input is an input topic to another component 54 | samples=[ 55 | "Are we indoors or outdoors", 56 | "What do you see?", 57 | "Whats in front of you?", 58 | "Where are we", 59 | "Do you see any people?", 60 | "How many things are infront of you?", 61 | "Is this room occupied?", 62 | ], 63 | ) 64 | config = SemanticRouterConfig(router_name="my_router") 65 | db_client = DBClient(db=ChromaDB("database_name")) 66 | semantic_router = SemanticRouter( 67 | inputs=[input_text], 68 | routes=[route1, route2], 69 | default_route=None, 70 | config=config, 71 | db_client=db_client 72 | component_name = "router" 73 | ) 74 | ``` 75 | """ 76 | 77 | @validate_func_args 78 | def __init__( 79 | self, 80 | *, 81 | inputs: List[Topic], 82 | routes: List[Route], 83 | config: SemanticRouterConfig, 84 | db_client: DBClient, 85 | default_route: Optional[Route] = None, 86 | component_name: str, 87 | callback_group=None, 88 | **kwargs, 89 | ): 90 | self.config: SemanticRouterConfig = config 91 | self.allowed_inputs = {"Required": [String]} 92 | self.allowed_outputs = {"Required": [String]} 93 | self.db_client = db_client 94 | 95 | super().__init__( 96 | inputs, 97 | None, 98 | self.config, 99 | inputs, 100 | callback_group, 101 | component_name, 102 | **kwargs, 103 | ) 104 | 105 | # create routes 106 | self._routes(routes) 107 | 108 | if default_route: 109 | if default_route.routes_to.name not in self.routes_dict: 110 | raise TypeError("default_route must be one of the specified routes") 111 | self.default_route = self.config._default_route = default_route 112 | 113 | def custom_on_configure(self): 114 | self.get_logger().debug(f"Current Status: {self.health_status.value}") 115 | 116 | # configure the rest 117 | super().custom_on_configure() 118 | 119 | # initialize db client 120 | self.db_client.check_connection() 121 | self.db_client.initialize() 122 | 123 | # initialize 
routes 124 | self._initialize_routes() 125 | 126 | def deactivate(self): 127 | # deactivate db client 128 | self.db_client.check_connection() 129 | self.db_client.deinitialize() 130 | 131 | def _initialize_routes(self): 132 | """Create routes by saving route samples in the database.""" 133 | self.get_logger().info("Initializing all routes") 134 | for idx, (name, route) in enumerate(self.routes_dict.items()): 135 | route_to_add = { 136 | "collection_name": self.config.router_name, 137 | "distance_func": self.config.distance_func, 138 | "documents": route.samples, 139 | "metadatas": [{"route_name": name} for _ in range(len(route.samples))], 140 | "ids": [f"{name}.{i}" for i in range(len(route.samples))], 141 | } 142 | # reset collection on the addition of first route if it exists 143 | if idx == 0: 144 | route_to_add["reset_collection"] = True 145 | 146 | self.db_client.add(route_to_add) 147 | 148 | def _execution_step(self, **kwargs): 149 | """Execution step for Semantic Router component. 150 | :param args: 151 | :param kwargs: 152 | """ 153 | trigger = kwargs.get("topic") 154 | if not trigger: 155 | return 156 | 157 | self.get_logger().debug(f"Received trigger on {trigger.name}") 158 | trigger_query = self.trig_callbacks[trigger.name].get_output() 159 | # get route 160 | db_input = { 161 | "collection_name": self.config.router_name, 162 | "query": trigger_query, 163 | "n_results": 1, 164 | } 165 | result = self.db_client.query(db_input) 166 | 167 | # TODO: Add treatment of multiple results by using an averaging function 168 | if result: 169 | distance = result["output"]["distances"][0][0] 170 | # if default route is specified and distance is less than min 171 | # threshold, redirect to default route 172 | route = ( 173 | self.default_route.routes_to.name 174 | if self.default_route and distance > self.config.maximum_distance 175 | else result["output"]["metadatas"][0][0]["route_name"] 176 | ) 177 | 178 | self.publishers_dict[route].publish(trigger_query) 179 | else: 180 | self.health_status.set_failure() 181 | 182 | def _routes(self, routes: List[Route]): 183 | """ 184 | Set component Routes (topics) 185 | """ 186 | self.routes_dict = {route.routes_to.name: route for route in routes} 187 | route_topics: List[Topic] = [route.routes_to for route in routes] # type: ignore 188 | self.validate_topics(route_topics, self.allowed_outputs, "Outputs") 189 | self.publishers_dict = { 190 | route_topic.name: Publisher(route_topic) for route_topic in route_topics 191 | } 192 | 193 | def _update_cmd_args_list(self): 194 | """ 195 | Update launch command arguments 196 | """ 197 | super()._update_cmd_args_list() 198 | 199 | self.launch_cmd_args = [ 200 | "--routes", 201 | self._get_routes_json(), 202 | ] 203 | 204 | self.launch_cmd_args = [ 205 | "--db_client", 206 | self._get_db_client_json(), 207 | ] 208 | 209 | def _get_routes_json(self) -> Union[str, bytes, bytearray]: 210 | """ 211 | Serialize component routes to json 212 | 213 | :return: Serialized inputs 214 | :rtype: str | bytes | bytearray 215 | """ 216 | if not hasattr(self, "routes_dict"): 217 | return "[]" 218 | return json.dumps([route.to_json() for route in self.routes_dict.values()]) 219 | 220 | def _get_db_client_json(self) -> Union[str, bytes, bytearray]: 221 | """ 222 | Serialize component routes to json 223 | 224 | :return: Serialized inputs 225 | :rtype: str | bytes | bytearray 226 | """ 227 | if not self.db_client: 228 | return "" 229 | return json.dumps(self.db_client.serialize()) 230 | 
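The docstring example above sketches the intended wiring; a more complete, self-contained sketch of constructing a `SemanticRouter` with a default route follows. It assumes a roboml server reachable on localhost:8000 as the vector DB backend, and all topic, route, and collection names are illustrative placeholders rather than values taken from this repository.

```python
from agents.components import SemanticRouter
from agents.config import SemanticRouterConfig
from agents.clients.roboml import HTTPDBClient
from agents.vectordbs import ChromaDB
from agents.ros import Topic, Route

# Text topic carrying user queries into the router
query_topic = Topic(name="text0", msg_type="String")

# Topics that feed two downstream components (illustrative names)
goto_in = Topic(name="goto_in", msg_type="String")
mllm_in = Topic(name="mllm_in", msg_type="String")

# Each Route maps sample phrases to the topic the query should be re-published on
goto_route = Route(
    routes_to=goto_in,
    samples=["Go to the door", "Go to the kitchen", "Go over there"],
)
mllm_route = Route(
    routes_to=mllm_in,
    samples=["What do you see?", "Are we indoors or outdoors?", "Is this room occupied?"],
)

# SemanticRouterConfig.maximum_distance decides when the best match is too weak
# and the default route is used instead (see _execution_step above)
config = SemanticRouterConfig(router_name="my_router")

# Route samples are stored and queried through a vector DB client
db_client = HTTPDBClient(ChromaDB(name="routing_db"), port=8000)

router = SemanticRouter(
    inputs=[query_topic],
    routes=[goto_route, mllm_route],
    default_route=mllm_route,  # must be one of the routes above
    config=config,
    db_client=db_client,
    component_name="router",
)
```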
-------------------------------------------------------------------------------- /agents/agents/publisher.py: -------------------------------------------------------------------------------- 1 | from ros_sugar.io import Publisher 2 | 3 | __all__ = ["Publisher"] 4 | -------------------------------------------------------------------------------- /agents/agents/resources/test.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/agents/agents/resources/test.jpeg -------------------------------------------------------------------------------- /agents/agents/resources/test.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/agents/agents/resources/test.wav -------------------------------------------------------------------------------- /agents/agents/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .utils import ( 2 | create_detection_context, 3 | validate_kwargs, 4 | validate_func_args, 5 | PDFReader, 6 | get_prompt_template, 7 | encode_arr_base64, 8 | VADStatus, 9 | WakeWordStatus, 10 | load_model, 11 | ) 12 | 13 | __all__ = [ 14 | "create_detection_context", 15 | "validate_kwargs", 16 | "validate_func_args", 17 | "PDFReader", 18 | "get_prompt_template", 19 | "encode_arr_base64", 20 | "VADStatus", 21 | "WakeWordStatus", 22 | "load_model", 23 | ] 24 | -------------------------------------------------------------------------------- /agents/agents/vectordbs.py: -------------------------------------------------------------------------------- 1 | """ 2 | The following vector DB specification classes are meant to define a common interface for initialization of vector DBs. Currently the only supported vector DB is Chroma. 3 | """ 4 | 5 | from typing import Optional, Dict 6 | 7 | from attrs import define, field 8 | from .ros import BaseAttrs 9 | from .models import Encoder 10 | 11 | __all__ = ["ChromaDB"] 12 | 13 | 14 | @define(kw_only=True) 15 | class DB(BaseAttrs): 16 | """This class describes a database initialization configuration.""" 17 | 18 | name: str 19 | db_location: str = field(default="./data") 20 | username: Optional[str] = field(default=None) 21 | password: Optional[str] = field(default=None) 22 | encoder: Optional[Encoder] = field(default=None) 23 | init_timeout: int = field(default=600) # 10 minutes 24 | host: str = field(default="127.0.0.1") 25 | port: Optional[int] = field(default=None) 26 | 27 | def _get_init_params(self) -> Dict: 28 | params = { 29 | "username": self.username, 30 | "password": self.password, 31 | "db_location": self.db_location, 32 | } 33 | if self.encoder: 34 | params["encoder"] = self.encoder._get_init_params() 35 | return params 36 | 37 | 38 | @define(kw_only=True) 39 | class ChromaDB(DB): 40 | """[Chroma](https://www.trychroma.com/) is the open-source AI application database. It provides embeddings, vector search, document storage, full-text search, metadata filtering, and multi-modal retrieval support. 41 | 42 | :param name: An arbitrary name given to the database. 43 | :type name: str 44 | :param db_location: The on-disk location where the database will be initialized. Defaults to "./data". 45 | :type db_location: str, optional 46 | :param username: The username for authentication. Defaults to None. 
47 | :type username: Optional[str], optional 48 | :param password: The password for authentication. Defaults to None. 49 | :type password: Optional[str], optional 50 | :param encoder: An optional encoder model to use for text encoding. Defaults to None. 51 | :type encoder: Optional[Encoder], optional 52 | :param init_timeout: The timeout in seconds for the initialization process. Defaults to 10 minutes (600 seconds). 53 | :type init_timeout: int, optional 54 | :param host: The hostname or IP address of the database server. Defaults to "127.0.0.1". 55 | :type host: str, optional 56 | :param port: The port number to connect to the database server. Defaults to None. 57 | :type port: Optional[int], optional 58 | 59 | Example usage: 60 | ```python 61 | from agents.models import Encoder 62 | db_config = DB(name='my_database', username='user123', password='pass123') 63 | db_config.db_location = '/path/to/new/location' 64 | db_config.encoder = Encoder(checkpoint="BAAI/bge-small-en") 65 | ``` 66 | """ 67 | 68 | pass 69 | -------------------------------------------------------------------------------- /agents/msg/Bbox2D.msg: -------------------------------------------------------------------------------- 1 | float64 top_left_x 2 | float64 top_left_y 3 | float64 bottom_right_x 4 | float64 bottom_right_y 5 | -------------------------------------------------------------------------------- /agents/msg/Detection2D.msg: -------------------------------------------------------------------------------- 1 | std_msgs/Header header 2 | 3 | float64[] scores 4 | string[] labels 5 | Bbox2D[] boxes 6 | 7 | # Either an image or compressed image 8 | sensor_msgs/Image image 9 | sensor_msgs/CompressedImage compressed_image 10 | -------------------------------------------------------------------------------- /agents/msg/Detections2D.msg: -------------------------------------------------------------------------------- 1 | std_msgs/Header header 2 | 3 | Detection2D[] detections 4 | -------------------------------------------------------------------------------- /agents/msg/Point2D.msg: -------------------------------------------------------------------------------- 1 | float64 x 2 | float64 y 3 | -------------------------------------------------------------------------------- /agents/msg/Tracking.msg: -------------------------------------------------------------------------------- 1 | std_msgs/Header header 2 | 3 | Point2D[] centroids 4 | string[] labels 5 | Bbox2D[] boxes 6 | int8[] ids 7 | Point2D[] estimated_velocities 8 | 9 | # Either an image or compressed image 10 | sensor_msgs/Image image 11 | sensor_msgs/CompressedImage compressed_image 12 | -------------------------------------------------------------------------------- /agents/msg/Trackings.msg: -------------------------------------------------------------------------------- 1 | std_msgs/Header header 2 | 3 | Tracking[] trackings 4 | -------------------------------------------------------------------------------- /agents/msg/Video.msg: -------------------------------------------------------------------------------- 1 | std_msgs/Header header 2 | 3 | # Eithen a list of images or compressed images 4 | sensor_msgs/Image[] frames 5 | sensor_msgs/CompressedImage[] compressed_frames 6 | -------------------------------------------------------------------------------- /agents/package.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | automatika_embodied_agents 5 | 0.3.3 6 | agents 7 | Automatika Robotics 8 | MIT 9 | 10 
| builtin_interfaces 11 | std_msgs 12 | sensor_msgs 13 | python3-tqdm 14 | python3-httpx 15 | automatika_ros_sugar 16 | 17 | ament_cmake 18 | ament_cmake_python 19 | rosidl_default_generators 20 | rosidl_default_runtime 21 | rosidl_interface_packages 22 | 23 | python3-pytest 24 | 25 | 26 | ament_cmake 27 | 28 | 29 | -------------------------------------------------------------------------------- /agents/scripts/chainlit_client/app.py: -------------------------------------------------------------------------------- 1 | from io import BytesIO 2 | from typing import Union, Optional, List 3 | from enum import Enum 4 | 5 | import chainlit as cl 6 | from chainlit.element import ElementBased 7 | from chainlit.input_widget import TextInput 8 | 9 | import rclpy 10 | from rclpy.node import Node 11 | from std_msgs.msg import ByteMultiArray, String 12 | 13 | 14 | class Status(Enum): 15 | INIT = 0 16 | RECEIVED_TEXT = 1 17 | RECEIVED_AUDIO = 2 18 | TIMEOUT = 3 19 | 20 | 21 | class ClientNode(Node): 22 | """ 23 | Cli based text client with a publisher and subscriber. 24 | """ 25 | 26 | def __init__(self) -> None: 27 | """ 28 | Constructs a new instance. 29 | """ 30 | super().__init__("cli_client") 31 | self.msg: Optional[Union[str, bytes]] = None 32 | # Start with defaults 33 | self.set_trigger("text0", "audio0") 34 | self.set_target("text1", "audio1") 35 | 36 | def publish(self, prompt: Union[str, bytes]) -> None: 37 | """ 38 | Publish to the trigger topics and listen to the target topics 39 | 40 | :param prompt: The prompt/question 41 | :type prompt: {str, bytes} 42 | 43 | :returns: None 44 | :rtype: None 45 | """ 46 | 47 | # set timeout flag 48 | self.msg_received = Status.INIT 49 | # Check for publishers on available topic and quit if none available 50 | if isinstance(prompt, bytes): 51 | if not self.count_subscribers(self.audio_trigger) > 0: 52 | self.get_logger().info( 53 | f"No one is listening to {self.audio_trigger}, so I am timing out" 54 | ) 55 | self.timer = self.create_timer(0, self.timer_callback) 56 | return None 57 | msg = ByteMultiArray() 58 | msg.data = prompt 59 | self.audio_publisher.publish(msg) 60 | self.get_logger().info(f"Publishing to {self.audio_trigger}") 61 | else: 62 | if not self.count_subscribers(self.text_trigger) > 0: 63 | self.get_logger().info( 64 | f"No one is listening to {self.text_trigger}, so I am timing out" 65 | ) 66 | self.timer = self.create_timer(0, self.timer_callback) 67 | return None 68 | # Create and publish message 69 | msg = String() 70 | msg.data = prompt 71 | self.text_publisher.publish(msg) 72 | self.get_logger().info(f"Publishing to {self.text_trigger}") 73 | 74 | self.get_logger().info("Now listening..") 75 | 76 | def listener_callback(self, msg: Union[String, ByteMultiArray]) -> None: 77 | """ 78 | Listener callback 79 | 80 | :param msg: The message 81 | :type msg: {ROS Message} 82 | """ 83 | if isinstance(msg, String): 84 | self.msg_received = Status.RECEIVED_TEXT 85 | self.get_logger().info(f"A: {msg.data}") 86 | self.msg = msg.data 87 | elif isinstance(msg, ByteMultiArray): 88 | self.msg_received = Status.RECEIVED_AUDIO 89 | self.get_logger().info("A: Audio bytes") 90 | self.msg = b"".join(msg.data) 91 | else: 92 | self.get_logger().error( 93 | "Something went wrong. 
Received message is neither String nor ByteMultiArray" 94 | ) 95 | 96 | def timer_callback(self): 97 | """ 98 | Timer Callback just for destroying the time and end node spin_once 99 | """ 100 | # the timer should be destroyed once utilized 101 | self.destroy_timer(self.timer) 102 | self.msg_received = Status.TIMEOUT 103 | 104 | def set_trigger(self, text_trigger: str, audio_trigger: str): 105 | """ 106 | Set topic to send messages to 107 | """ 108 | if hasattr(self, "text_publisher"): 109 | self.destroy_publisher(self.text_publisher) 110 | self.text_trigger = text_trigger 111 | self.text_publisher = self.create_publisher(String, self.text_trigger, 1) 112 | 113 | if hasattr(self, "audio_publisher"): 114 | self.destroy_publisher(self.audio_publisher) 115 | self.audio_trigger = audio_trigger 116 | self.audio_publisher = self.create_publisher( 117 | ByteMultiArray, self.audio_trigger, 1 118 | ) 119 | 120 | def set_target(self, text_target: str, audio_target: str): 121 | """ 122 | Set topic to receive messages from 123 | """ 124 | if hasattr(self, "text_subscription"): 125 | self.destroy_subscription(self.text_subscription) 126 | self.text_target = text_target 127 | self.text_subscription = self.create_subscription( 128 | String, self.text_target, self.listener_callback, 1 129 | ) 130 | 131 | if hasattr(self, "audio_subscription"): 132 | self.destroy_subscription(self.audio_subscription) 133 | self.audio_target = audio_target 134 | self.audio_subscription = self.create_subscription( 135 | ByteMultiArray, self.audio_target, self.listener_callback, 1 136 | ) 137 | 138 | 139 | @cl.on_chat_start 140 | async def on_chat_start(): 141 | """ 142 | On chat start, specify default settings 143 | """ 144 | # Init rclpy 145 | if not rclpy.ok(): 146 | rclpy.init() 147 | await cl.ChatSettings([ 148 | TextInput( 149 | id="text_trigger", 150 | label="String topic to send message to", 151 | initial="text0", 152 | ), 153 | TextInput( 154 | id="text_target", 155 | label="String topic to listen to for response", 156 | initial="text1", 157 | ), 158 | TextInput( 159 | id="audio_trigger", 160 | label="Audio topic to send message to", 161 | initial="audio0", 162 | ), 163 | TextInput( 164 | id="audio_target", 165 | label="Audio topic to listen to for response", 166 | initial="audio1", 167 | ), 168 | TextInput(id="timeout", label="Timeout (sec)", initial="30"), 169 | ]).send() 170 | cl.user_session.set("timeout", 30) 171 | client: ClientNode = ClientNode() 172 | cl.user_session.set("client", client) 173 | await cl.Message( 174 | content="Welcome to Leibniz ROS client. Set the input/output topics in settings. Then type your message or press `P` to send audio!" 175 | ).send() 176 | 177 | 178 | @cl.on_settings_update 179 | async def setup_ros_node(settings): 180 | """ 181 | On settings update, update nodes 182 | """ 183 | client: ClientNode = cl.user_session.get("client") 184 | client.set_trigger(settings["text_trigger"], settings["audio_trigger"]) 185 | client.set_target(settings["text_target"], settings["audio_target"]) 186 | if not settings["timeout"].isdigit(): 187 | return 188 | cl.user_session.set("timeout", int(settings["timeout"])) 189 | 190 | 191 | @cl.step(type="run") 192 | def publish_on_ros(msg: Union[str, bytes]): 193 | """Publish input to the ROS Client node. 
194 | :param msg: 195 | :type msg: Union[str, bytes] 196 | """ 197 | timeout: int = cl.user_session.get("timeout") 198 | client: ClientNode = cl.user_session.get("client") 199 | client.publish(msg) 200 | rclpy.spin_once(client, timeout_sec=timeout) 201 | 202 | 203 | @cl.step(type="run") 204 | async def handle_output(msg_type: type): 205 | """Handle Output from the ROS Client node. 206 | :param msg_type: 207 | :type msg_type: type 208 | """ 209 | client: ClientNode = cl.user_session.get("client") 210 | if client.msg_received is Status.INIT: 211 | await cl.Message( 212 | content=f"I did not receive a message on **{client.text_target}** or **{client.audio_target}**. Timedout.", 213 | ).send() 214 | elif client.msg_received is Status.RECEIVED_TEXT: 215 | await cl.Message( 216 | content=f"{client.msg}", 217 | ).send() 218 | elif client.msg_received is Status.RECEIVED_AUDIO: 219 | output_audio_el = cl.Audio(content=client.msg, name="Response Audio") 220 | await cl.Message( 221 | author="Robot", 222 | type="assistant_message", 223 | content="", 224 | elements=[output_audio_el], 225 | ).send() 226 | else: 227 | trig = client.audio_trigger if msg_type is bytes else client.text_trigger 228 | await cl.Message( 229 | content=f"There is no one listening on **{trig}**. Is this the correct topic. If not, set the correct trigger and response topics in the settings.", 230 | ).send() 231 | 232 | 233 | @cl.on_message 234 | async def on_message(msg: cl.Message): 235 | """ 236 | On message, handle text message 237 | """ 238 | publish_on_ros(msg.content) 239 | await handle_output(type(msg)) 240 | 241 | 242 | @cl.on_audio_chunk 243 | async def on_audio_chunk(chunk: cl.AudioChunk): 244 | """Receive audio chunks 245 | :param chunk: 246 | :type chunk: cl.AudioChunk 247 | """ 248 | if chunk.isStart: 249 | # Initialize new audio buffer 250 | buffer = BytesIO() 251 | buffer.name = "input_audio" 252 | cl.user_session.set("audio_buffer", buffer) 253 | cl.user_session.set("audio_mime_type", chunk.mimeType) 254 | 255 | # write chunks to buffer 256 | cl.user_session.get("audio_buffer").write(chunk.data) 257 | 258 | 259 | @cl.on_audio_end 260 | async def on_audio_end(elements: List[ElementBased]): 261 | """Publish audio to the topic. 262 | :param elements: 263 | :type elements: list[ElementBased] 264 | """ 265 | audio_buffer: BytesIO = cl.user_session.get("audio_buffer") 266 | audio_buffer.seek(0) 267 | audio_mime_type: str = cl.user_session.get("audio_mime_type") 268 | audio_bytes = audio_buffer.read() 269 | 270 | # Add users audio to the chat 271 | input_audio_el = cl.Audio( 272 | mime=audio_mime_type, content=audio_bytes, name="User Audio" 273 | ) 274 | await cl.Message( 275 | author="User", 276 | type="user_message", 277 | content="", 278 | elements=[input_audio_el, *elements], 279 | ).send() 280 | 281 | # publish using ROS client 282 | publish_on_ros(audio_bytes) 283 | await handle_output(type(audio_bytes)) 284 | 285 | 286 | @cl.on_chat_end 287 | async def on_chat_end(): 288 | """ 289 | On chat end destroy client nodes 290 | """ 291 | if rclpy.ok(): 292 | client: ClientNode = cl.user_session.get("client") 293 | client.destroy_node() 294 | rclpy.shutdown() 295 | -------------------------------------------------------------------------------- /agents/scripts/chainlit_client/chainlit.md: -------------------------------------------------------------------------------- 1 | # Tiny Web Client for ROS Agents 2 | 3 | This client is based on chainlit. In order to use it, run the following in order. 
4 | 5 | `pip install chainlit` 6 | 7 | `ros2 run automatika_embodied_agents tiny_web_client` 8 | 9 | The client displays a web UI on **localhost:8000**. Open this link from browser. 10 | 11 | ROS input and output topic settings for text and audio topics can be configured from the web UI by pressing the settings icon. 12 | -------------------------------------------------------------------------------- /agents/scripts/chainlit_client/tiny_web_client: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from pathlib import Path 3 | from chainlit.cli import run_chainlit 4 | from chainlit.config import config 5 | 6 | 7 | def main(): 8 | """Run from ROS""" 9 | # TODO: Add chainlit option handling via ROS 10 | 11 | root_path = Path(__file__).parent / Path("app.py") 12 | 13 | # Set general config options 14 | config.run.headless = True 15 | config.project.enable_telemetry = False 16 | config.root = str(root_path.parent) 17 | 18 | # Set audio config options 19 | config.features.audio.sample_rate = 16000 # type: ignore 20 | config.features.audio.initial_silence_timeout = 2000 # type: ignore 21 | config.features.audio.silence_timeout = 1000 # type: ignore 22 | 23 | run_chainlit(str(root_path)) 24 | 25 | 26 | if __name__ == "__main__": 27 | main() 28 | -------------------------------------------------------------------------------- /agents/scripts/executable: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import json 3 | import argparse 4 | from typing import List, Dict, Union 5 | 6 | import rclpy 7 | import setproctitle 8 | from rclpy.executors import MultiThreadedExecutor 9 | from rclpy.utilities import try_shutdown 10 | 11 | from agents import config as all_configs 12 | from agents import components as all_components 13 | from agents import clients 14 | from agents.ros import Topic, FixedInput, MapLayer, Route 15 | 16 | 17 | def _parse_args() -> tuple[argparse.Namespace, List[str]]: 18 | """Parse arguments.""" 19 | parser = argparse.ArgumentParser(description="Component Executable Config") 20 | parser.add_argument( 21 | "--config_type", type=str, help="Component configuration class name" 22 | ) 23 | parser.add_argument("--component_type", type=str, help="Component class name") 24 | parser.add_argument( 25 | "--node_name", 26 | type=str, 27 | help="Component ROS2 node name", 28 | ) 29 | parser.add_argument("--config", type=str, help="Component configuration object") 30 | parser.add_argument( 31 | "--inputs", 32 | type=str, 33 | help="Component input topics", 34 | ) 35 | parser.add_argument( 36 | "--outputs", 37 | type=str, 38 | help="Component output topics", 39 | ) 40 | parser.add_argument( 41 | "--routes", 42 | type=str, 43 | help="Semantic router routes", 44 | ) 45 | parser.add_argument( 46 | "--layers", 47 | type=str, 48 | help="Map Encoding layers", 49 | ) 50 | parser.add_argument( 51 | "--trigger", 52 | type=str, 53 | help="Component trigger", 54 | ) 55 | parser.add_argument( 56 | "--model_client", 57 | type=str, 58 | help="Model Client", 59 | ) 60 | parser.add_argument( 61 | "--db_client", 62 | type=str, 63 | help="DB Client", 64 | ) 65 | parser.add_argument( 66 | "--config_file", type=str, help="Path to configuration YAML file" 67 | ) 68 | parser.add_argument( 69 | "--events", type=str, help="Events to be monitored by the component" 70 | ) 71 | parser.add_argument( 72 | "--actions", type=str, help="Actions associated with the component Events" 73 | ) 74 | 
parser.add_argument( 75 | "--external_processors", 76 | type=str, 77 | help="External processors associated with the component input and output topics", 78 | ) 79 | return parser.parse_known_args() 80 | 81 | 82 | def _parse_component_config( 83 | args: argparse.Namespace, 84 | ) -> all_configs.BaseComponentConfig: 85 | """Parse the component config object 86 | 87 | :param args: Command line arguments 88 | :type args: argparse.Namespace 89 | 90 | :return: Component config object 91 | :rtype: object 92 | """ 93 | config_type = args.config_type or None 94 | if not config_type: 95 | raise ValueError("config_type must be provided") 96 | 97 | # Get config type and update from json arg 98 | config_class = getattr(all_configs, config_type) 99 | if not config_class: 100 | raise TypeError( 101 | f"Unknown config_type '{config_type}'. Known types are {all_configs.__all__}" 102 | ) 103 | 104 | config = config_class(**json.loads(args.config)) 105 | 106 | return config 107 | 108 | 109 | def _parse_trigger(trigger_str: str) -> Union[Topic, List[Topic], float]: 110 | """Parse component trigger json string 111 | 112 | :param trigger_str: Trigger JSON string 113 | :type trigger_str: str 114 | 115 | :return: Trigger topics or float 116 | :rtype: Topic | List[Topic] | float 117 | """ 118 | trigger_json = json.loads(trigger_str) 119 | if isinstance(trigger_json, List): 120 | return [Topic(**json.loads(t)) for t in trigger_json] 121 | elif isinstance(trigger_json, Dict): 122 | return Topic(**trigger_json) 123 | else: 124 | # return float 125 | return trigger_json 126 | 127 | 128 | def _deserialize_topics(serialized_topics: str) -> List[Dict]: 129 | list_of_str = json.loads(serialized_topics) 130 | return [json.loads(t) for t in list_of_str] 131 | 132 | 133 | def _parse_ros_args(args_names: List[str]) -> List[str]: 134 | """Parse ROS arguments from command line arguments 135 | 136 | :param args_names: List of all parsed arguments 137 | :type args_names: list[str] 138 | 139 | :return: List ROS parsed arguments 140 | :rtype: list[str] 141 | """ 142 | # Look for --ros-args in ros_args 143 | ros_args_start = None 144 | if "--ros-args" in args_names: 145 | ros_args_start = args_names.index("--ros-args") 146 | 147 | if ros_args_start is not None: 148 | ros_specific_args = args_names[ros_args_start:] 149 | else: 150 | ros_specific_args = [] 151 | return ros_specific_args 152 | 153 | 154 | def main(): 155 | """Executable main function to run a component as a ROS2 node in a new process. 156 | Used to start a node using ROS Sugar Launcher. Extends functionality from ROS Sugar 157 | 158 | :param list_of_components: List of all known Component classes in the package 159 | :type list_of_components: List[Type] 160 | :param list_of_configs: List of all known ComponentConfig classes in the package 161 | :type list_of_configs: List[Type] 162 | :raises ValueError: If component or component config are unknown classes 163 | :raises ValueError: If component cannot be started with provided arguments 164 | """ 165 | args, args_names = _parse_args() 166 | 167 | # Initialize rclpy with the ros-specific arguments 168 | rclpy.init(args=_parse_ros_args(args_names)) 169 | 170 | component_type = args.component_type or None 171 | 172 | if not component_type: 173 | raise ValueError("Cannot launch without providing a component_type") 174 | 175 | comp_class = getattr(all_components, component_type) 176 | 177 | if not comp_class: 178 | raise ValueError( 179 | f"Cannot launch unknown component type '{component_type}'. 
Known types are: '{all_components.__all__}'" 180 | ) 181 | 182 | # Get name 183 | component_name = args.node_name or None 184 | 185 | if not component_name: 186 | raise ValueError("Cannot launch component without specifying a name") 187 | 188 | # SET PROCESS NAME 189 | setproctitle.setproctitle(component_name) 190 | 191 | config = _parse_component_config(args) 192 | 193 | # Get Yaml config file if provided 194 | config_file = args.config_file or None 195 | 196 | # Get inputs/outputs/layers/routes 197 | inputs = ( 198 | [ 199 | FixedInput(**i) if i.get("fixed") else Topic(**i) 200 | for i in _deserialize_topics(args.inputs) 201 | ] 202 | if args.inputs 203 | else None 204 | ) 205 | outputs = ( 206 | [Topic(**o) for o in _deserialize_topics(args.outputs)] 207 | if args.outputs 208 | else None 209 | ) 210 | layers = ( 211 | [MapLayer(**i) for i in _deserialize_topics(args.layers)] 212 | if args.layers 213 | else None 214 | ) 215 | routes = ( 216 | [Route(**r) for r in _deserialize_topics(args.routes)] if args.routes else None 217 | ) 218 | 219 | # Get triggers 220 | trigger = _parse_trigger(args.trigger) 221 | 222 | # Init the component 223 | # Semantic Router Component 224 | if component_type == all_components.SemanticRouter.__name__: 225 | db_client_json = json.loads(args.db_client) 226 | db_client = getattr(clients, db_client_json["client_type"])(**db_client_json) 227 | component = comp_class( 228 | inputs=inputs, 229 | routes=routes, 230 | db_client=db_client, 231 | config=config, 232 | default_route=config._default_route, 233 | component_name=component_name, 234 | config_file=config_file, 235 | ) 236 | # Map Encoding Component 237 | elif component_type == all_components.MapEncoding.__name__: 238 | db_client_json = json.loads(args.db_client) 239 | db_client = getattr(clients, db_client_json["client_type"])(**db_client_json) 240 | component = comp_class( 241 | layers=layers, 242 | position=config._position, 243 | map_topic=config._map_topic, 244 | db_client=db_client, 245 | config=config, 246 | trigger=trigger, 247 | component_name=component_name, 248 | config_file=config_file, 249 | ) 250 | 251 | # All other components 252 | else: 253 | if args.model_client: 254 | model_client_json = json.loads(args.model_client) 255 | model_client = getattr(clients, model_client_json["client_type"])( 256 | **model_client_json 257 | ) 258 | else: 259 | model_client = None 260 | if args.db_client: 261 | db_client_json = json.loads(args.db_client) 262 | db_client = getattr(clients, db_client_json["client_type"])( 263 | **db_client_json 264 | ) 265 | else: 266 | db_client = None 267 | 268 | component = comp_class( 269 | inputs=inputs, 270 | outputs=outputs, 271 | model_client=model_client, 272 | db_client=db_client, 273 | trigger=trigger, 274 | config=config, 275 | component_name=component_name, 276 | config_file=config_file, 277 | ) 278 | 279 | # Init the node with rclpy 280 | component.rclpy_init_node() 281 | 282 | # Set events/actions 283 | events_json = args.events or None 284 | actions_json = args.actions or None 285 | 286 | if events_json and actions_json: 287 | component._events_json = events_json 288 | component._actions_json = actions_json 289 | 290 | # Set external processors 291 | external_processors = args.external_processors or None 292 | if external_processors: 293 | component._external_processors_json = external_processors 294 | 295 | executor = MultiThreadedExecutor() 296 | 297 | executor.add_node(component) 298 | 299 | try: 300 | executor.spin() 301 | 302 | except KeyboardInterrupt: 303 | 
pass 304 | 305 | finally: 306 | executor.remove_node(component) 307 | try_shutdown() 308 | 309 | 310 | if __name__ == "__main__": 311 | main() 312 | -------------------------------------------------------------------------------- /agents/tests/test_clients.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import subprocess 4 | import shutil 5 | 6 | import cv2 7 | import pytest 8 | from agents.models import Idefics2, OllamaModel 9 | from agents.vectordbs import ChromaDB 10 | from agents.clients.roboml import ( 11 | HTTPModelClient, 12 | HTTPDBClient, 13 | RESPDBClient, 14 | RESPModelClient, 15 | ) 16 | from agents.clients.ollama import OllamaClient 17 | 18 | HOST = "http://localhost" 19 | RAY_PORT = 8000 20 | RESP_PORT = 6379 21 | 22 | 23 | @pytest.fixture(scope="class") 24 | def http_clients(): 25 | """Fixture to run roboml ray and make its clients before tests are run""" 26 | 27 | # start server 28 | p = subprocess.Popen(["roboml"]) 29 | # give it 20 seconds to start before sending request 30 | time.sleep(20) 31 | model = Idefics2(name="idefics") 32 | model_client = HTTPModelClient(model, port=RAY_PORT, logging_level="debug") 33 | db = ChromaDB(name="chroma", db_location="./http_data") 34 | db_client = HTTPDBClient(db, port=RAY_PORT, logging_level="debug") 35 | 36 | yield {"model": model_client, "db": db_client} 37 | 38 | # terminate server process - kill to remove ray monitoring child 39 | p.kill() 40 | shutil.rmtree("./http_data") 41 | 42 | 43 | @pytest.fixture(scope="class") 44 | def resp_clients(): 45 | """Fixture to run roboml-resp and make its clients before tests are run""" 46 | 47 | # start server 48 | p = subprocess.Popen(["roboml-resp"]) 49 | # give it 20 seconds to start before sending request 50 | time.sleep(20) 51 | model = Idefics2(name="idefics") 52 | model_client = RESPModelClient(model, logging_level="debug") 53 | db = ChromaDB(name="chroma", db_location="./resp_data") 54 | db_client = RESPDBClient(db, logging_level="debug") 55 | 56 | yield {"model": model_client, "db": db_client} 57 | 58 | # terminate server process 59 | p.terminate() 60 | shutil.rmtree("./resp_data") 61 | 62 | 63 | @pytest.fixture(scope="class") 64 | def ollama_client(): 65 | """Fixture to create client ollama before tests are run""" 66 | 67 | model = OllamaModel(name="llava", checkpoint="llava") 68 | ollama_client = OllamaClient(model, logging_level="debug") 69 | yield ollama_client 70 | 71 | 72 | @pytest.fixture 73 | def loaded_img(): 74 | """Fixture to load test image""" 75 | return cv2.imread("agents/resources/test.jpeg", cv2.COLOR_BGR2RGB) 76 | 77 | 78 | @pytest.fixture 79 | def data(): 80 | return { 81 | "ids": ["a"], 82 | "metadatas": [{"something": "about a"}], 83 | "documents": ["description of a"], 84 | "collection_name": "alphabets", 85 | } 86 | 87 | 88 | class TestRobomlHTTPClient: 89 | """ 90 | Test roboml http client 91 | """ 92 | 93 | def test_model_init(self, http_clients): 94 | """ 95 | Test roboml http model client init 96 | """ 97 | try: 98 | http_clients["model"].check_connection() 99 | except Exception: 100 | logging.error( 101 | "Make sure roboml is installed on this machine before running these tests. 
roboml can be installed with `pip install roboml`" 102 | ) 103 | raise 104 | http_clients["model"].initialize() 105 | 106 | def test_model_inference(self, http_clients, loaded_img): 107 | """ 108 | Test roboml http model client inference 109 | """ 110 | inference_input = {"query": "What do you see?", "images": [loaded_img]} 111 | result = http_clients["model"].inference(inference_input) 112 | assert result is not None 113 | assert result["output"] is not None 114 | logging.info(result["output"]) 115 | 116 | def test_model_deinit(self, http_clients): 117 | """ 118 | Test roboml http model client deinit 119 | """ 120 | http_clients["model"].deinitialize() 121 | 122 | def test_db_init(self, http_clients): 123 | """ 124 | Test roboml http db client init 125 | """ 126 | http_clients["db"].check_connection() 127 | http_clients["db"].initialize() 128 | 129 | def test_db_add(self, http_clients, data): 130 | """ 131 | Test roboml http db client add 132 | """ 133 | result = http_clients["db"].add(data) 134 | assert result is not None 135 | assert result["output"] is not None 136 | logging.info(result["output"]) 137 | 138 | def test_db_conditional_add(self, http_clients, data): 139 | """ 140 | Test roboml http db client conditional add 141 | """ 142 | result = http_clients["db"].conditional_add(data) 143 | assert result is not None 144 | assert result["output"] is not None 145 | logging.info(result["output"]) 146 | 147 | def test_db_metadata_query(self, http_clients, data): 148 | """ 149 | Test roboml http db client metadata query 150 | """ 151 | metadata_query = { 152 | "metadatas": data["metadatas"], 153 | "collection_name": data["collection_name"], 154 | } 155 | result = http_clients["db"].metadata_query(metadata_query) 156 | assert result is not None 157 | assert result["output"] is not None 158 | logging.info(result["output"]) 159 | 160 | def test_db_query(self, http_clients, data): 161 | """ 162 | Test roboml http db client query 163 | """ 164 | metadata_query = { 165 | "query": "what is a", 166 | "collection_name": data["collection_name"], 167 | } 168 | result = http_clients["db"].query(metadata_query) 169 | assert result is not None 170 | assert result["output"] is not None 171 | logging.info(result["output"]) 172 | 173 | def test_db_deinit(self, http_clients): 174 | """ 175 | Test roboml http db client deinit 176 | """ 177 | http_clients["db"].deinitialize() 178 | 179 | 180 | class TestRobomlRESPClient: 181 | """ 182 | Test roboml resp client 183 | """ 184 | 185 | def test_model_init(self, resp_clients): 186 | """ 187 | Test roboml resp model client init 188 | """ 189 | try: 190 | resp_clients["model"].check_connection() 191 | except Exception: 192 | logging.error( 193 | "Make sure roboml is installed on this machine before running these tests. 
roboml can be installed with `pip install roboml`" 194 | ) 195 | raise 196 | resp_clients["model"].initialize() 197 | 198 | def test_model_inference(self, resp_clients, loaded_img): 199 | """ 200 | Test roboml resp model client inference 201 | """ 202 | inference_input = {"query": "What do you see?", "images": [loaded_img]} 203 | result = resp_clients["model"].inference(inference_input) 204 | assert result is not None 205 | assert result["output"] is not None 206 | logging.info(result["output"]) 207 | 208 | def test_model_deinit(self, resp_clients): 209 | """ 210 | Test roboml resp model client deinit 211 | """ 212 | resp_clients["model"].deinitialize() 213 | 214 | def test_db_init(self, resp_clients): 215 | """ 216 | Test roboml resp db client init 217 | """ 218 | resp_clients["db"].check_connection() 219 | resp_clients["db"].initialize() 220 | 221 | def test_db_add(self, resp_clients, data): 222 | """ 223 | Test roboml resp db client add 224 | """ 225 | result = resp_clients["db"].add(data) 226 | assert result is not None 227 | assert result["output"] is not None 228 | logging.info(result["output"]) 229 | 230 | def test_db_conditional_add(self, resp_clients, data): 231 | """ 232 | Test roboml resp db client conditional add 233 | """ 234 | result = resp_clients["db"].conditional_add(data) 235 | assert result is not None 236 | assert result["output"] is not None 237 | logging.info(result["output"]) 238 | 239 | def test_db_metadata_query(self, resp_clients, data): 240 | """ 241 | Test roboml resp db client metadata query 242 | """ 243 | metadata_query = { 244 | "metadatas": data["metadatas"], 245 | "collection_name": data["collection_name"], 246 | } 247 | result = resp_clients["db"].metadata_query(metadata_query) 248 | assert result is not None 249 | assert result["output"] is not None 250 | logging.info(result["output"]) 251 | 252 | def test_db_query(self, resp_clients, data): 253 | """ 254 | Test roboml resp db client query 255 | """ 256 | metadata_query = { 257 | "query": "what is a", 258 | "collection_name": data["collection_name"], 259 | } 260 | result = resp_clients["db"].query(metadata_query) 261 | assert result is not None 262 | assert result["output"] is not None 263 | logging.info(result["output"]) 264 | 265 | def test_db_deinit(self, resp_clients): 266 | """ 267 | Test roboml resp db client deinit 268 | """ 269 | resp_clients["db"].deinitialize() 270 | 271 | 272 | class TestOllamaClient: 273 | """ 274 | Test ollama client 275 | """ 276 | 277 | def test_model_init(self, ollama_client): 278 | """ 279 | Test ollama model client init 280 | """ 281 | try: 282 | ollama_client.check_connection() 283 | except Exception: 284 | logging.error( 285 | "Make sure Ollama is installed on this machine before running these tests. Visit https://ollama.com for installation instructions." 
286 | ) 287 | raise 288 | ollama_client.initialize() 289 | 290 | def test_model_inference(self, ollama_client, loaded_img): 291 | """ 292 | Test ollama model client inference 293 | """ 294 | inference_input = {"query": "What do you see?", "images": [loaded_img]} 295 | result = ollama_client.inference(inference_input) 296 | assert result is not None 297 | assert result["output"] is not None 298 | logging.info(result["output"]) 299 | 300 | def test_model_deinit(self, ollama_client): 301 | """ 302 | Test ollama model client deinit 303 | """ 304 | ollama_client.deinitialize() 305 | -------------------------------------------------------------------------------- /docs/_static/ROS_AGENTS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/docs/_static/ROS_AGENTS.png -------------------------------------------------------------------------------- /docs/_static/ROS_AGENTS_DARK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/docs/_static/ROS_AGENTS_DARK.png -------------------------------------------------------------------------------- /docs/_static/automatika-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/docs/_static/automatika-logo.png -------------------------------------------------------------------------------- /docs/_static/complete_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/docs/_static/complete_dark.png -------------------------------------------------------------------------------- /docs/_static/complete_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/docs/_static/complete_light.png -------------------------------------------------------------------------------- /docs/basics.md: -------------------------------------------------------------------------------- 1 | # Basic Concepts 📚 2 | 3 | The following is an overview of basic building blocks of ROS Agents. You can follow the links in each subsection to dig deeper. 4 | 5 | ## Component 6 | 7 | A Component is the main execution unit in ROS Agents and in essence each component is synctactic sugar over a ROS2 Lifecycle Node. All the functionalities implemented in ROS2 nodes can be found in the component. Components take a single Topic or a list of Topics as inputs and ouputs. Depending on the components functionality, certain types of Topics might be mandatory. 8 | 9 | ```{note} 10 | To learn more about components, checkout [ROS Sugar Documentation](https://automatika-robotics.github.io/ros-sugar/). 11 | ``` 12 | 13 | ### Components Available in ROS Agents 14 | 15 | ROS Agents provides various ready to use components. You can see their details [here](apidocs/agents/agents.components). 16 | 17 | ### Component Config 18 | 19 | Each component can take in an optional config. 
Configs are generally [attrs](https://www.attrs.org/en/stable/) classes, and for components that use ML models, configs are also the place where inference parameters are defined. You can see the default options for the configs of each available component [here](apidocs/agents/agents.config). 20 | 21 | ### Component RunType 22 | 23 | In ROS Agents, components can be of the following two types: 24 | 25 | ```{list-table} 26 | :widths: 10 80 27 | * - **Timed** 28 | - Execute the main execution function in a timed loop. 29 | * - **Event** 30 | - Execute the main execution function based on a trigger topic/event. 31 | ``` 32 | 33 | ### Health Check and Fallback 34 | 35 | Each component maintains a health status, based on which one can configure various fallback options for the component, allowing it to recover from failures or shut down gracefully. This aspect can be significant in embodied autonomous agents, not just in terms of safety but for generally coherent and reliable performance. To learn more about these topics, check out the [ROS Sugar Documentation](https://automatika-robotics.github.io/ros-sugar/). 36 | 37 | ## Topic 38 | 39 | A [topic](apidocs/agents/agents.ros) is an idiomatic wrapper for a ROS2 topic. Topics can be given as inputs or outputs to components. When given as inputs, components automatically create listeners for the topics upon their activation. When given as outputs, components create publishers for publishing to the topic. Each topic has a name and a data type, defining its listening callback and publishing behavior. The data type can be provided to the topic as a string. Check out the list of supported data types [here](https://automatika-robotics.github.io/ros-sugar/advanced/types.html). 40 | 41 | ```{note} 42 | Learn more about Topics in [ROS Sugar](https://automatika-robotics.github.io/ros-sugar/). 43 | ``` 44 | 45 | ## Model/DB Client 46 | 47 | Certain components in ROS Agents deal with ML models, vector DBs or both. These components take in a model or DB client as one of their initialization parameters. The reason for this separate abstraction is to enforce _separation of concerns_. An ML model can be running on the edge hardware itself, on a powerful compute node in the network, or in the cloud; the components running on the robot can always use the model (or DB) via a client in a standardized way. This also makes the components independent of the model serving platforms, which can implement various inference optimizations that are usually model specific. Thus one can choose an ML serving platform with the best latency/accuracy tradeoff, depending on the concerns of the application. 48 | 49 | All clients implement a connection check. ML clients must implement a model inference method and, optionally, model initialization and deinitialization methods (since an embodied agent can initialize different models, or fine-tuned versions of the same model, for the same component, depending on some event in the environment). Similarly, vector DB clients implement standard CRUD methods for vector DBs. Check out the list of available clients [here](apidocs/agents/agents.clients). 50 | 51 | ## Models/DBs 52 | 53 | The clients we mentioned above take as input a model or vector database specification. These are in the form of [attrs](https://www.attrs.org/en/stable/) classes and define initialization parameters, such as quantization for ML models or the choice of encoding model for vector DBs, among others.
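As a minimal sketch (using the same model, database and client classes that appear in the examples later in this documentation), a specification is simply instantiated and handed to a client:

```python
from agents.models import OllamaModel
from agents.vectordbs import ChromaDB
from agents.clients.ollama import OllamaClient
from agents.clients.roboml import HTTPDBClient

# a model specification served via Ollama (checkpoint names a model available on Ollama)
llava = OllamaModel(name="llava", checkpoint="llava")
llava_client = OllamaClient(llava)

# a vector DB specification served via RoboML over HTTP
chroma = ChromaDB(name="MainDB")
chroma_client = HTTPDBClient(db=chroma)
```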
The available models and databases that can be instantiated on a particular model serving platform usually depend on the platform itself. However, with these model and vector DB specifications, we aim to standardize the model initialization specifications across platforms. Check the list of [models](apidocs/agents/agents.models) and [vector DBs](apidocs/agents/agents.vectordbs) that are available. 54 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | import os 3 | import sys 4 | from datetime import date 5 | import xml.etree.ElementTree as ET 6 | 7 | sys.path.insert(0, os.path.abspath("..")) 8 | version = ET.parse("../agents/package.xml").getroot()[1].text 9 | print("Found version:", version) 10 | 11 | project = "ROS Agents" 12 | copyright = f"{date.today().year}, Automatika Robotics" 13 | author = "Automatika Robotics" 14 | release = version 15 | 16 | extensions = [ 17 | "sphinx.ext.viewcode", 18 | "sphinx.ext.doctest", 19 | "sphinx_copybutton", # install with `pip install sphinx-copybutton` 20 | "autodoc2", # install with `pip install sphinx-autodoc2` 21 | "myst_parser", # install with `pip install myst-parser` 22 | ] 23 | 24 | autodoc2_packages = [ 25 | { 26 | "module": "agents", 27 | "path": "../agents/agents", 28 | "exclude_dirs": ["__pycache__", "utils"], 29 | "exclude_files": [ 30 | "callbacks.py", 31 | "publisher.py", 32 | "component_base.py", 33 | "model_component.py", 34 | "model_base.py", 35 | "db_base.py", 36 | "executable.py", 37 | ], 38 | }, 39 | ] 40 | 41 | autodoc2_docstrings = "all" 42 | autodoc2_class_docstring = "both" # bug in autodoc2, should be `merge` 43 | autodoc2_render_plugin = "myst" 44 | autodoc2_hidden_objects = ["private", "dunder", "undoc"] 45 | autodoc2_module_all_regexes = [ 46 | r"agents.config", 47 | r"agents.models", 48 | r"agents.vectordbs", 49 | r"agents.ros", 50 | r"agents.clients\.[^\.]+", 51 | ] 52 | 53 | templates_path = ["_templates"] 54 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 55 | 56 | myst_enable_extensions = [ 57 | "amsmath", 58 | "attrs_inline", 59 | "colon_fence", 60 | "deflist", 61 | "dollarmath", 62 | "fieldlist", 63 | "html_admonition", 64 | "html_image", 65 | "linkify", 66 | "replacements", 67 | "smartquotes", 68 | "strikethrough", 69 | "substitution", 70 | "tasklist", 71 | ] 72 | language = "en" 73 | myst_html_meta = { 74 | "google-site-verification": "cQVj-BaADcGVOGB7GOvfbkgJjxni10C2fYWCZ03jOeo" 75 | } 76 | 77 | 78 | html_theme = "sphinx_book_theme" # install with `pip install sphinx-book-theme` 79 | html_static_path = ["_static"] 80 | html_theme_options = { 81 | "logo": { 82 | "image_light": "_static/ROS_AGENTS_DARK.png", 83 | "image_dark": "_static/ROS_AGENTS.png", 84 | }, 85 | "icon_links": [ 86 | { 87 | "name": "Automatika", 88 | "url": "https://automatikarobotics.com/", 89 | "icon": "_static/automatika-logo.png", 90 | "type": "local", 91 | }, 92 | { 93 | "name": "GitHub", 94 | "url": "https://github.com/automatika-robotics/ros-agents", 95 | "icon": "fa-brands fa-github", 96 | }, 97 | { 98 | "name": "Discord", 99 | "url": "https://discord.gg/cAW3BWwt", 100 | "icon": "fa-brands fa-discord", 101 | }, 102 | ], 103 | "path_to_docs": "docs", 104 | "repository_url": "https://github.com/automatika-robotics/ros-agents", 105 | "repository_branch": "main", 106 | "use_source_button": True, 107 | "use_issues_button": True, 108 | 
"use_edit_page_button": True, 109 | "show_navbar_depth": 2, 110 | } 111 | -------------------------------------------------------------------------------- /docs/examples/complete.md: -------------------------------------------------------------------------------- 1 | # Bringing it all together 🤖 2 | 3 | In this example we will combine everything we implemented in the previous examples to create one big graph of components. Afterwards we will analyze what we have accomplished. Here is what the code looks like: 4 | 5 | ```python 6 | import numpy as np 7 | import json 8 | from typing import Optional 9 | from agents.components import MLLM, SpeechToText, TextToSpeech, LLM, Vision, MapEncoding, SemanticRouter 10 | from agents.config import SpeechToTextConfig, TextToSpeechConfig 11 | from agents.clients.roboml import HTTPModelClient, RESPModelClient, HTTPDBClient 12 | from agents.clients.ollama import OllamaClient 13 | from agents.models import Whisper, SpeechT5, Llava, Llama3_1, VisionModel 14 | from agents.vectordbs import ChromaDB 15 | from agents.config import VisionConfig, LLMConfig, MapConfig, SemanticRouterConfig 16 | from agents.ros import Topic, Launcher, FixedInput, MapLayer, Route 17 | 18 | 19 | ### Setup our models and vectordb ### 20 | whisper = Whisper(name="whisper") 21 | whisper_client = HTTPModelClient(whisper) 22 | speecht5 = SpeechT5(name="speecht5") 23 | speecht5_client = HTTPModelClient(speecht5) 24 | object_detection_model = VisionModel(name="dino_4scale", 25 | checkpoint="dino-4scale_r50_8xb2-12e_coco") 26 | detection_client = RESPModelClient(object_detection_model) 27 | llava = Llava(name="llava") 28 | llava_client = OllamaClient(llava) 29 | llama = Llama3_1(name="llama") 30 | llama_client = OllamaClient(llama) 31 | chroma = ChromaDB(name="MainDB") 32 | chroma_client = HTTPDBClient(db=chroma) 33 | 34 | ### Setup our components ### 35 | # Setup a speech to text component 36 | audio_in = Topic(name="audio0", msg_type="Audio") 37 | query_topic = Topic(name="question", msg_type="String") 38 | 39 | speech_to_text = SpeechToText( 40 | inputs=[audio_in], 41 | outputs=[query_topic], 42 | model_client=whisper_client, 43 | trigger=audio_in, 44 | config=SpeechToTextConfig(enable_vad=True), # option to always listen for speech through the microphone 45 | component_name="speech_to_text" 46 | ) 47 | 48 | # Setup a text to speech component 49 | query_answer = Topic(name="answer", msg_type="String") 50 | 51 | t2s_config = TextToSpeechConfig(play_on_device=True) 52 | 53 | text_to_speech = TextToSpeech( 54 | inputs=[query_answer], 55 | trigger=query_answer, 56 | model_client=speecht5_client, 57 | config=t2s_config, 58 | component_name="text_to_speech", 59 | ) 60 | 61 | # Setup a vision component for object detection 62 | image0 = Topic(name="image_raw", msg_type="Image") 63 | detections_topic = Topic(name="detections", msg_type="Detections") 64 | 65 | detection_config = VisionConfig(threshold=0.5) 66 | vision = Vision( 67 | inputs=[image0], 68 | outputs=[detections_topic], 69 | trigger=image0, 70 | config=detection_config, 71 | model_client=detection_client, 72 | component_name="object_detection", 73 | ) 74 | 75 | # Define a generic mllm component for vqa 76 | mllm_query = Topic(name="mllm_query", msg_type="String") 77 | 78 | mllm = MLLM( 79 | inputs=[mllm_query, image0, detections_topic], 80 | outputs=[query_answer], 81 | model_client=llava_client, 82 | trigger=mllm_query, 83 | component_name="visual_q_and_a" 84 | ) 85 | 86 | mllm.set_component_prompt( 87 | template="""Imagine you are a 
robot. 88 | This image has following items: {{ detections }}. 89 | Answer the following about this image: {{ text0 }}""" 90 | ) 91 | 92 | # Define a fixed input mllm component that does introspection 93 | introspection_query = FixedInput( 94 | name="introspection_query", msg_type="String", 95 | fixed="What kind of a room is this? Is it an office, a bedroom or a kitchen? Give a one word answer, out of the given choices") 96 | introspection_answer = Topic(name="introspection_answer", msg_type="String") 97 | 98 | introspector = MLLM( 99 | inputs=[introspection_query, image0], 100 | outputs=[introspection_answer], 101 | model_client=llava_client, 102 | trigger=15.0, 103 | component_name="introspector", 104 | ) 105 | 106 | 107 | def introspection_validation(output: str) -> Optional[str]: 108 | for option in ["office", "bedroom", "kitchen"]: 109 | if option in output.lower(): 110 | return option 111 | 112 | 113 | introspector.add_publisher_preprocessor(introspection_answer, introspection_validation) 114 | 115 | # Define a semantic map using MapEncoding component 116 | layer1 = MapLayer(subscribes_to=detections_topic, temporal_change=True) 117 | layer2 = MapLayer(subscribes_to=introspection_answer, resolution_multiple=3) 118 | 119 | position = Topic(name="odom", msg_type="Odometry") 120 | map_topic = Topic(name="map", msg_type="OccupancyGrid") 121 | 122 | map_conf = MapConfig(map_name="map") 123 | map = MapEncoding( 124 | layers=[layer1, layer2], 125 | position=position, 126 | map_topic=map_topic, 127 | config=map_conf, 128 | db_client=chroma_client, 129 | trigger=15.0, 130 | component_name="map_encoder" 131 | ) 132 | 133 | # Define a generic LLM component 134 | llm_query = Topic(name="llm_query", msg_type="String") 135 | 136 | llm = LLM( 137 | inputs=[llm_query], 138 | outputs=[query_answer], 139 | model_client=llama_client, 140 | trigger=[llm_query], 141 | component_name="general_q_and_a" 142 | ) 143 | 144 | # Define a Go-to-X component using LLM 145 | goto_query = Topic(name="goto_query", msg_type="String") 146 | goal_point = Topic(name="goal_point", msg_type="PoseStamped") 147 | 148 | goto_config = LLMConfig( 149 | enable_rag=True, 150 | collection_name="map", 151 | distance_func="l2", 152 | n_results=1, 153 | add_metadata=True, 154 | ) 155 | 156 | goto = LLM( 157 | inputs=[goto_query], 158 | outputs=[goal_point], 159 | model_client=llama_client, 160 | config=goto_config, 161 | db_client=chroma_client, 162 | trigger=goto_query, 163 | component_name="go_to_x", 164 | ) 165 | 166 | goto.set_component_prompt( 167 | template="""From the given metadata, extract coordinates and provide 168 | the coordinates in the following json format:\n {"position": coordinates}""" 169 | ) 170 | 171 | 172 | # pre-process the output before publishing to a topic of msg_type PoseStamped 173 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]: 174 | # extract the json part of the output string (including brackets) 175 | # one can use sophisticated regex parsing here but we'll keep it simple 176 | json_string = output[output.find("{") : output.rfind("}") + 1] 177 | # load the string as a json and extract position coordinates 178 | # if there is an error, return None, i.e. 
no output would be published to goal_point 179 | try: 180 | json_dict = json.loads(json_string) 181 | coordinates = np.fromstring(json_dict["position"], sep=',', dtype=np.float64) 182 | print('Coordinates Extracted:', coordinates) 183 | if coordinates.shape[0] < 2 or coordinates.shape[0] > 3: 184 | return 185 | elif coordinates.shape[0] == 2: # sometimes LLMs avoid adding the zeros of z-dimension 186 | coordinates = np.append(coordinates, 0) 187 | return coordinates 188 | except Exception: 189 | return 190 | 191 | 192 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point) 193 | 194 | # Define a semantic router between a generic LLM component, VQA MLLM component and Go-to-X component 195 | goto_route = Route(routes_to=goto_query, 196 | samples=["Go to the door", "Go to the kitchen", 197 | "Get me a glass", "Fetch a ball", "Go to hallway"]) 198 | 199 | llm_route = Route(routes_to=llm_query, 200 | samples=["What is the capital of France?", "Is there life on Mars?", 201 | "How many tablespoons in a cup?", "How are you today?", "Whats up?"]) 202 | 203 | mllm_route = Route(routes_to=mllm_query, 204 | samples=["Are we indoors or outdoors", "What do you see?", "Whats in front of you?", 205 | "Where are we", "Do you see any people?", "How many things are infront of you?", 206 | "Is this room occupied?"]) 207 | 208 | router_config = SemanticRouterConfig(router_name="go-to-router", distance_func="l2") 209 | # Initialize the router component 210 | router = SemanticRouter( 211 | inputs=[query_topic], 212 | routes=[llm_route, goto_route, mllm_route], 213 | default_route=llm_route, 214 | config=router_config, 215 | db_client=chroma_client, 216 | component_name='router' 217 | ) 218 | 219 | # Launch the components 220 | launcher = Launcher() 221 | launcher.add_pkg( 222 | components=[ 223 | mllm, 224 | llm, 225 | goto, 226 | introspector, 227 | map, 228 | router, 229 | speech_to_text, 230 | text_to_speech, 231 | vision 232 | ] 233 | ) 234 | launcher.bringup() 235 | ``` 236 | ```{note} 237 | Note how we use the same model for _general_q_and_a_ and _goto_to_x_ components. Similarly _visual_q_and_a_ and _introspector_ components share a multimodal LLM model. 238 | ``` 239 | 240 | In this small code block above, we have setup a fairly sophisticated embodied agent with the following capabilities. 241 | 242 | - A conversational interface using speech-to-text and text-to-speech models that uses the robots microphone and playback speaker. 243 | - The ability to answer contextual queries based on the robots camera, using an MLLM model. 244 | - The ability to answer generic queries, using an LLM model. 245 | - A semantic map of the robots observations, that acts as a spatio-temporal memory. 246 | - The ability to respond to Go-to-X commands utilizing the semantic map. 247 | - A single input interface that routes the input to different models based on its content. 
248 | 249 | We can visualize the complete graph in the following diagram: 250 | ```{figure} ../_static/complete_dark.png 251 | :class: only-dark 252 | :alt: Complete embodied agent 253 | :align: center 254 | Complete embodied agent graph 255 | ``` 256 | ```{figure} ../_static/complete_light.png 257 | :class: only-light 258 | :alt: Complete embodied agent 259 | :align: center 260 | Complete embodied agent graph 261 | ``` 262 | -------------------------------------------------------------------------------- /docs/examples/goto.md: -------------------------------------------------------------------------------- 1 | # Create a Go-to-X component using map data 2 | 3 | In the previous [example](semantic_map.md) we created a semantic map using the MapEncoding component. Intuitively one can imagine that using the map data would require some form of RAG. Let us suppose that we want to create a Go-to-X component which, when given a command like 'Go to the yellow door', would retrieve the coordinates of the _yellow door_ from the map and publish them to a goal point topic of type _PoseStamped_, to be handled by our robot's navigation system. We will create our Go-to-X component using the LLM component provided by ROS Agents. We will start by initializing the component and configuring it to use RAG. 4 | 5 | ## Initialize the component 6 | 7 | ```python 8 | from agents.components import LLM 9 | from agents.models import Llama3_1 10 | from agents.config import LLMConfig 11 | from agents.clients.ollama import OllamaClient 12 | from agents.ros import Topic 13 | 14 | # Start a Llama3.1 based llm component using ollama client 15 | llama = Llama3_1(name="llama") 16 | llama_client = OllamaClient(llama) 17 | 18 | # Define LLM input and output topics including goal_point topic of type PoseStamped 19 | goto_in = Topic(name="goto_in", msg_type="String") 20 | goal_point = Topic(name="goal_point", msg_type="PoseStamped") 21 | ``` 22 | 23 | In order to configure the component to use RAG, we will set the following options in its config. 24 | 25 | ```python 26 | config = LLMConfig(enable_rag=True, 27 | collection_name="map", 28 | distance_func="l2", 29 | n_results=1, 30 | add_metadata=True) 31 | ``` 32 | 33 | Note that the _collection_name_ parameter is the same as the map name we set in the previous [example](semantic_map.md). We have also set the _add_metadata_ parameter to true to make sure that our metadata is included in the RAG result, as the spatial coordinates we want to get are part of the metadata. Let us have a quick look at the metadata stored in the map by the MapEncoding component. 34 | 35 | ``` 36 | { 37 | "coordinates": [1.1, 2.2, 0.0], 38 | "layer_name": "Topic_Name", # same as topic name that the layer is subscribed to 39 | "timestamp": 1234567, 40 | "temporal_change": True 41 | } 42 | ``` 43 | 44 | With this information, we will first initialize our component. 45 | ```{caution} 46 | In the following code block we are using the same DB client that was set up in the previous [example](semantic_map.md).
47 | ``` 48 | 49 | ```python 50 | # initialize the component 51 | goto = LLM( 52 | inputs=[goto_in], 53 | outputs=[goal_point], 54 | model_client=llama_client, 55 | db_client=chroma_client, # check the previous example where we set up this database client 56 | trigger=goto_in, 57 | config=config, 58 | component_name='go_to_x' 59 | ) 60 | ``` 61 | 62 | ## Pre-process the model output before publishing 63 | 64 | Knowing that the output of retrieval will be appended to the beginning of our query as context, we will set up a component-level prompt for our LLM. 65 | 66 | ```python 67 | # set a component prompt 68 | goto.set_component_prompt( 69 | template="""From the given metadata, extract coordinates and provide 70 | the coordinates in the following json format:\n {"position": coordinates}""" 71 | ) 72 | ``` 73 | 74 | ```{note} 75 | One might notice that we have not used an input topic name in our prompt. This is because we only need the input topic to fetch data from the vector DB during the RAG step. The query to the LLM in this case would only be composed of the data fetched from the DB and our prompt. 76 | ``` 77 | 78 | As the LLM output will contain text other than the _json_ string that we have asked for, we need to add a pre-processing function to the output topic that extracts the required part of the text and returns the output in a format that can be published to a _PoseStamped_ topic, i.e. a numpy array of floats. 79 | 80 | ```python 81 | from typing import Optional 82 | import json 83 | import numpy as np 84 | 85 | # pre-process the output before publishing to a topic of msg_type PoseStamped 86 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]: 87 | # extract the json part of the output string (including brackets) 88 | # one can use sophisticated regex parsing here but we'll keep it simple 89 | json_string = output[output.find("{") : output.rfind("}") + 1] 90 | # load the string as a json and extract position coordinates 91 | # if there is an error, return None, i.e. no output would be published to goal_point 92 | try: 93 | json_dict = json.loads(json_string) 94 | coordinates = np.fromstring(json_dict["position"], sep=',', dtype=np.float64) 95 | print('Coordinates Extracted:', coordinates) 96 | if coordinates.shape[0] < 2 or coordinates.shape[0] > 3: 97 | return 98 | elif coordinates.shape[0] == 2: # sometimes LLMs avoid adding the zeros of z-dimension 99 | coordinates = np.append(coordinates, 0) 100 | return coordinates 101 | except Exception: 102 | return 103 | 104 | # add the pre-processing function to the goal_point output topic 105 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point) 106 | ``` 107 | 108 | ## Launching the Components 109 | 110 | Finally, we will launch our Go-to-X component. 111 | 112 | ```python 113 | from agents.ros import Launcher 114 | 115 | # Launch the component 116 | launcher = Launcher() 117 | launcher.add_pkg( 118 | components=[goto] 119 | ) 120 | launcher.bringup() 121 | ``` 122 | 123 | And that is all. Our Go-to-X component is ready.
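To build intuition for what the pre-processor does, here is a quick, purely illustrative run on a made-up LLM reply (the exact wording of real replies will vary):

```python
# a made-up LLM reply; the text around the json is illustrative only
sample_reply = 'Sure, here you go: {"position": "1.1, 2.2"}'
goal = llm_answer_to_goal_point(sample_reply)
# goal is now array([1.1, 2.2, 0.]) -- the missing z-coordinate is padded with zero,
# so the result can be published on the goal_point topic of msg_type PoseStamped
```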
The complete code for this example is given below: 124 | 125 | ```{code-block} python 126 | :caption: Go-to-X Component 127 | :linenos: 128 | from typing import Optional 129 | import json 130 | import numpy as np 131 | from agents.components import LLM 132 | from agents.models import Llama3_1 133 | from agents.vectordbs import ChromaDB 134 | from agents.config import LLMConfig 135 | from agents.clients.roboml import HTTPDBClient 136 | from agents.clients.ollama import OllamaClient 137 | from agents.ros import Launcher, Topic 138 | 139 | # Start a Llama3.1 based llm component using ollama client 140 | llama = Llama3_1(name="llama") 141 | llama_client = OllamaClient(llama) 142 | 143 | # Initialize a vector DB that will store our routes 144 | chroma = ChromaDB(name="MainDB") 145 | chroma_client = HTTPDBClient(db=chroma) 146 | 147 | # Define LLM input and output topics including goal_point topic of type PoseStamped 148 | goto_in = Topic(name="goto_in", msg_type="String") 149 | goal_point = Topic(name="goal_point", msg_type="PoseStamped") 150 | 151 | config = LLMConfig(enable_rag=True, 152 | collection_name="map", 153 | distance_func="l2", 154 | n_results=1, 155 | add_metadata=True) 156 | 157 | # initialize the component 158 | goto = LLM( 159 | inputs=[goto_in], 160 | outputs=[goal_point], 161 | model_client=llama_client, 162 | db_client=chroma_client, # check the previous example where we setup this database client 163 | trigger=goto_in, 164 | config=config, 165 | component_name='go_to_x' 166 | ) 167 | 168 | # set a component prompt 169 | goto.set_component_prompt( 170 | template="""From the given metadata, extract coordinates and provide 171 | the coordinates in the following json format:\n {"position": coordinates}""" 172 | ) 173 | 174 | 175 | # pre-process the output before publishing to a topic of msg_type PoseStamped 176 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]: 177 | # extract the json part of the output string (including brackets) 178 | # one can use sophisticated regex parsing here but we'll keep it simple 179 | json_string = output[output.find("{") : output.rfind("}") + 1] 180 | # load the string as a json and extract position coordinates 181 | # if there is an error, return None, i.e. no output would be published to goal_point 182 | try: 183 | json_dict = json.loads(json_string) 184 | coordinates = np.fromstring(json_dict["position"], sep=',', dtype=np.float64) 185 | print('Coordinates Extracted:', coordinates) 186 | if coordinates.shape[0] < 2 or coordinates.shape[0] > 3: 187 | return 188 | elif coordinates.shape[0] == 2: # sometimes LLMs avoid adding the zeros of z-dimension 189 | coordinates = np.append(coordinates, 0) 190 | return coordinates 191 | except Exception: 192 | return 193 | 194 | 195 | # add the pre-processing function to the goal_point output topic 196 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point) 197 | 198 | # Launch the component 199 | launcher = Launcher() 200 | launcher.add_pkg( 201 | components=[goto] 202 | ) 203 | launcher.bringup() 204 | ``` 205 | -------------------------------------------------------------------------------- /docs/examples/index.md: -------------------------------------------------------------------------------- 1 | # Examples ✨ 2 | 3 | In this section you will find basic examples of ROS Agents usage in the form of short tutorials. These examples would show you how ROS Agents' components can be used to create real world embodied agent capabilities in robots. 
It is recommended to go through the examples in sequence. 4 | 5 | ```{toctree} 6 | :maxdepth: 1 7 | 8 | conversational 9 | prompt_engineering 10 | semantic_map 11 | goto 12 | tool_calling 13 | semantic_router 14 | complete 15 | multiprocessing 16 | ``` 17 | -------------------------------------------------------------------------------- /docs/examples/prompt_engineering.md: -------------------------------------------------------------------------------- 1 | # Prompt engineering for LLMs/MLLMs using vision models 2 | 3 | In this example we will use the output of an object detection component to enrich the prompt of an MLLM component. Let us start by importing the components. 4 | ```python 5 | from agents.components import Vision, MLLM 6 | ``` 7 | 8 | ## Setting up the Object Detection Component 9 | For object detection and tracking, ROS Agents provides a unified Vision component. This component takes as input an image topic published by a camera device onboard our robot. The output of this component can be a _detections_ topic in the case of object detection or a _trackings_ topic in the case of object tracking. In this example we will use a _detections_ topic. 10 | 11 | ```python 12 | from agents.ros import Topic 13 | 14 | # Define the image input topic 15 | image0 = Topic(name="image_raw", msg_type="Image") 16 | # Create a detection topic 17 | detections_topic = Topic(name="detections", msg_type="Detections") 18 | ``` 19 | Additionally, the component requires a model client with an object detection model. We will use the RESP client for RoboML and VisionModel, a convenient model class made available in ROS Agents for initializing all vision models available in the open-source [mmdetection](https://github.com/open-mmlab/mmdetection) library. We will select the model we want to use by setting the checkpoint attribute. 20 | 21 | ```{note} 22 | Learn about setting up RoboML with vision [here](https://github.com/automatika-robotics/roboml/blob/main/README.md#for-vision-models-support). 23 | ``` 24 | ```{seealso} 25 | Check out all available mmdetection models and their benchmarking results in the [mmdetection model zoo](https://github.com/open-mmlab/mmdetection?tab=readme-ov-file#overview-of-benchmark-and-model-zoo). 26 | ``` 27 | 28 | ```python 29 | from agents.models import VisionModel 30 | from agents.clients.roboml import RESPModelClient, HTTPModelClient 31 | from agents.config import VisionConfig 32 | 33 | # Add an object detection model 34 | object_detection = VisionModel(name="object_detection", 35 | checkpoint="dino-4scale_r50_8xb2-12e_coco") 36 | roboml_detection = RESPModelClient(object_detection) 37 | 38 | # Initialize the Vision component 39 | detection_config = VisionConfig(threshold=0.5) 40 | vision = Vision( 41 | inputs=[image0], 42 | outputs=[detections_topic], 43 | trigger=image0, 44 | config=detection_config, 45 | model_client=roboml_detection, 46 | component_name="detection_component", 47 | ) 48 | ``` 49 | 50 | ```{tip} 51 | Notice that we passed in an optional config to the component. Component configs can be used to set up various parameters in the component. If the component calls an ML model, then the inference parameters for the model can be set in the component config. 52 | ``` 53 | 54 | ## Setting up the MLLM Component 55 | 56 | For the MLLM component, we will provide an additional text input topic, which will listen to our queries. The output of the component will be another text topic.
We will use the RoboML HTTP client with the multimodal LLM Idefics2 by the good folks at HuggingFace for this example. 57 | 58 | ```python 59 | from agents.models import Idefics2 60 | 61 | # Define MLLM input and output text topics 62 | text_query = Topic(name="text0", msg_type="String") 63 | text_answer = Topic(name="text1", msg_type="String") 64 | 65 | # Define a model client (working with roboml in this case) 66 | idefics = Idefics2(name="idefics_model") 67 | idefics_client = HTTPModelClient(idefics) 68 | 69 | # Define an MLLM component 70 | # We can pass in the detections topic which we defined previously directly as an optional input 71 | # to the MLLM component in addition to its other required inputs 72 | mllm = MLLM( 73 | inputs=[text_query, image0, detections_topic], 74 | outputs=[text_answer], 75 | model_client=idefics_client, 76 | trigger=text_query, 77 | component_name="mllm_component" 78 | ) 79 | ``` 80 | Next we will set up a component-level prompt to ensure that our text query and the output of the detections topic are sent to the model as we intend. We will do this by passing a jinja2 template to the **set_component_prompt** function. 81 | ```python 82 | mllm.set_component_prompt( 83 | template="""Imagine you are a robot. 84 | This image has following items: {{ detections }}. 85 | Answer the following about this image: {{ text0 }}""" 86 | ) 87 | ``` 88 | ```{caution} 89 | The names of the topics used in the jinja2 template are the same as the name parameters set when creating the Topic objects. 90 | ``` 91 | 92 | ## Launching the Components 93 | 94 | Finally, we will launch our components as we did in the previous example. 95 | 96 | ```python 97 | from agents.ros import Launcher 98 | 99 | # Launch the components 100 | launcher = Launcher() 101 | launcher.add_pkg( 102 | components=[vision, mllm] 103 | ) 104 | launcher.bringup() 105 | ``` 106 | 107 | And there we have it. The complete code for this example is provided below. 108 | 109 | ```{code-block} python 110 | :caption: Prompt Engineering with Object Detection 111 | :linenos: 112 | from agents.components import Vision, MLLM 113 | from agents.models import VisionModel, Idefics2 114 | from agents.clients.roboml import RESPModelClient, HTTPModelClient 115 | from agents.config import VisionConfig 116 | from agents.ros import Topic, Launcher 117 | 118 | image0 = Topic(name="image_raw", msg_type="Image") 119 | detections_topic = Topic(name="detections", msg_type="Detections") 120 | 121 | object_detection = VisionModel(name="object_detection", 122 | checkpoint="dino-4scale_r50_8xb2-12e_coco") 123 | roboml_detection = RESPModelClient(object_detection) 124 | 125 | detection_config = VisionConfig(threshold=0.5) 126 | vision = Vision( 127 | inputs=[image0], 128 | outputs=[detections_topic], 129 | trigger=image0, 130 | config=detection_config, 131 | model_client=roboml_detection, 132 | component_name="detection_component", 133 | ) 134 | 135 | text_query = Topic(name="text0", msg_type="String") 136 | text_answer = Topic(name="text1", msg_type="String") 137 | 138 | idefics = Idefics2(name="idefics_model") 139 | idefics_client = HTTPModelClient(idefics) 140 | 141 | mllm = MLLM( 142 | inputs=[text_query, image0, detections_topic], 143 | outputs=[text_answer], 144 | model_client=idefics_client, 145 | trigger=text_query, 146 | component_name="mllm_component" 147 | ) 148 | 149 | mllm.set_component_prompt( 150 | template="""Imagine you are a robot. 151 | This image has following items: {{ detections }}.
152 | Answer the following about this image: {{ text0 }}""" 153 | ) 154 | launcher = Launcher() 155 | launcher.add_pkg( 156 | components=[vision, mllm] 157 | ) 158 | launcher.bringup() 159 | ``` 160 | -------------------------------------------------------------------------------- /docs/examples/semantic_router.md: -------------------------------------------------------------------------------- 1 | # Create a semantic router to route text queries between different components 2 | 3 | While semantic routing can be implemented with an LLM component, ROS Agents also provides a convenient SemanticRouter component that works directly with text encoding distances and can be utilized with a vector DB. 4 | 5 | In this example we will use the SemanticRouter component to route text queries between two components, a general purpose LLM and a Go-to-X component that we built in the previous [example](goto.md). Lets start by setting up our components. 6 | 7 | ## Setting up the components 8 | 9 | In the following code snippet we will setup our two components. 10 | 11 | ```python 12 | from agents.components import LLM 13 | from agents.clients.ollama import OllamaClient 14 | from agents.clients.roboml import HTTPModelClient 15 | from agents.models import Idefics2, Llama3_1 16 | from agents.config import LLMConfig 17 | from agents.ros import Topic 18 | 19 | # Create a llama3.1 client using Ollama 20 | llama = Llama3_1(name="llama") 21 | ollama_client = OllamaClient(llama) 22 | 23 | # Make a generic LLM component using the Llama3_1 model 24 | llm_in = Topic(name="llm_in", msg_type="String") 25 | llm_out = Topic(name="llm_out", msg_type="String") 26 | 27 | llm = LLM( 28 | inputs=[llm_in], 29 | outputs=[llm_out], 30 | model_client=llama_client, 31 | trigger=[llm_in], 32 | ) 33 | 34 | # Make a Go-to-X component using the same Llama3_1 model 35 | goto_in = Topic(name="goto_in", msg_type="String") 36 | goal_point = Topic(name="goal_point", msg_type="PoseStamped") 37 | 38 | config = LLMConfig(enable_rag=True, 39 | collection_name="map", 40 | distance_func="l2", 41 | n_results=1, 42 | add_metadata=True) 43 | 44 | goto = LLM( 45 | inputs=[goto_in], 46 | outputs=[goal_point], 47 | model_client=llama_client, 48 | db_client=chroma_client, 49 | trigger=goto_in, 50 | config=config, 51 | component_name='go_to_x' 52 | ) 53 | 54 | # set a component prompt 55 | goto.set_component_prompt( 56 | template="""From the given metadata, extract coordinates and provide 57 | the coordinates in the following json format:\n {"position": coordinates}""" 58 | ) 59 | 60 | # pre-process the output before publishing to a topic of msg_type PoseStamped 61 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]: 62 | # extract the json part of the output string (including brackets) 63 | # one can use sophisticated regex parsing here but we'll keep it simple 64 | json_string = output[output.find("{"):output.find("}") + 1] 65 | 66 | # load the string as a json and extract position coordinates 67 | # if there is an error, return None, i.e. no output would be published to goal_point 68 | try: 69 | json_dict = json.loads(json_string) 70 | return np.array(json_dict['position']) 71 | except Exception: 72 | return 73 | 74 | # add the pre-processing function to the goal_point output topic 75 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point) 76 | ``` 77 | 78 | ```{note} 79 | Note that we have reused the same model and its client for both components. 
80 | ``` 81 | 82 | ```{note} 83 | For a detailed explanation of the code for setting up the Go-to-X component, check the previous [example](goto.md). 84 | ``` 85 | 86 | ```{caution} 87 | In the code block above we are using the same DB client that was setup in this [example](semantic_map.md). 88 | ``` 89 | 90 | ## Creating the SemanticRouter 91 | 92 | The SemanticRouter takes an input _String_ topic and sends whatever is published on that topic to a _Route_. A _Route_ is a thin wrapper around _Topic_ and takes in the name of a topic to publish on and example queries, that would match a potential query that should be published to a particular topic. For example, if we ask our robot a general question, like "Whats the capital of France?", we do not want that question to be routed to a Go-to-X component, but to a generic LLM. Thus in its route, we would provide examples of general questions. The SemanticRouter component works by storing these examples in a vector DB. Distance is calculated between an incoming query's embedding and the embeddings of example queries to determine which _Route_(_Topic_) the query should be sent on. Lets start by creating our routes for the input topics of the two components above. 93 | 94 | ```python 95 | from agents.ros import Route 96 | 97 | # Create the input topic for the router 98 | query_topic = Topic(name="question", msg_type="String") 99 | 100 | # Define a route to a topic that processes go-to-x commands 101 | goto_route = Route(routes_to=goto_in, 102 | samples=["Go to the door", "Go to the kitchen", 103 | "Get me a glass", "Fetch a ball", "Go to hallway"]) 104 | 105 | # Define a route to a topic that is input to an LLM component 106 | llm_route = Route(routes_to=llm_in, 107 | samples=["What is the capital of France?", "Is there life on Mars?", 108 | "How many tablespoons in a cup?", "How are you today?", "Whats up?"]) 109 | ``` 110 | 111 | For the database client we will use the ChromaDB client setup in [this example](semantic_map.md). We will specify a router name in our router config, which will act as a _collection_name_ in the database. 112 | 113 | ```python 114 | from agents.components import SemanticRouter 115 | from agents.config import SemanticRouterConfig 116 | 117 | router_config = SemanticRouterConfig(router_name="go-to-router", distance_func="l2") 118 | # Initialize the router component 119 | router = SemanticRouter( 120 | inputs=[query_topic], 121 | routes=[llm_route, goto_route], 122 | default_route=llm_route, # If none of the routes fall within a distance threshold 123 | config=router_config, 124 | db_client=chroma_client, # reusing the db_client from the previous example 125 | component_name="router" 126 | ) 127 | ``` 128 | 129 | And that is it. Whenever something is published on the input topic **question**, it will be routed, either to a Go-to-X component or an LLM component. We can now expose this topic to our command interface. 
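As a quick smoke test (a minimal sketch using plain `rclpy`, assuming the `String` msg_type above maps to `std_msgs/msg/String`), one could publish a couple of test queries on the router's input topic from a separate script:

```python
import time
import rclpy
from std_msgs.msg import String

rclpy.init()
node = rclpy.create_node("router_smoke_test")
pub = node.create_publisher(String, "question", 10)
time.sleep(1.0)  # give discovery a moment before publishing
pub.publish(String(data="Go to the kitchen"))       # should be routed to goto_in
pub.publish(String(data="Is there life on Mars?"))  # should be routed to llm_in
node.destroy_node()
rclpy.shutdown()
```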
The complete code for setting up the router is given below: 130 | 131 | ```{code-block} python 132 | :caption: Semantic Routing 133 | :linenos: 134 | from typing import Optional 135 | import json 136 | import numpy as np 137 | from agents.components import LLM, SemanticRouter 138 | from agents.models import Llama3_1 139 | from agents.vectordbs import ChromaDB 140 | from agents.config import LLMConfig, SemanticRouterConfig 141 | from agents.clients.roboml import HTTPDBClient 142 | from agents.clients.ollama import OllamaClient 143 | from agents.ros import Launcher, Topic, Route 144 | 145 | 146 | # Start a Llama3.1 based llm component using ollama client 147 | llama = Llama3_1(name="llama") 148 | llama_client = OllamaClient(llama) 149 | 150 | # Initialize a vector DB that will store our routes 151 | chroma = ChromaDB(name="MainDB") 152 | chroma_client = HTTPDBClient(db=chroma) 153 | 154 | 155 | # Make a generic LLM component using the Llama3_1 model 156 | llm_in = Topic(name="llm_in", msg_type="String") 157 | llm_out = Topic(name="llm_out", msg_type="String") 158 | 159 | llm = LLM( 160 | inputs=[llm_in], 161 | outputs=[llm_out], 162 | model_client=llama_client, 163 | trigger=llm_in 164 | ) 165 | 166 | 167 | # Define LLM input and output topics including goal_point topic of type PoseStamped 168 | goto_in = Topic(name="goto_in", msg_type="String") 169 | goal_point = Topic(name="goal_point", msg_type="PoseStamped") 170 | 171 | config = LLMConfig(enable_rag=True, 172 | collection_name="map", 173 | distance_func="l2", 174 | n_results=1, 175 | add_metadata=True) 176 | 177 | # initialize the component 178 | goto = LLM( 179 | inputs=[goto_in], 180 | outputs=[goal_point], 181 | model_client=llama_client, 182 | db_client=chroma_client, # check the previous example where we setup this database client 183 | trigger=goto_in, 184 | config=config, 185 | component_name='go_to_x' 186 | ) 187 | 188 | # set a component prompt 189 | goto.set_component_prompt( 190 | template="""From the given metadata, extract coordinates and provide 191 | the coordinates in the following json format:\n {"position": coordinates}""" 192 | ) 193 | 194 | 195 | # pre-process the output before publishing to a topic of msg_type PoseStamped 196 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]: 197 | # extract the json part of the output string (including brackets) 198 | # one can use sophisticated regex parsing here but we'll keep it simple 199 | json_string = output[output.find("{"):output.find("}") + 1] 200 | 201 | # load the string as a json and extract position coordinates 202 | # if there is an error, return None, i.e. 
no output would be published to goal_point 203 | try: 204 | json_dict = json.loads(json_string) 205 | return np.array(json_dict['position']) 206 | except Exception: 207 | return 208 | 209 | 210 | # add the pre-processing function to the goal_point output topic 211 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point) 212 | 213 | # Create the input topic for the router 214 | query_topic = Topic(name="question", msg_type="String") 215 | 216 | # Define a route to a topic that processes go-to-x commands 217 | goto_route = Route(routes_to=goto_in, 218 | samples=["Go to the door", "Go to the kitchen", 219 | "Get me a glass", "Fetch a ball", "Go to hallway"]) 220 | 221 | # Define a route to a topic that is input to an LLM component 222 | llm_route = Route(routes_to=llm_in, 223 | samples=["What is the capital of France?", "Is there life on Mars?", 224 | "How many tablespoons in a cup?", "How are you today?", "Whats up?"]) 225 | 226 | router_config = SemanticRouterConfig(router_name="go-to-router", distance_func="l2") 227 | # Initialize the router component 228 | router = SemanticRouter( 229 | inputs=[query_topic], 230 | routes=[llm_route, goto_route], 231 | default_route=llm_route, # If none of the routes fall within a distance threshold 232 | config=router_config, 233 | db_client=chroma_client, # reusing the db_client from the previous example 234 | component_name="router", 235 | ) 236 | 237 | # Launch the components 238 | launcher = Launcher() 239 | launcher.add_pkg( 240 | components=[llm, goto, router] 241 | ) 242 | launcher.bringup() 243 | ``` 244 | -------------------------------------------------------------------------------- /docs/examples/tool_calling.md: -------------------------------------------------------------------------------- 1 | # Use Tool Calling in Go-to-X 2 | 3 | In the previous [example](goto.md) we created a Go-to-X component using basic text manipulation on LLM output. However, for models that have been specifically trained for tool calling, one can get better results for structured outputs by invoking tool calling. At the same time tool calling can be useful to generate responses which require intermediate use of tools by the LLM before providing a final answer. In this example we will utilize tool calling for the former utility of getting a better structured output from the LLM, by reimplementing the Go-to-X component. 4 | 5 | ## Register a tool (function) to be called by the LLM 6 | To utilize tool calling we will change our strategy of doing pre-processing to LLM text output, and instead ask the LLM to provide structured input to a function (tool). The output of this function will then be sent for publishing to the output topic. Lets see what this will look like in the following code snippets. 7 | 8 | First we will modify the component level prompt for our LLM. 9 | 10 | ```python 11 | # set a component prompt 12 | goto.set_component_prompt( 13 | template="""What are the position coordinates in the given metadata?""" 14 | ) 15 | ``` 16 | Next we will replace our pre-processing function, with a much simpler function that takes in a list and provides a numpy array. The LLM will be expected to call this function with the appropriate output. This strategy generally works better than getting text input from LLM and trying to parse it with an arbitrary function. To register the function as a tool, we will also need to create its description in a format that is explanatory for the LLM. This format has been specified by the _Ollama_ client. 
17 | 18 | ```{caution} 19 | Tool calling is currently available only when components utilize the OllamaClient. 20 | ``` 21 | ```{seealso} 22 | To see a list of models that work for tool calling using the OllamaClient, check [here](https://ollama.com/search?c=tools) 23 | ``` 24 | ```python 25 | # pre-process the output before publishing to a topic of msg_type PoseStamped 26 | def get_coordinates(position: list[float]) -> np.ndarray: 27 | """Get position coordinates""" 28 | return np.array(position, dtype=float) 29 | 30 | 31 | function_description = { 32 | "type": "function", 33 | "function": { 34 | "name": "get_coordinates", 35 | "description": "Get position coordinates", 36 | "parameters": { 37 | "type": "object", 38 | "properties": { 39 | "position": { 40 | "type": "list[float]", 41 | "description": "The position coordinates in x, y and z", 42 | } 43 | }, 44 | }, 45 | "required": ["position"], 46 | }, 47 | } 48 | 49 | # add the pre-processing function to the goal_point output topic 50 | goto.register_tool( 51 | tool=get_coordinates, 52 | tool_description=function_description, 53 | send_tool_response_to_model=False, 54 | ) 55 | ``` 56 | In the code above, the flag _send_tool_response_to_model_ has been set to False. This means that the function output will be sent directly for publication, since our usage of the tool in this example is limited to forcing the model to provide a structured output. If this flag was set to True, the output of the tool (function) will be sent back to the model to produce the final output, which will then be published. This latter usage is employed when a tool like a calculator, browser or code interpreter can be provided to the model for generating better answers. 57 | 58 | ## Launching the Components 59 | 60 | And as before, we will launch our Go-to-X component. 
61 | 62 | ```python 63 | from agents.ros import Launcher 64 | 65 | # Launch the component 66 | launcher = Launcher() 67 | launcher.add_pkg(components=[goto]) 68 | launcher.bringup() 69 | ``` 70 | 71 | The complete code for this example is given below: 72 | 73 | ```{code-block} python 74 | :caption: Go-to-X Component 75 | :linenos: 76 | import numpy as np 77 | from agents.components import LLM 78 | from agents.models import Llama3_1 79 | from agents.vectordbs import ChromaDB 80 | from agents.config import LLMConfig 81 | from agents.clients.roboml import HTTPDBClient 82 | from agents.clients.ollama import OllamaClient 83 | from agents.ros import Launcher, Topic 84 | 85 | # Start a Llama3.1 based llm component using ollama client 86 | llama = Llama3_1(name="llama") 87 | llama_client = OllamaClient(llama) 88 | 89 | # Initialize a vector DB that will store our routes 90 | chroma = ChromaDB(name="MainDB") 91 | chroma_client = HTTPDBClient(db=chroma) 92 | 93 | # Define LLM input and output topics including goal_point topic of type PoseStamped 94 | goto_in = Topic(name="goto_in", msg_type="String") 95 | goal_point = Topic(name="goal_point", msg_type="PoseStamped") 96 | 97 | config = LLMConfig( 98 | enable_rag=True, 99 | collection_name="map", 100 | distance_func="l2", 101 | n_results=1, 102 | add_metadata=True, 103 | ) 104 | 105 | # initialize the component 106 | goto = LLM( 107 | inputs=[goto_in], 108 | outputs=[goal_point], 109 | model_client=llama_client, 110 | db_client=chroma_client, # check the previous example where we setup this database client 111 | trigger=goto_in, 112 | config=config, 113 | component_name="go_to_x", 114 | ) 115 | 116 | # set a component prompt 117 | goto.set_component_prompt( 118 | template="""What are the position coordinates in the given metadata?""" 119 | ) 120 | 121 | 122 | # pre-process the output before publishing to a topic of msg_type PoseStamped 123 | def get_coordinates(position: list[float]) -> np.ndarray: 124 | """Get position coordinates""" 125 | return np.array(position, dtype=float) 126 | 127 | 128 | function_description = { 129 | "type": "function", 130 | "function": { 131 | "name": "get_coordinates", 132 | "description": "Get position coordinates", 133 | "parameters": { 134 | "type": "object", 135 | "properties": { 136 | "position": { 137 | "type": "list[float]", 138 | "description": "The position coordinates in x, y and z", 139 | } 140 | }, 141 | }, 142 | "required": ["position"], 143 | }, 144 | } 145 | 146 | # add the pre-processing function to the goal_point output topic 147 | goto.register_tool( 148 | tool=get_coordinates, 149 | tool_description=function_description, 150 | send_tool_response_to_model=False, 151 | ) 152 | 153 | # Launch the component 154 | launcher = Launcher() 155 | launcher.add_pkg(components=[goto]) 156 | launcher.bringup() 157 | ``` 158 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: ROS Agents Documentation 3 | --- 4 | 5 | 6 | ```{include} intro.md 7 | ``` 8 | 9 | ## Table of Contents 10 | 11 | ```{toctree} 12 | :maxdepth: 2 13 | 14 | intro 15 | installation 16 | quickstart 17 | basics 18 | examples/index 19 | apidocs/index 20 | ``` 21 | -------------------------------------------------------------------------------- /docs/installation.md: -------------------------------------------------------------------------------- 1 | # Installation 🛠️ 2 | 3 | ## Pre-Requisits 4 | 5 | ### 
Install ROS 6 | 7 | ROS Agents is built to be used with ROS2. All ROS distributions starting from _Iron_ are supported. Install ROS2 by following the instructions on the [official site](https://docs.ros.org/en/iron/Installation.html). 8 | 9 | ### Install a model serving platform 10 | 11 | The core of ROS Agents is agnostic to model serving platforms. It currently supports [Ollama](https://ollama.com) and [RoboML](https://github.com/automatika-robotics/RoboML). Please install either of these by following the instructions provided by the respective projects. Support for new platforms will be continuously added. If you would like to see support for a particular platform, please open an issue/PR. 12 | 13 | ```{tip} 14 | When utilizing larger models, it is recommended that model serving platforms are not installed directly on the robot (or the edge device) but on a GPU-powered machine on the local network (or one of the cloud providers). 15 | ``` 16 | 17 | ## Install ROS Agents (Ubuntu) 18 | 19 | **Binary packages for Ubuntu will be released soon. Check this space.** 20 | 21 | ## Install ROS Agents from source 22 | 23 | Create your ROS workspace. 24 | ```shell 25 | mkdir -p agents_ws/src 26 | cd agents_ws/src 27 | ``` 28 | ### Get Dependencies 29 | 30 | Install python dependencies. 31 | ```shell 32 | pip install numpy opencv-python-headless 'attrs>=23.2.0' jinja2 httpx setproctitle msgpack msgpack-numpy numpy-quaternion platformdirs 33 | ``` 34 | 35 | Download ROS Sugar. 36 | ```shell 37 | git clone https://github.com/automatika-robotics/ros-sugar 38 | ``` 39 | ### Install ROS Agents 40 | ```shell 41 | git clone https://github.com/automatika-robotics/ros-agents.git 42 | cd .. 43 | colcon build 44 | source install/setup.bash 45 | python your_script.py 46 | ``` 47 | -------------------------------------------------------------------------------- /docs/intro.md: -------------------------------------------------------------------------------- 1 | ![Logo](_static/ROS_AGENTS_DARK.png) 2 | 3 | # ROS Agents 🤖 4 | 5 | ROS Agents is a fully-loaded framework for creating interactive embodied agents that can understand, remember, and act upon contextual information from their environment. 6 | 7 | - **Agents in the real world:** Designed to be used with autonomous robot systems that operate in dynamic environments, specifically AMRs. 8 | - **Intuitive API**: Simple pythonic API to utilize local or cloud based ML models (specifically **Multimodal LLMs** and other **Transformer Architectures**) on robots. 9 | - **Semantic Memory**: Integrates vector databases, semantic routing and other supporting components to quickly build arbitrarily complex graphs for agentic information flow. No need to utilize bloated "GenAI" frameworks on your robot. 10 | - **Made in ROS2**: Utilizes ROS2 as the underlying distributed communications backbone. In theory, any device that provides a ROS2 package can be used to send data to ML models, as long as a callback for its datatype has been implemented. 11 | 12 | Check out the [Installation Instructions](installation.md) 🛠️ 13 | 14 | Get started with the [Quickstart Guide](quickstart.md) 🚀 15 | 16 | Get familiar with [Basic Concepts](basics.md) 📚 17 | 18 | Dive right in with [Examples](examples/index.md) ✨ 19 | 20 | ## Contributions 21 | 22 | ROS Agents has been developed in collaboration between [Automatika Robotics](https://automatikarobotics.com/) and [Inria](https://inria.fr/). Contributions from the community are most welcome.
23 | -------------------------------------------------------------------------------- /docs/quickstart.md: -------------------------------------------------------------------------------- 1 | # Quick Start 🚀 2 | 3 | Unlike other ROS packages, ROS Agents provides a purely pythonic way of describing the node graph using [ROS Sugar](https://automatika-robotics.github.io/ros-sugar/). Copy the following code into a python script and run it. 4 | 5 | ```python 6 | from agents.clients.ollama import OllamaClient 7 | from agents.components import MLLM 8 | from agents.models import Llava 9 | from agents.ros import Topic, Launcher 10 | 11 | # Define input and output topics (pay attention to msg_type) 12 | text0 = Topic(name="text0", msg_type="String") 13 | image0 = Topic(name="image_raw", msg_type="Image") 14 | text1 = Topic(name="text1", msg_type="String") 15 | 16 | # Define a model client (working with Ollama in this case) 17 | llava = Llava(name="llava") 18 | llava_client = OllamaClient(llava) 19 | 20 | # Define an MLLM component (A component represents a node with a particular functionality) 21 | mllm = MLLM( 22 | inputs=[text0, image0], 23 | outputs=[text1], 24 | model_client=llava_client, 25 | trigger=[text0], 26 | component_name="vqa" 27 | ) 28 | # Additional prompt settings 29 | mllm.set_topic_prompt(text0, template="""You are an amazing and funny robot. 30 | Answer the following about this image: {{ text0 }}""" 31 | ) 32 | # Launch the component 33 | launcher = Launcher() 34 | launcher.add_pkg(components=[mllm]) 35 | launcher.bringup() 36 | ``` 37 | 38 | Now let us see step-by-step what we have done in this code. First, we defined inputs and outputs to our component in the form of ROS Topics. Components automatically create listeners for input topics and publishers for output topics. 39 | 40 | ```python 41 | # Define input and output topics (pay attention to msg_type) 42 | text0 = Topic(name="text0", msg_type="String") 43 | image0 = Topic(name="image_raw", msg_type="Image") 44 | text1 = Topic(name="text1", msg_type="String") 45 | ``` 46 | 47 | ````{important} 48 | If you are running ROS Agents on a robot, make sure you change the topic name in the following line to the topic on which the robot's camera publishes RGB images. 49 | 50 | ```python 51 | image0 = Topic(name="NAME_OF_THE_TOPIC", msg_type="Image") 52 | ```` 53 | 54 | ```{note} 55 | If you are running ROS Agents on a testing machine, and the machine has a webcam, you can install the [**ROS2 USB Cam**](https://github.com/klintan/ros2_usb_camera) driver. Make sure you use the correct name of the image topic as above. 56 | ``` 57 | 58 | Then we will create a multimodal LLM component. Components are functional units in ROS Agents. To learn more about them, check out [Basic Concepts](basics.md). Other than input/output topics, the MLLM component expects a model client. So first we will create a model client that can utilize a [Llava](https://ollama.com/library/llava) model on [Ollama](https://ollama.com) as its model serving platform.
59 | 60 | ```python 61 | # Define a model client (working with Ollama in this case) 62 | llava = Llava(name="llava") 63 | llava_client = OllamaClient(llava) 64 | ``` 65 | 66 | ````{important} 67 | If you are not running Ollama on the same machine (robot) on which you are running ROS Agents, you can define access to the machine running Ollama using the host and port parameters in this line: 68 | ```python 69 | llava_client = OllamaClient(llava, host="127.0.0.1", port=8000) 70 | ```` 71 | 72 | ```{note} 73 | If the use of Ollama as a model serving platform is unclear, check out the [installation instructions](installation.md). 74 | ``` 75 | 76 | Now we are ready to set up our component. 77 | 78 | ```python 79 | # Define an MLLM component (A component represents a node with a particular functionality) 80 | mllm = MLLM( 81 | inputs=[text0, image0], 82 | outputs=[text1], 83 | model_client=llava_client, 84 | trigger=[text0], 85 | component_name="vqa" 86 | ) 87 | # Additional prompt settings 88 | mllm.set_topic_prompt(text0, template="""You are an amazing and funny robot. 89 | Answer the following about this image: {{ text0 }}""" 90 | ) 91 | ``` 92 | 93 | Note how the MLLM type of component also allows us to set a topic or component level prompt, where a jinja2 template defines how our input string should be embedded. Finally, we will launch the component. 94 | 95 | ```python 96 | # Launch the component 97 | launcher = Launcher() 98 | launcher.add_pkg(components=[mllm]) 99 | launcher.bringup() 100 | ``` 101 | 102 | Now we can check that our component is running by using familiar ROS2 commands from a new terminal. We should see our component running as a ROS node and its input and output topics in the topic list. 103 | 104 | ```shell 105 | ros2 node list 106 | ros2 topic list 107 | ``` 108 | 109 | In order to interact with our component we can use the tiny web client that is bundled with ROS Agents. We can launch the client by running: 110 | 111 | ```shell 112 | ros2 run automatika_embodied_agents tiny_web_client 113 | ``` 114 | 115 | The client displays a web UI on http://localhost:8000. Open this address in a browser. The ROS settings for the text input and output topics can be configured from the web UI by pressing the settings icon. Send a question to your ROS Agent and you should get a reply generated by the Llava model.
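Alternatively, we can talk to the component straight from ROS2, without the web client. The snippet below is a minimal sketch and not part of the package: it assumes the topic names `text0` and `text1` from the code above, and that the `String` msg_type corresponds to `std_msgs/msg/String` on the wire. The node name `quickstart_probe` is arbitrary.

```python
import rclpy
from rclpy.node import Node
from std_msgs.msg import String


class QuickstartProbe(Node):
    """Sends a single question to the vqa component and prints any replies."""

    def __init__(self):
        super().__init__("quickstart_probe")
        # publisher for the component's input topic and subscriber for its output topic
        self.pub = self.create_publisher(String, "text0", 1)
        self.sub = self.create_subscription(String, "text1", self.on_answer, 1)
        # wait a moment for discovery, then publish the question once
        self.timer = self.create_timer(2.0, self.ask_once)

    def ask_once(self):
        msg = String()
        msg.data = "What do you see in this image?"
        self.pub.publish(msg)
        self.get_logger().info("Question sent, waiting for an answer...")
        self.timer.cancel()

    def on_answer(self, msg: String):
        self.get_logger().info(f"Answer: {msg.data}")


def main():
    rclpy.init()
    rclpy.spin(QuickstartProbe())


if __name__ == "__main__":
    main()
```

Run this in a sourced terminal while the component is up; the answer from the model should appear in the log after a few seconds, depending on your hardware.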
116 | -------------------------------------------------------------------------------- /examples/complete_agent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | from typing import Optional 4 | from agents.components import ( 5 | MLLM, 6 | SpeechToText, 7 | TextToSpeech, 8 | LLM, 9 | Vision, 10 | MapEncoding, 11 | SemanticRouter, 12 | ) 13 | from agents.config import TextToSpeechConfig 14 | from agents.clients.roboml import HTTPModelClient, RESPModelClient, HTTPDBClient 15 | from agents.clients.ollama import OllamaClient 16 | from agents.models import Whisper, SpeechT5, Llava, Llama3_1, VisionModel 17 | from agents.vectordbs import ChromaDB 18 | from agents.config import VisionConfig, LLMConfig, MapConfig, SemanticRouterConfig 19 | from agents.ros import Topic, Launcher, FixedInput, MapLayer, Route 20 | 21 | 22 | ### Setup our models and vectordb ### 23 | whisper = Whisper(name="whisper") 24 | whisper_client = HTTPModelClient(whisper) 25 | speecht5 = SpeechT5(name="speecht5") 26 | speecht5_client = HTTPModelClient(speecht5) 27 | object_detection_model = VisionModel( 28 | name="dino_4scale", checkpoint="dino-4scale_r50_8xb2-12e_coco" 29 | ) 30 | detection_client = RESPModelClient(object_detection_model) 31 | llava = Llava(name="llava") 32 | llava_client = OllamaClient(llava) 33 | llama = Llama3_1(name="llama") 34 | llama_client = OllamaClient(llama) 35 | chroma = ChromaDB(name="MainDB") 36 | chroma_client = HTTPDBClient(db=chroma) 37 | 38 | ### Setup our components ### 39 | # Setup a speech to text component 40 | audio_in = Topic(name="audio0", msg_type="Audio") 41 | query_topic = Topic(name="question", msg_type="String") 42 | 43 | speech_to_text = SpeechToText( 44 | inputs=[audio_in], 45 | outputs=[query_topic], 46 | model_client=whisper_client, 47 | trigger=audio_in, 48 | component_name="speech_to_text", 49 | ) 50 | 51 | # Setup a text to speech component 52 | query_answer = Topic(name="answer", msg_type="String") 53 | 54 | t2s_config = TextToSpeechConfig(play_on_device=True) 55 | 56 | text_to_speech = TextToSpeech( 57 | inputs=[query_answer], 58 | trigger=query_answer, 59 | model_client=speecht5_client, 60 | config=t2s_config, 61 | component_name="text_to_speech", 62 | ) 63 | 64 | # Setup a vision component for object detection 65 | image0 = Topic(name="image_raw", msg_type="Image") 66 | detections_topic = Topic(name="detections", msg_type="Detections") 67 | 68 | detection_config = VisionConfig(threshold=0.5) 69 | vision = Vision( 70 | inputs=[image0], 71 | outputs=[detections_topic], 72 | trigger=image0, 73 | config=detection_config, 74 | model_client=detection_client, 75 | component_name="object_detection", 76 | ) 77 | 78 | # Define a generic mllm component for vqa 79 | mllm_query = Topic(name="mllm_query", msg_type="String") 80 | 81 | mllm = MLLM( 82 | inputs=[mllm_query, image0, detections_topic], 83 | outputs=[query_answer], 84 | model_client=llava_client, 85 | trigger=mllm_query, 86 | component_name="visual_q_and_a", 87 | ) 88 | 89 | mllm.set_component_prompt( 90 | template="""Imagine you are a robot. 91 | This image has following items: {{ detections }}. 92 | Answer the following about this image: {{ text0 }}""" 93 | ) 94 | 95 | # Define a fixed input mllm component that does introspection 96 | introspection_query = FixedInput( 97 | name="introspection_query", 98 | msg_type="String", 99 | fixed="What kind of a room is this? Is it an office, a bedroom or a kitchen? 
Give a one word answer, out of the given choices", 100 | ) 101 | introspection_answer = Topic(name="introspection_answer", msg_type="String") 102 | 103 | introspector = MLLM( 104 | inputs=[introspection_query, image0], 105 | outputs=[introspection_answer], 106 | model_client=llava_client, 107 | trigger=15.0, 108 | component_name="introspector", 109 | ) 110 | 111 | 112 | def introspection_validation(output: str) -> Optional[str]: 113 | for option in ["office", "bedroom", "kitchen"]: 114 | if option in output.lower(): 115 | return option 116 | 117 | 118 | introspector.add_publisher_preprocessor(introspection_answer, introspection_validation) 119 | 120 | # Define a semantic map using MapEncoding component 121 | layer1 = MapLayer(subscribes_to=detections_topic, temporal_change=True) 122 | layer2 = MapLayer(subscribes_to=introspection_answer, resolution_multiple=3) 123 | 124 | position = Topic(name="odom", msg_type="Odometry") 125 | map_topic = Topic(name="map", msg_type="OccupancyGrid") 126 | 127 | map_conf = MapConfig(map_name="map") 128 | map = MapEncoding( 129 | layers=[layer1, layer2], 130 | position=position, 131 | map_topic=map_topic, 132 | config=map_conf, 133 | db_client=chroma_client, 134 | trigger=15.0, 135 | component_name="map_encoder", 136 | ) 137 | 138 | # Define a generic LLM component 139 | llm_query = Topic(name="llm_query", msg_type="String") 140 | 141 | llm = LLM( 142 | inputs=[llm_query], 143 | outputs=[query_answer], 144 | model_client=llama_client, 145 | trigger=[llm_query], 146 | component_name="general_q_and_a", 147 | ) 148 | 149 | # Define a Go-to-X component using LLM 150 | goto_query = Topic(name="goto_query", msg_type="String") 151 | goal_point = Topic(name="goal_point", msg_type="PoseStamped") 152 | 153 | goto_config = LLMConfig( 154 | enable_rag=True, 155 | collection_name="map", 156 | distance_func="l2", 157 | n_results=1, 158 | add_metadata=True, 159 | ) 160 | 161 | goto = LLM( 162 | inputs=[goto_query], 163 | outputs=[goal_point], 164 | model_client=llama_client, 165 | config=goto_config, 166 | db_client=chroma_client, 167 | trigger=goto_query, 168 | component_name="go_to_x", 169 | ) 170 | 171 | goto.set_component_prompt( 172 | template="""From the given metadata, extract coordinates and provide 173 | the coordinates in the following json format:\n {"position": coordinates}""" 174 | ) 175 | 176 | 177 | # pre-process the output before publishing to a topic of msg_type PoseStamped 178 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]: 179 | # extract the json part of the output string (including brackets) 180 | # one can use sophisticated regex parsing here but we'll keep it simple 181 | json_string = output[output.find("{") : output.rfind("}") + 1] 182 | # load the string as a json and extract position coordinates 183 | # if there is an error, return None, i.e. 
no output would be published to goal_point 184 | try: 185 | json_dict = json.loads(json_string) 186 | coordinates = np.fromstring(json_dict["position"], sep=",", dtype=np.float64) 187 | print("Coordinates Extracted:", coordinates) 188 | if coordinates.shape[0] < 2 or coordinates.shape[0] > 3: 189 | return 190 | elif ( 191 | coordinates.shape[0] == 2 192 | ): # sometimes LLMs avoid adding the zeros of z-dimension 193 | coordinates = np.append(coordinates, 0) 194 | return coordinates 195 | except Exception: 196 | return 197 | 198 | 199 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point) 200 | 201 | # Define a semantic router between a generic LLM component, VQA MLLM component and Go-to-X component 202 | goto_route = Route( 203 | routes_to=goto_query, 204 | samples=[ 205 | "Go to the door", 206 | "Go to the kitchen", 207 | "Get me a glass", 208 | "Fetch a ball", 209 | "Go to hallway", 210 | ], 211 | ) 212 | 213 | llm_route = Route( 214 | routes_to=llm_query, 215 | samples=[ 216 | "What is the capital of France?", 217 | "Is there life on Mars?", 218 | "How many tablespoons in a cup?", 219 | "How are you today?", 220 | "Whats up?", 221 | ], 222 | ) 223 | 224 | mllm_route = Route( 225 | routes_to=mllm_query, 226 | samples=[ 227 | "Are we indoors or outdoors", 228 | "What do you see?", 229 | "Whats in front of you?", 230 | "Where are we", 231 | "Do you see any people?", 232 | "How many things are infront of you?", 233 | "Is this room occupied?", 234 | ], 235 | ) 236 | 237 | router_config = SemanticRouterConfig(router_name="go-to-router", distance_func="l2") 238 | # Initialize the router component 239 | router = SemanticRouter( 240 | inputs=[query_topic], 241 | routes=[llm_route, goto_route, mllm_route], 242 | default_route=llm_route, 243 | config=router_config, 244 | db_client=chroma_client, 245 | component_name="router", 246 | ) 247 | 248 | # Launch the components 249 | launcher = Launcher() 250 | launcher.add_pkg( 251 | components=[ 252 | mllm, 253 | llm, 254 | goto, 255 | introspector, 256 | map, 257 | router, 258 | speech_to_text, 259 | text_to_speech, 260 | vision, 261 | ] 262 | ) 263 | launcher.bringup() 264 | -------------------------------------------------------------------------------- /examples/complete_agent_multiprocessing.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | from typing import Optional 4 | from agents.components import ( 5 | MLLM, 6 | SpeechToText, 7 | TextToSpeech, 8 | LLM, 9 | Vision, 10 | MapEncoding, 11 | SemanticRouter, 12 | ) 13 | from agents.config import TextToSpeechConfig 14 | from agents.clients.roboml import HTTPModelClient, RESPModelClient, HTTPDBClient 15 | from agents.clients.ollama import OllamaClient 16 | from agents.models import Whisper, SpeechT5, Llava, Llama3_1, VisionModel 17 | from agents.vectordbs import ChromaDB 18 | from agents.config import VisionConfig, LLMConfig, MapConfig, SemanticRouterConfig 19 | from agents.ros import Topic, Launcher, FixedInput, MapLayer, Route 20 | 21 | 22 | ### Setup our models and vectordb ### 23 | whisper = Whisper(name="whisper") 24 | whisper_client = HTTPModelClient(whisper) 25 | speecht5 = SpeechT5(name="speecht5") 26 | speecht5_client = HTTPModelClient(speecht5) 27 | object_detection_model = VisionModel( 28 | name="dino_4scale", checkpoint="dino-4scale_r50_8xb2-12e_coco" 29 | ) 30 | detection_client = RESPModelClient(object_detection_model) 31 | llava = Llava(name="llava") 32 | llava_client = OllamaClient(llava) 33 | 
llama = Llama3_1(name="llama") 34 | llama_client = OllamaClient(llama) 35 | chroma = ChromaDB(name="MainDB") 36 | chroma_client = HTTPDBClient(db=chroma) 37 | 38 | ### Setup our components ### 39 | # Setup a speech to text component 40 | audio_in = Topic(name="audio0", msg_type="Audio") 41 | query_topic = Topic(name="question", msg_type="String") 42 | 43 | speech_to_text = SpeechToText( 44 | inputs=[audio_in], 45 | outputs=[query_topic], 46 | model_client=whisper_client, 47 | trigger=audio_in, 48 | component_name="speech_to_text", 49 | ) 50 | 51 | # Setup a text to speech component 52 | query_answer = Topic(name="answer", msg_type="String") 53 | 54 | t2s_config = TextToSpeechConfig(play_on_device=True) 55 | 56 | text_to_speech = TextToSpeech( 57 | inputs=[query_answer], 58 | trigger=query_answer, 59 | model_client=speecht5_client, 60 | config=t2s_config, 61 | component_name="text_to_speech", 62 | ) 63 | 64 | # Setup a vision component for object detection 65 | image0 = Topic(name="image_raw", msg_type="Image") 66 | detections_topic = Topic(name="detections", msg_type="Detections") 67 | 68 | detection_config = VisionConfig(threshold=0.5) 69 | vision = Vision( 70 | inputs=[image0], 71 | outputs=[detections_topic], 72 | trigger=image0, 73 | config=detection_config, 74 | model_client=detection_client, 75 | component_name="object_detection", 76 | ) 77 | 78 | # Define a generic mllm component for vqa 79 | mllm_query = Topic(name="mllm_query", msg_type="String") 80 | 81 | mllm = MLLM( 82 | inputs=[mllm_query, image0, detections_topic], 83 | outputs=[query_answer], 84 | model_client=llava_client, 85 | trigger=mllm_query, 86 | component_name="visual_q_and_a", 87 | ) 88 | 89 | mllm.set_component_prompt( 90 | template="""Imagine you are a robot. 91 | This image has following items: {{ detections }}. 92 | Answer the following about this image: {{ text0 }}""" 93 | ) 94 | 95 | # Define a fixed input mllm component that does introspection 96 | introspection_query = FixedInput( 97 | name="introspection_query", 98 | msg_type="String", 99 | fixed="What kind of a room is this? Is it an office, a bedroom or a kitchen? Give a one word answer, out of the given choices", 100 | ) 101 | introspection_answer = Topic(name="introspection_answer", msg_type="String") 102 | 103 | introspector = MLLM( 104 | inputs=[introspection_query, image0], 105 | outputs=[introspection_answer], 106 | model_client=llava_client, 107 | trigger=15.0, 108 | component_name="introspector", 109 | ) 110 | 111 | 112 | def introspection_validation(output: str) -> Optional[str]: 113 | for option in ["office", "bedroom", "kitchen"]: 114 | if option in output.lower(): 115 | return option 116 | 117 | 118 | introspector.add_publisher_preprocessor(introspection_answer, introspection_validation) 119 | 120 | # Define a semantic map using MapEncoding component 121 | layer1 = MapLayer(subscribes_to=detections_topic, temporal_change=True) 122 | layer2 = MapLayer( 123 | subscribes_to=introspection_answer, 124 | resolution_multiple=3, 125 | pre_defined=[(np.array([1.1, 2.1, 3.2]), "The door is here. 
DOOR.")], 126 | ) 127 | 128 | position = Topic(name="odom", msg_type="Odometry") 129 | map_topic = Topic(name="map", msg_type="OccupancyGrid") 130 | 131 | map_conf = MapConfig(map_name="map") 132 | map = MapEncoding( 133 | layers=[layer1, layer2], 134 | position=position, 135 | map_topic=map_topic, 136 | config=map_conf, 137 | db_client=chroma_client, 138 | trigger=15.0, 139 | component_name="map_encoder", 140 | ) 141 | 142 | # Define a generic LLM component 143 | llm_query = Topic(name="llm_query", msg_type="String") 144 | 145 | llm = LLM( 146 | inputs=[llm_query], 147 | outputs=[query_answer], 148 | model_client=llama_client, 149 | trigger=[llm_query], 150 | component_name="general_q_and_a", 151 | ) 152 | 153 | # Define a Go-to-X component using LLM 154 | goto_query = Topic(name="goto_query", msg_type="String") 155 | goal_point = Topic(name="goal_point", msg_type="PoseStamped") 156 | 157 | goto_config = LLMConfig( 158 | enable_rag=True, 159 | collection_name="map", 160 | distance_func="l2", 161 | n_results=1, 162 | add_metadata=True, 163 | ) 164 | 165 | goto = LLM( 166 | inputs=[goto_query], 167 | outputs=[goal_point], 168 | model_client=llama_client, 169 | config=goto_config, 170 | db_client=chroma_client, 171 | trigger=goto_query, 172 | component_name="go_to_x", 173 | ) 174 | 175 | goto.set_component_prompt( 176 | template="""From the given metadata, extract coordinates and provide 177 | the coordinates in the following json format:\n {"position": coordinates}""" 178 | ) 179 | 180 | 181 | # pre-process the output before publishing to a topic of msg_type PoseStamped 182 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]: 183 | # extract the json part of the output string (including brackets) 184 | # one can use sophisticated regex parsing here but we'll keep it simple 185 | json_string = output[output.find("{") : output.rfind("}") + 1] 186 | # load the string as a json and extract position coordinates 187 | # if there is an error, return None, i.e. 
no output would be published to goal_point 188 | try: 189 | json_dict = json.loads(json_string) 190 | coordinates = np.fromstring(json_dict["position"], sep=",", dtype=np.float64) 191 | print("Coordinates Extracted:", coordinates) 192 | if coordinates.shape[0] < 2 or coordinates.shape[0] > 3: 193 | return 194 | elif ( 195 | coordinates.shape[0] == 2 196 | ): # sometimes LLMs avoid adding the zeros of z-dimension 197 | coordinates = np.append(coordinates, 0) 198 | return coordinates 199 | except Exception: 200 | return 201 | 202 | 203 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point) 204 | 205 | # Define a semantic router between a generic LLM component, VQA MLLM component and Go-to-X component 206 | goto_route = Route( 207 | routes_to=goto_query, 208 | samples=[ 209 | "Go to the door", 210 | "Go to the kitchen", 211 | "Get me a glass", 212 | "Fetch a ball", 213 | "Go to hallway", 214 | ], 215 | ) 216 | 217 | llm_route = Route( 218 | routes_to=llm_query, 219 | samples=[ 220 | "What is the capital of France?", 221 | "Is there life on Mars?", 222 | "How many tablespoons in a cup?", 223 | "How are you today?", 224 | "Whats up?", 225 | ], 226 | ) 227 | 228 | mllm_route = Route( 229 | routes_to=mllm_query, 230 | samples=[ 231 | "Are we indoors or outdoors", 232 | "What do you see?", 233 | "Whats in front of you?", 234 | "Where are we", 235 | "Do you see any people?", 236 | "How many things are infront of you?", 237 | "Is this room occupied?", 238 | ], 239 | ) 240 | 241 | router_config = SemanticRouterConfig(router_name="go-to-router", distance_func="l2") 242 | # Initialize the router component 243 | router = SemanticRouter( 244 | inputs=[query_topic], 245 | routes=[llm_route, goto_route, mllm_route], 246 | default_route=llm_route, 247 | config=router_config, 248 | db_client=chroma_client, 249 | component_name="router", 250 | ) 251 | 252 | # Launch the components 253 | launcher = Launcher() 254 | launcher.add_pkg( 255 | components=[ 256 | mllm, 257 | llm, 258 | goto, 259 | introspector, 260 | map, 261 | router, 262 | speech_to_text, 263 | text_to_speech, 264 | vision, 265 | ], 266 | package_name="automatika_embodied_agents", 267 | multiprocessing=True, 268 | ) 269 | launcher.on_fail(action_name="restart") 270 | launcher.fallback_rate = 1 / 10 # 0.1 Hz or 10 seconds 271 | launcher.bringup() 272 | -------------------------------------------------------------------------------- /examples/conversational_agent_with_audio.py: -------------------------------------------------------------------------------- 1 | from agents.components import MLLM, SpeechToText, TextToSpeech 2 | from agents.config import SpeechToTextConfig, TextToSpeechConfig 3 | from agents.clients.roboml import HTTPModelClient 4 | from agents.clients.ollama import OllamaClient 5 | from agents.models import Whisper, SpeechT5, Llava 6 | from agents.ros import Topic, Launcher 7 | 8 | audio_in = Topic(name="audio0", msg_type="Audio") 9 | text_query = Topic(name="text0", msg_type="String") 10 | 11 | whisper = Whisper(name="whisper") # Custom model init params can be provided here 12 | roboml_whisper = HTTPModelClient(whisper) 13 | 14 | s2t_config = SpeechToTextConfig( 15 | enable_vad=True, # option to listen for speech through the microphone 16 | enable_wakeword=True, # option to invoke the component with a wakeword like 'hey jarvis' 17 | ) 18 | speech_to_text = SpeechToText( 19 | inputs=[audio_in], 20 | outputs=[text_query], 21 | model_client=roboml_whisper, 22 | trigger=audio_in, 23 | config=s2t_config, 24 | 
component_name="speech_to_text", 25 | ) 26 | 27 | image0 = Topic(name="image_raw", msg_type="Image") 28 | text_answer = Topic(name="text1", msg_type="String") 29 | 30 | llava = Llava(name="llava") 31 | llava_client = OllamaClient(llava) 32 | 33 | mllm = MLLM( 34 | inputs=[text_query, image0], 35 | outputs=[text_answer], 36 | model_client=llava_client, 37 | trigger=text_query, 38 | component_name="vqa", 39 | ) 40 | 41 | # config for playing audio on device 42 | t2s_config = TextToSpeechConfig(play_on_device=True) 43 | 44 | speecht5 = SpeechT5(name="speecht5") 45 | roboml_speecht5 = HTTPModelClient(speecht5) 46 | text_to_speech = TextToSpeech( 47 | inputs=[text_answer], 48 | trigger=text_answer, 49 | model_client=roboml_speecht5, 50 | config=t2s_config, 51 | component_name="text_to_speech", 52 | ) 53 | 54 | launcher = Launcher() 55 | launcher.add_pkg( 56 | components=[speech_to_text, mllm, text_to_speech], 57 | ) 58 | launcher.bringup() 59 | -------------------------------------------------------------------------------- /examples/go_to_x.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import json 3 | import numpy as np 4 | from agents.components import LLM 5 | from agents.models import Llama3_1 6 | from agents.vectordbs import ChromaDB 7 | from agents.config import LLMConfig 8 | from agents.clients.roboml import HTTPDBClient 9 | from agents.clients.ollama import OllamaClient 10 | from agents.ros import Launcher, Topic 11 | 12 | # Start a Llama3.1 based llm component using ollama client 13 | llama = Llama3_1(name="llama") 14 | llama_client = OllamaClient(llama) 15 | 16 | # Initialize a vector DB that will store our routes 17 | chroma = ChromaDB(name="MainDB") 18 | chroma_client = HTTPDBClient(db=chroma) 19 | 20 | # Define LLM input and output topics including goal_point topic of type PoseStamped 21 | goto_in = Topic(name="goto_in", msg_type="String") 22 | goal_point = Topic(name="goal_point", msg_type="PoseStamped") 23 | 24 | config = LLMConfig( 25 | enable_rag=True, 26 | collection_name="map", 27 | distance_func="l2", 28 | n_results=1, 29 | add_metadata=True, 30 | ) 31 | 32 | # initialize the component 33 | goto = LLM( 34 | inputs=[goto_in], 35 | outputs=[goal_point], 36 | model_client=llama_client, 37 | db_client=chroma_client, # check the previous example where we setup this database client 38 | trigger=goto_in, 39 | config=config, 40 | component_name="go_to_x", 41 | ) 42 | 43 | # set a component prompt 44 | goto.set_component_prompt( 45 | template="""From the given metadata, extract coordinates and provide 46 | the coordinates in the following json format:\n {"position": coordinates}""" 47 | ) 48 | 49 | 50 | # pre-process the output before publishing to a topic of msg_type PoseStamped 51 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]: 52 | # extract the json part of the output string (including brackets) 53 | # one can use sophisticated regex parsing here but we'll keep it simple 54 | json_string = output[output.find("{") : output.rfind("}") + 1] 55 | # load the string as a json and extract position coordinates 56 | # if there is an error, return None, i.e. 
no output would be published to goal_point 57 | try: 58 | json_dict = json.loads(json_string) 59 | coordinates = np.fromstring(json_dict["position"], sep=",", dtype=np.float64) 60 | print("Coordinates Extracted:", coordinates) 61 | if coordinates.shape[0] < 2 or coordinates.shape[0] > 3: 62 | return 63 | elif ( 64 | coordinates.shape[0] == 2 65 | ): # sometimes LLMs avoid adding the zeros of z-dimension 66 | coordinates = np.append(coordinates, 0) 67 | return coordinates 68 | except Exception: 69 | return 70 | 71 | 72 | # add the pre-processing function to the goal_point output topic 73 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point) 74 | 75 | # Launch the component 76 | launcher = Launcher() 77 | launcher.add_pkg(components=[goto]) 78 | launcher.bringup() 79 | -------------------------------------------------------------------------------- /examples/prompt_engineering.py: -------------------------------------------------------------------------------- 1 | from agents.components import Vision, MLLM 2 | from agents.models import VisionModel, Idefics2 3 | from agents.clients.roboml import RESPModelClient, HTTPModelClient 4 | from agents.ros import Topic, Launcher 5 | from agents.config import VisionConfig 6 | 7 | image0 = Topic(name="image_raw", msg_type="Image") 8 | detections_topic = Topic(name="detections", msg_type="Detections") 9 | 10 | object_detection = VisionModel( 11 | name="object_detection", checkpoint="dino-4scale_r50_8xb2-12e_coco" 12 | ) 13 | roboml_detection = RESPModelClient(object_detection) 14 | 15 | detection_config = VisionConfig(threshold=0.5) 16 | vision = Vision( 17 | inputs=[image0], 18 | outputs=[detections_topic], 19 | trigger=image0, 20 | config=detection_config, 21 | model_client=roboml_detection, 22 | component_name="detection_component", 23 | ) 24 | 25 | text_query = Topic(name="text0", msg_type="String") 26 | text_answer = Topic(name="text1", msg_type="String") 27 | 28 | idefics = Idefics2(name="idefics_model") 29 | idefics_client = HTTPModelClient(idefics) 30 | 31 | mllm = MLLM( 32 | inputs=[text_query, image0, detections_topic], 33 | outputs=[text_answer], 34 | model_client=idefics_client, 35 | trigger=text_query, 36 | component_name="mllm_component", 37 | ) 38 | 39 | mllm.set_component_prompt( 40 | template="""Imagine you are a robot. 41 | This image has following items: {{ detections }}. 
42 | Answer the following about this image: {{ text0 }}""" 43 | ) 44 | launcher = Launcher() 45 | launcher.add_pkg(components=[vision, mllm]) 46 | launcher.bringup() 47 | -------------------------------------------------------------------------------- /examples/semantic_map.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from agents.components import MapEncoding, Vision, MLLM 3 | from agents.models import VisionModel, Llava 4 | from agents.clients.roboml import RESPModelClient, HTTPDBClient 5 | from agents.clients.ollama import OllamaClient 6 | from agents.ros import Topic, MapLayer, Launcher, FixedInput 7 | from agents.vectordbs import ChromaDB 8 | from agents.config import MapConfig, VisionConfig 9 | 10 | # Define the image input topic 11 | image0 = Topic(name="image_raw", msg_type="Image") 12 | # Create a detection topic 13 | detections_topic = Topic(name="detections", msg_type="Detections") 14 | 15 | # Add an object detection model 16 | object_detection = VisionModel( 17 | name="object_detection", checkpoint="dino-4scale_r50_8xb2-12e_coco" 18 | ) 19 | roboml_detection = RESPModelClient(object_detection) 20 | 21 | # Initialize the Vision component 22 | detection_config = VisionConfig(threshold=0.5) 23 | vision = Vision( 24 | inputs=[image0], 25 | outputs=[detections_topic], 26 | trigger=image0, 27 | config=detection_config, 28 | model_client=roboml_detection, 29 | component_name="detection_component", 30 | ) 31 | 32 | 33 | # Define a model client (working with Ollama in this case) 34 | llava = Llava(name="llava") 35 | llava_client = OllamaClient(llava) 36 | 37 | # Define a fixed input for the component 38 | introspection_query = FixedInput( 39 | name="introspection_query", 40 | msg_type="String", 41 | fixed="What kind of a room is this? Is it an office, a bedroom or a kitchen? Give a one word answer, out of the given choices", 42 | ) 43 | # Define output of the component 44 | introspection_answer = Topic(name="introspection_answer", msg_type="String") 45 | 46 | # Start a timed (periodic) component using the mllm model defined earlier 47 | # This component answers the same question after every 15 seconds 48 | introspector = MLLM( 49 | inputs=[introspection_query, image0], # we use the image0 topic defined earlier 50 | outputs=[introspection_answer], 51 | model_client=llava_client, 52 | trigger=15.0, # we provide the time interval as a float value to the trigger parameter 53 | component_name="introspector", 54 | ) 55 | 56 | 57 | # Define an arbitrary function to validate the output of the introspective component 58 | # before publication. 
59 | def introspection_validation(output: str) -> Optional[str]: 60 | for option in ["office", "bedroom", "kitchen"]: 61 | if option in output.lower(): 62 | return option 63 | 64 | 65 | introspector.add_publisher_preprocessor(introspection_answer, introspection_validation) 66 | 67 | # Object detection output from vision component 68 | layer1 = MapLayer(subscribes_to=detections_topic, temporal_change=True) 69 | # Introspection output from mllm component 70 | layer2 = MapLayer(subscribes_to=introspection_answer, resolution_multiple=3) 71 | 72 | # Initialize mandatory topics defining the robots localization in space 73 | position = Topic(name="odom", msg_type="Odometry") 74 | map_topic = Topic(name="map", msg_type="OccupancyGrid") 75 | 76 | # Initialize a vector DB that will store our semantic map 77 | chroma = ChromaDB(name="MainDB") 78 | chroma_client = HTTPDBClient(db=chroma) 79 | 80 | # Create the map component 81 | map_conf = MapConfig(map_name="map") # We give our map a name 82 | map = MapEncoding( 83 | layers=[layer1, layer2], 84 | position=position, 85 | map_topic=map_topic, 86 | config=map_conf, 87 | db_client=chroma_client, 88 | trigger=15.0, 89 | component_name="map_encoding", 90 | ) 91 | 92 | # Launch the components 93 | launcher = Launcher() 94 | launcher.add_pkg(components=[vision, introspector, map]) 95 | launcher.bringup() 96 | -------------------------------------------------------------------------------- /examples/semantic_router.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | import json 3 | import numpy as np 4 | from agents.components import LLM, SemanticRouter 5 | from agents.models import Llama3_1 6 | from agents.vectordbs import ChromaDB 7 | from agents.config import LLMConfig, SemanticRouterConfig 8 | from agents.clients.roboml import HTTPDBClient 9 | from agents.clients.ollama import OllamaClient 10 | from agents.ros import Launcher, Topic, Route 11 | 12 | 13 | # Start a Llama3.1 based llm component using ollama client 14 | llama = Llama3_1(name="llama") 15 | llama_client = OllamaClient(llama) 16 | 17 | # Initialize a vector DB that will store our routes 18 | chroma = ChromaDB(name="MainDB") 19 | chroma_client = HTTPDBClient(db=chroma) 20 | 21 | 22 | # Make a generic LLM component using the Llama3_1 model 23 | llm_in = Topic(name="text_in_llm", msg_type="String") 24 | llm_out = Topic(name="text_out_llm", msg_type="String") 25 | 26 | llm = LLM(inputs=[llm_in], outputs=[llm_out], model_client=llama_client, trigger=llm_in) 27 | 28 | 29 | # Define LLM input and output topics including goal_point topic of type PoseStamped 30 | goto_in = Topic(name="goto_in", msg_type="String") 31 | goal_point = Topic(name="goal_point", msg_type="PoseStamped") 32 | 33 | config = LLMConfig( 34 | enable_rag=True, 35 | collection_name="map", 36 | distance_func="l2", 37 | n_results=1, 38 | add_metadata=True, 39 | ) 40 | 41 | # initialize the component 42 | goto = LLM( 43 | inputs=[goto_in], 44 | outputs=[goal_point], 45 | model_client=llama_client, 46 | db_client=chroma_client, # check the previous example where we setup this database client 47 | trigger=goto_in, 48 | config=config, 49 | component_name="go_to_x", 50 | ) 51 | 52 | # set a component prompt 53 | goto.set_component_prompt( 54 | template="""From the given metadata, extract coordinates and provide 55 | the coordinates in the following json format:\n {"position": coordinates}""" 56 | ) 57 | 58 | 59 | # pre-process the output before publishing to a topic of 
msg_type PoseStamped 60 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]: 61 | # extract the json part of the output string (including brackets) 62 | # one can use sophisticated regex parsing here but we'll keep it simple 63 | json_string = output[output.find("{") : output.find("}") + 1] 64 | 65 | # load the string as a json and extract position coordinates 66 | # if there is an error, return None, i.e. no output would be published to goal_point 67 | try: 68 | json_dict = json.loads(json_string) 69 | return np.array(json_dict["position"]) 70 | except Exception: 71 | return 72 | 73 | 74 | # add the pre-processing function to the goal_point output topic 75 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point) 76 | 77 | # Create the input topic for the router 78 | query_topic = Topic(name="question", msg_type="String") 79 | 80 | # Define a route to a topic that processes go-to-x commands 81 | goto_route = Route( 82 | routes_to=goto_in, 83 | samples=[ 84 | "Go to the door", 85 | "Go to the kitchen", 86 | "Get me a glass", 87 | "Fetch a ball", 88 | "Go to hallway", 89 | ], 90 | ) 91 | 92 | # Define a route to a topic that is input to an LLM component 93 | llm_route = Route( 94 | routes_to=llm_in, 95 | samples=[ 96 | "What is the capital of France?", 97 | "Is there life on Mars?", 98 | "How many tablespoons in a cup?", 99 | "How are you today?", 100 | "Whats up?", 101 | ], 102 | ) 103 | 104 | router_config = SemanticRouterConfig(router_name="go-to-router", distance_func="l2") 105 | # Initialize the router component 106 | router = SemanticRouter( 107 | inputs=[query_topic], 108 | routes=[llm_route, goto_route], 109 | default_route=llm_route, # If none of the routes fall within a distance threshold 110 | config=router_config, 111 | db_client=chroma_client, # reusing the db_client from the previous example 112 | component_name="router", 113 | ) 114 | 115 | # Launch the components 116 | launcher = Launcher() 117 | launcher.add_pkg(components=[llm, goto, router]) 118 | launcher.bringup() 119 | -------------------------------------------------------------------------------- /examples/tool_calling.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from agents.components import LLM 3 | from agents.models import Llama3_1 4 | from agents.vectordbs import ChromaDB 5 | from agents.config import LLMConfig 6 | from agents.clients.roboml import HTTPDBClient 7 | from agents.clients.ollama import OllamaClient 8 | from agents.ros import Launcher, Topic 9 | 10 | # Start a Llama3.1 based llm component using ollama client 11 | llama = Llama3_1(name="llama") 12 | llama_client = OllamaClient(llama) 13 | 14 | # Initialize a vector DB that will store our routes 15 | chroma = ChromaDB(name="MainDB") 16 | chroma_client = HTTPDBClient(db=chroma) 17 | 18 | # Define LLM input and output topics including goal_point topic of type PoseStamped 19 | goto_in = Topic(name="goto_in", msg_type="String") 20 | goal_point = Topic(name="goal_point", msg_type="PoseStamped") 21 | 22 | config = LLMConfig( 23 | enable_rag=True, 24 | collection_name="map", 25 | distance_func="l2", 26 | n_results=1, 27 | add_metadata=True, 28 | ) 29 | 30 | # initialize the component 31 | goto = LLM( 32 | inputs=[goto_in], 33 | outputs=[goal_point], 34 | model_client=llama_client, 35 | db_client=chroma_client, # check the previous example where we setup this database client 36 | trigger=goto_in, 37 | config=config, 38 | component_name="go_to_x", 39 | ) 40 | 41 | # 
set a component prompt 42 | goto.set_component_prompt( 43 | template="""What are the position coordinates in the given metadata?""" 44 | ) 45 | 46 | 47 | # pre-process the output before publishing to a topic of msg_type PoseStamped 48 | def get_coordinates(position: list[float]) -> np.ndarray: 49 | """Get position coordinates""" 50 | return np.array(position, dtype=float) 51 | 52 | 53 | function_description = { 54 | "type": "function", 55 | "function": { 56 | "name": "get_coordinates", 57 | "description": "Get position coordinates", 58 | "parameters": { 59 | "type": "object", 60 | "properties": { 61 | "position": { 62 | "type": "list[float]", 63 | "description": "The position coordinates in x, y and z", 64 | } 65 | }, 66 | }, 67 | "required": ["position"], 68 | }, 69 | } 70 | 71 | # add the pre-processing function to the goal_point output topic 72 | goto.register_tool( 73 | tool=get_coordinates, 74 | tool_description=function_description, 75 | send_tool_response_to_model=False, 76 | ) 77 | 78 | # Launch the component 79 | launcher = Launcher() 80 | launcher.add_pkg(components=[goto]) 81 | launcher.bringup() 82 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pytest.ini_options] 2 | minversion = "6.0" 3 | addopts = "-ra -q" 4 | log_cli = true 5 | log_cli_level = "INFO" 6 | log_cli_format="[%(levelname)s] [%(asctime)s] [%(name)s] [%(process)d-%(thread)d] %(message)s" 7 | testpaths = [ 8 | "agents/tests" 9 | ] 10 | 11 | [tool.interrogate] 12 | ignore-init-method = true 13 | ignore-init-module = true 14 | ignore-magic = false 15 | ignore-semiprivate = false 16 | ignore-private = false 17 | ignore-property-decorators = false 18 | ignore-module = true 19 | ignore-nested-functions = false 20 | ignore-nested-classes = true 21 | ignore-setters = false 22 | exclude = ["setup.py", "docs", "build", "log", "install", "agents/tests", "examples"] 23 | ignore-regex = ["^get$", "^mock_.*", ".*BaseClass.*", "^main"] 24 | quiet = false 25 | whitelist-regex = [] 26 | color = true 27 | generate-badge = "." 28 | badge-format = "svg" 29 | 30 | [tool.ruff] 31 | extend-exclude = [".mypy_cache", ".tox", ".venv", "buck-out", "build", ".pytest_cache"] 32 | fix = true 33 | line-length = 88 34 | preview = true 35 | [tool.ruff.lint] 36 | ignore = ["E203", "E266", "E501", "F403", "F401"] 37 | select = ["B","C","E","F","W","B9"] 38 | [tool.ruff.lint.mccabe] 39 | max-complexity = 11 40 | 41 | [tool.bumpver] 42 | current_version = "0.3.1" 43 | version_pattern = "MAJOR.MINOR.PATCH" 44 | commit_message = "(chore) bump version {old_version} -> {new_version}" 45 | tag_message = "{new_version}" 46 | tag_scope = "default" 47 | pre_commit_hook = "" 48 | post_commit_hook = "" 49 | commit = true 50 | tag = true 51 | push = true 52 | 53 | [tool.bumpver.file_patterns] 54 | "agents/package.xml" = [ 55 | "{version}", 56 | ] 57 | --------------------------------------------------------------------------------