├── .github
│   └── workflows
│       └── documentation.yml
├── .gitignore
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── agents
│   ├── CHANGELOG.rst
│   ├── CMakeLists.txt
│   ├── agents
│   │   ├── __init__.py
│   │   ├── callbacks.py
│   │   ├── clients
│   │   │   ├── __init__.py
│   │   │   ├── db_base.py
│   │   │   ├── model_base.py
│   │   │   ├── ollama.py
│   │   │   └── roboml.py
│   │   ├── components
│   │   │   ├── __init__.py
│   │   │   ├── component_base.py
│   │   │   ├── imagestovideo.py
│   │   │   ├── llm.py
│   │   │   ├── map_encoding.py
│   │   │   ├── mllm.py
│   │   │   ├── model_component.py
│   │   │   ├── semantic_router.py
│   │   │   ├── speechtotext.py
│   │   │   ├── texttospeech.py
│   │   │   └── vision.py
│   │   ├── config.py
│   │   ├── models.py
│   │   ├── publisher.py
│   │   ├── resources
│   │   │   ├── test.jpeg
│   │   │   └── test.wav
│   │   ├── ros.py
│   │   ├── utils
│   │   │   ├── __init__.py
│   │   │   ├── pluralize.py
│   │   │   ├── utils.py
│   │   │   └── voice.py
│   │   └── vectordbs.py
│   ├── msg
│   │   ├── Bbox2D.msg
│   │   ├── Detection2D.msg
│   │   ├── Detections2D.msg
│   │   ├── Point2D.msg
│   │   ├── Tracking.msg
│   │   ├── Trackings.msg
│   │   └── Video.msg
│   ├── package.xml
│   ├── scripts
│   │   ├── chainlit_client
│   │   │   ├── app.py
│   │   │   ├── chainlit.md
│   │   │   └── tiny_web_client
│   │   └── executable
│   └── tests
│       └── test_clients.py
├── docs
│   ├── _static
│   │   ├── ROS_AGENTS.png
│   │   ├── ROS_AGENTS_DARK.png
│   │   ├── automatika-logo.png
│   │   ├── complete_dark.png
│   │   └── complete_light.png
│   ├── basics.md
│   ├── conf.py
│   ├── examples
│   │   ├── complete.md
│   │   ├── conversational.md
│   │   ├── goto.md
│   │   ├── index.md
│   │   ├── multiprocessing.md
│   │   ├── prompt_engineering.md
│   │   ├── semantic_map.md
│   │   ├── semantic_router.md
│   │   └── tool_calling.md
│   ├── index.md
│   ├── installation.md
│   ├── intro.md
│   └── quickstart.md
├── examples
│   ├── complete_agent.py
│   ├── complete_agent_multiprocessing.py
│   ├── conversational_agent_with_audio.py
│   ├── go_to_x.py
│   ├── prompt_engineering.py
│   ├── semantic_map.py
│   ├── semantic_router.py
│   └── tool_calling.py
├── interrogate_badge.svg
└── pyproject.toml
/.github/workflows/documentation.yml:
--------------------------------------------------------------------------------
1 | name: documentation
2 |
3 | on: [push, pull_request, workflow_dispatch]
4 |
5 | permissions:
6 | contents: write
7 |
8 | jobs:
9 | docs:
10 | runs-on: ubuntu-24.04
11 | steps:
12 | - uses: actions/checkout@v4
13 | - uses: actions/setup-python@v5
14 | - name: Install dependencies
15 | run: |
16 | pip install --break-system-packages sphinx myst_parser sphinx-copybutton sphinx-autodoc2 sphinx-book-theme linkify-it-py
17 | - name: Sphinx build
18 | run: |
19 | sphinx-build docs _build
20 | - name: Deploy to GitHub Pages
21 | uses: peaceiris/actions-gh-pages@v3
22 | if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
23 | with:
24 | publish_branch: gh-pages
25 | github_token: ${{ secrets.GITHUB_TOKEN }}
26 | publish_dir: _build/
27 | force_orphan: true
28 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *,cover
47 | .hypothesis/
48 |
49 | # Translations
50 | *.mo
51 | *.pot
52 |
53 | # Django stuff:
54 | *.log
55 | local_settings.py
56 |
57 | # Flask stuff:
58 | instance/
59 | .webassets-cache
60 |
61 | # Scrapy stuff:
62 | .scrapy
63 |
64 | # Sphinx documentation
65 | docs/_build/
66 |
67 | # PyBuilder
68 | target/
69 |
70 | # Jupyter Notebook
71 | .ipynb_checkpoints
72 |
73 | # pyenv
74 | .python-version
75 |
76 | # celery beat schedule file
77 | celerybeat-schedule
78 |
79 | # dotenv
80 | .env
81 |
82 | # virtualenv
83 | .venv/
84 | venv/
85 | ENV/
86 |
87 | # Spyder project settings
88 | .spyderproject
89 |
90 | # Rope project settings
91 | .ropeproject
92 |
93 | # MAC
94 | .DS_Store
95 |
96 | # VSCode
97 | .vscode
98 |
99 | # ROS
100 | log/
101 | install/
102 | src/
103 |
104 | # custom
105 | shared/
106 | logdir/
107 | data/
108 | logs/
109 | tmp/
110 | *.csv
111 | *.h5
112 | *.npz
113 | *.zip
114 | *.ods
115 | *.xyz
116 | *.off
117 | *.obj
118 |
119 | # Ignores for web client
120 | .chainlit/
121 |
122 | # Ignores for Docs
123 | docs/Makefile
124 | docs/make.bat
125 | docs/apidocs/
126 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pre-commit/pre-commit-hooks
3 | rev: v2.3.0
4 | hooks:
5 | - id: check-yaml
6 | - id: end-of-file-fixer
7 | - id: trailing-whitespace
8 | - id: check-docstring-first
9 | - id: check-toml
10 | - repo: https://github.com/astral-sh/ruff-pre-commit
11 | # Ruff version.
12 | rev: v0.5.4
13 | hooks:
14 | # linter.
15 | - id: ruff
16 | types_or: [ python, pyi, jupyter ]
17 | # formatter.
18 | - id: ruff-format
19 | types_or: [ python, pyi, jupyter ]
20 | - repo: https://github.com/econchick/interrogate
21 | rev: 1.7.0
22 | hooks:
23 | # docstring coverage
24 | - id: interrogate
25 | args: [-vv, --fail-under=80, -c, pyproject.toml]
26 | pass_filenames: false
27 |
28 |
29 | ## Uncomment mypy for type-checking errors in pre-commit
30 |
31 | # - repo: https://github.com/pre-commit/mirrors-mypy
32 | # rev: v1.5.0
33 | # hooks:
34 | # - id: mypy
35 | # additional_dependencies: [tokenize-rt==3.2.0, 'types-PyYAML']
36 | # exclude: ^tests/
37 | # args:
38 | # [
39 | # "--ignore-missing-imports",
40 | # "--check-untyped-defs",
41 | # "--warn-redundant-casts",
42 | # "--no-implicit-optional",
43 | # "--warn-return-any"
44 | # ]
45 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Automatika Robotics
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | ROS Agents is a fully-loaded framework for creating interactive embodied agents that can understand, remember, and act upon contextual information from their environment.
8 |
9 | - **Agents in the real world:** Designed to be used with autonomous robot systems that operate in dynamic environments, specifically AMRs.
10 | - **Intuitive API**: Simple pythonic API to utilize local or cloud based ML models (specifically **Multimodal LLMs** and other **Transformer Architectures**) on robots.
11 | - **Semantic Memory**: Integrates vector databases, semantic routing and other supporting components to quickly build arbitrarily complex graphs for agentic information flow. No need to utilize bloated "GenAI" frameworks on your robot.
12 | - **Made in ROS2**: Utilizes ROS2 as the underlying middleware. In principle, any device that provides a ROS2 package can be used to send data to the ML models, as long as a callback for its message type has been implemented.
13 |
14 | Check out the [Installation Instructions](https://automatika-robotics.github.io/ros-agents/installation.html) 🛠️
15 |
16 | Get started with the [Quickstart Guide](https://automatika-robotics.github.io/ros-agents/quickstart.html) 🚀
17 |
18 | Get familiar with [Basic Concepts](https://automatika-robotics.github.io/ros-agents/basics.html) 📚
19 |
20 | Dive right in with [Examples](https://automatika-robotics.github.io/ros-agents/examples/index.html) ✨
21 |
22 | ## Installation 🛠️
23 |
24 | ### Prerequisites
25 |
26 | #### Install ROS
27 |
28 | ROS Agents is built to be used with ROS2. All ROS distributions starting from _Iron_ are supported. Install ROS2 by following the instructions on the [official site](https://docs.ros.org/en/iron/Installation.html).
29 |
30 | #### Install a model serving platform
31 |
32 | The core of ROS Agents is agnostic to model serving platforms. It currently supports [Ollama](https://ollama.com) and [RoboML](https://github.com/automatika-robotics/robo-ml). Please install either of these by following the instructions provided by the respective projects. Support for new platforms will be added continuously. If you would like to support a particular platform, please open an issue/PR.
33 |
34 | ### Install ROS Agents (Ubuntu)
35 |
36 | **Binary packages for Ubuntu will be released soon. Check this space.**
37 |
38 | ### Install ROS Agents from source
39 |
40 | #### Get Dependencies
41 |
42 | Install python dependencies
43 |
44 | ```shell
45 | pip install numpy opencv-python-headless 'attrs>=23.2.0' jinja2 httpx setproctitle msgpack msgpack-numpy numpy-quaternion platformdirs
46 | ```
47 |
48 | Download ROS Sugar
49 |
50 | ```shell
51 | git clone https://github.com/automatika-robotics/ros-sugar
52 | ```
53 |
54 | #### Install ROS Agents
55 |
56 | ```shell
57 | git clone https://github.com/automatika-robotics/ros-agents.git
58 | cd ..
59 | colcon build
60 | source install/setup.bash
61 | python your_script.py
62 | ```
63 |
64 | ## Quick Start 🚀
65 |
66 | Unlike other ROS packages, ROS Agents provides a purely Pythonic way of describing the node graph using [ROS Sugar](https://www.github.com/automatika-robotics/ros-sugar). Copy the following code into a Python script and run it.
67 |
68 | ```python
69 | from agents.clients.ollama import OllamaClient
70 | from agents.components import MLLM
71 | from agents.models import Llava
72 | from agents.ros import Topic, Launcher
73 |
74 | # Define input and output topics (pay attention to msg_type)
75 | text0 = Topic(name="text0", msg_type="String")
76 | image0 = Topic(name="image_raw", msg_type="Image")
77 | text1 = Topic(name="text1", msg_type="String")
78 |
79 | # Define a model client (working with Ollama in this case)
80 | llava = Llava(name="llava")
81 | llava_client = OllamaClient(llava)
82 |
83 | # Define an MLLM component (A component represents a node with a particular functionality)
84 | mllm = MLLM(
85 | inputs=[text0, image0],
86 | outputs=[text1],
87 | model_client=llava_client,
88 | trigger=[text0],
89 | component_name="vqa"
90 | )
91 | # Additional prompt settings
92 | mllm.set_topic_prompt(text0, template="""You are an amazing and funny robot.
93 | Answer the following about this image: {{ text0 }}"""
94 | )
95 | # Launch the component
96 | launcher = Launcher()
97 | launcher.add_pkg(components=[mllm])
98 | launcher.bringup()
99 | ```
100 |
101 | And just like that we have an agent that can answer questions like **'What do you see?'**. To interact with this agent, ROS Agents includes a tiny web client. Check out the [Quick Start Guide](https://automatika-robotics.github.io/ros-agents/quickstart.html) to learn more about how components and models work together.
102 |
103 | ## Elaborate Embodied Agents
104 | The quickstart example above is just an amuse-bouche of what is possible with ROS Agents. ROS Agents lets you create arbitrarily sophisticated component graphs, and the system can even be configured to change or reconfigure itself based on events internal or external to it. Check out the code for the following agent [here](https://automatika-robotics.github.io/ros-agents/examples/complete.html).
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 | ## Copyright
113 |
114 | The code in this distribution is Copyright (c) 2024 Automatika Robotics unless explicitly indicated otherwise.
115 |
116 | ROS Agents is made available under the MIT license. Details can be found in the [LICENSE](LICENSE) file.
117 |
118 | ## Contributions
119 |
120 | ROS Agents has been developed in collaboration between [Automatika Robotics](https://automatikarobotics.com/) and [Inria](https://inria.fr/). Contributions from the community are most welcome.
121 |
--------------------------------------------------------------------------------
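The quickstart agent above can also be exercised without the web client. The following is a minimal sketch (not part of the repository) that assumes the `Topic` names in the quickstart map directly to ROS topic names, i.e. questions are published on `text0` and answers arrive on `text1` as `std_msgs/String`:

```python
# Minimal sketch: querying the quickstart agent from a plain ROS2 node.
# Topic names ("text0" in, "text1" out) are taken from the README quickstart.
import rclpy
from rclpy.node import Node
from std_msgs.msg import String


class AgentQuery(Node):
    def __init__(self):
        super().__init__("agent_query")
        self.pub = self.create_publisher(String, "text0", 1)
        self.sub = self.create_subscription(String, "text1", self.on_answer, 1)

    def ask(self, question: str):
        # in practice, wait for discovery before publishing the first question
        self.pub.publish(String(data=question))

    def on_answer(self, msg: String):
        self.get_logger().info(f"Agent answered: {msg.data}")


def main():
    rclpy.init()
    node = AgentQuery()
    node.ask("What do you see?")
    rclpy.spin(node)


if __name__ == "__main__":
    main()
```

The bundled tiny web client remains the more convenient way to interact, especially for image and audio topics.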
/agents/CHANGELOG.rst:
--------------------------------------------------------------------------------
1 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
2 | Changelog for package automatika_embodied_agents
3 | ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
4 |
5 | 0.3.3 (2025-01-28)
6 | ------------------
7 | * (fix) Removes python dependencies from package manifest until package names merged in rosdistro
8 | * Contributors: ahr
9 |
10 | 0.3.2 (2025-01-28)
11 | ------------------
12 | * (docs) Updates docs for conversational agent and SpeechToTextConfig
13 | * (feature) Adds vad, audio features and wakeword classification classes based on local onnx models
14 | * (feature) Adds utility function for downloading models and status classes for speech processing
15 | * (feature) Adds configuration for wakeword detections in speechtotext component
16 | * (fix) Fixes error in ollama client where tool calls are received without output content
17 | * (fix) Adds a fix to map encoding where it can start with a single detections layer
18 | * (refactor) Makes component name non-optional in components to avoid name conflicts
19 | * (fix) Fixes error for long prompts when checking if prompt is a filename
20 | * (refactor) Removes pytorch as a dependency and runs VAD model with onnxruntime
21 | * (refactor) Makes warmup a property of model components that defaults to false
22 | * (feature) Adds utility method to download onnx model files
23 | * (refactor) Replaces info with debug to reduce logging spam
24 | * (fix) Fixes getting logging severity level for jazzy onwards
25 | * (fix) Adds minor improvements to branching for llm and mllm components
26 | * (chore) Cleans up dependencies for packaging
27 | * (chore) Adds dependency for sugar and removes unnecessary python dependencies from packaging
28 | * (fix) Corrects import of Topic class
29 | * (docs) Removes redefinition of Topic and corrects links to ROS Sugar
30 | * (fix) Changes topic in base component to be directly inherited from ROS Sugar for consistency across packages
31 | * (feature) Adds warmup functions to all model based components
32 | * (refactor) Removes pillow as a dependency
33 | * (refactor) Removes overrides from components and adds custom methods instead
34 | * (feature) Adds warmup to vision component for displaying stats on init
35 | * (fix) Adds fix for correct colors in cv2 visualization
36 | * (fix) Adds node name as window name for visualization in vision component
37 | * (feature) Adds cv2 based visualization option to vision component
38 | * (refactor) Reduces branching in execution step for components
39 | * (chore) Combines agents and agents_interfaces to one package
40 | * (chore) Changes deb package name
41 | * (fix) Fixes raising error in model initialization for roboml clients
42 | * (refactor) Adds passing additional agent types to ros sugar
43 | * (fix) Fixes error messages when wrong component inputs/outputs are passed
44 | * (feature) Adds support for CompressedImage msg type in components
45 | * (feature) Adds option to deploy vision models using tensorrt
46 | Works with roboml
47 | * (fix) Fixes check on sufficient topics in component validation
48 | * (fix) Fixes a bug in topic validation
49 | * (fix) Fixes validation of topics in components
50 | * (refactor) Changes handling of image messages for publication
51 | - Adds support for CompressedImage messages
52 | - Gathers image messages directly in vision component instead of getting them back from clients
53 | * (feature) Adds frame_id to trackings publisher and updates msg and callback
54 | * (feature) Adds boxes to vision tracking message
55 | * Contributors: ahr, mkabtoul
56 |
57 | 0.3.1 (2024-10-29)
58 | ------------------
59 | * (chore) bump version 0.3.0 -> 0.3.1
60 | * (feature) Adds support for using tool calling in LLM components in multiprocess execution
61 | * Contributors: ahr
62 |
63 | 0.3.0 (2024-10-28)
64 | ------------------
65 | * (chore) bump version 0.2.0 -> 0.3.0
66 | * (chore) Adds bumpver config
67 | * Merge pull request `#14 `_ from automatika-robotics/feature/external_processors
68 | Adds support for running components as separate processes
69 | * (docs) Updates docs based on ROS Sugar version update
70 | * (fix) Fixes bug in registering triggers with components
71 | * (refactor) Simplifies by adding direct serialization of clients and triggers
72 | * (refactor) Removes gratuitous logging from utils
73 | * (fix) Minor bug fixes for components to run in multiprocessing
74 | - Fixes trigger assignment for components
75 | - Handles private attributes of attrs classes
76 | - Fixes component and config init in common executable
77 | * (fix) Fixes serializing log level in clients
78 | * (fix) Fixes minor bugs in utils, components, configs and models
79 | * (feature) Adds support for running components in multiple processes
80 | - Adds common executable to the package for ROS Sugar launcher
81 | - Refactors components to be serializable
82 | - Adds serialization to clients
83 | - Minor type hint changes for compatibility with older versions of ROS
84 | * (fix) Adds the correct check for external processors given new ros-sugar implementation
85 | * Contributors: ahr
86 |
87 | 0.2.0 (2024-09-28)
88 | ------------------
89 | * (chore) Bump up the version
90 | * Merge pull request `#13 `_ from automatika-robotics/feature/better_clients
91 | Adds enhanced functionality in clients specifically for LLM and MLLM components
92 | * (feature) Adds tool calling for LLM component using the OllamaClient
93 | * (fix) Fixes rag results in templated inputs to LLMs which do not contain input
94 | * (refactor) Makes named models subclasses of TransformersLLM and TransformersMLLM for easier handling in roboml client
95 | * (fix) Fixes key error in ollama client response retrieval
96 | * (fix) Adds flag for chat history for chat history reset and fixes logging
97 | * (feature) Adds TransformersLLM and TransformersMLLM models for roboml clients
98 | * (fix) Removes history reset phrase from model definitions and add system prompt for LLMs and derivates
99 | * (refactor) Changes model component to have execution step as an abstract method implemented by child components
100 | * (fix) Changes ollama client inference call to use chat endpoint
101 | * (feature) Adds chat history management to llm and mllm components
102 | * (docs) Clarifies handling of RAG results for llm component
103 | * (fix) Fixes bug in rag result handling for llm component
104 | * (fix) Removes default init_timeout from models
105 | * (refactor) Moves roboml resp client dependencies inside the client initialization
106 | * (fix) Explicitly exposes QoSConfig in ros module
107 | * (refactor) Replaces map_meta_data parameter with map_topic for MapEncoding component
108 | * (refactor) Removes direct dependency on pypdf
109 | * (fix) Changes map meta data topic to type OccupancyGrid
110 | * (feature) Adds audio options to chainlit client
111 | * (fix) Removes unused imports
112 | * (fix) Fixes the initialization of map encoding and semantic router components
113 | * (refactor) Fixes imports and refactors code according to latest version of ROS sugar
114 | * (fix) Fixes passing the config in components to parent base component
115 | * (fix) Fixes ROS sugar import for BaseTopic
116 | * (refactor) Removes auto_ros as a dependency
117 | * (feature) Adds init_on_activation flag to all implemented clients
118 | * (feature) Separates abstract methods from callable methods in db client base
119 | * (feature) Separates callable methods from abstract methods in client base class
120 | * Contributors: ahr
121 |
122 | 0.1.1 (2024-09-05)
123 | ------------------
124 | * (feature) Adds component action for adding points to map collection (`#12 `_)
125 | * Makes version compliant with ROS convention
126 | * (chore) Adds license declaration in setup.py
127 | * Bumps version number and adds license information
128 | * Initial release 0.1.1a
129 | * Contributors: ahr, mkabtoul
130 |
--------------------------------------------------------------------------------
/agents/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 3.8)
2 | project(automatika_embodied_agents)
3 |
4 | if(CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
5 | add_compile_options(-Wall -Wextra -Wpedantic)
6 | endif()
7 |
8 | # find dependencies
9 | find_package(ament_cmake REQUIRED)
10 | find_package(ament_cmake_python REQUIRED)
11 |
12 | find_package(rclcpp REQUIRED)
13 | find_package(rclpy REQUIRED)
14 | find_package(rosidl_default_generators REQUIRED)
15 | find_package(builtin_interfaces REQUIRED)
16 | find_package(std_msgs REQUIRED)
17 | find_package(sensor_msgs REQUIRED)
18 |
19 | file(GLOB_RECURSE MSG_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "msg/*.msg" )
20 |
21 | rosidl_generate_interfaces(${PROJECT_NAME}
22 | ${MSG_FILES}
23 | DEPENDENCIES builtin_interfaces std_msgs sensor_msgs
24 | )
25 |
26 | ament_export_dependencies(rosidl_default_runtime)
27 |
28 | # Install Python module
29 | ament_python_install_package(agents)
30 | # Add executables
31 | install(PROGRAMS
32 | scripts/executable
33 | scripts/chainlit_client/tiny_web_client
34 | scripts/chainlit_client/app.py # chainlit app definition
35 | scripts/chainlit_client/chainlit.md # readme picked by chainlit client
36 | DESTINATION lib/${PROJECT_NAME}
37 | )
38 |
39 | ament_package()
40 |
--------------------------------------------------------------------------------
/agents/agents/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/agents/agents/__init__.py
--------------------------------------------------------------------------------
/agents/agents/callbacks.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | import os
3 | import cv2
4 | import numpy as np
5 | from ros_sugar.io import (
6 | GenericCallback,
7 | TextCallback,
8 | get_logger,
9 | )
10 |
11 | from ros_sugar.io.utils import image_pre_processing, read_compressed_image
12 |
13 | from .utils import create_detection_context
14 |
15 | __all__ = ["GenericCallback", "TextCallback"]
16 |
17 |
18 | class VideoCallback(GenericCallback):
19 | """
20 | Video Callback class. Its get method saves a video as list of bytes
21 | """
22 |
23 | def __init__(self, input_topic, node_name: Optional[str] = None) -> None:
24 | """
25 | Constructs a new instance.
26 | :param input_topic: Subscription topic
27 | :type input_topic: Input
28 | """
29 | super().__init__(input_topic, node_name)
30 | # fixed video needs to be a path to cv2 readable video
31 | if hasattr(input_topic, "fixed"):
32 | if os.path.isfile(input_topic.fixed):
33 | try:
34 | # read all video frames
35 | video = []
36 | cap = cv2.VideoCapture(input_topic.fixed)
37 | if not cap.isOpened():
38 | raise TypeError()
39 | while cap.isOpened():
40 | ret, frame = cap.read()
41 | if ret:
42 | video.append(frame)
43 | else:
44 | break
45 | # Convert frame list to ndarray
46 | self.msg = np.array(video)
47 | except Exception:
48 | get_logger(self.node_name).error(
49 | f"Fixed path {self.msg} provided for Vidoe topic is not readable Video file"
50 | )
51 | else:
52 | get_logger(self.node_name).error(
53 | f"Fixed path {self.msg} provided for Video topic is not a valid file path"
54 | )
55 |
56 | def _get_output(self, **_) -> Optional[np.ndarray]:
57 | """
58 | Gets video as a numpy array.
59 | :returns: Video as nd_array
60 | :rtype: np.ndarray
61 | """
62 | if not self.msg:
63 | return None
64 |
65 | # return np.ndarray if fixed video has been read
66 | if isinstance(self.msg, np.ndarray):
67 | return self.msg
68 | else:
69 | # pre-process in case of weird encodings and reshape ROS topic
70 | video = []
71 | for img in self.msg.frames:
72 | video.append(image_pre_processing(img))
73 | for img in self.msg.compressed_frames:
74 | video.append(read_compressed_image(img))
75 | return np.array(video)
76 |
77 |
78 | class ObjectDetectionCallback(GenericCallback):
79 | """
80 | Object detection Callback class.
81 | Its get method returns the bounding box data
82 | """
83 |
84 | def __init__(self, input_topic, node_name: Optional[str] = None) -> None:
85 | """
86 | Constructs a new instance.
87 |
88 | :param input_topic: Subscription topic
89 | :type input_topic: str
90 | """
91 | super().__init__(input_topic, node_name)
92 | self.msg = input_topic.fixed if hasattr(input_topic, "fixed") else None
93 |
94 | def _get_output(self, **_) -> Optional[str]:
95 | """
96 | Processes labels and returns a context string for
97 | prompt engineering
98 |
99 | :returns: Comma separated classnames
100 | :rtype: str
101 | """
102 | if not self.msg:
103 | return None
104 | # send fixed list of labels if it exists
105 | if isinstance(self.msg, list):
106 | return create_detection_context(self.msg)
107 | # send labels from ROS message
108 | else:
109 | label_list = [
110 | label for detection in self.msg.detections for label in detection.labels
111 | ]
112 | detections_string = create_detection_context(label_list)
113 | return detections_string
114 |
--------------------------------------------------------------------------------
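For context, `VideoCallback` above accepts a fixed input whose `fixed` attribute must be a path to a cv2-readable video file. The snippet below is a hypothetical sketch of wiring such a fixed input; `FixedInput` is exported from `agents.ros` (see `component_base.py`), but the exact constructor parameters shown here are assumptions made for illustration, by analogy with how `Topic` is used in the README.

```python
# Hypothetical sketch (parameter names are assumptions, not the documented API):
# a FixedInput stands in for a live subscription, and its `fixed` value is what
# the callback returns. For VideoCallback, `fixed` should be a cv2-readable path.
from agents.ros import FixedInput

fixed_video = FixedInput(name="video0", msg_type="Video", fixed="/path/to/clip.mp4")
```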
/agents/agents/clients/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Clients are standard interfaces for components to interact with ML models or vector DBs served by various platforms. Currently ROS Agents provides the following clients, which cover the most popular open source model deployment platforms. Simple clients can be easily implemented for other platforms and the use of heavy duct-tape "AI" frameworks on the robot is discouraged 😅.
3 |
4 | ```{note}
5 | Some clients might need additional dependencies, which are listed in the following table. If they are missing, the user will also be prompted for them at runtime.
6 | ```
7 |
8 | ```{list-table}
9 | :widths: 20 20 60
10 | :header-rows: 1
11 | * - Platform
12 | - Client
13 | - Description
14 |
15 | * - **RoboML**
16 | - [HTTPModelClient](agents.clients.roboml.HTTPModelClient)
17 | - An HTTP client for interaction with ML models served on RoboML.
18 |
19 | * - **RoboML**
20 | - [HTTPDBClient](agents.clients.roboml.HTTPDBClient)
21 | - An HTTP client for interaction with vector DBs served on RoboML.
22 |
23 | * - **RoboML**
24 | - [RESPModelClient](agents.clients.roboml.RESPModelClient)
25 | - A Redis Serialization Protocol (RESP) based client for interaction with ML models served on RoboML. **Note:** In order to use this client, please install dependencies with `pip install redis[hiredis] msgpack msgpack-numpy`
26 |
27 | * - **RoboML**
28 | - [RESPDBClient](agents.clients.roboml.RESPDBClient)
29 | - A Redis Serialization Protocol (RESP) based client for interaction with vector DBs served on RoboML. **Note:** In order to use this client, please install dependencies with `pip install redis[hiredis] msgpack msgpack-numpy`
30 |
31 | * - **Ollama**
32 | - [OllamaClient](agents.clients.ollama.OllamaClient)
33 | - An HTTP client for interaction with ML models served on Ollama. **Note:** In order to use this client, please install dependencies with `pip install ollama`
34 | ```
35 | """
36 |
37 | from .ollama import OllamaClient
38 | from .roboml import HTTPDBClient, HTTPModelClient, RESPDBClient, RESPModelClient
39 |
40 |
41 | __all__ = [
42 | "OllamaClient",
43 | "HTTPDBClient",
44 | "HTTPModelClient",
45 | "RESPDBClient",
46 | "RESPModelClient",
47 | ]
48 |
--------------------------------------------------------------------------------
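As a quick illustration of the table above, here is a hedged sketch of constructing clients. The `OllamaClient` arguments match its definition in `ollama.py`; the RoboML `HTTPModelClient` is assumed to follow the same base `ModelClient` signature, and the host/port values are placeholders.

```python
from agents.models import Llava
from agents.clients.ollama import OllamaClient
from agents.clients.roboml import HTTPModelClient

# model specification shared by both sketches (Llava, as in the README quickstart)
llava = Llava(name="llava")

# Ollama served locally on its default port
ollama_client = OllamaClient(llava, host="127.0.0.1", port=11434)

# RoboML served on another machine (constructor assumed to mirror ModelClient)
roboml_client = HTTPModelClient(llava, host="192.168.1.10", port=8000)
```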
/agents/agents/clients/db_base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Any, Optional, Dict, Union
3 |
4 | from rclpy import logging
5 |
6 | from ..vectordbs import DB
7 | from ..utils import validate_func_args
8 |
9 |
10 | class DBClient(ABC):
11 | """DBClient."""
12 |
13 | @validate_func_args
14 | def __init__(
15 | self,
16 | db: Union[DB, Dict],
17 | host: Optional[str] = None,
18 | port: Optional[int] = None,
19 | response_timeout: int = 30,
20 | init_on_activation: bool = True,
21 | logging_level: str = "info",
22 | **_,
23 | ):
24 | """__init__.
25 | :param db:
26 | :type db: DB
27 | :param host:
28 | :type host: Optional[str]
29 | :param port:
30 | :type port: Optional[int]
31 | :param init_on_activation:
32 | :type init_on_activation: bool
33 | :param logging_level:
34 | :type logging_level: str
35 | """
36 | if isinstance(db, DB):
37 | self.db_type = db.__class__.__name__
38 | self.db_name = db.name
39 | self.init_timeout = db.init_timeout
40 | self.db_init_params = db._get_init_params()
41 |
42 | else:
43 | self.db_type = db["db_type"]
44 | self.db_name = db["db_name"]
45 | self.init_timeout = db["init_timeout"]
46 | self.db_init_params = db["db_init_params"]
47 |
48 | self.host = host
49 | self.port = port
50 | self.init_on_activation = init_on_activation
51 | self.logger = logging.get_logger(self.db_name)
52 | logging.set_logger_level(
53 | self.db_name, logging.get_logging_severity_from_string(logging_level)
54 | )
55 | self.response_timeout = response_timeout
56 |
57 | def serialize(self) -> Dict:
58 | """Get client json
59 | :rtype: Dict
60 | """
61 | db = {
62 | "db_name": self.db_name,
63 | "db_type": self.db_type,
64 | "init_timeout": self.init_timeout,
65 | "db_init_params": self.db_init_params,
66 | }
67 |
68 | return {
69 | "client_type": self.__class__.__name__,
70 | "db": db,
71 | "host": self.host,
72 | "port": self.port,
73 | "init_on_activation": self.init_on_activation,
74 | "logging_level": self.logger.get_effective_level().name,
75 | "response_timeout": self.response_timeout,
76 | }
77 |
78 | def check_connection(self) -> None:
79 | """initialize.
80 | :rtype: None
81 | """
82 | self._check_connection()
83 |
84 | def initialize(self) -> None:
85 | """initialize.
86 | :rtype: None
87 | """
88 | if self.init_on_activation:
89 | self._initialize()
90 |
91 | def add(self, db_input: Dict[str, Any]) -> Optional[Dict]:
92 | """add data.
93 | :param db_input:
94 | :type db_input: dict[str, Any]
95 | :rtype: dict | None
96 | """
97 | return self._add(db_input)
98 |
99 | def conditional_add(self, db_input: Dict[str, Any]) -> Optional[Dict]:
100 | """add data if given ids dont exist. Update metadatas of the ids that exist
101 | :param db_input:
102 | :type db_input: dict[str, Any]
103 | :rtype: dict | None
104 | """
105 | return self._conditional_add(db_input)
106 |
107 | def metadata_query(self, db_input: Dict[str, Any]) -> Optional[Dict]:
108 | """Query based on given metadata.
109 | :param db_input:
110 | :type db_input: dict[str, Any]
111 | :rtype: dict | None
112 | """
113 | return self._metadata_query(db_input)
114 |
115 | def query(self, db_input: Dict[str, Any]) -> Optional[Dict]:
116 | """Query based on query string.
117 | :param db_input:
118 | :type db_input: dict[str, Any]
119 | :rtype: dict | None
120 | """
121 | return self._query(db_input)
122 |
123 | def deinitialize(self) -> None:
124 | """deinitialize."""
125 | # TODO: Add check for db initialization by keeping db
126 | # state in client
127 | if self.init_on_activation:
128 | self._deinitialize()
129 |
130 | @abstractmethod
131 | def _check_connection(self) -> None:
132 | """check_connection.
133 | :rtype: None
134 | """
135 | raise NotImplementedError(
136 | "This method needs to be implemented in a child class"
137 | )
138 |
139 | @abstractmethod
140 | def _initialize(self) -> None:
141 | """initialize.
142 | :rtype: None
143 | """
144 | raise NotImplementedError(
145 | "This method needs to be implemented in a child class"
146 | )
147 |
148 | @abstractmethod
149 | def _add(self, db_input: Dict[str, Any]) -> Optional[Dict]:
150 | """add data.
151 | :param db_input:
152 | :type db_input: dict[str, Any]
153 | :rtype: dict | None
154 | """
155 | raise NotImplementedError(
156 | "This method needs to be implemented in a child class"
157 | )
158 |
159 | @abstractmethod
160 | def _conditional_add(self, db_input: Dict[str, Any]) -> Optional[Dict]:
161 | """add data if given ids dont exist. Update metadatas of the ids that exist
162 | :param db_input:
163 | :type db_input: dict[str, Any]
164 | :rtype: dict | None
165 | """
166 | raise NotImplementedError(
167 | "This method needs to be implemented in a child class"
168 | )
169 |
170 | @abstractmethod
171 | def _metadata_query(self, db_input: Dict[str, Any]) -> Optional[Dict]:
172 | """Query based on given metadata.
173 | :param db_input:
174 | :type db_input: dict[str, Any]
175 | :rtype: dict | None
176 | """
177 | raise NotImplementedError(
178 | "This method needs to be implemented in a child class"
179 | )
180 |
181 | @abstractmethod
182 | def _query(self, db_input: Dict[str, Any]) -> Optional[Dict]:
183 | """Query based on query string.
184 | :param db_input:
185 | :type db_input: dict[str, Any]
186 | :rtype: dict | None
187 | """
188 | raise NotImplementedError(
189 | "This method needs to be implemented in a child class"
190 | )
191 |
192 | @abstractmethod
193 | def _deinitialize(self) -> None:
194 | """deinitialize."""
195 | raise NotImplementedError(
196 | "This method needs to be implemented in a child class"
197 | )
198 |
--------------------------------------------------------------------------------
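To make the abstract interface above concrete, the following is a minimal sketch (not part of the package) of a child class backed by a plain Python dict. It only illustrates which hooks a subclass must implement; the structure assumed for `db_input` (parallel `ids`/`documents` lists and a `query` string) is an illustration, not the documented schema, and instantiating the class would still require a `DB` specification from `agents.vectordbs` (or the equivalent dict).

```python
from typing import Any, Dict, Optional

from agents.clients.db_base import DBClient


class InMemoryDBClient(DBClient):
    """Toy DBClient that keeps documents in a Python dict."""

    def _check_connection(self) -> None:
        pass  # nothing to connect to

    def _initialize(self) -> None:
        self._store: Dict[str, str] = {}

    def _add(self, db_input: Dict[str, Any]) -> Optional[Dict]:
        # assumed schema: parallel lists of ids and documents
        self._store.update(zip(db_input["ids"], db_input["documents"]))
        return {"added": len(db_input["ids"])}

    def _conditional_add(self, db_input: Dict[str, Any]) -> Optional[Dict]:
        new = {
            i: d
            for i, d in zip(db_input["ids"], db_input["documents"])
            if i not in self._store
        }
        self._store.update(new)
        return {"added": len(new)}

    def _metadata_query(self, db_input: Dict[str, Any]) -> Optional[Dict]:
        return None  # metadata is not modelled in this toy example

    def _query(self, db_input: Dict[str, Any]) -> Optional[Dict]:
        # naive substring match instead of a real vector search
        hits = [d for d in self._store.values() if db_input["query"] in d]
        return {"output": hits}

    def _deinitialize(self) -> None:
        self._store.clear()
```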
/agents/agents/clients/model_base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Any, Optional, Dict, Union
3 |
4 | from rclpy import logging
5 |
6 | from ..models import Model
7 | from ..utils import validate_func_args
8 |
9 |
10 | class ModelClient(ABC):
11 | """MLClient."""
12 |
13 | @validate_func_args
14 | def __init__(
15 | self,
16 | model: Union[Model, Dict],
17 | host: Optional[str] = None,
18 | port: Optional[int] = None,
19 | inference_timeout: int = 30,
20 | init_on_activation: bool = True,
21 | logging_level: str = "info",
22 | **_,
23 | ):
24 | """__init__.
25 | :param model:
26 | :type model: Model
27 | :param host:
28 | :type host: Optional[str]
29 | :param port:
30 | :type port: Optional[int]
31 | :param inference_timeout:
32 | :type inference_timeout: int
33 | :param logging_level:
34 | :type logging_level: str
35 | """
36 | if isinstance(model, Model):
37 | self._model = model
38 | self.model_type = model.__class__.__name__
39 | self.model_name = model.name
40 | self.init_timeout = model.init_timeout
41 | self.model_init_params = model._get_init_params()
42 |
43 | else:
44 | self.model_type = model["model_type"]
45 | self.model_name = model["model_name"]
46 | self.init_timeout = model["init_timeout"]
47 | self.model_init_params = model["model_init_params"]
48 |
49 | self.host = host
50 | self.port = port
51 | self.init_on_activation = init_on_activation
52 | self.logger = logging.get_logger(self.model_name)
53 | logging.set_logger_level(
54 | self.model_name, logging.get_logging_severity_from_string(logging_level)
55 | )
56 | self.inference_timeout = inference_timeout
57 |
58 | def serialize(self) -> Dict:
59 | """Get client json
60 | :rtype: Dict
61 | """
62 | model = {
63 | "model_name": self.model_name,
64 | "model_type": self.model_type,
65 | "init_timeout": self.init_timeout,
66 | "model_init_params": self.model_init_params,
67 | }
68 |
69 | return {
70 | "client_type": self.__class__.__name__,
71 | "model": model,
72 | "host": self.host,
73 | "port": self.port,
74 | "init_on_activation": self.init_on_activation,
75 | "logging_level": self.logger.get_effective_level().name,
76 | "inference_timeout": self.inference_timeout,
77 | }
78 |
79 | def check_connection(self) -> None:
80 | """initialize.
81 | :rtype: None
82 | """
83 | self._check_connection()
84 |
85 | def initialize(self) -> None:
86 | """initialize.
87 | :rtype: None
88 | """
89 | if self.init_on_activation:
90 | self._initialize()
91 |
92 | def inference(self, inference_input: Dict[str, Any]) -> Optional[Dict]:
93 | """inference.
94 | :param inference_input:
95 | :type inference_input: dict[str, Any]
96 | :rtype: dict | None
97 | """
98 | return self._inference(inference_input)
99 |
100 | def deinitialize(self):
101 | """deinitialize."""
102 | # TODO: Add check for model initialization by keeping model
103 | # state in client
104 | if self.init_on_activation:
105 | self._deinitialize()
106 |
107 | @abstractmethod
108 | def _check_connection(self) -> None:
109 | """check_connection.
110 | :rtype: None
111 | """
112 | raise NotImplementedError(
113 | "This method needs to be implemented in a child class"
114 | )
115 |
116 | @abstractmethod
117 | def _initialize(self) -> None:
118 | """initialize.
119 | :rtype: None
120 | """
121 | raise NotImplementedError(
122 | "This method needs to be implemented in a child class"
123 | )
124 |
125 | @abstractmethod
126 | def _inference(self, inference_input: Dict[str, Any]) -> Optional[Dict]:
127 | """inference.
128 | :param inference_input:
129 | :type inference_input: dict[str, Any]
130 | :rtype: dict | None
131 | """
132 | raise NotImplementedError(
133 | "This method needs to be implemented in a child class"
134 | )
135 |
136 | @abstractmethod
137 | def _deinitialize(self):
138 | """deinitialize."""
139 | raise NotImplementedError(
140 | "This method needs to be implemented in a child class"
141 | )
142 |
--------------------------------------------------------------------------------
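The `serialize()` method above emits exactly the dict that the constructor accepts in place of a `Model` instance, which is presumably what lets clients be reconstructed in a separate process (see the multiprocessing entries in the CHANGELOG). A hedged round-trip sketch, using the `OllamaClient` defined in the next file and the `Llava` model from the README; running it requires a reachable Ollama server:

```python
# Sketch: round-tripping a client through its serialized form. The keys used
# here are those produced by ModelClient.serialize() above.
from agents.models import Llava
from agents.clients.ollama import OllamaClient

client = OllamaClient(Llava(name="llava"), host="127.0.0.1", port=11434)
payload = client.serialize()

# ...payload can be passed across a process boundary as plain data...

restored = OllamaClient(
    model=payload["model"],
    host=payload["host"],
    port=payload["port"],
    inference_timeout=payload["inference_timeout"],
    init_on_activation=payload["init_on_activation"],
    logging_level=payload["logging_level"],
)
```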
/agents/agents/clients/ollama.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional, Dict, Union
2 |
3 | import httpx
4 |
5 | from ..models import LLM
6 | from ..utils import encode_arr_base64
7 | from .model_base import ModelClient
8 |
9 | __all__ = ["OllamaClient"]
10 |
11 |
12 | class OllamaClient(ModelClient):
13 | """An HTTP client for interaction with ML models served on ollama"""
14 |
15 | def __init__(
16 | self,
17 | model: Union[LLM, Dict],
18 | host: str = "127.0.0.1",
19 | port: int = 11434,
20 | inference_timeout: int = 30,
21 | init_on_activation: bool = True,
22 | logging_level: str = "info",
23 | **kwargs,
24 | ):
25 | if isinstance(model, LLM):
26 | model._set_ollama_checkpoint()
27 | try:
28 | from ollama import Client
29 |
30 | self.client = Client(host=f"{host}:{port}")
31 | except ModuleNotFoundError as e:
32 | raise ModuleNotFoundError(
33 | "In order to use the OllamaClient, you need ollama-python package installed. You can install it with 'pip install ollama'"
34 | ) from e
35 | super().__init__(
36 | model=model,
37 | host=host,
38 | port=port,
39 | inference_timeout=inference_timeout,
40 | init_on_activation=init_on_activation,
41 | logging_level=logging_level,
42 | **kwargs,
43 | )
44 | self._check_connection()
45 |
46 | def _check_connection(self) -> None:
47 | """Check if the platfrom is being served on specified IP and port"""
48 | # Ping remote server to check connection
49 | self.logger.info("Checking connection with remote_host Ollama")
50 | try:
51 | httpx.get(f"http://{self.host}:{self.port}").raise_for_status()
52 | except Exception as e:
53 | self.logger.error(str(e))
54 | raise
55 |
56 | def _initialize(self) -> None:
57 | """
58 | Initialize the model on the platform using the parameters provided in the model specification class
59 | """
60 | self.logger.info(f"Initializing {self.model_name} on ollama")
61 | try:
62 | # set timeout on underlying httpx client
63 | self.client._client.timeout = self.init_timeout
64 | r = self.client.pull(self.model_init_params["checkpoint"])
65 | if r.get("status") != "success": # type: ignore
66 | raise Exception(
67 | f"Could not pull model {self.model_init_params['checkpoint']}"
68 | )
69 | # load model in memory with empty request
70 | self.client.generate(
71 | model=self.model_init_params["checkpoint"], keep_alive=10
72 | )
73 | self.logger.info(f"{self.model_name} model initialized")
74 | except Exception as e:
75 | self.logger.error(str(e))
76 | return None
77 |
78 | def _inference(self, inference_input: Dict[str, Any]) -> Optional[Dict]:
79 | """Call inference on the model using data and inference parameters from the component"""
80 | if not (query := inference_input.get("query")):
81 | raise TypeError(
82 | "OllamaClient can only be used with LLM and MLLM components"
83 | )
84 | # create input
85 | input = {
86 | "model": self.model_init_params["checkpoint"],
87 | "messages": query,
88 | }
89 | inference_input.pop("query")
90 |
91 | # make images part of the latest message in message list
92 | if images := inference_input.get("images"):
93 | input["messages"][-1]["images"] = [encode_arr_base64(img) for img in images]
94 | inference_input.pop("images")
95 |
96 | # Add tools as part of input, if available
97 | if tools := inference_input.get("tools"):
98 | input["tools"] = tools
99 | inference_input.pop("tools")
100 |
101 | # ollama uses num_predict for max_new_tokens
102 | if inference_input.get("max_new_tokens"):
103 | inference_input["num_predict"] = inference_input["max_new_tokens"]
104 | inference_input.pop("max_new_tokens")
105 | input["options"] = inference_input
106 |
107 | # call inference method
108 | try:
109 | # set timeout on underlying httpx client
110 | self.client._client.timeout = self.inference_timeout
111 | ollama_result = self.client.chat(**input)
112 | except Exception as e:
113 | self.logger.error(str(e))
114 | return None
115 |
116 | self.logger.debug(str(ollama_result))
117 |
118 | # make result part of the input
119 | if output := ollama_result["message"].get("content"):
120 | input["output"] = output # type: ignore
121 | # if tool calls exist
122 | if tool_calls := ollama_result["message"].get("tool_calls"): # type: ignore
123 | input["tool_calls"] = tool_calls
124 | return input
125 | else:
126 | # if tool calls exist
127 | if tool_calls := ollama_result["message"].get("tool_calls"): # type: ignore
128 | input["output"] = "" # Add empty output for tool calls
129 | input["tool_calls"] = tool_calls
130 | return input
131 |
132 | # no output or tool calls
133 | self.logger.debug("Output not received")
134 | return
135 |
136 | def _deinitialize(self):
137 | """Deinitialize the model on the platform"""
138 |
139 | self.logger.error(f"Deinitializing {self.model_name} model on ollama")
140 | try:
141 | self.client.generate(
142 | model=self.model_init_params["checkpoint"], keep_alive=0
143 | )
144 | except Exception as e:
145 | self.logger.error(str(e))
146 | return None
147 |
--------------------------------------------------------------------------------
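Components normally call `inference()` internally, but the shape of the input dict can be read off `_inference()` above: `"query"` is an Ollama-style chat message list, `"max_new_tokens"` is remapped to `num_predict`, and any remaining keys are passed through as Ollama options. A hedged usage sketch (the option values are illustrative and a running Ollama server is assumed):

```python
# Sketch of a direct inference() call against the OllamaClient defined above.
from agents.models import Llava
from agents.clients.ollama import OllamaClient

client = OllamaClient(Llava(name="llava"), host="127.0.0.1", port=11434)
client.initialize()  # pulls the checkpoint and loads it into memory

result = client.inference({
    "query": [{"role": "user", "content": "Describe what a mobile robot is."}],
    "max_new_tokens": 128,   # remapped to Ollama's num_predict
    "temperature": 0.7,      # forwarded as an Ollama option
})
if result:
    print(result["output"])
```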
/agents/agents/components/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | A Component is the main execution unit in ROS Agents and in essence each component is syntactic sugar over a ROS2 Lifecycle Node. ROS Agents provides the following components. These components can be arbitrarily combined to form an embodied agent graph.
3 |
4 | ```{list-table}
5 | :widths: 20 80
6 | :header-rows: 1
7 | * - Component Name
8 | - Description
9 |
10 | * - **[LLM](agents.components.llm.md)**
11 | - This component utilizes large language models (e.g. Llama) that can be used to process text data.
12 |
13 | * - **[MLLM](agents.components.mllm.md)**
14 | - This component utilizes multi-modal large language models (e.g. Llava) that can be used to process text and image data.
15 |
16 | * - **[SpeechToText](agents.components.speechtotext.md)**
17 | - This component takes in audio input and outputs a text representation of the audio using Speech-to-Text models (e.g. Whisper).
18 |
19 | * - **[TextToSpeech](agents.components.texttospeech.md)**
20 | - This component takes in text input and outputs an audio representation of the text using TTS models (e.g. SpeechT5). The generated audio can be played using any audio playback device available on the agent.
21 |
22 | * - **[MapEncoding](agents.components.map_encoding.md)**
23 | - Map encoding component that encodes text information as a semantic map based on the robot's localization. It takes in map layers, a position topic, a map meta data topic, and a vector database client. Map layers can be arbitrary text-based outputs from other components such as MLLMs or Vision.
24 |
25 | * - **[SemanticRouter](agents.components.semantic_router.md)**
26 | - A component that routes semantic information from input topics to output topics based on pre-defined routes. The Semantic Router takes in a list of input topics, a list of routes, an optional default route, and a configuration object. It uses the database client to store and retrieve routing information.
27 |
28 | * - **[Vision](agents.components.vision.md)**
29 | - This component performs object detection and tracking on input images and outputs a list of detected objects, along with their bounding boxes and confidence scores.
30 |
31 | * - **[VideoMessageMaker](agents.components.imagestovideo.md)**
32 | - This component generates ROS video messages from input image messages. A video message is a collection of image messages that have a perceivable motion. I.e. the primary task of this component is to make intentionality decisions about what sequence of consecutive images should be treated as one coherent temporal sequence. The motion estimation method used for selecting images for a video can be configured in component config.
33 | ```
34 | """
35 |
36 | from .component_base import Component
37 | from .imagestovideo import VideoMessageMaker
38 | from .llm import LLM
39 | from .map_encoding import MapEncoding
40 | from .mllm import MLLM
41 | from .model_component import ModelComponent
42 | from .semantic_router import SemanticRouter
43 | from .speechtotext import SpeechToText
44 | from .texttospeech import TextToSpeech
45 | from .vision import Vision
46 |
47 | __all__ = [
48 | "Component",
49 | "ModelComponent",
50 | "MapEncoding",
51 | "MLLM",
52 | "LLM",
53 | "SpeechToText",
54 | "TextToSpeech",
55 | "Vision",
56 | "VideoMessageMaker",
57 | "SemanticRouter",
58 | ]
59 |
--------------------------------------------------------------------------------
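The docstring above notes that components can be combined into arbitrary graphs. Here is a hedged sketch of such a chain: the `MLLM` arguments follow the README quickstart, while the `LLM` component is assumed to accept the same `inputs`/`outputs`/`model_client`/`trigger`/`component_name` arguments; sharing one Llava client between both components is only to keep the sketch short, and a dedicated LLM model/client would normally be defined for the second component.

```python
# Sketch: chaining two components so the MLLM's output topic feeds the LLM.
from agents.clients.ollama import OllamaClient
from agents.components import MLLM, LLM
from agents.models import Llava
from agents.ros import Topic, Launcher

# topics forming the graph: question + image -> scene description -> answer
question = Topic(name="question", msg_type="String")
image0 = Topic(name="image_raw", msg_type="Image")
scene_description = Topic(name="scene_description", msg_type="String")
answer = Topic(name="answer", msg_type="String")

llava_client = OllamaClient(Llava(name="llava"))

describer = MLLM(
    inputs=[question, image0],
    outputs=[scene_description],
    model_client=llava_client,
    trigger=[question],
    component_name="describer",
)

summarizer = LLM(
    inputs=[scene_description],
    outputs=[answer],
    model_client=llava_client,
    trigger=[scene_description],
    component_name="summarizer",
)

launcher = Launcher()
launcher.add_pkg(components=[describer, summarizer])
launcher.bringup()
```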
/agents/agents/components/component_base.py:
--------------------------------------------------------------------------------
1 | import json
2 | from abc import abstractmethod
3 | from copy import deepcopy
4 | from typing import Optional, Sequence, Union, List, Dict, Type
5 |
6 | from ..ros import BaseComponent, ComponentRunType, FixedInput, SupportedType, Topic
7 | from ..config import BaseComponentConfig
8 |
9 |
10 | class Component(BaseComponent):
11 | """Component."""
12 |
13 | def __init__(
14 | self,
15 | inputs: Optional[Sequence[Union[Topic, FixedInput]]] = None,
16 | outputs: Optional[Sequence[Topic]] = None,
17 | config: Optional[BaseComponentConfig] = None,
18 | trigger: Union[Topic, List[Topic], float] = 1.0,
19 | callback_group=None,
20 | component_name: str = "agents_component",
21 | **kwargs,
22 | ):
23 | self.config: BaseComponentConfig = (
24 | deepcopy(config) if config else BaseComponentConfig()
25 | )
26 | self.allowed_inputs: Dict[str, List[Type[SupportedType]]]
27 | self.allowed_outputs: Dict[str, List[Type[SupportedType]]]
28 |
29 | # setup inputs and outputs
30 | if inputs:
31 | self.validate_topics(
32 | inputs,
33 | allowed_topic_types=self.allowed_inputs,
34 | topics_direction="Inputs",
35 | )
36 |
37 | if outputs:
38 | if hasattr(self, "allowed_outputs"):
39 | self.validate_topics(
40 | outputs,
41 | allowed_topic_types=self.allowed_outputs,
42 | topics_direction="Outputs",
43 | )
44 |
45 | # Initialize Parent Component
46 | super().__init__(
47 | component_name=component_name,
48 | inputs=inputs,
49 | outputs=outputs,
50 | config=self.config,
51 | callback_group=callback_group,
52 | enable_health_broadcast=False,
53 | **kwargs,
54 | )
55 |
56 | # setup component run type and triggers
57 | self.trigger(trigger)
58 |
59 | def custom_on_activate(self):
60 | """
61 | Custom configuration for creating triggers.
62 | """
63 | # Setup trigger based callback or frequency based timer
64 | if self.run_type is ComponentRunType.EVENT:
65 | self.activate_all_triggers()
66 |
67 | def create_all_subscribers(self):
68 | """
69 | Override to handle trigger topics and fixed inputs.
70 | Called by parent BaseComponent
71 | """
72 | self.get_logger().info("STARTING ALL SUBSCRIBERS")
73 | all_callbacks = (
74 | list(self.callbacks.values()) + list(self.trig_callbacks.values())
75 | if self.run_type is ComponentRunType.EVENT
76 | else self.callbacks.values()
77 | )
78 | for callback in all_callbacks:
79 | callback.set_node_name(self.node_name)
80 | if hasattr(callback.input_topic, "fixed"):
81 | self.get_logger().debug(
82 | f"Fixed input specified for topic: {callback.input_topic} of type {callback.input_topic.msg_type}"
83 | )
84 | else:
85 | callback.set_subscriber(self._add_ros_subscriber(callback))
86 |
87 | def activate_all_triggers(self) -> None:
88 | """
89 | Activates component triggers by attaching execution step to callbacks
90 | """
91 | self.get_logger().info("ACTIVATING TRIGGER TOPICS")
92 | if hasattr(self, "trig_callbacks"):
93 | for callback in self.trig_callbacks.values():
94 | # Add execution step of the node as a post callback function
95 | callback.on_callback_execute(self._execution_step)
96 |
97 | def destroy_all_subscribers(self) -> None:
98 | """
99 | Destroys all node subscribers
100 | """
101 | self.get_logger().info("DESTROYING ALL SUBSCRIBERS")
102 | all_callbacks = (
103 | list(self.callbacks.values()) + list(self.trig_callbacks.values())
104 | if self.run_type is ComponentRunType.EVENT
105 | else self.callbacks.values()
106 | )
107 | for callback in all_callbacks:
108 | if callback._subscriber:
109 | self.destroy_subscription(callback._subscriber)
110 |
111 | def trigger(self, trigger: Union[Topic, List[Topic], float]) -> None:
112 | """
113 | Set component trigger
114 | """
115 | if isinstance(trigger, list):
116 | for t in trigger:
117 | if t.name not in self.callbacks:
118 | raise TypeError(
119 | f"Invalid configuration for component trigger {t.name} - A trigger needs to be one of the inputs already defined in component inputs."
120 | )
121 | self.run_type = ComponentRunType.EVENT
122 | self.trig_callbacks = {}
123 | for t in trigger:
124 | self.trig_callbacks[t.name] = self.callbacks[t.name]
125 | # remove trigger inputs from self.callbacks
126 | del self.callbacks[t.name]
127 |
128 | elif isinstance(trigger, Topic):
129 | if trigger.name not in self.callbacks:
130 | raise TypeError(
131 | f"Invalid configuration for component trigger {trigger.name} - A trigger needs to be one of the inputs already defined in component inputs."
132 | )
133 | self.run_type = ComponentRunType.EVENT
134 | self.trig_callbacks = {trigger.name: self.callbacks[trigger.name]}
135 | del self.callbacks[trigger.name]
136 |
137 | else:
138 | self.run_type = ComponentRunType.TIMED
139 | # Set component loop_rate (Hz)
140 | self.config.loop_rate = 1 / trigger
141 |
142 | self.trig_topic: Union[Topic, list[Topic], float] = trigger
143 |
144 | def validate_topics(
145 | self,
146 | topics: Sequence[Union[Topic, FixedInput]],
147 | allowed_topic_types: Optional[Dict[str, List[Type[SupportedType]]]] = None,
148 | topics_direction: str = "Topics",
149 | ):
150 | """
151 | Verify component specific inputs or outputs using allowed topics if provided
152 | """
153 | # type validation
154 | correct_type = all(isinstance(i, (Topic, FixedInput)) for i in topics)
155 | if not correct_type:
156 | raise TypeError(
157 | f"{topics_direction} to a component can only be of type Topic"
158 | )
159 |
160 | # Check that only the allowed topics (or their subtypes) have been given
161 | if not allowed_topic_types:
162 | return
163 |
164 | all_msg_types = {topic.msg_type for topic in topics}
165 | all_topic_types = allowed_topic_types["Required"] + (
166 | allowed_topic_types.get("Optional") or []
167 | )
168 |
169 | if msg_type := next(
170 | (
171 | topic
172 | for topic in all_msg_types
173 | if not any(
174 | issubclass(topic, allowed_t) for allowed_t in all_topic_types
175 | )
176 | ),
177 | None,
178 | ):
179 | raise TypeError(
180 | f"{topics_direction} to the component of type {self.__class__.__name__} can only be of the allowed datatypes: {[topic.__name__ for topic in all_topic_types]} or their subclasses. A topic of type {msg_type.__name__} cannot be given to this component."
181 | )
182 |
183 | # Check that all required topics (or subtypes) have been given
184 | sufficient_topics = all(
185 | any(issubclass(m_type, allowed_type) for m_type in all_msg_types)
186 | for allowed_type in allowed_topic_types["Required"]
187 | )
188 |
189 | if not sufficient_topics:
190 | raise TypeError(
191 | f"{self.__class__.__name__} component {topics_direction} should have at least one topic of each datatype in the following list: {[topic.__name__ for topic in allowed_topic_types['Required']]}"
192 | )
193 |
194 | @abstractmethod
195 | def _execution_step(self, **kwargs):
196 | """_execution_step.
197 |
198 | :param args:
199 | :param kwargs:
200 | """
201 | raise NotImplementedError(
202 | "This method needs to be implemented by child components."
203 | )
204 |
205 | def _update_cmd_args_list(self):
206 | """
207 | Update launch command arguments
208 | """
209 | super()._update_cmd_args_list()
210 |
211 | self.launch_cmd_args = [
212 | "--trigger",
213 | self._get_trigger_json(),
214 | ]
215 |
216 | def _get_trigger_json(self) -> Union[str, bytes, bytearray]:
217 | """
218 | Serialize component routes to json
219 |
220 | :return: Serialized inputs
221 | :rtype: str | bytes | bytearray
222 | """
223 | if isinstance(self.trig_topic, Topic):
224 | return self.trig_topic.to_json()
225 | elif isinstance(self.trig_topic, List):
226 | return json.dumps([t.to_json() for t in self.trig_topic])
227 | else:
228 | return json.dumps(self.trig_topic)
229 |
--------------------------------------------------------------------------------
/agents/agents/components/imagestovideo.py:
--------------------------------------------------------------------------------
1 | import math
2 | from typing import Optional, Union, List
3 |
4 | import cv2
5 | import numpy as np
6 |
7 | from ..config import VideoMessageMakerConfig
8 | from ..ros import Image, Topic, Video, ROSImage, ROSCompressedImage
9 | from ..utils import validate_func_args
10 | from .component_base import Component
11 |
12 |
13 | class VideoMessageMaker(Component):
14 | """
15 | This component generates ROS video messages from input image messages. A video message is a collection of image messages that have a perceivable motion.
16 | I.e. the primary task of this component is to make intentionality decisions about what sequence of consecutive images should be treated as one coherent temporal sequence.
17 | The motion estimation method used for selecting images for a video can be configured in component config.
18 |
19 | :param inputs: The input topics for the video message maker.
20 | This should be a list of Topic objects or FixedInput objects, limited to Image type.
21 | :type inputs: list[Topic]
22 | :param outputs: The output topics for the video message maker.
23 | This should be a list of Topic objects, Video type.
24 | :type outputs: list[Topic]
25 | :param config: The configuration for the video message generation.
26 | This should be an instance of VideoMessageMakerConfig.
27 | :type config: VideoMessageMakerConfig
28 |     :param trigger: The trigger topic(s) for video message generation.
29 | This can be a single Topic object or a list of Topic objects.
30 | :type trigger: Union[Topic, list[Topic]]
31 | :param callback_group: An optional callback group for the video message generation.
32 | If provided, this should be a string. Otherwise, it defaults to None.
33 | :type callback_group: str
34 | :param component_name: The name of the video message generation component.
35 | This should be a string and defaults to "video_maker_component".
36 | :type component_name: str
37 |
38 | Example usage:
39 | ```python
40 | image_topic = Topic(name="image", msg_type="Image")
41 | video_topic = Topic(name="video", msg_type="Video")
42 | config = VideoMessageMakerConfig()
43 | video_message_maker = VideoMessageMaker(
44 | inputs=[image_topic],
45 | outputs=[video_topic],
46 | config=config,
47 | component_name="video_message_maker",
48 | )
49 | ```
50 | """
51 |
52 | @validate_func_args
53 | def __init__(
54 | self,
55 | *,
56 | inputs: List[Topic],
57 | outputs: List[Topic],
58 | config: Optional[VideoMessageMakerConfig] = None,
59 | trigger: Union[Topic, List[Topic]],
60 | component_name: str,
61 | callback_group=None,
62 | **kwargs,
63 | ):
64 | if isinstance(trigger, float):
65 | raise TypeError(
66 | "VideoMessageMaker component needs to be given a valid trigger topic. It cannot be started as a timed component."
67 | )
68 |
69 | self.config: VideoMessageMakerConfig = config or VideoMessageMakerConfig()
70 | self.allowed_inputs = {"Required": [Image]}
71 | self.allowed_outputs = {"Required": [Video]}
72 |
73 | super().__init__(
74 | inputs,
75 | outputs,
76 | self.config,
77 | trigger,
78 | callback_group,
79 | component_name,
80 | **kwargs,
81 | )
82 |
83 | self._frames: Union[List[ROSImage], List[ROSCompressedImage]] = []
84 | self._last_frame: Optional[np.ndarray] = None
85 | self._capture: bool = False
86 |
87 | def _motion_estimation(self, current_frame: np.ndarray) -> bool:
88 |         """Runs the configured motion estimation method between the last and current frame.
89 | :param current_frame:
90 | :type current_frame: np.ndarray
91 | :rtype: bool
92 | """
93 | # get gray scale image
94 | gray = cv2.cvtColor(current_frame, cv2.COLOR_RGB2GRAY)
95 | if self.config.motion_estimation_func == "frame_difference":
96 | return self._frame_difference(gray, self.config.threshold)
97 | elif self.config.motion_estimation_func == "optical_flow":
98 | return self._optical_flow(
99 | gray, self.config.threshold, **self.config.flow_kwargs
100 | )
101 | else:
102 | return True
103 |
104 | def _frame_difference(self, img: np.ndarray, threshold: float) -> bool:
105 | """Calculates difference between two frames and returns true
106 | if difference is greater than defined threshold.
107 | :param img:
108 | :type img: np.ndarray
109 | :param threshold:
110 |         :type threshold: float
111 | :rtype: bool
112 | """
113 | # calculate frame difference
114 | diff = cv2.subtract(img, self._last_frame)
115 | # apply blur to improve thresholding
116 | diff = cv2.medianBlur(diff, 3)
117 | # apply adaptive thresholding
118 | mask = cv2.adaptiveThreshold(
119 | diff, 1, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2
120 | )
121 | return True if mask.sum() > (threshold * math.prod(img.shape) / 100) else False
122 |
123 | def _optical_flow(self, img: np.ndarray, threshold: float, **kwargs) -> bool:
124 | """Calculates optical flow between two frames and returns true
125 | if flow is greater than defined threshold.
126 | :param img:
127 | :type img: np.ndarray
128 | :param threshold:
129 |         :type threshold: float
130 | :rtype: bool
131 | """
132 | # calculate optical flow
133 | flow = cv2.calcOpticalFlowFarneback(self._last_frame, img, None, **kwargs)
134 | mask = np.uint8(flow > 1) / 10
135 | return True if mask.sum() > (threshold * math.prod(img.shape) / 100) else False
136 |
137 | def _execution_step(self, *_, **kwargs) -> None:
138 | """Collects incoming image messages until a criteria is met
139 | When met, publishes image messages as video
140 | :param args:
141 | :param kwargs:
142 | """
143 | msg = kwargs.get("msg")
144 | topic = kwargs.get("topic")
145 | if msg and topic:
146 | output = self.trig_callbacks[topic.name].get_output()
147 | if self._last_frame is not None:
148 | # calculate motion estimation for start and stop
149 | self._capture = (
150 | True
151 | if self._motion_estimation(output)
152 | and len(self._frames) < self.config.max_video_frames
153 | else False
154 | )
155 | if self._capture:
156 | self._frames.append(msg)
157 | self._last_frame = cv2.cvtColor(output, cv2.COLOR_RGB2GRAY)
158 |
159 | # publish if video capture finished
160 | if (
161 | self.publishers_dict
162 | and (not self._capture)
163 | and len(self._frames) >= self.config.min_video_frames
164 | ):
165 | self.get_logger().debug(f"Sending out video of {len(self._frames)} frames")
166 | for publisher in self.publishers_dict.values():
167 | publisher.publish(output=self._frames)
168 | self._frames = []
169 |
--------------------------------------------------------------------------------
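A minimal wiring sketch for the `VideoMessageMaker` shown above. The `VideoMessageMakerConfig` keyword names are assumptions inferred from the attributes the component reads (`motion_estimation_func`, `threshold`, `min_video_frames`, `max_video_frames`), and the import paths `agents.components`, `agents.config` and `agents.ros` are assumed to expose these classes; field values are illustrative only.

```python
from agents.components import VideoMessageMaker
from agents.config import VideoMessageMakerConfig
from agents.ros import Topic

image_topic = Topic(name="image", msg_type="Image")
video_topic = Topic(name="video", msg_type="Video")

# NOTE: keyword names assumed from the attributes used in _motion_estimation()
config = VideoMessageMakerConfig(
    motion_estimation_func="optical_flow",  # or "frame_difference"
    threshold=10.0,                         # percentage of changed pixels
    min_video_frames=30,
    max_video_frames=300,
)

video_maker = VideoMessageMaker(
    inputs=[image_topic],
    outputs=[video_topic],
    trigger=image_topic,  # a Topic is required; a timed (float) trigger raises TypeError
    config=config,
    component_name="video_maker",
)
```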
/agents/agents/components/mllm.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Union, Optional, List, Dict
2 |
3 | from ..clients.db_base import DBClient
4 | from ..clients.model_base import ModelClient
5 | from ..config import MLLMConfig
6 | from ..ros import FixedInput, Image, String, Topic, Detections
7 | from ..utils import validate_func_args
8 | from .llm import LLM
9 |
10 |
11 | class MLLM(LLM):
12 | """
13 | This component utilizes multi-modal large language models (e.g. Llava) that can be used to process text and image data.
14 |
15 | :param inputs: The input topics or fixed inputs for the MLLM component.
16 | This should be a list of Topic objects or FixedInput instances, limited to String and Image types.
17 | :type inputs: list[Topic | FixedInput]
18 | :param outputs: The output topics for the MLLM component.
19 | This should be a list of Topic objects. String type is handled automatically.
20 | :type outputs: list[Topic]
21 | :param model_client: The model client for the MLLM component.
22 | This should be an instance of ModelClient.
23 | :type model_client: ModelClient
24 | :param config: Optional configuration for the MLLM component.
25 | This should be an instance of MLLMConfig. If not provided, defaults to MLLMConfig().
26 | :type config: MLLMConfig
27 | :param trigger: The trigger value or topic for the MLLM component.
28 | This can be a single Topic object, a list of Topic objects, or a float value for a timed component. Defaults to 1.
29 | :type trigger: Union[Topic, list[Topic], float]
30 | :param callback_group: An optional callback group for the MLLM component.
31 | If provided, this should be a string. Otherwise, it defaults to None.
32 | :type callback_group: str
33 | :param component_name: The name of the MLLM component.
34 | This should be a string and defaults to "mllm_component".
35 | :type component_name: str
36 |
37 | Example usage:
38 | ```python
39 | text0 = Topic(name="text0", msg_type="String")
40 | image0 = Topic(name="image0", msg_type="Image")
41 | text1 = Topic(name="text1", msg_type="String")
42 | config = MLLMConfig()
43 | model = TransformersMLLM(name='idefics')
44 | model_client = ModelClient(model=model)
45 | mllm_component = MLLM(inputs=[text0, image0],
46 | outputs=[text1],
47 | model_client=model_client,
48 | config=config,
49 | component_name='mllm_component')
50 | ```
51 | """
52 |
53 | @validate_func_args
54 | def __init__(
55 | self,
56 | *,
57 | inputs: List[Union[Topic, FixedInput]],
58 | outputs: List[Topic],
59 | model_client: ModelClient,
60 | config: Optional[MLLMConfig] = None,
61 | db_client: Optional[DBClient] = None,
62 | trigger: Union[Topic, List[Topic], float] = 1.0,
63 | component_name: str,
64 | callback_group=None,
65 | **kwargs,
66 | ):
67 | self.allowed_inputs = {"Required": [String, Image], "Optional": [Detections]}
68 |
69 | config = config or MLLMConfig()
70 |
71 | super().__init__(
72 | inputs=inputs,
73 | outputs=outputs,
74 | model_client=model_client,
75 | config=config,
76 | db_client=db_client,
77 | trigger=trigger,
78 | callback_group=callback_group,
79 | component_name=component_name,
80 | allowed_inputs=self.allowed_inputs,
81 | **kwargs,
82 | )
83 |
84 | def _create_input(self, *_, **kwargs) -> Optional[Dict[str, Any]]:
85 | """Create inference input for MLLM models
86 | :param args:
87 | :param kwargs:
88 | :rtype: dict[str, Any]
89 | """
90 | images = []
91 | # context dict to gather all String inputs for use in system prompt
92 | context = {}
93 | # set mllm query as trigger
94 | if trigger := kwargs.get("topic"):
95 | query = self.trig_callbacks[trigger.name].get_output()
96 | context[trigger.name] = query
97 |
98 | # handle chat reset
99 | if (
100 | self.config.chat_history
101 | and query.strip().lower() == self.config.history_reset_phrase
102 | ):
103 | self.messages = []
104 | return None
105 |
106 | else:
107 | query = None
108 |
109 | # aggregate all inputs that are available
110 | for i in self.callbacks.values():
111 | if (item := i.get_output()) is not None:
112 |                 # if no trigger query was found, fall back to any available String topic output
113 | if i.input_topic.msg_type is String:
114 | if not query:
115 | query = item
116 | context[i.input_topic.name] = item
117 | elif i.input_topic.msg_type is Detections:
118 | context[i.input_topic.name] = item
119 | # get images from image topics
120 | if issubclass(i.input_topic.msg_type, Image):
121 | images.append(item)
122 |
123 | if not query or not images:
124 | return None
125 |
126 |         # get RAG results if enabled in config and if docs are retrieved
127 | rag_result = self._handle_rag_query(query) if self.config.enable_rag else None
128 |
129 | # set system prompt template
130 | query = (
131 | self.component_prompt.render(context) if self.component_prompt else query
132 | )
133 |
134 |         # prepend RAG results to the query, if any were retrieved
135 | query = f"{rag_result}\n{query}" if rag_result else query
136 |
137 | message = {"role": "user", "content": query}
138 | self._handle_chat_history(message)
139 |
140 | self.get_logger().debug(f"Input from component: {self.messages}")
141 |
142 | input = {
143 | "query": self.messages,
144 | "images": images,
145 | **self.config._get_inference_params(),
146 | }
147 |
148 | # Add any tools, if registered
149 | if self.config._tool_descriptions:
150 | input["tools"] = self.config._tool_descriptions
151 |
152 | return input
153 |
154 | def _warmup(self):
155 |         """Warm up the model and log an approximate inference time"""
156 | import time
157 | from pathlib import Path
158 | import cv2
159 |
160 | image = cv2.imread(str(Path(__file__).parents[1] / Path("resources/test.jpeg")))
161 |
162 | message = {"role": "user", "content": "What do you see?"}
163 | inference_input = {
164 | "query": [message],
165 | "images": [image],
166 | **self.config._get_inference_params(),
167 | }
168 |
169 | # Run inference once to warm up and once to measure time
170 | self.model_client.inference(inference_input)
171 |
172 | inference_input = {
173 | "query": [message],
174 | "images": [image],
175 | **self.config._get_inference_params(),
176 | }
177 | start_time = time.time()
178 | result = self.model_client.inference(inference_input)
179 | elapsed_time = time.time() - start_time
180 |
181 | self.get_logger().warning(f"Model Output: {result['output']}")
182 | self.get_logger().warning(f"Approximate Inference time: {elapsed_time} seconds")
183 |
--------------------------------------------------------------------------------
/agents/agents/components/model_component.py:
--------------------------------------------------------------------------------
1 | from abc import abstractmethod
2 | import inspect
3 | import json
4 | from typing import Any, Optional, Sequence, Union, List, Dict, Type
5 |
6 | from ..clients.model_base import ModelClient
7 | from ..config import ModelComponentConfig
8 | from ..ros import FixedInput, Topic, SupportedType
9 | from .component_base import Component
10 |
11 |
12 | class ModelComponent(Component):
13 |     """Base class for components that perform inference through a ModelClient. Child components must implement `_create_input`, `_execution_step` and `_warmup`."""
14 |
15 | def __init__(
16 | self,
17 | inputs: Optional[Sequence[Union[Topic, FixedInput]]] = None,
18 | outputs: Optional[Sequence[Topic]] = None,
19 | model_client: Optional[ModelClient] = None,
20 | config: Optional[ModelComponentConfig] = None,
21 | trigger: Union[Topic, List[Topic], float] = 1.0,
22 | callback_group=None,
23 | component_name: str = "model_component",
24 | **kwargs,
25 | ):
26 | # setup model client
27 | self.model_client = model_client if model_client else None
28 |
29 | self.handled_outputs: List[Type[SupportedType]]
30 |
31 | if not config:
32 | self.config = ModelComponentConfig()
33 |
34 | # Initialize Component
35 | super().__init__(
36 | inputs,
37 | outputs,
38 | config,
39 | trigger,
40 | callback_group,
41 | component_name,
42 | **kwargs,
43 | )
44 |
45 | def custom_on_configure(self):
46 | """
47 | Create model client if provided and initialize model.
48 | """
49 | self.get_logger().debug(f"Current Status: {self.health_status.value}")
50 |
51 | # validate output topics if handled_outputs exist
52 | self.get_logger().info("Validating Model Component Output Topics")
53 | self._validate_output_topics()
54 |
55 | # Initialize model
56 | if self.model_client:
57 | self.model_client.check_connection()
58 | self.model_client.initialize()
59 | if self.config.warmup:
60 | try:
61 | self._warmup()
62 | except Exception as e:
63 | self.get_logger().error(f"Error encountered in warmup: {e}")
64 |
65 | def custom_on_deactivate(self):
66 | """
67 | Destroy model client if it exists
68 | """
69 | # Deinitialize model
70 | if self.model_client:
71 | self.model_client.check_connection()
72 | self.model_client.deinitialize()
73 |
74 | def _validate_output_topics(self) -> None:
75 | """
76 |         Verify that output topics which are not handled automatically have pre-processing functions provided. We only check that a pre-processing function exists; we do not check whether it produces output of the corresponding type.
77 | """
78 |
79 | if hasattr(self, "publishers_dict") and hasattr(self, "handled_outputs"):
80 | for name, pub in self.publishers_dict.items():
81 | if pub.output_topic.msg_type not in self.handled_outputs and (
82 | not self._external_processors
83 | ):
84 | func_body = inspect.getsource(pub.output_topic.msg_type.convert)
85 | raise TypeError(f"""{type(self).__name__} components can only handle output topics of type(s) {self.handled_outputs} automatically. Topic {name} is of type {pub.output_topic.msg_type}. EITHER provide a pre-processing function for this topic and attach it to the topic by calling the `add_publisher_preprocessor` on the component {self.node_name} OR provide a tool call that can provide structured inference output and attach it by calling `register_tool` on {self.node_name}. Make sure the output can be passed as parameter `output` to the following function:
86 | {func_body}""")
87 |
88 | @property
89 | def warmup(self) -> bool:
90 | """Enable warmup of the model."""
91 | return self.config.warmup
92 |
93 | @warmup.setter
94 | def warmup(self, value: bool) -> None:
95 | """Enable warmup of the model."""
96 | self.config.warmup = value
97 |
98 | @abstractmethod
99 | def _create_input(self, *args, **kwargs) -> Union[Dict[str, Any], None]:
100 | """_create_input.
101 |
102 | :param args:
103 | :param kwargs:
104 | :rtype: dict[str, Any] | None
105 | """
106 | raise NotImplementedError(
107 | "_create_input method needs to be implemented by child components."
108 | )
109 |
110 | @abstractmethod
111 | def _execution_step(self, *args, **kwargs):
112 | """_execution_step.
113 |
114 | :param args:
115 | :param kwargs:
116 | """
117 | raise NotImplementedError(
118 | "_execution_step method needs to be implemented by child components."
119 | )
120 |
121 | @abstractmethod
122 | def _warmup(self, *args, **kwargs):
123 | """_warmup.
124 |
125 | :param args:
126 | :param kwargs:
127 | """
128 | raise NotImplementedError(
129 | "_warmup method needs to be implemented by child components."
130 | )
131 |
132 | def _update_cmd_args_list(self):
133 | """
134 | Update launch command arguments
135 | """
136 | super()._update_cmd_args_list()
137 |
138 | self.launch_cmd_args = [
139 | "--model_client",
140 | self._get_model_client_json(),
141 | ]
142 |
143 | def _get_model_client_json(self) -> Union[str, bytes, bytearray]:
144 | """
145 |         Serialize model client to json
146 |
147 |         :return: Serialized model client
148 | :rtype: str | bytes | bytearray
149 | """
150 | if not self.model_client:
151 | return ""
152 | return json.dumps(self.model_client.serialize())
153 |
--------------------------------------------------------------------------------
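`ModelComponent` above is abstract; its three `@abstractmethod`s define what a child class must provide. The sketch below is illustrative only: the class name, query string and import path `agents.components.model_component` are assumptions (see the MLLM component above for a real subclass), and it simply reuses the callback and publisher patterns visible elsewhere in this package.

```python
from typing import Any, Dict, Optional

# NOTE: import path assumed; adjust to wherever ModelComponent is exposed
from agents.components.model_component import ModelComponent
from agents.ros import Image


class MyCaptioner(ModelComponent):
    """Toy component that sends each incoming image to its model client."""

    def _create_input(self, *_, **kwargs) -> Optional[Dict[str, Any]]:
        # Gather the latest image outputs from all input callbacks
        # (same pattern the MLLM component above uses)
        images = [
            item
            for cb in self.callbacks.values()
            if issubclass(cb.input_topic.msg_type, Image)
            and (item := cb.get_output()) is not None
        ]
        if not images:
            return None
        return {"query": "Describe the scene.", "images": images}

    def _execution_step(self, *_, **kwargs) -> None:
        inference_input = self._create_input(**kwargs)
        if inference_input is None:
            return
        result = self.model_client.inference(inference_input)
        if result and self.publishers_dict:
            for publisher in self.publishers_dict.values():
                publisher.publish(output=result["output"])

    def _warmup(self):
        # Optional: run a single inference at configure time when config.warmup is set
        pass
```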
/agents/agents/components/semantic_router.py:
--------------------------------------------------------------------------------
1 | from typing import Optional, List, Union
2 | import json
3 |
4 | from ..clients.db_base import DBClient
5 | from ..config import SemanticRouterConfig
6 | from ..publisher import Publisher
7 | from ..ros import String, Topic, Route
8 | from ..utils import validate_func_args
9 | from .component_base import Component
10 |
11 |
12 | class SemanticRouter(Component):
13 | """A component that routes semantic information from input topics to output topics based on pre-defined routes. The Semantic Router takes in a list of input topics, a list of routes, an optional default route, and a configuration object. It uses the database client to store and retrieve routing information.
14 |
15 | :param inputs:
16 | A list of input text topics that this component will subscribe to.
17 | :type inputs: list[Topic]
18 | :param routes:
19 | A list of pre-defined routes that publish incoming input to the routed output topics.
20 | :type routes: list[Route]
21 | :param default_route:
22 |         An optional route that specifies the default behavior when no route matches within the configured distance threshold. If not provided, the component will use the first route in the list.
23 | :type default_route: Optional[Route]
24 | :param config:
25 | The configuration object for this Semantic Router component.
26 | :type config: SemanticRouterConfig
27 | :param db_client:
28 | A database client that is used to store and retrieve routing information.
29 | :type db_client: DBClient
30 | :param callback_group:
31 | An optional callback group for this component.
32 | :param component_name:
33 | The name of this Semantic Router component (default: "router_component").
34 | :type component_name: str
35 | :param kwargs:
36 | Additional keyword arguments.
37 |
38 | Example usage:
39 | ```python
40 | input_text = Topic(name="text0", msg_type="String")
41 | goto_route = Route(
42 | routes_to=goto, # where goto is an input topic to another component
43 | samples=[
44 | "Go to the door",
45 | "Go to the kitchen",
46 | "Get me a glass",
47 | "Fetch a ball",
48 | "Go to hallway",
49 | "Go over there",
50 | ],
51 | )
52 | mllm_route = Route(
53 | routes_to=mllm_input, # where mllm_input is an input topic to another component
54 | samples=[
55 | "Are we indoors or outdoors",
56 | "What do you see?",
57 |         "What's in front of you?",
58 |         "Where are we?",
59 |         "Do you see any people?",
60 |         "How many things are in front of you?",
61 | "Is this room occupied?",
62 | ],
63 | )
64 | config = SemanticRouterConfig(router_name="my_router")
65 | db_client = DBClient(db=ChromaDB(name="database_name"))
66 | semantic_router = SemanticRouter(
67 |     inputs=[input_text],
68 |     routes=[goto_route, mllm_route],
69 |     default_route=None,
70 |     config=config,
71 |     db_client=db_client,
72 |     component_name="router",
73 | )
74 | ```
75 | """
76 |
77 | @validate_func_args
78 | def __init__(
79 | self,
80 | *,
81 | inputs: List[Topic],
82 | routes: List[Route],
83 | config: SemanticRouterConfig,
84 | db_client: DBClient,
85 | default_route: Optional[Route] = None,
86 | component_name: str,
87 | callback_group=None,
88 | **kwargs,
89 | ):
90 | self.config: SemanticRouterConfig = config
91 | self.allowed_inputs = {"Required": [String]}
92 | self.allowed_outputs = {"Required": [String]}
93 | self.db_client = db_client
94 |
95 | super().__init__(
96 | inputs,
97 | None,
98 | self.config,
99 | inputs,
100 | callback_group,
101 | component_name,
102 | **kwargs,
103 | )
104 |
105 | # create routes
106 | self._routes(routes)
107 |
108 | if default_route:
109 | if default_route.routes_to.name not in self.routes_dict:
110 | raise TypeError("default_route must be one of the specified routes")
111 | self.default_route = self.config._default_route = default_route
112 |
113 | def custom_on_configure(self):
114 | self.get_logger().debug(f"Current Status: {self.health_status.value}")
115 |
116 | # configure the rest
117 | super().custom_on_configure()
118 |
119 | # initialize db client
120 | self.db_client.check_connection()
121 | self.db_client.initialize()
122 |
123 | # initialize routes
124 | self._initialize_routes()
125 |
126 | def deactivate(self):
127 | # deactivate db client
128 | self.db_client.check_connection()
129 | self.db_client.deinitialize()
130 |
131 | def _initialize_routes(self):
132 | """Create routes by saving route samples in the database."""
133 | self.get_logger().info("Initializing all routes")
134 | for idx, (name, route) in enumerate(self.routes_dict.items()):
135 | route_to_add = {
136 | "collection_name": self.config.router_name,
137 | "distance_func": self.config.distance_func,
138 | "documents": route.samples,
139 | "metadatas": [{"route_name": name} for _ in range(len(route.samples))],
140 | "ids": [f"{name}.{i}" for i in range(len(route.samples))],
141 | }
142 | # reset collection on the addition of first route if it exists
143 | if idx == 0:
144 | route_to_add["reset_collection"] = True
145 |
146 | self.db_client.add(route_to_add)
147 |
148 | def _execution_step(self, **kwargs):
149 | """Execution step for Semantic Router component.
150 | :param args:
151 | :param kwargs:
152 | """
153 | trigger = kwargs.get("topic")
154 | if not trigger:
155 | return
156 |
157 | self.get_logger().debug(f"Received trigger on {trigger.name}")
158 | trigger_query = self.trig_callbacks[trigger.name].get_output()
159 | # get route
160 | db_input = {
161 | "collection_name": self.config.router_name,
162 | "query": trigger_query,
163 | "n_results": 1,
164 | }
165 | result = self.db_client.query(db_input)
166 |
167 | # TODO: Add treatment of multiple results by using an averaging function
168 | if result:
169 | distance = result["output"]["distances"][0][0]
170 |             # if a default route is specified and the distance exceeds the
171 |             # maximum distance threshold, redirect to the default route
172 | route = (
173 | self.default_route.routes_to.name
174 | if self.default_route and distance > self.config.maximum_distance
175 | else result["output"]["metadatas"][0][0]["route_name"]
176 | )
177 |
178 | self.publishers_dict[route].publish(trigger_query)
179 | else:
180 | self.health_status.set_failure()
181 |
182 | def _routes(self, routes: List[Route]):
183 | """
184 | Set component Routes (topics)
185 | """
186 | self.routes_dict = {route.routes_to.name: route for route in routes}
187 | route_topics: List[Topic] = [route.routes_to for route in routes] # type: ignore
188 | self.validate_topics(route_topics, self.allowed_outputs, "Outputs")
189 | self.publishers_dict = {
190 | route_topic.name: Publisher(route_topic) for route_topic in route_topics
191 | }
192 |
193 | def _update_cmd_args_list(self):
194 | """
195 | Update launch command arguments
196 | """
197 | super()._update_cmd_args_list()
198 |
199 | self.launch_cmd_args = [
200 | "--routes",
201 | self._get_routes_json(),
202 | ]
203 |
204 | self.launch_cmd_args = [
205 | "--db_client",
206 | self._get_db_client_json(),
207 | ]
208 |
209 | def _get_routes_json(self) -> Union[str, bytes, bytearray]:
210 | """
211 | Serialize component routes to json
212 |
213 |         :return: Serialized routes
214 | :rtype: str | bytes | bytearray
215 | """
216 | if not hasattr(self, "routes_dict"):
217 | return "[]"
218 | return json.dumps([route.to_json() for route in self.routes_dict.values()])
219 |
220 | def _get_db_client_json(self) -> Union[str, bytes, bytearray]:
221 | """
222 |         Serialize DB client to json
223 |
224 |         :return: Serialized DB client
225 | :rtype: str | bytes | bytearray
226 | """
227 | if not self.db_client:
228 | return ""
229 | return json.dumps(self.db_client.serialize())
230 |
--------------------------------------------------------------------------------
/agents/agents/publisher.py:
--------------------------------------------------------------------------------
1 | from ros_sugar.io import Publisher
2 |
3 | __all__ = ["Publisher"]
4 |
--------------------------------------------------------------------------------
/agents/agents/resources/test.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/agents/agents/resources/test.jpeg
--------------------------------------------------------------------------------
/agents/agents/resources/test.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/agents/agents/resources/test.wav
--------------------------------------------------------------------------------
/agents/agents/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import (
2 | create_detection_context,
3 | validate_kwargs,
4 | validate_func_args,
5 | PDFReader,
6 | get_prompt_template,
7 | encode_arr_base64,
8 | VADStatus,
9 | WakeWordStatus,
10 | load_model,
11 | )
12 |
13 | __all__ = [
14 | "create_detection_context",
15 | "validate_kwargs",
16 | "validate_func_args",
17 | "PDFReader",
18 | "get_prompt_template",
19 | "encode_arr_base64",
20 | "VADStatus",
21 | "WakeWordStatus",
22 | "load_model",
23 | ]
24 |
--------------------------------------------------------------------------------
/agents/agents/vectordbs.py:
--------------------------------------------------------------------------------
1 | """
2 | The following vector DB specification classes are meant to define a common interface for the initialization of vector DBs. Currently the only supported vector DB is Chroma.
3 | """
4 |
5 | from typing import Optional, Dict
6 |
7 | from attrs import define, field
8 | from .ros import BaseAttrs
9 | from .models import Encoder
10 |
11 | __all__ = ["ChromaDB"]
12 |
13 |
14 | @define(kw_only=True)
15 | class DB(BaseAttrs):
16 | """This class describes a database initialization configuration."""
17 |
18 | name: str
19 | db_location: str = field(default="./data")
20 | username: Optional[str] = field(default=None)
21 | password: Optional[str] = field(default=None)
22 | encoder: Optional[Encoder] = field(default=None)
23 | init_timeout: int = field(default=600) # 10 minutes
24 | host: str = field(default="127.0.0.1")
25 | port: Optional[int] = field(default=None)
26 |
27 | def _get_init_params(self) -> Dict:
28 | params = {
29 | "username": self.username,
30 | "password": self.password,
31 | "db_location": self.db_location,
32 | }
33 | if self.encoder:
34 | params["encoder"] = self.encoder._get_init_params()
35 | return params
36 |
37 |
38 | @define(kw_only=True)
39 | class ChromaDB(DB):
40 |     """[Chroma](https://www.trychroma.com/) is the open-source AI application database. It provides embeddings, vector search, document storage, full-text search, metadata filtering, and multi-modal retrieval support.
41 |
42 | :param name: An arbitrary name given to the database.
43 | :type name: str
44 | :param db_location: The on-disk location where the database will be initialized. Defaults to "./data".
45 | :type db_location: str, optional
46 | :param username: The username for authentication. Defaults to None.
47 | :type username: Optional[str], optional
48 | :param password: The password for authentication. Defaults to None.
49 | :type password: Optional[str], optional
50 | :param encoder: An optional encoder model to use for text encoding. Defaults to None.
51 | :type encoder: Optional[Encoder], optional
52 | :param init_timeout: The timeout in seconds for the initialization process. Defaults to 10 minutes (600 seconds).
53 | :type init_timeout: int, optional
54 | :param host: The hostname or IP address of the database server. Defaults to "127.0.0.1".
55 | :type host: str, optional
56 | :param port: The port number to connect to the database server. Defaults to None.
57 | :type port: Optional[int], optional
58 |
59 | Example usage:
60 | ```python
61 | from agents.models import Encoder
62 | db_config = ChromaDB(name='my_database', username='user123', password='pass123')
63 | db_config.db_location = '/path/to/new/location'
64 | db_config.encoder = Encoder(checkpoint="BAAI/bge-small-en")
65 | ```
66 | """
67 |
68 | pass
69 |
--------------------------------------------------------------------------------
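A short sketch of pairing the `ChromaDB` specification with a DB client, mirroring the fixtures in `agents/tests/test_clients.py` further below. The port and encoder checkpoint are illustrative values taken from those tests and the docstring above.

```python
from agents.vectordbs import ChromaDB
from agents.models import Encoder
from agents.clients.roboml import HTTPDBClient

# Database spec with an optional text encoder
db = ChromaDB(name="my_database", db_location="./data")
db.encoder = Encoder(checkpoint="BAAI/bge-small-en")

# Serve the database through a roboml HTTP endpoint (a RESP variant also exists);
# port 8000 matches the roboml ray default used in the tests below
db_client = HTTPDBClient(db, port=8000, logging_level="debug")
```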
/agents/msg/Bbox2D.msg:
--------------------------------------------------------------------------------
1 | float64 top_left_x
2 | float64 top_left_y
3 | float64 bottom_right_x
4 | float64 bottom_right_y
5 |
--------------------------------------------------------------------------------
/agents/msg/Detection2D.msg:
--------------------------------------------------------------------------------
1 | std_msgs/Header header
2 |
3 | float64[] scores
4 | string[] labels
5 | Bbox2D[] boxes
6 |
7 | # Either an image or compressed image
8 | sensor_msgs/Image image
9 | sensor_msgs/CompressedImage compressed_image
10 |
--------------------------------------------------------------------------------
/agents/msg/Detections2D.msg:
--------------------------------------------------------------------------------
1 | std_msgs/Header header
2 |
3 | Detection2D[] detections
4 |
--------------------------------------------------------------------------------
/agents/msg/Point2D.msg:
--------------------------------------------------------------------------------
1 | float64 x
2 | float64 y
3 |
--------------------------------------------------------------------------------
/agents/msg/Tracking.msg:
--------------------------------------------------------------------------------
1 | std_msgs/Header header
2 |
3 | Point2D[] centroids
4 | string[] labels
5 | Bbox2D[] boxes
6 | int8[] ids
7 | Point2D[] estimated_velocities
8 |
9 | # Either an image or compressed image
10 | sensor_msgs/Image image
11 | sensor_msgs/CompressedImage compressed_image
12 |
--------------------------------------------------------------------------------
/agents/msg/Trackings.msg:
--------------------------------------------------------------------------------
1 | std_msgs/Header header
2 |
3 | Tracking[] trackings
4 |
--------------------------------------------------------------------------------
/agents/msg/Video.msg:
--------------------------------------------------------------------------------
1 | std_msgs/Header header
2 |
3 | # Either a list of images or compressed images
4 | sensor_msgs/Image[] frames
5 | sensor_msgs/CompressedImage[] compressed_frames
6 |
--------------------------------------------------------------------------------
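For reference, a minimal sketch of filling the detection messages defined above from Python. It assumes the generated interfaces are importable as `automatika_embedded_agents.msg` is not the right name; the module is assumed to be `automatika_embodied_agents.msg`, i.e. the package name declared in package.xml below.

```python
# Sketch: constructing the 2D detection messages defined above (field names
# taken directly from the .msg files; package import path assumed).
from automatika_embodied_agents.msg import Bbox2D, Detection2D, Detections2D

box = Bbox2D(
    top_left_x=10.0, top_left_y=20.0, bottom_right_x=110.0, bottom_right_y=220.0
)
detection = Detection2D(scores=[0.9], labels=["person"], boxes=[box])

detections = Detections2D()
detections.detections = [detection]
```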
/agents/package.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0"?>
2 | <?xml-model href="http://download.ros.org/schema/package_format3.xsd" schematypens="http://www.w3.org/2001/XMLSchema"?>
3 | <package format="3">
4 |   <name>automatika_embodied_agents</name>
5 |   <version>0.3.3</version>
6 |   <description>agents</description>
7 |   <maintainer>Automatika Robotics</maintainer>
8 |   <license>MIT</license>
9 |
10 |   <depend>builtin_interfaces</depend>
11 |   <depend>std_msgs</depend>
12 |   <depend>sensor_msgs</depend>
13 |   <depend>python3-tqdm</depend>
14 |   <depend>python3-httpx</depend>
15 |   <depend>automatika_ros_sugar</depend>
16 |
17 |   <buildtool_depend>ament_cmake</buildtool_depend>
18 |   <buildtool_depend>ament_cmake_python</buildtool_depend>
19 |   <build_depend>rosidl_default_generators</build_depend>
20 |   <exec_depend>rosidl_default_runtime</exec_depend>
21 |   <member_of_group>rosidl_interface_packages</member_of_group>
22 |
23 |   <test_depend>python3-pytest</test_depend>
24 |
25 |   <export>
26 |     <build_type>ament_cmake</build_type>
27 |   </export>
28 | </package>
29 |
--------------------------------------------------------------------------------
/agents/scripts/chainlit_client/app.py:
--------------------------------------------------------------------------------
1 | from io import BytesIO
2 | from typing import Union, Optional, List
3 | from enum import Enum
4 |
5 | import chainlit as cl
6 | from chainlit.element import ElementBased
7 | from chainlit.input_widget import TextInput
8 |
9 | import rclpy
10 | from rclpy.node import Node
11 | from std_msgs.msg import ByteMultiArray, String
12 |
13 |
14 | class Status(Enum):
15 | INIT = 0
16 | RECEIVED_TEXT = 1
17 | RECEIVED_AUDIO = 2
18 | TIMEOUT = 3
19 |
20 |
21 | class ClientNode(Node):
22 | """
23 |     Client node with text and audio publishers and subscribers, used by the chainlit web client.
24 | """
25 |
26 | def __init__(self) -> None:
27 | """
28 | Constructs a new instance.
29 | """
30 | super().__init__("cli_client")
31 | self.msg: Optional[Union[str, bytes]] = None
32 | # Start with defaults
33 | self.set_trigger("text0", "audio0")
34 | self.set_target("text1", "audio1")
35 |
36 | def publish(self, prompt: Union[str, bytes]) -> None:
37 | """
38 | Publish to the trigger topics and listen to the target topics
39 |
40 | :param prompt: The prompt/question
41 | :type prompt: {str, bytes}
42 |
43 | :returns: None
44 | :rtype: None
45 | """
46 |
47 | # set timeout flag
48 | self.msg_received = Status.INIT
49 |         # Check for subscribers on the trigger topic and time out if none are available
50 | if isinstance(prompt, bytes):
51 | if not self.count_subscribers(self.audio_trigger) > 0:
52 | self.get_logger().info(
53 | f"No one is listening to {self.audio_trigger}, so I am timing out"
54 | )
55 | self.timer = self.create_timer(0, self.timer_callback)
56 | return None
57 | msg = ByteMultiArray()
58 | msg.data = prompt
59 | self.audio_publisher.publish(msg)
60 | self.get_logger().info(f"Publishing to {self.audio_trigger}")
61 | else:
62 | if not self.count_subscribers(self.text_trigger) > 0:
63 | self.get_logger().info(
64 | f"No one is listening to {self.text_trigger}, so I am timing out"
65 | )
66 | self.timer = self.create_timer(0, self.timer_callback)
67 | return None
68 | # Create and publish message
69 | msg = String()
70 | msg.data = prompt
71 | self.text_publisher.publish(msg)
72 | self.get_logger().info(f"Publishing to {self.text_trigger}")
73 |
74 | self.get_logger().info("Now listening..")
75 |
76 | def listener_callback(self, msg: Union[String, ByteMultiArray]) -> None:
77 | """
78 | Listener callback
79 |
80 | :param msg: The message
81 | :type msg: {ROS Message}
82 | """
83 | if isinstance(msg, String):
84 | self.msg_received = Status.RECEIVED_TEXT
85 | self.get_logger().info(f"A: {msg.data}")
86 | self.msg = msg.data
87 | elif isinstance(msg, ByteMultiArray):
88 | self.msg_received = Status.RECEIVED_AUDIO
89 | self.get_logger().info("A: Audio bytes")
90 | self.msg = b"".join(msg.data)
91 | else:
92 | self.get_logger().error(
93 | "Something went wrong. Received message is neither String nor ByteMultiArray"
94 | )
95 |
96 | def timer_callback(self):
97 | """
98 |         Timer callback used only to destroy the timer and end the node's spin_once
99 | """
100 | # the timer should be destroyed once utilized
101 | self.destroy_timer(self.timer)
102 | self.msg_received = Status.TIMEOUT
103 |
104 | def set_trigger(self, text_trigger: str, audio_trigger: str):
105 | """
106 | Set topic to send messages to
107 | """
108 | if hasattr(self, "text_publisher"):
109 | self.destroy_publisher(self.text_publisher)
110 | self.text_trigger = text_trigger
111 | self.text_publisher = self.create_publisher(String, self.text_trigger, 1)
112 |
113 | if hasattr(self, "audio_publisher"):
114 | self.destroy_publisher(self.audio_publisher)
115 | self.audio_trigger = audio_trigger
116 | self.audio_publisher = self.create_publisher(
117 | ByteMultiArray, self.audio_trigger, 1
118 | )
119 |
120 | def set_target(self, text_target: str, audio_target: str):
121 | """
122 | Set topic to receive messages from
123 | """
124 | if hasattr(self, "text_subscription"):
125 | self.destroy_subscription(self.text_subscription)
126 | self.text_target = text_target
127 | self.text_subscription = self.create_subscription(
128 | String, self.text_target, self.listener_callback, 1
129 | )
130 |
131 | if hasattr(self, "audio_subscription"):
132 | self.destroy_subscription(self.audio_subscription)
133 | self.audio_target = audio_target
134 | self.audio_subscription = self.create_subscription(
135 | ByteMultiArray, self.audio_target, self.listener_callback, 1
136 | )
137 |
138 |
139 | @cl.on_chat_start
140 | async def on_chat_start():
141 | """
142 | On chat start, specify default settings
143 | """
144 | # Init rclpy
145 | if not rclpy.ok():
146 | rclpy.init()
147 | await cl.ChatSettings([
148 | TextInput(
149 | id="text_trigger",
150 | label="String topic to send message to",
151 | initial="text0",
152 | ),
153 | TextInput(
154 | id="text_target",
155 | label="String topic to listen to for response",
156 | initial="text1",
157 | ),
158 | TextInput(
159 | id="audio_trigger",
160 | label="Audio topic to send message to",
161 | initial="audio0",
162 | ),
163 | TextInput(
164 | id="audio_target",
165 | label="Audio topic to listen to for response",
166 | initial="audio1",
167 | ),
168 | TextInput(id="timeout", label="Timeout (sec)", initial="30"),
169 | ]).send()
170 | cl.user_session.set("timeout", 30)
171 | client: ClientNode = ClientNode()
172 | cl.user_session.set("client", client)
173 | await cl.Message(
174 | content="Welcome to Leibniz ROS client. Set the input/output topics in settings. Then type your message or press `P` to send audio!"
175 | ).send()
176 |
177 |
178 | @cl.on_settings_update
179 | async def setup_ros_node(settings):
180 | """
181 | On settings update, update nodes
182 | """
183 | client: ClientNode = cl.user_session.get("client")
184 | client.set_trigger(settings["text_trigger"], settings["audio_trigger"])
185 | client.set_target(settings["text_target"], settings["audio_target"])
186 | if not settings["timeout"].isdigit():
187 | return
188 | cl.user_session.set("timeout", int(settings["timeout"]))
189 |
190 |
191 | @cl.step(type="run")
192 | def publish_on_ros(msg: Union[str, bytes]):
193 | """Publish input to the ROS Client node.
194 | :param msg:
195 | :type msg: Union[str, bytes]
196 | """
197 | timeout: int = cl.user_session.get("timeout")
198 | client: ClientNode = cl.user_session.get("client")
199 | client.publish(msg)
200 | rclpy.spin_once(client, timeout_sec=timeout)
201 |
202 |
203 | @cl.step(type="run")
204 | async def handle_output(msg_type: type):
205 | """Handle Output from the ROS Client node.
206 | :param msg_type:
207 | :type msg_type: type
208 | """
209 | client: ClientNode = cl.user_session.get("client")
210 | if client.msg_received is Status.INIT:
211 | await cl.Message(
212 |             content=f"I did not receive a message on **{client.text_target}** or **{client.audio_target}**. Timed out.",
213 | ).send()
214 | elif client.msg_received is Status.RECEIVED_TEXT:
215 | await cl.Message(
216 | content=f"{client.msg}",
217 | ).send()
218 | elif client.msg_received is Status.RECEIVED_AUDIO:
219 | output_audio_el = cl.Audio(content=client.msg, name="Response Audio")
220 | await cl.Message(
221 | author="Robot",
222 | type="assistant_message",
223 | content="",
224 | elements=[output_audio_el],
225 | ).send()
226 | else:
227 | trig = client.audio_trigger if msg_type is bytes else client.text_trigger
228 | await cl.Message(
229 |             content=f"There is no one listening on **{trig}**. Is this the correct topic? If not, set the correct trigger and response topics in the settings.",
230 | ).send()
231 |
232 |
233 | @cl.on_message
234 | async def on_message(msg: cl.Message):
235 | """
236 | On message, handle text message
237 | """
238 | publish_on_ros(msg.content)
239 | await handle_output(type(msg))
240 |
241 |
242 | @cl.on_audio_chunk
243 | async def on_audio_chunk(chunk: cl.AudioChunk):
244 | """Receive audio chunks
245 | :param chunk:
246 | :type chunk: cl.AudioChunk
247 | """
248 | if chunk.isStart:
249 | # Initialize new audio buffer
250 | buffer = BytesIO()
251 | buffer.name = "input_audio"
252 | cl.user_session.set("audio_buffer", buffer)
253 | cl.user_session.set("audio_mime_type", chunk.mimeType)
254 |
255 | # write chunks to buffer
256 | cl.user_session.get("audio_buffer").write(chunk.data)
257 |
258 |
259 | @cl.on_audio_end
260 | async def on_audio_end(elements: List[ElementBased]):
261 | """Publish audio to the topic.
262 | :param elements:
263 | :type elements: list[ElementBased]
264 | """
265 | audio_buffer: BytesIO = cl.user_session.get("audio_buffer")
266 | audio_buffer.seek(0)
267 | audio_mime_type: str = cl.user_session.get("audio_mime_type")
268 | audio_bytes = audio_buffer.read()
269 |
270 | # Add users audio to the chat
271 | input_audio_el = cl.Audio(
272 | mime=audio_mime_type, content=audio_bytes, name="User Audio"
273 | )
274 | await cl.Message(
275 | author="User",
276 | type="user_message",
277 | content="",
278 | elements=[input_audio_el, *elements],
279 | ).send()
280 |
281 | # publish using ROS client
282 | publish_on_ros(audio_bytes)
283 | await handle_output(type(audio_bytes))
284 |
285 |
286 | @cl.on_chat_end
287 | async def on_chat_end():
288 | """
289 | On chat end destroy client nodes
290 | """
291 | if rclpy.ok():
292 | client: ClientNode = cl.user_session.get("client")
293 | client.destroy_node()
294 | rclpy.shutdown()
295 |
--------------------------------------------------------------------------------
/agents/scripts/chainlit_client/chainlit.md:
--------------------------------------------------------------------------------
1 | # Tiny Web Client for ROS Agents
2 |
3 | This client is based on chainlit. In order to use it, run the following commands in order.
4 |
5 | `pip install chainlit`
6 |
7 | `ros2 run automatika_embodied_agents tiny_web_client`
8 |
9 | The client displays a web UI on **localhost:8000**. Open this link in a browser.
10 |
11 | ROS input and output topic settings for text and audio topics can be configured from the web UI by pressing the settings icon.
12 |
--------------------------------------------------------------------------------
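For the web client above to get a reply, some component has to subscribe to its default trigger topic and publish on its default target topic. The sketch below wires the MLLM component from earlier in this package to the client's text defaults (`text0` in, `text1` out); the Ollama model checkpoint is taken from the test fixtures and is illustrative, and actually launching the component (e.g. via the `executable` script below) is not shown.

```python
from agents.components import MLLM
from agents.config import MLLMConfig
from agents.clients.ollama import OllamaClient
from agents.models import OllamaModel
from agents.ros import Topic

# Topics matching the web client's default settings
text0 = Topic(name="text0", msg_type="String")
image0 = Topic(name="image0", msg_type="Image")
text1 = Topic(name="text1", msg_type="String")

# Checkpoint name is illustrative (same as the one used in the tests below)
model_client = OllamaClient(OllamaModel(name="llava", checkpoint="llava"))

mllm = MLLM(
    inputs=[text0, image0],
    outputs=[text1],
    model_client=model_client,
    config=MLLMConfig(),
    trigger=text0,
    component_name="mllm_component",
)
```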
/agents/scripts/chainlit_client/tiny_web_client:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | from pathlib import Path
3 | from chainlit.cli import run_chainlit
4 | from chainlit.config import config
5 |
6 |
7 | def main():
8 | """Run from ROS"""
9 | # TODO: Add chainlit option handling via ROS
10 |
11 | root_path = Path(__file__).parent / Path("app.py")
12 |
13 | # Set general config options
14 | config.run.headless = True
15 | config.project.enable_telemetry = False
16 | config.root = str(root_path.parent)
17 |
18 | # Set audio config options
19 | config.features.audio.sample_rate = 16000 # type: ignore
20 | config.features.audio.initial_silence_timeout = 2000 # type: ignore
21 | config.features.audio.silence_timeout = 1000 # type: ignore
22 |
23 | run_chainlit(str(root_path))
24 |
25 |
26 | if __name__ == "__main__":
27 | main()
28 |
--------------------------------------------------------------------------------
/agents/scripts/executable:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import json
3 | import argparse
4 | from typing import List, Dict, Union
5 |
6 | import rclpy
7 | import setproctitle
8 | from rclpy.executors import MultiThreadedExecutor
9 | from rclpy.utilities import try_shutdown
10 |
11 | from agents import config as all_configs
12 | from agents import components as all_components
13 | from agents import clients
14 | from agents.ros import Topic, FixedInput, MapLayer, Route
15 |
16 |
17 | def _parse_args() -> tuple[argparse.Namespace, List[str]]:
18 | """Parse arguments."""
19 | parser = argparse.ArgumentParser(description="Component Executable Config")
20 | parser.add_argument(
21 | "--config_type", type=str, help="Component configuration class name"
22 | )
23 | parser.add_argument("--component_type", type=str, help="Component class name")
24 | parser.add_argument(
25 | "--node_name",
26 | type=str,
27 | help="Component ROS2 node name",
28 | )
29 | parser.add_argument("--config", type=str, help="Component configuration object")
30 | parser.add_argument(
31 | "--inputs",
32 | type=str,
33 | help="Component input topics",
34 | )
35 | parser.add_argument(
36 | "--outputs",
37 | type=str,
38 | help="Component output topics",
39 | )
40 | parser.add_argument(
41 | "--routes",
42 | type=str,
43 | help="Semantic router routes",
44 | )
45 | parser.add_argument(
46 | "--layers",
47 | type=str,
48 | help="Map Encoding layers",
49 | )
50 | parser.add_argument(
51 | "--trigger",
52 | type=str,
53 | help="Component trigger",
54 | )
55 | parser.add_argument(
56 | "--model_client",
57 | type=str,
58 | help="Model Client",
59 | )
60 | parser.add_argument(
61 | "--db_client",
62 | type=str,
63 | help="DB Client",
64 | )
65 | parser.add_argument(
66 | "--config_file", type=str, help="Path to configuration YAML file"
67 | )
68 | parser.add_argument(
69 | "--events", type=str, help="Events to be monitored by the component"
70 | )
71 | parser.add_argument(
72 | "--actions", type=str, help="Actions associated with the component Events"
73 | )
74 | parser.add_argument(
75 | "--external_processors",
76 | type=str,
77 | help="External processors associated with the component input and output topics",
78 | )
79 | return parser.parse_known_args()
80 |
81 |
82 | def _parse_component_config(
83 | args: argparse.Namespace,
84 | ) -> all_configs.BaseComponentConfig:
85 | """Parse the component config object
86 |
87 | :param args: Command line arguments
88 | :type args: argparse.Namespace
89 |
90 | :return: Component config object
91 | :rtype: object
92 | """
93 | config_type = args.config_type or None
94 | if not config_type:
95 | raise ValueError("config_type must be provided")
96 |
97 | # Get config type and update from json arg
98 | config_class = getattr(all_configs, config_type)
99 | if not config_class:
100 | raise TypeError(
101 | f"Unknown config_type '{config_type}'. Known types are {all_configs.__all__}"
102 | )
103 |
104 | config = config_class(**json.loads(args.config))
105 |
106 | return config
107 |
108 |
109 | def _parse_trigger(trigger_str: str) -> Union[Topic, List[Topic], float]:
110 | """Parse component trigger json string
111 |
112 | :param trigger_str: Trigger JSON string
113 | :type trigger_str: str
114 |
115 | :return: Trigger topics or float
116 | :rtype: Topic | List[Topic] | float
117 | """
118 | trigger_json = json.loads(trigger_str)
119 | if isinstance(trigger_json, List):
120 | return [Topic(**json.loads(t)) for t in trigger_json]
121 | elif isinstance(trigger_json, Dict):
122 | return Topic(**trigger_json)
123 | else:
124 | # return float
125 | return trigger_json
126 |
127 |
128 | def _deserialize_topics(serialized_topics: str) -> List[Dict]:
129 | list_of_str = json.loads(serialized_topics)
130 | return [json.loads(t) for t in list_of_str]
131 |
132 |
133 | def _parse_ros_args(args_names: List[str]) -> List[str]:
134 | """Parse ROS arguments from command line arguments
135 |
136 | :param args_names: List of all parsed arguments
137 | :type args_names: list[str]
138 |
139 | :return: List ROS parsed arguments
140 | :rtype: list[str]
141 | """
142 | # Look for --ros-args in ros_args
143 | ros_args_start = None
144 | if "--ros-args" in args_names:
145 | ros_args_start = args_names.index("--ros-args")
146 |
147 | if ros_args_start is not None:
148 | ros_specific_args = args_names[ros_args_start:]
149 | else:
150 | ros_specific_args = []
151 | return ros_specific_args
152 |
153 |
154 | def main():
155 |     """Executable main function to run a component as a ROS2 node in a new process.
156 |     Used to start a node using the ROS Sugar Launcher. Extends functionality from ROS Sugar.
157 |
158 |     The component type, name, config and inputs/outputs are parsed from the
159 |     command-line arguments (see `_parse_args`); this function itself takes no
160 |     parameters.
161 |
162 |     :raises ValueError: If the component or component config are unknown classes
163 |     :raises ValueError: If the component cannot be started with the provided arguments
164 |     """
165 | args, args_names = _parse_args()
166 |
167 | # Initialize rclpy with the ros-specific arguments
168 | rclpy.init(args=_parse_ros_args(args_names))
169 |
170 | component_type = args.component_type or None
171 |
172 | if not component_type:
173 | raise ValueError("Cannot launch without providing a component_type")
174 |
175 | comp_class = getattr(all_components, component_type)
176 |
177 | if not comp_class:
178 | raise ValueError(
179 | f"Cannot launch unknown component type '{component_type}'. Known types are: '{all_components.__all__}'"
180 | )
181 |
182 | # Get name
183 | component_name = args.node_name or None
184 |
185 | if not component_name:
186 | raise ValueError("Cannot launch component without specifying a name")
187 |
188 | # SET PROCESS NAME
189 | setproctitle.setproctitle(component_name)
190 |
191 | config = _parse_component_config(args)
192 |
193 | # Get Yaml config file if provided
194 | config_file = args.config_file or None
195 |
196 | # Get inputs/outputs/layers/routes
197 | inputs = (
198 | [
199 | FixedInput(**i) if i.get("fixed") else Topic(**i)
200 | for i in _deserialize_topics(args.inputs)
201 | ]
202 | if args.inputs
203 | else None
204 | )
205 | outputs = (
206 | [Topic(**o) for o in _deserialize_topics(args.outputs)]
207 | if args.outputs
208 | else None
209 | )
210 | layers = (
211 | [MapLayer(**i) for i in _deserialize_topics(args.layers)]
212 | if args.layers
213 | else None
214 | )
215 | routes = (
216 | [Route(**r) for r in _deserialize_topics(args.routes)] if args.routes else None
217 | )
218 |
219 | # Get triggers
220 | trigger = _parse_trigger(args.trigger)
221 |
222 | # Init the component
223 | # Semantic Router Component
224 | if component_type == all_components.SemanticRouter.__name__:
225 | db_client_json = json.loads(args.db_client)
226 | db_client = getattr(clients, db_client_json["client_type"])(**db_client_json)
227 | component = comp_class(
228 | inputs=inputs,
229 | routes=routes,
230 | db_client=db_client,
231 | config=config,
232 | default_route=config._default_route,
233 | component_name=component_name,
234 | config_file=config_file,
235 | )
236 | # Map Encoding Component
237 | elif component_type == all_components.MapEncoding.__name__:
238 | db_client_json = json.loads(args.db_client)
239 | db_client = getattr(clients, db_client_json["client_type"])(**db_client_json)
240 | component = comp_class(
241 | layers=layers,
242 | position=config._position,
243 | map_topic=config._map_topic,
244 | db_client=db_client,
245 | config=config,
246 | trigger=trigger,
247 | component_name=component_name,
248 | config_file=config_file,
249 | )
250 |
251 | # All other components
252 | else:
253 | if args.model_client:
254 | model_client_json = json.loads(args.model_client)
255 | model_client = getattr(clients, model_client_json["client_type"])(
256 | **model_client_json
257 | )
258 | else:
259 | model_client = None
260 | if args.db_client:
261 | db_client_json = json.loads(args.db_client)
262 | db_client = getattr(clients, db_client_json["client_type"])(
263 | **db_client_json
264 | )
265 | else:
266 | db_client = None
267 |
268 | component = comp_class(
269 | inputs=inputs,
270 | outputs=outputs,
271 | model_client=model_client,
272 | db_client=db_client,
273 | trigger=trigger,
274 | config=config,
275 | component_name=component_name,
276 | config_file=config_file,
277 | )
278 |
279 | # Init the node with rclpy
280 | component.rclpy_init_node()
281 |
282 | # Set events/actions
283 | events_json = args.events or None
284 | actions_json = args.actions or None
285 |
286 | if events_json and actions_json:
287 | component._events_json = events_json
288 | component._actions_json = actions_json
289 |
290 | # Set external processors
291 | external_processors = args.external_processors or None
292 | if external_processors:
293 | component._external_processors_json = external_processors
294 |
295 | executor = MultiThreadedExecutor()
296 |
297 | executor.add_node(component)
298 |
299 | try:
300 | executor.spin()
301 |
302 | except KeyboardInterrupt:
303 | pass
304 |
305 | finally:
306 | executor.remove_node(component)
307 | try_shutdown()
308 |
309 |
310 | if __name__ == "__main__":
311 | main()
312 |
--------------------------------------------------------------------------------
/agents/tests/test_clients.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import time
3 | import subprocess
4 | import shutil
5 |
6 | import cv2
7 | import pytest
8 | from agents.models import Idefics2, OllamaModel
9 | from agents.vectordbs import ChromaDB
10 | from agents.clients.roboml import (
11 | HTTPModelClient,
12 | HTTPDBClient,
13 | RESPDBClient,
14 | RESPModelClient,
15 | )
16 | from agents.clients.ollama import OllamaClient
17 |
18 | HOST = "http://localhost"
19 | RAY_PORT = 8000
20 | RESP_PORT = 6379
21 |
22 |
23 | @pytest.fixture(scope="class")
24 | def http_clients():
25 | """Fixture to run roboml ray and make its clients before tests are run"""
26 |
27 | # start server
28 | p = subprocess.Popen(["roboml"])
29 | # give it 20 seconds to start before sending request
30 | time.sleep(20)
31 | model = Idefics2(name="idefics")
32 | model_client = HTTPModelClient(model, port=RAY_PORT, logging_level="debug")
33 | db = ChromaDB(name="chroma", db_location="./http_data")
34 | db_client = HTTPDBClient(db, port=RAY_PORT, logging_level="debug")
35 |
36 | yield {"model": model_client, "db": db_client}
37 |
38 | # terminate server process - kill to remove ray monitoring child
39 | p.kill()
40 | shutil.rmtree("./http_data")
41 |
42 |
43 | @pytest.fixture(scope="class")
44 | def resp_clients():
45 | """Fixture to run roboml-resp and make its clients before tests are run"""
46 |
47 | # start server
48 | p = subprocess.Popen(["roboml-resp"])
49 | # give it 20 seconds to start before sending request
50 | time.sleep(20)
51 | model = Idefics2(name="idefics")
52 | model_client = RESPModelClient(model, logging_level="debug")
53 | db = ChromaDB(name="chroma", db_location="./resp_data")
54 | db_client = RESPDBClient(db, logging_level="debug")
55 |
56 | yield {"model": model_client, "db": db_client}
57 |
58 | # terminate server process
59 | p.terminate()
60 | shutil.rmtree("./resp_data")
61 |
62 |
63 | @pytest.fixture(scope="class")
64 | def ollama_client():
65 | """Fixture to create client ollama before tests are run"""
66 |
67 | model = OllamaModel(name="llava", checkpoint="llava")
68 | ollama_client = OllamaClient(model, logging_level="debug")
69 | yield ollama_client
70 |
71 |
72 | @pytest.fixture
73 | def loaded_img():
74 | """Fixture to load test image"""
75 |     return cv2.cvtColor(cv2.imread("agents/resources/test.jpeg"), cv2.COLOR_BGR2RGB)
76 |
77 |
78 | @pytest.fixture
79 | def data():
80 | return {
81 | "ids": ["a"],
82 | "metadatas": [{"something": "about a"}],
83 | "documents": ["description of a"],
84 | "collection_name": "alphabets",
85 | }
86 |
87 |
88 | class TestRobomlHTTPClient:
89 | """
90 | Test roboml http client
91 | """
92 |
93 | def test_model_init(self, http_clients):
94 | """
95 | Test roboml http model client init
96 | """
97 | try:
98 | http_clients["model"].check_connection()
99 | except Exception:
100 | logging.error(
101 | "Make sure roboml is installed on this machine before running these tests. roboml can be installed with `pip install roboml`"
102 | )
103 | raise
104 | http_clients["model"].initialize()
105 |
106 | def test_model_inference(self, http_clients, loaded_img):
107 | """
108 | Test roboml http model client inference
109 | """
110 | inference_input = {"query": "What do you see?", "images": [loaded_img]}
111 | result = http_clients["model"].inference(inference_input)
112 | assert result is not None
113 | assert result["output"] is not None
114 | logging.info(result["output"])
115 |
116 | def test_model_deinit(self, http_clients):
117 | """
118 | Test roboml http model client deinit
119 | """
120 | http_clients["model"].deinitialize()
121 |
122 | def test_db_init(self, http_clients):
123 | """
124 | Test roboml http db client init
125 | """
126 | http_clients["db"].check_connection()
127 | http_clients["db"].initialize()
128 |
129 | def test_db_add(self, http_clients, data):
130 | """
131 | Test roboml http db client add
132 | """
133 | result = http_clients["db"].add(data)
134 | assert result is not None
135 | assert result["output"] is not None
136 | logging.info(result["output"])
137 |
138 | def test_db_conditional_add(self, http_clients, data):
139 | """
140 | Test roboml http db client conditional add
141 | """
142 | result = http_clients["db"].conditional_add(data)
143 | assert result is not None
144 | assert result["output"] is not None
145 | logging.info(result["output"])
146 |
147 | def test_db_metadata_query(self, http_clients, data):
148 | """
149 | Test roboml http db client metadata query
150 | """
151 | metadata_query = {
152 | "metadatas": data["metadatas"],
153 | "collection_name": data["collection_name"],
154 | }
155 | result = http_clients["db"].metadata_query(metadata_query)
156 | assert result is not None
157 | assert result["output"] is not None
158 | logging.info(result["output"])
159 |
160 | def test_db_query(self, http_clients, data):
161 | """
162 | Test roboml http db client query
163 | """
164 | metadata_query = {
165 | "query": "what is a",
166 | "collection_name": data["collection_name"],
167 | }
168 | result = http_clients["db"].query(metadata_query)
169 | assert result is not None
170 | assert result["output"] is not None
171 | logging.info(result["output"])
172 |
173 | def test_db_deinit(self, http_clients):
174 | """
175 | Test roboml http db client deinit
176 | """
177 | http_clients["db"].deinitialize()
178 |
179 |
180 | class TestRobomlRESPClient:
181 | """
182 | Test roboml resp client
183 | """
184 |
185 | def test_model_init(self, resp_clients):
186 | """
187 | Test roboml resp model client init
188 | """
189 | try:
190 | resp_clients["model"].check_connection()
191 | except Exception:
192 | logging.error(
193 | "Make sure roboml is installed on this machine before running these tests. roboml can be installed with `pip install roboml`"
194 | )
195 | raise
196 | resp_clients["model"].initialize()
197 |
198 | def test_model_inference(self, resp_clients, loaded_img):
199 | """
200 | Test roboml resp model client inference
201 | """
202 | inference_input = {"query": "What do you see?", "images": [loaded_img]}
203 | result = resp_clients["model"].inference(inference_input)
204 | assert result is not None
205 | assert result["output"] is not None
206 | logging.info(result["output"])
207 |
208 | def test_model_deinit(self, resp_clients):
209 | """
210 | Test roboml resp model client deinit
211 | """
212 | resp_clients["model"].deinitialize()
213 |
214 | def test_db_init(self, resp_clients):
215 | """
216 | Test roboml resp db client init
217 | """
218 | resp_clients["db"].check_connection()
219 | resp_clients["db"].initialize()
220 |
221 | def test_db_add(self, resp_clients, data):
222 | """
223 | Test roboml resp db client add
224 | """
225 | result = resp_clients["db"].add(data)
226 | assert result is not None
227 | assert result["output"] is not None
228 | logging.info(result["output"])
229 |
230 | def test_db_conditional_add(self, resp_clients, data):
231 | """
232 | Test roboml resp db client conditional add
233 | """
234 | result = resp_clients["db"].conditional_add(data)
235 | assert result is not None
236 | assert result["output"] is not None
237 | logging.info(result["output"])
238 |
239 | def test_db_metadata_query(self, resp_clients, data):
240 | """
241 | Test roboml resp db client metadata query
242 | """
243 | metadata_query = {
244 | "metadatas": data["metadatas"],
245 | "collection_name": data["collection_name"],
246 | }
247 | result = resp_clients["db"].metadata_query(metadata_query)
248 | assert result is not None
249 | assert result["output"] is not None
250 | logging.info(result["output"])
251 |
252 | def test_db_query(self, resp_clients, data):
253 | """
254 | Test roboml resp db client query
255 | """
256 | metadata_query = {
257 | "query": "what is a",
258 | "collection_name": data["collection_name"],
259 | }
260 | result = resp_clients["db"].query(metadata_query)
261 | assert result is not None
262 | assert result["output"] is not None
263 | logging.info(result["output"])
264 |
265 | def test_db_deinit(self, resp_clients):
266 | """
267 | Test roboml resp db client deinit
268 | """
269 | resp_clients["db"].deinitialize()
270 |
271 |
272 | class TestOllamaClient:
273 | """
274 | Test ollama client
275 | """
276 |
277 | def test_model_init(self, ollama_client):
278 | """
279 | Test ollama model client init
280 | """
281 | try:
282 | ollama_client.check_connection()
283 | except Exception:
284 | logging.error(
285 | "Make sure Ollama is installed on this machine before running these tests. Visit https://ollama.com for installation instructions."
286 | )
287 | raise
288 | ollama_client.initialize()
289 |
290 | def test_model_inference(self, ollama_client, loaded_img):
291 | """
292 | Test ollama model client inference
293 | """
294 | inference_input = {"query": "What do you see?", "images": [loaded_img]}
295 | result = ollama_client.inference(inference_input)
296 | assert result is not None
297 | assert result["output"] is not None
298 | logging.info(result["output"])
299 |
300 | def test_model_deinit(self, ollama_client):
301 | """
302 | Test ollama model client deinit
303 | """
304 | ollama_client.deinitialize()
305 |
--------------------------------------------------------------------------------
/docs/_static/ROS_AGENTS.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/docs/_static/ROS_AGENTS.png
--------------------------------------------------------------------------------
/docs/_static/ROS_AGENTS_DARK.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/docs/_static/ROS_AGENTS_DARK.png
--------------------------------------------------------------------------------
/docs/_static/automatika-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/docs/_static/automatika-logo.png
--------------------------------------------------------------------------------
/docs/_static/complete_dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/docs/_static/complete_dark.png
--------------------------------------------------------------------------------
/docs/_static/complete_light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/automatika-robotics/ros-agents/1fbf24d683bc5bcb9398c8ab3146ffd8197020fc/docs/_static/complete_light.png
--------------------------------------------------------------------------------
/docs/basics.md:
--------------------------------------------------------------------------------
1 | # Basic Concepts 📚
2 |
3 | The following is an overview of basic building blocks of ROS Agents. You can follow the links in each subsection to dig deeper.
4 |
5 | ## Component
6 |
7 | A Component is the main execution unit in ROS Agents; in essence, each component is syntactic sugar over a ROS2 Lifecycle Node. All the functionalities implemented in ROS2 nodes are available in components. Components take a single Topic or a list of Topics as inputs and outputs. Depending on the component's functionality, certain types of Topics might be mandatory.
8 |
9 | ```{note}
10 | To learn more about components, check out the [ROS Sugar Documentation](https://automatika-robotics.github.io/ros-sugar/).
11 | ```
12 |
13 | ### Components Available in ROS Agents
14 |
15 | ROS Agents provides various ready-to-use components. You can see their details [here](apidocs/agents/agents.components).
16 |
17 | ### Component Config
18 |
19 | Each component can take in an optional config. Configs are generally [attrs](https://www.attrs.org/en/stable/) classes; for components that use ML models, the config is also where inference parameters are defined. You can see the default options for the config of each available component [here](apidocs/agents/agents.config).
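
As a quick sketch (the parameter values shown here are taken from the examples later in this documentation; the available fields and defaults differ per component):

```python
from agents.config import VisionConfig, LLMConfig

# Detection threshold for a Vision component
detection_config = VisionConfig(threshold=0.5)

# RAG-related options for an LLM component
llm_config = LLMConfig(
    enable_rag=True,
    collection_name="map",
    distance_func="l2",
    n_results=1,
)
```

The resulting config object is simply passed to the component's `config` parameter at initialization.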
20 |
21 | ### Component RunType
22 |
23 | In ROS Agents, components can be of the following two types:
24 |
25 | ```{list-table}
26 | :widths: 10 80
27 | * - **Timed**
28 | - Execute the main execution function in a timed loop.
29 | * - **Event**
30 | - Execute the main execution function based on a trigger topic/event.
31 | ```
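
In the examples later in this documentation, a component is made event-based by passing a Topic as its `trigger`, and timed by passing a period in seconds. A minimal sketch assembled from those examples (the Llava/Ollama client is just one possible model client):

```python
from agents.components import MLLM
from agents.clients.ollama import OllamaClient
from agents.models import Llava
from agents.ros import Topic

image = Topic(name="image_raw", msg_type="Image")
query = Topic(name="query", msg_type="String")
answer = Topic(name="answer", msg_type="String")

llava_client = OllamaClient(Llava(name="llava"))

# Event component: runs whenever a message arrives on `query`
vqa = MLLM(inputs=[query, image], outputs=[answer],
           model_client=llava_client, trigger=query,
           component_name="visual_q_and_a")

# Timed component: runs every 15 seconds
introspector = MLLM(inputs=[query, image], outputs=[answer],
                    model_client=llava_client, trigger=15.0,
                    component_name="introspector")
```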
32 |
33 | ### Health Check and Fallback
34 |
35 | Each component maintains a health status, based on which one can configure various fallback options that allow the component to recover from failures or shut down gracefully. This aspect can be significant for embodied autonomous agents, not just in terms of safety but also for generally coherent and reliable performance. To learn more about these topics, check out the [ROS Sugar Documentation](https://automatika-robotics.github.io/ros-sugar/).
36 |
37 | ## Topic
38 |
39 | A [topic](apidocs/agents/agents.ros) is an idiomatic wrapper for a ROS2 topic. Topics can be given as inputs or outputs to components. When given as inputs, components automatically create listeners for the topics upon their activation. When given as outputs, components create publishers for publishing to the topic. Each topic has a name and a data type, which define its listening callback and publishing behavior. The data type can be provided to the topic as a string. Check out the list of supported data types [here](https://automatika-robotics.github.io/ros-sugar/advanced/types.html).
40 |
41 | ```{note}
42 | Learn more about Topics in [ROS Sugar](https://automatika-robotics.github.io/ros-sugar/).
43 | ```
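
For example, a camera image topic and a text topic can be declared as follows (the names are arbitrary and only need to match the actual ROS2 topic names in your system):

```python
from agents.ros import Topic

image_topic = Topic(name="image_raw", msg_type="Image")
text_topic = Topic(name="text0", msg_type="String")
```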
44 |
45 | ## Model/DB Client
46 |
47 | Certain components in ROS Agents deal with ML models, vector DBs or both. These components take in a model or DB client as one of their initialization parameters. The reason for this separate abstraction is to enforce _separation of concerns_. An ML model can run on the edge hardware itself, on a powerful compute node in the network, or in the cloud; the components running on the robot can always use the model (or DB) via a client in a standardized way. This also makes the components independent of the model serving platforms, which can implement various inference optimizations that are usually model specific. Thus one can choose an ML serving platform with the best latency/accuracy tradeoff, depending on the application concerns.
48 |
49 | All clients implement a connection check. ML clients must implement a model inference method and, optionally, model initialization and deinitialization methods (since an embodied agent can initialize different models (or fine-tuned versions of the same model) for the same component, depending on some event in the environment). Similarly, vector DB clients implement standard CRUD methods for vector DBs. Check out the list of available clients [here](apidocs/agents/agents.clients).
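
As an illustration (the client and model names are taken from the examples elsewhere in these docs; which client you pick depends on where the model or DB is actually served):

```python
from agents.clients.ollama import OllamaClient
from agents.clients.roboml import HTTPModelClient, HTTPDBClient
from agents.models import Llama3_1, Idefics2
from agents.vectordbs import ChromaDB

# The same component code works regardless of the serving platform;
# only the client changes.
llama_client = OllamaClient(Llama3_1(name="llama"))
idefics_client = HTTPModelClient(Idefics2(name="idefics"))

# Vector DB access goes through a DB client in the same way
chroma_client = HTTPDBClient(db=ChromaDB(name="MainDB"))
```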
50 |
51 | ## Models/DBs
52 |
53 | The clients mentioned above take as input a model or vector database specification. These are in the form of [attrs](https://www.attrs.org/en/stable/) classes and define initialization parameters, such as quantization for ML models or the choice of encoding model for vector DBs, among others. The available models and databases that can be instantiated on a particular model serving platform usually depend on the platform itself. However, with these model and vector DB specifications, we aim to standardize model initialization across platforms. Check out the list of available [models](apidocs/agents/agents.models) and [vector DBs](apidocs/agents/agents.vectordbs).
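
For instance, a vision model and a vector DB can be specified as follows (the `checkpoint` value comes from the examples in these docs; `db_location` is an illustrative path, though the parameter itself appears in the package tests):

```python
from agents.models import VisionModel
from agents.vectordbs import ChromaDB

# Select a specific mmdetection checkpoint for the vision model
object_detection = VisionModel(name="object_detection",
                               checkpoint="dino-4scale_r50_8xb2-12e_coco")

# A ChromaDB specification with a local storage path
chroma = ChromaDB(name="MainDB", db_location="./db_data")
```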
54 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | import os
3 | import sys
4 | from datetime import date
5 | import xml.etree.ElementTree as ET
6 |
7 | sys.path.insert(0, os.path.abspath(".."))
8 | version = ET.parse("../agents/package.xml").getroot()[1].text
9 | print("Found version:", version)
10 |
11 | project = "ROS Agents"
12 | copyright = f"{date.today().year}, Automatika Robotics"
13 | author = "Automatika Robotics"
14 | release = version
15 |
16 | extensions = [
17 | "sphinx.ext.viewcode",
18 | "sphinx.ext.doctest",
19 | "sphinx_copybutton", # install with `pip install sphinx-copybutton`
20 | "autodoc2", # install with `pip install sphinx-autodoc2`
21 | "myst_parser", # install with `pip install myst-parser`
22 | ]
23 |
24 | autodoc2_packages = [
25 | {
26 | "module": "agents",
27 | "path": "../agents/agents",
28 | "exclude_dirs": ["__pycache__", "utils"],
29 | "exclude_files": [
30 | "callbacks.py",
31 | "publisher.py",
32 | "component_base.py",
33 | "model_component.py",
34 | "model_base.py",
35 | "db_base.py",
36 | "executable.py",
37 | ],
38 | },
39 | ]
40 |
41 | autodoc2_docstrings = "all"
42 | autodoc2_class_docstring = "both" # bug in autodoc2, should be `merge`
43 | autodoc2_render_plugin = "myst"
44 | autodoc2_hidden_objects = ["private", "dunder", "undoc"]
45 | autodoc2_module_all_regexes = [
46 | r"agents.config",
47 | r"agents.models",
48 | r"agents.vectordbs",
49 | r"agents.ros",
50 | r"agents.clients\.[^\.]+",
51 | ]
52 |
53 | templates_path = ["_templates"]
54 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
55 |
56 | myst_enable_extensions = [
57 | "amsmath",
58 | "attrs_inline",
59 | "colon_fence",
60 | "deflist",
61 | "dollarmath",
62 | "fieldlist",
63 | "html_admonition",
64 | "html_image",
65 | "linkify",
66 | "replacements",
67 | "smartquotes",
68 | "strikethrough",
69 | "substitution",
70 | "tasklist",
71 | ]
72 | language = "en"
73 | myst_html_meta = {
74 | "google-site-verification": "cQVj-BaADcGVOGB7GOvfbkgJjxni10C2fYWCZ03jOeo"
75 | }
76 |
77 |
78 | html_theme = "sphinx_book_theme" # install with `pip install sphinx-book-theme`
79 | html_static_path = ["_static"]
80 | html_theme_options = {
81 | "logo": {
82 | "image_light": "_static/ROS_AGENTS_DARK.png",
83 | "image_dark": "_static/ROS_AGENTS.png",
84 | },
85 | "icon_links": [
86 | {
87 | "name": "Automatika",
88 | "url": "https://automatikarobotics.com/",
89 | "icon": "_static/automatika-logo.png",
90 | "type": "local",
91 | },
92 | {
93 | "name": "GitHub",
94 | "url": "https://github.com/automatika-robotics/ros-agents",
95 | "icon": "fa-brands fa-github",
96 | },
97 | {
98 | "name": "Discord",
99 | "url": "https://discord.gg/cAW3BWwt",
100 | "icon": "fa-brands fa-discord",
101 | },
102 | ],
103 | "path_to_docs": "docs",
104 | "repository_url": "https://github.com/automatika-robotics/ros-agents",
105 | "repository_branch": "main",
106 | "use_source_button": True,
107 | "use_issues_button": True,
108 | "use_edit_page_button": True,
109 | "show_navbar_depth": 2,
110 | }
111 |
--------------------------------------------------------------------------------
/docs/examples/complete.md:
--------------------------------------------------------------------------------
1 | # Bringing it all together 🤖
2 |
3 | In this example we will combine everything we implemented in the previous examples to create one big graph of components. Afterwards we will analyze what we have accomplished. Here is what the code looks like:
4 |
5 | ```python
6 | import numpy as np
7 | import json
8 | from typing import Optional
9 | from agents.components import MLLM, SpeechToText, TextToSpeech, LLM, Vision, MapEncoding, SemanticRouter
10 | from agents.config import SpeechToTextConfig, TextToSpeechConfig
11 | from agents.clients.roboml import HTTPModelClient, RESPModelClient, HTTPDBClient
12 | from agents.clients.ollama import OllamaClient
13 | from agents.models import Whisper, SpeechT5, Llava, Llama3_1, VisionModel
14 | from agents.vectordbs import ChromaDB
15 | from agents.config import VisionConfig, LLMConfig, MapConfig, SemanticRouterConfig
16 | from agents.ros import Topic, Launcher, FixedInput, MapLayer, Route
17 |
18 |
19 | ### Setup our models and vectordb ###
20 | whisper = Whisper(name="whisper")
21 | whisper_client = HTTPModelClient(whisper)
22 | speecht5 = SpeechT5(name="speecht5")
23 | speecht5_client = HTTPModelClient(speecht5)
24 | object_detection_model = VisionModel(name="dino_4scale",
25 | checkpoint="dino-4scale_r50_8xb2-12e_coco")
26 | detection_client = RESPModelClient(object_detection_model)
27 | llava = Llava(name="llava")
28 | llava_client = OllamaClient(llava)
29 | llama = Llama3_1(name="llama")
30 | llama_client = OllamaClient(llama)
31 | chroma = ChromaDB(name="MainDB")
32 | chroma_client = HTTPDBClient(db=chroma)
33 |
34 | ### Setup our components ###
35 | # Setup a speech to text component
36 | audio_in = Topic(name="audio0", msg_type="Audio")
37 | query_topic = Topic(name="question", msg_type="String")
38 |
39 | speech_to_text = SpeechToText(
40 | inputs=[audio_in],
41 | outputs=[query_topic],
42 | model_client=whisper_client,
43 | trigger=audio_in,
44 | config=SpeechToTextConfig(enable_vad=True), # option to always listen for speech through the microphone
45 | component_name="speech_to_text"
46 | )
47 |
48 | # Setup a text to speech component
49 | query_answer = Topic(name="answer", msg_type="String")
50 |
51 | t2s_config = TextToSpeechConfig(play_on_device=True)
52 |
53 | text_to_speech = TextToSpeech(
54 | inputs=[query_answer],
55 | trigger=query_answer,
56 | model_client=speecht5_client,
57 | config=t2s_config,
58 | component_name="text_to_speech",
59 | )
60 |
61 | # Setup a vision component for object detection
62 | image0 = Topic(name="image_raw", msg_type="Image")
63 | detections_topic = Topic(name="detections", msg_type="Detections")
64 |
65 | detection_config = VisionConfig(threshold=0.5)
66 | vision = Vision(
67 | inputs=[image0],
68 | outputs=[detections_topic],
69 | trigger=image0,
70 | config=detection_config,
71 | model_client=detection_client,
72 | component_name="object_detection",
73 | )
74 |
75 | # Define a generic mllm component for vqa
76 | mllm_query = Topic(name="mllm_query", msg_type="String")
77 |
78 | mllm = MLLM(
79 | inputs=[mllm_query, image0, detections_topic],
80 | outputs=[query_answer],
81 | model_client=llava_client,
82 | trigger=mllm_query,
83 | component_name="visual_q_and_a"
84 | )
85 |
86 | mllm.set_component_prompt(
87 | template="""Imagine you are a robot.
88 | This image has following items: {{ detections }}.
89 | Answer the following about this image: {{ text0 }}"""
90 | )
91 |
92 | # Define a fixed input mllm component that does introspection
93 | introspection_query = FixedInput(
94 | name="introspection_query", msg_type="String",
95 | fixed="What kind of a room is this? Is it an office, a bedroom or a kitchen? Give a one word answer, out of the given choices")
96 | introspection_answer = Topic(name="introspection_answer", msg_type="String")
97 |
98 | introspector = MLLM(
99 | inputs=[introspection_query, image0],
100 | outputs=[introspection_answer],
101 | model_client=llava_client,
102 | trigger=15.0,
103 | component_name="introspector",
104 | )
105 |
106 |
107 | def introspection_validation(output: str) -> Optional[str]:
108 | for option in ["office", "bedroom", "kitchen"]:
109 | if option in output.lower():
110 | return option
111 |
112 |
113 | introspector.add_publisher_preprocessor(introspection_answer, introspection_validation)
114 |
115 | # Define a semantic map using MapEncoding component
116 | layer1 = MapLayer(subscribes_to=detections_topic, temporal_change=True)
117 | layer2 = MapLayer(subscribes_to=introspection_answer, resolution_multiple=3)
118 |
119 | position = Topic(name="odom", msg_type="Odometry")
120 | map_topic = Topic(name="map", msg_type="OccupancyGrid")
121 |
122 | map_conf = MapConfig(map_name="map")
123 | map = MapEncoding(
124 | layers=[layer1, layer2],
125 | position=position,
126 | map_topic=map_topic,
127 | config=map_conf,
128 | db_client=chroma_client,
129 | trigger=15.0,
130 | component_name="map_encoder"
131 | )
132 |
133 | # Define a generic LLM component
134 | llm_query = Topic(name="llm_query", msg_type="String")
135 |
136 | llm = LLM(
137 | inputs=[llm_query],
138 | outputs=[query_answer],
139 | model_client=llama_client,
140 | trigger=[llm_query],
141 | component_name="general_q_and_a"
142 | )
143 |
144 | # Define a Go-to-X component using LLM
145 | goto_query = Topic(name="goto_query", msg_type="String")
146 | goal_point = Topic(name="goal_point", msg_type="PoseStamped")
147 |
148 | goto_config = LLMConfig(
149 | enable_rag=True,
150 | collection_name="map",
151 | distance_func="l2",
152 | n_results=1,
153 | add_metadata=True,
154 | )
155 |
156 | goto = LLM(
157 | inputs=[goto_query],
158 | outputs=[goal_point],
159 | model_client=llama_client,
160 | config=goto_config,
161 | db_client=chroma_client,
162 | trigger=goto_query,
163 | component_name="go_to_x",
164 | )
165 |
166 | goto.set_component_prompt(
167 | template="""From the given metadata, extract coordinates and provide
168 | the coordinates in the following json format:\n {"position": coordinates}"""
169 | )
170 |
171 |
172 | # pre-process the output before publishing to a topic of msg_type PoseStamped
173 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]:
174 | # extract the json part of the output string (including brackets)
175 | # one can use sophisticated regex parsing here but we'll keep it simple
176 | json_string = output[output.find("{") : output.rfind("}") + 1]
177 | # load the string as a json and extract position coordinates
178 | # if there is an error, return None, i.e. no output would be published to goal_point
179 | try:
180 | json_dict = json.loads(json_string)
181 | coordinates = np.fromstring(json_dict["position"], sep=',', dtype=np.float64)
182 | print('Coordinates Extracted:', coordinates)
183 | if coordinates.shape[0] < 2 or coordinates.shape[0] > 3:
184 | return
185 | elif coordinates.shape[0] == 2: # sometimes LLMs avoid adding the zeros of z-dimension
186 | coordinates = np.append(coordinates, 0)
187 | return coordinates
188 | except Exception:
189 | return
190 |
191 |
192 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point)
193 |
194 | # Define a semantic router between a generic LLM component, VQA MLLM component and Go-to-X component
195 | goto_route = Route(routes_to=goto_query,
196 | samples=["Go to the door", "Go to the kitchen",
197 | "Get me a glass", "Fetch a ball", "Go to hallway"])
198 |
199 | llm_route = Route(routes_to=llm_query,
200 | samples=["What is the capital of France?", "Is there life on Mars?",
201 | "How many tablespoons in a cup?", "How are you today?", "Whats up?"])
202 |
203 | mllm_route = Route(routes_to=mllm_query,
204 | samples=["Are we indoors or outdoors", "What do you see?", "Whats in front of you?",
205 | "Where are we", "Do you see any people?", "How many things are infront of you?",
206 | "Is this room occupied?"])
207 |
208 | router_config = SemanticRouterConfig(router_name="go-to-router", distance_func="l2")
209 | # Initialize the router component
210 | router = SemanticRouter(
211 | inputs=[query_topic],
212 | routes=[llm_route, goto_route, mllm_route],
213 | default_route=llm_route,
214 | config=router_config,
215 | db_client=chroma_client,
216 | component_name='router'
217 | )
218 |
219 | # Launch the components
220 | launcher = Launcher()
221 | launcher.add_pkg(
222 | components=[
223 | mllm,
224 | llm,
225 | goto,
226 | introspector,
227 | map,
228 | router,
229 | speech_to_text,
230 | text_to_speech,
231 | vision
232 | ]
233 | )
234 | launcher.bringup()
235 | ```
236 | ```{note}
237 | Note how we use the same model for the _general_q_and_a_ and _go_to_x_ components. Similarly, the _visual_q_and_a_ and _introspector_ components share a multimodal LLM.
238 | ```
239 |
240 | In the code block above, we have set up a fairly sophisticated embodied agent with the following capabilities:
241 |
242 | - A conversational interface using speech-to-text and text-to-speech models that uses the robot's microphone and playback speaker.
243 | - The ability to answer contextual queries based on the robot's camera, using an MLLM model.
244 | - The ability to answer generic queries, using an LLM model.
245 | - A semantic map of the robot's observations that acts as a spatio-temporal memory.
246 | - The ability to respond to Go-to-X commands utilizing the semantic map.
247 | - A single input interface that routes the input to different models based on its content.
248 |
249 | We can visualize the complete graph in the following diagram:
250 | ```{figure} ../_static/complete_dark.png
251 | :class: only-dark
252 | :alt: Complete embodied agent
253 | :align: center
254 | Complete embodied agent graph
255 | ```
256 | ```{figure} ../_static/complete_light.png
257 | :class: only-light
258 | :alt: Complete embodied agent
259 | :align: center
260 | Complete embodied agent graph
261 | ```
262 |
--------------------------------------------------------------------------------
/docs/examples/goto.md:
--------------------------------------------------------------------------------
1 | # Create a Go-to-X component using map data
2 |
3 | In the previous [example](semantic_map.md) we created a semantic map using the MapEncoding component. Intuitively, one can imagine that using the map data would require some form of RAG. Let us suppose that we want to create a Go-to-X component which, when given a command like 'Go to the yellow door', would retrieve the coordinates of the _yellow door_ from the map and publish them to a goal point topic of type _PoseStamped_ to be handled by our robot's navigation system. We will create our Go-to-X component using the LLM component provided by ROS Agents. We will start by initializing the component and configuring it to use RAG.
4 |
5 | ## Initialize the component
6 |
7 | ```python
8 | from agents.components import LLM
9 | from agents.models import Llama3_1
10 | from agents.config import LLMConfig
11 | from agents.clients.ollama import OllamaClient
12 | from agents.ros import Topic
13 |
14 | # Start a Llama3.1 based llm component using ollama client
15 | llama = Llama3_1(name="llama")
16 | llama_client = OllamaClient(llama)
17 |
18 | # Define LLM input and output topics including goal_point topic of type PoseStamped
19 | goto_in = Topic(name="goto_in", msg_type="String")
20 | goal_point = Topic(name="goal_point", msg_type="PoseStamped")
21 | ```
22 |
23 | In order to configure the component to use RAG, we will set the following options in its config.
24 |
25 | ```python
26 | config = LLMConfig(enable_rag=True,
27 | collection_name="map",
28 | distance_func="l2",
29 | n_results=1,
30 | add_metadata=True)
31 | ```
32 |
33 | Note that the _collection_name_ parameter is the same as the map name we set in the previous [example](semantic_map.md). We have also set the _add_metadata_ parameter to true to make sure that the metadata is included in the RAG result, as the spatial coordinates we want to get are part of the metadata. Let us have a quick look at the metadata stored in the map by the MapEncoding component.
34 |
35 | ```
36 | {
37 | "coordinates": [1.1, 2.2, 0.0],
38 | "layer_name": "Topic_Name", # same as topic name that the layer is subscribed to
39 | "timestamp": 1234567,
40 | "temporal_change": True
41 | }
42 | ```
43 |
44 | With this information, we will first initialize our component.
45 | ```{caution}
46 | In the following code block we are using the same DB client that was set up in the previous [example](semantic_map.md).
47 | ```
48 |
49 | ```python
50 | # initialize the component
51 | goto = LLM(
52 | inputs=[goto_in],
53 | outputs=[goal_point],
54 | model_client=llama_client,
55 | db_client=chroma_client, # check the previous example where we setup this database client
56 | trigger=goto_in,
57 | config=config,
58 | component_name='go_to_x'
59 | )
60 | ```
61 |
62 | ## Pre-process the model output before publishing
63 |
64 | Knowing that the output of retrieval will be appended to the beginning of our query as context, we will set up a component-level prompt for our LLM.
65 |
66 | ```python
67 | # set a component prompt
68 | goto.set_component_prompt(
69 | template="""From the given metadata, extract coordinates and provide
70 | the coordinates in the following json format:\n {"position": coordinates}"""
71 | )
72 | ```
73 |
74 | ```{note}
75 | One might notice that we have not used an input topic name in our prompt. This is because we only need the input topic to fetch data from the vector DB during the RAG step. The query to the LLM in this case would only be composed of data fetched from the DB and our prompt.
76 | ```
77 |
78 | As the LLM output will contain text other than the _json_ string that we have asked for, we need to add a pre-processing function to the output topic that extracts the required part of the text and returns the output in a format that can be published to a _PoseStamped_ topic, i.e. a numpy array of floats.
79 |
80 | ```python
81 | from typing import Optional
82 | import json
83 | import numpy as np
84 |
85 | # pre-process the output before publishing to a topic of msg_type PoseStamped
86 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]:
87 | # extract the json part of the output string (including brackets)
88 | # one can use sophisticated regex parsing here but we'll keep it simple
89 | json_string = output[output.find("{") : output.rfind("}") + 1]
90 | # load the string as a json and extract position coordinates
91 | # if there is an error, return None, i.e. no output would be published to goal_point
92 | try:
93 | json_dict = json.loads(json_string)
94 | coordinates = np.fromstring(json_dict["position"], sep=',', dtype=np.float64)
95 | print('Coordinates Extracted:', coordinates)
96 | if coordinates.shape[0] < 2 or coordinates.shape[0] > 3:
97 | return
98 | elif coordinates.shape[0] == 2: # sometimes LLMs avoid adding the zeros of z-dimension
99 | coordinates = np.append(coordinates, 0)
100 | return coordinates
101 | except Exception:
102 | return
103 |
104 | # add the pre-processing function to the goal_point output topic
105 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point)
106 | ```
107 |
108 | ## Launching the Components
109 |
110 | And we will launch our Go-to-X component.
111 |
112 | ```python
113 | from agents.ros import Launcher
114 |
115 | # Launch the component
116 | launcher = Launcher()
117 | launcher.add_pkg(
118 | components=[goto]
119 | )
120 | launcher.bringup()
121 | ```
122 |
123 | And that is all. Our Go-to-X component is ready. The complete code for this example is given below:
124 |
125 | ```{code-block} python
126 | :caption: Go-to-X Component
127 | :linenos:
128 | from typing import Optional
129 | import json
130 | import numpy as np
131 | from agents.components import LLM
132 | from agents.models import Llama3_1
133 | from agents.vectordbs import ChromaDB
134 | from agents.config import LLMConfig
135 | from agents.clients.roboml import HTTPDBClient
136 | from agents.clients.ollama import OllamaClient
137 | from agents.ros import Launcher, Topic
138 |
139 | # Start a Llama3.1 based llm component using ollama client
140 | llama = Llama3_1(name="llama")
141 | llama_client = OllamaClient(llama)
142 |
143 | # Initialize the vector DB (the one storing our semantic map) and its client
144 | chroma = ChromaDB(name="MainDB")
145 | chroma_client = HTTPDBClient(db=chroma)
146 |
147 | # Define LLM input and output topics including goal_point topic of type PoseStamped
148 | goto_in = Topic(name="goto_in", msg_type="String")
149 | goal_point = Topic(name="goal_point", msg_type="PoseStamped")
150 |
151 | config = LLMConfig(enable_rag=True,
152 | collection_name="map",
153 | distance_func="l2",
154 | n_results=1,
155 | add_metadata=True)
156 |
157 | # initialize the component
158 | goto = LLM(
159 | inputs=[goto_in],
160 | outputs=[goal_point],
161 | model_client=llama_client,
162 | db_client=chroma_client, # check the previous example where we setup this database client
163 | trigger=goto_in,
164 | config=config,
165 | component_name='go_to_x'
166 | )
167 |
168 | # set a component prompt
169 | goto.set_component_prompt(
170 | template="""From the given metadata, extract coordinates and provide
171 | the coordinates in the following json format:\n {"position": coordinates}"""
172 | )
173 |
174 |
175 | # pre-process the output before publishing to a topic of msg_type PoseStamped
176 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]:
177 | # extract the json part of the output string (including brackets)
178 | # one can use sophisticated regex parsing here but we'll keep it simple
179 | json_string = output[output.find("{") : output.rfind("}") + 1]
180 | # load the string as a json and extract position coordinates
181 | # if there is an error, return None, i.e. no output would be published to goal_point
182 | try:
183 | json_dict = json.loads(json_string)
184 | coordinates = np.fromstring(json_dict["position"], sep=',', dtype=np.float64)
185 | print('Coordinates Extracted:', coordinates)
186 | if coordinates.shape[0] < 2 or coordinates.shape[0] > 3:
187 | return
188 | elif coordinates.shape[0] == 2: # sometimes LLMs avoid adding the zeros of z-dimension
189 | coordinates = np.append(coordinates, 0)
190 | return coordinates
191 | except Exception:
192 | return
193 |
194 |
195 | # add the pre-processing function to the goal_point output topic
196 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point)
197 |
198 | # Launch the component
199 | launcher = Launcher()
200 | launcher.add_pkg(
201 | components=[goto]
202 | )
203 | launcher.bringup()
204 | ```
205 |
--------------------------------------------------------------------------------
/docs/examples/index.md:
--------------------------------------------------------------------------------
1 | # Examples ✨
2 |
3 | In this section you will find basic examples of ROS Agents usage in the form of short tutorials. These examples show how ROS Agents' components can be used to create real-world embodied agent capabilities in robots. It is recommended to go through the examples in sequence.
4 |
5 | ```{toctree}
6 | :maxdepth: 1
7 |
8 | conversational
9 | prompt_engineering
10 | semantic_map
11 | goto
12 | tool_calling
13 | semantic_router
14 | complete
15 | multiprocessing
16 | ```
17 |
--------------------------------------------------------------------------------
/docs/examples/prompt_engineering.md:
--------------------------------------------------------------------------------
1 | # Prompt engineering for LLMs/MLLMs using vision models
2 |
3 | In this example we will use the output of an object detection component to enrich the prompt of an MLLM component. Let us start by importing the components.
4 | ```python
5 | from agents.components import Vision, MLLM
6 | ```
7 |
8 | ## Setting up the Object Detection Component
9 | For object detection and tracking, ROS Agents provides a unified Vision component. This component takes as input an image topic published by a camera device onboard our robot. The output of this component can be a _detections_ topic in case of object detection or a _trackings_ topic in case of object tracking. In this example we will use a _detections_ topic.
10 |
11 | ```python
12 | from agents.ros import Topic
13 |
14 | # Define the image input topic
15 | image0 = Topic(name="image_raw", msg_type="Image")
16 | # Create a detection topic
17 | detections_topic = Topic(name="detections", msg_type="Detections")
18 | ```
19 | Additionally, the component requires a model client with an object detection model. We will use the RESP client for RoboML along with VisionModel, a convenient model class made available in ROS Agents for initializing any of the vision models available in the open-source [mmdetection](https://github.com/open-mmlab/mmdetection) library. We will select the model we want to use by setting the checkpoint attribute.
20 |
21 | ```{note}
22 | Learn about setting up RoboML with vision [here](https://github.com/automatika-robotics/roboml/blob/main/README.md#for-vision-models-support).
23 | ```
24 | ```{seealso}
25 | Checkout all available mmdetection models and their benchmarking results in the [mmdetection model zoo](https://github.com/open-mmlab/mmdetection?tab=readme-ov-file#overview-of-benchmark-and-model-zoo).
26 | ```
27 |
28 | ```python
29 | from agents.models import VisionModel
30 | from agents.clients.roboml import RESPModelClient, HTTPModelClient
31 | from agents.config import VisionConfig
32 |
33 | # Add an object detection model
34 | object_detection = VisionModel(name="object_detection",
35 | checkpoint="dino-4scale_r50_8xb2-12e_coco")
36 | roboml_detection = RESPModelClient(object_detection)
37 |
38 | # Initialize the Vision component
39 | detection_config = VisionConfig(threshold=0.5)
40 | vision = Vision(
41 | inputs=[image0],
42 | outputs=[detections_topic],
43 | trigger=image0,
44 | config=detection_config,
45 | model_client=roboml_detection,
46 | component_name="detection_component",
47 | )
48 | ```
49 |
50 | ```{tip}
51 | Notice that we passed an optional config to the component. Component configs can be used to set up various parameters in the component. If the component calls an ML model, then the inference parameters for that model can be set in the component config.
52 | ```
53 |
54 | ## Setting up the MLLM Component
55 |
56 | For the MLLM component, we will provide an additional text input topic, which will listen to our queries. The output of the component will be another text topic. We will use the RoboML HTTP client with the multimodal LLM Idefics2 by the good folks at HuggingFace for this example.
57 |
58 | ```python
59 | from agents.models import Idefics2
60 |
61 | # Define MLLM input and output text topics
62 | text_query = Topic(name="text0", msg_type="String")
63 | text_answer = Topic(name="text1", msg_type="String")
64 |
65 | # Define a model client (working with roboml in this case)
66 | idefics = Idefics2(name="idefics_model")
67 | idefics_client = HTTPModelClient(idefics)
68 |
69 | # Define an MLLM component
70 | # We can pass in the detections topic, which we defined previously, directly as an optional input
71 | # to the MLLM component in addition to its other required inputs
72 | mllm = MLLM(
73 | inputs=[text_query, image0, detections_topic],
74 | outputs=[text_answer],
75 | model_client=idefics_client,
76 | trigger=text_query,
77 | component_name="mllm_component"
78 | )
79 | ```
80 | Next we will set up a component-level prompt to ensure that our text query and the output of the detections topic are sent to the model as we intend. We will do this by passing a jinja2 template to the **set_component_prompt** function.
81 | ```python
82 | mllm.set_component_prompt(
83 | template="""Imagine you are a robot.
84 | This image has following items: {{ detections }}.
85 | Answer the following about this image: {{ text0 }}"""
86 | )
87 | ```
88 | ```{caution}
89 | The names of the topics used in the jinja2 template are the same as the name parameters set when creating the Topic objects.
90 | ```
91 |
92 | ## Launching the Components
93 |
94 | Finally we will launch our components as we did in the previous example.
95 |
96 | ```python
97 | from agents.ros import Launcher
98 |
99 | # Launch the components
100 | launcher = Launcher()
101 | launcher.add_pkg(
102 | components=[vision, mllm]
103 | )
104 | launcher.bringup()
105 | ```
106 |
107 | And there we have it. Complete code of this example is provided below.
108 |
109 | ```{code-block} python
110 | :caption: Prompt Engineering with Object Detection
111 | :linenos:
112 | from agents.components import Vision, MLLM
113 | from agents.models import VisionModel, Idefics2
114 | from agents.clients.roboml import RESPModelClient, HTTPModelClient
115 | from agents.config import VisionConfig
116 | from agents.ros import Topic, Launcher
117 |
118 | image0 = Topic(name="image_raw", msg_type="Image")
119 | detections_topic = Topic(name="detections", msg_type="Detections")
120 |
121 | object_detection = VisionModel(name="object_detection",
122 | checkpoint="dino-4scale_r50_8xb2-12e_coco")
123 | roboml_detection = RESPModelClient(object_detection)
124 |
125 | detection_config = VisionConfig(threshold=0.5)
126 | vision = Vision(
127 | inputs=[image0],
128 | outputs=[detections_topic],
129 | trigger=image0,
130 | config=detection_config,
131 | model_client=roboml_detection,
132 | component_name="detection_component",
133 | )
134 |
135 | text_query = Topic(name="text0", msg_type="String")
136 | text_answer = Topic(name="text1", msg_type="String")
137 |
138 | idefics = Idefics2(name="idefics_model")
139 | idefics_client = HTTPModelClient(idefics)
140 |
141 | mllm = MLLM(
142 | inputs=[text_query, image0, detections_topic],
143 | outputs=[text_answer],
144 | model_client=idefics_client,
145 | trigger=text_query,
146 | component_name="mllm_component"
147 | )
148 |
149 | mllm.set_component_prompt(
150 | template="""Imagine you are a robot.
151 | This image has following items: {{ detections }}.
152 | Answer the following about this image: {{ text0 }}"""
153 | )
154 | launcher = Launcher()
155 | launcher.add_pkg(
156 | components=[vision, mllm]
157 | )
158 | launcher.bringup()
159 | ```
160 |
--------------------------------------------------------------------------------
/docs/examples/semantic_router.md:
--------------------------------------------------------------------------------
1 | # Create a semantic router to route text queries between different components
2 |
3 | While semantic routing can be implemented with an LLM component, ROS Agents also provides a convenient SemanticRouter component that works directly with text encoding distances and can be utilized with a vector DB.
4 |
5 | In this example we will use the SemanticRouter component to route text queries between two components: a general-purpose LLM and the Go-to-X component that we built in the previous [example](goto.md). Let's start by setting up our components.
6 |
7 | ## Setting up the components
8 |
9 | In the following code snippet we will setup our two components.
10 |
11 | ```python
12 | from agents.components import LLM
13 | from agents.clients.ollama import OllamaClient
14 | from agents.clients.roboml import HTTPModelClient
15 | from agents.models import Idefics2, Llama3_1
16 | from agents.config import LLMConfig
17 | from agents.ros import Topic
18 |
19 | # Create a llama3.1 client using Ollama
20 | llama = Llama3_1(name="llama")
21 | llama_client = OllamaClient(llama)
22 |
23 | # Make a generic LLM component using the Llama3_1 model
24 | llm_in = Topic(name="llm_in", msg_type="String")
25 | llm_out = Topic(name="llm_out", msg_type="String")
26 |
27 | llm = LLM(
28 | inputs=[llm_in],
29 | outputs=[llm_out],
30 | model_client=llama_client,
31 | trigger=[llm_in],
32 | )
33 |
34 | # Make a Go-to-X component using the same Llama3_1 model
35 | goto_in = Topic(name="goto_in", msg_type="String")
36 | goal_point = Topic(name="goal_point", msg_type="PoseStamped")
37 |
38 | config = LLMConfig(enable_rag=True,
39 | collection_name="map",
40 | distance_func="l2",
41 | n_results=1,
42 | add_metadata=True)
43 |
44 | goto = LLM(
45 | inputs=[goto_in],
46 | outputs=[goal_point],
47 | model_client=llama_client,
48 | db_client=chroma_client,
49 | trigger=goto_in,
50 | config=config,
51 | component_name='go_to_x'
52 | )
53 |
54 | # set a component prompt
55 | goto.set_component_prompt(
56 | template="""From the given metadata, extract coordinates and provide
57 | the coordinates in the following json format:\n {"position": coordinates}"""
58 | )
59 |
60 | # pre-process the output before publishing to a topic of msg_type PoseStamped
61 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]:
62 | # extract the json part of the output string (including brackets)
63 | # one can use sophisticated regex parsing here but we'll keep it simple
64 | json_string = output[output.find("{"):output.find("}") + 1]
65 |
66 | # load the string as a json and extract position coordinates
67 | # if there is an error, return None, i.e. no output would be published to goal_point
68 | try:
69 | json_dict = json.loads(json_string)
70 | return np.array(json_dict['position'])
71 | except Exception:
72 | return
73 |
74 | # add the pre-processing function to the goal_point output topic
75 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point)
76 | ```
77 |
78 | ```{note}
79 | Note that we have reused the same model and its client for both components.
80 | ```
81 |
82 | ```{note}
83 | For a detailed explanation of the code for setting up the Go-to-X component, check the previous [example](goto.md).
84 | ```
85 |
86 | ```{caution}
87 | In the code block above we are using the same DB client that was set up in this [example](semantic_map.md).
88 | ```
89 |
90 | ## Creating the SemanticRouter
91 |
92 | The SemanticRouter takes an input _String_ topic and sends whatever is published on that topic to a _Route_. A _Route_ is a thin wrapper around _Topic_ that takes in the topic to publish on and example queries that would match a potential query meant for that topic. For example, if we ask our robot a general question, like "What's the capital of France?", we do not want that question to be routed to a Go-to-X component but to a generic LLM; thus, in the LLM's route, we would provide examples of general questions. The SemanticRouter component works by storing these examples in a vector DB. The distance between an incoming query's embedding and the embeddings of the example queries determines which _Route_ (_Topic_) the query should be sent on. Let's start by creating routes for the input topics of the two components above.
93 |
94 | ```python
95 | from agents.ros import Route
96 |
97 | # Create the input topic for the router
98 | query_topic = Topic(name="question", msg_type="String")
99 |
100 | # Define a route to a topic that processes go-to-x commands
101 | goto_route = Route(routes_to=goto_in,
102 | samples=["Go to the door", "Go to the kitchen",
103 | "Get me a glass", "Fetch a ball", "Go to hallway"])
104 |
105 | # Define a route to a topic that is input to an LLM component
106 | llm_route = Route(routes_to=llm_in,
107 | samples=["What is the capital of France?", "Is there life on Mars?",
108 | "How many tablespoons in a cup?", "How are you today?", "Whats up?"])
109 | ```
110 |
111 | For the database client we will use the ChromaDB client set up in [this example](semantic_map.md). We will specify a router name in our router config, which will act as a _collection_name_ in the database.
112 |
113 | ```python
114 | from agents.components import SemanticRouter
115 | from agents.config import SemanticRouterConfig
116 |
117 | router_config = SemanticRouterConfig(router_name="go-to-router", distance_func="l2")
118 | # Initialize the router component
119 | router = SemanticRouter(
120 | inputs=[query_topic],
121 | routes=[llm_route, goto_route],
122 | default_route=llm_route, # If none of the routes fall within a distance threshold
123 | config=router_config,
124 | db_client=chroma_client, # reusing the db_client from the previous example
125 | component_name="router"
126 | )
127 | ```
128 |
129 | And that is it. Whenever something is published on the input topic **question**, it will be routed either to the Go-to-X component or to the LLM component. We can now expose this topic to our command interface. The complete code for setting up the router is given below:
130 |
131 | ```{code-block} python
132 | :caption: Semantic Routing
133 | :linenos:
134 | from typing import Optional
135 | import json
136 | import numpy as np
137 | from agents.components import LLM, SemanticRouter
138 | from agents.models import Llama3_1
139 | from agents.vectordbs import ChromaDB
140 | from agents.config import LLMConfig, SemanticRouterConfig
141 | from agents.clients.roboml import HTTPDBClient
142 | from agents.clients.ollama import OllamaClient
143 | from agents.ros import Launcher, Topic, Route
144 |
145 |
146 | # Start a Llama3.1 based llm component using ollama client
147 | llama = Llama3_1(name="llama")
148 | llama_client = OllamaClient(llama)
149 |
150 | # Initialize a vector DB that will store our routes
151 | chroma = ChromaDB(name="MainDB")
152 | chroma_client = HTTPDBClient(db=chroma)
153 |
154 |
155 | # Make a generic LLM component using the Llama3_1 model
156 | llm_in = Topic(name="llm_in", msg_type="String")
157 | llm_out = Topic(name="llm_out", msg_type="String")
158 |
159 | llm = LLM(
160 | inputs=[llm_in],
161 | outputs=[llm_out],
162 | model_client=llama_client,
163 | trigger=llm_in
164 | )
165 |
166 |
167 | # Define LLM input and output topics including goal_point topic of type PoseStamped
168 | goto_in = Topic(name="goto_in", msg_type="String")
169 | goal_point = Topic(name="goal_point", msg_type="PoseStamped")
170 |
171 | config = LLMConfig(enable_rag=True,
172 | collection_name="map",
173 | distance_func="l2",
174 | n_results=1,
175 | add_metadata=True)
176 |
177 | # initialize the component
178 | goto = LLM(
179 | inputs=[goto_in],
180 | outputs=[goal_point],
181 | model_client=llama_client,
182 | db_client=chroma_client, # check the previous example where we setup this database client
183 | trigger=goto_in,
184 | config=config,
185 | component_name='go_to_x'
186 | )
187 |
188 | # set a component prompt
189 | goto.set_component_prompt(
190 | template="""From the given metadata, extract coordinates and provide
191 | the coordinates in the following json format:\n {"position": coordinates}"""
192 | )
193 |
194 |
195 | # pre-process the output before publishing to a topic of msg_type PoseStamped
196 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]:
197 | # extract the json part of the output string (including brackets)
198 | # one can use sophisticated regex parsing here but we'll keep it simple
199 | json_string = output[output.find("{"):output.find("}") + 1]
200 |
201 | # load the string as a json and extract position coordinates
202 | # if there is an error, return None, i.e. no output would be published to goal_point
203 | try:
204 | json_dict = json.loads(json_string)
205 | return np.array(json_dict['position'])
206 | except Exception:
207 | return
208 |
209 |
210 | # add the pre-processing function to the goal_point output topic
211 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point)
212 |
213 | # Create the input topic for the router
214 | query_topic = Topic(name="question", msg_type="String")
215 |
216 | # Define a route to a topic that processes go-to-x commands
217 | goto_route = Route(routes_to=goto_in,
218 | samples=["Go to the door", "Go to the kitchen",
219 | "Get me a glass", "Fetch a ball", "Go to hallway"])
220 |
221 | # Define a route to a topic that is input to an LLM component
222 | llm_route = Route(routes_to=llm_in,
223 | samples=["What is the capital of France?", "Is there life on Mars?",
224 | "How many tablespoons in a cup?", "How are you today?", "Whats up?"])
225 |
226 | router_config = SemanticRouterConfig(router_name="go-to-router", distance_func="l2")
227 | # Initialize the router component
228 | router = SemanticRouter(
229 | inputs=[query_topic],
230 | routes=[llm_route, goto_route],
231 | default_route=llm_route, # If none of the routes fall within a distance threshold
232 | config=router_config,
233 | db_client=chroma_client, # reusing the db_client from the previous example
234 | component_name="router",
235 | )
236 |
237 | # Launch the components
238 | launcher = Launcher()
239 | launcher.add_pkg(
240 | components=[llm, goto, router]
241 | )
242 | launcher.bringup()
243 | ```
244 |
--------------------------------------------------------------------------------
/docs/examples/tool_calling.md:
--------------------------------------------------------------------------------
1 | # Use Tool Calling in Go-to-X
2 |
3 | In the previous [example](goto.md) we created a Go-to-X component using basic text manipulation on LLM output. However, for models that have been specifically trained for tool calling, one can get better results for structured outputs by invoking tool calling. At the same time, tool calling can be useful for generating responses that require intermediate use of tools by the LLM before providing a final answer. In this example we will utilize tool calling for the former purpose, getting a better-structured output from the LLM, by reimplementing the Go-to-X component.
4 |
5 | ## Register a tool (function) to be called by the LLM
6 | To utilize tool calling we will change our strategy of pre-processing the LLM's text output and instead ask the LLM to provide structured input to a function (tool). The output of this function will then be published to the output topic. Let's see what this looks like in the following code snippets.
7 |
8 | First we will modify the component level prompt for our LLM.
9 |
10 | ```python
11 | # set a component prompt
12 | goto.set_component_prompt(
13 | template="""What are the position coordinates in the given metadata?"""
14 | )
15 | ```
16 | Next we will replace our pre-processing function with a much simpler function that takes in a list and returns a numpy array. The LLM will be expected to call this function with the appropriate arguments. This strategy generally works better than getting text output from the LLM and trying to parse it with an arbitrary function. To register the function as a tool, we also need to provide its description in a format that is explanatory for the LLM. This format is specified by the _Ollama_ client.
17 |
18 | ```{caution}
19 | Tool calling is currently available only when components utilize the OllamaClient.
20 | ```
21 | ```{seealso}
22 | To see a list of models that work for tool calling using the OllamaClient, check [here](https://ollama.com/search?c=tools)
23 | ```
24 | ```python
25 | # pre-process the output before publishing to a topic of msg_type PoseStamped
26 | def get_coordinates(position: list[float]) -> np.ndarray:
27 | """Get position coordinates"""
28 | return np.array(position, dtype=float)
29 |
30 |
31 | function_description = {
32 | "type": "function",
33 | "function": {
34 | "name": "get_coordinates",
35 | "description": "Get position coordinates",
36 | "parameters": {
37 | "type": "object",
38 | "properties": {
39 | "position": {
40 | "type": "list[float]",
41 | "description": "The position coordinates in x, y and z",
42 | }
43 | },
44 | },
45 | "required": ["position"],
46 | },
47 | }
48 |
49 | # add the pre-processing function to the goal_point output topic
50 | goto.register_tool(
51 | tool=get_coordinates,
52 | tool_description=function_description,
53 | send_tool_response_to_model=False,
54 | )
55 | ```
56 | In the code above, the flag _send_tool_response_to_model_ has been set to False. This means that the function output will be sent directly for publication, since our usage of the tool in this example is limited to forcing the model to provide a structured output. If this flag were set to True, the output of the tool (function) would be sent back to the model to produce the final output, which would then be published. This latter usage is employed when a tool like a calculator, browser or code interpreter is provided to the model for generating better answers, as sketched below.
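
As a sketch of that latter pattern, a tool whose result is fed back to the model could be registered as follows. The `add_numbers` function and its description are hypothetical examples (not part of the Go-to-X component above); the `register_tool` call mirrors the one used earlier and assumes an LLM component using the OllamaClient already exists as `llm`:

```python
# Hypothetical tool: with send_tool_response_to_model=True, its return value
# is sent back to the model, and the model's final answer is what gets published
def add_numbers(a: float, b: float) -> float:
    """Add two numbers"""
    return a + b


adder_description = {
    "type": "function",
    "function": {
        "name": "add_numbers",
        "description": "Add two numbers",
        "parameters": {
            "type": "object",
            "properties": {
                "a": {"type": "number", "description": "First number"},
                "b": {"type": "number", "description": "Second number"},
            },
        },
        "required": ["a", "b"],
    },
}

llm.register_tool(
    tool=add_numbers,
    tool_description=adder_description,
    send_tool_response_to_model=True,
)
```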
57 |
58 | ## Launching the Components
59 |
60 | And as before, we will launch our Go-to-X component.
61 |
62 | ```python
63 | from agents.ros import Launcher
64 |
65 | # Launch the component
66 | launcher = Launcher()
67 | launcher.add_pkg(components=[goto])
68 | launcher.bringup()
69 | ```
70 |
71 | The complete code for this example is given below:
72 |
73 | ```{code-block} python
74 | :caption: Go-to-X Component
75 | :linenos:
76 | import numpy as np
77 | from agents.components import LLM
78 | from agents.models import Llama3_1
79 | from agents.vectordbs import ChromaDB
80 | from agents.config import LLMConfig
81 | from agents.clients.roboml import HTTPDBClient
82 | from agents.clients.ollama import OllamaClient
83 | from agents.ros import Launcher, Topic
84 |
85 | # Start a Llama3.1 based llm component using ollama client
86 | llama = Llama3_1(name="llama")
87 | llama_client = OllamaClient(llama)
88 |
89 | # Initialize the vector DB (the one storing our semantic map) and its client
90 | chroma = ChromaDB(name="MainDB")
91 | chroma_client = HTTPDBClient(db=chroma)
92 |
93 | # Define LLM input and output topics including goal_point topic of type PoseStamped
94 | goto_in = Topic(name="goto_in", msg_type="String")
95 | goal_point = Topic(name="goal_point", msg_type="PoseStamped")
96 |
97 | config = LLMConfig(
98 | enable_rag=True,
99 | collection_name="map",
100 | distance_func="l2",
101 | n_results=1,
102 | add_metadata=True,
103 | )
104 |
105 | # initialize the component
106 | goto = LLM(
107 | inputs=[goto_in],
108 | outputs=[goal_point],
109 | model_client=llama_client,
110 | db_client=chroma_client, # check the previous example where we setup this database client
111 | trigger=goto_in,
112 | config=config,
113 | component_name="go_to_x",
114 | )
115 |
116 | # set a component prompt
117 | goto.set_component_prompt(
118 | template="""What are the position coordinates in the given metadata?"""
119 | )
120 |
121 |
122 | # tool function: its output will be published to the goal_point topic (msg_type PoseStamped)
123 | def get_coordinates(position: list[float]) -> np.ndarray:
124 | """Get position coordinates"""
125 | return np.array(position, dtype=float)
126 |
127 |
128 | function_description = {
129 | "type": "function",
130 | "function": {
131 | "name": "get_coordinates",
132 | "description": "Get position coordinates",
133 | "parameters": {
134 | "type": "object",
135 | "properties": {
136 | "position": {
137 | "type": "list[float]",
138 | "description": "The position coordinates in x, y and z",
139 | }
140 | },
141 |             "required": ["position"],
142 |         },
143 | },
144 | }
145 |
146 | # register the function as a tool along with its description
147 | goto.register_tool(
148 | tool=get_coordinates,
149 | tool_description=function_description,
150 | send_tool_response_to_model=False,
151 | )
152 |
153 | # Launch the component
154 | launcher = Launcher()
155 | launcher.add_pkg(components=[goto])
156 | launcher.bringup()
157 | ```
158 |
--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: ROS Agents Documentation
3 | ---
4 |
5 |
6 | ```{include} intro.md
7 | ```
8 |
9 | ## Table of Contents
10 |
11 | ```{toctree}
12 | :maxdepth: 2
13 |
14 | intro
15 | installation
16 | quickstart
17 | basics
18 | examples/index
19 | apidocs/index
20 | ```
21 |
--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
1 | # Installation 🛠️
2 |
3 | ## Pre-Requisites
4 |
5 | ### Install ROS
6 |
7 | ROS Agents is built to be used with ROS2. All ROS distributions starting from _Iron_ are supported. Install ROS2 by following the instructions on the [official site](https://docs.ros.org/en/iron/Installation.html).
8 |
9 | ### Install a model serving platform
10 |
11 | The core of ROS Agents is agnostic to model serving platforms. It currently supports [Ollama](https://ollama.com) and [RoboML](https://github.com/automatika-robotics/RoboML). Please install either of these by following the instructions provided by the respective projects. Support for new platforms will be added continuously. If you would like a particular platform to be supported, please open an issue/PR.
12 |
13 | ```{tip}
14 | For utilizing larger models, it is recommended that model serving platforms are installed not directly on the robot (or edge device) but on a GPU-powered machine on the local network (or on a cloud provider).
15 | ```
16 |
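As a rough sketch of this setup (the client API is covered in the [quickstart](quickstart.md)), a component running on the robot would simply point its model client at the remote machine; the host and port below are placeholder values:

```python
from agents.clients.ollama import OllamaClient
from agents.models import Llava

# Llava model served by Ollama on a GPU machine in the local network
llava = Llava(name="llava")
# placeholder address of the machine running Ollama
llava_client = OllamaClient(llava, host="192.168.1.100", port=11434)
```
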
17 | ## Install ROS Agents (Ubuntu)
18 |
19 | **Binary packages for Ubuntu will be released soon. Check this space.**
20 |
21 | ## Install ROS Agents from source
22 |
23 | Create your ROS workspace.
24 | ```shell
25 | mkdir -p agents_ws/src
26 | cd agents_ws/src
27 | ```
28 | ### Get Dependencies
29 |
30 | Install python dependencies
31 | ```shell
32 | pip install numpy opencv-python-headless 'attrs>=23.2.0' jinja2 httpx setproctitle msgpack msgpack-numpy numpy-quaternion platformdirs
33 | ```
34 |
35 | Download ROS Sugar.
36 | ```shell
37 | git clone https://github.com/automatika-robotics/ros-sugar
38 | ```
39 | ### Install ROS Agents
40 | ```shell
41 | git clone https://github.com/automatika-robotics/ros-agents.git
42 | cd ..
43 | colcon build
44 | source install/setup.bash
45 | python your_script.py
46 | ```
47 |
--------------------------------------------------------------------------------
/docs/intro.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | # ROS Agents 🤖
4 |
5 | ROS Agents is a fully-loaded framework for creating interactive embodied agents that can understand, remember, and act upon contextual information from their environment.
6 |
7 | - **Agents in the real world:** Designed to be used with autonomous robot systems that operate in dynamic environments, specifically AMRs.
8 | - **Intuitive API**: Simple pythonic API to utilize local or cloud based ML models (specifically **Multimodal LLMs** and other **Transformer Architectures**) on robots.
9 | - **Semantic Memory**: Integrates vector databases, semantic routing and other supporting components to quickly build arbitrarily complex graphs for agentic information flow. No need to utilize bloated "GenAI" frameworks on your robot.
10 | - **Made in ROS2**: Utilizes ROS2 as the underlying distributed communications backbone. Theoretically, all devices that provide a ROS2 package can be utilized to send data to ML models, as long as the datatype callback has been implemented.
11 |
12 | Check out the [Installation Instructions](installation.md) 🛠️
13 |
14 | Get started with the [Quickstart Guide](quickstart.md) 🚀
15 |
16 | Get familiar with [Basic Concepts](basics.md) 📚
17 |
18 | Dive right in with [Examples](examples/index.md) ✨
19 |
20 | ## Contributions
21 |
22 | ROS Agents has been developed in collaboration between [Automatika Robotics](https://automatikarobotics.com/) and [Inria](https://inria.fr/). Contributions from the community are most welcome.
23 |
--------------------------------------------------------------------------------
/docs/quickstart.md:
--------------------------------------------------------------------------------
1 | # Quick Start 🚀
2 |
3 | Unlike other ROS packages, ROS Agents provides a purely pythonic way of describing the node graph using [ROS Sugar](https://automatika-robotics.github.io/ros-sugar/). Copy the following code into a python script and run it.
4 |
5 | ```python
6 | from agents.clients.ollama import OllamaClient
7 | from agents.components import MLLM
8 | from agents.models import Llava
9 | from agents.ros import Topic, Launcher
10 |
11 | # Define input and output topics (pay attention to msg_type)
12 | text0 = Topic(name="text0", msg_type="String")
13 | image0 = Topic(name="image_raw", msg_type="Image")
14 | text1 = Topic(name="text1", msg_type="String")
15 |
16 | # Define a model client (working with Ollama in this case)
17 | llava = Llava(name="llava")
18 | llava_client = OllamaClient(llava)
19 |
20 | # Define an MLLM component (A component represents a node with a particular functionality)
21 | mllm = MLLM(
22 | inputs=[text0, image0],
23 | outputs=[text1],
24 | model_client=llava_client,
25 | trigger=[text0],
26 | component_name="vqa"
27 | )
28 | # Additional prompt settings
29 | mllm.set_topic_prompt(text0, template="""You are an amazing and funny robot.
30 | Answer the following about this image: {{ text0 }}"""
31 | )
32 | # Launch the component
33 | launcher = Launcher()
34 | launcher.add_pkg(components=[mllm])
35 | launcher.bringup()
36 | ```
37 |
38 | Now let us see step-by-step what we have done in this code. First we defined inputs and outputs to our component in the form of ROS Topics. Components automatically create listeners for input topics and publishers for output topics.
39 |
40 | ```python
41 | # Define input and output topics (pay attention to msg_type)
42 | text0 = Topic(name="text0", msg_type="String")
43 | image0 = Topic(name="image_raw", msg_type="Image")
44 | text1 = Topic(name="text1", msg_type="String")
45 | ```
46 |
47 | ````{important}
48 | If you are running ROS Agents on a robot, make sure you change the topic name in the following line to the topic on which your robot's camera publishes RGB images.
49 |
50 | ```python
51 | image0 = Topic(name="NAME_OF_THE_TOPIC", msg_type="Image")
52 | ````
53 |
54 | ```{note}
55 | If you are running ROS Agents on a testing machine that has a webcam, you can install the [**ROS2 USB Cam**](https://github.com/klintan/ros2_usb_camera) driver. Make sure you use the correct name of the image topic as above.
56 | ```
57 |
58 | Then we will create a multimodal LLM component. Components are functional units in ROS Agents. To learn more about them, check out [Basic Concepts](basics.md). In addition to input/output topics, the MLLM component expects a model client. So first we will create a model client that can utilize a [Llava](https://ollama.com/library/llava) model with [Ollama](https://ollama.com) as the model serving platform.
59 |
60 | ```python
61 | # Define a model client (working with Ollama in this case)
62 | llava = Llava(name="llava")
63 | llava_client = OllamaClient(llava)
64 | ```
65 |
66 | ````{important}
67 | If you are not running Ollama on the same machine (robot) on which you are running ROS Agents, you can specify the host and port of the machine running Ollama in this line:
68 | ```python
69 | llava_client = OllamaClient(llava, host="127.0.0.1", port=8000)
70 | ````
71 |
72 | ```{note}
73 | If the use of Ollama as a model serving platform is unclear, check out the [installation instructions](installation.md).
74 | ```
75 |
76 | Now we are ready to set up our component.
77 |
78 | ```python
79 | # Define an MLLM component (A component represents a node with a particular functionality)
80 | mllm = MLLM(
81 | inputs=[text0, image0],
82 | outputs=[text1],
83 | model_client=llava_client,
84 | trigger=[text0],
85 | component_name="vqa"
86 | )
87 | # Additional prompt settings
88 | mllm.set_topic_prompt(text0, template="""You are an amazing and funny robot.
89 | Answer the following about this image: {{ text0 }}"""
90 | )
91 | ```
92 |
93 | Note how the MLLM type of component also allows us to set a topic-level or component-level prompt, where a jinja2 template defines how our input string is embedded.
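A component-level prompt could instead be set in the same way. Here is a minimal sketch (reusing the `mllm` component defined above):

```python
# Set a prompt for the whole component instead of a single topic
mllm.set_component_prompt(
    template="""You are an amazing and funny robot.
Answer the following about this image: {{ text0 }}"""
)
```

Finally, we will launch the component.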
94 |
95 | ```python
96 | # Launch the component
97 | launcher = Launcher()
98 | launcher.add_pkg(components=[mllm])
99 | launcher.bringup()
100 | ```
101 |
102 | Now we can check that our component is running by using familiar ROS2 commands from a new terminal. We should see our component running as a ROS node, and its input and output topics in the topic list.
103 |
104 | ```shell
105 | ros2 node list
106 | ros2 topic list
107 | ```
108 |
109 | In order to interact with our component we can use the tiny web client that is bundled with ROS Agents. We can launch the client by running:
110 |
111 | ```shell
112 | ros2 run automatika_embodied_agents tiny_web_client
113 | ```
114 |
115 | The client displays a web UI on http://localhost:8000. Open this address in a browser. ROS input and output topic settings for the text input and output topics can be configured from the web UI by pressing the settings icon. Send a question to your ROS Agent and you should get the reply generated by the Llava model.
116 |
--------------------------------------------------------------------------------
/examples/complete_agent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import json
3 | from typing import Optional
4 | from agents.components import (
5 | MLLM,
6 | SpeechToText,
7 | TextToSpeech,
8 | LLM,
9 | Vision,
10 | MapEncoding,
11 | SemanticRouter,
12 | )
13 | from agents.config import TextToSpeechConfig
14 | from agents.clients.roboml import HTTPModelClient, RESPModelClient, HTTPDBClient
15 | from agents.clients.ollama import OllamaClient
16 | from agents.models import Whisper, SpeechT5, Llava, Llama3_1, VisionModel
17 | from agents.vectordbs import ChromaDB
18 | from agents.config import VisionConfig, LLMConfig, MapConfig, SemanticRouterConfig
19 | from agents.ros import Topic, Launcher, FixedInput, MapLayer, Route
20 |
21 |
22 | ### Setup our models and vectordb ###
23 | whisper = Whisper(name="whisper")
24 | whisper_client = HTTPModelClient(whisper)
25 | speecht5 = SpeechT5(name="speecht5")
26 | speecht5_client = HTTPModelClient(speecht5)
27 | object_detection_model = VisionModel(
28 | name="dino_4scale", checkpoint="dino-4scale_r50_8xb2-12e_coco"
29 | )
30 | detection_client = RESPModelClient(object_detection_model)
31 | llava = Llava(name="llava")
32 | llava_client = OllamaClient(llava)
33 | llama = Llama3_1(name="llama")
34 | llama_client = OllamaClient(llama)
35 | chroma = ChromaDB(name="MainDB")
36 | chroma_client = HTTPDBClient(db=chroma)
37 |
38 | ### Setup our components ###
39 | # Setup a speech to text component
40 | audio_in = Topic(name="audio0", msg_type="Audio")
41 | query_topic = Topic(name="question", msg_type="String")
42 |
43 | speech_to_text = SpeechToText(
44 | inputs=[audio_in],
45 | outputs=[query_topic],
46 | model_client=whisper_client,
47 | trigger=audio_in,
48 | component_name="speech_to_text",
49 | )
50 |
51 | # Setup a text to speech component
52 | query_answer = Topic(name="answer", msg_type="String")
53 |
54 | t2s_config = TextToSpeechConfig(play_on_device=True)
55 |
56 | text_to_speech = TextToSpeech(
57 | inputs=[query_answer],
58 | trigger=query_answer,
59 | model_client=speecht5_client,
60 | config=t2s_config,
61 | component_name="text_to_speech",
62 | )
63 |
64 | # Setup a vision component for object detection
65 | image0 = Topic(name="image_raw", msg_type="Image")
66 | detections_topic = Topic(name="detections", msg_type="Detections")
67 |
68 | detection_config = VisionConfig(threshold=0.5)
69 | vision = Vision(
70 | inputs=[image0],
71 | outputs=[detections_topic],
72 | trigger=image0,
73 | config=detection_config,
74 | model_client=detection_client,
75 | component_name="object_detection",
76 | )
77 |
78 | # Define a generic mllm component for vqa
79 | mllm_query = Topic(name="mllm_query", msg_type="String")
80 |
81 | mllm = MLLM(
82 | inputs=[mllm_query, image0, detections_topic],
83 | outputs=[query_answer],
84 | model_client=llava_client,
85 | trigger=mllm_query,
86 | component_name="visual_q_and_a",
87 | )
88 |
89 | mllm.set_component_prompt(
90 | template="""Imagine you are a robot.
91 | This image has following items: {{ detections }}.
92 | Answer the following about this image: {{ text0 }}"""
93 | )
94 |
95 | # Define a fixed input mllm component that does introspection
96 | introspection_query = FixedInput(
97 | name="introspection_query",
98 | msg_type="String",
99 | fixed="What kind of a room is this? Is it an office, a bedroom or a kitchen? Give a one word answer, out of the given choices",
100 | )
101 | introspection_answer = Topic(name="introspection_answer", msg_type="String")
102 |
103 | introspector = MLLM(
104 | inputs=[introspection_query, image0],
105 | outputs=[introspection_answer],
106 | model_client=llava_client,
107 | trigger=15.0,
108 | component_name="introspector",
109 | )
110 |
111 |
112 | def introspection_validation(output: str) -> Optional[str]:
113 | for option in ["office", "bedroom", "kitchen"]:
114 | if option in output.lower():
115 | return option
116 |
117 |
118 | introspector.add_publisher_preprocessor(introspection_answer, introspection_validation)
119 |
120 | # Define a semantic map using MapEncoding component
121 | layer1 = MapLayer(subscribes_to=detections_topic, temporal_change=True)
122 | layer2 = MapLayer(subscribes_to=introspection_answer, resolution_multiple=3)
123 |
124 | position = Topic(name="odom", msg_type="Odometry")
125 | map_topic = Topic(name="map", msg_type="OccupancyGrid")
126 |
127 | map_conf = MapConfig(map_name="map")
128 | map = MapEncoding(
129 | layers=[layer1, layer2],
130 | position=position,
131 | map_topic=map_topic,
132 | config=map_conf,
133 | db_client=chroma_client,
134 | trigger=15.0,
135 | component_name="map_encoder",
136 | )
137 |
138 | # Define a generic LLM component
139 | llm_query = Topic(name="llm_query", msg_type="String")
140 |
141 | llm = LLM(
142 | inputs=[llm_query],
143 | outputs=[query_answer],
144 | model_client=llama_client,
145 | trigger=[llm_query],
146 | component_name="general_q_and_a",
147 | )
148 |
149 | # Define a Go-to-X component using LLM
150 | goto_query = Topic(name="goto_query", msg_type="String")
151 | goal_point = Topic(name="goal_point", msg_type="PoseStamped")
152 |
153 | goto_config = LLMConfig(
154 | enable_rag=True,
155 | collection_name="map",
156 | distance_func="l2",
157 | n_results=1,
158 | add_metadata=True,
159 | )
160 |
161 | goto = LLM(
162 | inputs=[goto_query],
163 | outputs=[goal_point],
164 | model_client=llama_client,
165 | config=goto_config,
166 | db_client=chroma_client,
167 | trigger=goto_query,
168 | component_name="go_to_x",
169 | )
170 |
171 | goto.set_component_prompt(
172 | template="""From the given metadata, extract coordinates and provide
173 | the coordinates in the following json format:\n {"position": coordinates}"""
174 | )
175 |
176 |
177 | # pre-process the output before publishing to a topic of msg_type PoseStamped
178 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]:
179 | # extract the json part of the output string (including brackets)
180 | # one can use sophisticated regex parsing here but we'll keep it simple
181 | json_string = output[output.find("{") : output.rfind("}") + 1]
182 | # load the string as a json and extract position coordinates
183 | # if there is an error, return None, i.e. no output would be published to goal_point
184 | try:
185 | json_dict = json.loads(json_string)
186 | coordinates = np.fromstring(json_dict["position"], sep=",", dtype=np.float64)
187 | print("Coordinates Extracted:", coordinates)
188 | if coordinates.shape[0] < 2 or coordinates.shape[0] > 3:
189 | return
190 | elif (
191 | coordinates.shape[0] == 2
192 | ): # sometimes LLMs avoid adding the zeros of z-dimension
193 | coordinates = np.append(coordinates, 0)
194 | return coordinates
195 | except Exception:
196 | return
197 |
198 |
199 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point)
200 |
201 | # Define a semantic router between a generic LLM component, VQA MLLM component and Go-to-X component
202 | goto_route = Route(
203 | routes_to=goto_query,
204 | samples=[
205 | "Go to the door",
206 | "Go to the kitchen",
207 | "Get me a glass",
208 | "Fetch a ball",
209 | "Go to hallway",
210 | ],
211 | )
212 |
213 | llm_route = Route(
214 | routes_to=llm_query,
215 | samples=[
216 | "What is the capital of France?",
217 | "Is there life on Mars?",
218 | "How many tablespoons in a cup?",
219 | "How are you today?",
220 | "Whats up?",
221 | ],
222 | )
223 |
224 | mllm_route = Route(
225 | routes_to=mllm_query,
226 | samples=[
227 | "Are we indoors or outdoors",
228 | "What do you see?",
229 | "Whats in front of you?",
230 | "Where are we",
231 | "Do you see any people?",
232 | "How many things are infront of you?",
233 | "Is this room occupied?",
234 | ],
235 | )
236 |
237 | router_config = SemanticRouterConfig(router_name="go-to-router", distance_func="l2")
238 | # Initialize the router component
239 | router = SemanticRouter(
240 | inputs=[query_topic],
241 | routes=[llm_route, goto_route, mllm_route],
242 | default_route=llm_route,
243 | config=router_config,
244 | db_client=chroma_client,
245 | component_name="router",
246 | )
247 |
248 | # Launch the components
249 | launcher = Launcher()
250 | launcher.add_pkg(
251 | components=[
252 | mllm,
253 | llm,
254 | goto,
255 | introspector,
256 | map,
257 | router,
258 | speech_to_text,
259 | text_to_speech,
260 | vision,
261 | ]
262 | )
263 | launcher.bringup()
264 |
--------------------------------------------------------------------------------
/examples/complete_agent_multiprocessing.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import json
3 | from typing import Optional
4 | from agents.components import (
5 | MLLM,
6 | SpeechToText,
7 | TextToSpeech,
8 | LLM,
9 | Vision,
10 | MapEncoding,
11 | SemanticRouter,
12 | )
13 | from agents.config import TextToSpeechConfig
14 | from agents.clients.roboml import HTTPModelClient, RESPModelClient, HTTPDBClient
15 | from agents.clients.ollama import OllamaClient
16 | from agents.models import Whisper, SpeechT5, Llava, Llama3_1, VisionModel
17 | from agents.vectordbs import ChromaDB
18 | from agents.config import VisionConfig, LLMConfig, MapConfig, SemanticRouterConfig
19 | from agents.ros import Topic, Launcher, FixedInput, MapLayer, Route
20 |
21 |
22 | ### Setup our models and vectordb ###
23 | whisper = Whisper(name="whisper")
24 | whisper_client = HTTPModelClient(whisper)
25 | speecht5 = SpeechT5(name="speecht5")
26 | speecht5_client = HTTPModelClient(speecht5)
27 | object_detection_model = VisionModel(
28 | name="dino_4scale", checkpoint="dino-4scale_r50_8xb2-12e_coco"
29 | )
30 | detection_client = RESPModelClient(object_detection_model)
31 | llava = Llava(name="llava")
32 | llava_client = OllamaClient(llava)
33 | llama = Llama3_1(name="llama")
34 | llama_client = OllamaClient(llama)
35 | chroma = ChromaDB(name="MainDB")
36 | chroma_client = HTTPDBClient(db=chroma)
37 |
38 | ### Setup our components ###
39 | # Setup a speech to text component
40 | audio_in = Topic(name="audio0", msg_type="Audio")
41 | query_topic = Topic(name="question", msg_type="String")
42 |
43 | speech_to_text = SpeechToText(
44 | inputs=[audio_in],
45 | outputs=[query_topic],
46 | model_client=whisper_client,
47 | trigger=audio_in,
48 | component_name="speech_to_text",
49 | )
50 |
51 | # Setup a text to speech component
52 | query_answer = Topic(name="answer", msg_type="String")
53 |
54 | t2s_config = TextToSpeechConfig(play_on_device=True)
55 |
56 | text_to_speech = TextToSpeech(
57 | inputs=[query_answer],
58 | trigger=query_answer,
59 | model_client=speecht5_client,
60 | config=t2s_config,
61 | component_name="text_to_speech",
62 | )
63 |
64 | # Setup a vision component for object detection
65 | image0 = Topic(name="image_raw", msg_type="Image")
66 | detections_topic = Topic(name="detections", msg_type="Detections")
67 |
68 | detection_config = VisionConfig(threshold=0.5)
69 | vision = Vision(
70 | inputs=[image0],
71 | outputs=[detections_topic],
72 | trigger=image0,
73 | config=detection_config,
74 | model_client=detection_client,
75 | component_name="object_detection",
76 | )
77 |
78 | # Define a generic mllm component for vqa
79 | mllm_query = Topic(name="mllm_query", msg_type="String")
80 |
81 | mllm = MLLM(
82 | inputs=[mllm_query, image0, detections_topic],
83 | outputs=[query_answer],
84 | model_client=llava_client,
85 | trigger=mllm_query,
86 | component_name="visual_q_and_a",
87 | )
88 |
89 | mllm.set_component_prompt(
90 | template="""Imagine you are a robot.
91 | This image has following items: {{ detections }}.
92 | Answer the following about this image: {{ text0 }}"""
93 | )
94 |
95 | # Define a fixed input mllm component that does introspection
96 | introspection_query = FixedInput(
97 | name="introspection_query",
98 | msg_type="String",
99 | fixed="What kind of a room is this? Is it an office, a bedroom or a kitchen? Give a one word answer, out of the given choices",
100 | )
101 | introspection_answer = Topic(name="introspection_answer", msg_type="String")
102 |
103 | introspector = MLLM(
104 | inputs=[introspection_query, image0],
105 | outputs=[introspection_answer],
106 | model_client=llava_client,
107 | trigger=15.0,
108 | component_name="introspector",
109 | )
110 |
111 |
112 | def introspection_validation(output: str) -> Optional[str]:
113 | for option in ["office", "bedroom", "kitchen"]:
114 | if option in output.lower():
115 | return option
116 |
117 |
118 | introspector.add_publisher_preprocessor(introspection_answer, introspection_validation)
119 |
120 | # Define a semantic map using MapEncoding component
121 | layer1 = MapLayer(subscribes_to=detections_topic, temporal_change=True)
122 | layer2 = MapLayer(
123 | subscribes_to=introspection_answer,
124 | resolution_multiple=3,
125 | pre_defined=[(np.array([1.1, 2.1, 3.2]), "The door is here. DOOR.")],
126 | )
127 |
128 | position = Topic(name="odom", msg_type="Odometry")
129 | map_topic = Topic(name="map", msg_type="OccupancyGrid")
130 |
131 | map_conf = MapConfig(map_name="map")
132 | map = MapEncoding(
133 | layers=[layer1, layer2],
134 | position=position,
135 | map_topic=map_topic,
136 | config=map_conf,
137 | db_client=chroma_client,
138 | trigger=15.0,
139 | component_name="map_encoder",
140 | )
141 |
142 | # Define a generic LLM component
143 | llm_query = Topic(name="llm_query", msg_type="String")
144 |
145 | llm = LLM(
146 | inputs=[llm_query],
147 | outputs=[query_answer],
148 | model_client=llama_client,
149 | trigger=[llm_query],
150 | component_name="general_q_and_a",
151 | )
152 |
153 | # Define a Go-to-X component using LLM
154 | goto_query = Topic(name="goto_query", msg_type="String")
155 | goal_point = Topic(name="goal_point", msg_type="PoseStamped")
156 |
157 | goto_config = LLMConfig(
158 | enable_rag=True,
159 | collection_name="map",
160 | distance_func="l2",
161 | n_results=1,
162 | add_metadata=True,
163 | )
164 |
165 | goto = LLM(
166 | inputs=[goto_query],
167 | outputs=[goal_point],
168 | model_client=llama_client,
169 | config=goto_config,
170 | db_client=chroma_client,
171 | trigger=goto_query,
172 | component_name="go_to_x",
173 | )
174 |
175 | goto.set_component_prompt(
176 | template="""From the given metadata, extract coordinates and provide
177 | the coordinates in the following json format:\n {"position": coordinates}"""
178 | )
179 |
180 |
181 | # pre-process the output before publishing to a topic of msg_type PoseStamped
182 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]:
183 | # extract the json part of the output string (including brackets)
184 | # one can use sophisticated regex parsing here but we'll keep it simple
185 | json_string = output[output.find("{") : output.rfind("}") + 1]
186 | # load the string as a json and extract position coordinates
187 | # if there is an error, return None, i.e. no output would be published to goal_point
188 | try:
189 | json_dict = json.loads(json_string)
190 | coordinates = np.fromstring(json_dict["position"], sep=",", dtype=np.float64)
191 | print("Coordinates Extracted:", coordinates)
192 | if coordinates.shape[0] < 2 or coordinates.shape[0] > 3:
193 | return
194 | elif (
195 | coordinates.shape[0] == 2
196 | ): # sometimes LLMs avoid adding the zeros of z-dimension
197 | coordinates = np.append(coordinates, 0)
198 | return coordinates
199 | except Exception:
200 | return
201 |
202 |
203 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point)
204 |
205 | # Define a semantic router between a generic LLM component, VQA MLLM component and Go-to-X component
206 | goto_route = Route(
207 | routes_to=goto_query,
208 | samples=[
209 | "Go to the door",
210 | "Go to the kitchen",
211 | "Get me a glass",
212 | "Fetch a ball",
213 | "Go to hallway",
214 | ],
215 | )
216 |
217 | llm_route = Route(
218 | routes_to=llm_query,
219 | samples=[
220 | "What is the capital of France?",
221 | "Is there life on Mars?",
222 | "How many tablespoons in a cup?",
223 | "How are you today?",
224 | "Whats up?",
225 | ],
226 | )
227 |
228 | mllm_route = Route(
229 | routes_to=mllm_query,
230 | samples=[
231 | "Are we indoors or outdoors",
232 | "What do you see?",
233 | "Whats in front of you?",
234 | "Where are we",
235 | "Do you see any people?",
236 | "How many things are infront of you?",
237 | "Is this room occupied?",
238 | ],
239 | )
240 |
241 | router_config = SemanticRouterConfig(router_name="go-to-router", distance_func="l2")
242 | # Initialize the router component
243 | router = SemanticRouter(
244 | inputs=[query_topic],
245 | routes=[llm_route, goto_route, mllm_route],
246 | default_route=llm_route,
247 | config=router_config,
248 | db_client=chroma_client,
249 | component_name="router",
250 | )
251 |
252 | # Launch the components
253 | launcher = Launcher()
254 | launcher.add_pkg(
255 | components=[
256 | mllm,
257 | llm,
258 | goto,
259 | introspector,
260 | map,
261 | router,
262 | speech_to_text,
263 | text_to_speech,
264 | vision,
265 | ],
266 | package_name="automatika_embodied_agents",
267 | multiprocessing=True,
268 | )
269 | launcher.on_fail(action_name="restart")
270 | launcher.fallback_rate = 1 / 10 # 0.1 Hz or 10 seconds
271 | launcher.bringup()
272 |
--------------------------------------------------------------------------------
/examples/conversational_agent_with_audio.py:
--------------------------------------------------------------------------------
1 | from agents.components import MLLM, SpeechToText, TextToSpeech
2 | from agents.config import SpeechToTextConfig, TextToSpeechConfig
3 | from agents.clients.roboml import HTTPModelClient
4 | from agents.clients.ollama import OllamaClient
5 | from agents.models import Whisper, SpeechT5, Llava
6 | from agents.ros import Topic, Launcher
7 |
8 | audio_in = Topic(name="audio0", msg_type="Audio")
9 | text_query = Topic(name="text0", msg_type="String")
10 |
11 | whisper = Whisper(name="whisper") # Custom model init params can be provided here
12 | roboml_whisper = HTTPModelClient(whisper)
13 |
14 | s2t_config = SpeechToTextConfig(
15 | enable_vad=True, # option to listen for speech through the microphone
16 | enable_wakeword=True, # option to invoke the component with a wakeword like 'hey jarvis'
17 | )
18 | speech_to_text = SpeechToText(
19 | inputs=[audio_in],
20 | outputs=[text_query],
21 | model_client=roboml_whisper,
22 | trigger=audio_in,
23 | config=s2t_config,
24 | component_name="speech_to_text",
25 | )
26 |
27 | image0 = Topic(name="image_raw", msg_type="Image")
28 | text_answer = Topic(name="text1", msg_type="String")
29 |
30 | llava = Llava(name="llava")
31 | llava_client = OllamaClient(llava)
32 |
33 | mllm = MLLM(
34 | inputs=[text_query, image0],
35 | outputs=[text_answer],
36 | model_client=llava_client,
37 | trigger=text_query,
38 | component_name="vqa",
39 | )
40 |
41 | # config for playing audio on device
42 | t2s_config = TextToSpeechConfig(play_on_device=True)
43 |
44 | speecht5 = SpeechT5(name="speecht5")
45 | roboml_speecht5 = HTTPModelClient(speecht5)
46 | text_to_speech = TextToSpeech(
47 | inputs=[text_answer],
48 | trigger=text_answer,
49 | model_client=roboml_speecht5,
50 | config=t2s_config,
51 | component_name="text_to_speech",
52 | )
53 |
54 | launcher = Launcher()
55 | launcher.add_pkg(
56 | components=[speech_to_text, mllm, text_to_speech],
57 | )
58 | launcher.bringup()
59 |
--------------------------------------------------------------------------------
/examples/go_to_x.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | import json
3 | import numpy as np
4 | from agents.components import LLM
5 | from agents.models import Llama3_1
6 | from agents.vectordbs import ChromaDB
7 | from agents.config import LLMConfig
8 | from agents.clients.roboml import HTTPDBClient
9 | from agents.clients.ollama import OllamaClient
10 | from agents.ros import Launcher, Topic
11 |
12 | # Start a Llama3.1 based llm component using ollama client
13 | llama = Llama3_1(name="llama")
14 | llama_client = OllamaClient(llama)
15 |
16 | # Initialize a vector DB that will store our routes
17 | chroma = ChromaDB(name="MainDB")
18 | chroma_client = HTTPDBClient(db=chroma)
19 |
20 | # Define LLM input and output topics including goal_point topic of type PoseStamped
21 | goto_in = Topic(name="goto_in", msg_type="String")
22 | goal_point = Topic(name="goal_point", msg_type="PoseStamped")
23 |
24 | config = LLMConfig(
25 | enable_rag=True,
26 | collection_name="map",
27 | distance_func="l2",
28 | n_results=1,
29 | add_metadata=True,
30 | )
31 |
32 | # initialize the component
33 | goto = LLM(
34 | inputs=[goto_in],
35 | outputs=[goal_point],
36 | model_client=llama_client,
37 | db_client=chroma_client, # check the previous example where we setup this database client
38 | trigger=goto_in,
39 | config=config,
40 | component_name="go_to_x",
41 | )
42 |
43 | # set a component prompt
44 | goto.set_component_prompt(
45 | template="""From the given metadata, extract coordinates and provide
46 | the coordinates in the following json format:\n {"position": coordinates}"""
47 | )
48 |
49 |
50 | # pre-process the output before publishing to a topic of msg_type PoseStamped
51 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]:
52 | # extract the json part of the output string (including brackets)
53 | # one can use sophisticated regex parsing here but we'll keep it simple
54 | json_string = output[output.find("{") : output.rfind("}") + 1]
55 | # load the string as a json and extract position coordinates
56 | # if there is an error, return None, i.e. no output would be published to goal_point
57 | try:
58 | json_dict = json.loads(json_string)
59 | coordinates = np.fromstring(json_dict["position"], sep=",", dtype=np.float64)
60 | print("Coordinates Extracted:", coordinates)
61 | if coordinates.shape[0] < 2 or coordinates.shape[0] > 3:
62 | return
63 | elif (
64 | coordinates.shape[0] == 2
65 | ): # sometimes LLMs avoid adding the zeros of z-dimension
66 | coordinates = np.append(coordinates, 0)
67 | return coordinates
68 | except Exception:
69 | return
70 |
71 |
72 | # add the pre-processing function to the goal_point output topic
73 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point)
74 |
75 | # Launch the component
76 | launcher = Launcher()
77 | launcher.add_pkg(components=[goto])
78 | launcher.bringup()
79 |
--------------------------------------------------------------------------------
/examples/prompt_engineering.py:
--------------------------------------------------------------------------------
1 | from agents.components import Vision, MLLM
2 | from agents.models import VisionModel, Idefics2
3 | from agents.clients.roboml import RESPModelClient, HTTPModelClient
4 | from agents.ros import Topic, Launcher
5 | from agents.config import VisionConfig
6 |
7 | image0 = Topic(name="image_raw", msg_type="Image")
8 | detections_topic = Topic(name="detections", msg_type="Detections")
9 |
10 | object_detection = VisionModel(
11 | name="object_detection", checkpoint="dino-4scale_r50_8xb2-12e_coco"
12 | )
13 | roboml_detection = RESPModelClient(object_detection)
14 |
15 | detection_config = VisionConfig(threshold=0.5)
16 | vision = Vision(
17 | inputs=[image0],
18 | outputs=[detections_topic],
19 | trigger=image0,
20 | config=detection_config,
21 | model_client=roboml_detection,
22 | component_name="detection_component",
23 | )
24 |
25 | text_query = Topic(name="text0", msg_type="String")
26 | text_answer = Topic(name="text1", msg_type="String")
27 |
28 | idefics = Idefics2(name="idefics_model")
29 | idefics_client = HTTPModelClient(idefics)
30 |
31 | mllm = MLLM(
32 | inputs=[text_query, image0, detections_topic],
33 | outputs=[text_answer],
34 | model_client=idefics_client,
35 | trigger=text_query,
36 | component_name="mllm_component",
37 | )
38 |
39 | mllm.set_component_prompt(
40 | template="""Imagine you are a robot.
41 | This image has following items: {{ detections }}.
42 | Answer the following about this image: {{ text0 }}"""
43 | )
44 | launcher = Launcher()
45 | launcher.add_pkg(components=[vision, mllm])
46 | launcher.bringup()
47 |
--------------------------------------------------------------------------------
/examples/semantic_map.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | from agents.components import MapEncoding, Vision, MLLM
3 | from agents.models import VisionModel, Llava
4 | from agents.clients.roboml import RESPModelClient, HTTPDBClient
5 | from agents.clients.ollama import OllamaClient
6 | from agents.ros import Topic, MapLayer, Launcher, FixedInput
7 | from agents.vectordbs import ChromaDB
8 | from agents.config import MapConfig, VisionConfig
9 |
10 | # Define the image input topic
11 | image0 = Topic(name="image_raw", msg_type="Image")
12 | # Create a detection topic
13 | detections_topic = Topic(name="detections", msg_type="Detections")
14 |
15 | # Add an object detection model
16 | object_detection = VisionModel(
17 | name="object_detection", checkpoint="dino-4scale_r50_8xb2-12e_coco"
18 | )
19 | roboml_detection = RESPModelClient(object_detection)
20 |
21 | # Initialize the Vision component
22 | detection_config = VisionConfig(threshold=0.5)
23 | vision = Vision(
24 | inputs=[image0],
25 | outputs=[detections_topic],
26 | trigger=image0,
27 | config=detection_config,
28 | model_client=roboml_detection,
29 | component_name="detection_component",
30 | )
31 |
32 |
33 | # Define a model client (working with Ollama in this case)
34 | llava = Llava(name="llava")
35 | llava_client = OllamaClient(llava)
36 |
37 | # Define a fixed input for the component
38 | introspection_query = FixedInput(
39 | name="introspection_query",
40 | msg_type="String",
41 | fixed="What kind of a room is this? Is it an office, a bedroom or a kitchen? Give a one word answer, out of the given choices",
42 | )
43 | # Define output of the component
44 | introspection_answer = Topic(name="introspection_answer", msg_type="String")
45 |
46 | # Start a timed (periodic) component using the mllm model defined earlier
47 | # This component answers the same question after every 15 seconds
48 | introspector = MLLM(
49 | inputs=[introspection_query, image0], # we use the image0 topic defined earlier
50 | outputs=[introspection_answer],
51 | model_client=llava_client,
52 | trigger=15.0, # we provide the time interval as a float value to the trigger parameter
53 | component_name="introspector",
54 | )
55 |
56 |
57 | # Define an arbitrary function to validate the output of the introspective component
58 | # before publication.
59 | def introspection_validation(output: str) -> Optional[str]:
60 | for option in ["office", "bedroom", "kitchen"]:
61 | if option in output.lower():
62 | return option
63 |
64 |
65 | introspector.add_publisher_preprocessor(introspection_answer, introspection_validation)
66 |
67 | # Object detection output from vision component
68 | layer1 = MapLayer(subscribes_to=detections_topic, temporal_change=True)
69 | # Introspection output from mllm component
70 | layer2 = MapLayer(subscribes_to=introspection_answer, resolution_multiple=3)
71 |
72 | # Initialize mandatory topics defining the robots localization in space
73 | position = Topic(name="odom", msg_type="Odometry")
74 | map_topic = Topic(name="map", msg_type="OccupancyGrid")
75 |
76 | # Initialize a vector DB that will store our semantic map
77 | chroma = ChromaDB(name="MainDB")
78 | chroma_client = HTTPDBClient(db=chroma)
79 |
80 | # Create the map component
81 | map_conf = MapConfig(map_name="map") # We give our map a name
82 | map = MapEncoding(
83 | layers=[layer1, layer2],
84 | position=position,
85 | map_topic=map_topic,
86 | config=map_conf,
87 | db_client=chroma_client,
88 | trigger=15.0,
89 | component_name="map_encoding",
90 | )
91 |
92 | # Launch the components
93 | launcher = Launcher()
94 | launcher.add_pkg(components=[vision, introspector, map])
95 | launcher.bringup()
96 |
--------------------------------------------------------------------------------
/examples/semantic_router.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | import json
3 | import numpy as np
4 | from agents.components import LLM, SemanticRouter
5 | from agents.models import Llama3_1
6 | from agents.vectordbs import ChromaDB
7 | from agents.config import LLMConfig, SemanticRouterConfig
8 | from agents.clients.roboml import HTTPDBClient
9 | from agents.clients.ollama import OllamaClient
10 | from agents.ros import Launcher, Topic, Route
11 |
12 |
13 | # Start a Llama3.1 based llm component using ollama client
14 | llama = Llama3_1(name="llama")
15 | llama_client = OllamaClient(llama)
16 |
17 | # Initialize a vector DB that will store our routes
18 | chroma = ChromaDB(name="MainDB")
19 | chroma_client = HTTPDBClient(db=chroma)
20 |
21 |
22 | # Make a generic LLM component using the Llama3_1 model
23 | llm_in = Topic(name="text_in_llm", msg_type="String")
24 | llm_out = Topic(name="text_out_llm", msg_type="String")
25 |
26 | llm = LLM(inputs=[llm_in], outputs=[llm_out], model_client=llama_client, trigger=llm_in)
27 |
28 |
29 | # Define LLM input and output topics including goal_point topic of type PoseStamped
30 | goto_in = Topic(name="goto_in", msg_type="String")
31 | goal_point = Topic(name="goal_point", msg_type="PoseStamped")
32 |
33 | config = LLMConfig(
34 | enable_rag=True,
35 | collection_name="map",
36 | distance_func="l2",
37 | n_results=1,
38 | add_metadata=True,
39 | )
40 |
41 | # initialize the component
42 | goto = LLM(
43 | inputs=[goto_in],
44 | outputs=[goal_point],
45 | model_client=llama_client,
46 | db_client=chroma_client, # check the previous example where we setup this database client
47 | trigger=goto_in,
48 | config=config,
49 | component_name="go_to_x",
50 | )
51 |
52 | # set a component prompt
53 | goto.set_component_prompt(
54 | template="""From the given metadata, extract coordinates and provide
55 | the coordinates in the following json format:\n {"position": coordinates}"""
56 | )
57 |
58 |
59 | # pre-process the output before publishing to a topic of msg_type PoseStamped
60 | def llm_answer_to_goal_point(output: str) -> Optional[np.ndarray]:
61 | # extract the json part of the output string (including brackets)
62 | # one can use sophisticated regex parsing here but we'll keep it simple
63 | json_string = output[output.find("{") : output.find("}") + 1]
64 |
65 | # load the string as a json and extract position coordinates
66 | # if there is an error, return None, i.e. no output would be published to goal_point
67 | try:
68 | json_dict = json.loads(json_string)
69 | return np.array(json_dict["position"])
70 | except Exception:
71 | return
72 |
73 |
74 | # add the pre-processing function to the goal_point output topic
75 | goto.add_publisher_preprocessor(goal_point, llm_answer_to_goal_point)
76 |
77 | # Create the input topic for the router
78 | query_topic = Topic(name="question", msg_type="String")
79 |
80 | # Define a route to a topic that processes go-to-x commands
81 | goto_route = Route(
82 | routes_to=goto_in,
83 | samples=[
84 | "Go to the door",
85 | "Go to the kitchen",
86 | "Get me a glass",
87 | "Fetch a ball",
88 | "Go to hallway",
89 | ],
90 | )
91 |
92 | # Define a route to a topic that is input to an LLM component
93 | llm_route = Route(
94 | routes_to=llm_in,
95 | samples=[
96 | "What is the capital of France?",
97 | "Is there life on Mars?",
98 | "How many tablespoons in a cup?",
99 | "How are you today?",
100 | "Whats up?",
101 | ],
102 | )
103 |
104 | router_config = SemanticRouterConfig(router_name="go-to-router", distance_func="l2")
105 | # Initialize the router component
106 | router = SemanticRouter(
107 | inputs=[query_topic],
108 | routes=[llm_route, goto_route],
109 | default_route=llm_route, # If none of the routes fall within a distance threshold
110 | config=router_config,
111 | db_client=chroma_client, # reusing the db_client from the previous example
112 | component_name="router",
113 | )
114 |
115 | # Launch the components
116 | launcher = Launcher()
117 | launcher.add_pkg(components=[llm, goto, router])
118 | launcher.bringup()
119 |
--------------------------------------------------------------------------------
/examples/tool_calling.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from agents.components import LLM
3 | from agents.models import Llama3_1
4 | from agents.vectordbs import ChromaDB
5 | from agents.config import LLMConfig
6 | from agents.clients.roboml import HTTPDBClient
7 | from agents.clients.ollama import OllamaClient
8 | from agents.ros import Launcher, Topic
9 |
10 | # Start a Llama3.1 based llm component using ollama client
11 | llama = Llama3_1(name="llama")
12 | llama_client = OllamaClient(llama)
13 |
14 | # Initialize a vector DB that will store our routes
15 | chroma = ChromaDB(name="MainDB")
16 | chroma_client = HTTPDBClient(db=chroma)
17 |
18 | # Define LLM input and output topics including goal_point topic of type PoseStamped
19 | goto_in = Topic(name="goto_in", msg_type="String")
20 | goal_point = Topic(name="goal_point", msg_type="PoseStamped")
21 |
22 | config = LLMConfig(
23 | enable_rag=True,
24 | collection_name="map",
25 | distance_func="l2",
26 | n_results=1,
27 | add_metadata=True,
28 | )
29 |
30 | # initialize the component
31 | goto = LLM(
32 | inputs=[goto_in],
33 | outputs=[goal_point],
34 | model_client=llama_client,
35 | db_client=chroma_client, # check the previous example where we setup this database client
36 | trigger=goto_in,
37 | config=config,
38 | component_name="go_to_x",
39 | )
40 |
41 | # set a component prompt
42 | goto.set_component_prompt(
43 | template="""What are the position coordinates in the given metadata?"""
44 | )
45 |
46 |
47 | # tool function: its output will be published to the goal_point topic (msg_type PoseStamped)
48 | def get_coordinates(position: list[float]) -> np.ndarray:
49 | """Get position coordinates"""
50 | return np.array(position, dtype=float)
51 |
52 |
53 | function_description = {
54 | "type": "function",
55 | "function": {
56 | "name": "get_coordinates",
57 | "description": "Get position coordinates",
58 | "parameters": {
59 | "type": "object",
60 | "properties": {
61 | "position": {
62 | "type": "list[float]",
63 | "description": "The position coordinates in x, y and z",
64 | }
65 | },
66 |             "required": ["position"],
67 |         },
68 | },
69 | }
70 |
71 | # register the function as a tool along with its description
72 | goto.register_tool(
73 | tool=get_coordinates,
74 | tool_description=function_description,
75 | send_tool_response_to_model=False,
76 | )
77 |
78 | # Launch the component
79 | launcher = Launcher()
80 | launcher.add_pkg(components=[goto])
81 | launcher.bringup()
82 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.pytest.ini_options]
2 | minversion = "6.0"
3 | addopts = "-ra -q"
4 | log_cli = true
5 | log_cli_level = "INFO"
6 | log_cli_format="[%(levelname)s] [%(asctime)s] [%(name)s] [%(process)d-%(thread)d] %(message)s"
7 | testpaths = [
8 | "agents/tests"
9 | ]
10 |
11 | [tool.interrogate]
12 | ignore-init-method = true
13 | ignore-init-module = true
14 | ignore-magic = false
15 | ignore-semiprivate = false
16 | ignore-private = false
17 | ignore-property-decorators = false
18 | ignore-module = true
19 | ignore-nested-functions = false
20 | ignore-nested-classes = true
21 | ignore-setters = false
22 | exclude = ["setup.py", "docs", "build", "log", "install", "agents/tests", "examples"]
23 | ignore-regex = ["^get$", "^mock_.*", ".*BaseClass.*", "^main"]
24 | quiet = false
25 | whitelist-regex = []
26 | color = true
27 | generate-badge = "."
28 | badge-format = "svg"
29 |
30 | [tool.ruff]
31 | extend-exclude = [".mypy_cache", ".tox", ".venv", "buck-out", "build", ".pytest_cache"]
32 | fix = true
33 | line-length = 88
34 | preview = true
35 | [tool.ruff.lint]
36 | ignore = ["E203", "E266", "E501", "F403", "F401"]
37 | select = ["B","C","E","F","W","B9"]
38 | [tool.ruff.lint.mccabe]
39 | max-complexity = 11
40 |
41 | [tool.bumpver]
42 | current_version = "0.3.1"
43 | version_pattern = "MAJOR.MINOR.PATCH"
44 | commit_message = "(chore) bump version {old_version} -> {new_version}"
45 | tag_message = "{new_version}"
46 | tag_scope = "default"
47 | pre_commit_hook = ""
48 | post_commit_hook = ""
49 | commit = true
50 | tag = true
51 | push = true
52 |
53 | [tool.bumpver.file_patterns]
54 | "agents/package.xml" = [
55 | "{version}",
56 | ]
57 |
--------------------------------------------------------------------------------