├── .gitignore
├── LICENSE
├── README.md
├── async_tracking.py
├── constants.py
├── main.py
├── servoing_example.py
├── tracking_publisher.py
└── webcam_publisher.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | api_key.txt
2 | *.gz

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |                                  Apache License
2 |                            Version 2.0, January 2004
3 |                         http://www.apache.org/licenses/
4 | 
5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 | 
7 |    1. Definitions.
8 | 
9 |       "License" shall mean the terms and conditions for use, reproduction,
10 |       and distribution as defined by Sections 1 through 9 of this document.
11 | 
12 |       "Licensor" shall mean the copyright owner or entity authorized by
13 |       the copyright owner that is granting the License.
14 | 
15 |       "Legal Entity" shall mean the union of the acting entity and all
16 |       other entities that control, are controlled by, or are under common
17 |       control with that entity. For the purposes of this definition,
18 |       "control" means (i) the power, direct or indirect, to cause the
19 |       direction or management of such entity, whether by contract or
20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 |       outstanding shares, or (iii) beneficial ownership of such entity.
22 | 
23 |       "You" (or "Your") shall mean an individual or Legal Entity
24 |       exercising permissions granted by this License.
25 | 
26 |       "Source" form shall mean the preferred form for making modifications,
27 |       including but not limited to software source code, documentation
28 |       source, and configuration files.
29 | 
30 |       "Object" form shall mean any form resulting from mechanical
31 |       transformation or translation of a Source form, including but
32 |       not limited to compiled object code, generated documentation,
33 |       and conversions to other media types.
34 | 
35 |       "Work" shall mean the work of authorship, whether in Source or
36 |       Object form, made available under the License, as indicated by a
37 |       copyright notice that is included in or attached to the work
38 |       (an example is provided in the Appendix below).
39 | 
40 |       "Derivative Works" shall mean any work, whether in Source or Object
41 |       form, that is based on (or derived from) the Work and for which the
42 |       editorial revisions, annotations, elaborations, or other modifications
43 |       represent, as a whole, an original work of authorship. For the purposes
44 |       of this License, Derivative Works shall not include works that remain
45 |       separable from, or merely link (or bind by name) to the interfaces of,
46 |       the Work and Derivative Works thereof.
47 | 
48 |       "Contribution" shall mean any work of authorship, including
49 |       the original version of the Work and any modifications or additions
50 |       to that Work or Derivative Works thereof, that is intentionally
51 |       submitted to Licensor for inclusion in the Work by the copyright owner
52 |       or by an individual or Legal Entity authorized to submit on behalf of
53 |       the copyright owner.
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Real-time object detection & tracking with moondream.ai
2 | 
3 | A set of Python scripts that demonstrates asynchronous object detection and tracking using:
4 | - [OpenCV](https://opencv.org/) for video capture, feature tracking (Lucas-Kanade optical flow), and drawing
5 | - A [Kalman Filter](https://en.wikipedia.org/wiki/Kalman_filter) for smoothing tracked coordinates
6 | - [Moondream](https://moondream.ai/) for VLM-based object detection
7 | - [Zenoh](https://zenoh.io/) for network communication between components
8 | 
9 | Disclaimer: if it isn't already obvious, this document was written by an LLM, but it should be mostly correct.
10 | 
11 | ## System Architecture
12 | 
13 | The system consists of three main components that can run on different machines, as long as they're on the same network:
14 | 
15 | 1. **Webcam Publisher (`webcam_publisher.py`)**
16 |    - Captures frames from a webcam and publishes them to a Zenoh topic (`robot/camera/frame`)
17 |    - Can run on either your laptop or directly on a robot
18 |    - Configurable parameters for camera resolution, FPS, and frame processing
19 |    - Supports both monocular and stereo camera setups with cropping options
20 | 
21 | 2. **Object Tracking Publisher (`tracking_publisher.py`)**
22 |    - Subscribes to camera frames from the webcam publisher
23 |    - Uses Moondream for object detection and tracking
24 |    - Publishes normalized object position and size information to `tracking/position`
25 |    - Position messages are published in the following JSON format:
26 |    ```json
27 |    {
28 |        "x": float,         // Normalized x position (0-1)
29 |        "y": float,         // Normalized y position (0-1)
30 |        "width": float,     // Normalized width (0-1)
31 |        "height": float,    // Normalized height (0-1)
32 |        "timestamp": float  // Unix timestamp
33 |    }
34 |    ```
35 | 
36 | 3. **Visual Servoing Example (`servoing_example.py`)**
37 |    - Demonstrates how to use tracking for robot control
38 |    - Subscribes to camera frames and computes twist commands
39 |    - Publishes robot control commands to the `robot/cmd` topic
40 |    - Twist messages are in the following format:
41 |    ```json
42 |    {
43 |        "x": float,     // Linear velocity in m/s
44 |        "theta": float  // Angular velocity in rad/s
45 |    }
46 |    ```
47 | 
48 | ## Features
49 | 
50 | 1. **Real-time Detection**
51 |    A detection thread periodically attempts to detect a specified object in the camera feed using the Moondream model.
52 | 
53 | 2. **Fast Optical Flow Tracking**
54 |    The main loop uses lightweight Lucas-Kanade optical flow to track features within the bounding box, ensuring fast updates.
55 | 
56 | 3. **Rewind and Reapply**
57 |    When a new detection arrives, it's applied retroactively: the tracker rewinds to the frame the detection was computed on, then replays optical flow up to the present, so the bounding box and features remain up-to-date (see the sketch below).
58 | 
59 | Concurrency (`threading`) and buffering (`collections`) come from Python's standard library, so the only third-party dependencies are the ones listed under Setup.
60 | 
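The rewind-and-reapply step can be summarized with the following simplified sketch (names here are illustrative, not the actual API; the real implementation is `AsyncTracker._backfill_tracking` in `async_tracking.py`):

```python
# Simplified sketch of rewind-and-reapply (illustrative names).
# frame_buffer holds (frame_index, bgr, gray) tuples, as in async_tracking.py.
import cv2
import numpy as np

def mean_flow(prev_gray, curr_gray, features):
    """Mean (dx, dy) of features tracked by Lucas-Kanade between two frames."""
    new_feats, status, _ = cv2.calcOpticalFlowPyrLK(prev_gray, curr_gray, features, None)
    ok = status.flatten() == 1
    if not ok.any():
        return (0.0, 0.0), new_feats
    d = np.squeeze(np.mean(new_feats[ok] - features[ok], axis=0))
    return (float(d[0]), float(d[1])), new_feats

def apply_late_detection(frame_buffer, det_frame_index, det_bbox, features):
    """Rewind to the frame the detection was computed on, then replay flow to the present."""
    start = next(i for i, (idx, _, _) in enumerate(frame_buffer)
                 if idx == det_frame_index)  # assumes that frame is still buffered
    x0, y0, x1, y1 = det_bbox
    for j in range(start + 1, len(frame_buffer)):
        (dx, dy), features = mean_flow(frame_buffer[j - 1][2], frame_buffer[j][2], features)
        x0, y0, x1, y1 = x0 + dx, y0 + dy, x1 + dx, y1 + dy
    return (x0, y0, x1, y1)  # bounding box brought up to the present frame
```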
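To consume the published messages from your own code, a minimal subscriber could look like the sketch below. It assumes the same `eclipse-zenoh` Python API that the scripts in this repo already use, and the `tracking/position` message format shown above:

```python
# Minimal sketch: listen for position messages from tracking_publisher.py.
import json
import time

import zenoh

def on_position(sample):
    msg = json.loads(sample.payload.to_bytes())
    print(f"target at ({msg['x']:.2f}, {msg['y']:.2f}), "
          f"size {msg['width']:.2f} x {msg['height']:.2f}")

session = zenoh.open(zenoh.Config())
subscriber = session.declare_subscriber("tracking/position", on_position)
try:
    while True:
        time.sleep(0.1)  # callbacks arrive on Zenoh's own threads
except KeyboardInterrupt:
    session.close()
```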
61 | ## Setup
62 | 
63 | 1. **Clone or Download** this repository.
64 | 2. **Install dependencies** (using `pip`):
65 | 
66 |    ```bash
67 |    pip install opencv-python Pillow numpy moondream eclipse-zenoh rerun-sdk
68 |    ```
69 | 
70 | 3. **API Key**
71 |    The scripts read an API key from a file named `api_key.txt` in the same directory. Make sure this file contains your Moondream API key:
72 | 
73 |    ```
74 |    your_moondream_api_key_goes_here
75 |    ```
76 | 
77 | 4. **Run Components**
78 |    First, start the webcam publisher:
79 |    ```bash
80 |    python webcam_publisher.py
81 |    ```
82 | 
83 |    Then, in a separate terminal, start object tracking:
84 |    ```bash
85 |    python tracking_publisher.py --prompt "object to track"
86 |    ```
87 | 
88 |    Optionally, run the visual servoing example:
89 |    ```bash
90 |    python servoing_example.py --prompt "object to track"
91 |    ```
92 |    Replace "object to track" with your target object (e.g., "red ball", "person", etc.).
93 | 
94 | ## Usage
95 | 
96 | 1. **Network Setup**
97 |    - Ensure all machines are on the same network
98 |    - Zenoh will automatically discover peers on the network
99 |    - No manual IP configuration is needed in most cases
100 | 
101 | 2. **Webcam Publisher**
102 |    - Publishes camera frames at a configurable resolution and FPS
103 |    - Check `constants.py` for camera configuration options
104 |    - Supports stereo camera setups, with options to crop to the left or right frame
105 | 
106 | 3. **Object Tracking**
107 |    - Subscribes to camera frames and tracks the specified object
108 |    - Publishes normalized position information (0-1 range)
109 |    - Uses Rerun for real-time visualization of tracking
110 |    - Position updates are smoothed using Kalman filtering
111 | 
112 | 4. **Visual Servoing Example**
113 |    - Demonstrates robot control using tracking data
114 |    - Configurable gains in `servoing_example.py`:
115 |      - `gain`: controls how aggressively to turn (default: 0.005)
116 |      - `max_angular_z`: maximum turning speed in rad/s (default: 0.35)
117 | 
118 | 5. **Controls**
119 |    - Press **Ctrl+C** in any terminal to stop the respective component
120 | 

--------------------------------------------------------------------------------
/async_tracking.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | import numpy as np
3 | import moondream as md
4 | from PIL import Image
5 | import threading
6 | import time
7 | import collections
8 | 
9 | # ----- Simple 1D Kalman Filter for one coordinate -----
10 | class KalmanFilter:
11 |     def __init__(self, x=0.0, p=1.0, Q=0.1, R=2.0):
12 |         self.x = x  # state estimate
13 |         self.p = p  # estimation covariance
14 |         self.Q = Q  # process noise covariance
15 |         self.R = R  # measurement noise covariance
16 | 
17 |     def predict(self, u):
18 |         # Prediction step: add optical flow displacement u
19 |         self.x += u
20 |         self.p += self.Q
21 | 
22 |     def update(self, z):
23 |         # Correction step: update with measurement z
24 |         K = self.p / (self.p + self.R)
25 |         self.x += K * (z - self.x)
26 |         self.p = (1 - K) * self.p
27 | 
28 | 
29 | # ----- 2D Kalman Filter using two 1D filters for center (x,y) -----
30 | class KalmanFilter2D:
31 |     def
__init__(self, center=(0, 0)): 32 | self.kf_x = KalmanFilter(x=center[0]) 33 | self.kf_y = KalmanFilter(x=center[1]) 34 | 35 | def predict(self, u): 36 | self.kf_x.predict(u[0]) 37 | self.kf_y.predict(u[1]) 38 | 39 | def update(self, z): 40 | self.kf_x.update(z[0]) 41 | self.kf_y.update(z[1]) 42 | 43 | def get_state(self): 44 | return (self.kf_x.x, self.kf_y.x) 45 | 46 | 47 | # ----- Utility Functions ----- 48 | def cv2_to_pil(cv2_image): 49 | rgb = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB) 50 | return Image.fromarray(rgb) 51 | 52 | def draw_bbox(frame, bbox, color=(0, 255, 0), thickness=2): 53 | # bbox is (x_min, y_min, x_max, y_max) 54 | cv2.rectangle(frame, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, thickness) 55 | return frame 56 | 57 | 58 | # ----- Global Shared State ----- 59 | state = { 60 | 'kalman': None, 61 | 'bbox': None, 62 | 'features': None, 63 | 'lock': threading.Lock(), 64 | 'frame_buffer': collections.deque(maxlen=200), 65 | 'pending_detection': None, 66 | 'requested_detection_frame': None 67 | } 68 | 69 | 70 | # ----- Detection Thread ----- 71 | def detection_loop(model, prompt, detection_interval=2.0): 72 | """ 73 | Periodically requests detection but doesn't update Kalman right away. 74 | Instead, it records the result, which the main loop will apply at the correct 'past' frame. 75 | """ 76 | global state 77 | 78 | while True: 79 | time.sleep(detection_interval) 80 | 81 | with state['lock']: 82 | if not state['frame_buffer']: 83 | continue 84 | last_frame_index, last_frame_bgr, last_frame_gray = state['frame_buffer'][-1] 85 | state['requested_detection_frame'] = last_frame_index 86 | 87 | pil_image = cv2_to_pil(last_frame_bgr) 88 | encoded = model.encode_image(pil_image) 89 | result = model.detect(encoded, prompt) 90 | objects = result.get("objects", []) 91 | 92 | if objects: 93 | obj = objects[0] 94 | height, width = last_frame_bgr.shape[:2] 95 | x_min = int(obj['x_min'] * width) 96 | y_min = int(obj['y_min'] * height) 97 | x_max = int(obj['x_max'] * width) 98 | y_max = int(obj['y_max'] * height) 99 | with state['lock']: 100 | state['pending_detection'] = ( 101 | last_frame_index, 102 | (x_min, y_min, x_max, y_max) 103 | ) 104 | 105 | 106 | def run_async_tracking(model, prompt, display_ui=True): 107 | """ 108 | Main tracking logic that combines fast optical flow with slower but accurate detections. 109 | Optionally display a UI with trackbars and the bounding box. 110 | 111 | Args: 112 | model: moondream model instance 113 | prompt: object prompt / label (e.g. "pliers", "coffee mug") 114 | display_ui: if True, show the OpenCV window, trackbars, etc. 
115 | """ 116 | global state 117 | 118 | cap = cv2.VideoCapture(0) 119 | if not cap.isOpened(): 120 | print("Error: Cannot open webcam.") 121 | return 122 | 123 | if display_ui: 124 | cv2.namedWindow("Asynchronous Tracking", cv2.WINDOW_NORMAL) 125 | # Create trackbars for maxCorners, qualityLevel (scaled by 100), and minDistance 126 | cv2.createTrackbar("maxCorners", "Asynchronous Tracking", 27, 300, lambda v: None) 127 | cv2.createTrackbar("qualityLevel x100", "Asynchronous Tracking", 10, 100, lambda v: None) 128 | cv2.createTrackbar("minDistance", "Asynchronous Tracking", 10, 50, lambda v: None) 129 | 130 | frame_index = 0 131 | ret, frame = cap.read() 132 | if not ret: 133 | print("Failed to grab initial frame.") 134 | return 135 | 136 | # Initial detection to bootstrap state 137 | pil_image = cv2_to_pil(frame) 138 | encoded = model.encode_image(pil_image) 139 | result = model.detect(encoded, prompt) 140 | objects = result.get("objects", []) 141 | if not objects: 142 | print(f"No {prompt} detected initially.") 143 | return 144 | 145 | obj = objects[0] 146 | height, width = frame.shape[:2] 147 | x_min = int(obj['x_min'] * width) 148 | y_min = int(obj['y_min'] * height) 149 | x_max = int(obj['x_max'] * width) 150 | y_max = int(obj['y_max'] * height) 151 | center_x = (x_min + x_max) // 2 152 | center_y = (y_min + y_max) // 2 153 | 154 | with state['lock']: 155 | state['bbox'] = (x_min, y_min, x_max, y_max) 156 | state['kalman'] = KalmanFilter2D(center=(center_x, center_y)) 157 | gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) 158 | mask = np.zeros_like(gray) 159 | mask[y_min:y_max, x_min:x_max] = 255 160 | features = cv2.goodFeaturesToTrack( 161 | gray, 162 | mask=mask, 163 | maxCorners=20, 164 | qualityLevel=0.1, 165 | minDistance=10 166 | ) 167 | state['features'] = features 168 | 169 | # Start detection thread 170 | detection_thread = threading.Thread( 171 | target=detection_loop, 172 | args=(model, prompt), 173 | daemon=True 174 | ) 175 | detection_thread.start() 176 | 177 | prev_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) 178 | 179 | while True: 180 | ret, frame = cap.read() 181 | if not ret: 182 | print("Main loop: Failed to grab frame") 183 | break 184 | 185 | curr_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) 186 | 187 | with state['lock']: 188 | # Store current frame in the buffer 189 | state['frame_buffer'].append((frame_index, frame.copy(), curr_gray.copy())) 190 | features = state.get('features', None) 191 | 192 | # Lucas-Kanade Optical Flow 193 | if features is not None: 194 | new_features, status, _ = cv2.calcOpticalFlowPyrLK(prev_gray, curr_gray, features, None) 195 | good_old = features[status.flatten() == 1] 196 | good_new = new_features[status.flatten() == 1] 197 | 198 | if len(good_old) > 0: 199 | displacement = np.mean(good_new - good_old, axis=0) 200 | displacement = np.squeeze(displacement) 201 | dx, dy = displacement 202 | with state['lock']: 203 | if state['kalman'] is not None: 204 | state['kalman'].predict((dx, dy)) 205 | if state['bbox'] is not None: 206 | x_min, y_min, x_max, y_max = state['bbox'] 207 | state['bbox'] = (int(x_min + dx), int(y_min + dy), int(x_max + dx), int(y_max + dy)) 208 | 209 | with state['lock']: 210 | state['features'] = new_features 211 | 212 | if display_ui: 213 | # Draw tracked points 214 | for pt in good_new: 215 | x, y = pt.ravel() 216 | cv2.circle(frame, (int(x), int(y)), 3, (0, 0, 255), -1) 217 | 218 | # If there's a completed detection waiting, "rewind" and reapply it 219 | with state['lock']: 220 | if state['pending_detection'] 
is not None:
221 |                 det_frame_index, det_bbox = state['pending_detection']
222 |                 state['pending_detection'] = None
223 | 
224 |                 # Rewind
225 |                 state['kalman'] = KalmanFilter2D()
226 |                 state['bbox'] = None
227 |                 state['features'] = None
228 | 
229 |                 detection_found = False
230 |                 # Search for detection frame in buffer
231 |                 for i, (f_idx, f_bgr, f_gray) in enumerate(state['frame_buffer']):
232 |                     if f_idx == det_frame_index:
233 |                         x_min, y_min, x_max, y_max = det_bbox
234 |                         cx = (x_min + x_max) // 2
235 |                         cy = (y_min + y_max) // 2
236 |                         state['kalman'] = KalmanFilter2D(center=(cx, cy))
237 |                         state['bbox'] = (x_min, y_min, x_max, y_max)
238 | 
239 |                         mask = np.zeros_like(f_gray)
240 |                         mask[y_min:y_max, x_min:x_max] = 255
241 |                         if display_ui:
242 |                             maxCorners = cv2.getTrackbarPos("maxCorners", "Asynchronous Tracking")
243 |                             ql_scaled = cv2.getTrackbarPos("qualityLevel x100", "Asynchronous Tracking")
244 |                             minDistance = cv2.getTrackbarPos("minDistance", "Asynchronous Tracking")
245 |                             qualityLevel = ql_scaled / 100.0
246 |                         else:
247 |                             # Fallback defaults if no UI is displayed
248 |                             maxCorners = 20
249 |                             qualityLevel = 0.1
250 |                             minDistance = 10
251 | 
252 |                         features = cv2.goodFeaturesToTrack(
253 |                             f_gray,
254 |                             mask=mask,
255 |                             maxCorners=maxCorners,
256 |                             qualityLevel=qualityLevel,
257 |                             minDistance=minDistance
258 |                         )
259 |                         state['features'] = features
260 | 
261 |                         detection_found = True
262 |                         detection_index_in_buffer = i
263 |                         break
264 | 
265 |                 if detection_found:
266 |                     for j in range(detection_index_in_buffer + 1, len(state['frame_buffer'])):
267 |                         prev_idx, prev_bgr, prev_gray_buf = state['frame_buffer'][j - 1]
268 |                         curr_idx, curr_bgr, curr_gray_buf = state['frame_buffer'][j]
269 | 
270 |                         if state['features'] is not None and len(state['features']) > 0:
271 |                             new_features, status, _ = cv2.calcOpticalFlowPyrLK(
272 |                                 prev_gray_buf, curr_gray_buf, state['features'], None
273 |                             )
274 |                             good_old = state['features'][status.flatten() == 1]
275 |                             good_new = new_features[status.flatten() == 1]
276 | 
277 |                             if len(good_old) > 0:
278 |                                 displacement = np.mean(good_new - good_old, axis=0)
279 |                                 displacement = np.squeeze(displacement)
280 |                                 dx, dy = displacement
281 | 
282 |                                 if state['kalman']:
283 |                                     state['kalman'].predict((dx, dy))
284 | 
285 |                                 if state['bbox'] is not None:
286 |                                     x_min, y_min, x_max, y_max = state['bbox']
287 |                                     state['bbox'] = (int(x_min + dx), int(y_min + dy), int(x_max + dx), int(y_max + dy))
288 | 
289 |                             state['features'] = new_features
290 |                         else:
291 |                             break
292 | 
293 |                     print(f"Detection backfilled at frame {det_frame_index}; "
294 |                           "state replayed forward through the buffered frames.")
295 | 
296 |         # Display bounding box if we have one
297 |         with state['lock']:
298 |             if state['kalman'] is not None and state['bbox'] is not None:
299 |                 center = state['kalman'].get_state()
300 |                 x_min, y_min, x_max, y_max = state['bbox']
301 |                 w = x_max - x_min
302 |                 h = y_max - y_min
303 |                 new_bbox = (
304 |                     int(center[0] - w/2),
305 |                     int(center[1] - h/2),
306 |                     int(center[0] + w/2),
307 |                     int(center[1] + h/2)
308 |                 )
309 |                 if display_ui:
310 |                     draw_bbox(frame, new_bbox)
311 | 
312 |         if display_ui:
313 |             cv2.imshow("Asynchronous Tracking", frame)
314 |             if cv2.waitKey(1) & 0xFF == ord('q'):
315 |                 break
316 | 
317 |         prev_gray = curr_gray
318 |         frame_index += 1
319 | 
320 |     cap.release()
321 |     if display_ui:
322 |         cv2.destroyAllWindows()
323 | 
324 | 
325 | class AsyncTracker:
326 |     def __init__(self, model, prompt, detection_interval=2.0, display_ui=False):
327 |         self.model = model
328 |         self.prompt = prompt
329 |         self.detection_interval = detection_interval
330 |         self.display_ui = display_ui
331 | 
332 |         # Tracking state
333 |         self.kalman = None
334 |         self.bbox = None
335 |         self.features = None
336 |         self.frame_buffer = collections.deque(maxlen=200)
337 |         self.pending_detection = None
338 |         self.requested_detection_frame = None
339 |         self.frame_index = 0
340 | 
341 |         # Threading
342 |         self.lock = threading.Lock()
343 |         self.running = False
344 |         self.detection_thread = None
345 |         self.last_detection_tick = time.time()  # Initialize last detection tick for tick-based updates
346 | 
347 |     def _cv2_to_pil(self, cv2_image):
348 |         rgb = cv2.cvtColor(cv2_image, cv2.COLOR_BGR2RGB)
349 |         return Image.fromarray(rgb)
350 | 
351 |     def _detection_loop(self):
352 |         """Detection loop for thread-based operation. Calls tick() repeatedly."""
353 |         while self.running:
354 |             self.tick()
355 |             time.sleep(0.01)
356 | 
357 |     def tick(self):
358 |         """Perform a single detection update if the detection interval has elapsed.
359 |         This function can be called externally to integrate the detection step with other event loops.
360 |         """
361 |         current_time = time.time()
362 |         if current_time - self.last_detection_tick >= self.detection_interval:
363 |             self.last_detection_tick = current_time
364 |             with self.lock:
365 |                 if not self.frame_buffer:
366 |                     print(f"[{time.time()}] [AsyncTracker] No frames available for detection in tick")
367 |                     return
368 |                 last_frame_index, last_frame_bgr, last_frame_gray = self.frame_buffer[-1]
369 |                 self.requested_detection_frame = last_frame_index
370 |             pil_image = self._cv2_to_pil(last_frame_bgr)
371 |             encoded = self.model.encode_image(pil_image)
372 |             result = self.model.detect(encoded, self.prompt)
373 |             objects = result.get("objects", [])
374 |             if objects:
375 |                 obj = objects[0]
376 |                 height, width = last_frame_bgr.shape[:2]
377 |                 x_min = int(obj['x_min'] * width)
378 |                 y_min = int(obj['y_min'] * height)
379 |                 x_max = int(obj['x_max'] * width)
380 |                 y_max = int(obj['y_max'] * height)
381 |                 with self.lock:
382 |                     self.pending_detection = (last_frame_index, (x_min, y_min, x_max, y_max))
383 | 
384 |     def start(self):
385 |         """Start the tracking system"""
386 |         self.running = True
387 | 
388 |         # Start detection thread
389 |         self.detection_thread = threading.Thread(
390 |             target=self._detection_loop,
391 |             daemon=True
392 |         )
393 |         self.detection_thread.start()
394 | 
395 |     def stop(self):
396 |         """Stop the tracking system"""
397 |         self.running = False
398 |         if self.detection_thread and self.detection_thread.is_alive():
399 |             self.detection_thread.join(timeout=1.0)
400 | 
401 |     def get_state(self):
402 |         """Get current tracking state"""
403 |         with self.lock:
404 |             return {
405 |                 'kalman': self.kalman.get_state() if self.kalman else None,
406 |                 'bbox': self.bbox,
407 |                 'frame_index': self.frame_index
408 |             }
409 | 
410 |     def process_frame(self, frame):
411 |         """Process a new frame and update tracking state"""
412 |         curr_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
413 |         print(f"[{time.time()}] [AsyncTracker] Processing frame index: {self.frame_index}")
414 |         with self.lock:
415 |             print(f"[{time.time()}] [AsyncTracker] Acquired lock for appending frame; buffer length before append: {len(self.frame_buffer)}")
416 |             self.frame_buffer.append((self.frame_index, frame.copy(), curr_gray.copy()))
417 |             print(f"[{time.time()}] [AsyncTracker] Updated frame_buffer length: {len(self.frame_buffer)}")
418 |             features = self.features
419 |             print(f"[{time.time()}] [AsyncTracker] Features status: {'available' if features is
not None else 'None'}") 420 | 421 | # Get previous frame's grayscale image 422 | if len(self.frame_buffer) > 1: 423 | prev_idx, prev_bgr, prev_gray = self.frame_buffer[-2] 424 | else: 425 | # Initialize tracking on first frame 426 | if not self.kalman: 427 | pil_image = self._cv2_to_pil(frame) 428 | encoded = self.model.encode_image(pil_image) 429 | result = self.model.detect(encoded, self.prompt) 430 | objects = result.get("objects", []) 431 | 432 | if objects: 433 | obj = objects[0] 434 | height, width = frame.shape[:2] 435 | x_min = int(obj['x_min'] * width) 436 | y_min = int(obj['y_min'] * height) 437 | x_max = int(obj['x_max'] * width) 438 | y_max = int(obj['y_max'] * height) 439 | center_x = (x_min + x_max) // 2 440 | center_y = (y_min + y_max) // 2 441 | 442 | with self.lock: 443 | self.bbox = (x_min, y_min, x_max, y_max) 444 | self.kalman = KalmanFilter2D(center=(center_x, center_y)) 445 | mask = np.zeros_like(curr_gray) 446 | mask[y_min:y_max, x_min:x_max] = 255 447 | self.features = cv2.goodFeaturesToTrack( 448 | curr_gray, 449 | mask=mask, 450 | maxCorners=20, 451 | qualityLevel=0.1, 452 | minDistance=10 453 | ) 454 | self.frame_index += 1 455 | return 456 | 457 | # Optical flow tracking 458 | if features is not None: 459 | new_features, status, _ = cv2.calcOpticalFlowPyrLK(prev_gray, curr_gray, features, None) 460 | good_old = features[status.flatten() == 1] 461 | good_new = new_features[status.flatten() == 1] 462 | 463 | if len(good_old) > 0: 464 | displacement = np.mean(good_new - good_old, axis=0) 465 | displacement = np.squeeze(displacement) 466 | dx, dy = displacement 467 | print(f"[{time.time()}] [AsyncTracker] Optical flow displacement: dx={dx}, dy={dy}") 468 | with self.lock: 469 | print(f"[{time.time()}] [AsyncTracker] Acquired lock for updating tracking state") 470 | if self.kalman is not None: 471 | self.kalman.predict((dx, dy)) 472 | if self.bbox is not None: 473 | x_min, y_min, x_max, y_max = self.bbox 474 | self.bbox = (int(x_min + dx), int(y_min + dy), int(x_max + dx), int(y_max + dy)) 475 | self.features = new_features 476 | 477 | if self.display_ui: 478 | # Draw tracked points 479 | for pt in good_new: 480 | x, y = pt.ravel() 481 | cv2.circle(frame, (int(x), int(y)), 3, (0, 0, 255), -1) 482 | 483 | # Handle any pending detection updates 484 | with self.lock: 485 | if self.pending_detection is not None: 486 | print(f"[{time.time()}] [AsyncTracker] Pending detection found, processing detection update") 487 | det_frame_index, det_bbox = self.pending_detection 488 | self.pending_detection = None 489 | # Reset state 490 | self.kalman = KalmanFilter2D() 491 | self.bbox = None 492 | self.features = None 493 | detection_found = False 494 | for i, (f_idx, f_bgr, f_gray) in enumerate(self.frame_buffer): 495 | if f_idx == det_frame_index: 496 | print(f"[{time.time()}] [AsyncTracker] Found detection frame at index {f_idx}") 497 | x_min, y_min, x_max, y_max = det_bbox 498 | cx = (x_min + x_max) // 2 499 | cy = (y_min + y_max) // 2 500 | self.kalman = KalmanFilter2D(center=(cx, cy)) 501 | self.bbox = det_bbox 502 | mask = np.zeros_like(f_gray) 503 | mask[y_min:y_max, x_min:x_max] = 255 504 | self.features = cv2.goodFeaturesToTrack( 505 | f_gray, 506 | mask=mask, 507 | maxCorners=20, 508 | qualityLevel=0.1, 509 | minDistance=10 510 | ) 511 | detection_found = True 512 | detection_index = i 513 | break 514 | if detection_found: 515 | print(f"[{time.time()}] [AsyncTracker] Starting backfill from detection_index: {detection_index}") 516 | self._backfill_tracking(detection_index) 
517 | 
518 |         self.frame_index += 1
519 | 
520 |     def _backfill_tracking(self, detection_index):
521 |         """Backfill tracking state from detection frame to current frame"""
522 |         print(f"[{time.time()}] [AsyncTracker] _backfill_tracking started from detection_index={detection_index}")
523 |         for j in range(detection_index + 1, len(self.frame_buffer)):
524 |             prev_idx, prev_bgr, prev_gray = self.frame_buffer[j - 1]
525 |             curr_idx, curr_bgr, curr_gray = self.frame_buffer[j]
526 |             if self.features is not None and len(self.features) > 0:
527 |                 new_features, status, _ = cv2.calcOpticalFlowPyrLK(prev_gray, curr_gray, self.features, None)
528 |                 good_old = self.features[status.flatten() == 1]
529 |                 good_new = new_features[status.flatten() == 1]
530 |                 if len(good_old) > 0:
531 |                     displacement = np.mean(good_new - good_old, axis=0)
532 |                     displacement = np.squeeze(displacement)
533 |                     dx, dy = displacement
534 |                     print(f"[{time.time()}] [AsyncTracker] Backfill displacement at j={j}: dx={dx}, dy={dy}")
535 |                     if self.kalman:
536 |                         self.kalman.predict((dx, dy))
537 |                     if self.bbox is not None:
538 |                         x_min, y_min, x_max, y_max = self.bbox
539 |                         self.bbox = (int(x_min + dx), int(y_min + dy), int(x_max + dx), int(y_max + dy))
540 |                     self.features = new_features
541 |                 else:
542 |                     print(f"[{time.time()}] [AsyncTracker] No good features in backfill at j={j}, breaking")
543 |                     break
544 |             else:
545 |                 print(f"[{time.time()}] [AsyncTracker] No features to track in backfill at j={j}, breaking")
546 |                 break

--------------------------------------------------------------------------------
/constants.py:
--------------------------------------------------------------------------------
1 | CAMERA_WIDTH = 640
2 | CAMERA_HEIGHT = 480
3 | CAMERA_FPS = 30
4 | CAMERA_FRAME_KEY = "robot/camera/frame"
5 | 
6 | 
7 | FLIP_FRAME = False
8 | # these two should be false unless you are using a stereo camera
9 | CROP_TO_MONO = False
10 | CROP_TO_RIGHT_FRAME = False

--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import cv2
3 | import moondream as md
4 | from async_tracking import AsyncTracker
5 | import rerun as rr
6 | import numpy as np
7 | 
8 | def main():
9 |     parser = argparse.ArgumentParser(description="Object tracking demo with Rerun visualization")
10 |     parser.add_argument("--headless", action="store_true",
11 |                         help="Run without any visualization (visualization is enabled by default)")
12 |     args = parser.parse_args()
13 | 
14 |     # Initialize Rerun for visualization
15 |     if not args.headless:
16 |         rr.init("Object Tracking", spawn=True)
17 |         print("Initialized Rerun visualization")
18 |     tracker = cap = None  # defined up front so the finally block can always reference them
19 |     try:
20 |         # Initialize camera
21 |         cap = cv2.VideoCapture(0)
22 |         if not cap.isOpened():
23 |             print("Error: Cannot open webcam")
24 |             return
25 | 
26 |         # Initialize Moondream model
27 |         try:
28 |             with open('api_key.txt', 'r') as f:
29 |                 api_key = f.read().strip()
30 |             model = md.vl(api_key=api_key)
31 |         except FileNotFoundError:
32 |             print("Error: api_key.txt not found. Please create this file with your Moondream API key.")
33 |             return
34 |         except Exception as e:
35 |             print(f"Error initializing Moondream model: {e}")
36 |             return
37 | 
38 |         # Get tracking target
39 |         prompt = input("What object would you like to track? ")
40 | 
41 |         # Create and start tracker
42 |         tracker = AsyncTracker(model, prompt)
43 |         tracker.start()
44 | 
45 |         print("\nPress Ctrl+C to stop tracking")
46 | 
47 |         while True:
48 |             ret, frame = cap.read()
49 |             if not ret:
50 |                 print("Failed to grab frame")
51 |                 break
52 | 
53 |             # Process the frame through the tracker
54 |             tracker.process_frame(frame)
55 | 
56 |             # Get current tracking state and visualize with Rerun
57 |             if not args.headless:
58 |                 state = tracker.get_state()
59 |                 frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
60 | 
61 |                 # Log the base frame
62 |                 rr.log("camera/frame", rr.Image(frame_rgb))
63 | 
64 |                 if state['bbox'] is not None:
65 |                     center = state['kalman']
66 |                     bbox = state['bbox']
67 |                     if center and bbox:
68 |                         # Log tracking information
69 |                         rr.log("camera/tracking/center",
70 |                                rr.Points2D(positions=np.array([center]),
71 |                                            colors=np.array([[255, 0, 0]])))
72 | 
73 |                         # Log bounding box as points
74 |                         bbox_points = np.array([
75 |                             [bbox[0], bbox[1]],  # top-left
76 |                             [bbox[2], bbox[1]],  # top-right
77 |                             [bbox[2], bbox[3]],  # bottom-right
78 |                             [bbox[0], bbox[3]],  # bottom-left
79 |                             [bbox[0], bbox[1]]   # close the box
80 |                         ])
81 |                         rr.log("camera/tracking/bbox",
82 |                                rr.LineStrips2D(bbox_points,
83 |                                                colors=np.array([[0, 255, 0]])))
84 | 
85 |                         # Print tracking info to console
86 |                         print(f"\rTracking at ({center[0]:.1f}, {center[1]:.1f})",
87 |                               end="", flush=True)
88 | 
89 |     except KeyboardInterrupt:
90 |         print("\nShutting down...")
91 |     except Exception as e:
92 |         print(f"\nUnexpected error: {e}")
93 |     finally:
94 |         # Clean up
95 |         print("\nCleaning up...")
96 |         if tracker:
97 |             tracker.stop()
98 |         if cap:
99 |             cap.release()
100 |         if not args.headless:
101 |             rr.disconnect()
102 | 
103 | if __name__ == '__main__':
104 |     main()
105 | 

--------------------------------------------------------------------------------
/servoing_example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import os
3 | import cv2
4 | import zenoh
5 | import json
6 | import time
7 | import rerun as rr
8 | import numpy as np
9 | import argparse
10 | import threading
11 | import moondream as md
12 | import sys
13 | from async_tracking import AsyncTracker, draw_bbox
14 | 
15 | # Global flag to signal program termination
16 | should_exit = False
17 | 
18 | class ZenohPub:
19 |     def __init__(self, prompt):
20 |         # Initialize Zenoh session
21 |         self.session = zenoh.open(zenoh.Config())
22 |         # Publisher for twist (command) messages on topic "robot/cmd"
23 |         self.cmd_pub = self.session.declare_publisher("robot/cmd")
24 |         # Initialize a lock and storage for the latest camera frame.
25 |         self.latest_frame = None
26 |         self.frame_lock = threading.Lock()
27 |         # Subscribe to the camera feed published from the robot (e.g. from webcam_publisher)
28 |         self.frame_sub = self.session.declare_subscriber("robot/camera/frame", self.on_frame)
29 | 
30 |         # Rerun is already initialized in main() (rr.init or rr.connect_tcp),
31 |         # so it is not re-initialized here.
32 |         self.prompt = prompt
33 | 
34 |         # Initialize Moondream model using an API key stored in api_key.txt
35 |         try:
36 |             with open('api_key.txt', 'r') as f:
37 |                 api_key = f.read().strip()
38 |             self.model = md.vl(api_key=api_key)
39 |         except Exception as e:
40 |             print(f"Failed to initialize moondream model: {e}")
41 |             exit(1)
42 | 
43 |         # Create AsyncTracker; detection thread will be started in run() to update tracking state concurrently.
44 |         self.tracker = AsyncTracker(self.model, self.prompt, detection_interval=2.0, display_ui=False)
45 | 
46 |     def on_frame(self, sample):
47 |         """Callback for Zenoh subscriber: decodes the frame and stores it."""
48 |         try:
49 |             np_arr = np.frombuffer(sample.payload.to_bytes(), np.uint8)
50 |             frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
51 |             with self.frame_lock:
52 |                 self.latest_frame = frame
53 |         except Exception as e:
54 |             print(f"Error processing camera frame: {e}")
55 | 
56 |     def compute_twist(self, state, frame_width, loop_rate):
57 |         """
58 |         Compute a twist command based on the tracking state.
59 |         We compute the error between the target's x position (from the Kalman filter)
60 |         and the center of the frame, then compute an angular velocity to steer toward the target.
61 |         Linear velocity is left at zero in this example (loop_rate is currently unused).
62 |         """
63 |         if state['kalman'] is None:
64 |             return {'x': 0.0, 'theta': 0.0}
65 |         target_x = state['kalman'][0]
66 |         center_x = frame_width / 2.0
67 |         error_x = target_x - center_x
68 |         gain = 0.005
69 |         max_angular_z = 0.35
70 |         angular_z = -gain * error_x
71 |         angular_z = min(angular_z, max_angular_z)
72 |         angular_z = max(angular_z, -max_angular_z)
73 |         linear_x = 0.0
74 |         return {'x': float(linear_x), 'theta': float(angular_z)}
75 | 
76 |     def run(self):
77 |         """Main loop: processes incoming frames, updates tracking, computes twist, and publishes it."""
78 |         try:
79 |             print("Running zenoh publisher for tracking and twist commands.")
80 |             # Start the AsyncTracker's detection thread
81 |             self.tracker.start()
82 |             prev_loop_time = time.time()
83 |             while not should_exit:  # Check the global flag
84 |                 current_time = time.time()
85 |                 dt = current_time - prev_loop_time
86 |                 prev_loop_time = current_time
87 |                 loop_rate = 1.0 / dt if dt > 0 else 0.0
88 | 
89 |                 frame = None
90 |                 with self.frame_lock:
91 |                     if self.latest_frame is not None:
92 |                         frame = self.latest_frame.copy()
93 |                         self.latest_frame = None
94 |                 if frame is None:
95 |                     time.sleep(0.01)
96 |                     continue
97 | 
98 |                 # Process frame; AsyncTracker's detection thread updates state concurrently
99 |                 self.tracker.process_frame(frame)
100 |                 state = self.tracker.get_state()
101 | 
102 |                 if state['bbox'] is not None and state['kalman'] is not None:
103 |                     frame_width = frame.shape[1]
104 |                     twist_cmd = self.compute_twist(state, frame_width, loop_rate)
105 |                     print(f"Loop rate: {loop_rate:.2f} Hz")
106 |                     print(f"Tracking: center={state['kalman']}, bbox={state['bbox']}")
107 |                     print(f"Twist command: {twist_cmd}")
108 |                     self.cmd_pub.put(json.dumps(twist_cmd))
109 | 
110 |                     # Draw bounding box on frame for visualization
111 |                     x_min, y_min, x_max, y_max = state['bbox']
112 |                     w = x_max - x_min
113 |                     h = y_max - y_min
114 |                     center = state['kalman']
115 |                     new_bbox = (int(center[0] - w/2), int(center[1] - h/2), int(center[0] + w/2), int(center[1] + h/2))
116 |                     frame = draw_bbox(frame, new_bbox)
117 |                 else:
118 |                     twist_cmd = {'x': 0.0, 'theta': 0.0}
119 |                     print("No valid tracking, publishing zero twist command")
120 |                     self.cmd_pub.put(json.dumps(twist_cmd))
121 | 
122 |                 frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
123 |                 rr.log("camera/frame", rr.Image(frame_rgb))
124 | 
125 |                 time.sleep(1.0 / 30.0)
126 |         except KeyboardInterrupt:
127 |             print("Shutting down zenoh publisher.")
128 |         finally:
129 |             self.session.close()
130 |             rr.disconnect()
131 | 
132 | 
133 | def timeout_handler():
134 |     """Function to handle timeout by setting the global exit flag"""
135 |     global should_exit
136 |     print("Timeout reached, shutting down...")
137 |     should_exit = True
138 | 
139 | 
140 | def main():
141 |     parser = argparse.ArgumentParser(
142 |         description="Zenoh publisher for object tracking and twist command generation using Moondream and AsyncTracker"
143 |     )
144 |     parser.add_argument("--prompt", type=str, required=True, help="Target object prompt for tracking")
145 |     parser.add_argument("--timeout", type=float, default=None, help="Timeout in seconds after which the program will exit")
146 |     parser.add_argument("--rerun-addr", type=str, help="Address of existing Rerun instance to connect to (e.g. '127.0.0.1:9876')")
147 |     args = parser.parse_args()
148 | 
149 |     print(f"Working directory: {os.getcwd()}")
150 | 
151 |     # Initialize Rerun visualization
152 |     if args.rerun_addr:
153 |         print(f"Connecting to existing Rerun instance at {args.rerun_addr}")
154 |         rr.connect_tcp(args.rerun_addr)
155 |     else:
156 |         print("Spawning new Rerun instance")
157 |         rr.init("Object Tracking", spawn=True)
158 | 
159 |     tracker = ZenohPub(args.prompt)
160 | 
161 |     # Set up timeout if specified
162 |     if args.timeout:
163 |         print(f"Running with {args.timeout} second timeout")
164 |         # First set a timer that will send a zero command shortly before the actual timeout
165 |         safety_margin = min(1.0, args.timeout * 0.1)  # 10% of timeout or 1 second, whichever is smaller
166 |         safety_timer = threading.Timer(args.timeout - safety_margin,
167 |                                        lambda: tracker.cmd_pub.put(json.dumps({'x': 0.0, 'theta': 0.0})))
168 |         safety_timer.daemon = True
169 |         safety_timer.start()
170 | 
171 |         # Then set the actual timeout timer
172 |         timer = threading.Timer(args.timeout, timeout_handler)
173 |         timer.daemon = True
174 |         timer.start()
175 | 
176 |     try:
177 |         tracker.run()
178 |     except KeyboardInterrupt:
179 |         print("Sending zero command to stop the robot...")
180 |         tracker.cmd_pub.put(json.dumps({'x': 0.0, 'theta': 0.0}))
181 |         # Small delay to ensure the command is sent
182 |         time.sleep(0.2)
183 | 
184 |         print("Interrupted by user, shutting down...")
185 |     finally:
186 |         # Ensure we set the exit flag to stop any running threads
187 |         global should_exit
188 |         should_exit = True
189 | 
190 | 
191 | if __name__ == "__main__":
192 |     main()
193 | 

--------------------------------------------------------------------------------
/tracking_publisher.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | 
3 | import cv2
4 | import zenoh
5 | import json
6 | import time
7 | import rerun as rr
8 | import numpy as np
9 | import argparse
10 | import threading
11 | import moondream as md
12 | from async_tracking import AsyncTracker, draw_bbox
13 | 
14 | 
15 | class TrackingPublisher:
16 |     def __init__(self, prompt):
17 |         # Initialize Zenoh session
18 |         self.session = zenoh.open(zenoh.Config())
19 |         # Publisher for object position messages
20 |         self.pos_pub = self.session.declare_publisher("tracking/position")
21 |         # Initialize a lock and storage for the latest camera frame
22 |         self.latest_frame = None
23 |         self.frame_lock = threading.Lock()
24 |         # Subscribe to the camera feed
25 |         self.frame_sub = self.session.declare_subscriber("robot/camera/frame", self.on_frame)
26 | 
27 |         # Initialize Rerun visualization
28 |         rr.init("Object Tracking", spawn=True)
29 | 
30 |         # Initialize Moondream model using an API key stored in api_key.txt
31 |         try:
32 |             with open('api_key.txt', 'r') as f:
33 |                 api_key = f.read().strip()
34 |             self.model = md.vl(api_key=api_key)
35 |         except Exception as e:
36 |             print(f"Failed to initialize moondream model: {e}")
37 |             exit(1)
38 | 39 | self.prompt = prompt 40 | # Create AsyncTracker with 2 second detection interval 41 | self.tracker = AsyncTracker(self.model, self.prompt, detection_interval=2.0, display_ui=False) 42 | 43 | def on_frame(self, sample): 44 | """Callback for Zenoh subscriber: decodes the frame and stores it.""" 45 | try: 46 | np_arr = np.frombuffer(sample.payload.to_bytes(), np.uint8) 47 | frame = cv2.imdecode(np_arr, cv2.IMREAD_COLOR) 48 | with self.frame_lock: 49 | self.latest_frame = frame 50 | except Exception as e: 51 | print(f"Error processing camera frame: {e}") 52 | 53 | def run(self): 54 | """Main loop: processes incoming frames, updates tracking, and publishes position.""" 55 | try: 56 | print("Running tracking publisher.") 57 | # Start the AsyncTracker's detection thread 58 | self.tracker.start() 59 | prev_loop_time = time.time() 60 | 61 | while True: 62 | current_time = time.time() 63 | dt = current_time - prev_loop_time 64 | prev_loop_time = current_time 65 | loop_rate = 1.0 / dt if dt > 0 else 0.0 66 | 67 | frame = None 68 | with self.frame_lock: 69 | if self.latest_frame is not None: 70 | frame = self.latest_frame.copy() 71 | self.latest_frame = None 72 | if frame is None: 73 | time.sleep(0.01) 74 | continue 75 | 76 | # Process frame; AsyncTracker's detection thread updates state concurrently 77 | self.tracker.process_frame(frame) 78 | state = self.tracker.get_state() 79 | 80 | if state['bbox'] is not None and state['kalman'] is not None: 81 | # Get frame dimensions for normalized coordinates 82 | frame_height, frame_width = frame.shape[:2] 83 | 84 | # Extract position information 85 | center_x, center_y = state['kalman'] 86 | x_min, y_min, x_max, y_max = state['bbox'] 87 | width = x_max - x_min 88 | height = y_max - y_min 89 | 90 | # Create position message with normalized coordinates 91 | pos_msg = { 92 | 'x': float(center_x / frame_width), 93 | 'y': float(center_y / frame_height), 94 | 'width': float(width / frame_width), 95 | 'height': float(height / frame_height), 96 | 'timestamp': time.time() 97 | } 98 | 99 | print(f"Loop rate: {loop_rate:.2f} Hz") 100 | print(f"Publishing position: {pos_msg}") 101 | self.pos_pub.put(json.dumps(pos_msg)) 102 | 103 | # Draw bounding box on frame for visualization 104 | new_bbox = (int(center_x - width/2), int(center_y - height/2), 105 | int(center_x + width/2), int(center_y + height/2)) 106 | frame = draw_bbox(frame, new_bbox) 107 | else: 108 | print("No valid tracking detected") 109 | 110 | # Visualize the frame with Rerun 111 | frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 112 | rr.log("camera/frame", rr.Image(frame_rgb)) 113 | 114 | time.sleep(1.0 / 30.0) # Cap at 30 FPS 115 | 116 | except KeyboardInterrupt: 117 | print("Shutting down tracking publisher.") 118 | finally: 119 | self.session.close() 120 | rr.disconnect() 121 | 122 | 123 | def main(): 124 | parser = argparse.ArgumentParser( 125 | description="Zenoh publisher for object tracking using Moondream" 126 | ) 127 | parser.add_argument("--prompt", type=str, required=True, 128 | help="Target object prompt for tracking") 129 | args = parser.parse_args() 130 | 131 | tracker = TrackingPublisher(args.prompt) 132 | tracker.run() 133 | 134 | 135 | if __name__ == "__main__": 136 | main() 137 | -------------------------------------------------------------------------------- /webcam_publisher.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import zenoh 3 | from zenoh import Config 4 | import time 5 | 6 | from constants import ( 7 | 
CAMERA_WIDTH, 8 | CAMERA_HEIGHT, 9 | CAMERA_FPS, 10 | CAMERA_FRAME_KEY, 11 | FLIP_FRAME, 12 | CROP_TO_MONO, 13 | CROP_TO_RIGHT_FRAME 14 | ) 15 | 16 | def crop_frame_to_mono(frame): 17 | """ 18 | Crops a stereo frame to a single camera view 19 | Args: 20 | frame: Input stereo frame 21 | Returns: 22 | Cropped monocular frame 23 | """ 24 | # Calculate the midpoint - stereo frames are side by side 25 | mid_x = frame.shape[1] // 2 26 | 27 | if CROP_TO_RIGHT_FRAME: 28 | # Take the right half of the frame 29 | return frame[:, mid_x:] 30 | else: 31 | # Take the left half of the frame 32 | return frame[:, :mid_x] 33 | 34 | def main(): 35 | # Initialize camera capture 36 | cap = cv2.VideoCapture(0) 37 | 38 | # Set camera properties 39 | cap.set(cv2.CAP_PROP_FRAME_WIDTH, CAMERA_WIDTH) 40 | cap.set(cv2.CAP_PROP_FRAME_HEIGHT, CAMERA_HEIGHT) 41 | cap.set(cv2.CAP_PROP_FPS, CAMERA_FPS) 42 | 43 | # Force MJPG format for higher FPS 44 | cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter_fourcc(*'MJPG')) 45 | 46 | if not cap.isOpened(): 47 | print("Error: Could not open webcam.") 48 | return 49 | 50 | # Initialize Zenoh session with config 51 | with zenoh.open(Config()) as z_session: 52 | print("Press Ctrl+C to quit.") 53 | 54 | # For FPS calculation 55 | frame_count = 0 56 | fps_start_time = time.monotonic() 57 | 58 | try: 59 | while True: 60 | frame_start_time = time.monotonic() 61 | 62 | # Capture frame 63 | ret, frame = cap.read() 64 | 65 | if not ret: 66 | print("Failed to grab frame") 67 | break 68 | 69 | # Crop to mono if enabled 70 | if CROP_TO_MONO: 71 | frame = crop_frame_to_mono(frame) 72 | 73 | # Flip frame if enabled 74 | if FLIP_FRAME: 75 | frame = cv2.flip(frame, -1) # -1 flips both horizontally and vertically 76 | 77 | # Publish to Zenoh 78 | success, buffer = cv2.imencode('.jpg', frame) 79 | if success: 80 | z_session.put(CAMERA_FRAME_KEY, buffer.tobytes()) 81 | 82 | # Calculate and maintain FPS 83 | frame_count += 1 84 | if frame_count % 30 == 0: # Print FPS every 30 frames 85 | current_time = time.monotonic() 86 | fps = frame_count / (current_time - fps_start_time) 87 | print(f"FPS: {fps:.1f}") 88 | frame_count = 0 89 | fps_start_time = current_time 90 | 91 | # Small sleep to maintain consistent frame rate 92 | frame_end_time = time.monotonic() 93 | frame_duration = frame_end_time - frame_start_time 94 | sleep_time = max(0, 1.0/CAMERA_FPS - frame_duration) 95 | if sleep_time > 0: 96 | time.sleep(sleep_time) 97 | 98 | except KeyboardInterrupt: 99 | print("\nShutting down...") 100 | finally: 101 | cap.release() 102 | 103 | if __name__ == "__main__": 104 | main() --------------------------------------------------------------------------------