├── .flake8 ├── .github └── workflows │ ├── ci_cd.yml │ └── docs.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CITATION.cff ├── LICENSE ├── README.md ├── assets ├── gemini.png ├── logo_dark.svg ├── logo_light.svg └── screenshot.png ├── docs ├── _overrides │ └── main.html ├── api │ ├── agent.md │ ├── configs.md │ ├── lmm.md │ ├── models.md │ ├── sim.md │ └── tools.md └── index.md ├── examples ├── chat │ ├── .env │ ├── Makefile │ ├── README.md │ ├── app.py │ ├── chat-app │ │ ├── .gitignore │ │ ├── Makefile │ │ ├── ResultVisualizer.tsx │ │ ├── components.json │ │ ├── next-env.d.ts │ │ ├── next.config.ts │ │ ├── package-lock.json │ │ ├── package.json │ │ ├── postcss.config.mjs │ │ ├── public │ │ │ ├── file.svg │ │ │ ├── globe.svg │ │ │ ├── next.svg │ │ │ ├── vercel.svg │ │ │ └── window.svg │ │ ├── src │ │ │ ├── app │ │ │ │ ├── favicon.ico │ │ │ │ ├── fonts │ │ │ │ │ ├── GeistMonoVF.woff │ │ │ │ │ └── GeistVF.woff │ │ │ │ ├── globals.css │ │ │ │ ├── layout.tsx │ │ │ │ └── page.tsx │ │ │ ├── components │ │ │ │ ├── ChatSection.tsx │ │ │ │ ├── GroupedVisualizer.tsx │ │ │ │ ├── ImageVisualizer.tsx │ │ │ │ ├── PolygonDrawer.tsx │ │ │ │ ├── PreviewSection.tsx │ │ │ │ ├── VideoVisualizer.tsx │ │ │ │ ├── types.tsx │ │ │ │ ├── ui │ │ │ │ │ ├── button.tsx │ │ │ │ │ ├── card.tsx │ │ │ │ │ ├── collapsible.tsx │ │ │ │ │ ├── scroll-area.tsx │ │ │ │ │ └── tabs.tsx │ │ │ │ └── utils.tsx │ │ │ └── lib │ │ │ │ └── utils.ts │ │ ├── tailwind.config.ts │ │ └── tsconfig.json │ ├── package-lock.json │ ├── requirements.txt │ ├── run.py │ ├── run.sh │ └── setup.py ├── custom_tools │ ├── README.md │ ├── pid.png │ ├── pid_template.png │ ├── requirements.txt │ ├── run_custom_tool.py │ └── template_match.py ├── mask_app │ ├── README.md │ ├── app.py │ └── requirements.txt └── notebooks │ └── counting_cans.ipynb ├── mkdocs.yml ├── poetry.lock ├── pyproject.toml ├── tests ├── __init__.py ├── integ │ ├── __init__.py │ └── test_tools.py └── unit │ ├── __init__.py │ ├── fixtures.py │ ├── test_lmm.py │ ├── test_meta_tools.py │ ├── test_planner_tools.py │ ├── test_utils.py │ ├── test_vac.py │ └── tools │ ├── test_tools.py │ └── test_video.py ├── uv.lock └── vision_agent ├── .sim_tools ├── df.csv └── embs.npy ├── __init__.py ├── agent ├── README.md ├── __init__.py ├── agent.py ├── vision_agent_coder_prompts_v2.py ├── vision_agent_coder_v2.py ├── vision_agent_planner_prompts_v2.py ├── vision_agent_planner_v2.py ├── vision_agent_prompts_v2.py └── vision_agent_v2.py ├── clients ├── __init__.py └── http.py ├── configs ├── __init__.py ├── anthropic_config.py ├── config.py └── openai_config.py ├── fonts ├── __init__.py └── default_font_ch_en.ttf ├── lmm ├── __init__.py └── lmm.py ├── models ├── __init__.py ├── agent_types.py ├── lmm_types.py └── tools_types.py ├── sim ├── __init__.py └── sim.py ├── tools ├── __init__.py ├── meta_tools.py ├── planner_tools.py ├── prompts.py └── tools.py └── utils ├── __init__.py ├── agent.py ├── exceptions.py ├── execute.py ├── image_utils.py ├── tools.py ├── tools_doc.py ├── video.py └── video_tracking.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | extend-ignore = E501,E203 3 | max-line-length = 88 4 | max-complexity = 15 5 | per-file-ignores = __init__.py:F401 6 | -------------------------------------------------------------------------------- /.github/workflows/ci_cd.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | 
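# Run the test matrix on every push to main and on every pull request that targets main;
# the release job further down only publishes from pushes to main.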
branches: [ main ] 8 | 9 | env: 10 | VISION_AGENT_API_KEY: ${{ secrets.VISION_AGENT_API_KEY }} 11 | ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }} 12 | GOOGLE_API_KEY: ${{ secrets.GOOGLE_API_KEY }} 13 | OPENAI_API_KEY: 123test 14 | PYTHONUTF8: 1 15 | 16 | jobs: 17 | unit_test: 18 | name: Test 19 | strategy: 20 | matrix: 21 | python-version: [3.9, 3.11] 22 | os: [ ubuntu-22.04, windows-2022, macos-14 ] 23 | runs-on: ${{ matrix.os }} 24 | env: 25 | RUNTIME_TAG: ci_job 26 | steps: 27 | - uses: actions/checkout@v3 28 | - uses: actions/setup-python@v4 29 | with: 30 | python-version: ${{ matrix.python-version }} 31 | - name: Install uv 32 | shell: bash 33 | run: | 34 | pip install uv 35 | uv --version 36 | - name: Print Python environment information 37 | run: | 38 | uv run which python 39 | uv run python --version 40 | uv run pip --version 41 | - name: Install dependencies 42 | run: | 43 | # Install main dependencies first so we can see their size 44 | uv sync --all-extras 45 | - name: Linting 46 | run: | 47 | # stop the build if there are Python syntax errors or undefined names 48 | uv run flake8 . --exclude .venv,examples,tests --count --show-source --statistics 49 | - name: Check Format 50 | run: | 51 | uv run black --check --diff --color vision_agent/ 52 | - name: Type Checking 53 | run: | 54 | uv run mypy vision_agent 55 | - name: Test with pytest 56 | run: | 57 | uv run pytest -s -vvv tests/unit 58 | 59 | integ_test: 60 | name: Integration Test 61 | runs-on: ubuntu-22.04 62 | env: 63 | RUNTIME_TAG: ci_job 64 | steps: 65 | - uses: actions/checkout@v3 66 | - uses: actions/setup-python@v4 67 | with: 68 | python-version: 3.11 69 | - name: Install uv 70 | shell: bash 71 | run: | 72 | pip install uv 73 | uv --version 74 | - name: Print Python environment information 75 | run: | 76 | uv run which python 77 | uv run python --version 78 | uv run pip --version 79 | - name: Install dependencies 80 | run: | 81 | # Install main dependencies first so we can see their size 82 | uv sync --all-extras 83 | - name: Test with pytest 84 | run: | 85 | uv run pytest -v tests/integ 86 | 87 | release: 88 | name: Release 89 | needs: unit_test 90 | # https://github.community/t/how-do-i-specify-job-dependency-running-in-another-workflow/16482 91 | if: github.event_name == 'push' && github.ref == 'refs/heads/main' && !contains(github.event.head_commit.message, 'chore(release):') && !contains(github.event.head_commit.message, '[skip release]') 92 | runs-on: ubuntu-latest 93 | steps: 94 | - uses: actions/setup-python@v4 95 | with: 96 | python-version: 3.10.11 97 | - name: Install uv 98 | shell: bash 99 | run: | 100 | pip install uv 101 | uv --version 102 | - name: Checkout code 103 | uses: actions/checkout@v3 104 | with: 105 | token: ${{ secrets.GH_TOKEN }} 106 | - name: setup git config 107 | run: | 108 | git config user.name "GitHub Actions Bot" 109 | git config user.email "yazhou.cao@landing.ai" 110 | - name: Bump up version 111 | run: | 112 | current_version=$(uvx --from=toml-cli toml get --toml-path=pyproject.toml project.version) 113 | IFS='.' 
read -r major minor patch <<< "$current_version" 114 | patch=$((patch + 1)) 115 | new_version="${major}.${minor}.${patch}" 116 | uvx --from=toml-cli toml set --toml-path=pyproject.toml project.version "$new_version" 117 | git add pyproject.toml 118 | git commit -m "[skip ci] chore(release): ${new_version}" 119 | git push -f 120 | - name: Publish to PyPI 121 | run: | 122 | uv build 123 | uv publish --token ${{ secrets.PYPI_TOKEN }} 124 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: publish-doc 2 | 3 | # build the documentation whenever there are new commits on main 4 | on: 5 | push: 6 | branches: 7 | - main 8 | 9 | # security: restrict permissions for CI jobs. 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | # Build the documentation and upload the static HTML files as an artifact. 15 | build: 16 | runs-on: ubuntu-latest 17 | steps: 18 | - uses: actions/checkout@v3 19 | - uses: actions/setup-python@v4 20 | with: 21 | python-version: 3.10.11 22 | - name: Install UV 23 | run: | 24 | pip install uv 25 | 26 | - name: Install dependencies 27 | run: | 28 | uv sync --all-extras 29 | uv add mkdocs mkdocs-material mkdocstrings[python] 30 | 31 | - run: mkdir -p docs-build 32 | - run: uv run mkdocs build -f mkdocs.yml -d docs-build/ 33 | 34 | - uses: actions/upload-pages-artifact@v3 35 | with: 36 | path: docs-build/ 37 | 38 | # Deploy the artifact to GitHub pages. 39 | # This is a separate job so that only actions/deploy-pages has the necessary permissions. 40 | deploy: 41 | needs: build 42 | runs-on: ubuntu-latest 43 | permissions: 44 | pages: write 45 | id-token: write 46 | environment: 47 | name: github-pages 48 | url: ${{ steps.deployment.outputs.page_url }} 49 | steps: 50 | - id: deployment 51 | uses: actions/deploy-pages@v4 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Env files 11 | .env 12 | 13 | # Precompiled Headers 14 | *.gch 15 | *.pch 16 | 17 | # Libraries 18 | *.lib 19 | *.a 20 | *.la 21 | *.lo 22 | 23 | # Shared objects (inc. 
Windows DLLs) 24 | *.dll 25 | *.so 26 | *.so.* 27 | *.dylib 28 | 29 | # Executables 30 | *.exe 31 | *.out 32 | *.app 33 | *.i*86 34 | *.x86_64 35 | *.hex 36 | 37 | # Debug files 38 | *.dSYM/ 39 | *.su 40 | 41 | # Mac files 42 | .DS_Store 43 | .DS_STORE 44 | 45 | # Old HG stuff 46 | .hg 47 | .hgignore 48 | .hgtags 49 | 50 | .git 51 | __pycache__ 52 | .ipynb_checkpoints 53 | */__pycache__ 54 | */.ipynb_checkpoints 55 | .local 56 | .jupyter 57 | .ipython 58 | */.terraform 59 | terraform.* 60 | .terraform.* 61 | shinobi-dvr/* 62 | .vscode/ 63 | 64 | # mypy 65 | .mypy_cache/* 66 | 67 | # Distribution / packaging 68 | .Python 69 | build/ 70 | develop-eggs/ 71 | dist/ 72 | downloads/ 73 | eggs/ 74 | .eggs/ 75 | lib/ 76 | lib64/ 77 | parts/ 78 | sdist/ 79 | var/ 80 | wheels/ 81 | pip-wheel-metadata/ 82 | share/python-wheels/ 83 | *.egg-info/ 84 | .installed.cfg 85 | *.egg 86 | MANIFEST 87 | 88 | # Output from various tools 89 | examples/output 90 | tests/output 91 | docs-build 92 | site 93 | 94 | # Local or WIP files 95 | local/ 96 | 97 | vision-agent-benchmark/ 98 | vision_agent/tools/suggestion.py 99 | vision_agent/agent/visual_design_patterns.py 100 | */node_modules -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/psf/black 3 | rev: 24.4.2 4 | hooks: 5 | - id: black 6 | language_version: python3.9 7 | - repo: https://github.com/pycqa/flake8 8 | rev: 7.0.0 9 | hooks: 10 | - id: flake8 11 | -------------------------------------------------------------------------------- /CITATION.cff: -------------------------------------------------------------------------------- 1 | cff-version: 1.2.0 2 | message: "If you use this software, please cite it as below." 
3 | authors: 4 | - family-names: "Laird" 5 | given-names: "Dillon" 6 | - family-names: "Jagadeesan" 7 | given-names: "Shankar" 8 | - family-names: "Cao" 9 | given-names: "Yazhou" 10 | - family-names: "Ng" 11 | given-names: "Andrew" 12 | title: "Vision Agent" 13 | version: 0.2 14 | date-released: 2024-02-12 15 | url: "https://github.com/landing-ai/vision-agent" 16 | -------------------------------------------------------------------------------- /assets/gemini.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/landing-ai/vision-agent/0539e545078a49dec422fba5656b80d6b3734197/assets/gemini.png -------------------------------------------------------------------------------- /assets/screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/landing-ai/vision-agent/0539e545078a49dec422fba5656b80d6b3734197/assets/screenshot.png -------------------------------------------------------------------------------- /docs/_overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} 2 | 3 | {% block footer %} 4 | {{ super() }} 5 | {% endblock %} -------------------------------------------------------------------------------- /docs/api/agent.md: -------------------------------------------------------------------------------- 1 | ::: vision_agent.agent.agent.Agent 2 | 3 | ::: vision_agent.agent.vision_agent_v2.VisionAgentV2 4 | 5 | ::: vision_agent.agent.vision_agent_coder_v2.VisionAgentCoderV2 6 | 7 | ::: vision_agent.agent.vision_agent_planner_v2.VisionAgentPlannerV2 -------------------------------------------------------------------------------- /docs/api/configs.md: -------------------------------------------------------------------------------- 1 | ::: vision_agent.configs.Config 2 | -------------------------------------------------------------------------------- /docs/api/lmm.md: -------------------------------------------------------------------------------- 1 | ::: vision_agent.lmm.OpenAILMM 2 | 3 | ::: vision_agent.lmm.AzureOpenAILMM 4 | 5 | ::: vision_agent.lmm.OllamaLMM 6 | 7 | ::: vision_agent.lmm.AnthropicLMM 8 | 9 | ::: vision_agent.lmm.GoogleLMM 10 | -------------------------------------------------------------------------------- /docs/api/models.md: -------------------------------------------------------------------------------- 1 | ::: vision_agent.models.AgentMessage 2 | 3 | ::: vision_agent.models.CodeContext 4 | 5 | ::: vision_agent.models.ErrorContext 6 | 7 | ::: vision_agent.models.InteractionContext 8 | 9 | ::: vision_agent.models.PlanContext 10 | 11 | ::: vision_agent.models.Message 12 | 13 | ::: vision_agent.models.TextOrImage 14 | 15 | ::: vision_agent.models.BboxInput 16 | 17 | ::: vision_agent.models.BboxInputBase64 18 | 19 | ::: vision_agent.models.BoundingBoxes 20 | 21 | ::: vision_agent.models.Florence2FtRequest 22 | 23 | ::: vision_agent.models.JobStatus 24 | 25 | ::: vision_agent.models.ODResponseData 26 | 27 | ::: vision_agent.models.PromptTask 28 | -------------------------------------------------------------------------------- /docs/api/sim.md: -------------------------------------------------------------------------------- 1 | ::: vision_agent.sim.AzureSim 2 | 3 | ::: vision_agent.sim.OllamaSim 4 | 5 | ::: vision_agent.sim.Sim 6 | 7 | ::: vision_agent.sim.StellaSim -------------------------------------------------------------------------------- /docs/api/tools.md:
-------------------------------------------------------------------------------- 1 | ::: vision_agent.tools 2 | 3 | ::: vision_agent.tools.tools 4 | -------------------------------------------------------------------------------- /docs/index.md: -------------------------------------------------------------------------------- 1 | --8<-- "README.md:26" 2 | -------------------------------------------------------------------------------- /examples/chat/.env: -------------------------------------------------------------------------------- 1 | PORT_BACKEND=8000 2 | PORT_FRONTEND=3000 3 | DEBUG_HIL=false -------------------------------------------------------------------------------- /examples/chat/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: setup run 2 | 3 | setup: 4 | pip install -r requirements.txt 5 | cd chat-app && make && cd .. 6 | 7 | run: 8 | ./run.sh 9 | -------------------------------------------------------------------------------- /examples/chat/README.md: -------------------------------------------------------------------------------- 1 | # VisionAgentV2 Example App 2 | 3 | This is an example application to demonstrate how to run VisionAgentV2 locally. 4 | It only works with the **V2 version** of VisionAgent and is mainly used for debugging — expect to find bugs and issues. 5 | 6 | ![screenshot](https://github.com/landing-ai/vision-agent/blob/main/assets/screenshot.png?raw=true) 7 | 8 | ## Prerequisites 9 | 10 | - Python 3.9 or higher 11 | - Node.js 18 or higher 12 | - npm (comes with Node.js) 13 | 14 | ## Quick Start 15 | 16 | ### 1. Setup 17 | 18 | #### On Windows (PowerShell) 19 | 20 | Run the setup script: 21 | 22 | ```powershell 23 | python setup.py 24 | ``` 25 | 26 | #### On Linux/macOS (with Make) 27 | 28 | ```bash 29 | make setup 30 | ``` 31 | 32 | ### 2. Run the App 33 | 34 | #### On Windows (PowerShell) 35 | 36 | ```powershell 37 | python run.py 38 | ``` 39 | 40 | #### On Linux/macOS (with Make) 41 | 42 | ```bash 43 | make run 44 | ``` 45 | 46 | This will: 47 | - Launch the FastAPI backend 48 | - Start the React frontend 49 | - Open your browser to the application 50 | - Handle proper cleanup when you press Ctrl+C 51 | 52 | ## Human-in-the-loop Mode 53 | 54 | To enable human-in-the-loop support: 55 | 56 | 1. Open `.env` 57 | 2. Set: 58 | ```bash 59 | DEBUG_HIL=true 60 | ``` 61 | 62 | **Note:** Currently, only **object detection** and **segmentation** visualizations are supported. 63 | 64 | ## Configuration 65 | 66 | ### Changing Ports 67 | 68 | To modify the frontend or backend port: 69 | 70 | 1. Open `.env` 71 | 2.
Change the `PORT_BACKEND` or `PORT_FRONTEND` variables: 72 | ```bash 73 | PORT_BACKEND = 8000 # Change to your preferred port 74 | PORT_FRONTEND = 3000 # Change to your preferred port 75 | ``` 76 | 77 | ## Troubleshooting 78 | 79 | - **Port conflicts**: The run script will attempt to free ports if they're already in use, but if not, either use another port or kill the process that is currently running on the conflicting port (make sure you know what is running on this port before killing it) 80 | - **Services not starting**: Verify you have the prerequisites installed and ran the setup 81 | - **Browser doesn't open**: Manually navigate to http://localhost:3000 (or whatever your frontend port is) 82 | - **Constant string of messages saying connection rejected/closed**: Check that you do not have multiple tabs open to http://localhost:3000 (or whatever your frontend port is) 83 | 84 | ## Support 85 | 86 | For issues and questions, please file an issue on the GitHub repository, or come ask questions in our Discord: 87 | 88 | [![](https://dcbadge.vercel.app/api/server/wPdN8RCYew?compact=true&style=flat)](https://discord.gg/wPdN8RCYew) 89 | -------------------------------------------------------------------------------- /examples/chat/app.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import base64 3 | import tempfile 4 | from typing import Any, Dict, List, Optional 5 | 6 | import cv2 7 | import httpx 8 | import numpy as np 9 | from fastapi import BackgroundTasks, FastAPI, WebSocket, WebSocketDisconnect 10 | from fastapi.middleware.cors import CORSMiddleware 11 | from pydantic import BaseModel 12 | 13 | from vision_agent.agent import VisionAgentV2 14 | from vision_agent.models import AgentMessage 15 | from vision_agent.lmm import AnthropicLMM 16 | from vision_agent.utils.execute import CodeInterpreterFactory 17 | 18 | from dotenv import load_dotenv 19 | import os 20 | 21 | PORT_FRONTEND = os.getenv("PORT_FRONTEND") 22 | DEBUG_HIL = os.getenv("DEBUG_HIL") 23 | 24 | app = FastAPI() 25 | 26 | # CORS config 27 | app.add_middleware( 28 | CORSMiddleware, 29 | allow_origins=[f"http://localhost:{PORT_FRONTEND}"], 30 | allow_credentials=True, 31 | allow_methods=["*"], 32 | allow_headers=["*"], 33 | ) 34 | 35 | # Single WebSocket client tracking 36 | active_client: Optional[WebSocket] = None 37 | active_client_lock = asyncio.Lock() 38 | 39 | # Add a global flag to track if processing should be canceled 40 | processing_canceled = False 41 | processing_canceled_lock = asyncio.Lock() 42 | 43 | async def _async_update_callback(message: Dict[str, Any]): 44 | global processing_canceled 45 | 46 | # Check if processing has been canceled 47 | async with processing_canceled_lock: 48 | if processing_canceled: 49 | # Skip sending updates if processing has been canceled 50 | return 51 | 52 | # Try to send message to active WebSocket client 53 | async with active_client_lock: 54 | if active_client: 55 | try: 56 | await active_client.send_json(message) 57 | except Exception: 58 | print("Client disconnected unexpectedly.") 59 | else: 60 | print("No active client to send to.") 61 | 62 | 63 | def update_callback(message: Dict[str, Any]): 64 | # Needed for non-async context 65 | loop = asyncio.new_event_loop() 66 | asyncio.set_event_loop(loop) 67 | loop.run_until_complete(_async_update_callback(message)) 68 | loop.close() 69 | 70 | 71 | # Agent setup 72 | if DEBUG_HIL: 73 | agent = VisionAgentV2( 74 | verbose=True, 75 | update_callback=update_callback, 76 | 
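# hil=True enables human-in-the-loop mode, and the interpreter created below
# uses non_exiting=True, presumably so the same kernel persists across the
# interactive feedback turns. Note: os.getenv returns a raw string, so any
# non-empty DEBUG_HIL value (including "false") is truthy and selects this
# branch.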
hil=True, 77 | ) 78 | code_interpreter = CodeInterpreterFactory.new_instance(non_exiting=True) 79 | else: 80 | agent = VisionAgentV2( 81 | agent=AnthropicLMM(model_name="claude-3-7-sonnet-20250219"), 82 | verbose=True, 83 | update_callback=update_callback, 84 | ) 85 | code_interpreter = CodeInterpreterFactory.new_instance() 86 | 87 | async def reset_cancellation_flag(): 88 | global processing_canceled 89 | async with processing_canceled_lock: 90 | processing_canceled = False 91 | 92 | 93 | def process_messages_background(messages: List[Dict[str, Any]]): 94 | global processing_canceled 95 | if processing_canceled: 96 | return 97 | 98 | for message in messages: 99 | if "media" in message and message["media"] is None: 100 | del message["media"] 101 | 102 | # Process messages normally (since cancellation is checked in the callback) 103 | 104 | response = agent.chat( 105 | [ 106 | AgentMessage( 107 | role=message["role"], 108 | content=message["content"], 109 | media=message.get("media", None), 110 | ) 111 | for message in messages 112 | ], 113 | code_interpreter=code_interpreter, 114 | ) 115 | 116 | 117 | class Message(BaseModel): 118 | role: str 119 | content: str 120 | media: Optional[List[str]] = None 121 | 122 | 123 | class Detection(BaseModel): 124 | label: str 125 | bbox: List[int] 126 | confidence: float 127 | mask: Optional[List[int]] = None 128 | 129 | 130 | def b64_video_to_frames(b64_video: str) -> List[np.ndarray]: 131 | video_bytes = base64.b64decode( 132 | b64_video.split(",")[1] if "," in b64_video else b64_video 133 | ) 134 | video_frames = [] 135 | with tempfile.NamedTemporaryFile(suffix=".mp4", delete=True) as temp_video: 136 | temp_video.write(video_bytes) 137 | temp_video.flush() 138 | 139 | cap = cv2.VideoCapture(temp_video.name) 140 | while cap.isOpened(): 141 | ret, frame = cap.read() 142 | if not ret: 143 | break 144 | video_frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) 145 | cap.release() 146 | return video_frames 147 | 148 | 149 | @app.post("/chat") 150 | async def chat( 151 | messages: List[Message], background_tasks: BackgroundTasks 152 | ) -> Dict[str, Any]: 153 | # Reset cancellation flag before starting new processing 154 | await reset_cancellation_flag() 155 | 156 | background_tasks.add_task( 157 | process_messages_background, [m.model_dump() for m in messages] 158 | ) 159 | return {"status": "Processing started"} 160 | 161 | 162 | @app.post("/cancel") 163 | async def cancel_processing(): 164 | """Cancel any ongoing message processing.""" 165 | global processing_canceled 166 | async with processing_canceled_lock: 167 | processing_canceled = True 168 | 169 | # Also clear the active websocket if possible 170 | async with active_client_lock: 171 | if active_client: 172 | try: 173 | # Send a cancellation message that the frontend can detect 174 | await active_client.send_json({ 175 | "role": "system", 176 | "content": "Processing canceled by user." 
177 | }) 178 | except Exception: 179 | pass 180 | 181 | return {"status": "Processing canceled"} 182 | 183 | 184 | @app.websocket("/ws") 185 | async def websocket_endpoint(websocket: WebSocket): 186 | global active_client 187 | 188 | # First check if there's already a connection before accepting 189 | async with active_client_lock: 190 | if active_client: 191 | # Don't immediately accept if there's already a connection 192 | # Either reject or queue this connection 193 | await websocket.close(code=1000, reason="Only one connection allowed") 194 | return 195 | 196 | # Accept the connection only if there isn't an active client 197 | await websocket.accept() 198 | active_client = websocket 199 | 200 | try: 201 | while True: 202 | await websocket.receive_json() 203 | except WebSocketDisconnect: 204 | async with active_client_lock: 205 | if active_client == websocket: 206 | active_client = None 207 | 208 | 209 | @app.post("/send_message") 210 | async def send_message(message: Message): 211 | await _async_update_callback(message.model_dump()) -------------------------------------------------------------------------------- /examples/chat/chat-app/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | .next -------------------------------------------------------------------------------- /examples/chat/chat-app/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all 2 | all: setup 3 | 4 | # Target to install dependencies using package-lock.json 5 | .PHONY: install-dependencies 6 | install-dependencies: 7 | npm install --legacy-peer-deps react-syntax-highlighter 8 | npm config set legacy-peer-deps true 9 | npm ci 10 | 11 | # Setup target to run all necessary commands 12 | .PHONY: setup 13 | setup: install-dependencies 14 | -------------------------------------------------------------------------------- /examples/chat/chat-app/ResultVisualizer.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { ChevronLeft, ChevronRight } from "lucide-react"; 4 | import React, { useRef, useState, useEffect } from "react"; 5 | 6 | interface DetectionItem { 7 | request: { 8 | prompts: string | string[]; 9 | confidence: number; 10 | function_name: string; 11 | }; 12 | 13 | response: { 14 | data: Array<{ 15 | label: string; 16 | score: number; 17 | bbox: [number, number, number, number]; 18 | bounding_box: [number, number, number, number]; 19 | mask: { 20 | counts: number[]; 21 | size: number[]; 22 | }; 23 | }>; 24 | }; 25 | files: Array<[string, string]>; 26 | } 27 | 28 | interface VisualizerProps { 29 | detectionResults: DetectionItem[]; 30 | onSubmit?: (functionName: string, boxThreshold: number) => void; 31 | } 32 | 33 | const VisualizerHiL: React.FC = ({ 34 | detectionResults, 35 | onSubmit, 36 | }) => { 37 | const [currentIndex, setCurrentIndex] = useState(0); 38 | const [threshold, setThreshold] = useState(0.05); 39 | const canvasRef = useRef(null); 40 | 41 | const handleNext = () => { 42 | setCurrentIndex((prev) => (prev + 1) % detectionResults.length); 43 | }; 44 | 45 | const handlePrevious = () => { 46 | setCurrentIndex( 47 | (prev) => (prev - 1 + detectionResults.length) % detectionResults.length, 48 | ); 49 | }; 50 | 51 | const handleThresholdChange = (e: React.ChangeEvent) => { 52 | setThreshold(parseFloat(e.target.value)); 53 | }; 54 | 55 | useEffect(() => { 56 | if (!detectionResults || detectionResults.length === 0) return; 57 | 
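// Redraw the canvas whenever the detection results, the selected result
// index, or the score threshold changes; these are exactly the values in
// this effect's dependency array.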
58 | const canvas = canvasRef.current; 59 | if (!canvas) return; 60 | const ctx = canvas.getContext("2d"); 61 | if (!ctx) return; 62 | 63 | const currentResult = detectionResults[currentIndex]; 64 | 65 | const image = new Image(); 66 | image.onload = () => { 67 | canvas.width = image.width; 68 | canvas.height = image.height; 69 | 70 | ctx.clearRect(0, 0, canvas.width, canvas.height); 71 | ctx.drawImage(image, 0, 0); 72 | 73 | currentResult.response.data 74 | .filter((detection) => detection.score >= threshold) 75 | .forEach((detection) => { 76 | // Florence2 compatibility 77 | if (detection.bounding_box) { 78 | detection.bbox = detection.bounding_box; 79 | } 80 | 81 | // Draw mask 82 | if (detection.mask && detection.mask.counts && detection.mask.size) { 83 | const [height, width] = detection.mask.size; 84 | const counts = detection.mask.counts; 85 | 86 | const tempCanvas = document.createElement("canvas"); 87 | tempCanvas.width = width; 88 | tempCanvas.height = height; 89 | const tmpCtx = tempCanvas.getContext("2d"); 90 | if (!tmpCtx) return; 91 | 92 | const bitmap = new Uint8Array(width * height); 93 | let pixelIndex = 0; 94 | let isOne = false; 95 | 96 | for (const count of counts) { 97 | for (let i = 0; i < count; i++) { 98 | if (pixelIndex < bitmap.length) { 99 | // Convert from row-major to column-major order 100 | const x = Math.floor(pixelIndex / height); 101 | const y = pixelIndex % height; 102 | const newIndex = y * width + x; 103 | if (newIndex < bitmap.length) { 104 | bitmap[newIndex] = isOne ? 1 : 0; 105 | } 106 | pixelIndex++; 107 | } 108 | } 109 | isOne = !isOne; 110 | } 111 | 112 | const imageData = tmpCtx.createImageData(width, height); 113 | for (let i = 0; i < bitmap.length; i++) { 114 | const offset = i * 4; 115 | if (bitmap[i] === 1) { 116 | imageData.data[offset] = 255; 117 | imageData.data[offset + 1] = 0; 118 | imageData.data[offset + 2] = 0; 119 | imageData.data[offset + 3] = 170; 120 | } 121 | } 122 | 123 | tmpCtx.putImageData(imageData, 0, 0); 124 | 125 | ctx.save(); 126 | ctx.globalCompositeOperation = "source-over"; 127 | ctx.drawImage(tempCanvas, 0, 0, width, height); 128 | ctx.restore(); 129 | } 130 | 131 | // Draw bounding box 132 | if (detection.bbox) { 133 | const [x1, y1, x2, y2] = detection.bbox; 134 | const width = x2 - x1; 135 | const height = y2 - y1; 136 | 137 | ctx.strokeStyle = "rgba(255, 0, 0, 0.6)"; 138 | ctx.lineWidth = 3; 139 | ctx.strokeRect(x1, y1, width, height); 140 | 141 | ctx.font = "16px Arial"; 142 | const labelText = `${detection.label}: ${detection.score.toFixed( 143 | 2, 144 | )}`; 145 | const textMetrics = ctx.measureText(labelText); 146 | const textHeight = 20; // Approximate height of the text 147 | const padding = 4; 148 | 149 | // Draw semi-transparent background for text 150 | ctx.fillStyle = "rgba(0, 0, 0, 0.5)"; 151 | ctx.fillRect( 152 | x1 - padding, 153 | y1 - textHeight - padding, 154 | textMetrics.width + padding * 2, 155 | textHeight + padding * 2, 156 | ); 157 | 158 | // Draw text 159 | ctx.fillStyle = "white"; 160 | ctx.fillText(labelText, x1, y1 - 5); 161 | } 162 | }); 163 | }; 164 | 165 | image.src = `data:image/png;base64,${currentResult.files[0][1]}`; 166 | }, [detectionResults, currentIndex, threshold]); 167 | 168 | if (!detectionResults || detectionResults.length === 0) { 169 | return
<div>No results to visualize</div>; 170 | } 171 | 172 | const currentResult = detectionResults[currentIndex]; 173 | 174 | return ( 175 |
176 |
177 |

178 | Function: {currentResult.request.function_name} 179 |

180 |

181 | Prompt:{" "} 182 | {Array.isArray(currentResult.request.prompts) 183 | ? currentResult.request.prompts.join(", ") 184 | : currentResult.request.prompts} 185 |

186 | 187 |
188 | 191 | 201 |
202 |
203 | 204 |
205 | {detectionResults.length > 1 && ( 206 | 212 | )} 213 | 214 | {currentResult.files[0][0] === "video" ? ( 215 |
233 | 234 |
235 |

236 | Image {currentIndex + 1} of {detectionResults.length} 237 |

238 |
239 | 240 |
241 | 249 |
250 |
251 | ); 252 | }; 253 | 254 | export { VisualizerHiL }; 255 | -------------------------------------------------------------------------------- /examples/chat/chat-app/components.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://ui.shadcn.com/schema.json", 3 | "style": "default", 4 | "rsc": true, 5 | "tsx": true, 6 | "tailwind": { 7 | "config": "tailwind.config.ts", 8 | "css": "src/app/globals.css", 9 | "baseColor": "neutral", 10 | "cssVariables": true, 11 | "prefix": "" 12 | }, 13 | "aliases": { 14 | "components": "@/components", 15 | "utils": "@/lib/utils", 16 | "ui": "@/components/ui", 17 | "lib": "@/lib", 18 | "hooks": "@/hooks" 19 | } 20 | } -------------------------------------------------------------------------------- /examples/chat/chat-app/next-env.d.ts: -------------------------------------------------------------------------------- 1 | /// 2 | /// 3 | 4 | // NOTE: This file should not be edited 5 | // see https://nextjs.org/docs/app/api-reference/config/typescript for more information. 6 | -------------------------------------------------------------------------------- /examples/chat/chat-app/next.config.ts: -------------------------------------------------------------------------------- 1 | const path = require('path'); 2 | const dotenv = require('dotenv'); 3 | 4 | // Manually load the .env file from the parent directory 5 | dotenv.config({ path: path.resolve(__dirname, '..', '.env') }); 6 | 7 | module.exports = { 8 | reactStrictMode: true, 9 | env: { 10 | PORT_FRONTEND: process.env.PORT_FRONTEND, 11 | PORT_BACKEND: process.env.PORT_BACKEND, 12 | }, 13 | }; -------------------------------------------------------------------------------- /examples/chat/chat-app/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "chat-app", 3 | "version": "0.1.0", 4 | "private": true, 5 | "scripts": { 6 | "dev": "next dev", 7 | "build": "next build", 8 | "start": "next start", 9 | "lint": "next lint" 10 | }, 11 | "dependencies": { 12 | "@radix-ui/react-collapsible": "^1.1.8", 13 | "@radix-ui/react-scroll-area": "^1.2.6", 14 | "@radix-ui/react-slot": "^1.2.0", 15 | "@radix-ui/react-tabs": "^1.1.9", 16 | "@tailwindcss/postcss": "^4.1.4", 17 | "class-variance-authority": "^0.7.1", 18 | "clsx": "^2.1.1", 19 | "concurrently": "^9.1.2", 20 | "dotenv": "^16.5.0", 21 | "lucide-react": "^0.503.0", 22 | "next": "^15.3.1", 23 | "prismjs": "^1.30.0", 24 | "react": "^19.1.0", 25 | "react-dom": "^19.1.0", 26 | "react-markdown": "^10.1.0", 27 | "react-syntax-highlighter": "^15.6.1", 28 | "rehype-highlight": "^7.0.2", 29 | "remark-gfm": "^4.0.1", 30 | "tailwind-merge": "^3.2.0", 31 | "tailwind-scrollbar": "^4.0.2", 32 | "tailwindcss-animate": "^1.0.7", 33 | "zeromq": "^6.4.2" 34 | }, 35 | "devDependencies": { 36 | "@types/node": "^22", 37 | "@types/prismjs": "^1.26.5", 38 | "@types/react": "^19", 39 | "@types/react-dom": "^19", 40 | "@types/react-syntax-highlighter": "^15.5.13", 41 | "eslint": "^9", 42 | "eslint-config-next": "15.3.1", 43 | "postcss": "^8.5.3", 44 | "tailwindcss": "^4.1.4", 45 | "typescript": "^5" 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /examples/chat/chat-app/postcss.config.mjs: -------------------------------------------------------------------------------- 1 | export default { 2 | plugins: { 3 | "@tailwindcss/postcss": {}, 4 | } 5 | } 
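// Tailwind CSS v4 moved its PostCSS integration into the dedicated
// @tailwindcss/postcss package, so it is the only plugin needed here.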
-------------------------------------------------------------------------------- /examples/chat/chat-app/public/file.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/chat/chat-app/public/globe.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/chat/chat-app/public/next.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/chat/chat-app/public/vercel.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/chat/chat-app/public/window.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /examples/chat/chat-app/src/app/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/landing-ai/vision-agent/0539e545078a49dec422fba5656b80d6b3734197/examples/chat/chat-app/src/app/favicon.ico -------------------------------------------------------------------------------- /examples/chat/chat-app/src/app/fonts/GeistMonoVF.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/landing-ai/vision-agent/0539e545078a49dec422fba5656b80d6b3734197/examples/chat/chat-app/src/app/fonts/GeistMonoVF.woff -------------------------------------------------------------------------------- /examples/chat/chat-app/src/app/fonts/GeistVF.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/landing-ai/vision-agent/0539e545078a49dec422fba5656b80d6b3734197/examples/chat/chat-app/src/app/fonts/GeistVF.woff -------------------------------------------------------------------------------- /examples/chat/chat-app/src/app/globals.css: -------------------------------------------------------------------------------- 1 | @import "tailwindcss"; 2 | @tailwind base; 3 | @tailwind components; 4 | @tailwind utilities; 5 | 6 | .markdown > * { 7 | all: revert; 8 | } 9 | 10 | /* For links */ 11 | .markdown a { 12 | color: #3b82f6; /* Tailwind's blue-500 */ 13 | text-decoration: underline; 14 | } 15 | 16 | .markdown a:hover { 17 | color: #1d4ed8; /* Tailwind's blue-700 */ 18 | } 19 | 20 | body { 21 | font-family: Arial, Helvetica, sans-serif; 22 | } 23 | 24 | @layer base { 25 | :root { 26 | --background: 0 0% 100%; 27 | --foreground: 0 0% 3.9%; 28 | --card: 0 0% 100%; 29 | --card-foreground: 0 0% 3.9%; 30 | --popover: 0 0% 100%; 31 | --popover-foreground: 0 0% 3.9%; 32 | --primary: 0 0% 9%; 33 | --primary-foreground: 0 0% 98%; 34 | --secondary: 0 0% 96.1%; 35 | --secondary-foreground: 0 0% 9%; 36 | --muted: 0 0% 96.1%; 37 | --muted-foreground: 0 0% 45.1%; 38 | --accent: 0 0% 96.1%; 39 | --accent-foreground: 0 0% 9%; 40 | --destructive: 0 84.2% 60.2%; 41 | --destructive-foreground: 0 0% 98%; 42 | --border: 0 0% 89.8%; 43 | --input: 0 0% 89.8%; 44 | --ring: 0 0% 3.9%; 45 | --chart-1: 12 76% 61%; 46 | --chart-2: 173 58% 39%; 47 | --chart-3: 197 37% 24%; 48 | --chart-4: 43 74% 66%; 49 | --chart-5: 27 87% 67%; 50 | --radius: 
0.5rem; 51 | } 52 | .dark { 53 | --background: 0 0% 3.9%; 54 | --foreground: 0 0% 98%; 55 | --card: 0 0% 3.9%; 56 | --card-foreground: 0 0% 98%; 57 | --popover: 0 0% 3.9%; 58 | --popover-foreground: 0 0% 98%; 59 | --primary: 0 0% 98%; 60 | --primary-foreground: 0 0% 9%; 61 | --secondary: 0 0% 14.9%; 62 | --secondary-foreground: 0 0% 98%; 63 | --muted: 0 0% 14.9%; 64 | --muted-foreground: 0 0% 63.9%; 65 | --accent: 0 0% 14.9%; 66 | --accent-foreground: 0 0% 98%; 67 | --destructive: 0 62.8% 30.6%; 68 | --destructive-foreground: 0 0% 98%; 69 | --border: 0 0% 14.9%; 70 | --input: 0 0% 14.9%; 71 | --ring: 0 0% 83.1%; 72 | --chart-1: 220 70% 50%; 73 | --chart-2: 160 60% 45%; 74 | --chart-3: 30 80% 55%; 75 | --chart-4: 280 65% 60%; 76 | --chart-5: 340 75% 55%; 77 | } 78 | } 79 | 80 | @layer base { 81 | * { 82 | @reference border-border; 83 | } 84 | body { 85 | @reference bg-background text-foreground; 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /examples/chat/chat-app/src/app/layout.tsx: -------------------------------------------------------------------------------- 1 | import type { Metadata } from "next"; 2 | import localFont from "next/font/local"; 3 | import "./globals.css"; 4 | 5 | const geistSans = localFont({ 6 | src: "./fonts/GeistVF.woff", 7 | variable: "--font-geist-sans", 8 | weight: "100 900", 9 | }); 10 | const geistMono = localFont({ 11 | src: "./fonts/GeistMonoVF.woff", 12 | variable: "--font-geist-mono", 13 | weight: "100 900", 14 | }); 15 | 16 | export const metadata: Metadata = { 17 | title: "Create Next App", 18 | description: "Generated by create next app", 19 | }; 20 | 21 | export default function RootLayout({ 22 | children, 23 | }: Readonly<{ 24 | children: React.ReactNode; 25 | }>) { 26 | return ( 27 | 28 | 31 | {children} 32 | 33 | 34 | ); 35 | } 36 | -------------------------------------------------------------------------------- /examples/chat/chat-app/src/app/page.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { useState } from "react"; 4 | import { ChatSection } from "@/components/ChatSection"; 5 | import { PreviewSection } from "@/components/PreviewSection"; 6 | import { Polygon } from "@/components/PolygonDrawer"; 7 | 8 | export default function Component() { 9 | const [uploadedFile, setUploadedFile] = useState(null); 10 | const handleFileUpload = (file: string) => setUploadedFile(file); 11 | 12 | const [uploadedImage, setUploadedMedia] = useState(null); 13 | const handleMediaUpload = (image: string) => setUploadedMedia(image); 14 | 15 | const [uploadedResult, setUploadedResult] = useState(null); 16 | const handleResultUpload = (result: string) => setUploadedResult(result); 17 | 18 | const [polygons, setPolygons] = useState([]); 19 | const handlePolygonChange = (polygons: Polygon[]) => setPolygons(polygons); 20 | 21 | return ( 22 |
23 | 32 | 38 |
39 | ); 40 | } 41 | -------------------------------------------------------------------------------- /examples/chat/chat-app/src/components/GroupedVisualizer.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import React, { useEffect, useMemo, useState } from "react"; 4 | import { ChevronLeft, ChevronRight, ChevronDown } from "lucide-react"; 5 | import { Detection, DetectionItem } from "./types"; 6 | import { ImageVisualizer } from "./ImageVisualizer"; // Your image canvas component 7 | import { VideoVisualizer } from "./VideoVisualizer"; // Your video visualizer component 8 | 9 | 10 | // --- Group Type --- 11 | // Each group contains all items that share the same function_name. 12 | interface GroupedDetection { 13 | functionName: string; 14 | items: DetectionItem[]; 15 | } 16 | 17 | interface VisualizerProps { 18 | detectionResults: DetectionItem[]; 19 | onSubmit?: (functionName: string, boxThreshold: number) => void; 20 | } 21 | 22 | // --- GroupedVisualizer Component --- 23 | 24 | const GroupedVisualizer: React.FC = ({ 25 | detectionResults, 26 | onSubmit, 27 | }) => { 28 | // 1. Group detectionResults by function_name. 29 | const groups: GroupedDetection[] = useMemo(() => { 30 | const groupMap: Record = {}; 31 | detectionResults.forEach((item) => { 32 | const fn = item.request.function_name; 33 | if (!groupMap[fn]) { 34 | groupMap[fn] = []; 35 | } 36 | groupMap[fn].push(item); 37 | }); 38 | return Object.entries(groupMap).map(([functionName, items]) => ({ 39 | functionName, 40 | items, 41 | })); 42 | }, [detectionResults]); 43 | 44 | // 2. Maintain state for the currently active group (across different function_names) 45 | const [currentGroupIndex, setCurrentGroupIndex] = useState(0); 46 | 47 | // 3. Maintain state for the currently selected image index within each group. 48 | // The key is the group’s function name. 49 | const [selectedIndices, setSelectedIndices] = useState>( 50 | {} 51 | ); 52 | 53 | // When groups change, initialize the selectedIndices for each group to zero. 54 | useEffect(() => { 55 | const initialIndices: Record = {}; 56 | groups.forEach((group) => { 57 | initialIndices[group.functionName] = 0; 58 | }); 59 | setSelectedIndices(initialIndices); 60 | }, [groups]); 61 | 62 | // 4. Global threshold state. 63 | const [threshold, setThreshold] = useState(0.05); 64 | 65 | // 5. Determine the current group and current item. 66 | const currentGroup = groups[currentGroupIndex]; 67 | const currentItem = 68 | currentGroup.items[selectedIndices[currentGroup.functionName] ?? 0]; 69 | 70 | // 6. Navigation handlers: 71 | 72 | // For switching groups (different function_names) 73 | const handlePreviousGroup = () => { 74 | setCurrentGroupIndex((prev) => (prev - 1 + groups.length) % groups.length); 75 | }; 76 | 77 | const handleNextGroup = () => { 78 | setCurrentGroupIndex((prev) => (prev + 1) % groups.length); 79 | }; 80 | 81 | // For cycling images within the same group. 82 | const handleNextImageInGroup = () => { 83 | setSelectedIndices((prev) => { 84 | const currentIndex = prev[currentGroup.functionName] ?? 0; 85 | const nextIndex = (currentIndex + 1) % currentGroup.items.length; 86 | return { ...prev, [currentGroup.functionName]: nextIndex }; 87 | }); 88 | }; 89 | 90 | return ( 91 |
92 | {/* Group Info and Threshold */} 93 |
94 |

95 | Function: {currentGroup.functionName} 96 |

97 |

98 | Prompt:{" "} 99 | {Array.isArray(currentItem.request.prompts) 100 | ? currentItem.request.prompts.join(", ") 101 | : currentItem.request.prompts || currentItem.request.prompt} 102 |

103 |
104 | 107 | setThreshold(parseFloat(e.target.value))} 115 | className="w-full" 116 | /> 117 |
118 |
119 | 120 | {/* Visualization Area */} 121 |
122 | {/* Left/Right Buttons: Navigate between groups (different function_names) */} 123 | {groups.length > 1 && ( 124 | <> 125 | 132 | 139 | 140 | )} 141 | 142 | {/* Render image or video visualizer */} 143 | {currentItem.files[0][0] === "video" ? ( 144 | 151 | ) : ( 152 | 153 | )} 154 | 155 | {/* Down Arrow: Cycle within images/videos of the same group */} 156 | {currentGroup.items.length > 1 && ( 157 | 164 | )} 165 |
166 | 167 | {/* Navigation Info */} 168 |
169 |

170 | Tool {currentGroupIndex + 1} of {groups.length} — Tool Media{" "} 171 | {(selectedIndices[currentGroup.functionName] ?? 0) + 1} of{" "} 172 | {currentGroup.items.length} 173 |

174 |
175 | 176 | {/* Submit/Choose Button */} 177 |
178 | 186 |
187 |
188 | ); 189 | }; 190 | 191 | export { GroupedVisualizer }; 192 | -------------------------------------------------------------------------------- /examples/chat/chat-app/src/components/ImageVisualizer.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import React, { useEffect, useRef } from "react"; 4 | import { Detection, DetectionItem } from "./types"; 5 | import { drawBoundingBox } from "./utils"; 6 | 7 | // (Re-use your Detection and DetectionItem types and drawBoundingBox function here) 8 | 9 | interface ImageVisualizerProps { 10 | detectionItem: DetectionItem; 11 | threshold: number; 12 | } 13 | 14 | const ImageVisualizer: React.FC = ({ 15 | detectionItem, 16 | threshold, 17 | }) => { 18 | const canvasRef = useRef(null); 19 | 20 | useEffect(() => { 21 | // Skip if this is a video. 22 | if (detectionItem.files[0][0] === "video") return; 23 | const canvas = canvasRef.current; 24 | if (!canvas) return; 25 | const ctx = canvas.getContext("2d"); 26 | if (!ctx) return; 27 | 28 | const image = new Image(); 29 | image.onload = () => { 30 | canvas.width = image.width; 31 | canvas.height = image.height; 32 | ctx.clearRect(0, 0, canvas.width, canvas.height); 33 | ctx.drawImage(image, 0, 0); 34 | 35 | if (typeof detectionItem.response.data === "string") { 36 | // Draw response string (for text-based responses). 37 | const fontSize = Math.min(canvas.width, canvas.height) * 0.05; 38 | ctx.font = `${fontSize}px Arial`; 39 | 40 | // Text wrapping configuration 41 | const maxWidth = canvas.width - 40; // Padding on both sides 42 | const lineHeight = fontSize * 1.2; 43 | const padding = 20; 44 | 45 | // Wrap text into lines 46 | const words = detectionItem.response.data.split(' '); 47 | const lines: string[] = []; 48 | let currentLine = words[0]; 49 | 50 | for (let i = 1; i < words.length; i++) { 51 | const testLine = currentLine + ' ' + words[i]; 52 | const metrics = ctx.measureText(testLine); 53 | if (metrics.width > maxWidth) { 54 | lines.push(currentLine); 55 | currentLine = words[i]; 56 | } else { 57 | currentLine = testLine; 58 | } 59 | } 60 | lines.push(currentLine); 61 | 62 | // Calculate background height based on number of lines 63 | const bgHeight = (lines.length * lineHeight) + (padding * 2); 64 | 65 | // Draw background 66 | ctx.fillStyle = "rgba(0, 0, 0, 0.7)"; 67 | ctx.fillRect(10, 10, canvas.width - 20, bgHeight); 68 | 69 | // Draw text lines 70 | ctx.fillStyle = "white"; 71 | lines.forEach((line, i) => { 72 | ctx.fillText(line, padding, padding + (i + 1) * lineHeight); 73 | }); 74 | } else if (Array.isArray(detectionItem.response.data)) { 75 | // For images, assume response.data is an array of Detection. 76 | (detectionItem.response.data as Detection[]) 77 | .filter((detection) => detection.score >= threshold) 78 | .forEach((detection) => { 79 | // Draw mask if available (only for images). 
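// The mask is COCO-style run-length encoding: `counts` holds alternating run
// lengths of 0s and 1s over a column-major (Fortran-order) bitmap of
// `size` = [height, width], so the decoder below converts each linear index
// back to (x, y) before writing into the row-major ImageData buffer.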
80 | if ( 81 | detection.mask && 82 | detection.mask.counts && 83 | detection.mask.size 84 | ) { 85 | const [height, width] = detection.mask.size; 86 | const counts = detection.mask.counts; 87 | 88 | const tempCanvas = document.createElement("canvas"); 89 | tempCanvas.width = width; 90 | tempCanvas.height = height; 91 | const tmpCtx = tempCanvas.getContext("2d"); 92 | if (!tmpCtx) return; 93 | 94 | const bitmap = new Uint8Array(width * height); 95 | let pixelIndex = 0; 96 | let isOne = false; 97 | 98 | for (const count of counts) { 99 | for (let i = 0; i < count; i++) { 100 | if (pixelIndex < bitmap.length) { 101 | // Convert from row-major to column-major order. 102 | const x = Math.floor(pixelIndex / height); 103 | const y = pixelIndex % height; 104 | const newIndex = y * width + x; 105 | if (newIndex < bitmap.length) { 106 | bitmap[newIndex] = isOne ? 1 : 0; 107 | } 108 | pixelIndex++; 109 | } 110 | } 111 | isOne = !isOne; 112 | } 113 | 114 | const imageData = tmpCtx.createImageData(width, height); 115 | for (let i = 0; i < bitmap.length; i++) { 116 | const offset = i * 4; 117 | if (bitmap[i] === 1) { 118 | imageData.data[offset] = 255; 119 | imageData.data[offset + 1] = 0; 120 | imageData.data[offset + 2] = 0; 121 | imageData.data[offset + 3] = 170; 122 | } 123 | } 124 | tmpCtx.putImageData(imageData, 0, 0); 125 | 126 | ctx.save(); 127 | ctx.globalCompositeOperation = "source-over"; 128 | ctx.drawImage(tempCanvas, 0, 0, width, height); 129 | ctx.restore(); 130 | } 131 | 132 | // (If needed, draw the mask here as in your original code.) 133 | drawBoundingBox(ctx, detection); 134 | }); 135 | } 136 | }; 137 | image.src = `data:image/png;base64,${detectionItem.files[0][1]}`; 138 | }, [detectionItem, threshold]); 139 | 140 | return ; 141 | }; 142 | 143 | export { ImageVisualizer }; 144 | -------------------------------------------------------------------------------- /examples/chat/chat-app/src/components/PreviewSection.tsx: -------------------------------------------------------------------------------- 1 | "use client"; 2 | 3 | import { useState } from "react"; 4 | import { ScrollArea } from "@/components/ui/scroll-area"; 5 | import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs"; 6 | import { Card } from "@/components/ui/card"; 7 | import { Prism as SyntaxHighlighter } from "react-syntax-highlighter"; 8 | import { gruvboxLight } from "react-syntax-highlighter/dist/esm/styles/prism"; 9 | import { PolygonDrawer, Polygon } from "@/components/PolygonDrawer"; 10 | 11 | interface PreviewSectionProps { 12 | uploadedMedia: string | null; 13 | uploadedFile: string | null; 14 | uploadedResult: string | null; 15 | onPolygonsChange: (polygons: Polygon[]) => void; 16 | } 17 | 18 | interface File { 19 | name: string; 20 | content: string; 21 | type: "code" | "image"; 22 | } 23 | 24 | export function PreviewSection({ 25 | uploadedMedia, 26 | uploadedFile, 27 | uploadedResult, 28 | onPolygonsChange, 29 | }: PreviewSectionProps) { 30 | return ( 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | Media 41 | 42 | 43 | 44 | 45 | 46 | 47 | Code 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | Result 57 | 58 | 59 | 60 | 61 | 62 |
63 | {uploadedMedia ? ( 64 | 68 | ) : ( 69 |
70 | 71 | 72 | 73 | 74 | 75 |

No media uploaded yet.

76 |

Upload media to begin annotation.

77 |
78 | )} 79 |
80 |
81 |
82 | 83 | 84 | 85 |
86 | {uploadedFile ? ( 87 | 100 | {uploadedFile || ""} 101 | 102 | ) : ( 103 |
104 | 105 | 106 | 107 | 108 |

No code uploaded yet.

109 |

Ask VisionAgent a question and wait for it to generate code.

110 |
111 | )} 112 |
113 |
114 |
115 | 116 | 117 | 118 |
119 | {uploadedResult ? ( 120 | Uploaded 125 | ) : ( 126 |
127 | 128 | 129 | 130 | 131 | 132 | 133 |

No result uploaded yet.

134 |

Results will appear here after processing.

135 |
136 | )} 137 |
138 |
139 |
140 |
141 |
142 | ); 143 | } -------------------------------------------------------------------------------- /examples/chat/chat-app/src/components/VideoVisualizer.tsx: -------------------------------------------------------------------------------- 1 | import { Detection } from "./types"; 2 | import { useRef, useEffect } from "react"; 3 | import { drawBoundingBox } from "./utils"; 4 | 5 | interface VideoVisualizerProps { 6 | videoSrc: string; // Base64 video source (with data URI prefix) 7 | detections: Detection[][] | string; // Allow both array of detections or string response 8 | threshold: number; 9 | fps?: number; 10 | } 11 | 12 | // --- VideoVisualizer Component --- 13 | // This component renders a