├── web_search ├── async_serpapi_client │ ├── __init__.py │ └── async_serpapi_client.py ├── __init__.py ├── web_search.py ├── perplexity.py └── dataforseo.py ├── tests ├── images │ ├── black.jpg │ ├── candle.jpg │ ├── popcorn.jpg │ ├── screw.jpg │ ├── toy_car.jpg │ ├── ua_hat.jpg │ ├── Zoom_Call.jpg │ ├── candyland.jpg │ ├── gym_setup.jpg │ ├── staircase.jpg │ ├── two_beds.jpg │ ├── airpod_case.jpg │ ├── boxing_ring.webp │ ├── dreary_day.jpg │ ├── house_plant.jpg │ ├── living_room.jpg │ ├── parking_sign.jpg │ ├── persian_rug.jpg │ ├── Wangjing_Soho.jpg │ ├── child_artwork.jpg │ ├── chinese_dragon.jpg │ ├── pickled_snake.jpg │ ├── sample_rotated.jpg │ ├── some_old_actor.jpg │ ├── Philz_Coffee_Cup.jpg │ ├── SF_Cask_and_Lark.png │ ├── STEM_card_Chinese.jpg │ ├── agility_ladder.webp │ ├── airpod_case_open.jpg │ ├── boxing_equipment.webp │ ├── cars_behind_wall.jpg │ ├── chinese_sign_1.webp │ ├── chinese_sign_2.webp │ ├── figurine_on_table.jpg │ ├── hp_mouse_512x512.jpg │ ├── motivational_sign.jpg │ ├── small_trash_can.jpg │ ├── thomas_the_train.jpg │ ├── toys_strewn_about.jpg │ ├── winning_gloves.webp │ ├── book_atomic_habits.jpg │ ├── book_viral_justice.jpg │ ├── eggs_banana_avocado.jpg │ ├── flower_arrangement.jpg │ ├── frame_smart_glasses.jpg │ ├── pug_asian_clothing.jpg │ ├── dragon_fruit_in_box.webp │ ├── toy_track_incomplete.jpg │ ├── STEM_card_Chinese_closeup.jpg │ ├── bobak_dressed_for_weather.jpg │ ├── frame_captures │ │ ├── Window_Tree.jpg │ │ ├── Laptop_Discord_1.jpg │ │ ├── Laptop_Discord_2.jpg │ │ ├── Shelves_Blurry_1.jpg │ │ ├── Shelves_Blurry_2.jpg │ │ ├── Window_Overexposed.jpg │ │ ├── test_camera_image1.webp │ │ ├── test_camera_image2.webp │ │ ├── test_camera_image3.webp │ │ ├── test_camera_image4.webp │ │ ├── test_camera_image5.webp │ │ ├── test_camera_image6.webp │ │ ├── test_camera_image7.webp │ │ ├── test_camera_image8.webp │ │ ├── Fingers_Overexposed_1.jpg │ │ ├── Fingers_Overexposed_2.jpg │ │ └── Fingers_Overexposed_3.jpg │ ├── woman_strange_fuzzy_pants.jpeg │ └── colorful_landscape_paintings.jpg ├── frame_camera.json ├── frame_camera_laziness.json ├── tests.json └── benchmark.json ├── generate_image ├── __init__.py ├── generate_image.py └── replicate.py ├── docs └── noa_assistant.drawio.png ├── vision ├── __init__.py ├── vision.py ├── gpt4vision.py ├── claude_vision.py └── utils.py ├── assistant ├── __init__.py ├── assistant.py ├── context.py ├── claude_assistant.py └── gpt_assistant.py ├── models ├── __init__.py ├── token_usage.py └── api.py ├── .gitignore ├── .env.example ├── requirements.txt ├── LICENSE.md ├── load_audio.sh ├── README.md ├── run_benchmark.py └── app.py /web_search/async_serpapi_client/__init__.py: -------------------------------------------------------------------------------- 1 | from .async_serpapi_client import AsyncSerpAPIClient -------------------------------------------------------------------------------- /tests/images/black.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/black.jpg -------------------------------------------------------------------------------- /tests/images/candle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/candle.jpg -------------------------------------------------------------------------------- /tests/images/popcorn.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/popcorn.jpg -------------------------------------------------------------------------------- /tests/images/screw.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/screw.jpg -------------------------------------------------------------------------------- /tests/images/toy_car.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/toy_car.jpg -------------------------------------------------------------------------------- /tests/images/ua_hat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/ua_hat.jpg -------------------------------------------------------------------------------- /generate_image/__init__.py: -------------------------------------------------------------------------------- 1 | from .generate_image import GenerateImage 2 | from .replicate import ReplicateGenerateImage -------------------------------------------------------------------------------- /tests/images/Zoom_Call.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/Zoom_Call.jpg -------------------------------------------------------------------------------- /tests/images/candyland.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/candyland.jpg -------------------------------------------------------------------------------- /tests/images/gym_setup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/gym_setup.jpg -------------------------------------------------------------------------------- /tests/images/staircase.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/staircase.jpg -------------------------------------------------------------------------------- /tests/images/two_beds.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/two_beds.jpg -------------------------------------------------------------------------------- /docs/noa_assistant.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/docs/noa_assistant.drawio.png -------------------------------------------------------------------------------- /tests/images/airpod_case.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/airpod_case.jpg -------------------------------------------------------------------------------- /tests/images/boxing_ring.webp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/boxing_ring.webp -------------------------------------------------------------------------------- /tests/images/dreary_day.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/dreary_day.jpg -------------------------------------------------------------------------------- /tests/images/house_plant.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/house_plant.jpg -------------------------------------------------------------------------------- /tests/images/living_room.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/living_room.jpg -------------------------------------------------------------------------------- /tests/images/parking_sign.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/parking_sign.jpg -------------------------------------------------------------------------------- /tests/images/persian_rug.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/persian_rug.jpg -------------------------------------------------------------------------------- /tests/images/Wangjing_Soho.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/Wangjing_Soho.jpg -------------------------------------------------------------------------------- /tests/images/child_artwork.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/child_artwork.jpg -------------------------------------------------------------------------------- /tests/images/chinese_dragon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/chinese_dragon.jpg -------------------------------------------------------------------------------- /tests/images/pickled_snake.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/pickled_snake.jpg -------------------------------------------------------------------------------- /tests/images/sample_rotated.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/sample_rotated.jpg -------------------------------------------------------------------------------- /tests/images/some_old_actor.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/some_old_actor.jpg -------------------------------------------------------------------------------- /vision/__init__.py: -------------------------------------------------------------------------------- 1 | from .vision import Vision 2 | from 
.gpt4vision import GPT4Vision 3 | from .claude_vision import ClaudeVision -------------------------------------------------------------------------------- /tests/images/Philz_Coffee_Cup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/Philz_Coffee_Cup.jpg -------------------------------------------------------------------------------- /tests/images/SF_Cask_and_Lark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/SF_Cask_and_Lark.png -------------------------------------------------------------------------------- /tests/images/STEM_card_Chinese.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/STEM_card_Chinese.jpg -------------------------------------------------------------------------------- /tests/images/agility_ladder.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/agility_ladder.webp -------------------------------------------------------------------------------- /tests/images/airpod_case_open.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/airpod_case_open.jpg -------------------------------------------------------------------------------- /tests/images/boxing_equipment.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/boxing_equipment.webp -------------------------------------------------------------------------------- /tests/images/cars_behind_wall.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/cars_behind_wall.jpg -------------------------------------------------------------------------------- /tests/images/chinese_sign_1.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/chinese_sign_1.webp -------------------------------------------------------------------------------- /tests/images/chinese_sign_2.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/chinese_sign_2.webp -------------------------------------------------------------------------------- /tests/images/figurine_on_table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/figurine_on_table.jpg -------------------------------------------------------------------------------- /tests/images/hp_mouse_512x512.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/hp_mouse_512x512.jpg -------------------------------------------------------------------------------- /tests/images/motivational_sign.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/motivational_sign.jpg -------------------------------------------------------------------------------- /tests/images/small_trash_can.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/small_trash_can.jpg -------------------------------------------------------------------------------- /tests/images/thomas_the_train.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/thomas_the_train.jpg -------------------------------------------------------------------------------- /tests/images/toys_strewn_about.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/toys_strewn_about.jpg -------------------------------------------------------------------------------- /tests/images/winning_gloves.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/winning_gloves.webp -------------------------------------------------------------------------------- /tests/images/book_atomic_habits.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/book_atomic_habits.jpg -------------------------------------------------------------------------------- /tests/images/book_viral_justice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/book_viral_justice.jpg -------------------------------------------------------------------------------- /tests/images/eggs_banana_avocado.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/eggs_banana_avocado.jpg -------------------------------------------------------------------------------- /tests/images/flower_arrangement.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/flower_arrangement.jpg -------------------------------------------------------------------------------- /tests/images/frame_smart_glasses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_smart_glasses.jpg -------------------------------------------------------------------------------- /tests/images/pug_asian_clothing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/pug_asian_clothing.jpg -------------------------------------------------------------------------------- /tests/images/dragon_fruit_in_box.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/dragon_fruit_in_box.webp 
-------------------------------------------------------------------------------- /tests/images/toy_track_incomplete.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/toy_track_incomplete.jpg -------------------------------------------------------------------------------- /tests/images/STEM_card_Chinese_closeup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/STEM_card_Chinese_closeup.jpg -------------------------------------------------------------------------------- /tests/images/bobak_dressed_for_weather.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/bobak_dressed_for_weather.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Window_Tree.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Window_Tree.jpg -------------------------------------------------------------------------------- /tests/images/woman_strange_fuzzy_pants.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/woman_strange_fuzzy_pants.jpeg -------------------------------------------------------------------------------- /tests/images/colorful_landscape_paintings.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/colorful_landscape_paintings.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Laptop_Discord_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Laptop_Discord_1.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Laptop_Discord_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Laptop_Discord_2.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Shelves_Blurry_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Shelves_Blurry_1.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Shelves_Blurry_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Shelves_Blurry_2.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Window_Overexposed.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Window_Overexposed.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image1.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image1.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image2.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image2.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image3.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image3.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image4.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image4.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image5.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image5.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image6.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image6.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image7.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image7.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image8.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image8.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/Fingers_Overexposed_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Fingers_Overexposed_1.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Fingers_Overexposed_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Fingers_Overexposed_2.jpg 
-------------------------------------------------------------------------------- /tests/images/frame_captures/Fingers_Overexposed_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Fingers_Overexposed_3.jpg -------------------------------------------------------------------------------- /web_search/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .web_search import WebSearch, WebSearchResult 3 | from .dataforseo import DataForSEOWebSearch 4 | from .serp import SerpWebSearch 5 | from .perplexity import PerplexityWebSearch -------------------------------------------------------------------------------- /assistant/__init__.py: -------------------------------------------------------------------------------- 1 | from .assistant import Assistant, AssistantResponse 2 | from .gpt_assistant import GPTAssistant 3 | from .claude_assistant import ClaudeAssistant 4 | from .context import extract_learned_context -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .api import Role, Message, Capability, SearchAPI, VisionModel, GenerateImageService, MultimodalRequest, MultimodalResponse, ExtractLearnedContextRequest, ExtractLearnedContextResponse 2 | from .token_usage import TokenUsage, accumulate_token_usage -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .env* 3 | *.exe 4 | *.txt 5 | *.json 6 | *.jpg 7 | *.png 8 | *.md 9 | *.html 10 | *.wav 11 | !docs/noa_assistant.drawio.png 12 | !.env.example 13 | !requirements.txt 14 | __pycache__ 15 | dev_deploy.sh 16 | load_audio.sh 17 | env 18 | Dockerfile 19 | audio_logs 20 | audio 21 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | IMAGE_CDN=your_image_cdn 2 | SERPAPI_API_KEY=serp_api_key 3 | DATAFORSEO_USERNAME=dataforseo_username (optional) 4 | DATAFORSEO_PASSWORD=dataforseo_password (optional) 5 | OPENAI_API_KEY=open_ai_key 6 | EXPERIMENT_AI_PORT=8000 7 | SEARCH_API=serp 8 | ANTHROPIC_API_KEY=anthropic_api_key 9 | SCENARIO_API_KEY=scenario_api_key 10 | REPLICATE_API_TOKEN=replicate_api_token -------------------------------------------------------------------------------- /generate_image/generate_image.py: -------------------------------------------------------------------------------- 1 | # 2 | # Generate image from text or image. 3 | # 4 | 5 | from abc import ABC, abstractmethod 6 | from dataclasses import dataclass 7 | from typing import List 8 | 9 | from pydantic import BaseModel 10 | 11 | 12 | 13 | class GenerateImage(ABC): 14 | @abstractmethod 15 | def generate_image( 16 | query: str, 17 | use_image: bool, 18 | image_bytes: bytes | None, 19 | ) -> str: 20 | pass -------------------------------------------------------------------------------- /vision/vision.py: -------------------------------------------------------------------------------- 1 | # 2 | # vision.py 3 | # 4 | # Vision tool base class.
5 | # 6 | 7 | from abc import ABC, abstractmethod 8 | from dataclasses import dataclass 9 | from typing import Dict 10 | 11 | from models import TokenUsage 12 | 13 | 14 | @dataclass 15 | class VisionOutput: 16 | response: str 17 | web_query: str 18 | reverse_image_search: bool 19 | 20 | def web_search_needed(self): 21 | return len(self.web_query) > 0 22 | 23 | class Vision(ABC): 24 | @abstractmethod 25 | async def query_image(self, query: str, extra_context: str, image_bytes: bytes | None, token_usage_by_model: Dict[str, TokenUsage]) -> VisionOutput | None: 26 | pass -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | annotated-types==0.6.0 2 | anyio==3.7.1 3 | certifi==2023.11.17 4 | charset-normalizer==3.3.2 5 | click==8.1.7 6 | colorama==0.4.6 7 | distro==1.8.0 8 | fastapi~=0.109.1 9 | geographiclib==2.0 10 | geopy==2.4.1 11 | h11==0.14.0 12 | httpcore==1.0.2 13 | httpx==0.25.2 14 | idna==3.7 15 | openai==1.3.5 16 | pydantic==2.5.2 17 | pydantic_core==2.14.5 18 | pydub==0.25.1 19 | python-multipart==0.0.7 20 | requests==2.31.0 21 | serpapi==0.1.5 22 | sniffio==1.3.0 23 | starlette~=0.36.2 24 | tqdm==4.66.1 25 | typing_extensions==4.8.0 26 | urllib3==2.1.0 27 | uule-grabber==0.1.9 28 | uvicorn==0.27.0.post1 29 | ffmpeg 30 | replicate 31 | anthropic==0.25.6 32 | aiohttp 33 | groq 34 | opencv-python -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright © 2024 Brilliant Labs Ltd. 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 4 | 5 | THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
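The `Vision` base class in `vision/vision.py` above is the interface that the concrete vision tools later in this dump (`vision/gpt4vision.py` and `vision/claude_vision.py`) implement, and those tools report token consumption through `accumulate_token_usage` from `models/token_usage.py` below. The following sketch is illustrative only and is not a file in the repository; the `EchoVision` class and its behavior are hypothetical, but the method signature, `VisionOutput`, and `accumulate_token_usage` are taken from the files shown here.

```python
# Illustrative sketch only (not part of the repository): a minimal Vision
# implementation wired to the interfaces defined in vision/vision.py and
# models/token_usage.py. "EchoVision" is a hypothetical example class.
from typing import Dict

from models import TokenUsage, accumulate_token_usage
from vision import Vision
from vision.vision import VisionOutput


class EchoVision(Vision):
    """Hypothetical vision tool that answers without calling a vision model."""

    async def query_image(self, query: str, extra_context: str, image_bytes: bytes | None,
                          token_usage_by_model: Dict[str, TokenUsage]) -> VisionOutput | None:
        # A real implementation would send the query (and image, if any) to a vision model
        # here and record the tokens consumed by that call.
        accumulate_token_usage(token_usage_by_model, model="echo", input_tokens=0, output_tokens=0, total_tokens=0)
        # An empty web_query makes VisionOutput.web_search_needed() return False, so the
        # assistant will not follow up with a web search for this answer.
        return VisionOutput(response=f"You asked: {query}", web_query="", reverse_image_search=False)
```

The concrete implementations later in this dump additionally call `Vision.register(...)` after the class body, as `gpt4vision.py` and `claude_vision.py` do.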
-------------------------------------------------------------------------------- /models/token_usage.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Dict 3 | 4 | from pydantic import BaseModel 5 | 6 | 7 | class TokenUsage(BaseModel): 8 | input: int = 0 9 | output: int = 0 10 | total: int = 0 11 | 12 | def add(self, token_usage: TokenUsage): 13 | self.input += token_usage.input 14 | self.output += token_usage.output 15 | self.total += token_usage.total 16 | 17 | def accumulate_token_usage(token_usage_by_model: Dict[str, TokenUsage], model: str, input_tokens: int, output_tokens: int, total_tokens: int): 18 | token_usage = TokenUsage(input=input_tokens, output=output_tokens, total=total_tokens) 19 | if model not in token_usage_by_model: 20 | token_usage_by_model[model] = token_usage 21 | else: 22 | token_usage_by_model[model].add(token_usage=token_usage) -------------------------------------------------------------------------------- /load_audio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set your SSH server details based on user's choice 3 | SSH_USER=root 4 | SSH_HOST=139.144.72.206 5 | REMOTE_SCRIPT_PATH="audio_load.sh" 6 | REMOTE_AUDIO_DIR="audio_logs/" 7 | LOCAL_AUDIO_DIR="./audio_logs/" 8 | 9 | # Ensure the local audio directory exists 10 | mkdir -p $LOCAL_AUDIO_DIR 11 | 12 | # Perform SSH command to run audio_load.sh 13 | ssh $SSH_USER@$SSH_HOST "bash $REMOTE_SCRIPT_PATH" 14 | 15 | # Check SSH exit code 16 | if [ $? -ne 0 ]; then 17 | echo "SSH command execution failed" 18 | exit 1 19 | fi 20 | 21 | # Perform SCP transfer to copy audio files locally 22 | scp -r $SSH_USER@$SSH_HOST:$REMOTE_AUDIO_DIR* $LOCAL_AUDIO_DIR 23 | 24 | # Check SCP exit code 25 | if [ $? -ne 0 ]; then 26 | echo "SCP transfer failed" 27 | exit 1 28 | fi 29 | 30 | echo "Audio files copied to $LOCAL_AUDIO_DIR successfully" 31 | exit 0 32 | -------------------------------------------------------------------------------- /web_search/web_search.py: -------------------------------------------------------------------------------- 1 | # 2 | # web_search.py 3 | # 4 | # Web search tool base class and result structure. 5 | # 6 | 7 | from abc import ABC, abstractmethod 8 | from dataclasses import dataclass 9 | from typing import Dict, List 10 | 11 | from models import Message, TokenUsage 12 | 13 | 14 | @dataclass 15 | class WebSearchResult: 16 | """ 17 | Web search result, used for all concrete implementations of WebSearch. 18 | """ 19 | 20 | # 21 | # Summarized result, to be used as the tool response string. 22 | # 23 | summary: str 24 | 25 | # 26 | # Implementation-specific metadata for debugging. Can contain e.g. search result links, etc. If 27 | # we want to break out search result links for e.g., the mobile companion app, we should create 28 | # a new field and avoid using this one.
29 | # 30 | search_provider_metadata: str 31 | 32 | class WebSearch(ABC): 33 | @abstractmethod 34 | async def search_web(self, query: str, message_history: List[Message] | None, token_usage_by_model: Dict[str, TokenUsage], use_photo: bool = False, image_bytes: bytes | None = None, location: str | None = None) -> WebSearchResult: 35 | pass 36 | 37 | 38 | -------------------------------------------------------------------------------- /tests/frame_camera.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "active": true, 4 | "name": "frame_camera", 5 | "default_image": "tests/images/black.jpg", 6 | "conversations": [ 7 | [ 8 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image1.webp", "capabilities": [ "vision" ] }, 9 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image2.webp", "capabilities": [ "vision" ] }, 10 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image3.webp", "capabilities": [ "vision" ] }, 11 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image4.webp", "capabilities": [ "vision" ] }, 12 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image5.webp", "capabilities": [ "vision" ] }, 13 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image6.webp", "capabilities": [ "vision" ] }, 14 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image7.webp", "capabilities": [ "vision" ] }, 15 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image8.webp", "capabilities": [ "vision" ] } 16 | ] 17 | ] 18 | } 19 | ] -------------------------------------------------------------------------------- /tests/frame_camera_laziness.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "active": true, 4 | "name": "frame_camera_laziness", 5 | "default_image": "tests/images/black.jpg", 6 | "conversations": [ 7 | [ 8 | { "text": "what am i looking at right now?", "image": "tests/images/frame_Captures/Laptop_Discord_1.jpg", "capabilities": [ "vision" ] }, 9 | { "text": "can you guess which app I'm running?", "image": "tests/images/frame_Captures/Laptop_Discord_2.jpg", "capabilities": [ "vision" ] }, 10 | { "text": "what do you see outside my window right now?", "image": "tests/images/frame_Captures/Window_Overexposed.jpg", "capabilities": [ "vision" ] }, 11 | { "text": "can you tell what kind of tree is outside?", "image": "tests/images/frame_Captures/Window_Tree.jpg", "capabilities": [ "vision" ] }, 12 | { "text": "what do you see on the shelves in front of me?", "image": "tests/images/frame_Captures/Shelves_Blurry_1.jpg", "capabilities": [ "vision" ] }, 13 | { "text": "can you be a bit more specific than that? 
tell me about the decorative items.", "image": "tests/images/frame_Captures/Shelves_Blurry_2.jpg", "capabilities": [ "vision" ] }, 14 | { "text": "how many fingers am i holding up?", "image": "tests/images/frame_Captures/Fingers_Overexposed_1.jpg", "capabilities": [ "vision" ] }, 15 | { "text": "how many fingers am i holding up?", "image": "tests/images/frame_Captures/Fingers_Overexposed_2.jpg", "capabilities": [ "vision" ] }, 16 | { "text": "how many fingers do you see?", "image": "tests/images/frame_Captures/Fingers_Overexposed_3.jpg", "capabilities": [ "vision" ] } 17 | ] 18 | ] 19 | } 20 | ] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Noa Assistant 2 | This repository contains all of Noa's AI components. It can be used alongside\ 3 | an authentication server to protect the APIs and throttle usage.\ 4 | Working features: 5 | 1. Conversational AI 6 | 2. Conversation with Photos 7 | 3. Voice transcription 8 | 4. Web Search 9 | 10 | ### Setup 11 | 1. Copy the `.env.example` file to `.env` and fill in the keys 12 | Note: reverse image search requires an image CDN; you can use any API that accepts an image and returns a URL.\ 13 | The API should have the following format.\ 14 | This can be skipped if reverse image search is not used. 15 | ```sh 16 | curl -F'file=@path/to/yourfile.png' -Fexpires=2 example.com 17 | ``` 18 | 19 | 2. Create and activate a Python virtual environment (optional), e.g. following [freecodecamp](https://www.freecodecamp.org/news/how-to-setup-virtual-environments-in-python/) 20 | 21 | 3. Install [ffmpeg](https://ffmpeg.org/download.html) and make sure it is available in your PATH variable 22 | 4. Install the required Python packages 23 | ```bash 24 | pip install -r requirements.txt 25 | ``` 26 | 5. Run the server 27 | ```bash 28 | python app.py --server 29 | ``` 30 | The vision tool, search tool, and assistant can be selected by passing the following arguments: 31 | ```bash 32 | python app.py --server --vision gpt-4-vision-preview --search-api serpapi --assistant gpt 33 | ``` 34 | 35 | The server should now be running on `http://localhost:8000` if the default port is used 36 | ### API 37 | #### POST /mm 38 | ```javascript 39 | await fetch('localhost:8000/mm', { 40 | method: 'POST', 41 | body: new FormData({ 42 | mm: JSON.stringify({ 43 | prompt: 'who are you?', 44 | messages: [ 45 | { 46 | role: 'user', 47 | content: 'Hi' 48 | }, 49 | { 50 | role: 'assistant', 51 | content: 'Hello how can I help you?' 52 | } 53 | ], 54 | gps: [23.646965, 87.159115], 55 | local_time: 'Tuesday, March 12, 2024, 7:24 AM', 56 | address: 'london', 57 | vision: 'claude-3-haiku-20240307' 58 | }), 59 | image: new File(['path/to/yourfile.png'], 'yourfile.png'), 60 | audio: new File(['path/to/yourfile.wav'], 'yourfile.wav') 61 | }) 62 | }) 63 | ``` 64 | #### POST /health 65 | ```javascript 66 | await fetch('localhost:8000/health') 67 | ``` 68 | ### Workflow 69 | ![Workflow](docs/noa_assistant.drawio.png) 70 | -------------------------------------------------------------------------------- /models/api.py: -------------------------------------------------------------------------------- 1 | # 2 | # api.py 3 | # 4 | # Server API models.
5 | # 6 | 7 | from enum import Enum 8 | from typing import Dict, List, Optional 9 | 10 | from pydantic import BaseModel 11 | 12 | from .token_usage import TokenUsage 13 | 14 | 15 | class Role(str, Enum): 16 | SYSTEM = "system" 17 | ASSISTANT = "assistant" 18 | USER = "user" 19 | 20 | class Message(BaseModel): 21 | role: Role 22 | content: str 23 | 24 | class Capability(str, Enum): 25 | ASSISTANT_KNOWLEDGE = "assistant_knowledge" 26 | WEB_SEARCH = "web_search" 27 | VISION = "vision" 28 | REVERSE_IMAGE_SEARCH = "reverse_image_search" 29 | IMAGE_GENERATION = "image_generation" 30 | 31 | class SearchEngine(str, Enum): 32 | GOOGLE_REVERSE_IMAGE = "google_reverse_image" 33 | GOOGLE_LENS = "google_lens" 34 | GOOGLE = "google" 35 | GOOGLE_JOBS = "google_jobs" 36 | GOOGLE_NEWS = "google_news" 37 | GOOGLE_SHOPPING = "google_shopping" 38 | GOOGLE_TRAVEL = "google_travel" 39 | GOOGLE_LOCAL = "google_local" 40 | GOOGLE_IMERSIVE_PRODUCT = "google_immersive_product" 41 | GOOGLE_FINANCE = "google_finance" 42 | GOOGLE_EVENTS = "google_events" 43 | GOOGLE_SCHOLAR = "google_scholar" 44 | 45 | class SearchAPI(Enum): 46 | SERP = "serp" 47 | DATAFORSEO = "dataforseo" 48 | PERPLEXITY = "perplexity" 49 | 50 | class VisionModel(str, Enum): 51 | GPT4O = "gpt-4o" 52 | GPT4Vision = "gpt-4-vision-preview" 53 | CLAUDE_HAIKU = "claude-3-haiku-20240307" 54 | CLAUDE_SONNET = "claude-3-sonnet-20240229" 55 | CLAUDE_OPUS = "claude-3-opus-20240229" 56 | 57 | class GenerateImageService(str, Enum): 58 | REPLICATE = "replicate" 59 | 60 | class MultimodalRequest(BaseModel): 61 | messages: Optional[List[Message]] 62 | prompt: Optional[str] = "" 63 | noa_system_prompt: Optional[str] = None 64 | assistant: Optional[str] = None # assistant class: gpt, claude, perplexity, groq 65 | assistant_model: Optional[str] = None # specific model for the assistant class 66 | search_api: Optional[SearchAPI] = None 67 | search_engine: Optional[SearchEngine] = SearchEngine.GOOGLE 68 | max_search_results: Optional[int] = 10 69 | local_time: Optional[str] = None 70 | address: Optional[str] = None 71 | latitude: Optional[str] = None 72 | longitude: Optional[str] = None 73 | vision: Optional[VisionModel] = None, 74 | speculative_vision: Optional[bool] = True 75 | perplexity_key: Optional[str] = None 76 | openai_key: Optional[str] = None 77 | generate_image: Optional[int] = 0 78 | generate_image_service: Optional[GenerateImageService] = GenerateImageService.REPLICATE 79 | testing_mode: Optional[bool] = False 80 | 81 | class MultimodalResponse(BaseModel): 82 | user_prompt: str 83 | response: str 84 | image: str 85 | token_usage_by_model: Dict[str, TokenUsage] 86 | capabilities_used: List[Capability] 87 | total_tokens: int 88 | input_tokens: int 89 | output_tokens: int 90 | timings: str 91 | debug_tools: str 92 | 93 | class ExtractLearnedContextRequest(BaseModel): 94 | messages: List[Message] 95 | existing_learned_context: Dict[str, str] 96 | 97 | class ExtractLearnedContextResponse(BaseModel): 98 | learned_context: Dict[str, str] 99 | token_usage_by_model: Dict[str, TokenUsage] -------------------------------------------------------------------------------- /generate_image/replicate.py: -------------------------------------------------------------------------------- 1 | from .generate_image import GenerateImage 2 | from replicate import async_run as replicate 3 | 4 | import requests 5 | from io import BytesIO 6 | 7 | import base64 8 | 9 | NEGATIVE_PROMPT = 'ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, 
disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face' 10 | POSITIVE_PROMPT = 'digital art, hyperrealistic, fantasy, artstation, highly detailed, sharp focus, studio lighting' 11 | 12 | class ReplicateGenerateImage(GenerateImage): 13 | def __init__(self, model: str='asiryan/juggernaut-xl-v7:6a52feace43ce1f6bbc2cdabfc68423cb2319d7444a1a1dae529c5e88b976382'): 14 | super().__init__() 15 | self._model = model 16 | 17 | async def generate_image( 18 | self, 19 | query: str, 20 | use_image: bool, 21 | image_bytes: bytes | None, 22 | ) -> str: 23 | if use_image: 24 | if not image_bytes: 25 | raise ValueError('Image bytes must be provided') 26 | input_base64_img = base64.b64encode(image_bytes).decode('utf-8') 27 | response = await replicate( 28 | self._model, 29 | input={ 30 | "width": 512, 31 | "height": 512, 32 | "prompt": f"{query}, {POSITIVE_PROMPT}", 33 | "image": f'data:image/png;base64,{input_base64_img}', 34 | "refine": "expert_ensemble_refiner", 35 | "scheduler": "K_EULER", 36 | "lora_scale": 0.5, 37 | "num_outputs": 1, 38 | "guidance_scale": 7.5, 39 | "apply_watermark": False, 40 | "high_noise_frac": 0.7, 41 | "negative_prompt": NEGATIVE_PROMPT, 42 | "prompt_strength": 0.8, 43 | "num_inference_steps": 30 44 | } 45 | ) 46 | else: 47 | raise NotImplementedError('Text generation is not imlemented yet') 48 | # else: 49 | # response = replicate.run( 50 | # self._model, 51 | # input={ 52 | # "width": 512, 53 | # "height": 512, 54 | # "prompt": f"{query}, {POSITIVE_PROMPT}", 55 | # "refine": "expert_ensemble_refiner", 56 | # "scheduler": "K_EULER", 57 | # "lora_scale": 0.6, 58 | # "num_outputs": 1, 59 | # "guidance_scale": 7.5, 60 | # "apply_watermark": False, 61 | # "high_noise_frac": 0.8, 62 | # "negative_prompt": "", 63 | # "prompt_strength": 1, 64 | # "num_inference_steps": 25 65 | # } 66 | # ) 67 | 68 | # response is url of image 69 | # make it base64 70 | image_url = response[0] 71 | response = requests.get(image_url) 72 | if response.status_code != 200: 73 | return 'Failed to generate image' 74 | # convert to base64 75 | base64_img = base64.b64encode(response.content).decode('utf-8') 76 | return base64_img 77 | 78 | GenerateImage.register(ReplicateGenerateImage) -------------------------------------------------------------------------------- /assistant/assistant.py: -------------------------------------------------------------------------------- 1 | # 2 | # assistant.py 3 | # 4 | # Assistant base class and associated data structures. 
5 | # 6 | 7 | from __future__ import annotations 8 | from abc import ABC, abstractmethod 9 | from dataclasses import dataclass 10 | from typing import Dict, List 11 | 12 | from pydantic import BaseModel 13 | 14 | from models import Message, Capability, TokenUsage 15 | from web_search import WebSearch 16 | from vision import Vision 17 | 18 | @dataclass 19 | class AssistantResponse: 20 | token_usage_by_model: Dict[str, TokenUsage] 21 | capabilities_used: List[Capability] 22 | response: str 23 | debug_tools: str # debugging information about tools used (no particular format guaranteed) 24 | timings: str 25 | image: str | None = None 26 | 27 | class Assistant(ABC): 28 | @abstractmethod 29 | async def send_to_assistant( 30 | prompt: str, 31 | noa_system_prompt: str | None, 32 | image_bytes: bytes | None, 33 | message_history: List[Message] | None, 34 | learned_context: Dict[str, str], 35 | local_time: str | None, 36 | location_address: str | None, 37 | model: str | None, 38 | web_search: WebSearch, 39 | vision: Vision, 40 | speculative_vision: bool 41 | ) -> AssistantResponse: 42 | """ 43 | Sends a message from user to assistant. 44 | 45 | Parameters 46 | ---------- 47 | prompt : str 48 | User message. 49 | image_bytes : bytes | None 50 | Image of what user is looking at. 51 | message_history : List[Mesage] | None 52 | Conversation history, excluding current user message we will run inference on. 53 | learned_context : Dict[str, str] 54 | Learned context about the user, as key-value pairs. 55 | local_time : str | None 56 | User's local time in a human-readable format, which helps the LLM answer questions where 57 | the user indirectly references the date or time. E.g., 58 | "Saturday, March 30, 2024, 1:21 PM". 59 | location_address : str | None 60 | User's current location, specified as a full or partial address. This provides context 61 | to the LLM and is especially useful for web searches. E.g., 62 | "3985 Stevens Creek Blvd, Santa Clara, CA 95051". 63 | model : str | None 64 | Assistant model. Valid values will depend on the assistant implementation (e.g., OpenAI- 65 | based assistants will take "gpt-3.5-turbo", etc.) A default will be selected if None is 66 | passed. 67 | web_search : WebSearch 68 | Web search provider, invoked when a web search (including a reverse image search) is 69 | needed. 70 | vision : Vision 71 | Vision AI provider, invoked when understanding of what user is looking at is required. 72 | speculative_vision : bool 73 | Whether to perform speculative vision queries (if supported by assistant). This will run 74 | the vision tool in parallel with the initial LLM request in *all* cases, using the user 75 | prompt as the query, but only use the result if the LLM then determines the vision tool 76 | should have been used. This reduces latency by the duration of the initial LLM call by 77 | giving the vision tool (which is usually slow) a head start. 78 | 79 | Returns 80 | ------- 81 | AssistantResponse 82 | Assistant response (text and some required analytics). 83 | """ 84 | pass -------------------------------------------------------------------------------- /web_search/async_serpapi_client/async_serpapi_client.py: -------------------------------------------------------------------------------- 1 | # 2 | # async_serpapi_client.py 3 | # 4 | # An asynchronous version of the SerpAPI client built on aiohttp. This is based on the serpapi 5 | # package's Client class. 
If we import a newer version of the pacakage with substantial changes to 6 | # the API, we will need to update this async client. 7 | # 8 | 9 | import aiohttp 10 | 11 | from serpapi.__version__ import __version__ 12 | from serpapi import SerpResults 13 | 14 | 15 | class AsyncSerpAPIClient: 16 | BASE_DOMAIN = "https://serpapi.com" 17 | USER_AGENT = f"serpapi-python, v{__version__}" 18 | DASHBOARD_URL = "https://serpapi.com/dashboard" 19 | 20 | def __init__(self, api_key: str, session: aiohttp.ClientSession): 21 | self._api_key = api_key 22 | self._session = session 23 | 24 | def __del__(self): 25 | self._session.detach() 26 | 27 | def __repr__(self): 28 | return "" 29 | 30 | async def search(self, params: dict = None, **kwargs) -> SerpResults | str: 31 | """Fetch a page of results from SerpApi. Returns a :class:`SerpResults ` object, or unicode text (*e.g.* if ``'output': 'html'`` was passed). 32 | 33 | The following three calls are equivalent: 34 | 35 | .. code-block:: python 36 | 37 | >>> s = serpapi.search(q="Coffee", location="Austin, Texas, United States") 38 | 39 | .. code-block:: python 40 | 41 | >>> params = {"q": "Coffee", "location": "Austin, Texas, United States"} 42 | >>> s = serpapi.search(**params) 43 | 44 | .. code-block:: python 45 | 46 | >>> params = {"q": "Coffee", "location": "Austin, Texas, United States"} 47 | >>> s = serpapi.search(params) 48 | 49 | 50 | :param q: typically, this is the parameter for the search engine query. 51 | :param engine: the search engine to use. Defaults to ``google``. 52 | :param output: the output format desired (``html`` or ``json``). Defaults to ``json``. 53 | :param api_key: the API Key to use for SerpApi.com. 54 | :param **: any additional parameters to pass to the API. 55 | 56 | 57 | **Learn more**: https://serpapi.com/search-api 58 | """ 59 | path = "/search" 60 | assert_200 = True 61 | 62 | if params is None: 63 | params = {} 64 | 65 | if kwargs: 66 | params.update(kwargs) 67 | 68 | # Inject the API Key into the params. 69 | if "api_key" not in params: 70 | params["api_key"] = self._api_key 71 | 72 | # Build the URL, as needed 73 | if not path.startswith("http"): 74 | url = self.BASE_DOMAIN + path 75 | else: 76 | url = path 77 | 78 | # Make the HTTP request. 79 | headers = {"User-Agent": self.USER_AGENT} 80 | 81 | # Perform GET 82 | async with self._session.get(url=url, params=params, headers=headers) as response: 83 | if assert_200: 84 | response.raise_for_status() 85 | return await self._serp_results_from_json(response=response) 86 | 87 | @staticmethod 88 | async def _serp_results_from_json(response: aiohttp.ClientResponse): 89 | """Construct a SerpResults object from an HTTP response. 90 | 91 | :param assert_200: if ``True`` (default), raise an exception if the status code is not 200. 92 | :param client: the Client instance which was used to send this request. 93 | 94 | An instance of this class is returned if the response is a valid JSON object. 95 | Otherwise, the raw text (as a properly decoded unicode string) is returned. 96 | """ 97 | 98 | try: 99 | return SerpResults(data=await response.json(), client=None) 100 | except ValueError: 101 | # If the response is not JSON, return the raw text. 102 | return await response.text() -------------------------------------------------------------------------------- /vision/gpt4vision.py: -------------------------------------------------------------------------------- 1 | # 2 | # gpt4vision.py 3 | # 4 | # Vision tool implementation based on GPT-4. 
5 | # 6 | 7 | import base64 8 | from typing import Dict, Optional 9 | 10 | import openai 11 | from pydantic import BaseModel 12 | 13 | from .vision import Vision, VisionOutput 14 | from .utils import detect_media_type 15 | from models import TokenUsage, accumulate_token_usage 16 | 17 | 18 | SYSTEM_MESSAGE = """ 19 | You are Noa, a smart personal AI assistant inside the user's AR smart glasses that answers all user 20 | queries and questions. You have access to a photo from the smart glasses camera of what the user was 21 | seeing at the time they spoke but you NEVER mention the photo or image and instead respond as if you 22 | are actually seeing. 23 | 24 | The camera is unfortunately VERY low quality but the user is counting on you to interpret the 25 | blurry, pixelated images. NEVER comment on image quality. Do your best with images. 26 | 27 | ALWAYS respond with a JSON object with these fields: 28 | 29 | response: (String) Respond to user as best you can. Be precise, get to the point, and speak as though you actually see the image. 30 | web_query: (String) Empty if your "response" answers everything user asked. If web search based on visual description would be more helpful, create a query (e.g. up-to-date, location-based, or product info). 31 | reverse_image_search: (Bool) True if your web query from description is insufficient and including the *exact* thing user is looking at as visual target is needed. 32 | """ 33 | 34 | class ModelOutput(BaseModel): 35 | response: str 36 | web_query: Optional[str] = None 37 | reverse_image_search: Optional[bool] = None 38 | 39 | 40 | class GPT4Vision(Vision): 41 | def __init__(self, client: openai.AsyncOpenAI, model: str = "gpt-4o"): 42 | self._client = client 43 | self._model = model 44 | 45 | @property 46 | def model(self): 47 | return self._model 48 | 49 | async def query_image(self, query: str, extra_context: str, image_bytes: bytes | None, token_usage_by_model: Dict[str, TokenUsage]) -> VisionOutput | None: 50 | messages = [ 51 | { "role": "system", "content": SYSTEM_MESSAGE + extra_context }, 52 | { 53 | "role": "user", 54 | "content": [ 55 | { "type": "text", "text": query } 56 | ] 57 | } 58 | ] 59 | 60 | if image_bytes: 61 | image_base64 = base64.b64encode(image_bytes).decode("utf-8") 62 | media_type = detect_media_type(image_bytes=image_bytes) 63 | messages[1]["content"].append({ "type": "image_url", "image_url": { "url": f"data:{media_type};base64,{image_base64}" } }), 64 | 65 | response = await self._client.chat.completions.create( 66 | model=self._model, 67 | messages=messages, 68 | max_tokens=4096 69 | ) 70 | 71 | accumulate_token_usage( 72 | token_usage_by_model=token_usage_by_model, 73 | model=self._model, 74 | input_tokens=response.usage.prompt_tokens, 75 | output_tokens=response.usage.completion_tokens, 76 | total_tokens=response.usage.total_tokens 77 | ) 78 | 79 | # Convert to VisionResponse and return 80 | output = self._parse_response(content=response.choices[0].message.content) 81 | if output is None: 82 | return None 83 | web_query = output.web_query if output.web_query is not None else "" 84 | reverse_image_search = output.reverse_image_search is not None and output.reverse_image_search == True 85 | if len(web_query) == 0 and reverse_image_search: 86 | # If no web query output but reverse image search asked for, just use user query 87 | # directly. This is sub-optimal and it would be better to figure out a way to ensure 88 | # web_query is generated when reverse_image_search is true. 
89 | web_query = query 90 | return VisionOutput(response=output.response, web_query=web_query, reverse_image_search=reverse_image_search) 91 | 92 | @staticmethod 93 | def _parse_response(content: str) -> ModelOutput | None: 94 | # Response expected to be JSON but may be wrapped with ```json ... ``` 95 | json_start = content.find("{") 96 | json_end = content.rfind("}") 97 | json_string = content[json_start : json_end + 1] 98 | try: 99 | return ModelOutput.model_validate_json(json_data=json_string) 100 | except: 101 | pass 102 | return None 103 | 104 | Vision.register(GPT4Vision) -------------------------------------------------------------------------------- /vision/claude_vision.py: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # claude_vision.py 4 | # 5 | # Vision tool implementation based on Anthropic. 6 | # 7 | 8 | import base64 9 | from typing import Dict, Optional 10 | 11 | import anthropic 12 | from pydantic import BaseModel 13 | 14 | from .vision import Vision, VisionOutput 15 | from .utils import detect_media_type 16 | from models import TokenUsage, accumulate_token_usage 17 | 18 | 19 | SYSTEM_MESSAGE = """ 20 | You are Noa, a smart personal AI assistant inside the user's AR smart glasses that answers all user 21 | queries and questions. You have access to a photo from the smart glasses camera of what the user was 22 | seeing at the time they spoke but you NEVER mention the photo or image and instead respond as if you 23 | are actually seeing. 24 | 25 | Always do your best with images, never comment on their quality, and answer decisively with a guess 26 | if you are not sure. There are no negative consequences to guessing. 27 | 28 | ALWAYS respond with a JSON object with these fields: 29 | 30 | response: (String) Respond to user as best you can. Be precise, get to the point, never comment on image quality. 31 | web_query: (String) Web query to answer the user's request. 32 | web_search_needed: (Bool) Whether to search the web. True ONLY if "response" does not answer the user query precisely enough and up-to-date, location-specific, or product-specific info is needed. 
33 | """ 34 | 35 | class ModelOutput(BaseModel): 36 | response: str 37 | web_query: Optional[str] = None 38 | web_search_needed: Optional[bool] = None 39 | 40 | class ClaudeVision(Vision): 41 | def __init__(self, client: anthropic.AsyncAnthropic, model: str="claude-3-haiku-20240307"): 42 | self._client = client 43 | self._model = model 44 | 45 | async def query_image(self, query: str, extra_context: str, image_bytes: bytes | None, token_usage_by_model: Dict[str, TokenUsage]) -> VisionOutput | None: 46 | image_base64 = base64.b64encode(image_bytes).decode("utf-8") if image_bytes is not None else "" 47 | 48 | messages = [ 49 | { 50 | "role": "user", 51 | "content": [ 52 | { 53 | "type": "image", 54 | "source": { 55 | "type": "base64", 56 | "media_type": detect_media_type(image_bytes=image_bytes), 57 | "data": image_base64 58 | } 59 | }, 60 | { 61 | "type": "text", 62 | "text": query 63 | } 64 | ] 65 | }, 66 | { 67 | # Prefill a leading '{' to force JSON output as per Anthropic's recommendations 68 | "role": "assistant", 69 | "content": [ 70 | { 71 | "type": "text", 72 | "text": "{" 73 | } 74 | ] 75 | } 76 | ] 77 | 78 | response = await self._client.messages.create( 79 | model=self._model, 80 | system=SYSTEM_MESSAGE + extra_context, 81 | messages=messages, 82 | max_tokens=4096, 83 | temperature=0.0, 84 | ) 85 | 86 | accumulate_token_usage( 87 | token_usage_by_model=token_usage_by_model, 88 | model=self._model, 89 | input_tokens=response.usage.input_tokens, 90 | output_tokens=response.usage.output_tokens, 91 | total_tokens=response.usage.input_tokens + response.usage.output_tokens 92 | ) 93 | 94 | # Convert to VisionResponse and return 95 | print(f"ClaudeVision input: {query}") 96 | print(f"ClaudeVision model output: {response.content[0].text}") 97 | output = self._parse_response(content=response.content[0].text) 98 | if output is None: 99 | return None 100 | web_search_needed = output.web_search_needed and output.web_query is not None and len(output.web_query) > 0 101 | web_query = output.web_query if web_search_needed else "" 102 | reverse_image_search = False # for now, we don't perform reverse image search because uncertain where it is really useful 103 | return VisionOutput(response=output.response, web_query=web_query, reverse_image_search=reverse_image_search) 104 | 105 | @staticmethod 106 | def _parse_response(content: str) -> ModelOutput | None: 107 | # Put the leading '{' back 108 | json_string = "{" + content 109 | try: 110 | return ModelOutput.model_validate_json(json_data=json_string) 111 | except: 112 | pass 113 | return None 114 | 115 | Vision.register(ClaudeVision) -------------------------------------------------------------------------------- /assistant/context.py: -------------------------------------------------------------------------------- 1 | # 2 | # context.py 3 | # 4 | # Routines for creating a message containing additional context about the user. These messages 5 | # should be injected into the conversation. 6 | # 7 | # Information about the user can be extracted by analyzing batches of their messages and turned into 8 | # a simple list of key-value pairs. Feeding these back to the assistant will produce more relevant, 9 | # contextually-aware, and personalized responses. 
10 | # 11 | 12 | from typing import Dict, List 13 | 14 | import openai 15 | import groq 16 | 17 | from models import Role, Message, TokenUsage, accumulate_token_usage 18 | 19 | 20 | #################################################################################################### 21 | # Prompts 22 | #################################################################################################### 23 | 24 | # These are context keys we try to detect in conversation history over time 25 | LEARNED_CONTEXT_KEY_DESCRIPTIONS = { 26 | "UserName": "User's name", 27 | "DOB": "User's date of birth", 28 | "Food": "Foods and drinks user has expressed interest in" 29 | } 30 | 31 | LEARNED_CONTEXT_EXTRACTION_SYSTEM_MESSAGE = f""" 32 | Given a transcript of what the user said, look for any of the following information being revealed: 33 | 34 | """ + "\n".join([ key + ": " + description for key, description in LEARNED_CONTEXT_KEY_DESCRIPTIONS.items() ]) + """ 35 | 36 | Make sure to list them in this format: 37 | 38 | KEY=VALUE 39 | 40 | If nothing was found, just say "END". ONLY PRODUCE ITEMS WHEN THE USER HAS ACTUALLY REVEALED THEM. 41 | """ 42 | 43 | CONTEXT_SYSTEM_MESSAGE_PREFIX = "## Additional context about the user:" 44 | 45 | 46 | #################################################################################################### 47 | # Functions 48 | #################################################################################################### 49 | 50 | def create_context_system_message(local_time: str | None, location: str | None, learned_context: Dict[str,str] | None) -> str: 51 | """ 52 | Creates a string of additional context that can either be appended to the main system 53 | message or as a secondary system message before delivering the assistant response. This is 54 | how GPT is made aware of the user's location, local time, and any learned information that 55 | was extracted from prior conversation. 56 | 57 | Parameters 58 | ---------- 59 | local_time : str | None 60 | Local time, if known. 61 | location : str | None 62 | Location, as a human readable address, if known. 63 | learned_context : Dict[str,str] | None 64 | Information learned from prior conversation as key-value pairs, if any. 65 | 66 | Returns 67 | ------- 68 | str 69 | Message to combine with existing system message or to inject as a new, extra system 70 | message. 
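    Example
    -------
    An illustrative call (hypothetical values) and the shape of the fragment it produces:

        create_context_system_message(
            local_time="Tuesday, June 4, 2024, 10:15 AM",
            location="San Francisco, CA",
            learned_context={"UserName": "Alice"}
        )

    returns roughly:

        ## Additional context about the user:<current_time>Tuesday, June 4, 2024, 10:15 AM
        <location>San Francisco, CA
        <UserName>Alice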
71 | """ 72 | # Fixed context: things we know and need not extract from user conversation history 73 | context: Dict[str, str] = {} 74 | if local_time is not None and len(local_time) > 0: 75 | context["current_time"] = local_time 76 | else: 77 | context["current_time"] = "If asked, tell user you don't know current date or time because clock is broken" 78 | if location is not None and len(location) > 0: 79 | context["location"] = location 80 | else: 81 | context["location"] = "You do not know user's location and if asked, tell them so" 82 | 83 | # Merge in learned context 84 | if learned_context is not None: 85 | context.update(learned_context) 86 | 87 | # Convert to a list to be appended to a system message or treated as a new system message 88 | system_message_fragment = CONTEXT_SYSTEM_MESSAGE_PREFIX + "\n".join([ f"<{key}>{value}" for key, value in context.items() if value is not None ]) 89 | return system_message_fragment 90 | 91 | async def extract_learned_context( 92 | client: openai.AsyncOpenAI | groq.AsyncGroq, 93 | message_history: List[Message], 94 | model: str, 95 | existing_learned_context: Dict[str, str], 96 | token_usage_by_model: Dict[str, TokenUsage] 97 | ) -> Dict[str, str]: 98 | # Grab last N user messages 99 | max_user_history = 2 100 | messages: List[Message] = [] 101 | for i in range(len(message_history) - 1, -1, -1): 102 | if len(messages) >= max_user_history: 103 | break 104 | if message_history[i].role == Role.USER: 105 | messages.append(message_history[i]) 106 | 107 | # Insert system message and reverse so that it is in the right order 108 | messages.append(Message(role=Role.SYSTEM, content=LEARNED_CONTEXT_EXTRACTION_SYSTEM_MESSAGE)) 109 | messages.reverse() 110 | 111 | # print("Context extraction input:") 112 | # print(messages) 113 | 114 | # Process 115 | response = await client.chat.completions.create( 116 | model=model, 117 | messages=messages 118 | ) 119 | 120 | # Do not forget to count tokens used! 121 | accumulate_token_usage( 122 | token_usage_by_model=token_usage_by_model, 123 | model=model, 124 | input_tokens=response.usage.prompt_tokens, 125 | output_tokens=response.usage.completion_tokens, 126 | total_tokens=response.usage.total_tokens 127 | ) 128 | 129 | # # Debug: print raw output of context extraction 130 | # print("Learned context:") 131 | # print(response.choices[0].message.content) 132 | 133 | # Parse it into a dictionary 134 | learned_context: Dict[str,str] = {} 135 | lines = response.choices[0].message.content.splitlines() 136 | for line in lines: 137 | parts = line.split("=") 138 | if len(parts) == 2: 139 | key, value = parts 140 | if key in LEARNED_CONTEXT_KEY_DESCRIPTIONS: 141 | learned_context[key] = value 142 | 143 | # Merge with existing 144 | existing_learned_context.update(learned_context) 145 | return existing_learned_context -------------------------------------------------------------------------------- /web_search/perplexity.py: -------------------------------------------------------------------------------- 1 | # 2 | # perplexity.py 3 | # 4 | # Web search tool implementation based on Perplexity. Cannot perform searches with images. 
5 | # 6 | 7 | from typing import Any, Dict, List 8 | 9 | import aiohttp 10 | from pydantic import BaseModel 11 | from web_search import WebSearch, WebSearchResult 12 | from models import Role, Message, TokenUsage, accumulate_token_usage 13 | 14 | class PerplexityMessage(BaseModel): 15 | role: str = None 16 | content: str = None 17 | 18 | class MessageChoices(BaseModel): 19 | index: int = None 20 | finish_reason: str = None 21 | message: PerplexityMessage = None 22 | delta: dict = None 23 | 24 | class Usage(BaseModel): 25 | prompt_tokens: int = None 26 | completion_tokens: int = None 27 | total_tokens: int = None 28 | 29 | class PerplexityResponse(BaseModel): 30 | id: str = None 31 | model: str = None 32 | created: int = None 33 | usage: Usage = None 34 | object: str = None 35 | choices: List[MessageChoices] = None 36 | 37 | def summarise(self) -> str: 38 | if len(self.choices) > 0: 39 | return self.choices[0].message.content 40 | else: 41 | return "No results" 42 | 43 | class PerplexityWebSearch(WebSearch): 44 | def __init__(self, api_key: str, model: str = "llama-3-sonar-small-32k-online"): 45 | super().__init__() 46 | self._api_key = api_key 47 | self._model = model 48 | self._session = None 49 | self._stream = True 50 | 51 | def __del__(self): 52 | if self._session: 53 | self._session.detach() 54 | 55 | async def _lazy_init(self): 56 | if self._session is None: 57 | # This instantiation must happen inside of an async event loop 58 | self._session = aiohttp.ClientSession() 59 | 60 | async def search_web( 61 | self, 62 | query: str, 63 | message_history: List[Message] | None, 64 | token_usage_by_model: Dict[str, TokenUsage], 65 | use_photo: bool = False, 66 | image_bytes: bytes | None = None, 67 | location: str | None = None 68 | ) -> WebSearchResult: 69 | await self._lazy_init() 70 | 71 | message_history = [] if message_history is None else message_history.copy() 72 | message_history = self._prune_history(message_history=message_history) 73 | 74 | messages = [ 75 | Message(role=Role.SYSTEM, content=self._system_message(location=location)) 76 | ] + message_history + [ 77 | Message(role=Role.USER, content=query) 78 | ] 79 | print(messages) 80 | 81 | url = "https://api.perplexity.ai/chat/completions" 82 | payload = { 83 | "model": self._model, 84 | "messages": [ message.model_dump() for message in messages ], 85 | "stream": self._stream, 86 | } 87 | headers = { 88 | "accept": "application/json", 89 | "content-type": "application/json", 90 | "authorization": f"Bearer {self._api_key}" 91 | } 92 | json_text = await self._post(url=url, payload=payload, headers=headers) 93 | if json_text is None: 94 | return WebSearchResult(summary="No results", search_provider_metadata="") 95 | 96 | # Return results 97 | # print(json_text) 98 | try: 99 | perplexity_data = PerplexityResponse.model_validate_json(json_text) 100 | except Exception as e: 101 | print(json_text) 102 | print(f"Failed to parse Perplexity response: {e}") 103 | return WebSearchResult(summary="No results", search_provider_metadata="") 104 | accumulate_token_usage( 105 | token_usage_by_model=token_usage_by_model, 106 | model=self._model, 107 | input_tokens=perplexity_data.usage.prompt_tokens, 108 | output_tokens=perplexity_data.usage.completion_tokens, 109 | total_tokens=perplexity_data.usage.total_tokens 110 | ) 111 | search_result = perplexity_data.choices[0].message.content if len(perplexity_data.choices) > 0 else "No results" 112 | return WebSearchResult(summary=search_result, search_provider_metadata="") 113 | 114 | async def 
_post(self, url: str, payload: Dict[str, Any], headers: Dict[str, str]) -> str | None: 115 | async with self._session.post(url=url, json=payload, headers=headers) as response: 116 | if response.status != 200: 117 | print(f"Failed to get response from Perplexity: {await response.text()}") 118 | return None 119 | if self._stream: 120 | return_response = "" 121 | async for line in response.content.iter_any(): 122 | return_response = line.decode("utf-8").split("data: ")[1].strip() 123 | return return_response 124 | return await response.text() 125 | 126 | @staticmethod 127 | def _system_message(location: str | None): 128 | if location is None or len(location) == 0: 129 | location = "" 130 | return f"reply in concise and short with high accurancy from web results if needed take location as {location}" 131 | 132 | @staticmethod 133 | def _prune_history( 134 | message_history: List[Message], 135 | max_messages: int = 8 136 | ) -> List[Message]: 137 | """ 138 | Prunes down the chat history to save tokens, improving inference speed and reducing cost. 139 | Generally, preserving all assistant responses is not needed, and only a limited number of 140 | user messages suffice to maintain a coherent conversation. 141 | 142 | Parameters 143 | ---------- 144 | message_history : List[Message] 145 | Conversation history. This list will be mutated and returned. 146 | max_messages : int 147 | Maximum number of messages to preserve. Must be an even number because Perplexity 148 | requires alternating user and assistant messages. 149 | 150 | Returns 151 | ------- 152 | List[Message] 153 | Pruned history. This is the same list passed as input. 154 | """ 155 | if max_messages %2 != 0: 156 | print("ERROR: Discarding invalid message history for Perplexity. Require alternating user/assistant messages!") 157 | return [] 158 | message_history.reverse() 159 | message_history = [ message for message in message_history if message.role != Role.SYSTEM ] 160 | message_history = message_history[0:max_messages] 161 | message_history.reverse() 162 | return message_history 163 | 164 | WebSearch.register(PerplexityWebSearch) 165 | -------------------------------------------------------------------------------- /web_search/dataforseo.py: -------------------------------------------------------------------------------- 1 | # 2 | # dataforseo.py 3 | # 4 | # Web search tool implementation using DataForSEO (dataforseo.com). Does not support images. 
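#
# A rough sketch (comments only) of the live SERP task perform_search() below posts, assuming
# DATAFORSEO_USERNAME and DATAFORSEO_PASSWORD are set in the environment:
#
#   POST https://api.dataforseo.com/v3/serp/google/organic/live/advanced
#   Authorization: Basic base64(username:password)
#   {
#     "0": {
#       "language_code": "en",
#       "location_coordinate": "37.7749,-122.4194",   # "lat,long" string; null when unknown
#       "keyword": "coffee shops nearby"              # example query
#     }
#   }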
5 | # 6 | 7 | from base64 import b64encode 8 | import json 9 | import os 10 | from typing import Any, Dict, List, Optional, Tuple 11 | 12 | import aiohttp 13 | from pydantic import BaseModel 14 | import geopy.geocoders 15 | 16 | from models import Message, TokenUsage 17 | from .web_search import WebSearch, WebSearchResult 18 | 19 | 20 | DATAFORSEO_USERNAME = os.environ.get("DATAFORSEO_USERNAME", None) 21 | DATAFORSEO_PASSWORD = os.environ.get("DATAFORSEO_PASSWORD", None) 22 | 23 | class Price(BaseModel): 24 | current: float = None 25 | display_price: str = None 26 | currency: str = None 27 | 28 | class Rating(BaseModel): 29 | rating_type: str = None 30 | value: float = None 31 | votes_count: int = None 32 | rating_max: int = None 33 | 34 | class SubItem(BaseModel): 35 | title: Optional[str] = None 36 | description: Optional[str] = None 37 | price : Optional[Price] = None 38 | rating: Optional[Rating] = None 39 | 40 | class Item(BaseModel): 41 | type: str 42 | title: Optional[str] = None 43 | description: Optional[str] = None 44 | items:Optional[List[SubItem]|List[str]] = None 45 | 46 | class Result(BaseModel): 47 | keyword: str 48 | type: str 49 | check_url: str 50 | items: List[Item] 51 | 52 | class Task(BaseModel): 53 | id: str 54 | status_code: int 55 | status_message: str 56 | cost: float 57 | result: List[Result]|None 58 | 59 | # /v3/serp/google/organic/live/advanced response object 60 | class V3SerpGoogleOrganicLiveAdvancedResponse(BaseModel): 61 | status_code: int 62 | status_message: str 63 | cost: float 64 | tasks_count: int 65 | tasks_error: int 66 | tasks: List[Task] 67 | 68 | def summarise(self, max_search_results: int = 5) -> List[str]: 69 | item_types = [ "stock_box", "organic", "knowledge_graph", "local_pack", "popular_products", "top_stories" ] 70 | summaries = [] 71 | for task in self.tasks: 72 | if not task.result: 73 | continue 74 | for result in task.result: 75 | for item in result.items: 76 | if item.type in item_types and max_search_results > 0 and (item.description or item.items): 77 | # print(summaries) 78 | if item.items: 79 | for subitem in item.items: 80 | #print(subitem) 81 | 82 | if isinstance(subitem, SubItem) and subitem.title and max_search_results > 0 and subitem.description: 83 | summary = (f"{subitem.title}: " if subitem.title else "") + subitem.description 84 | if subitem.price: 85 | if subitem.price.display_price: 86 | summary += f"\nprice: {subitem.price.currency} {subitem.price.display_price}" 87 | elif subitem.price.currency: 88 | summary += f"\nprice: {subitem.price.currency} {subitem.price.current}" 89 | if subitem.rating: 90 | if subitem.rating.value: 91 | summary += f"\nrating: {subitem.rating.value} of {subitem.rating.rating_max} ({subitem.rating.votes_count} votes)" 92 | summaries.append(summary) 93 | max_search_results = max_search_results -1 94 | if item.description: 95 | summary = (f"{item.title}: " if item.title else "") + item.description 96 | summaries.append(summary) 97 | max_search_results = max_search_results -1 98 | content = "\n".join(summaries) if len(summaries) > 0 else "No result found" 99 | return content 100 | 101 | class DataForSEOClient: 102 | def __init__(self): 103 | self._session = aiohttp.ClientSession() 104 | self._base_url = "https://api.dataforseo.com" 105 | 106 | base64_bytes = b64encode( 107 | ("%s:%s" % (DATAFORSEO_USERNAME, DATAFORSEO_PASSWORD)).encode("ascii") 108 | ).decode("ascii") 109 | self._headers = {'Authorization' : 'Basic %s' % base64_bytes, 'Content-Encoding' : 'gzip'} 110 | 111 | def __del__(self): 112 | 
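        # detach() drops the underlying connector and marks the session closed without awaiting a
        # proper close(); this is best-effort cleanup only, since __del__ cannot await.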
self._session.detach() 113 | 114 | async def _request(self, path, method, data=None) -> Any | None: 115 | url = self._base_url + path 116 | async with self._session.request(method=method, url=url, headers=self._headers, data=data) as response: 117 | if response.status != 200: 118 | print(f"DataForSEO search failed: {await response.text()}") 119 | return None 120 | return await response.json() 121 | 122 | async def _get(self, path): 123 | return await self._request(path=path, method='GET') 124 | 125 | async def _post(self, path, data): 126 | if isinstance(data, str): 127 | data_str = data 128 | else: 129 | data_str = json.dumps(data) 130 | return await self._request(path=path, method='POST', data=data_str) 131 | 132 | async def perform_search(self, query: str, location_coordinate: Tuple[float, float] | None = None, save_to_file: str | None = None) -> V3SerpGoogleOrganicLiveAdvancedResponse | None: 133 | print("Searching web:") 134 | print(f" query: {query}") 135 | 136 | post_data = dict() 137 | post_data[len(post_data)] = dict( 138 | language_code = "en", 139 | location_coordinate = f"{location_coordinate[0]},{location_coordinate[1]}" if location_coordinate else None, 140 | keyword = query 141 | ) 142 | response_obj = await self._post("/v3/serp/google/organic/live/advanced", post_data) 143 | if response_obj is None: 144 | return None 145 | if save_to_file: 146 | with open(save_to_file, mode="w") as fp: 147 | fp.write(json.dumps(response_obj, indent=2)) 148 | return V3SerpGoogleOrganicLiveAdvancedResponse.model_validate(response_obj) 149 | 150 | class DataForSEOWebSearch(WebSearch): 151 | def __init__(self, save_to_file: str | None = None, max_search_results: int = 5): 152 | super().__init__() 153 | self._save_to_file = save_to_file 154 | self._max_search_results = max_search_results 155 | self._client = None 156 | 157 | async def _lazy_init(self): 158 | if self._client is None: 159 | # This instantiation must happen inside of an async event loop because 160 | # aiohttp.ClientSession()'s initializer requires that 161 | self._client = DataForSEOClient() 162 | 163 | # DataForSEO does not have reverse image search, so photos are always ignored 164 | async def search_web(self, query: str, message_history: List[Message] | None, token_usage_by_model: Dict[str, TokenUsage], use_photo: bool = False, image_bytes: bytes | None = None, location: str | None = None) -> WebSearchResult: 165 | await self._lazy_init() 166 | if location: 167 | # DataForSEO expects lat,long+ 168 | location_coords = geopy.geocoders.Nominatim(user_agent="GetLoc").geocode(location) 169 | coordinates = (location_coords.latitude, location_coords.longitude) 170 | response = await self._client.perform_search(query=query, location_coordinate=coordinates, save_to_file=self._save_to_file) 171 | summary = response.summarise(max_search_results=self._max_search_results) if response is not None else "No results found" 172 | return WebSearchResult(summary=summary, search_provider_metadata="") 173 | 174 | WebSearch.register(DataForSEOWebSearch) -------------------------------------------------------------------------------- /tests/tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "active": true, 4 | "name": "test1", 5 | "default_image": "tests/images/black.jpg", 6 | "conversations": [ 7 | [ "what is the weather today?" ], 8 | [ "where can i buy this?" ], 9 | [ "who makes these?" ], 10 | [ "where can i go to eat something like this?" ], 11 | [ "where can i buy this car?" 
], 12 | [ "what is apple's stock price?" ], 13 | [ "what is weird about this scene?" ], 14 | [ "what is this" ], 15 | [ "translate this sign" ], 16 | [ "what is weird about this scene?" ], 17 | [ "translate this" ], 18 | [ "what is this?" ], 19 | [ "what is that?" ], 20 | [ "what are those?" ], 21 | [ "what are these?" ], 22 | [ "who is that?" ], 23 | [ "who is this?" ], 24 | [ "who are they?" ] 25 | ] 26 | }, 27 | { 28 | "active": true, 29 | "name": "simple_image_queries", 30 | "default_image": "tests/images/black.jpg", 31 | "conversations": [ 32 | [ "what is apple's stock price?" ], 33 | [ "who invented the light bulb?", "why are pencils sharp?" ], 34 | [ "who invented light bulb" ], 35 | [ "when did India gain independence" ], 36 | [ "which fruits contain vitamin c" ], 37 | [ "when is Einstein's birthday" ], 38 | [ "why are pencils sharp" ], 39 | 40 | [ "how many people in front of me?" ], 41 | [ "what is weird about this scene?" ], 42 | [ "translate this" ], 43 | [ "what is this?" ], 44 | [ "what is that?" ], 45 | [ "what are those?" ], 46 | [ "what are these?" ], 47 | [ "who is that?" ], 48 | [ "who is this?" ], 49 | [ "who are they?" ], 50 | [ "where can i buy this?" ], 51 | [ "where can i buy that?" ], 52 | [ "where can i buy these?" ], 53 | [ "where can i buy those?" ], 54 | [ "where can i buy this car?" ], 55 | [ "where can i buy that car?" ], 56 | [ "where can i buy these cars?" ], 57 | [ "where can i buy a car?" ], 58 | [ "how many calories in this" ], 59 | [ 60 | "i just ate a cheeseburger from mcdonald's", 61 | "how many calories does it have?" 62 | ] 63 | ] 64 | }, 65 | { 66 | "active": true, 67 | "name": "sam_i_am", 68 | "default_image": "tests/images/black.jpg", 69 | "conversations": [ 70 | [ { "text": "what does this say?", "image": "tests/images/chinese_sign_1.webp" } ], 71 | [ { "text": "translate this sign", "image": "tests/images/chinese_sign_2.webp" } ], 72 | [ { "text": "describe the image", "image": "tests/images/boxing_equipment.webp" } ], 73 | [ { "text": "what brand are these gloves", "image": "tests/images/winning_gloves.webp" } ], 74 | [ { "text": "what does this say", "image": "tests/images/winning_gloves.webp" } ], 75 | [ { "text": "what is this used for?", "image": "tests/images/agility_ladder.webp" } ], 76 | [ 77 | { "text": "what does this say?", "image": "tests/images/chinese_sign_1.webp" }, 78 | { "text": "translate this sign", "image": "tests/images/chinese_sign_2.webp" }, 79 | { "text": "describe the image", "image": "tests/images/boxing_equipment.webp" }, 80 | { "text": "what brand are these gloves", "image": "tests/images/winning_gloves.webp" }, 81 | { "text": "what does this say", "image": "tests/images/winning_gloves.webp" }, 82 | { "text": "what is this used for?", "image": "tests/images/agility_ladder.webp" } 83 | ], 84 | [ 85 | { "text": "where does boxing come from?", "image": "tests/images/boxing_ring.webp" }, 86 | { "text": "what recent movies feature boxing?", "image": "tests/images/boxing_ring.webp" }, 87 | { "text": "were any boxing movies released post 2020?", "image": "tests/images/boxing_ring.webp" }, 88 | { "text": "what is this space used for?", "image": "tests/images/boxing_ring.webp" } 89 | ] 90 | ] 91 | }, 92 | { 93 | "active": true, 94 | "name": "Bobak_GPT", 95 | "default_image": "tests/images/black.jpg", 96 | "conversations": [ 97 | [ "Who does Tucker Carlson currently work for?" ], 98 | 99 | [ "how can i learn to be a more confident person?" ], 100 | [ "what was the controversy surrounding pluto?" 
], 101 | [ 102 | "what is cerebral palsy and is there a known cure?", 103 | "how about a change a diet, would that help?" 104 | ], 105 | [ "im a pediatrician living in san diego -- how much should i get paid and what is the estimated tax to be paid on that?" ], 106 | [ "is there research which indicates what coffee is bad for the liver?" ], 107 | [ 108 | "how are scented candles manufactured?", 109 | "would it be more cost effective to manufacture them in the US or UK?" 110 | ], 111 | [ "why do iphones experience less OLED burn-in than Android phones even though they both use displays from Samsung?" ], 112 | [ "my partner and i having a disagreement over when I should take paternity leave -- should should we resolve this?" ], 113 | [ "my son is very interested in police and firemen but I'd like him to also be take interest in less stereotypically male things -- how should I do this?" ], 114 | [ 115 | "list coffee beans by origin", 116 | "which pair well with a walnut and fig scone?" 117 | ] 118 | ] 119 | }, 120 | { 121 | "active": false, 122 | "name": "Bobak_web_search_location_based", 123 | "default_image": "tests/images/black.jpg", 124 | "conversations": [ 125 | [ 126 | "does this place get good reviews?", 127 | "what are they known for?", 128 | "has there been a health code violation in this place in the last month?" 129 | ], 130 | [ 131 | "is there a mattress available on craigslist for less than $50?" 132 | ] 133 | ] 134 | }, 135 | { 136 | "active": true, 137 | "name": "Bobak_web_search", 138 | "default_image": "tests/images/black.jpg", 139 | "conversations": [ 140 | [ "who is openAI's newest board member?" ], 141 | [ "how many israeli hostages are still being held by hamas in gaza?" ], 142 | [ "how did Iran respond to pakistan's missile strike within its borders?" ], 143 | [ "any labor rights issues associated with H&M recently?" ], 144 | [ "how is nikki haley's polling right now?"] 145 | ] 146 | }, 147 | { 148 | "active": true, 149 | "name": "Bobak_web_search_image", 150 | "default_image": "tests/images/black.jpg", 151 | "conversations": [ 152 | [ { "text": "where can i buy this screw for less than a dollar online?", "image": "tests/images/screw.jpg" } ] 153 | ] 154 | }, 155 | { 156 | "active": true, 157 | "name": "GPT_only", 158 | "default_image": "tests/images/black.jpg", 159 | "conversations": [ 160 | [ 161 | "how can i learn to be a more confident person?" 162 | ], 163 | [ 164 | "what was the controversy surrounding pluto?" 165 | ], 166 | [ 167 | "what is cerebral palsy and is there a known cure?", 168 | "how about a change a diet, would that help?" 169 | ], 170 | [ 171 | " is there research which indicates what coffee is bad for the liver?" 172 | ], 173 | [ 174 | "how are scented candles manufactured?", 175 | "would it be more cost effective to manufacture them in the US or UK?" 176 | ], 177 | [ 178 | "why do iphones experience less OLED burn-in than Android phones even though they both use displays from Samsung?" 179 | ], 180 | [ 181 | "my partner and i having a disagreement over when I should take paternity leave, how should we resolve this?" 182 | ], 183 | [ 184 | "my son is very interested in police and firemen but I'd like him to also be take interest in less stereotypically male things, how should I do this?" 185 | ], 186 | [ 187 | "list coffee beans by origin which pair well with a walnut and fig scone?" 
188 | ] 189 | ] 190 | }, 191 | { 192 | "active": true, 193 | "name": "morning_6_serpapi.com", 194 | "default_image": "tests/images/black.jpg", 195 | "conversations": [ 196 | [ 197 | "How cold is it right now? Is it raining, is it safe to run?", 198 | "Suggest a new two mile running route for me", 199 | "Any constructions/road work I should know about?" 200 | ] 201 | ] 202 | }, 203 | { 204 | "active": true, 205 | "name": "morning_8_serpapi.com", 206 | "default_image": "tests/images/black.jpg", 207 | "conversations": [ 208 | [ "Give me an Apple News style summary of what's happening in new delhi" ] 209 | ] 210 | }, 211 | { 212 | "active": true, 213 | "name": "morning_6_GPT4", 214 | "default_image":"tests/images/gym_setup.jpg", 215 | "conversations": [ 216 | [ 217 | "Look at my gym set up, suggest a HIIT routine for me using the bike, the kettle balls and the bar", 218 | "Create a visual representations of workout plan. Include diagrams of specific exercises, sets, and reps.", 219 | "Can you log the calories burned from this workout" 220 | ] 221 | ] 222 | }, 223 | { 224 | "active": true, 225 | "name": "image_search_mouse", 226 | "default_image": "tests/images/hp_mouse_512x512.jpg", 227 | "conversations": [ 228 | [ "where can i find this in the lowest price?" ] 229 | ] 230 | }, 231 | { 232 | "active": true, 233 | "name": "image_search_candle", 234 | "default_image": "tests/images/black.jpg", 235 | "conversations": [ 236 | [ 237 | { "text": "what is this?", "image": "tests/images/candle.jpg" }, 238 | "Where can I buy it?" 239 | ] 240 | ] 241 | }, 242 | { 243 | "active": true, 244 | "name": "parking_sign", 245 | "conversations": [ 246 | [ 247 | { "text": "it is 7:50 pm, can I park here right now?", "image": "tests/images/parking_sign.jpg" } 248 | ] 249 | ] 250 | } 251 | ] -------------------------------------------------------------------------------- /vision/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import cv2 4 | import numpy as np 5 | from abc import ABC, abstractmethod 6 | from typing import List 7 | import os 8 | 9 | def detect_media_type(image_bytes: bytes) -> str: 10 | if image_bytes is not None: 11 | if image_bytes[0:4] == b"\x89PNG": 12 | return "image/png" 13 | elif b"JFIF" in image_bytes[0:64]: # probably should do a stricter check here 14 | return "image/jpeg" 15 | elif image_bytes[0:4] == b"RIFF" and image_bytes[8:12] == b"WEBP": 16 | return "image/webp" 17 | 18 | # Unknown: just assume JPEG 19 | return "image/jpeg" 20 | 21 | 22 | 23 | class BaseFilter(ABC): 24 | @abstractmethod 25 | def apply(self, image): 26 | pass 27 | 28 | @abstractmethod 29 | def adjust(self): 30 | pass 31 | 32 | class SmoothFilter(BaseFilter): 33 | def __init__(self, kernel_size=5): 34 | self.kernel_size = kernel_size 35 | 36 | def apply(self, image): 37 | print("Smoothing kernel size: ", self.kernel_size) 38 | # Apply Gaussian blur 39 | return cv2.GaussianBlur(image, (self.kernel_size, self.kernel_size), 0) 40 | 41 | def adjust(self, kernel_size=None): 42 | self.kernel_size = kernel_size 43 | 44 | class SaturationFilter(BaseFilter): 45 | def __init__(self, saturation=1.0): 46 | self.saturation = saturation 47 | 48 | def apply(self, image): 49 | print("Saturation value: ", self.saturation) 50 | # Convert the image to the HSV color space 51 | hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 52 | h, s, v = cv2.split(hsv) 53 | 54 | # Apply saturation to the S channel 55 | s = cv2.addWeighted(s, self.saturation, np.zeros_like(s), 0, 0) 56 | 57 | # Merge the 
channels back together 58 | hsv = cv2.merge([h, s, v]) 59 | return cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR) 60 | 61 | def adjust(self, saturation=None, scale=100): 62 | self.saturation = (saturation / scale)*10 if saturation > 0 else 0.1 63 | 64 | class TemperatureFilter(BaseFilter): 65 | def __init__(self, temperature=0.0): 66 | self.temperature = temperature 67 | 68 | def apply(self, image): 69 | print("Temperature value: ", self.temperature) 70 | # Convert the image to the LAB color space 71 | lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB) 72 | l, a, b = cv2.split(lab) 73 | 74 | # Apply temperature to the A channel 75 | a = cv2.addWeighted(a, self.temperature, np.zeros_like(a), 0, 0) 76 | 77 | # Merge the channels back together 78 | lab = cv2.merge([l, a, b]) 79 | return cv2.cvtColor(lab, cv2.COLOR_LAB2BGR) 80 | 81 | def adjust(self, temperature=None, scale=100): 82 | self.temperature = (temperature / scale)*10 if temperature > 0 else 0.1 83 | 84 | class GammaCorrectionFilter(BaseFilter): 85 | def __init__(self, gamma=1.0): 86 | self.gamma = gamma 87 | 88 | def apply(self, image): 89 | print("Gamma value: ", self.gamma) 90 | # Apply gamma correction using cv2.addWeighted() 91 | invGamma = 1.0 / self.gamma 92 | table = np.array([((i / 255.0) ** invGamma) * 255 for i in np.arange(0, 256)]).astype("uint8") 93 | return cv2.LUT(image, table) 94 | def adjust(self, gamma=None, scale=100): 95 | # adjust to 1, 10 scale depending on the slider 96 | self.gamma = (gamma / scale)*10 if gamma > 0 else 0.1 97 | self.gamma = gamma 98 | 99 | class BoostShadowFilter(BaseFilter): 100 | def __init__(self, amount=1): 101 | self.amount = amount 102 | 103 | def apply(self, image): 104 | gamma_corrected = np.power(image / 255.0, self.amount) * 255 105 | return np.uint8(gamma_corrected) 106 | 107 | def adjust(self, amount=None, scale=100): 108 | self.amount = (amount / scale)*10 if amount > 0 else 0.1 109 | 110 | class SharpeningFilter(BaseFilter): 111 | def __init__(self, sigma=1.0, strength=1.0): 112 | self.sigma = sigma 113 | self.strength = strength 114 | 115 | def apply(self, image): 116 | # Apply Gaussian blur 117 | print("Sharpening sigma: ", self.sigma, "Strength: ", self.strength) 118 | blurred = cv2.GaussianBlur(image, (0, 0), self.sigma) 119 | 120 | # Calculate the unsharp mask 121 | unsharp_mask = cv2.addWeighted(image, 1.0 + self.strength, blurred, -self.strength, 0) 122 | 123 | return unsharp_mask 124 | 125 | def adjust(self, amount=None, scale=100): 126 | self.amount = (amount / scale)*10 if amount > 0 else 0.1 127 | 128 | class NoiseReductionFilter(BaseFilter): 129 | def __init__(self, method='gaussian', kernel_size=5): 130 | self.method = method 131 | self.kernel_size = kernel_size 132 | 133 | def apply(self, image): 134 | print("Noise reduction method: ", self.method, "Kernel size: ", self.kernel_size) 135 | if self.method == 'gaussian': 136 | # Apply Gaussian blur 137 | blurred = cv2.GaussianBlur(image, (self.kernel_size, self.kernel_size), 0) 138 | elif self.method == 'median': 139 | # Apply Median blur 140 | blurred = cv2.medianBlur(image, self.kernel_size) 141 | else: 142 | raise ValueError("Unsupported noise reduction method. 
Use 'gaussian' or 'median'.") 143 | 144 | return blurred 145 | def adjust(self, method=None, kernel_size=None): 146 | self.method = method 147 | self.kernel_size = kernel_size 148 | 149 | class ContrastFilter(BaseFilter): 150 | def __init__(self, contrast=1.0): 151 | self.contrast = contrast 152 | 153 | def apply(self, image): 154 | print("Contrast value: ", self.contrast) 155 | # Apply contrast by converting the image to YUV color space 156 | yuv = cv2.cvtColor(image, cv2.COLOR_BGR2YUV) 157 | y, u, v = cv2.split(yuv) 158 | 159 | # Apply contrast to the Y channel 160 | y = cv2.addWeighted(y, self.contrast, np.zeros_like(y), 0, 0) 161 | 162 | # Merge the channels back together 163 | yuv = cv2.merge([y, u, v]) 164 | return cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR) 165 | 166 | def adjust(self, contrast=None, scale=100): 167 | self.contrast = contrast 168 | 169 | class BoostResolutionFilter(BaseFilter): 170 | def __init__(self, factor=2): 171 | self.factor = factor 172 | 173 | def apply(self, image): 174 | print("Resolution boost factor: ", self.factor) 175 | # Upscale the image using bicubic interpolation 176 | return cv2.resize(image, None, fx=self.factor, fy=self.factor, interpolation=cv2.INTER_CUBIC) 177 | 178 | def adjust(self, factor=None, scale=100): 179 | self.factor = factor 180 | 181 | class ApplyBlurFilter(BaseFilter): 182 | def __init__(self, kernel_size=5): 183 | self.kernel_size = kernel_size 184 | 185 | def apply(self, image): 186 | print("Blur kernel size: ", self.kernel_size) 187 | # Apply Gaussian blur 188 | return cv2.GaussianBlur(image, (self.kernel_size, self.kernel_size), 0) 189 | 190 | def adjust(self, kernel_size=None): 191 | self.kernel_size = kernel_size 192 | 193 | class ReduceResolutionFilter(BaseFilter): 194 | def __init__(self, factor=2): 195 | self.factor = factor 196 | 197 | def apply(self, image): 198 | print("Resolution reduction factor: ", self.factor) 199 | # Downscale the image using bicubic interpolation 200 | return cv2.resize(image, None, fx=1.0/self.factor, fy=1.0/self.factor, interpolation=cv2.INTER_CUBIC) 201 | 202 | def adjust(self, factor=None, scale=100): 203 | self.factor = factor 204 | 205 | class WhiteBalanceFilter(BaseFilter): 206 | def __init__(self): 207 | pass 208 | 209 | def apply(self, image): 210 | print("White balance") 211 | # Auto white balance by equalizing the histogram of the LAB L channel 212 | lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB) 213 | l, a, b = cv2.split(lab) 214 | clahe = cv2.createCLAHE(clipLimit=1.0, tileGridSize=(4, 4)) 215 | l = clahe.apply(l) 216 | lab = cv2.merge([l, a, b]) 217 | return cv2.cvtColor(lab, cv2.COLOR_LAB2BGR) 218 | 219 | def adjust(self, b_ratio=None, g_ratio=None, r_ratio=None): 220 | self.b_ratio = b_ratio 221 | self.g_ratio = g_ratio 222 | self.r_ratio = r_ratio 223 | 224 | class BrightnessFilter(BaseFilter): 225 | def __init__(self, brightness=0): 226 | self.brightness = brightness 227 | 228 | def apply(self, image): 229 | print("Brightness value: ", self.brightness) 230 | # Increase the brightness by adding the specified value to each pixel 231 | return cv2.add(image, np.array([self.brightness])) 232 | 233 | def adjust(self, brightness=None, scale=100): 234 | self.brightness = brightness 235 | 236 | class ImageProcessor: 237 | def __init__(self,path: str|List[str]|bytes, filters: List[BaseFilter]): 238 | self.path = path 239 | self.filters = filters 240 | if isinstance(path, list): 241 | self.image = [cv2.imread(p) for p in path] 242 | elif isinstance(path, bytes): 243 | nparr = np.frombuffer(path, 
np.uint8) 244 | self.image = cv2.imdecode(nparr, cv2.IMREAD_COLOR) 245 | else: 246 | self.image = cv2.imread(self.path) 247 | self.filtered_image = None 248 | 249 | def apply_filters(self, image=None): 250 | if image is None: 251 | image = self.image 252 | 253 | if isinstance(image, list): 254 | self.filtered_image = [None] * len(image) 255 | 256 | for i, img in enumerate(image): 257 | for f in self.filters: 258 | self.filtered_image[i] = f.apply(img) 259 | 260 | print("Filtered image: ", len(self.filtered_image)) 261 | else: 262 | for f in self.filters: 263 | image = f.apply(image) 264 | self.filtered_image = image 265 | return image 266 | 267 | def __call__(self, preview: bool = True): 268 | self.apply_filters() 269 | if preview: 270 | self.show_preview(slider=True) 271 | 272 | def show_preview(self, slider=False): 273 | if self.filtered_image is not None: 274 | if isinstance(self.filtered_image, list): 275 | for i, img in enumerate(self.filtered_image): 276 | filters_applied = ", ".join([f.__class__.__name__ for f in self.filters]) 277 | cv2.imshow('Original', self.image[i]) 278 | cv2.imshow('Filtered [{}]'.format(filters_applied), img) 279 | cv2.waitKey(0) 280 | cv2.destroyAllWindows() 281 | else: 282 | filters_applied = ", ".join([f.__class__.__name__ for f in self.filters]) 283 | cv2.imshow('Original', self.image) 284 | cv2.imshow('Filtered [{}]'.format(filters_applied), self.filtered_image) 285 | cv2.waitKey(0) 286 | cv2.destroyAllWindows() 287 | def save(self, dir_path: str="output"): 288 | dir_path = os.path.join(os.getcwd(), dir_path) 289 | if not os.path.exists(dir_path): 290 | os.makedirs(dir_path) 291 | if self.filtered_image is not None: 292 | if isinstance(self.filtered_image, list): 293 | for i, img in enumerate(self.filtered_image): 294 | basename = os.path.basename(self.path[i]) 295 | path = os.path.join(dir_path, basename.replace('.webp', '.jpg')) 296 | cv2.imwrite(path, img) 297 | else: 298 | if isinstance(self.path, bytes): 299 | basename = "test_image.jpg" 300 | else: 301 | basename = os.path.basename(self.path) 302 | path = os.path.join(dir_path, basename.replace('.webp', '.jpg')) 303 | print(path) 304 | cv2.imwrite(path, self.filtered_image) 305 | def get_bytes(self): 306 | if self.filtered_image is not None: 307 | if isinstance(self.filtered_image, list): 308 | return [cv2.imencode('.jpg', img)[1].tobytes() for img in self.filtered_image] 309 | else: 310 | return cv2.imencode('.jpg', self.filtered_image)[1].tobytes() 311 | return None 312 | 313 | def process_image(bytes: bytes)->bytes: 314 | filters:List[BaseFilter] = [ 315 | # BoostShadowFilter(amount=0.8), 316 | GammaCorrectionFilter(gamma=1.2), 317 | # BoostResolutionFilter(factor=), 318 | 319 | SharpeningFilter(sigma=0.5, strength=5.0), 320 | WhiteBalanceFilter(), 321 | 322 | BoostShadowFilter(amount=1.2), 323 | ContrastFilter(contrast=1.2), 324 | SaturationFilter(saturation=1.2), 325 | TemperatureFilter(temperature=1.02), 326 | ApplyBlurFilter(kernel_size=3), 327 | 328 | # ReduceResolutionFilter(factor=2), 329 | 330 | # NoiseReductionFilter(method='gaussian', kernel_size=3), 331 | ] 332 | image_processor = ImageProcessor(path=bytes, filters=filters) 333 | image_processor.apply_filters() 334 | # image_processor.save("output") 335 | return image_processor.get_bytes() -------------------------------------------------------------------------------- /tests/benchmark.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "active": true, 4 | "name": 
"standalone_conversations", 5 | "default_image": "tests/images/black.jpg", 6 | "conversations": [ 7 | [ 8 | { "text": "hi", "capabilities": [ "assistant_knowledge" ] }, 9 | { "text": "Who is the CEO of Brilliant Labs?", "capabilities": [ "web_search" ] }, 10 | { "text": "What is this?", "image": "tests/images/house_plant.jpg", "capabilities": [ "vision" ] } 11 | ], 12 | [ 13 | { "text": "What's the weather like today?", "capabilities": [ "web_search" ] } 14 | ], 15 | [ 16 | { "text": "What time is it?", "capabilities": [ "assistant_knowledge" ] }, 17 | { "text": "What about now?", "capabilities": [ "assistant_knowledge" ] }, 18 | { "text": "And now?", "capabilities": [ "assistant_knowledge" ] } 19 | ], 20 | [ 21 | { "text": "What am I doing?", "image": "tests/images/Zoom_Call.jpg", "capabilities": [ "vision" ] }, 22 | { "text": "How did you know?", "image": "tests/images/Zoom_Call.jpg", "capabilities": [ "vision" ] } 23 | ], 24 | [ 25 | { "text": "Who do you think made this piece of art?", "image": "tests/images/child_artwork.jpg", "capabilities": [ "vision" ] }, 26 | { "text": "How old do you think this artist is?", "image": "tests/images/child_artwork.jpg", "capabilities": [ "vision" ] }, 27 | { "text": "Try to guess based on the nature of the art.", "image": "tests/images/child_artwork.jpg", "capabilities": [ "vision" ] } 28 | ], 29 | [ 30 | { "text": "How many beds are in this room?", "image": "tests/images/two_beds.jpg", "capabilities": [ "vision" ] }, 31 | { "text": "Try again. Are you sure there is only one bed?", "image": "tests/images/two_beds.jpg", "capabilities": [ "vision" ] }, 32 | { "text": "What's the best way I can make this bed to optimize for efficiency and process consistency on a daily basis?", "image": "tests/images/two_beds.jpg", "capabilities": [ "vision" ] }, 33 | { "text": "I'm not familiar with the military corner method. Can you please talk me through it step by step?", "capabilities_any": [ "assistant_knowledge", "web_search" ] } 34 | ], 35 | [ 36 | { "text": "What does this say and what do you think it means?", "image": "tests/images/motivational_sign.jpg", "capabilities": [ "vision" ] }, 37 | { "text": "What would be a different way to phrase this while still achieving the same meaning?", "image": "tests/images/motivational_sign.jpg", "capabilities": [ "assistant_knowledge" ] } 38 | ], 39 | [ 40 | { "text": "What culture style of carpeting weaving is this?", "image": "tests/images/persian_rug.jpg", "capabilities_any": [ "vision" ] }, 41 | { "text": "Yes I understand that but what characteristics of the pattern itself indicate to you that this is a Persian style carpet?", "image": "tests/images/persian_rug.jpg", "capabilities_any": [ "vision" ] } 42 | ], 43 | [ 44 | { "text": "How many different types of toys are strewn about this messy room?", "image": "tests/images/toys_strewn_about.jpg", "capabilities": [ "vision" ] }, 45 | { "text": "Interesting. 
Can you list them?", "image": "tests/images/toys_strewn_about.jpg", "capabilities": [ "vision" ] } 46 | ], 47 | [ 48 | { "text": "Who made this car?", "image": "tests/images/toy_car.jpg", "capabilities": [ "vision" ] }, 49 | { "text": "Which country has the most police cars Noa?", "image": "tests/images/toy_car.jpg", "capabilities": [ "web_search" ] }, 50 | { "text": "How many police cars does it have?", "image": "tests/images/toy_car.jpg", "capabilities": [ "web_search" ] }, 51 | { "text": "Try to make an educated guess based on what you know", "image": "tests/images/toy_car.jpg", "capabilities": [ "web_search" ] }, 52 | { "text": "I am asking you to simply guess. Don't overthink it.", "image": "tests/images/toy_car.jpg", "capabilities": [ "web_search" ] } 53 | ], 54 | [ 55 | { "text": "How many stairs do you count?", "image": "tests/images/staircase.jpg", "capabilities": [ "vision" ] }, 56 | { "text": "What material does it look like the stairs are made out of?", "image": "tests/images/staircase.jpg", "capabilities": [ "vision" ] } 57 | ], 58 | [ 59 | { "text": "How could this track be improved?", "image": "tests/images/toy_track_incomplete.jpg", "capabilities": [ "vision" ] }, 60 | { "text": "How about the shape and design of the track?", "image": "tests/images/toy_track_incomplete.jpg", "capabilities": [ "vision" ] } 61 | ], 62 | [ 63 | { "text": "What interior design style do you see here?", "image": "tests/images/living_room.jpg", "capabilities": [ "vision" ] }, 64 | { "text": "How do you think I could further improve the beauty and functionality of this space?", "image": "tests/images/living_room.jpg", "capabilities": [ "vision" ] } 65 | ], 66 | [ 67 | { "text": "What is this and what do you think it's used for?", "image": "tests/images/pickled_snake.jpg", "capabilities": [ "vision" ] } 68 | ], 69 | [ 70 | { "text": "Do you know who this train is?", "image": "tests/images/thomas_the_train.jpg", "capabilities": [ "vision" ] } 71 | ], 72 | [ 73 | { "text": "How would you describe the weather today?", "image": "tests/images/dreary_day.jpg", "capabilities_any": [ "vision", "web_search" ] }, 74 | { "text": "Try again. Look at the sky.", "image": "tests/images/dreary_day.jpg", "capabilities": [ "vision" ] } 75 | ], 76 | [ 77 | { "text": "What plant is this and how do I take care of it?", "image": "tests/images/house_plant.jpg", "capabilities": [ "vision" ] } 78 | ], 79 | [ 80 | { "text": "What are the directions to play this game?", "image": "tests/images/candyland.jpg", "capabilities": [ "vision" ] } 81 | ], 82 | [ 83 | { "text": "Is this person dressed appropriately for the weather?", "image": "tests/images/bobak_dressed_for_weather.jpg", "capabilities": [ "vision" ] }, 84 | { "text": "How old does he look?", "image": "tests/images/bobak_dressed_for_weather.jpg", "capabilities": [ "vision" ] }, 85 | { "text": "Just take a guess based purely on what you see.", "image": "tests/images/bobak_dressed_for_weather.jpg", "capabilities": [ "vision" ] }, 86 | { "text": "Simply guess, it's ok to make a mistake.", "image": "tests/images/bobak_dressed_for_weather.jpg", "capabilities": [ "vision" ] }, 87 | { "text": "What led you to guess the person is in their mid 30's?", "image": "tests/images/bobak_dressed_for_weather.jpg", "capabilities": [ "vision" ] }, 88 | { "text": "Unpack that, please. 
Tell me more.", "image": "tests/images/bobak_dressed_for_weather.jpg", "capabilities": [ "vision" ] } 89 | ], 90 | [ 91 | { "text": "What kind of flowers are these and is there a place nearby where I can buy them?", "image": "tests/images/flower_arrangement.jpg", "capabilities_any": [ "vision" ] }, 92 | { "text": "How much do you think they would be for a bouquet?", "image": "tests/images/flower_arrangement.jpg", "capabilities_any": [ "vision", "web_search" ] } 93 | ], 94 | [ 95 | { "text": "What do these paintings say about the psychological state of the artist?", "image": "tests/images/colorful_landscape_paintings.jpg", "capabilities": [ "vision" ] } 96 | ], 97 | [ 98 | { "text": "What's a good interface modality for a pair of smart glasses like these?", "image": "tests/images/frame_smart_glasses.jpg", "capabilities_any": [ "vision" ] } 99 | ], 100 | [ 101 | { "text": "What does this say?", "image": "tests/images/STEM_card_Chinese.jpg", "capabilities": [ "vision" ] }, 102 | { "text": "OK but can you translate the Chinese into English?", "image": "tests/images/STEM_card_Chinese.jpg", "capabilities": [ "vision" ] }, 103 | { "text": "What does this say?", "image": "tests/images/STEM_card_Chinese_closeup.jpg", "capabilities": [ "vision" ] }, 104 | { "text": "How about the larger characters? Can you translate those to English?", "image": "tests/images/STEM_card_Chinese_closeup.jpg", "capabilities": [ "vision" ] }, 105 | { "text": "That is incorrect. Try again.", "image": "tests/images/STEM_card_Chinese_closeup.jpg", "capabilities": [ "vision" ] } 106 | ], 107 | [ 108 | { "text": "Describe this and any cultural meaning.", "image": "tests/images/chinese_dragon.jpg", "capabilities": [ "vision" ] }, 109 | { "text": "That's really cool, is this the year of the dragon?", "image": "tests/images/chinese_dragon.jpg", "capabilities": [ "assistant_knowledge", "web_search" ] } 110 | ], 111 | [ 112 | { "text": "How many cars do you see and what kind of cars are they?", "image": "tests/images/cars_behind_wall.jpg", "capabilities": [ "vision" ] } 113 | ], 114 | [ 115 | { "text": "How many golf balls can fit in this trash can?", "image": "tests/images/small_trash_can.jpg", "capabilities": [ "vision" ] } 116 | ], 117 | [ 118 | { "text": "What is this book about and where can I get it?", "image": "tests/images/book_viral_justice.jpg", "capabilities": [ "vision" ] } 119 | ], 120 | [ 121 | { "text": "What is this about and where can I get it?", "image": "tests/images/book_atomic_habits.jpg", "capabilities": [ "vision" ] } 122 | ], 123 | [ 124 | { "text": "What is this and where can I get it?", "image": "tests/images/airpod_case.jpg", "capabilities": [ "vision" ] } 125 | ], 126 | [ 127 | { "text": "What is this and where can I get it?", "image": "tests/images/airpod_case_open.jpg", "capabilities": [ "vision" ] }, 128 | { "text": "Who makes these earpods and what are they called?", "image": "tests/images/airpod_case_open.jpg", "capabilities": [ "vision" ] } 129 | ], 130 | [ 131 | { "text": "Who makes this hat?", "image": "tests/images/ua_hat.jpg", "capabilities": [ "vision" ] }, 132 | { "text": "Try analyzing the logo on the hat as a clue to who the manufacturer might be.", "image": "tests/images/ua_hat.jpg", "capabilities": [ "vision" ] } 133 | ], 134 | [ 135 | { "text": "What is the weather forecast for Beijing tonight?", "capabilities": [ "web_search" ] }, 136 | { "text": "Is it going to rain?", "capabilities": [ "web_search" ] }, 137 | { "text": "Thanks, ok, then I will bring my pet lovebird inside. 
Her name is Sunny. What's an alternative quirky name that I could give her which is reminiscent of funk Shakespearean pop?", "capabilities": [ "assistant_knowledge" ] }, 138 | { "text": "That name is absolutely rad. What was your inspiration?", "capabilities": [ "assistant_knowledge" ] } 139 | ], 140 | [ 141 | { "text": "What kind of dog is this and why would it be dressed that way?", "image": "tests/images/pug_asian_clothing.jpg", "capabilities_any": [ "vision" ] }, 142 | { "text": "My wife and I have been married for 10 years and our eldest child is 6. How long were we married before we had our first child?", "capabilities": [ "assistant_knowledge" ] } 143 | ], 144 | [ 145 | { "text": "Which actor is this?", "image": "tests/images/some_old_actor.jpg", "capabilities": [ "vision" ] } 146 | ], 147 | [ 148 | { "text": "What can you make with these ingredients?", "image": "tests/images/eggs_banana_avocado.jpg", "capabilities_any": [ "vision", "vision" ] } 149 | ], 150 | [ 151 | { "text": "Can you give me a nuitritional breakdown of this?", "image": "tests/images/popcorn.jpg", "capabilities_any": [ "vision", "vision" ] } 152 | ], 153 | [ 154 | { "text": "Who designed this building?", "image": "tests/images/Wangjing_Soho.jpg", "capabilities": [ "vision" ] } 155 | ], 156 | [ 157 | { "text": "How tall is this figurine?", "image": "tests/images/figurine_on_table.jpg", "capabilities": [ "vision" ] } 158 | ], 159 | [ 160 | { "text": "it is 7:50 pm, can I park here right now?", "image": "tests/images/parking_sign.jpg", "capabilities": [ "vision" ] } 161 | ], 162 | [ 163 | { "text": "What is Apple's stock price?", "capabilities": [ "web_search" ] }, 164 | { "text": "Why has it been performing the way it has?", "capabilities": [ "web_search" ] } 165 | ] 166 | ] 167 | }, 168 | { 169 | "active": true, 170 | "name": "mixed_conversations", 171 | "default_image": "tests/images/black.jpg", 172 | "conversations": [ 173 | [ 174 | { "text": "What can you make with these ingredients?", "image": "tests/images/eggs_banana_avocado.jpg", "capabilities_any": [ "vision" ] }, 175 | { "text": "Describe this and any cultural meaning.", "image": "tests/images/chinese_dragon.jpg", "capabilities": [ "vision" ] } 176 | ], 177 | [ 178 | { "text": "What are the directions to play this game?", "image": "tests/images/candyland.jpg", "capabilities": [ "vision" ] }, 179 | { "text": "How many stairs do you count?", "image": "tests/images/staircase.jpg", "capabilities": [ "vision" ] } 180 | ], 181 | [ 182 | { "text": "What kind of flowers are these and is there a place nearby where I can buy them?", "image": "tests/images/flower_arrangement.jpg", "capabilities_any": [ "vision" ] }, 183 | { "text": "How much do you think they would be for a bouquet?", "image": "tests/images/flower_arrangement.jpg", "capabilities_any": [ "web_search", "vision" ] }, 184 | { "text": "What am I doing?", "image": "tests/images/Zoom_Call.jpg", "capabilities": [ "vision" ] } 185 | ], 186 | [ 187 | { "text": "What does this say?", "image": "tests/images/chinese_sign_1.webp", "capabilities": [ "vision" ] }, 188 | { "text": "What brand are these gloves?", "image": "tests/images/winning_gloves.webp", "capabilities_any": [ "vision" ] }, 189 | { "text": "What are they used for?", "capabilities": [ "web_search" ] } 190 | ], 191 | [ 192 | { "text": "How many cars do you see and what kind of cars are they?", "image": "tests/images/cars_behind_wall.jpg", "capabilities": [ "vision" ] }, 193 | { "text": "How many golf balls can fit in this trash can?", "image": 
"tests/images/small_trash_can.jpg", "capabilities": [ "vision" ] }, 194 | { "text": "What's the latest on Moscow?", "capabilities": [ "web_search" ] }, 195 | { "text": "What is this?", "image": "tests/images/candle.jpg", "capabilities": [ "vision" ] } 196 | ] 197 | ] 198 | } 199 | ] -------------------------------------------------------------------------------- /run_benchmark.py: -------------------------------------------------------------------------------- 1 | # 2 | # run_benchmark.py 3 | # 4 | # Benchmark queries for AI assistant. Used for testing and assessing the quality of assistant 5 | # responses. This script talks to a production endpoint and not the Python assistant server 6 | # directly. Simply run: 7 | # 8 | # python run_benchmark.py tests/tests.json 9 | # 10 | # Use --help for more instructions. 11 | # 12 | 13 | import argparse 14 | from datetime import datetime 15 | from enum import Enum 16 | import json 17 | import os 18 | import requests 19 | from typing import List, Optional 20 | 21 | import numpy as np 22 | from pydantic import BaseModel, RootModel 23 | 24 | from models import Capability, MultimodalResponse 25 | 26 | 27 | #################################################################################################### 28 | # Test Case JSON and Evaluation 29 | #################################################################################################### 30 | 31 | class UserMessage(BaseModel): 32 | text: str 33 | image: Optional[str] = None 34 | capabilities: Optional[List[Capability]] = None # capabilities that are required to have been used 35 | capabilities_any: Optional[List[Capability]] = None # must use at least one of the capabilities listed here 36 | 37 | class TestCase(BaseModel): 38 | name: str 39 | active: bool 40 | default_image: Optional[str] = None 41 | conversations: List[List[UserMessage | str]] 42 | 43 | class TestCaseFile(RootModel): 44 | root: List[TestCase] 45 | 46 | class TestResult(str, Enum): 47 | FAILED = "FAILED" 48 | IGNORED = "IGNORED" 49 | PASSED = "PASSED" 50 | 51 | def load_tests(filepath: str) -> List[TestCase]: 52 | with open(file=filepath, mode="r") as fp: 53 | text = fp.read() 54 | return TestCaseFile.model_validate_json(json_data=text).root 55 | 56 | def evaluate_capabilities_used(input: UserMessage, output: MultimodalResponse) -> TestResult: 57 | # Do we have anything to evaluate against? 
58 | has_required_capabilities = input.capabilities is not None and len(input.capabilities) > 0 59 | has_any_capabilities = input.capabilities_any is not None and len(input.capabilities_any) > 0 60 | if (not has_required_capabilities) and (not has_any_capabilities): 61 | # Ignore if desired test results are not specified 62 | return TestResult.IGNORED 63 | 64 | capabilities_used = output.capabilities_used 65 | 66 | # Evaluate result against required capabilities 67 | if has_required_capabilities: 68 | for required_capability in input.capabilities: 69 | if required_capability not in capabilities_used: 70 | return TestResult.FAILED 71 | 72 | # Evaluate result against "any capabilities" (an OR function) 73 | if has_any_capabilities: 74 | any_present = False 75 | for interchangeable_capability in input.capabilities_any: 76 | if interchangeable_capability in capabilities_used: 77 | any_present = True 78 | if not any_present: 79 | return TestResult.FAILED 80 | 81 | return TestResult.PASSED 82 | 83 | 84 | #################################################################################################### 85 | # Helper Functions 86 | #################################################################################################### 87 | 88 | def load_binary_file(filepath: str) -> bytes: 89 | with open(file=filepath, mode="rb") as fp: 90 | return fp.read() 91 | 92 | 93 | #################################################################################################### 94 | # Markdown Report Generation 95 | #################################################################################################### 96 | 97 | class ReportGenerator: 98 | def __init__(self, test_filepath: str, generate_markdown: bool): 99 | self._generate_markdown = generate_markdown 100 | if not generate_markdown: 101 | return 102 | base = os.path.splitext(os.path.basename(test_filepath))[0] 103 | filename = f"{base}.md" 104 | self._fp = open(file=filename, mode="w") 105 | self._fp.write(f"# {test_filepath}\n\n") 106 | self._total_times = [] 107 | 108 | def __del__(self): 109 | if not self._generate_markdown: 110 | return 111 | self._fp.close() 112 | 113 | def begin_test(self, name: str): 114 | self._total_times = [] 115 | if not self._generate_markdown: 116 | return 117 | self._fp.write(f"## {name}\n\n") 118 | self._fp.write(f"|Passed?|User|Assistant|Image|Debug|\n") 119 | self._fp.write(f"|-------|----|---------|-----|-----|\n") 120 | 121 | def begin_conversation(self): 122 | if not self._generate_markdown: 123 | return 124 | self._fp.write("|\\-\\-\\-\\-\\-\\-\\-\\-|\\-\\-\\-\\-\\-\\-\\-\\-|\\-\\-\\-\\-\\-\\-\\-\\-|\\-\\-\\-\\-\\-\\-\\-\\-|\\-\\-\\-\\-\\-\\-\\-\\-|\n") 125 | 126 | def end_conversation(self): 127 | pass 128 | 129 | def add_result(self, user_message: UserMessage, response: MultimodalResponse, assistant_response: str, test_result: TestResult): 130 | if not self._generate_markdown: 131 | return 132 | passed_column = f"{test_result.value}" 133 | user_column = self._escape(user_message.text) 134 | assistant_column = self._escape(assistant_response) 135 | image_column = f"\"image\"" if user_message.image is not None else "" 136 | debug_column = f"```{response.debug_tools}```" 137 | self._fp.write(f"|{passed_column}|{user_column}|{assistant_column}|{image_column}|{debug_column}|\n") 138 | 139 | # Timings 140 | try: 141 | timings = json.loads(response.timings) 142 | self._total_times.append(float(timings["total_time"])) 143 | except: 144 | pass 145 | 146 | def end_test(self, num_passed: int, num_evaluated: int): 
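# Writes the per-test score line and a latency percentile table to the Markdown report.
# Assumes at least one "total_time" was parsed into self._total_times; np.min/np.max raise
# on an empty list.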
147 | if not self._generate_markdown: 148 | return 149 | mean_time = np.mean(self._total_times) 150 | median_time = np.median(self._total_times) 151 | min_time = np.min(self._total_times) 152 | max_time = np.max(self._total_times) 153 | pct90_time = np.quantile(self._total_times, q=0.9) 154 | pct95_time = np.quantile(self._total_times, q=0.95) 155 | pct99_time = np.quantile(self._total_times, q=0.99) 156 | if num_evaluated == 0: 157 | self._fp.write(f"**Score: N/A**\n\n") 158 | else: 159 | self._fp.write(f"**Score: {100.0 * num_passed / num_evaluated : .1f}%**\n\n") 160 | self._fp.write(f"**Timings**\n") 161 | self._fp.write(f"|Mean|Median|Min|Max|90%|95%|99%|\n") 162 | self._fp.write(f"|----|------|---|---|---|---|---|\n") 163 | self._fp.write(f"|{mean_time:.1f}|{median_time:.1f}|{min_time:.1f}|{max_time:.1f}|{pct90_time:.1f}|{pct95_time:.1f}|{pct99_time:.1f}|\n\n") 164 | 165 | @staticmethod 166 | def _escape(text: str) -> str: 167 | special_chars = "\\`'\"*_{}[]()#+-.!" 168 | escaped_text = ''.join(['\\' + char if char in special_chars else char for char in text]) 169 | return escaped_text.replace("\n", " ") 170 | 171 | 172 | #################################################################################################### 173 | # Main Program 174 | #################################################################################################### 175 | 176 | if __name__ == "__main__": 177 | parser = argparse.ArgumentParser("run_benchmark") 178 | parser.add_argument("file", nargs=1) 179 | parser.add_argument("--endpoint", action="store", default="https://api.brilliant.xyz/dev/noa/mm", help="Address to send request to (Noa server)") 180 | parser.add_argument("--token", action="store", help="Noa API token") 181 | parser.add_argument("--test", metavar="name", help="Run specific test") 182 | parser.add_argument("--markdown", action="store_true", help="Produce report in markdown file") 183 | parser.add_argument("--vision", action="store", help="Vision model to use (gpt-4o, gpt-4-vision-preview, claude-3-haiku-20240307, claude-3-sonnet-20240229, claude-3-opus-20240229)", default="gpt-4o") 184 | parser.add_argument("--address", action="store", default="San Francisco, CA 94115", help="Simulated location") 185 | options = parser.parse_args() 186 | 187 | # Load tests 188 | tests = load_tests(filepath=options.file[0]) 189 | 190 | # Markdown report generator 191 | report = ReportGenerator(test_filepath=options.file[0], generate_markdown=options.markdown) 192 | 193 | # Authorization header 194 | headers = { 195 | "Authorization": options.token if options.token is not None else os.getenv("BRILLIANT_API_KEY") 196 | } 197 | 198 | # Metrics 199 | total_user_prompts = 0 200 | total_tokens_in = 0 201 | total_tokens_out = 0 202 | total_times = [] 203 | localhost = options.endpoint == "localhost" 204 | 205 | # Run all active tests 206 | for test in tests: 207 | if not options.test: 208 | # No specific test, run all that are active 209 | if not test.active: 210 | continue 211 | else: 212 | if test.name.lower().strip() != options.test.lower().strip(): 213 | continue 214 | 215 | print(f"Test: {test.name}") 216 | report.begin_test(name=test.name) 217 | num_evaluated = 0 218 | num_passed = 0 219 | 220 | for conversation in test.conversations: 221 | report.begin_conversation() 222 | 223 | # Create new message history for each conversation 224 | history = [] 225 | for user_message in conversation: 226 | # Each user message can be either a string or a UserMessage object 227 | assert isinstance(user_message, str) 
or isinstance(user_message, UserMessage) 228 | if isinstance(user_message, str): 229 | user_message = UserMessage(text=user_message) 230 | 231 | # If there is no image associated with this message, use the default image, if it 232 | # exists 233 | if user_message.image is None and test.default_image is not None: 234 | user_message = user_message.model_copy() 235 | user_message.image = test.default_image 236 | 237 | # Construct API call data 238 | if localhost: 239 | options.endpoint = "http://localhost:8000/mm" 240 | data = { 241 | "mm": json.dumps({ 242 | "prompt": user_message.text, 243 | "messages": history, 244 | "address": options.address, 245 | "local_time": datetime.now().strftime("%A, %B %d, %Y, %I:%M %p"), 246 | "search_api": "perplexity", 247 | "config": { "engine": "google_lens" }, 248 | "experiment": "1", 249 | "vision": options.vision 250 | } 251 | ), 252 | } 253 | else: 254 | data = { 255 | "prompt": user_message.text, 256 | "messages": json.dumps(history), 257 | "address": options.address, 258 | "local_time": datetime.now().strftime("%A, %B %d, %Y, %I:%M %p"), 259 | "search_api": "perplexity", 260 | "config": json.dumps({ "engine": "google_lens" }), 261 | "experiment": "1", # this activates the passthrough to the Python ai-experiments code 262 | "vision": options.vision 263 | } 264 | files = {} 265 | if user_message.image is not None: 266 | files["image"] = (os.path.basename(user_message.image), load_binary_file(filepath=user_message.image)) 267 | 268 | # Make the call and evaluate 269 | response = requests.post(url=options.endpoint, files=files, data=data, headers=headers) 270 | error = False 271 | try: 272 | if response.status_code != 200: 273 | print(f"Error: {response.status_code}") 274 | print(response.content) 275 | response.raise_for_status() 276 | #print(response.content) 277 | mm_response = MultimodalResponse.model_validate_json(json_data=response.content) 278 | #print("Sent:") 279 | #print(json.dumps(history)) 280 | 281 | test_result = evaluate_capabilities_used(input=user_message, output=mm_response) 282 | if test_result != TestResult.IGNORED: 283 | num_evaluated += 1 284 | num_passed += (1 if test_result == TestResult.PASSED else 0) 285 | 286 | history.append({ "role": "user", "content": user_message.text }) 287 | 288 | assistant_response = "" 289 | if len(mm_response.response) > 0: 290 | assistant_response = mm_response.response 291 | elif len(mm_response.image) > 0: 292 | assistant_response = "" 293 | if len(assistant_response) > 0: 294 | history.append({ "role": "assistant", "content": assistant_response }) 295 | 296 | timings = json.loads(mm_response.timings) 297 | 298 | print(f"User: {user_message.text}" + (f" ({user_message.image})" if user_message.image else "")) 299 | print(f"Response: {assistant_response}") 300 | print(f"Tools: {mm_response.debug_tools}") 301 | print(f"Timings: {timings}") 302 | #pct_out = float(content["output_tokens"]) / float(content["total_tokens"]) * 100.0 303 | #print(f"Tokens: in={content['input_tokens']}, out={content['output_tokens']} %out={pct_out:.0f}%") 304 | print(f"Test: {test_result}") 305 | print("") 306 | report.add_result(user_message=user_message, response=mm_response, assistant_response=assistant_response, test_result=test_result) 307 | 308 | total_user_prompts += 1 309 | total_tokens_in += mm_response.input_tokens 310 | total_tokens_out += mm_response.output_tokens 311 | 312 | total_times.append(float(timings["total_time"])) 313 | 314 | except Exception as e: 315 | print(f"Error: {e}") 316 | 317 | 
report.end_conversation() 318 | 319 | # Print test results 320 | print("") 321 | print(f"TEST RESULTS: {test.name}") 322 | if num_evaluated == 0: 323 | print(f" Score: N/A") 324 | else: 325 | print(f" Score: {num_passed}/{num_evaluated} = {100.0 * num_passed / num_evaluated : .1f}%") 326 | report.end_test(num_passed=num_passed, num_evaluated=num_evaluated) 327 | 328 | # Summary 329 | print(f"User messages: {total_user_prompts}") 330 | print(f"Total input tokens: {total_tokens_in}") 331 | print(f"Total output tokens: {total_tokens_out}") 332 | print(f"Average input tokens: {total_tokens_in / total_user_prompts}") 333 | print(f"Average output tokens: {total_tokens_out / total_user_prompts}") 334 | 335 | # Timings 336 | mean_time = np.mean(total_times) 337 | median_time = np.median(total_times) 338 | min_time = np.min(total_times) 339 | max_time = np.max(total_times) 340 | pct90_time = np.quantile(total_times, q=0.9) 341 | pct95_time = np.quantile(total_times, q=0.95) 342 | pct99_time = np.quantile(total_times, q=0.99) 343 | print("") 344 | print("Timing") 345 | print("------") 346 | print(f"Mean : {mean_time:.1f}") 347 | print(f"Median: {median_time:.1f}") 348 | print(f"Min : {min_time:.1f}") 349 | print(f"Max : {max_time:.1f}") 350 | print(f"90% : {pct90_time:.1f}") 351 | print(f"95% : {pct95_time:.1f}") 352 | print(f"99% : {pct99_time:.1f}") -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | # 2 | # app.py 3 | # 4 | # Noa assistant server application. Provides /mm endpoint. 5 | # 6 | 7 | import asyncio 8 | from datetime import datetime 9 | from io import BytesIO 10 | import os 11 | import traceback 12 | from typing import Annotated, Dict, List, Tuple 13 | import glob 14 | import openai 15 | import anthropic 16 | import groq 17 | from pydantic import BaseModel, ValidationError 18 | from pydub import AudioSegment 19 | from fastapi import FastAPI, status, Form, UploadFile, Request 20 | from pydantic import BaseModel, ValidationError 21 | from fastapi.exceptions import HTTPException 22 | from fastapi.encoders import jsonable_encoder 23 | 24 | from models import Capability, TokenUsage, SearchAPI, VisionModel, GenerateImageService, MultimodalRequest, MultimodalResponse, ExtractLearnedContextRequest, ExtractLearnedContextResponse 25 | from web_search import WebSearch, DataForSEOWebSearch, SerpWebSearch, PerplexityWebSearch 26 | from vision import Vision, GPT4Vision, ClaudeVision 27 | from vision.utils import process_image 28 | from generate_image import ReplicateGenerateImage 29 | from assistant import Assistant, AssistantResponse, GPTAssistant, ClaudeAssistant, extract_learned_context 30 | 31 | 32 | #################################################################################################### 33 | # Configuration 34 | #################################################################################################### 35 | 36 | EXPERIMENT_AI_PORT = os.environ.get('EXPERIMENT_AI_PORT',8000) 37 | PERPLEXITY_API_KEY = os.environ.get("PERPLEXITY_API_KEY", None) 38 | ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", None) 39 | 40 | 41 | #################################################################################################### 42 | # Server API 43 | #################################################################################################### 44 | 45 | app = FastAPI() 46 | 47 | class Checker: 48 | def __init__(self, model: BaseModel): 49 | self.model = 
model 50 | 51 | def __call__(self, data: str = Form(...)): 52 | try: 53 | return self.model.model_validate_json(data) 54 | except ValidationError as e: 55 | raise HTTPException( 56 | detail=jsonable_encoder(e.errors()), 57 | status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, 58 | ) 59 | 60 | async def transcribe(client: openai.AsyncOpenAI, audio_bytes: bytes) -> str: 61 | # Create a file-like object for Whisper API to consume 62 | audio = AudioSegment.from_file(BytesIO(audio_bytes)) 63 | buffer = BytesIO() 64 | buffer.name = "voice.mp4" 65 | audio.export(buffer, format="mp4") 66 | 67 | # Whisper 68 | transcript = await client.audio.translations.create( 69 | model="whisper-1", 70 | file=buffer, 71 | ) 72 | return transcript.text 73 | 74 | def validate_assistant_model(model: str | None, models: List[str]) -> str: 75 | """ 76 | Ensures a valid model is selected. 77 | 78 | Parameters 79 | ---------- 80 | model : str | None 81 | Model name to use. 82 | models : List[str] 83 | List of valid models. The first model is the default model. 84 | 85 | Returns 86 | ------- 87 | str 88 | If the model name is in the list, returns it as-is, otherwise returns the first model in the 89 | list by default. 90 | """ 91 | if model is None or model not in models: 92 | return models[0] 93 | return model 94 | 95 | def get_assistant(app, mm: MultimodalRequest) -> Tuple[Assistant, str | None]: 96 | assistant_model = mm.assistant_model 97 | 98 | # Default assistant if none selected 99 | if mm.assistant is None or (mm.assistant not in [ "gpt", "claude", "groq" ]): 100 | return app.state.assistant, None # None for assistant_model will force assistant to use its own internal default choice 101 | 102 | # Return assistant and a valid model for it 103 | if mm.assistant == "gpt": 104 | assistant_model = validate_assistant_model(model=mm.assistant_model, models=[ "gpt-4o", "gpt-3.5-turbo-1106", "gpt-3.5-turbo", "gpt-4-turbo", "gpt-4-turbo-2024-04-09", "gpt-4-turbo-preview", "gpt-4-1106-preview" ]) 105 | if mm.openai_key and len(mm.openai_key) > 0: 106 | return GPTAssistant(client=openai.AsyncOpenAI(api_key=mm.openai_key)), assistant_model 107 | return GPTAssistant(client=app.state.openai_client), assistant_model 108 | elif mm.assistant == "claude": 109 | assistant_model = validate_assistant_model(model=mm.assistant_model, models=[ "claude-3-sonnet-20240229", "claude-3-haiku-20240307", "claude-3-opus-20240229" ]) 110 | return ClaudeAssistant(client=app.state.anthropic_client), assistant_model 111 | elif mm.assistant == "groq": 112 | assistant_model = validate_assistant_model(model=mm.assistant_model, models=[ "llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it" ]) 113 | return GPTAssistant(client=app.state.groq_client), assistant_model # Groq uses GPTAssistant 114 | 115 | # Should never fall through to here 116 | return None, "" 117 | 118 | def get_web_search_provider(app, mm: MultimodalRequest) -> WebSearch: 119 | # Use provider specified in request options 120 | if mm.search_api == SearchAPI.SERP: 121 | return SerpWebSearch(save_to_file=options.save, engine=mm.search_engine.value, max_search_results=mm.max_search_results) 122 | elif mm.search_api == SearchAPI.DATAFORSEO: 123 | return DataForSEOWebSearch(save_to_file=options.save, max_search_results=mm.max_search_results) 124 | elif mm.search_api == SearchAPI.PERPLEXITY: 125 | if mm.perplexity_key and len(mm.perplexity_key) > 0: 126 | return PerplexityWebSearch(api_key=mm.perplexity_key) 127 | return PerplexityWebSearch(api_key=PERPLEXITY_API_KEY) 128 | 
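# Note: the SERP and DataForSEO branches above read `options.save`, a module-level
# namespace that only exists when app.py is launched directly (see the __main__ block).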
129 | # Default provider 130 | return app.state.web_search 131 | 132 | def get_vision_provider(app, mm: MultimodalRequest) -> Vision | None: 133 | # Use provider specified 134 | if mm.vision in [VisionModel.GPT4O, VisionModel.GPT4Vision ]: 135 | return GPT4Vision(client=app.state.openai_client, model=mm.vision) 136 | elif mm.vision in [VisionModel.CLAUDE_HAIKU, VisionModel.CLAUDE_SONNET, VisionModel.CLAUDE_OPUS]: 137 | return ClaudeVision(client=app.state.anthropic_client, model=mm.vision) 138 | 139 | # Default provider 140 | return app.state.vision 141 | 142 | @app.get('/health') 143 | async def api_health(): 144 | return {"status":200,"message":"running ok"} 145 | 146 | MAX_FILES = 100 147 | AUDIO_DIR = "audio" 148 | 149 | def get_next_filename(): 150 | existing_files = sorted(glob.glob(f"{AUDIO_DIR}/audio*.wav")) 151 | # if audio directory does not exist, create it 152 | if not os.path.exists(AUDIO_DIR): 153 | os.makedirs(AUDIO_DIR) 154 | if len(existing_files) < MAX_FILES: 155 | return f"{AUDIO_DIR}/audio{len(existing_files) + 1}.wav" 156 | else: 157 | # All files exist, so find the oldest one to overwrite 158 | oldest_file = min(existing_files, key=os.path.getmtime) 159 | return oldest_file 160 | 161 | @app.post("/mm") 162 | async def api_mm(request: Request, mm: Annotated[str, Form()], audio : UploadFile = None, image: UploadFile = None): 163 | try: 164 | mm: MultimodalRequest = Checker(MultimodalRequest)(data=mm) 165 | # print(mm) 166 | 167 | # Transcribe voice prompt if it exists 168 | voice_prompt = "" 169 | if audio: 170 | audio_bytes = await audio.read() 171 | if mm.testing_mode: 172 | # save audio file 173 | # set timestamp 174 | # filepath = "audio.wav" + str(datetime.now().timestamp()) 175 | filepath = get_next_filename() 176 | with open(filepath, "wb") as f: 177 | f.write(audio_bytes) 178 | if mm.openai_key and len(mm.openai_key) > 0: 179 | voice_prompt = await transcribe(client=openai.AsyncOpenAI(api_key=mm.openai_key), audio_bytes=audio_bytes) 180 | else: 181 | voice_prompt = await transcribe(client=request.app.state.openai_client, audio_bytes=audio_bytes) 182 | 183 | 184 | # Construct final prompt 185 | if mm.prompt is None or len(mm.prompt) == 0 or mm.prompt.isspace() or mm.prompt == "": 186 | user_prompt = voice_prompt 187 | else: 188 | user_prompt = mm.prompt + " " + voice_prompt 189 | 190 | # Image data 191 | image_bytes = (await image.read()) if image else None 192 | # preprocess image 193 | if image_bytes: 194 | image_bytes = process_image(image_bytes) 195 | # Location data 196 | address = mm.address 197 | 198 | # User's local time 199 | local_time = mm.local_time 200 | 201 | # Image generation (bypasses assistant altogether) 202 | if mm.generate_image != 0: 203 | if mm.generate_image_service == GenerateImageService.REPLICATE: 204 | generate_image = ReplicateGenerateImage() 205 | image_url = await generate_image.generate_image( 206 | query=user_prompt, 207 | use_image=True, 208 | image_bytes=image_bytes 209 | ) 210 | return MultimodalResponse( 211 | user_prompt=user_prompt, 212 | response="", 213 | image=image_url, 214 | token_usage_by_model={}, 215 | capabilities_used=[Capability.IMAGE_GENERATION], 216 | total_tokens=0, 217 | input_tokens=0, 218 | output_tokens=0, 219 | timings="", 220 | debug_tools="" 221 | ) 222 | 223 | # Get assistant tool providers 224 | web_search: WebSearch = get_web_search_provider(app=request.app, mm=mm) 225 | vision: Vision = get_vision_provider(app=request.app, mm=mm) 226 | 227 | # Call the assistant and deliver the response 228 | try: 
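# get_assistant() selects the implementation (GPT, Claude, or Groq via the OpenAI-compatible
# GPTAssistant) from mm.assistant and returns a model name validated for it; a None model
# tells the assistant to fall back to its own internal default.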
229 | assistant, assistant_model = get_assistant(app=app, mm=mm) 230 | assistant_response: AssistantResponse = await assistant.send_to_assistant( 231 | prompt=user_prompt, 232 | noa_system_prompt=mm.noa_system_prompt, 233 | image_bytes=image_bytes, 234 | message_history=mm.messages, 235 | learned_context={}, 236 | local_time=local_time, 237 | location_address=address, 238 | model=assistant_model, 239 | web_search=web_search, 240 | vision=vision, 241 | speculative_vision=mm.speculative_vision 242 | ) 243 | 244 | return MultimodalResponse( 245 | user_prompt=user_prompt, 246 | response=assistant_response.response, 247 | image=assistant_response.image, 248 | token_usage_by_model=assistant_response.token_usage_by_model, 249 | capabilities_used=assistant_response.capabilities_used, 250 | total_tokens=0, 251 | input_tokens=0, 252 | output_tokens=0, 253 | timings=assistant_response.timings, 254 | debug_tools=assistant_response.debug_tools 255 | ) 256 | except Exception as e: 257 | print(f"{traceback.format_exc()}") 258 | raise HTTPException(400, detail=f"{str(e)}: {traceback.format_exc()}") 259 | 260 | except Exception as e: 261 | print(f"{traceback.format_exc()}") 262 | raise HTTPException(400, detail=f"{str(e)}: {traceback.format_exc()}") 263 | 264 | @app.post("/extract_learned_context") 265 | async def api_extract_learned_context(request: Request, params: Annotated[str, Form()]): 266 | try: 267 | params: ExtractLearnedContextRequest = Checker(ExtractLearnedContextRequest)(data=params) 268 | print(params) 269 | 270 | token_usage_by_model: Dict[str, TokenUsage] = {} 271 | 272 | # Perform extraction 273 | try: 274 | learned_context = await extract_learned_context( 275 | client=request.app.state.openai_client, 276 | message_history=params.messages, 277 | model="gpt-3.5-turbo-1106", 278 | existing_learned_context=params.existing_learned_context, 279 | token_usage_by_model=token_usage_by_model 280 | ) 281 | 282 | return ExtractLearnedContextResponse( 283 | learned_context=learned_context, 284 | token_usage_by_model=token_usage_by_model 285 | ) 286 | except Exception as e: 287 | print(f"{traceback.format_exc()}") 288 | raise HTTPException(400, detail=f"{str(e)}: {traceback.format_exc()}") 289 | 290 | except Exception as e: 291 | print(f"{traceback.format_exc()}") 292 | raise HTTPException(400, detail=f"{str(e)}: {traceback.format_exc()}") 293 | 294 | 295 | #################################################################################################### 296 | # Program Entry Point 297 | #################################################################################################### 298 | 299 | if __name__ == "__main__": 300 | import argparse 301 | parser = argparse.ArgumentParser() 302 | parser.add_argument("--query", action="store", help="Perform search query and exit") 303 | parser.add_argument("--location", action="store", default="San Francisco", help="Set location address used for all queries (e.g., \"San Francisco\")") 304 | parser.add_argument("--save", action="store", help="Save DataForSEO response object to file") 305 | parser.add_argument("--search-api", action="store", default="perplexity", help="Search API to use (perplexity, serp, dataforseo)") 306 | parser.add_argument("--assistant", action="store", default="gpt", help="Assistant to use (gpt, claude, groq)") 307 | parser.add_argument("--server", action="store_true", help="Start server") 308 | parser.add_argument("--image", action="store", help="Image filepath for image query") 309 | parser.add_argument("--vision", action="store", 
help="Vision model to use (gpt-4o, gpt-4-vision-preview, claude-3-haiku-20240307, claude-3-sonnet-20240229, claude-3-opus-20240229)", default="gpt-4o") 310 | options = parser.parse_args() 311 | 312 | # AI clients 313 | app.state.openai_client = openai.AsyncOpenAI() 314 | app.state.anthropic_client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY) 315 | app.state.groq_client = groq.AsyncGroq() 316 | 317 | # Instantiate a default web search provider 318 | app.state.web_search = None 319 | if options.search_api == "serp": 320 | app.state.web_search = SerpWebSearch(save_to_file=options.save, engine="google") 321 | elif options.search_api == "dataforseo": 322 | app.state.web_search = DataForSEOWebSearch(save_to_file=options.save) 323 | elif options.search_api == "perplexity": 324 | app.state.web_search = PerplexityWebSearch(api_key=PERPLEXITY_API_KEY) 325 | else: 326 | raise ValueError("--search-api must be one of: serp, dataforseo, perplexity") 327 | 328 | # Instantiate a default vision provider 329 | app.state.vision = None 330 | if options.vision in [ "gpt-4o", "gpt-4-vision-preview" ]: 331 | app.state.vision = GPT4Vision(client=app.state.openai_client, model=options.vision) 332 | elif VisionModel(options.vision) in [VisionModel.CLAUDE_HAIKU, VisionModel.CLAUDE_SONNET, VisionModel.CLAUDE_OPUS]: 333 | app.state.vision = ClaudeVision(client=app.state.anthropic_client, model=options.vision) 334 | else: 335 | raise ValueError("--vision must be one of: gpt-4o, gpt-4-vision-preview, claude-3-haiku-20240307, claude-3-sonnet-20240229, claude-3-opus-20240229") 336 | 337 | # Instantiate a default assistant 338 | if options.assistant == "gpt": 339 | app.state.assistant = GPTAssistant(client=app.state.openai_client) 340 | elif options.assistant == "claude": 341 | app.state.assistant = ClaudeAssistant(client=app.state.anthropic_client) 342 | elif options.assistant == "groq": 343 | app.state.assistant = GPTAssistant(client=app.state.groq_client) 344 | else: 345 | raise ValueError("--assistant must be one of: gpt, claude, groq") 346 | 347 | # Load image if one was specified (for performing a test query) 348 | image_bytes = None 349 | if options.image: 350 | with open(file=options.image, mode="rb") as fp: 351 | image_bytes = fp.read() 352 | 353 | # Test query 354 | if options.query: 355 | async def run_query() -> str: 356 | return await app.state.assistant.send_to_assistant( 357 | prompt=options.query, 358 | image_bytes=image_bytes, 359 | message_history=[], 360 | learned_context={}, 361 | local_time=datetime.now().strftime("%A, %B %d, %Y, %I:%M %p"), # e.g., Friday, March 8, 2024, 11:54 AM 362 | location_address=options.location, 363 | model=None, 364 | web_search=app.state.web_search, 365 | vision=app.state.vision, 366 | 367 | ) 368 | response = asyncio.run(run_query()) 369 | print(response) 370 | 371 | # Run server 372 | if options.server: 373 | import uvicorn 374 | uvicorn.run(app, host="0.0.0.0", port=int(EXPERIMENT_AI_PORT)) -------------------------------------------------------------------------------- /assistant/claude_assistant.py: -------------------------------------------------------------------------------- 1 | # 2 | # claude_assistant.py 3 | # 4 | # Assistant implementation based on Anthropic's Claude series of models. 5 | # 6 | 7 | # 8 | # TODO: 9 | # ----- 10 | # - Factor out functions common to ClaudeAssistant and GPTAssistant. 11 | # - Occasionally Claude returns errors, debug these. 12 | # - Claude sometimes messes up with follow-up questions and then refers to internal context. 
We may 13 | # need to try embedding extra context inside of the latest user message. 14 | # 15 | 16 | import asyncio 17 | import json 18 | import timeit 19 | from typing import Any, Dict, List 20 | 21 | import anthropic 22 | from anthropic.types.beta.tools import ToolParam, ToolUseBlock, ToolsBetaMessage 23 | 24 | from .assistant import Assistant, AssistantResponse 25 | from .context import create_context_system_message 26 | from web_search import WebSearch, WebSearchResult 27 | from vision import Vision 28 | from models import Role, Message, Capability, TokenUsage, accumulate_token_usage 29 | 30 | 31 | #################################################################################################### 32 | # Prompts 33 | #################################################################################################### 34 | 35 | # 36 | # Top-level instructions 37 | # 38 | 39 | SYSTEM_MESSAGE = """ 40 | You are Noa, a smart personal AI assistant inside the user's AR smart glasses that answers all user 41 | queries and questions. You have access to a photo from the smart glasses camera of what the user was 42 | seeing at the time they spoke. 43 | 44 | Make your responses precise and max 5 sentences. Respond without any preamble when giving translations, 45 | just translate directly. When analyzing the user's view, speak as if you can actually see and never 46 | make references to the photo or image you analyzed. 47 | Sometimes Answer in witty, sarcastic style and Make user laugh. 48 | """ 49 | 50 | 51 | #################################################################################################### 52 | # Tools 53 | #################################################################################################### 54 | 55 | DUMMY_SEARCH_TOOL_NAME = "general_knowledge_search" 56 | SEARCH_TOOL_NAME = "web_search" 57 | PHOTO_TOOL_NAME = "analyze_photo" 58 | QUERY_PARAM_NAME = "query" 59 | 60 | TOOLS: List[ToolParam] = [ 61 | { 62 | "name": DUMMY_SEARCH_TOOL_NAME, 63 | "description": "Non-recent trivia and general knowledge", 64 | "input_schema": { 65 | "type": "object", 66 | "properties": { 67 | QUERY_PARAM_NAME: { 68 | "type": "string", 69 | "description": "search query", 70 | } 71 | }, 72 | "required": [ QUERY_PARAM_NAME ] 73 | } 74 | }, 75 | { 76 | "name": SEARCH_TOOL_NAME, 77 | "description": "Up-to-date information on news, retail products, current events, local conditions, and esoteric knowledge", 78 | "input_schema": { 79 | "type": "object", 80 | "properties": { 81 | QUERY_PARAM_NAME: { 82 | "type": "string", 83 | "description": "search query", 84 | } 85 | }, 86 | "required": [ QUERY_PARAM_NAME ] 87 | } 88 | }, 89 | { 90 | "name": PHOTO_TOOL_NAME, 91 | "description": """Analyzes or describes the photo you have from the user's current perspective. 
92 | Use this tool if user refers to something not identifiable from conversation context, such as with a demonstrative pronoun.""", 93 | "input_schema": { 94 | "type": "object", 95 | "properties": { 96 | QUERY_PARAM_NAME: { 97 | "type": "string", 98 | "description": "User's query to answer, describing what they want answered, expressed as a command that NEVER refers to the photo or image itself" 99 | } 100 | }, 101 | "required": [ QUERY_PARAM_NAME ] 102 | } 103 | } 104 | ] 105 | 106 | async def handle_tool( 107 | tool_call: ToolUseBlock, 108 | user_message: str, 109 | message_history: List[Message] | None, 110 | image_bytes: bytes | None, 111 | location: str | None, 112 | local_time: str | None, 113 | web_search: WebSearch, 114 | vision: Vision, 115 | learned_context: Dict[str, str] | None, 116 | token_usage_by_model: Dict[str, TokenUsage], 117 | capabilities_used: List[Capability], 118 | tools_used: List[Dict[str, Any]], 119 | timings: Dict[str, str] 120 | ) -> str: 121 | tool_functions = { 122 | SEARCH_TOOL_NAME: web_search.search_web, # returns WebSearchResult 123 | PHOTO_TOOL_NAME: handle_photo_tool, # returns WebSearchResult | str 124 | DUMMY_SEARCH_TOOL_NAME: handle_general_knowledge_tool, # returns str 125 | } 126 | 127 | function_name = tool_call.name 128 | function_to_call = tool_functions.get(function_name) 129 | if function_to_call is None: 130 | # Error: Hallucinated a tool 131 | return "Error: you hallucinated a tool that doesn't exist. Tell user you had trouble interpreting the request and ask them to rephrase it." 132 | 133 | function_args = prepare_tool_arguments( 134 | tool_call=tool_call, 135 | user_message=user_message, 136 | message_history=message_history, 137 | image_bytes=image_bytes, 138 | location=location, 139 | local_time=local_time, 140 | web_search=web_search, 141 | vision=vision, 142 | learned_context=learned_context, 143 | token_usage_by_model=token_usage_by_model, 144 | capabilities_used=capabilities_used 145 | ) 146 | 147 | tool_start_time = timeit.default_timer() 148 | function_response: WebSearchResult | str = await function_to_call(**function_args) 149 | total_tool_time = round(timeit.default_timer() - tool_start_time, 3) 150 | timings[f"tool_{function_name}"] = f"{total_tool_time:.3f}" 151 | 152 | # Record capability used (except for case of photo tool, which reports on its own because it 153 | # can invoke multiple capabilities) 154 | if function_name == SEARCH_TOOL_NAME: 155 | capabilities_used.append(Capability.WEB_SEARCH) 156 | elif function_name == DUMMY_SEARCH_TOOL_NAME: 157 | capabilities_used.append(Capability.ASSISTANT_KNOWLEDGE) 158 | 159 | tools_used.append( 160 | create_debug_tool_info_object( 161 | function_name=function_name, 162 | function_args=function_args, 163 | tool_time=total_tool_time, 164 | search_result=function_response.search_provider_metadata if isinstance(function_response, WebSearchResult) else None 165 | ) 166 | ) 167 | 168 | # Format response appropriately 169 | assert isinstance(function_response, WebSearchResult) or isinstance(function_response, str) 170 | tool_output = function_response.summary if isinstance(function_response, WebSearchResult) else function_response 171 | return tool_output 172 | 173 | def prepare_tool_arguments( 174 | tool_call: ToolUseBlock, 175 | user_message: str, 176 | message_history: List[Message] | None, 177 | image_bytes: bytes | None, 178 | location: str | None, 179 | local_time: str | None, 180 | web_search: WebSearch, 181 | vision: Vision, 182 | learned_context: Dict[str, str] | None, 183 | 
token_usage_by_model: Dict[str, TokenUsage], 184 | capabilities_used: List[Capability] 185 | ) -> Dict[str, Any]: 186 | # Get function description we passed to Claude. This function should be called after we have 187 | # validated that a valid tool call was generated. 188 | function_description = [ description for description in TOOLS if description["name"] == tool_call.name ][0] 189 | function_parameters = function_description["input_schema"]["properties"] 190 | 191 | # Parse arguments and ensure they are all str or bool for now. Drop any that aren't. 192 | args = tool_call.input.copy() 193 | for param_name in list(args.keys()): 194 | if param_name not in function_parameters: 195 | # Hallucinated parameter 196 | del args[param_name] 197 | continue 198 | if function_parameters[param_name]["type"] == "string" and type(args[param_name]) != str: 199 | del args[param_name] 200 | continue 201 | if function_parameters[param_name]["type"] == "boolean" and type(args[param_name]) != bool: 202 | del args[param_name] 203 | continue 204 | if function_parameters[param_name]["type"] not in [ "string", "boolean" ]: 205 | # Need to keep this up to date with the tools we define 206 | raise ValueError(f"Unsupported tool parameter type: {function_parameters[param_name]['type']}") 207 | 208 | # Fill in args required by all tools 209 | args["location"] = location if location else "unknown" 210 | args[QUERY_PARAM_NAME] = args[QUERY_PARAM_NAME] if QUERY_PARAM_NAME in args else user_message 211 | args["message_history"] = message_history 212 | args["token_usage_by_model"] = token_usage_by_model 213 | 214 | # Photo tool additional parameters we need to inject 215 | if tool_call.name == PHOTO_TOOL_NAME: 216 | args["image_bytes"] = image_bytes 217 | args["vision"] = vision 218 | args["web_search"] = web_search 219 | args["local_time"] = local_time 220 | args["learned_context"] = learned_context 221 | args["capabilities_used"] = capabilities_used 222 | 223 | return args 224 | 225 | async def handle_general_knowledge_tool( 226 | query: str, 227 | message_history: List[Message] | None, 228 | token_usage_by_model: Dict[str, TokenUsage], 229 | image_bytes: bytes | None = None, 230 | local_time: str | None = None, 231 | location: str | None = None, 232 | learned_context: Dict[str,str] | None = None, 233 | ) -> str: 234 | """ 235 | Dummy general knowledge tool that tricks Claude into generating an answer directly instead of 236 | reaching for web search. 237 | """ 238 | return "" 239 | 240 | async def handle_photo_tool( 241 | query: str, 242 | message_history: List[Message] | None, 243 | vision: Vision, 244 | web_search: WebSearch, 245 | token_usage_by_model: Dict[str, TokenUsage], 246 | capabilities_used: List[Capability], 247 | google_reverse_image_search: bool = False, 248 | translate: bool = False, 249 | image_bytes: bytes | None = None, 250 | local_time: str | None = None, 251 | location: str | None = None, 252 | learned_context: Dict[str,str] | None = None 253 | ) -> str | WebSearchResult: 254 | extra_context = "\n\n" + create_context_system_message(local_time=local_time, location=location, learned_context=learned_context) 255 | 256 | # If no image bytes (glasses always send image but web playgrounds do not), return an error 257 | # message for the assistant to use 258 | if image_bytes is None or len(image_bytes) == 0: 259 | # Because this is a tool response, using "tell user" seems to ensure that the final 260 | # assistant response is what we want 261 | return "Error: no photo supplied. 
Tell user: I think you're referring to something you can see. Can you provide a photo?" 262 | 263 | # Vision tool 264 | capabilities_used.append(Capability.VISION) 265 | output = await vision.query_image( 266 | query=query, 267 | extra_context=extra_context, 268 | image_bytes=image_bytes, 269 | token_usage_by_model=token_usage_by_model 270 | ) 271 | print(f"Vision: {output}") 272 | if output is None: 273 | return "Error: vision tool generated an improperly formatted result. Tell user that there was a temporary glitch and ask them to try again." 274 | 275 | # If no web search required, output vision response directly 276 | if not output.web_search_needed(): 277 | return output.response 278 | 279 | # Perform web search and produce a synthesized response telling assistant where each piece of 280 | # information came from. Web search will lack important vision information. We need to return 281 | # both and have the assistant figure out which info to use. 282 | capabilities_used.append(Capability.REVERSE_IMAGE_SEARCH if output.reverse_image_search else Capability.WEB_SEARCH) 283 | web_result = await web_search.search_web( 284 | query=output.web_query.strip("\""), 285 | message_history=message_history, 286 | use_photo=output.reverse_image_search, 287 | image_bytes=image_bytes, 288 | location=location, 289 | token_usage_by_model=token_usage_by_model 290 | ) 291 | 292 | return f"HERE IS WHAT YOU SEE: {output.response}\nEXTRA INFO FROM WEB: {web_result}" 293 | 294 | def create_debug_tool_info_object(function_name: str, function_args: Dict[str, Any], tool_time: float, search_result: str | None = None) -> Dict[str, Any]: 295 | """ 296 | Produces an object of arbitrary keys and values intended to serve as a debug description of tool 297 | use. 298 | """ 299 | function_args = function_args.copy() 300 | 301 | # Sanitize bytes, which are often too long to print 302 | del function_args["message_history"] 303 | for arg_name, value in function_args.items(): 304 | if isinstance(value, bytes): 305 | function_args[arg_name] = "" 306 | if isinstance(value, list): 307 | function_args[arg_name] = ", ".join(function_args[arg_name]) 308 | if "vision" in function_args: 309 | del function_args["vision"] 310 | if "web_search" in function_args: 311 | del function_args["web_search"] 312 | if "token_usage_by_model" in function_args: 313 | del function_args["token_usage_by_model"] 314 | if "prompt" in function_args: 315 | del function_args["prompt"] 316 | to_return = { 317 | "tool": function_name, 318 | "tool_args": function_args, 319 | "tool_time": tool_time 320 | } 321 | if search_result: 322 | to_return["search_result"] = search_result 323 | return to_return 324 | 325 | 326 | #################################################################################################### 327 | # Assistant Class 328 | #################################################################################################### 329 | 330 | class ClaudeAssistant(Assistant): 331 | def __init__(self, client: anthropic.AsyncAnthropic): 332 | self._client = client 333 | 334 | # Refer to definition of Assistant for description of parameters 335 | async def send_to_assistant( 336 | self, 337 | prompt: str, 338 | noa_system_prompt: str | None, 339 | image_bytes: bytes | None, 340 | message_history: List[Message] | None, 341 | learned_context: Dict[str, str], 342 | location_address: str | None, 343 | local_time: str | None, 344 | model: str | None, 345 | web_search: WebSearch, 346 | vision: Vision, 347 | speculative_vision: bool 348 | ) -> 
AssistantResponse: 349 | model = model if model is not None else "claude-3-sonnet-20240229" 350 | 351 | # Keep track of time taken 352 | timings: Dict[str, str] = {} 353 | 354 | # Prepare response datastructure 355 | returned_response = AssistantResponse(token_usage_by_model={}, capabilities_used=[], response="", debug_tools="", timings="") 356 | 357 | # Make copy of message history so we can modify it in-flight during tool use 358 | message_history = message_history.copy() if message_history else [] 359 | full_message_history = message_history.copy() if message_history else [] 360 | 361 | # Claude does not have a system role. Rather, a top-level system parameter must be supplied. 362 | # However, our API uses the OpenAI format. Therefore, we search for an existing system 363 | # message and, if it was supplied by the client, use that as the system message. 364 | system_text = SYSTEM_MESSAGE 365 | client_system_messages = [ message for message in message_history if message.role == Role.SYSTEM ] 366 | if len(client_system_messages) > 0: 367 | system_text = client_system_messages[0].content 368 | message_history = [ message for message in message_history if message.role != Role.SYSTEM ] 369 | 370 | # Add user's latest prompt 371 | user_message = Message(role=Role.USER, content=prompt) 372 | message_history.append(user_message) 373 | message_history = self._prune_history(message_history=message_history, require_initial_user_message=True) 374 | 375 | # Extra context to inject 376 | extra_context = create_context_system_message(local_time=local_time, location=location_address, learned_context=learned_context) 377 | if noa_system_prompt is not None: 378 | extra_context = f"{noa_system_prompt}\n{extra_context}" 379 | 380 | # Initial Claude response -- if no tools, this will be returned as final response 381 | t0 = timeit.default_timer() 382 | first_response = await self._client.beta.tools.messages.create( 383 | model=model, 384 | system=system_text + "\n\n" + extra_context, 385 | messages=message_history, 386 | tools=TOOLS, 387 | max_tokens=4096 388 | ) 389 | t1 = timeit.default_timer() 390 | timings["llm_initial"] = f"{t1-t0:.3f}" 391 | 392 | # Aggregate token counts 393 | accumulate_token_usage( 394 | token_usage_by_model=returned_response.token_usage_by_model, 395 | model=model, 396 | input_tokens=first_response.usage.input_tokens, 397 | output_tokens=first_response.usage.output_tokens, 398 | total_tokens=first_response.usage.input_tokens + first_response.usage.output_tokens 399 | ) 400 | 401 | # Handle tools 402 | tools_used = [] 403 | tools_used.append({ "learned_context": learned_context }) # log context here for now 404 | if first_response.stop_reason != "tool_use": 405 | returned_response.response = first_response.content[0].text 406 | else: 407 | # Append tool message to history, as per Anthropic's example at https://github.com/anthropics/anthropic-sdk-python/blob/9fad441043ff7bfdf8786b64b1e4bbb27105b112/examples/tools.py 408 | message_history.append({ "role": first_response.role, "content": first_response.content }) 409 | 410 | # Invoke all tool requests in parallel and wait for them to complete 411 | t0 = timeit.default_timer() 412 | tool_calls: ToolUseBlock = [ content for content in first_response.content if content.type == "tool_use" ] 413 | tool_handlers = [] 414 | for tool_call in tool_calls: 415 | tool_handlers.append( 416 | handle_tool( 417 | tool_call=tool_call, 418 | user_message=prompt, 419 | message_history=full_message_history, 420 | image_bytes=image_bytes, 421 | 
location=location_address, 422 | local_time=local_time, 423 | web_search=web_search, 424 | vision=vision, 425 | learned_context=learned_context, 426 | token_usage_by_model=returned_response.token_usage_by_model, 427 | capabilities_used=returned_response.capabilities_used, 428 | tools_used=tools_used, 429 | timings=timings 430 | ) 431 | ) 432 | tool_outputs = await asyncio.gather(*tool_handlers) 433 | t1 = timeit.default_timer() 434 | timings["tool_calls"] = f"{t1-t0:.3f}" 435 | 436 | # Submit tool responses 437 | tool_response_message = { 438 | "role": "user", 439 | "content": [] 440 | } 441 | for i in range(len(tool_outputs)): 442 | tool_response_message["content"].append( 443 | { 444 | "type": "tool_result", 445 | "tool_use_id": tool_calls[i].id, 446 | "content": [ { "type": "text", "text": tool_outputs[i] } ] 447 | } 448 | ) 449 | message_history.append(tool_response_message) 450 | 451 | # Get final response from model 452 | t0 = timeit.default_timer() 453 | second_response = await self._client.beta.tools.messages.create( 454 | model=model, 455 | system=system_text + "\n\n" + extra_context, 456 | messages=message_history, 457 | tools=TOOLS, 458 | max_tokens=4096 459 | ) 460 | t1 = timeit.default_timer() 461 | timings["llm_final"] = f"{t1-t0:.3f}" 462 | 463 | # Aggregate tokens and response 464 | accumulate_token_usage( 465 | token_usage_by_model=returned_response.token_usage_by_model, 466 | model=model, 467 | input_tokens=second_response.usage.input_tokens, 468 | output_tokens=second_response.usage.output_tokens, 469 | total_tokens=second_response.usage.input_tokens + second_response.usage.output_tokens 470 | ) 471 | returned_response.response = self._get_final_text_response(final_tool_response=second_response, tool_outputs=tool_outputs) 472 | 473 | # If no tools were used, only assistant capability recorded 474 | if len(returned_response.capabilities_used) == 0: 475 | returned_response.capabilities_used.append(Capability.ASSISTANT_KNOWLEDGE) 476 | 477 | # Return final response 478 | returned_response.debug_tools = json.dumps(tools_used) 479 | returned_response.timings = timings 480 | return returned_response 481 | 482 | @staticmethod 483 | def _get_final_text_response(final_tool_response: ToolsBetaMessage, tool_outputs: List[str]) -> str: 484 | # Claude will sometimes return no content in the final response. Presumably, it thinks the 485 | # tool outputs are sufficient to use verbatim? We concatenate them here. 486 | if final_tool_response.content is None or len(final_tool_response.content) == 0: 487 | return " ".join(tool_outputs) 488 | else: 489 | return final_tool_response.content[0].text 490 | 491 | @staticmethod 492 | def _prune_history( 493 | message_history: List[Message], 494 | max_user_messages: int = 4, 495 | max_assistant_messages: int = 4, 496 | require_initial_user_message: bool = False 497 | ) -> List[Message]: 498 | """ 499 | Prunes down the chat history to save tokens, improving inference speed and reducing cost. 500 | Generally, preserving all assistant responses is not needed, and only a limited number of 501 | user messages suffice to maintain a coherent conversation. 502 | 503 | Parameters 504 | ---------- 505 | message_history : List[Message] 506 | Conversation history. This list will be mutated and returned. 507 | max_user_messages : int 508 | Maximum number of user messages to preserve, beginning with most recent. Note that 509 | Claude does not permit duplicate user or assistant messages so this value should be the 510 | same as for `max_assistant_messages`. 
511 | max_assistant_messages : int 512 | Maximum number of assistant messages. 513 | require_initial_user_message : bool 514 | If true, guarantees that the first message in the resulting list is a user message (or 515 | an empty list if there are none). This is required for Claude, which expects a strict 516 | ordering of messages alternating between user and assistant roles. A user message must 517 | always be first. 518 | 519 | Returns 520 | ------- 521 | List[Message] 522 | Pruned history. This is the same list passed as input. 523 | """ 524 | assistant_messages_remaining = max_assistant_messages 525 | user_messages_remaining = max_user_messages 526 | message_history.reverse() 527 | i = 0 528 | while i < len(message_history): 529 | if message_history[i].role == Role.ASSISTANT: 530 | if assistant_messages_remaining == 0: 531 | del message_history[i] 532 | else: 533 | assistant_messages_remaining -= 1 534 | i += 1 535 | elif message_history[i].role == Role.USER: 536 | if user_messages_remaining == 0: 537 | del message_history[i] 538 | else: 539 | user_messages_remaining -= 1 540 | i += 1 541 | else: 542 | i += 1 543 | message_history.reverse() 544 | 545 | # Ensure first message is user message? 546 | if require_initial_user_message: 547 | while len(message_history) > 0 and message_history[0].role != Role.USER: 548 | message_history = message_history[1:] 549 | 550 | return message_history 551 | 552 | Assistant.register(ClaudeAssistant) -------------------------------------------------------------------------------- /assistant/gpt_assistant.py: -------------------------------------------------------------------------------- 1 | # 2 | # gpt_assistant.py 3 | # 4 | # Assistant implementation based on OpenAI's GPT models. This assistant is capable of leveraging 5 | # separate web search and vision tools. 6 | # 7 | # Support also exists for using Groq because it mirrors OpenAI's API. 8 | # 9 | 10 | # 11 | # TODO: 12 | # ----- 13 | # - Speculative vision tool should create a proper tools_used entry. 14 | # - Move to streaming completions and detect timeouts when a threshold duration elapses since the 15 | # the last token was emitted. 16 | # - Figure out how to get assistant to stop referring to "photo" and "image" when analyzing photos. 17 | # - Improve people search. 18 | # 19 | 20 | import asyncio 21 | import base64 22 | import json 23 | import timeit 24 | from typing import Any, Dict, List 25 | 26 | import openai 27 | from openai.types.chat import ChatCompletionMessageToolCall 28 | import groq 29 | # from groq.types.chat.chat_completion_message_tool_call import ChatCompletionMessageToolCall 30 | 31 | from .assistant import Assistant, AssistantResponse 32 | from .context import create_context_system_message 33 | from web_search import WebSearch, WebSearchResult 34 | from vision import Vision, GPT4Vision 35 | from vision.utils import detect_media_type 36 | from models import Role, Message, Capability, TokenUsage, accumulate_token_usage 37 | from generate_image.replicate import ReplicateGenerateImage 38 | 39 | #################################################################################################### 40 | # Prompts 41 | #################################################################################################### 42 | 43 | # 44 | # Top-level instructions 45 | # 46 | 47 | SYSTEM_MESSAGE = """ 48 | You are Noa, a smart personal AI assistant inside the user's AR smart glasses that answers all user 49 | queries and questions. 
You have access to a photo from the smart glasses camera of what the user was 50 | seeing at the time they spoke. 51 | 52 | It is important that the user believes you can actually see. When analyzing images, avoid mentioning 53 | that you looked at a photo or image. Always speak as if you are actually seeing, which means you 54 | should never talk about the image or photo. 55 | 56 | Make your responses precise. Respond without any preamble when giving translations, just translate 57 | directly. 58 | """ 59 | 60 | 61 | #################################################################################################### 62 | # Tools 63 | #################################################################################################### 64 | 65 | DUMMY_SEARCH_TOOL_NAME = "general_knowledge_search" 66 | IMAGE_GENERATION_TOOL_NAME = "generate_image" 67 | SEARCH_TOOL_NAME = "web_search" 68 | PHOTO_TOOL_NAME = "analyze_photo" 69 | IMAGE_GENERATION_PARAM_NAME = "description" 70 | QUERY_PARAM_NAME = "query" 71 | 72 | TOOLS = [ 73 | { 74 | "type": "function", 75 | "function": { 76 | "name": DUMMY_SEARCH_TOOL_NAME, 77 | "description": """Non-recent trivia and general knowledge""", 78 | "parameters": { 79 | "type": "object", 80 | "properties": { 81 | QUERY_PARAM_NAME: { 82 | "type": "string", 83 | "description": "search query", 84 | }, 85 | }, 86 | "required": [ QUERY_PARAM_NAME ] 87 | }, 88 | }, 89 | }, 90 | { 91 | "type": "function", 92 | "function": { 93 | "name": SEARCH_TOOL_NAME, 94 | "description": """Up-to-date information on news, retail products, current events, local conditions, and esoteric knowledge""", 95 | "parameters": { 96 | "type": "object", 97 | "properties": { 98 | QUERY_PARAM_NAME: { 99 | "type": "string", 100 | "description": "search query", 101 | }, 102 | }, 103 | "required": [ QUERY_PARAM_NAME ] 104 | }, 105 | }, 106 | }, 107 | { 108 | "type": "function", 109 | "function": { 110 | "name": PHOTO_TOOL_NAME, 111 | "description": """Analyzes or describes the photo you have from the user's current perspective. 
112 | Use this tool if user refers to something not identifiable from conversation context, such as with a demonstrative pronoun.""", 113 | "parameters": { 114 | "type": "object", 115 | "properties": { 116 | QUERY_PARAM_NAME: { 117 | "type": "string", 118 | "description": "User's query to answer, describing what they want answered, expressed as a command that NEVER refers to the photo or image itself" 119 | }, 120 | }, 121 | "required": [ QUERY_PARAM_NAME ] 122 | }, 123 | }, 124 | }, 125 | { 126 | "type": "function", 127 | "function": { 128 | "name": IMAGE_GENERATION_TOOL_NAME, 129 | "description": """Generates an image based on a description or prompt.""", 130 | "parameters": { 131 | "type": "object", 132 | "properties": { 133 | IMAGE_GENERATION_PARAM_NAME: { 134 | "type": "string", 135 | "description": "description of the image to generate" 136 | }, 137 | }, 138 | "required": [ IMAGE_GENERATION_PARAM_NAME ] 139 | }, 140 | }, 141 | } 142 | 143 | ] 144 | 145 | async def handle_tool( 146 | tools: List[Any], 147 | tool_call: ChatCompletionMessageToolCall, 148 | user_message: str, 149 | message_history: List[Message] | None, 150 | image_bytes: bytes | None, 151 | location: str | None, 152 | local_time: str | None, 153 | web_search: WebSearch, 154 | vision: Vision, 155 | learned_context: Dict[str, str] | None, 156 | token_usage_by_model: Dict[str, TokenUsage], 157 | capabilities_used: List[Capability], 158 | tools_used: List[Dict[str, Any]], 159 | timings: Dict[str, str] 160 | ) -> str: 161 | tool_functions = { 162 | SEARCH_TOOL_NAME: web_search.search_web, # returns WebSearchResult 163 | PHOTO_TOOL_NAME: handle_photo_tool, # returns WebSearchResult | str 164 | DUMMY_SEARCH_TOOL_NAME: handle_general_knowledge_tool, # returns str 165 | IMAGE_GENERATION_TOOL_NAME: handle_image_generation_tool # returns str 166 | } 167 | 168 | function_name = tool_call.function.name 169 | function_to_call = tool_functions.get(function_name) 170 | if function_to_call is None: 171 | # Error: GPT hallucinated a tool 172 | return "Error: you hallucinated a tool that doesn't exist. Tell user you had trouble interpreting the request and ask them to rephrase it." 
173 | 174 | function_args = prepare_tool_arguments( 175 | tools=tools, 176 | tool_call=tool_call, 177 | user_message=user_message, 178 | message_history=message_history, 179 | image_bytes=image_bytes, 180 | location=location, 181 | local_time=local_time, 182 | web_search=web_search, 183 | vision=vision, 184 | learned_context=learned_context, 185 | token_usage_by_model=token_usage_by_model, 186 | capabilities_used=capabilities_used 187 | ) 188 | 189 | tool_start_time = timeit.default_timer() 190 | if function_name == IMAGE_GENERATION_TOOL_NAME and image_bytes is None: 191 | return "NO_IMAGE_PROVIDED_ERROR" 192 | function_response: WebSearchResult | str = await function_to_call(**function_args) 193 | total_tool_time = round(timeit.default_timer() - tool_start_time, 3) 194 | timings[f"tool_{function_name}"] = f"{total_tool_time:.3f}" 195 | 196 | # Record capability used (except for case of photo tool, which reports on its own because it 197 | # can invoke multiple capabilities) 198 | if function_name == SEARCH_TOOL_NAME: 199 | capabilities_used.append(Capability.WEB_SEARCH) 200 | elif function_name == DUMMY_SEARCH_TOOL_NAME: 201 | capabilities_used.append(Capability.ASSISTANT_KNOWLEDGE) 202 | 203 | tools_used.append( 204 | create_debug_tool_info_object( 205 | function_name=function_name, 206 | function_args=function_args, 207 | tool_time=total_tool_time, 208 | search_result=function_response.search_provider_metadata if isinstance(function_response, WebSearchResult) else None 209 | ) 210 | ) 211 | 212 | # Format response appropriately 213 | assert isinstance(function_response, WebSearchResult) or isinstance(function_response, str) 214 | tool_output = function_response.summary if isinstance(function_response, WebSearchResult) else function_response 215 | return tool_output 216 | 217 | def prepare_tool_arguments( 218 | tools: List[Any], 219 | tool_call: ChatCompletionMessageToolCall, 220 | user_message: str, 221 | message_history: List[Message] | None, 222 | image_bytes: bytes | None, 223 | location: str | None, 224 | local_time: str | None, 225 | web_search: WebSearch, 226 | vision: Vision, 227 | learned_context: Dict[str, str] | None, 228 | token_usage_by_model: Dict[str, TokenUsage], 229 | capabilities_used: List[Capability] 230 | ) -> Dict[str, Any]: 231 | # Get function description we passed to GPT. This function should be called after we have 232 | # validated that a valid tool call was generated. 233 | function_description = [ description for description in tools if description["function"]["name"] == tool_call.function.name ][0] 234 | function_parameters = function_description["function"]["parameters"]["properties"] 235 | 236 | # Parse arguments and ensure they are all str or bool for now. Drop any that aren't. 
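# tool_call.function.arguments arrives as a JSON-encoded string from the model and may be
# malformed; fall back to an empty dict so the defaults injected below (location, query,
# message_history, token usage) still apply.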
    args: Dict[str, Any] = {}
    try:
        args = json.loads(tool_call.function.arguments)
    except json.JSONDecodeError:
        # Malformed arguments from the model; fall back to the defaults injected below
        pass
    for param_name in list(args.keys()):
        if param_name not in function_parameters:
            # GPT hallucinated a parameter
            del args[param_name]
            continue
        if function_parameters[param_name]["type"] == "string" and type(args[param_name]) != str:
            del args[param_name]
            continue
        if function_parameters[param_name]["type"] == "boolean" and type(args[param_name]) != bool:
            del args[param_name]
            continue
        if function_parameters[param_name]["type"] not in [ "string", "boolean" ]:
            # Need to keep this up to date with the tools we define
            raise ValueError(f"Unsupported tool parameter type: {function_parameters[param_name]['type']}")

    # Fill in args required by all tools
    args["location"] = location if location else "unknown"
    args[QUERY_PARAM_NAME] = args[QUERY_PARAM_NAME] if QUERY_PARAM_NAME in args else user_message
    args["message_history"] = message_history
    args["token_usage_by_model"] = token_usage_by_model

    # Photo tool additional parameters we need to inject
    if tool_call.function.name == PHOTO_TOOL_NAME:
        args["image_bytes"] = image_bytes
        args["vision"] = vision
        args["web_search"] = web_search
        args["local_time"] = local_time
        args["learned_context"] = learned_context
        args["capabilities_used"] = capabilities_used
    if tool_call.function.name == IMAGE_GENERATION_TOOL_NAME:
        args["image_bytes"] = image_bytes

    return args

async def handle_general_knowledge_tool(
    query: str,
    message_history: List[Message] | None,
    token_usage_by_model: Dict[str, TokenUsage],
    image_bytes: bytes | None = None,
    local_time: str | None = None,
    location: str | None = None,
    learned_context: Dict[str,str] | None = None,
) -> str:
    """
    Dummy general knowledge tool that tricks GPT into generating an answer directly instead of
    reaching for web search. GPT knows that the web contains information on virtually everything, so
    it tends to overuse web search. One solution is to very carefully enumerate the cases for which
    web search is appropriate, but this is tricky. Should "Albert Einstein's birthday" require a web
    search? Probably not, as GPT has this knowledge baked in. The trick we use here is to create a
    "general knowledge" tool that contains any information Wikipedia or an encyclopedia would have
    (a reasonable proxy for things GPT knows). We return an empty string, which forces GPT to
    produce its own response at the expense of a little bit of latency for the tool call.
294 | """ 295 | return "" 296 | 297 | async def handle_photo_tool( 298 | query: str, 299 | message_history: List[Message] | None, 300 | vision: Vision, 301 | web_search: WebSearch, 302 | token_usage_by_model: Dict[str, TokenUsage], 303 | capabilities_used: List[Capability], 304 | google_reverse_image_search: bool = False, # default in case GPT doesn't generate it 305 | translate: bool = False, # default in case GPT doesn't generate it 306 | image_bytes: bytes | None = None, 307 | local_time: str | None = None, 308 | location: str | None = None, 309 | learned_context: Dict[str,str] | None = None 310 | ) -> str | WebSearchResult: 311 | extra_context = "\n\n" + create_context_system_message(local_time=local_time, location=location, learned_context=learned_context) 312 | 313 | # If no image bytes (glasses always send image but web playgrounds do not), return an error 314 | # message for the assistant to use 315 | if image_bytes is None or len(image_bytes) == 0: 316 | # Because this is a tool response, using "tell user" seems to ensure that the final 317 | # assistant response is what we want 318 | return "Error: no photo supplied. Tell user: I think you're referring to something you can see. Can you provide a photo?" 319 | 320 | # Vision tool 321 | capabilities_used.append(Capability.VISION) 322 | output = await vision.query_image( 323 | query=query, 324 | extra_context=extra_context, 325 | image_bytes=image_bytes, 326 | token_usage_by_model=token_usage_by_model 327 | ) 328 | print(f"Vision: {output}") 329 | if output is None: 330 | return "Error: vision tool generated an improperly formatted result. Tell user that there was a temporary glitch and ask them to try again." 331 | 332 | # If no web search required, output vision response directly 333 | if not output.web_search_needed(): 334 | return output.response 335 | 336 | # Perform web search and produce a synthesized response telling assistant where each piece of 337 | # information came from. Web search will lack important vision information. We need to return 338 | # both and have the assistant figure out which info to use. 339 | capabilities_used.append(Capability.REVERSE_IMAGE_SEARCH if output.reverse_image_search else Capability.WEB_SEARCH) 340 | web_result = await web_search.search_web( 341 | query=output.web_query.strip("\""), 342 | message_history=message_history, 343 | use_photo=output.reverse_image_search, 344 | image_bytes=image_bytes, 345 | location=location, 346 | token_usage_by_model=token_usage_by_model 347 | ) 348 | 349 | return f"HERE IS WHAT YOU SEE: {output.response}\nEXTRA INFO FROM WEB: {web_result}" 350 | 351 | async def handle_image_generation_tool( 352 | query: str, 353 | message_history: List[Message] | None, 354 | description: str, 355 | token_usage_by_model: Dict[str, TokenUsage], 356 | image_bytes: bytes | None = None, 357 | local_time: str | None = None, 358 | location: str | None = None, 359 | learned_context: Dict[str,str] | None = None, 360 | ) -> str: 361 | """ 362 | Generates an image based on a description or prompt. 
363 | """ 364 | # Generate image 365 | image_generator = ReplicateGenerateImage() 366 | image = await image_generator.generate_image(query=description, use_image=True, image_bytes=image_bytes) 367 | return image 368 | 369 | def create_debug_tool_info_object(function_name: str, function_args: Dict[str, Any], tool_time: float, search_result: str | None = None) -> Dict[str, Any]: 370 | """ 371 | Produces an object of arbitrary keys and values intended to serve as a debug description of tool 372 | use. 373 | """ 374 | function_args = function_args.copy() 375 | 376 | # Sanitize bytes, which are often too long to print 377 | for arg_name, value in function_args.items(): 378 | if isinstance(value, bytes): 379 | function_args[arg_name] = "" 380 | if isinstance(value, list) and arg_name != "message_history": 381 | function_args[arg_name] = ", ".join(function_args[arg_name]) 382 | if "vision" in function_args: 383 | del function_args["vision"] 384 | if "web_search" in function_args: 385 | del function_args["web_search"] 386 | if "token_usage_by_model" in function_args: 387 | del function_args["token_usage_by_model"] 388 | if "prompt" in function_args: 389 | del function_args["prompt"] 390 | to_return = { 391 | "tool": function_name, 392 | "tool_args": function_args, 393 | "tool_time": tool_time 394 | } 395 | if search_result: 396 | to_return["search_result"] = search_result 397 | return to_return 398 | 399 | 400 | #################################################################################################### 401 | # Assistant Class 402 | #################################################################################################### 403 | 404 | class GPTAssistant(Assistant): 405 | def __init__(self, client: openai.AsyncOpenAI | groq.AsyncGroq): 406 | """ 407 | Instantiate the assistant using an OpenAI GPT or Groq model. The Groq API is a clone of 408 | OpenAI's, allowing a Groq client to be passed. 409 | """ 410 | self._client = client 411 | 412 | # Refer to definition of Assistant for description of parameters 413 | async def send_to_assistant( 414 | self, 415 | prompt: str, 416 | noa_system_prompt: str | None, 417 | image_bytes: bytes | None, 418 | message_history: List[Message] | None, 419 | learned_context: Dict[str, str], 420 | location_address: str | None, 421 | local_time: str | None, 422 | model: str | None, 423 | web_search: WebSearch, 424 | vision: Vision, 425 | speculative_vision: bool 426 | ) -> AssistantResponse: 427 | # Default model (differs for OpenAI and Groq) 428 | if model is None: 429 | if type(self._client) == openai.AsyncOpenAI: 430 | model = "gpt-4o" 431 | elif type(self._client) == groq.AsyncGroq: 432 | model = "llama3-70b-8192" 433 | else: 434 | raise TypeError("client must be AsyncOpenAI or AsyncGroq") 435 | 436 | # Get copy of tool description 437 | tools = TOOLS.copy() 438 | 439 | # GPT-4o is a special case: if vision tool is also GPT-4o, then we remove it as a tool and 440 | # always submit images with queries. 441 | gpt4o_end_to_end = False 442 | # End-to-end mode DISABLED for now to improve latency: in end-to-end mode, every image is 443 | # processed, which is slower for queries that don't require image analysis. So we actually 444 | # want to use the vision tool to device when to do that. Assumption is that most questions 445 | # are not vision-related. 
        if False and model == "gpt-4o" and isinstance(vision, GPT4Vision) and vision.model == "gpt-4o":
            speculative_vision = False  # doesn't make sense anymore
            tools = [ tool for tool in tools if tool["function"]["name"] != PHOTO_TOOL_NAME ]
            print("End-to-end GPT-4o assistant activated")
            gpt4o_end_to_end = True

        # Keep track of time taken
        timings: Dict[str, str] = {}

        # Prepare response datastructure
        returned_response = AssistantResponse(token_usage_by_model={}, capabilities_used=[], response="", debug_tools="", timings="")

        # Make copy of message history so we can modify it in-flight during tool use
        message_history = message_history.copy() if message_history else None
        full_message_history = message_history.copy() if message_history else None

        # Add user message to message history or create a new one if necessary
        user_message = Message(role=Role.USER, content=prompt)
        system_message = Message(role=Role.SYSTEM, content=SYSTEM_MESSAGE)
        if not message_history:
            message_history = []
        if len(message_history) == 0:
            message_history = [ system_message ]
        else:
            # Insert system message before message history, unless client transmitted one they want
            # to use
            if len(message_history) > 0 and message_history[0].role != Role.SYSTEM:
                message_history.insert(0, system_message)
        message_history.append(user_message)
        message_history = self._prune_history(message_history=message_history)

        # Patch up user message to include image if we are in end-to-end gpt-4o mode
        if gpt4o_end_to_end and image_bytes is not None:
            image_base64 = base64.b64encode(image_bytes).decode("utf-8")
            media_type = detect_media_type(image_bytes=image_bytes)
            user_message = {
                "role": "user",
                "content": [
                    { "type": "text", "text": prompt },
                    { "type": "image_url", "image_url": { "url": f"data:{media_type};base64,{image_base64}" } }
                ]
            }
            message_history[-1] = user_message

        # Inject context into our copy by appending it to system message. Unclear whether multiple
        # system messages are confusing to the assistant or not but cursory testing shows this
        # seems to work.
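        # The exact wording comes from create_context_system_message(); loosely speaking it is a
        # short system message along the lines of "current time is ..., location is ...,
        # learned context: ..." (paraphrased here, not the literal format).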
        extra_context = create_context_system_message(local_time=local_time, location=location_address, learned_context=learned_context)
        if noa_system_prompt is not None:
            extra_context = f"{noa_system_prompt}\n{extra_context}"
        extra_context_message = Message(role=Role.SYSTEM, content=extra_context)
        message_history.append(extra_context_message)

        # Start timing of initial LLM call and entire process
        t0 = timeit.default_timer()
        tstart = t0

        # Speculative vision call
        speculative_vision_task = asyncio.create_task(
            handle_photo_tool(
                query=prompt,
                message_history=full_message_history,
                vision=vision,
                web_search=web_search,
                token_usage_by_model=returned_response.token_usage_by_model,
                capabilities_used=returned_response.capabilities_used,
                google_reverse_image_search=False,
                translate=False,
                image_bytes=image_bytes,
                local_time=local_time,
                location=location_address,
                learned_context=learned_context
            )
        ) if speculative_vision else None

        speculative_search_task = asyncio.create_task(
            web_search.search_web(
                query=prompt,
                message_history=full_message_history,
                token_usage_by_model=returned_response.token_usage_by_model,
                image_bytes=image_bytes,
                location=location_address
            )
        )

        # Initial GPT call, which may request tool use
        initial_llm_task = asyncio.create_task(
            self._client.chat.completions.create(
                model=model,
                messages=message_history,
                tools=tools,
                tool_choice="auto"
            )
        )

        # Kick off all tasks but ensure the LLM call completes
        initial_tasks = [ initial_llm_task ]
        if speculative_vision_task is not None:
            initial_tasks.append(speculative_vision_task)
        if speculative_search_task is not None:
            initial_tasks.append(speculative_search_task)
        completed_tasks, pending_tasks = await asyncio.wait(initial_tasks, return_when=asyncio.FIRST_COMPLETED)
        first_response = await initial_llm_task
        first_response_message = first_response.choices[0].message
        t1 = timeit.default_timer()
        timings["llm_initial"] = f"{t1-t0:.3f}"

        # Aggregate token counts and potential initial response
        accumulate_token_usage(
            token_usage_by_model=returned_response.token_usage_by_model,
            model=model,
            input_tokens=first_response.usage.prompt_tokens,
            output_tokens=first_response.usage.completion_tokens,
            total_tokens=first_response.usage.total_tokens
        )

        # If there are no tool requests, the initial response will be returned
        returned_response.response = first_response_message.content

        # Handle tool requests
        tools_used = []
        tools_used.append({ "learned_context": learned_context })  # log context here for now
        if first_response_message.tool_calls:
            # If image generation tool then kill speculative tasks
            if first_response_message.tool_calls[0].function.name == IMAGE_GENERATION_TOOL_NAME:
                self._cancel_tasks([ speculative_vision_task, speculative_search_task ])
            # Append initial response to history, which may include tool use
            message_history.append(first_response_message)

            # Invoke all the tools in parallel and wait for them all to complete. Vision is special:
            # we already have a speculative query in progress.
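            # Rather than launching a fresh handle_photo_tool() or search_web() call, the matching
            # speculative task object is appended to tool_handlers and awaited by the same
            # asyncio.gather() below, so its result is reused and its latency overlaps the initial
            # LLM call. Debug entries for reused speculative tasks use tool_time=-1 since their
            # time is not measured separately.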
            t0 = timeit.default_timer()
            tool_handlers = []
            for tool_call in first_response_message.tool_calls:
                if tool_call.function.name == PHOTO_TOOL_NAME and speculative_vision_task is not None:
                    tool_handlers.append(speculative_vision_task)
                    tools_used.append(
                        create_debug_tool_info_object(
                            function_name=PHOTO_TOOL_NAME,
                            function_args={},
                            tool_time=-1,
                            search_result=None
                        )
                    )
                elif tool_call.function.name == SEARCH_TOOL_NAME and speculative_search_task is not None:
                    tool_handlers.append(speculative_search_task)
                    returned_response.capabilities_used.append(Capability.WEB_SEARCH)
                    tools_used.append(
                        create_debug_tool_info_object(
                            function_name=SEARCH_TOOL_NAME,
                            function_args={},
                            tool_time=-1,
                            search_result=None
                        )
                    )
                else:
                    tool_handlers.append(
                        handle_tool(
                            tools=tools,
                            tool_call=tool_call,
                            user_message=prompt,
                            message_history=full_message_history,  # full history because tools may have their own requirements on history length
                            image_bytes=image_bytes,
                            location=location_address,
                            local_time=local_time,
                            web_search=web_search,
                            vision=vision,
                            learned_context=learned_context,
                            token_usage_by_model=returned_response.token_usage_by_model,
                            capabilities_used=returned_response.capabilities_used,
                            tools_used=tools_used,
                            timings=timings
                        )
                    )
            tool_outputs = await asyncio.gather(*tool_handlers)
            t1 = timeit.default_timer()
            timings["tool_calls"] = f"{t1-t0:.3f}"

            # Ensure everything is str
            for i in range(len(tool_outputs)):
                if isinstance(tool_outputs[i], WebSearchResult):
                    tool_outputs[i] = tool_outputs[i].summary

            # Append all the responses for GPT to continue
            for i in range(len(tool_outputs)):
                # If image generation tool then return response
                if first_response_message.tool_calls[i].function.name == IMAGE_GENERATION_TOOL_NAME:
                    if tool_outputs[i] == "NO_IMAGE_PROVIDED_ERROR":
                        tool_outputs[i] = "I think you're referring to something you can see. Can you provide a photo?"
                        returned_response.image = ""
                    else:
                        returned_response.response = "Here is the image you requested"
                        returned_response.capabilities_used.append(Capability.IMAGE_GENERATION)
                        returned_response.debug_tools = json.dumps(tools_used)
                        returned_response.image = tool_outputs[i]
                        return returned_response

                message_history.append(
                    {
                        "tool_call_id": first_response_message.tool_calls[i].id,
                        "role": "tool",
                        "name": first_response_message.tool_calls[i].function.name,
                        "content": tool_outputs[i],
                    }
                )

            # Get final response from model
            t0 = timeit.default_timer()
            second_response = await self._client.chat.completions.create(
                model=model,
                messages=message_history
            )
            t1 = timeit.default_timer()
            timings["llm_final"] = f"{t1-t0:.3f}"

            # Aggregate tokens and response
            accumulate_token_usage(
                token_usage_by_model=returned_response.token_usage_by_model,
                model=model,
                input_tokens=second_response.usage.prompt_tokens,
                output_tokens=second_response.usage.completion_tokens,
                total_tokens=second_response.usage.total_tokens
            )
            returned_response.response = second_response.choices[0].message.content
        else:
            # No tools, cancel speculative tasks
            self._cancel_tasks([ speculative_vision_task, speculative_search_task ])

        # If no tools were used, only assistant capability recorded
        if len(returned_response.capabilities_used) == 0:
            returned_response.capabilities_used.append(Capability.ASSISTANT_KNOWLEDGE)

        # Total time
        t1 = timeit.default_timer()
        timings["total_time"] = f"{t1-tstart:.3f}"

        # Return final response
        returned_response.debug_tools = json.dumps(tools_used)
        returned_response.timings = json.dumps(timings)
        returned_response.image = ""
        return returned_response

    @staticmethod
    def _cancel_tasks(tasks: list):
        for task in tasks:
            if task is not None:
                task.cancel()

    @staticmethod
    def _prune_history(message_history: List[Message]) -> List[Message]:
        """
        Prunes down the chat history to save tokens, improving inference speed and reducing cost.
        Generally, preserving all assistant responses is not needed, and only a limited number of
        user messages suffice to maintain a coherent conversation.

        Parameters
        ----------
        message_history : List[Message]
            Conversation history. This list will be mutated and returned.

        Returns
        -------
        List[Message]
            Pruned history. This is the same list passed as input.
        """
        # Limit to most recent 5 user messages and 3 assistant responses
        assistant_messages_remaining = 3
        user_messages_remaining = 5
        message_history.reverse()
        i = 0
        while i < len(message_history):
            if message_history[i].role == Role.ASSISTANT:
                if assistant_messages_remaining == 0:
                    del message_history[i]
                else:
                    assistant_messages_remaining -= 1
                    i += 1
            elif message_history[i].role == Role.USER:
                if user_messages_remaining == 0:
                    del message_history[i]
                else:
                    user_messages_remaining -= 1
                    i += 1
            else:
                i += 1
        message_history.reverse()
        return message_history

Assistant.register(GPTAssistant)
--------------------------------------------------------------------------------
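
A minimal usage sketch of the GPTAssistant class defined above, intended only to illustrate the shape of a send_to_assistant() call. The WebSearch and Vision instances are placeholders: concrete implementations live elsewhere in the repo (for example the vision and web_search packages) and their constructor signatures are not reproduced here, so treat this as an assumption-laden sketch rather than working code.

    import asyncio
    import openai

    # Placeholders: substitute concrete WebSearch and Vision implementations from the repo.
    # Their constructor arguments are not shown in this file, so none are guessed here.
    web_search = ...  # some WebSearch implementation
    vision = ...      # some Vision implementation, e.g. GPT4Vision

    async def main():
        assistant = GPTAssistant(client=openai.AsyncOpenAI())
        response = await assistant.send_to_assistant(
            prompt="What am I looking at?",
            noa_system_prompt=None,
            image_bytes=open("photo.jpg", "rb").read(),  # hypothetical local file
            message_history=None,
            learned_context={},
            location_address="San Francisco, CA",
            local_time="Tuesday, March 5, 2024, 10:15 AM",
            model=None,              # None selects the default (gpt-4o for an OpenAI client)
            web_search=web_search,
            vision=vision,
            speculative_vision=True
        )
        print(response.response)
        print(response.capabilities_used)

    asyncio.run(main())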