├── web_search ├── async_serpapi_client │ ├── __init__.py │ └── async_serpapi_client.py ├── __init__.py ├── web_search.py ├── perplexity.py └── dataforseo.py ├── tests ├── images │ ├── black.jpg │ ├── candle.jpg │ ├── popcorn.jpg │ ├── screw.jpg │ ├── toy_car.jpg │ ├── ua_hat.jpg │ ├── Zoom_Call.jpg │ ├── candyland.jpg │ ├── gym_setup.jpg │ ├── staircase.jpg │ ├── two_beds.jpg │ ├── airpod_case.jpg │ ├── boxing_ring.webp │ ├── dreary_day.jpg │ ├── house_plant.jpg │ ├── living_room.jpg │ ├── parking_sign.jpg │ ├── persian_rug.jpg │ ├── Wangjing_Soho.jpg │ ├── child_artwork.jpg │ ├── chinese_dragon.jpg │ ├── pickled_snake.jpg │ ├── sample_rotated.jpg │ ├── some_old_actor.jpg │ ├── Philz_Coffee_Cup.jpg │ ├── SF_Cask_and_Lark.png │ ├── STEM_card_Chinese.jpg │ ├── agility_ladder.webp │ ├── airpod_case_open.jpg │ ├── boxing_equipment.webp │ ├── cars_behind_wall.jpg │ ├── chinese_sign_1.webp │ ├── chinese_sign_2.webp │ ├── figurine_on_table.jpg │ ├── hp_mouse_512x512.jpg │ ├── motivational_sign.jpg │ ├── small_trash_can.jpg │ ├── thomas_the_train.jpg │ ├── toys_strewn_about.jpg │ ├── winning_gloves.webp │ ├── book_atomic_habits.jpg │ ├── book_viral_justice.jpg │ ├── eggs_banana_avocado.jpg │ ├── flower_arrangement.jpg │ ├── frame_smart_glasses.jpg │ ├── pug_asian_clothing.jpg │ ├── dragon_fruit_in_box.webp │ ├── toy_track_incomplete.jpg │ ├── STEM_card_Chinese_closeup.jpg │ ├── bobak_dressed_for_weather.jpg │ ├── frame_captures │ │ ├── Window_Tree.jpg │ │ ├── Laptop_Discord_1.jpg │ │ ├── Laptop_Discord_2.jpg │ │ ├── Shelves_Blurry_1.jpg │ │ ├── Shelves_Blurry_2.jpg │ │ ├── Window_Overexposed.jpg │ │ ├── test_camera_image1.webp │ │ ├── test_camera_image2.webp │ │ ├── test_camera_image3.webp │ │ ├── test_camera_image4.webp │ │ ├── test_camera_image5.webp │ │ ├── test_camera_image6.webp │ │ ├── test_camera_image7.webp │ │ ├── test_camera_image8.webp │ │ ├── Fingers_Overexposed_1.jpg │ │ ├── Fingers_Overexposed_2.jpg │ │ └── Fingers_Overexposed_3.jpg │ ├── woman_strange_fuzzy_pants.jpeg │ └── colorful_landscape_paintings.jpg ├── frame_camera.json ├── frame_camera_laziness.json ├── tests.json └── benchmark.json ├── generate_image ├── __init__.py ├── generate_image.py └── replicate.py ├── docs └── noa_assistant.drawio.png ├── vision ├── __init__.py ├── vision.py ├── gpt4vision.py ├── claude_vision.py └── utils.py ├── assistant ├── __init__.py ├── assistant.py ├── context.py ├── claude_assistant.py └── gpt_assistant.py ├── models ├── __init__.py ├── token_usage.py └── api.py ├── .gitignore ├── .env.example ├── requirements.txt ├── LICENSE.md ├── load_audio.sh ├── README.md ├── run_benchmark.py └── app.py /web_search/async_serpapi_client/__init__.py: -------------------------------------------------------------------------------- 1 | from .async_serpapi_client import AsyncSerpAPIClient -------------------------------------------------------------------------------- /tests/images/black.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/black.jpg -------------------------------------------------------------------------------- /tests/images/candle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/candle.jpg -------------------------------------------------------------------------------- /tests/images/popcorn.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/popcorn.jpg -------------------------------------------------------------------------------- /tests/images/screw.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/screw.jpg -------------------------------------------------------------------------------- /tests/images/toy_car.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/toy_car.jpg -------------------------------------------------------------------------------- /tests/images/ua_hat.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/ua_hat.jpg -------------------------------------------------------------------------------- /generate_image/__init__.py: -------------------------------------------------------------------------------- 1 | from .generate_image import GenerateImage 2 | from .replicate import ReplicateGenerateImage -------------------------------------------------------------------------------- /tests/images/Zoom_Call.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/Zoom_Call.jpg -------------------------------------------------------------------------------- /tests/images/candyland.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/candyland.jpg -------------------------------------------------------------------------------- /tests/images/gym_setup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/gym_setup.jpg -------------------------------------------------------------------------------- /tests/images/staircase.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/staircase.jpg -------------------------------------------------------------------------------- /tests/images/two_beds.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/two_beds.jpg -------------------------------------------------------------------------------- /docs/noa_assistant.drawio.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/docs/noa_assistant.drawio.png -------------------------------------------------------------------------------- /tests/images/airpod_case.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/airpod_case.jpg -------------------------------------------------------------------------------- /tests/images/boxing_ring.webp: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/boxing_ring.webp -------------------------------------------------------------------------------- /tests/images/dreary_day.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/dreary_day.jpg -------------------------------------------------------------------------------- /tests/images/house_plant.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/house_plant.jpg -------------------------------------------------------------------------------- /tests/images/living_room.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/living_room.jpg -------------------------------------------------------------------------------- /tests/images/parking_sign.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/parking_sign.jpg -------------------------------------------------------------------------------- /tests/images/persian_rug.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/persian_rug.jpg -------------------------------------------------------------------------------- /tests/images/Wangjing_Soho.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/Wangjing_Soho.jpg -------------------------------------------------------------------------------- /tests/images/child_artwork.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/child_artwork.jpg -------------------------------------------------------------------------------- /tests/images/chinese_dragon.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/chinese_dragon.jpg -------------------------------------------------------------------------------- /tests/images/pickled_snake.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/pickled_snake.jpg -------------------------------------------------------------------------------- /tests/images/sample_rotated.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/sample_rotated.jpg -------------------------------------------------------------------------------- /tests/images/some_old_actor.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/some_old_actor.jpg -------------------------------------------------------------------------------- /vision/__init__.py: -------------------------------------------------------------------------------- 1 | from .vision import Vision 2 | from 
.gpt4vision import GPT4Vision 3 | from .claude_vision import ClaudeVision -------------------------------------------------------------------------------- /tests/images/Philz_Coffee_Cup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/Philz_Coffee_Cup.jpg -------------------------------------------------------------------------------- /tests/images/SF_Cask_and_Lark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/SF_Cask_and_Lark.png -------------------------------------------------------------------------------- /tests/images/STEM_card_Chinese.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/STEM_card_Chinese.jpg -------------------------------------------------------------------------------- /tests/images/agility_ladder.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/agility_ladder.webp -------------------------------------------------------------------------------- /tests/images/airpod_case_open.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/airpod_case_open.jpg -------------------------------------------------------------------------------- /tests/images/boxing_equipment.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/boxing_equipment.webp -------------------------------------------------------------------------------- /tests/images/cars_behind_wall.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/cars_behind_wall.jpg -------------------------------------------------------------------------------- /tests/images/chinese_sign_1.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/chinese_sign_1.webp -------------------------------------------------------------------------------- /tests/images/chinese_sign_2.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/chinese_sign_2.webp -------------------------------------------------------------------------------- /tests/images/figurine_on_table.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/figurine_on_table.jpg -------------------------------------------------------------------------------- /tests/images/hp_mouse_512x512.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/hp_mouse_512x512.jpg -------------------------------------------------------------------------------- /tests/images/motivational_sign.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/motivational_sign.jpg -------------------------------------------------------------------------------- /tests/images/small_trash_can.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/small_trash_can.jpg -------------------------------------------------------------------------------- /tests/images/thomas_the_train.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/thomas_the_train.jpg -------------------------------------------------------------------------------- /tests/images/toys_strewn_about.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/toys_strewn_about.jpg -------------------------------------------------------------------------------- /tests/images/winning_gloves.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/winning_gloves.webp -------------------------------------------------------------------------------- /tests/images/book_atomic_habits.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/book_atomic_habits.jpg -------------------------------------------------------------------------------- /tests/images/book_viral_justice.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/book_viral_justice.jpg -------------------------------------------------------------------------------- /tests/images/eggs_banana_avocado.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/eggs_banana_avocado.jpg -------------------------------------------------------------------------------- /tests/images/flower_arrangement.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/flower_arrangement.jpg -------------------------------------------------------------------------------- /tests/images/frame_smart_glasses.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_smart_glasses.jpg -------------------------------------------------------------------------------- /tests/images/pug_asian_clothing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/pug_asian_clothing.jpg -------------------------------------------------------------------------------- /tests/images/dragon_fruit_in_box.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/dragon_fruit_in_box.webp 
-------------------------------------------------------------------------------- /tests/images/toy_track_incomplete.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/toy_track_incomplete.jpg -------------------------------------------------------------------------------- /tests/images/STEM_card_Chinese_closeup.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/STEM_card_Chinese_closeup.jpg -------------------------------------------------------------------------------- /tests/images/bobak_dressed_for_weather.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/bobak_dressed_for_weather.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Window_Tree.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Window_Tree.jpg -------------------------------------------------------------------------------- /tests/images/woman_strange_fuzzy_pants.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/woman_strange_fuzzy_pants.jpeg -------------------------------------------------------------------------------- /tests/images/colorful_landscape_paintings.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/colorful_landscape_paintings.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Laptop_Discord_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Laptop_Discord_1.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Laptop_Discord_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Laptop_Discord_2.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Shelves_Blurry_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Shelves_Blurry_1.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Shelves_Blurry_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Shelves_Blurry_2.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Window_Overexposed.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Window_Overexposed.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image1.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image1.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image2.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image2.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image3.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image3.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image4.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image4.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image5.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image5.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image6.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image6.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image7.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image7.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/test_camera_image8.webp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/test_camera_image8.webp -------------------------------------------------------------------------------- /tests/images/frame_captures/Fingers_Overexposed_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Fingers_Overexposed_1.jpg -------------------------------------------------------------------------------- /tests/images/frame_captures/Fingers_Overexposed_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Fingers_Overexposed_2.jpg 
-------------------------------------------------------------------------------- /tests/images/frame_captures/Fingers_Overexposed_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brilliantlabsAR/noa-assistant/HEAD/tests/images/frame_captures/Fingers_Overexposed_3.jpg -------------------------------------------------------------------------------- /web_search/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | from .web_search import WebSearch, WebSearchResult 3 | from .dataforseo import DataForSEOWebSearch 4 | from .serp import SerpWebSearch 5 | from .perplexity import PerplexityWebSearch -------------------------------------------------------------------------------- /assistant/__init__.py: -------------------------------------------------------------------------------- 1 | from .assistant import Assistant, AssistantResponse 2 | from .gpt_assistant import GPTAssistant 3 | from .claude_assistant import ClaudeAssistant 4 | from .context import extract_learned_context -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .api import Role, Message, Capability, SearchAPI, VisionModel, GenerateImageService, MultimodalRequest, MultimodalResponse, ExtractLearnedContextRequest, ExtractLearnedContextResponse 2 | from .token_usage import TokenUsage, accumulate_token_usage -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .env* 3 | *.exe 4 | *.txt 5 | *.json 6 | *.jpg 7 | *.png 8 | *.md 9 | *.html 10 | *.wav 11 | !docs/noa_assistant.drawio.png 12 | !.env.example 13 | !requirements.txt 14 | __pycache__ 15 | dev_deploy.sh 16 | load_audio.sh 17 | env 18 | Dockerfile 19 | audio_logs 20 | audio 21 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | IMAGE_CDN=your_image_cdn 2 | SERPAPI_API_KEY=serp_api_key 3 | DATAFORSEO_USERNAME=dataforseo_username (optional) 4 | DATAFORSEO_PASSWORD=dataforseo_password (optional) 5 | OPENAI_API_KEY=open_ai_key 6 | EXPERIMENT_AI_PORT=8000 7 | SEARCH_API=serp 8 | ANTHROPIC_API_KEY=anthropic_api_key 9 | SCENARIO_API_KEY=scenario_api_key 10 | REPLICATE_API_TOKEN=replicate_api_token -------------------------------------------------------------------------------- /generate_image/generate_image.py: -------------------------------------------------------------------------------- 1 | # 2 | # Generate image from text or image. 3 | # 4 | 5 | from abc import ABC, abstractmethod 6 | from dataclasses import dataclass 7 | from typing import List 8 | 9 | from pydantic import BaseModel 10 | 11 | 12 | 13 | class GenerateImage(ABC): 14 | @abstractmethod 15 | def generate_image( 16 | query: str, 17 | use_image: bool, 18 | image_bytes: bytes | None, 19 | ) -> str: 20 | pass -------------------------------------------------------------------------------- /vision/vision.py: -------------------------------------------------------------------------------- 1 | # 2 | # vision.py 3 | # 4 | # Vision tool base class.
5 | # 6 | 7 | from abc import ABC, abstractmethod 8 | from dataclasses import dataclass 9 | from typing import Dict 10 | 11 | from models import TokenUsage 12 | 13 | 14 | @dataclass 15 | class VisionOutput: 16 | response: str 17 | web_query: str 18 | reverse_image_search: bool 19 | 20 | def web_search_needed(self): 21 | return len(self.web_query) > 0 22 | 23 | class Vision(ABC): 24 | @abstractmethod 25 | async def query_image(self, query: str, extra_context: str, image_bytes: bytes | None, token_usage_by_model: Dict[str, TokenUsage]) -> VisionOutput | None: 26 | pass -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | annotated-types==0.6.0 2 | anyio==3.7.1 3 | certifi==2023.11.17 4 | charset-normalizer==3.3.2 5 | click==8.1.7 6 | colorama==0.4.6 7 | distro==1.8.0 8 | fastapi~=0.109.1 9 | geographiclib==2.0 10 | geopy==2.4.1 11 | h11==0.14.0 12 | httpcore==1.0.2 13 | httpx==0.25.2 14 | idna==3.7 15 | openai==1.3.5 16 | pydantic==2.5.2 17 | pydantic_core==2.14.5 18 | pydub==0.25.1 19 | python-multipart==0.0.7 20 | requests==2.31.0 21 | serpapi==0.1.5 22 | sniffio==1.3.0 23 | starlette~=0.36.2 24 | tqdm==4.66.1 25 | typing_extensions==4.8.0 26 | urllib3==2.1.0 27 | uule-grabber==0.1.9 28 | uvicorn==0.27.0.post1 29 | ffmpeg 30 | replicate 31 | anthropic==0.25.6 32 | aiohttp 33 | groq 34 | opencv-python -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright © 2024 Brilliant Labs Ltd. 2 | 3 | Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above copyright notice and this permission notice appear in all copies. 4 | 5 | THE SOFTWARE IS PROVIDED “AS IS” AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
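The `Vision` base class in `vision/vision.py` above is the interface that the concrete vision tools later in this dump (`vision/gpt4vision.py` and `vision/claude_vision.py`) implement, and those tools report token consumption through `accumulate_token_usage` from `models/token_usage.py` below. The following sketch is illustrative only and is not a file in the repository; the `EchoVision` class and its behavior are hypothetical, but the method signature, `VisionOutput`, and `accumulate_token_usage` are taken from the files shown here.

```python
# Illustrative sketch only (not part of the repository): a minimal Vision
# implementation wired to the interfaces defined in vision/vision.py and
# models/token_usage.py. "EchoVision" is a hypothetical example class.
from typing import Dict

from models import TokenUsage, accumulate_token_usage
from vision import Vision
from vision.vision import VisionOutput


class EchoVision(Vision):
    """Hypothetical vision tool that answers without calling a vision model."""

    async def query_image(self, query: str, extra_context: str, image_bytes: bytes | None,
                          token_usage_by_model: Dict[str, TokenUsage]) -> VisionOutput | None:
        # A real implementation would send the query (and image, if any) to a vision model
        # here and record the tokens consumed by that call.
        accumulate_token_usage(token_usage_by_model, model="echo", input_tokens=0, output_tokens=0, total_tokens=0)
        # An empty web_query makes VisionOutput.web_search_needed() return False, so the
        # assistant will not follow up with a web search for this answer.
        return VisionOutput(response=f"You asked: {query}", web_query="", reverse_image_search=False)
```

The concrete implementations later in this dump additionally call `Vision.register(...)` after the class body, as `gpt4vision.py` and `claude_vision.py` do.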
-------------------------------------------------------------------------------- /models/token_usage.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations 2 | from typing import Dict 3 | 4 | from pydantic import BaseModel 5 | 6 | 7 | class TokenUsage(BaseModel): 8 | input: int = 0 9 | output: int = 0 10 | total: int = 0 11 | 12 | def add(self, token_usage: TokenUsage): 13 | self.input += token_usage.input 14 | self.output += token_usage.output 15 | self.total += token_usage.total 16 | 17 | def accumulate_token_usage(token_usage_by_model: Dict[str, TokenUsage], model: str, input_tokens: int, output_tokens: int, total_tokens: int): 18 | token_usage = TokenUsage(input=input_tokens, output=output_tokens, total=total_tokens) 19 | if model not in token_usage_by_model: 20 | token_usage_by_model[model] = token_usage 21 | else: 22 | token_usage_by_model[model].add(token_usage=token_usage) -------------------------------------------------------------------------------- /load_audio.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Set your SSH server details based on user's choice 3 | SSH_USER=root 4 | SSH_HOST=139.144.72.206 5 | REMOTE_SCRIPT_PATH="audio_load.sh" 6 | REMOTE_AUDIO_DIR="audio_logs/" 7 | LOCAL_AUDIO_DIR="./audio_logs/" 8 | 9 | # Ensure the local audio directory exists 10 | mkdir -p $LOCAL_AUDIO_DIR 11 | 12 | # Perform SSH command to run audio_load.sh 13 | ssh $SSH_USER@$SSH_HOST "bash $REMOTE_SCRIPT_PATH" 14 | 15 | # Check SSH exit code 16 | if [ $? -ne 0 ]; then 17 | echo "SSH command execution failed" 18 | exit 1 19 | fi 20 | 21 | # Perform SCP transfer to copy audio files locally 22 | scp -r $SSH_USER@$SSH_HOST:$REMOTE_AUDIO_DIR* $LOCAL_AUDIO_DIR 23 | 24 | # Check SCP exit code 25 | if [ $? -ne 0 ]; then 26 | echo "SCP transfer failed" 27 | exit 1 28 | fi 29 | 30 | echo "Audio files copied to $LOCAL_AUDIO_DIR successfully" 31 | exit 0 32 | -------------------------------------------------------------------------------- /web_search/web_search.py: -------------------------------------------------------------------------------- 1 | # 2 | # web_search.py 3 | # 4 | # Web search tool base class and result structure. 5 | # 6 | 7 | from abc import ABC, abstractmethod 8 | from dataclasses import dataclass 9 | from typing import Dict, List 10 | 11 | from models import Message, TokenUsage 12 | 13 | 14 | @dataclass 15 | class WebSearchResult: 16 | """ 17 | Web search result, used for all concrete implementations of WebSearch. 18 | """ 19 | 20 | # 21 | # Summarized result, to be used as the tool response string. 22 | # 23 | summary: str 24 | 25 | # 26 | # Implementation-specific metadata for debugging. Can contain e.g. search result links, etc. If 27 | # we want to break out search result links for e.g., the mobile companion app, we should create 28 | # a new field and avoid using this one.
29 | # 30 | search_provider_metadata: str 31 | 32 | class WebSearch(ABC): 33 | @abstractmethod 34 | async def search_web(self, query: str, message_history: List[Message] | None, token_usage_by_model: Dict[str, TokenUsage], use_photo: bool = False, image_bytes: bytes | None = None, location: str | None = None) -> WebSearchResult: 35 | pass 36 | 37 | 38 | -------------------------------------------------------------------------------- /tests/frame_camera.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "active": true, 4 | "name": "frame_camera", 5 | "default_image": "tests/images/black.jpg", 6 | "conversations": [ 7 | [ 8 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image1.webp", "capabilities": [ "vision" ] }, 9 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image2.webp", "capabilities": [ "vision" ] }, 10 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image3.webp", "capabilities": [ "vision" ] }, 11 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image4.webp", "capabilities": [ "vision" ] }, 12 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image5.webp", "capabilities": [ "vision" ] }, 13 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image6.webp", "capabilities": [ "vision" ] }, 14 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image7.webp", "capabilities": [ "vision" ] }, 15 | { "text": "what is this?", "image": "tests/images/frame_Captures/test_camera_image8.webp", "capabilities": [ "vision" ] } 16 | ] 17 | ] 18 | } 19 | ] -------------------------------------------------------------------------------- /tests/frame_camera_laziness.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "active": true, 4 | "name": "frame_camera_laziness", 5 | "default_image": "tests/images/black.jpg", 6 | "conversations": [ 7 | [ 8 | { "text": "what am i looking at right now?", "image": "tests/images/frame_Captures/Laptop_Discord_1.jpg", "capabilities": [ "vision" ] }, 9 | { "text": "can you guess which app I'm running?", "image": "tests/images/frame_Captures/Laptop_Discord_2.jpg", "capabilities": [ "vision" ] }, 10 | { "text": "what do you see outside my window right now?", "image": "tests/images/frame_Captures/Window_Overexposed.jpg", "capabilities": [ "vision" ] }, 11 | { "text": "can you tell what kind of tree is outside?", "image": "tests/images/frame_Captures/Window_Tree.jpg", "capabilities": [ "vision" ] }, 12 | { "text": "what do you see on the shelves in front of me?", "image": "tests/images/frame_Captures/Shelves_Blurry_1.jpg", "capabilities": [ "vision" ] }, 13 | { "text": "can you be a bit more specific than that? 
tell me about the decorative items.", "image": "tests/images/frame_Captures/Shelves_Blurry_2.jpg", "capabilities": [ "vision" ] }, 14 | { "text": "how many fingers am i holding up?", "image": "tests/images/frame_Captures/Fingers_Overexposed_1.jpg", "capabilities": [ "vision" ] }, 15 | { "text": "how many fingers am i holding up?", "image": "tests/images/frame_Captures/Fingers_Overexposed_2.jpg", "capabilities": [ "vision" ] }, 16 | { "text": "how many fingers do you see?", "image": "tests/images/frame_Captures/Fingers_Overexposed_3.jpg", "capabilities": [ "vision" ] } 17 | ] 18 | ] 19 | } 20 | ] -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Noa Assistant 2 | This repository contains all of Noa's AI components. It can be used alongside\ 3 | an authentication server to protect the APIs and throttle usage.\ 4 | Working features: 5 | 1. Conversational AI 6 | 2. Conversation with Photos 7 | 3. Voice transcription 8 | 4. Web Search 9 | 10 | ### Setup 11 | 1. Copy the `.env.example` file to `.env` and fill in the keys 12 | Note: reverse image search requires an image CDN; you can use any API that accepts an image and returns a URL.\ 13 | The API should have the following format.\ 14 | This can be skipped if reverse image search is not used. 15 | ```sh 16 | curl -F'file=@path/to/yourfile.png' -Fexpires=2 example.com 17 | ``` 18 | 19 | 2. Create and activate a Python virtual environment (optional), e.g. following [freecodecamp](https://www.freecodecamp.org/news/how-to-setup-virtual-environments-in-python/) 20 | 21 | 3. Install [ffmpeg](https://ffmpeg.org/download.html) and make sure it is available in your PATH variable 22 | 4. Install the required Python packages 23 | ```bash 24 | pip install -r requirements.txt 25 | ``` 26 | 5. Run the server 27 | ```bash 28 | python app.py --server 29 | ``` 30 | The vision tool, search tool, and assistant can be selected by passing the following arguments: 31 | ```bash 32 | python app.py --server --vision gpt-4-vision-preview --search-api serpapi --assistant gpt 33 | ``` 34 | 35 | The server should now be running on `http://localhost:8000` if the default port is used 36 | ### API 37 | #### POST /mm 38 | ```javascript 39 | await fetch('localhost:8000/mm', { 40 | method: 'POST', 41 | body: new FormData({ 42 | mm: JSON.stringify({ 43 | prompt: 'who are you?', 44 | messages: [ 45 | { 46 | role: 'user', 47 | content: 'Hi' 48 | }, 49 | { 50 | role: 'assistant', 51 | content: 'Hello how can I help you?' 52 | } 53 | ], 54 | gps: [23.646965, 87.159115], 55 | local_time: 'Tuesday, March 12, 2024, 7:24 AM', 56 | address: 'london', 57 | vision: 'claude-3-haiku-20240307' 58 | }), 59 | image: new File(['path/to/yourfile.png'], 'yourfile.png'), 60 | audio: new File(['path/to/yourfile.wav'], 'yourfile.wav') 61 | }) 62 | }) 63 | ``` 64 | #### POST /health 65 | ```javascript 66 | await fetch('localhost:8000/health') 67 | ``` 68 | ### Workflow 69 | ![Workflow](docs/noa_assistant.drawio.png) 70 | -------------------------------------------------------------------------------- /models/api.py: -------------------------------------------------------------------------------- 1 | # 2 | # api.py 3 | # 4 | # Server API models.
5 | # 6 | 7 | from enum import Enum 8 | from typing import Dict, List, Optional 9 | 10 | from pydantic import BaseModel 11 | 12 | from .token_usage import TokenUsage 13 | 14 | 15 | class Role(str, Enum): 16 | SYSTEM = "system" 17 | ASSISTANT = "assistant" 18 | USER = "user" 19 | 20 | class Message(BaseModel): 21 | role: Role 22 | content: str 23 | 24 | class Capability(str, Enum): 25 | ASSISTANT_KNOWLEDGE = "assistant_knowledge" 26 | WEB_SEARCH = "web_search" 27 | VISION = "vision" 28 | REVERSE_IMAGE_SEARCH = "reverse_image_search" 29 | IMAGE_GENERATION = "image_generation" 30 | 31 | class SearchEngine(str, Enum): 32 | GOOGLE_REVERSE_IMAGE = "google_reverse_image" 33 | GOOGLE_LENS = "google_lens" 34 | GOOGLE = "google" 35 | GOOGLE_JOBS = "google_jobs" 36 | GOOGLE_NEWS = "google_news" 37 | GOOGLE_SHOPPING = "google_shopping" 38 | GOOGLE_TRAVEL = "google_travel" 39 | GOOGLE_LOCAL = "google_local" 40 | GOOGLE_IMERSIVE_PRODUCT = "google_immersive_product" 41 | GOOGLE_FINANCE = "google_finance" 42 | GOOGLE_EVENTS = "google_events" 43 | GOOGLE_SCHOLAR = "google_scholar" 44 | 45 | class SearchAPI(Enum): 46 | SERP = "serp" 47 | DATAFORSEO = "dataforseo" 48 | PERPLEXITY = "perplexity" 49 | 50 | class VisionModel(str, Enum): 51 | GPT4O = "gpt-4o" 52 | GPT4Vision = "gpt-4-vision-preview" 53 | CLAUDE_HAIKU = "claude-3-haiku-20240307" 54 | CLAUDE_SONNET = "claude-3-sonnet-20240229" 55 | CLAUDE_OPUS = "claude-3-opus-20240229" 56 | 57 | class GenerateImageService(str, Enum): 58 | REPLICATE = "replicate" 59 | 60 | class MultimodalRequest(BaseModel): 61 | messages: Optional[List[Message]] 62 | prompt: Optional[str] = "" 63 | noa_system_prompt: Optional[str] = None 64 | assistant: Optional[str] = None # assistant class: gpt, claude, perplexity, groq 65 | assistant_model: Optional[str] = None # specific model for the assistant class 66 | search_api: Optional[SearchAPI] = None 67 | search_engine: Optional[SearchEngine] = SearchEngine.GOOGLE 68 | max_search_results: Optional[int] = 10 69 | local_time: Optional[str] = None 70 | address: Optional[str] = None 71 | latitude: Optional[str] = None 72 | longitude: Optional[str] = None 73 | vision: Optional[VisionModel] = None, 74 | speculative_vision: Optional[bool] = True 75 | perplexity_key: Optional[str] = None 76 | openai_key: Optional[str] = None 77 | generate_image: Optional[int] = 0 78 | generate_image_service: Optional[GenerateImageService] = GenerateImageService.REPLICATE 79 | testing_mode: Optional[bool] = False 80 | 81 | class MultimodalResponse(BaseModel): 82 | user_prompt: str 83 | response: str 84 | image: str 85 | token_usage_by_model: Dict[str, TokenUsage] 86 | capabilities_used: List[Capability] 87 | total_tokens: int 88 | input_tokens: int 89 | output_tokens: int 90 | timings: str 91 | debug_tools: str 92 | 93 | class ExtractLearnedContextRequest(BaseModel): 94 | messages: List[Message] 95 | existing_learned_context: Dict[str, str] 96 | 97 | class ExtractLearnedContextResponse(BaseModel): 98 | learned_context: Dict[str, str] 99 | token_usage_by_model: Dict[str, TokenUsage] -------------------------------------------------------------------------------- /generate_image/replicate.py: -------------------------------------------------------------------------------- 1 | from .generate_image import GenerateImage 2 | from replicate import async_run as replicate 3 | 4 | import requests 5 | from io import BytesIO 6 | 7 | import base64 8 | 9 | NEGATIVE_PROMPT = 'ugly, tiling, poorly drawn hands, poorly drawn feet, poorly drawn face, out of frame, extra limbs, 
disfigured, deformed, body out of frame, bad anatomy, watermark, signature, cut off, low contrast, underexposed, overexposed, bad art, beginner, amateur, distorted face' 10 | POSITIVE_PROMPT = 'digital art, hyperrealistic, fantasy, artstation, highly detailed, sharp focus, studio lighting' 11 | 12 | class ReplicateGenerateImage(GenerateImage): 13 | def __init__(self, model: str='asiryan/juggernaut-xl-v7:6a52feace43ce1f6bbc2cdabfc68423cb2319d7444a1a1dae529c5e88b976382'): 14 | super().__init__() 15 | self._model = model 16 | 17 | async def generate_image( 18 | self, 19 | query: str, 20 | use_image: bool, 21 | image_bytes: bytes | None, 22 | ) -> str: 23 | if use_image: 24 | if not image_bytes: 25 | raise ValueError('Image bytes must be provided') 26 | input_base64_img = base64.b64encode(image_bytes).decode('utf-8') 27 | response = await replicate( 28 | self._model, 29 | input={ 30 | "width": 512, 31 | "height": 512, 32 | "prompt": f"{query}, {POSITIVE_PROMPT}", 33 | "image": f'data:image/png;base64,{input_base64_img}', 34 | "refine": "expert_ensemble_refiner", 35 | "scheduler": "K_EULER", 36 | "lora_scale": 0.5, 37 | "num_outputs": 1, 38 | "guidance_scale": 7.5, 39 | "apply_watermark": False, 40 | "high_noise_frac": 0.7, 41 | "negative_prompt": NEGATIVE_PROMPT, 42 | "prompt_strength": 0.8, 43 | "num_inference_steps": 30 44 | } 45 | ) 46 | else: 47 | raise NotImplementedError('Text generation is not imlemented yet') 48 | # else: 49 | # response = replicate.run( 50 | # self._model, 51 | # input={ 52 | # "width": 512, 53 | # "height": 512, 54 | # "prompt": f"{query}, {POSITIVE_PROMPT}", 55 | # "refine": "expert_ensemble_refiner", 56 | # "scheduler": "K_EULER", 57 | # "lora_scale": 0.6, 58 | # "num_outputs": 1, 59 | # "guidance_scale": 7.5, 60 | # "apply_watermark": False, 61 | # "high_noise_frac": 0.8, 62 | # "negative_prompt": "", 63 | # "prompt_strength": 1, 64 | # "num_inference_steps": 25 65 | # } 66 | # ) 67 | 68 | # response is url of image 69 | # make it base64 70 | image_url = response[0] 71 | response = requests.get(image_url) 72 | if response.status_code != 200: 73 | return 'Failed to generate image' 74 | # convert to base64 75 | base64_img = base64.b64encode(response.content).decode('utf-8') 76 | return base64_img 77 | 78 | GenerateImage.register(ReplicateGenerateImage) -------------------------------------------------------------------------------- /assistant/assistant.py: -------------------------------------------------------------------------------- 1 | # 2 | # assistant.py 3 | # 4 | # Assistant base class and associated data structures. 
5 | # 6 | 7 | from __future__ import annotations 8 | from abc import ABC, abstractmethod 9 | from dataclasses import dataclass 10 | from typing import Dict, List 11 | 12 | from pydantic import BaseModel 13 | 14 | from models import Message, Capability, TokenUsage 15 | from web_search import WebSearch 16 | from vision import Vision 17 | 18 | @dataclass 19 | class AssistantResponse: 20 | token_usage_by_model: Dict[str, TokenUsage] 21 | capabilities_used: List[Capability] 22 | response: str 23 | debug_tools: str # debugging information about tools used (no particular format guaranteed) 24 | timings: str 25 | image: str | None = None 26 | 27 | class Assistant(ABC): 28 | @abstractmethod 29 | async def send_to_assistant( 30 | prompt: str, 31 | noa_system_prompt: str | None, 32 | image_bytes: bytes | None, 33 | message_history: List[Message] | None, 34 | learned_context: Dict[str, str], 35 | local_time: str | None, 36 | location_address: str | None, 37 | model: str | None, 38 | web_search: WebSearch, 39 | vision: Vision, 40 | speculative_vision: bool 41 | ) -> AssistantResponse: 42 | """ 43 | Sends a message from user to assistant. 44 | 45 | Parameters 46 | ---------- 47 | prompt : str 48 | User message. 49 | image_bytes : bytes | None 50 | Image of what user is looking at. 51 | message_history : List[Mesage] | None 52 | Conversation history, excluding current user message we will run inference on. 53 | learned_context : Dict[str, str] 54 | Learned context about the user, as key-value pairs. 55 | local_time : str | None 56 | User's local time in a human-readable format, which helps the LLM answer questions where 57 | the user indirectly references the date or time. E.g., 58 | "Saturday, March 30, 2024, 1:21 PM". 59 | location_address : str | None 60 | User's current location, specified as a full or partial address. This provides context 61 | to the LLM and is especially useful for web searches. E.g., 62 | "3985 Stevens Creek Blvd, Santa Clara, CA 95051". 63 | model : str | None 64 | Assistant model. Valid values will depend on the assistant implementation (e.g., OpenAI- 65 | based assistants will take "gpt-3.5-turbo", etc.) A default will be selected if None is 66 | passed. 67 | web_search : WebSearch 68 | Web search provider, invoked when a web search (including a reverse image search) is 69 | needed. 70 | vision : Vision 71 | Vision AI provider, invoked when understanding of what user is looking at is required. 72 | speculative_vision : bool 73 | Whether to perform speculative vision queries (if supported by assistant). This will run 74 | the vision tool in parallel with the initial LLM request in *all* cases, using the user 75 | prompt as the query, but only use the result if the LLM then determines the vision tool 76 | should have been used. This reduces latency by the duration of the initial LLM call by 77 | giving the vision tool (which is usually slow) a head start. 78 | 79 | Returns 80 | ------- 81 | AssistantResponse 82 | Assistant response (text and some required analytics). 83 | """ 84 | pass -------------------------------------------------------------------------------- /web_search/async_serpapi_client/async_serpapi_client.py: -------------------------------------------------------------------------------- 1 | # 2 | # async_serpapi_client.py 3 | # 4 | # An asynchronous version of the SerpAPI client built on aiohttp. This is based on the serpapi 5 | # package's Client class. 
If we import a newer version of the pacakage with substantial changes to 6 | # the API, we will need to update this async client. 7 | # 8 | 9 | import aiohttp 10 | 11 | from serpapi.__version__ import __version__ 12 | from serpapi import SerpResults 13 | 14 | 15 | class AsyncSerpAPIClient: 16 | BASE_DOMAIN = "https://serpapi.com" 17 | USER_AGENT = f"serpapi-python, v{__version__}" 18 | DASHBOARD_URL = "https://serpapi.com/dashboard" 19 | 20 | def __init__(self, api_key: str, session: aiohttp.ClientSession): 21 | self._api_key = api_key 22 | self._session = session 23 | 24 | def __del__(self): 25 | self._session.detach() 26 | 27 | def __repr__(self): 28 | return "" 29 | 30 | async def search(self, params: dict = None, **kwargs) -> SerpResults | str: 31 | """Fetch a page of results from SerpApi. Returns a :class:`SerpResults ` object, or unicode text (*e.g.* if ``'output': 'html'`` was passed). 32 | 33 | The following three calls are equivalent: 34 | 35 | .. code-block:: python 36 | 37 | >>> s = serpapi.search(q="Coffee", location="Austin, Texas, United States") 38 | 39 | .. code-block:: python 40 | 41 | >>> params = {"q": "Coffee", "location": "Austin, Texas, United States"} 42 | >>> s = serpapi.search(**params) 43 | 44 | .. code-block:: python 45 | 46 | >>> params = {"q": "Coffee", "location": "Austin, Texas, United States"} 47 | >>> s = serpapi.search(params) 48 | 49 | 50 | :param q: typically, this is the parameter for the search engine query. 51 | :param engine: the search engine to use. Defaults to ``google``. 52 | :param output: the output format desired (``html`` or ``json``). Defaults to ``json``. 53 | :param api_key: the API Key to use for SerpApi.com. 54 | :param **: any additional parameters to pass to the API. 55 | 56 | 57 | **Learn more**: https://serpapi.com/search-api 58 | """ 59 | path = "/search" 60 | assert_200 = True 61 | 62 | if params is None: 63 | params = {} 64 | 65 | if kwargs: 66 | params.update(kwargs) 67 | 68 | # Inject the API Key into the params. 69 | if "api_key" not in params: 70 | params["api_key"] = self._api_key 71 | 72 | # Build the URL, as needed 73 | if not path.startswith("http"): 74 | url = self.BASE_DOMAIN + path 75 | else: 76 | url = path 77 | 78 | # Make the HTTP request. 79 | headers = {"User-Agent": self.USER_AGENT} 80 | 81 | # Perform GET 82 | async with self._session.get(url=url, params=params, headers=headers) as response: 83 | if assert_200: 84 | response.raise_for_status() 85 | return await self._serp_results_from_json(response=response) 86 | 87 | @staticmethod 88 | async def _serp_results_from_json(response: aiohttp.ClientResponse): 89 | """Construct a SerpResults object from an HTTP response. 90 | 91 | :param assert_200: if ``True`` (default), raise an exception if the status code is not 200. 92 | :param client: the Client instance which was used to send this request. 93 | 94 | An instance of this class is returned if the response is a valid JSON object. 95 | Otherwise, the raw text (as a properly decoded unicode string) is returned. 96 | """ 97 | 98 | try: 99 | return SerpResults(data=await response.json(), client=None) 100 | except ValueError: 101 | # If the response is not JSON, return the raw text. 102 | return await response.text() -------------------------------------------------------------------------------- /vision/gpt4vision.py: -------------------------------------------------------------------------------- 1 | # 2 | # gpt4vision.py 3 | # 4 | # Vision tool implementation based on GPT-4. 
5 | # 6 | 7 | import base64 8 | from typing import Dict, Optional 9 | 10 | import openai 11 | from pydantic import BaseModel 12 | 13 | from .vision import Vision, VisionOutput 14 | from .utils import detect_media_type 15 | from models import TokenUsage, accumulate_token_usage 16 | 17 | 18 | SYSTEM_MESSAGE = """ 19 | You are Noa, a smart personal AI assistant inside the user's AR smart glasses that answers all user 20 | queries and questions. You have access to a photo from the smart glasses camera of what the user was 21 | seeing at the time they spoke but you NEVER mention the photo or image and instead respond as if you 22 | are actually seeing. 23 | 24 | The camera is unfortunately VERY low quality but the user is counting on you to interpret the 25 | blurry, pixelated images. NEVER comment on image quality. Do your best with images. 26 | 27 | ALWAYS respond with a JSON object with these fields: 28 | 29 | response: (String) Respond to user as best you can. Be precise, get to the point, and speak as though you actually see the image. 30 | web_query: (String) Empty if your "response" answers everything user asked. If web search based on visual description would be more helpful, create a query (e.g. up-to-date, location-based, or product info). 31 | reverse_image_search: (Bool) True if your web query from description is insufficient and including the *exact* thing user is looking at as visual target is needed. 32 | """ 33 | 34 | class ModelOutput(BaseModel): 35 | response: str 36 | web_query: Optional[str] = None 37 | reverse_image_search: Optional[bool] = None 38 | 39 | 40 | class GPT4Vision(Vision): 41 | def __init__(self, client: openai.AsyncOpenAI, model: str = "gpt-4o"): 42 | self._client = client 43 | self._model = model 44 | 45 | @property 46 | def model(self): 47 | return self._model 48 | 49 | async def query_image(self, query: str, extra_context: str, image_bytes: bytes | None, token_usage_by_model: Dict[str, TokenUsage]) -> VisionOutput | None: 50 | messages = [ 51 | { "role": "system", "content": SYSTEM_MESSAGE + extra_context }, 52 | { 53 | "role": "user", 54 | "content": [ 55 | { "type": "text", "text": query } 56 | ] 57 | } 58 | ] 59 | 60 | if image_bytes: 61 | image_base64 = base64.b64encode(image_bytes).decode("utf-8") 62 | media_type = detect_media_type(image_bytes=image_bytes) 63 | messages[1]["content"].append({ "type": "image_url", "image_url": { "url": f"data:{media_type};base64,{image_base64}" } }), 64 | 65 | response = await self._client.chat.completions.create( 66 | model=self._model, 67 | messages=messages, 68 | max_tokens=4096 69 | ) 70 | 71 | accumulate_token_usage( 72 | token_usage_by_model=token_usage_by_model, 73 | model=self._model, 74 | input_tokens=response.usage.prompt_tokens, 75 | output_tokens=response.usage.completion_tokens, 76 | total_tokens=response.usage.total_tokens 77 | ) 78 | 79 | # Convert to VisionResponse and return 80 | output = self._parse_response(content=response.choices[0].message.content) 81 | if output is None: 82 | return None 83 | web_query = output.web_query if output.web_query is not None else "" 84 | reverse_image_search = output.reverse_image_search is not None and output.reverse_image_search == True 85 | if len(web_query) == 0 and reverse_image_search: 86 | # If no web query output but reverse image search asked for, just use user query 87 | # directly. This is sub-optimal and it would be better to figure out a way to ensure 88 | # web_query is generated when reverse_image_search is true. 
89 | web_query = query 90 | return VisionOutput(response=output.response, web_query=web_query, reverse_image_search=reverse_image_search) 91 | 92 | @staticmethod 93 | def _parse_response(content: str) -> ModelOutput | None: 94 | # Response expected to be JSON but may be wrapped with ```json ... ``` 95 | json_start = content.find("{") 96 | json_end = content.rfind("}") 97 | json_string = content[json_start : json_end + 1] 98 | try: 99 | return ModelOutput.model_validate_json(json_data=json_string) 100 | except: 101 | pass 102 | return None 103 | 104 | Vision.register(GPT4Vision) -------------------------------------------------------------------------------- /vision/claude_vision.py: -------------------------------------------------------------------------------- 1 | 2 | # 3 | # claude_vision.py 4 | # 5 | # Vision tool implementation based on Anthropic. 6 | # 7 | 8 | import base64 9 | from typing import Dict, Optional 10 | 11 | import anthropic 12 | from pydantic import BaseModel 13 | 14 | from .vision import Vision, VisionOutput 15 | from .utils import detect_media_type 16 | from models import TokenUsage, accumulate_token_usage 17 | 18 | 19 | SYSTEM_MESSAGE = """ 20 | You are Noa, a smart personal AI assistant inside the user's AR smart glasses that answers all user 21 | queries and questions. You have access to a photo from the smart glasses camera of what the user was 22 | seeing at the time they spoke but you NEVER mention the photo or image and instead respond as if you 23 | are actually seeing. 24 | 25 | Always do your best with images, never comment on their quality, and answer decisively with a guess 26 | if you are not sure. There are no negative consequences to guessing. 27 | 28 | ALWAYS respond with a JSON object with these fields: 29 | 30 | response: (String) Respond to user as best you can. Be precise, get to the point, never comment on image quality. 31 | web_query: (String) Web query to answer the user's request. 32 | web_search_needed: (Bool) Whether to search the web. True ONLY if "response" does not answer the user query precisely enough and up-to-date, location-specific, or product-specific info is needed. 
33 | """ 34 | 35 | class ModelOutput(BaseModel): 36 | response: str 37 | web_query: Optional[str] = None 38 | web_search_needed: Optional[bool] = None 39 | 40 | class ClaudeVision(Vision): 41 | def __init__(self, client: anthropic.AsyncAnthropic, model: str="claude-3-haiku-20240307"): 42 | self._client = client 43 | self._model = model 44 | 45 | async def query_image(self, query: str, extra_context: str, image_bytes: bytes | None, token_usage_by_model: Dict[str, TokenUsage]) -> VisionOutput | None: 46 | image_base64 = base64.b64encode(image_bytes).decode("utf-8") if image_bytes is not None else "" 47 | 48 | messages = [ 49 | { 50 | "role": "user", 51 | "content": [ 52 | { 53 | "type": "image", 54 | "source": { 55 | "type": "base64", 56 | "media_type": detect_media_type(image_bytes=image_bytes), 57 | "data": image_base64 58 | } 59 | }, 60 | { 61 | "type": "text", 62 | "text": query 63 | } 64 | ] 65 | }, 66 | { 67 | # Prefill a leading '{' to force JSON output as per Anthropic's recommendations 68 | "role": "assistant", 69 | "content": [ 70 | { 71 | "type": "text", 72 | "text": "{" 73 | } 74 | ] 75 | } 76 | ] 77 | 78 | response = await self._client.messages.create( 79 | model=self._model, 80 | system=SYSTEM_MESSAGE + extra_context, 81 | messages=messages, 82 | max_tokens=4096, 83 | temperature=0.0, 84 | ) 85 | 86 | accumulate_token_usage( 87 | token_usage_by_model=token_usage_by_model, 88 | model=self._model, 89 | input_tokens=response.usage.input_tokens, 90 | output_tokens=response.usage.output_tokens, 91 | total_tokens=response.usage.input_tokens + response.usage.output_tokens 92 | ) 93 | 94 | # Convert to VisionResponse and return 95 | print(f"ClaudeVision input: {query}") 96 | print(f"ClaudeVision model output: {response.content[0].text}") 97 | output = self._parse_response(content=response.content[0].text) 98 | if output is None: 99 | return None 100 | web_search_needed = output.web_search_needed and output.web_query is not None and len(output.web_query) > 0 101 | web_query = output.web_query if web_search_needed else "" 102 | reverse_image_search = False # for now, we don't perform reverse image search because uncertain where it is really useful 103 | return VisionOutput(response=output.response, web_query=web_query, reverse_image_search=reverse_image_search) 104 | 105 | @staticmethod 106 | def _parse_response(content: str) -> ModelOutput | None: 107 | # Put the leading '{' back 108 | json_string = "{" + content 109 | try: 110 | return ModelOutput.model_validate_json(json_data=json_string) 111 | except: 112 | pass 113 | return None 114 | 115 | Vision.register(ClaudeVision) -------------------------------------------------------------------------------- /assistant/context.py: -------------------------------------------------------------------------------- 1 | # 2 | # context.py 3 | # 4 | # Routines for creating a message containing additional context about the user. These messages 5 | # should be injected into the conversation. 6 | # 7 | # Information about the user can be extracted by analyzing batches of their messages and turned into 8 | # a simple list of key-value pairs. Feeding these back to the assistant will produce more relevant, 9 | # contextually-aware, and personalized responses. 
10 | # 11 | 12 | from typing import Dict, List 13 | 14 | import openai 15 | import groq 16 | 17 | from models import Role, Message, TokenUsage, accumulate_token_usage 18 | 19 | 20 | #################################################################################################### 21 | # Prompts 22 | #################################################################################################### 23 | 24 | # These are context keys we try to detect in conversation history over time 25 | LEARNED_CONTEXT_KEY_DESCRIPTIONS = { 26 | "UserName": "User's name", 27 | "DOB": "User's date of birth", 28 | "Food": "Foods and drinks user has expressed interest in" 29 | } 30 | 31 | LEARNED_CONTEXT_EXTRACTION_SYSTEM_MESSAGE = f""" 32 | Given a transcript of what the user said, look for any of the following information being revealed: 33 | 34 | """ + "\n".join([ key + ": " + description for key, description in LEARNED_CONTEXT_KEY_DESCRIPTIONS.items() ]) + """ 35 | 36 | Make sure to list them in this format: 37 | 38 | KEY=VALUE 39 | 40 | If nothing was found, just say "END". ONLY PRODUCE ITEMS WHEN THE USER HAS ACTUALLY REVEALED THEM. 41 | """ 42 | 43 | CONTEXT_SYSTEM_MESSAGE_PREFIX = "## Additional context about the user:" 44 | 45 | 46 | #################################################################################################### 47 | # Functions 48 | #################################################################################################### 49 | 50 | def create_context_system_message(local_time: str | None, location: str | None, learned_context: Dict[str,str] | None) -> str: 51 | """ 52 | Creates a string of additional context that can either be appended to the main system 53 | message or as a secondary system message before delivering the assistant response. This is 54 | how GPT is made aware of the user's location, local time, and any learned information that 55 | was extracted from prior conversation. 56 | 57 | Parameters 58 | ---------- 59 | local_time : str | None 60 | Local time, if known. 61 | location : str | None 62 | Location, as a human readable address, if known. 63 | learned_context : Dict[str,str] | None 64 | Information learned from prior conversation as key-value pairs, if any. 65 | 66 | Returns 67 | ------- 68 | str 69 | Message to combine with existing system message or to inject as a new, extra system 70 | message. 
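    Example
    -------
    An illustrative call (hypothetical values) and the shape of the fragment it produces:

        create_context_system_message(
            local_time="Tuesday, June 4, 2024, 10:15 AM",
            location="San Francisco, CA",
            learned_context={"UserName": "Alice"}
        )

    returns roughly:

        ## Additional context about the user:<current_time>Tuesday, June 4, 2024, 10:15 AM
        <location>San Francisco, CA
        <UserName>Alice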
71 | """ 72 | # Fixed context: things we know and need not extract from user conversation history 73 | context: Dict[str, str] = {} 74 | if local_time is not None and len(local_time) > 0: 75 | context["current_time"] = local_time 76 | else: 77 | context["current_time"] = "If asked, tell user you don't know current date or time because clock is broken" 78 | if location is not None and len(location) > 0: 79 | context["location"] = location 80 | else: 81 | context["location"] = "You do not know user's location and if asked, tell them so" 82 | 83 | # Merge in learned context 84 | if learned_context is not None: 85 | context.update(learned_context) 86 | 87 | # Convert to a list to be appended to a system message or treated as a new system message 88 | system_message_fragment = CONTEXT_SYSTEM_MESSAGE_PREFIX + "\n".join([ f"<{key}>{value}" for key, value in context.items() if value is not None ]) 89 | return system_message_fragment 90 | 91 | async def extract_learned_context( 92 | client: openai.AsyncOpenAI | groq.AsyncGroq, 93 | message_history: List[Message], 94 | model: str, 95 | existing_learned_context: Dict[str, str], 96 | token_usage_by_model: Dict[str, TokenUsage] 97 | ) -> Dict[str, str]: 98 | # Grab last N user messages 99 | max_user_history = 2 100 | messages: List[Message] = [] 101 | for i in range(len(message_history) - 1, -1, -1): 102 | if len(messages) >= max_user_history: 103 | break 104 | if message_history[i].role == Role.USER: 105 | messages.append(message_history[i]) 106 | 107 | # Insert system message and reverse so that it is in the right order 108 | messages.append(Message(role=Role.SYSTEM, content=LEARNED_CONTEXT_EXTRACTION_SYSTEM_MESSAGE)) 109 | messages.reverse() 110 | 111 | # print("Context extraction input:") 112 | # print(messages) 113 | 114 | # Process 115 | response = await client.chat.completions.create( 116 | model=model, 117 | messages=messages 118 | ) 119 | 120 | # Do not forget to count tokens used! 121 | accumulate_token_usage( 122 | token_usage_by_model=token_usage_by_model, 123 | model=model, 124 | input_tokens=response.usage.prompt_tokens, 125 | output_tokens=response.usage.completion_tokens, 126 | total_tokens=response.usage.total_tokens 127 | ) 128 | 129 | # # Debug: print raw output of context extraction 130 | # print("Learned context:") 131 | # print(response.choices[0].message.content) 132 | 133 | # Parse it into a dictionary 134 | learned_context: Dict[str,str] = {} 135 | lines = response.choices[0].message.content.splitlines() 136 | for line in lines: 137 | parts = line.split("=") 138 | if len(parts) == 2: 139 | key, value = parts 140 | if key in LEARNED_CONTEXT_KEY_DESCRIPTIONS: 141 | learned_context[key] = value 142 | 143 | # Merge with existing 144 | existing_learned_context.update(learned_context) 145 | return existing_learned_context -------------------------------------------------------------------------------- /web_search/perplexity.py: -------------------------------------------------------------------------------- 1 | # 2 | # perplexity.py 3 | # 4 | # Web search tool implementation based on Perplexity. Cannot perform searches with images. 
5 | # 6 | 7 | from typing import Any, Dict, List 8 | 9 | import aiohttp 10 | from pydantic import BaseModel 11 | from web_search import WebSearch, WebSearchResult 12 | from models import Role, Message, TokenUsage, accumulate_token_usage 13 | 14 | class PerplexityMessage(BaseModel): 15 | role: str = None 16 | content: str = None 17 | 18 | class MessageChoices(BaseModel): 19 | index: int = None 20 | finish_reason: str = None 21 | message: PerplexityMessage = None 22 | delta: dict = None 23 | 24 | class Usage(BaseModel): 25 | prompt_tokens: int = None 26 | completion_tokens: int = None 27 | total_tokens: int = None 28 | 29 | class PerplexityResponse(BaseModel): 30 | id: str = None 31 | model: str = None 32 | created: int = None 33 | usage: Usage = None 34 | object: str = None 35 | choices: List[MessageChoices] = None 36 | 37 | def summarise(self) -> str: 38 | if len(self.choices) > 0: 39 | return self.choices[0].message.content 40 | else: 41 | return "No results" 42 | 43 | class PerplexityWebSearch(WebSearch): 44 | def __init__(self, api_key: str, model: str = "llama-3-sonar-small-32k-online"): 45 | super().__init__() 46 | self._api_key = api_key 47 | self._model = model 48 | self._session = None 49 | self._stream = True 50 | 51 | def __del__(self): 52 | if self._session: 53 | self._session.detach() 54 | 55 | async def _lazy_init(self): 56 | if self._session is None: 57 | # This instantiation must happen inside of an async event loop 58 | self._session = aiohttp.ClientSession() 59 | 60 | async def search_web( 61 | self, 62 | query: str, 63 | message_history: List[Message] | None, 64 | token_usage_by_model: Dict[str, TokenUsage], 65 | use_photo: bool = False, 66 | image_bytes: bytes | None = None, 67 | location: str | None = None 68 | ) -> WebSearchResult: 69 | await self._lazy_init() 70 | 71 | message_history = [] if message_history is None else message_history.copy() 72 | message_history = self._prune_history(message_history=message_history) 73 | 74 | messages = [ 75 | Message(role=Role.SYSTEM, content=self._system_message(location=location)) 76 | ] + message_history + [ 77 | Message(role=Role.USER, content=query) 78 | ] 79 | print(messages) 80 | 81 | url = "https://api.perplexity.ai/chat/completions" 82 | payload = { 83 | "model": self._model, 84 | "messages": [ message.model_dump() for message in messages ], 85 | "stream": self._stream, 86 | } 87 | headers = { 88 | "accept": "application/json", 89 | "content-type": "application/json", 90 | "authorization": f"Bearer {self._api_key}" 91 | } 92 | json_text = await self._post(url=url, payload=payload, headers=headers) 93 | if json_text is None: 94 | return WebSearchResult(summary="No results", search_provider_metadata="") 95 | 96 | # Return results 97 | # print(json_text) 98 | try: 99 | perplexity_data = PerplexityResponse.model_validate_json(json_text) 100 | except Exception as e: 101 | print(json_text) 102 | print(f"Failed to parse Perplexity response: {e}") 103 | return WebSearchResult(summary="No results", search_provider_metadata="") 104 | accumulate_token_usage( 105 | token_usage_by_model=token_usage_by_model, 106 | model=self._model, 107 | input_tokens=perplexity_data.usage.prompt_tokens, 108 | output_tokens=perplexity_data.usage.completion_tokens, 109 | total_tokens=perplexity_data.usage.total_tokens 110 | ) 111 | search_result = perplexity_data.choices[0].message.content if len(perplexity_data.choices) > 0 else "No results" 112 | return WebSearchResult(summary=search_result, search_provider_metadata="") 113 | 114 | async def 
_post(self, url: str, payload: Dict[str, Any], headers: Dict[str, str]) -> str | None: 115 | async with self._session.post(url=url, json=payload, headers=headers) as response: 116 | if response.status != 200: 117 | print(f"Failed to get response from Perplexity: {await response.text()}") 118 | return None 119 | if self._stream: 120 | return_response = "" 121 | async for line in response.content.iter_any(): 122 | return_response = line.decode("utf-8").split("data: ")[1].strip() 123 | return return_response 124 | return await response.text() 125 | 126 | @staticmethod 127 | def _system_message(location: str | None): 128 | if location is None or len(location) == 0: 129 | location = "" 130 | return f"reply in concise and short with high accurancy from web results if needed take location as {location}" 131 | 132 | @staticmethod 133 | def _prune_history( 134 | message_history: List[Message], 135 | max_messages: int = 8 136 | ) -> List[Message]: 137 | """ 138 | Prunes down the chat history to save tokens, improving inference speed and reducing cost. 139 | Generally, preserving all assistant responses is not needed, and only a limited number of 140 | user messages suffice to maintain a coherent conversation. 141 | 142 | Parameters 143 | ---------- 144 | message_history : List[Message] 145 | Conversation history. This list will be mutated and returned. 146 | max_messages : int 147 | Maximum number of messages to preserve. Must be an even number because Perplexity 148 | requires alternating user and assistant messages. 149 | 150 | Returns 151 | ------- 152 | List[Message] 153 | Pruned history. This is the same list passed as input. 154 | """ 155 | if max_messages %2 != 0: 156 | print("ERROR: Discarding invalid message history for Perplexity. Require alternating user/assistant messages!") 157 | return [] 158 | message_history.reverse() 159 | message_history = [ message for message in message_history if message.role != Role.SYSTEM ] 160 | message_history = message_history[0:max_messages] 161 | message_history.reverse() 162 | return message_history 163 | 164 | WebSearch.register(PerplexityWebSearch) 165 | -------------------------------------------------------------------------------- /web_search/dataforseo.py: -------------------------------------------------------------------------------- 1 | # 2 | # dataforseo.py 3 | # 4 | # Web search tool implementation using DataForSEO (dataforseo.com). Does not support images. 
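#
# A rough sketch (comments only) of the live SERP task perform_search() below posts, assuming
# DATAFORSEO_USERNAME and DATAFORSEO_PASSWORD are set in the environment:
#
#   POST https://api.dataforseo.com/v3/serp/google/organic/live/advanced
#   Authorization: Basic base64(username:password)
#   {
#     "0": {
#       "language_code": "en",
#       "location_coordinate": "37.7749,-122.4194",   # "lat,long" string; null when unknown
#       "keyword": "coffee shops nearby"              # example query
#     }
#   }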
5 | # 6 | 7 | from base64 import b64encode 8 | import json 9 | import os 10 | from typing import Any, Dict, List, Optional, Tuple 11 | 12 | import aiohttp 13 | from pydantic import BaseModel 14 | import geopy.geocoders 15 | 16 | from models import Message, TokenUsage 17 | from .web_search import WebSearch, WebSearchResult 18 | 19 | 20 | DATAFORSEO_USERNAME = os.environ.get("DATAFORSEO_USERNAME", None) 21 | DATAFORSEO_PASSWORD = os.environ.get("DATAFORSEO_PASSWORD", None) 22 | 23 | class Price(BaseModel): 24 | current: float = None 25 | display_price: str = None 26 | currency: str = None 27 | 28 | class Rating(BaseModel): 29 | rating_type: str = None 30 | value: float = None 31 | votes_count: int = None 32 | rating_max: int = None 33 | 34 | class SubItem(BaseModel): 35 | title: Optional[str] = None 36 | description: Optional[str] = None 37 | price : Optional[Price] = None 38 | rating: Optional[Rating] = None 39 | 40 | class Item(BaseModel): 41 | type: str 42 | title: Optional[str] = None 43 | description: Optional[str] = None 44 | items:Optional[List[SubItem]|List[str]] = None 45 | 46 | class Result(BaseModel): 47 | keyword: str 48 | type: str 49 | check_url: str 50 | items: List[Item] 51 | 52 | class Task(BaseModel): 53 | id: str 54 | status_code: int 55 | status_message: str 56 | cost: float 57 | result: List[Result]|None 58 | 59 | # /v3/serp/google/organic/live/advanced response object 60 | class V3SerpGoogleOrganicLiveAdvancedResponse(BaseModel): 61 | status_code: int 62 | status_message: str 63 | cost: float 64 | tasks_count: int 65 | tasks_error: int 66 | tasks: List[Task] 67 | 68 | def summarise(self, max_search_results: int = 5) -> List[str]: 69 | item_types = [ "stock_box", "organic", "knowledge_graph", "local_pack", "popular_products", "top_stories" ] 70 | summaries = [] 71 | for task in self.tasks: 72 | if not task.result: 73 | continue 74 | for result in task.result: 75 | for item in result.items: 76 | if item.type in item_types and max_search_results > 0 and (item.description or item.items): 77 | # print(summaries) 78 | if item.items: 79 | for subitem in item.items: 80 | #print(subitem) 81 | 82 | if isinstance(subitem, SubItem) and subitem.title and max_search_results > 0 and subitem.description: 83 | summary = (f"{subitem.title}: " if subitem.title else "") + subitem.description 84 | if subitem.price: 85 | if subitem.price.display_price: 86 | summary += f"\nprice: {subitem.price.currency} {subitem.price.display_price}" 87 | elif subitem.price.currency: 88 | summary += f"\nprice: {subitem.price.currency} {subitem.price.current}" 89 | if subitem.rating: 90 | if subitem.rating.value: 91 | summary += f"\nrating: {subitem.rating.value} of {subitem.rating.rating_max} ({subitem.rating.votes_count} votes)" 92 | summaries.append(summary) 93 | max_search_results = max_search_results -1 94 | if item.description: 95 | summary = (f"{item.title}: " if item.title else "") + item.description 96 | summaries.append(summary) 97 | max_search_results = max_search_results -1 98 | content = "\n".join(summaries) if len(summaries) > 0 else "No result found" 99 | return content 100 | 101 | class DataForSEOClient: 102 | def __init__(self): 103 | self._session = aiohttp.ClientSession() 104 | self._base_url = "https://api.dataforseo.com" 105 | 106 | base64_bytes = b64encode( 107 | ("%s:%s" % (DATAFORSEO_USERNAME, DATAFORSEO_PASSWORD)).encode("ascii") 108 | ).decode("ascii") 109 | self._headers = {'Authorization' : 'Basic %s' % base64_bytes, 'Content-Encoding' : 'gzip'} 110 | 111 | def __del__(self): 112 | 
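        # detach() drops the underlying connector and marks the session closed without awaiting a
        # proper close(); this is best-effort cleanup only, since __del__ cannot await.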
self._session.detach() 113 | 114 | async def _request(self, path, method, data=None) -> Any | None: 115 | url = self._base_url + path 116 | async with self._session.request(method=method, url=url, headers=self._headers, data=data) as response: 117 | if response.status != 200: 118 | print(f"DataForSEO search failed: {await response.text()}") 119 | return None 120 | return await response.json() 121 | 122 | async def _get(self, path): 123 | return await self._request(path=path, method='GET') 124 | 125 | async def _post(self, path, data): 126 | if isinstance(data, str): 127 | data_str = data 128 | else: 129 | data_str = json.dumps(data) 130 | return await self._request(path=path, method='POST', data=data_str) 131 | 132 | async def perform_search(self, query: str, location_coordinate: Tuple[float, float] | None = None, save_to_file: str | None = None) -> V3SerpGoogleOrganicLiveAdvancedResponse | None: 133 | print("Searching web:") 134 | print(f" query: {query}") 135 | 136 | post_data = dict() 137 | post_data[len(post_data)] = dict( 138 | language_code = "en", 139 | location_coordinate = f"{location_coordinate[0]},{location_coordinate[1]}" if location_coordinate else None, 140 | keyword = query 141 | ) 142 | response_obj = await self._post("/v3/serp/google/organic/live/advanced", post_data) 143 | if response_obj is None: 144 | return None 145 | if save_to_file: 146 | with open(save_to_file, mode="w") as fp: 147 | fp.write(json.dumps(response_obj, indent=2)) 148 | return V3SerpGoogleOrganicLiveAdvancedResponse.model_validate(response_obj) 149 | 150 | class DataForSEOWebSearch(WebSearch): 151 | def __init__(self, save_to_file: str | None = None, max_search_results: int = 5): 152 | super().__init__() 153 | self._save_to_file = save_to_file 154 | self._max_search_results = max_search_results 155 | self._client = None 156 | 157 | async def _lazy_init(self): 158 | if self._client is None: 159 | # This instantiation must happen inside of an async event loop because 160 | # aiohttp.ClientSession()'s initializer requires that 161 | self._client = DataForSEOClient() 162 | 163 | # DataForSEO does not have reverse image search, so photos are always ignored 164 | async def search_web(self, query: str, message_history: List[Message] | None, token_usage_by_model: Dict[str, TokenUsage], use_photo: bool = False, image_bytes: bytes | None = None, location: str | None = None) -> WebSearchResult: 165 | await self._lazy_init() 166 | if location: 167 | # DataForSEO expects lat,long+ 168 | location_coords = geopy.geocoders.Nominatim(user_agent="GetLoc").geocode(location) 169 | coordinates = (location_coords.latitude, location_coords.longitude) 170 | response = await self._client.perform_search(query=query, location_coordinate=coordinates, save_to_file=self._save_to_file) 171 | summary = response.summarise(max_search_results=self._max_search_results) if response is not None else "No results found" 172 | return WebSearchResult(summary=summary, search_provider_metadata="") 173 | 174 | WebSearch.register(DataForSEOWebSearch) -------------------------------------------------------------------------------- /tests/tests.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "active": true, 4 | "name": "test1", 5 | "default_image": "tests/images/black.jpg", 6 | "conversations": [ 7 | [ "what is the weather today?" ], 8 | [ "where can i buy this?" ], 9 | [ "who makes these?" ], 10 | [ "where can i go to eat something like this?" ], 11 | [ "where can i buy this car?" 
], 12 | [ "what is apple's stock price?" ], 13 | [ "what is weird about this scene?" ], 14 | [ "what is this" ], 15 | [ "translate this sign" ], 16 | [ "what is weird about this scene?" ], 17 | [ "translate this" ], 18 | [ "what is this?" ], 19 | [ "what is that?" ], 20 | [ "what are those?" ], 21 | [ "what are these?" ], 22 | [ "who is that?" ], 23 | [ "who is this?" ], 24 | [ "who are they?" ] 25 | ] 26 | }, 27 | { 28 | "active": true, 29 | "name": "simple_image_queries", 30 | "default_image": "tests/images/black.jpg", 31 | "conversations": [ 32 | [ "what is apple's stock price?" ], 33 | [ "who invented the light bulb?", "why are pencils sharp?" ], 34 | [ "who invented light bulb" ], 35 | [ "when did India gain independence" ], 36 | [ "which fruits contain vitamin c" ], 37 | [ "when is Einstein's birthday" ], 38 | [ "why are pencils sharp" ], 39 | 40 | [ "how many people in front of me?" ], 41 | [ "what is weird about this scene?" ], 42 | [ "translate this" ], 43 | [ "what is this?" ], 44 | [ "what is that?" ], 45 | [ "what are those?" ], 46 | [ "what are these?" ], 47 | [ "who is that?" ], 48 | [ "who is this?" ], 49 | [ "who are they?" ], 50 | [ "where can i buy this?" ], 51 | [ "where can i buy that?" ], 52 | [ "where can i buy these?" ], 53 | [ "where can i buy those?" ], 54 | [ "where can i buy this car?" ], 55 | [ "where can i buy that car?" ], 56 | [ "where can i buy these cars?" ], 57 | [ "where can i buy a car?" ], 58 | [ "how many calories in this" ], 59 | [ 60 | "i just ate a cheeseburger from mcdonald's", 61 | "how many calories does it have?" 62 | ] 63 | ] 64 | }, 65 | { 66 | "active": true, 67 | "name": "sam_i_am", 68 | "default_image": "tests/images/black.jpg", 69 | "conversations": [ 70 | [ { "text": "what does this say?", "image": "tests/images/chinese_sign_1.webp" } ], 71 | [ { "text": "translate this sign", "image": "tests/images/chinese_sign_2.webp" } ], 72 | [ { "text": "describe the image", "image": "tests/images/boxing_equipment.webp" } ], 73 | [ { "text": "what brand are these gloves", "image": "tests/images/winning_gloves.webp" } ], 74 | [ { "text": "what does this say", "image": "tests/images/winning_gloves.webp" } ], 75 | [ { "text": "what is this used for?", "image": "tests/images/agility_ladder.webp" } ], 76 | [ 77 | { "text": "what does this say?", "image": "tests/images/chinese_sign_1.webp" }, 78 | { "text": "translate this sign", "image": "tests/images/chinese_sign_2.webp" }, 79 | { "text": "describe the image", "image": "tests/images/boxing_equipment.webp" }, 80 | { "text": "what brand are these gloves", "image": "tests/images/winning_gloves.webp" }, 81 | { "text": "what does this say", "image": "tests/images/winning_gloves.webp" }, 82 | { "text": "what is this used for?", "image": "tests/images/agility_ladder.webp" } 83 | ], 84 | [ 85 | { "text": "where does boxing come from?", "image": "tests/images/boxing_ring.webp" }, 86 | { "text": "what recent movies feature boxing?", "image": "tests/images/boxing_ring.webp" }, 87 | { "text": "were any boxing movies released post 2020?", "image": "tests/images/boxing_ring.webp" }, 88 | { "text": "what is this space used for?", "image": "tests/images/boxing_ring.webp" } 89 | ] 90 | ] 91 | }, 92 | { 93 | "active": true, 94 | "name": "Bobak_GPT", 95 | "default_image": "tests/images/black.jpg", 96 | "conversations": [ 97 | [ "Who does Tucker Carlson currently work for?" ], 98 | 99 | [ "how can i learn to be a more confident person?" ], 100 | [ "what was the controversy surrounding pluto?" 
], 101 | [ 102 | "what is cerebral palsy and is there a known cure?", 103 | "how about a change a diet, would that help?" 104 | ], 105 | [ "im a pediatrician living in san diego -- how much should i get paid and what is the estimated tax to be paid on that?" ], 106 | [ "is there research which indicates what coffee is bad for the liver?" ], 107 | [ 108 | "how are scented candles manufactured?", 109 | "would it be more cost effective to manufacture them in the US or UK?" 110 | ], 111 | [ "why do iphones experience less OLED burn-in than Android phones even though they both use displays from Samsung?" ], 112 | [ "my partner and i having a disagreement over when I should take paternity leave -- should should we resolve this?" ], 113 | [ "my son is very interested in police and firemen but I'd like him to also be take interest in less stereotypically male things -- how should I do this?" ], 114 | [ 115 | "list coffee beans by origin", 116 | "which pair well with a walnut and fig scone?" 117 | ] 118 | ] 119 | }, 120 | { 121 | "active": false, 122 | "name": "Bobak_web_search_location_based", 123 | "default_image": "tests/images/black.jpg", 124 | "conversations": [ 125 | [ 126 | "does this place get good reviews?", 127 | "what are they known for?", 128 | "has there been a health code violation in this place in the last month?" 129 | ], 130 | [ 131 | "is there a mattress available on craigslist for less than $50?" 132 | ] 133 | ] 134 | }, 135 | { 136 | "active": true, 137 | "name": "Bobak_web_search", 138 | "default_image": "tests/images/black.jpg", 139 | "conversations": [ 140 | [ "who is openAI's newest board member?" ], 141 | [ "how many israeli hostages are still being held by hamas in gaza?" ], 142 | [ "how did Iran respond to pakistan's missile strike within its borders?" ], 143 | [ "any labor rights issues associated with H&M recently?" ], 144 | [ "how is nikki haley's polling right now?"] 145 | ] 146 | }, 147 | { 148 | "active": true, 149 | "name": "Bobak_web_search_image", 150 | "default_image": "tests/images/black.jpg", 151 | "conversations": [ 152 | [ { "text": "where can i buy this screw for less than a dollar online?", "image": "tests/images/screw.jpg" } ] 153 | ] 154 | }, 155 | { 156 | "active": true, 157 | "name": "GPT_only", 158 | "default_image": "tests/images/black.jpg", 159 | "conversations": [ 160 | [ 161 | "how can i learn to be a more confident person?" 162 | ], 163 | [ 164 | "what was the controversy surrounding pluto?" 165 | ], 166 | [ 167 | "what is cerebral palsy and is there a known cure?", 168 | "how about a change a diet, would that help?" 169 | ], 170 | [ 171 | " is there research which indicates what coffee is bad for the liver?" 172 | ], 173 | [ 174 | "how are scented candles manufactured?", 175 | "would it be more cost effective to manufacture them in the US or UK?" 176 | ], 177 | [ 178 | "why do iphones experience less OLED burn-in than Android phones even though they both use displays from Samsung?" 179 | ], 180 | [ 181 | "my partner and i having a disagreement over when I should take paternity leave, how should we resolve this?" 182 | ], 183 | [ 184 | "my son is very interested in police and firemen but I'd like him to also be take interest in less stereotypically male things, how should I do this?" 185 | ], 186 | [ 187 | "list coffee beans by origin which pair well with a walnut and fig scone?" 
188 | ] 189 | ] 190 | }, 191 | { 192 | "active": true, 193 | "name": "morning_6_serpapi.com", 194 | "default_image": "tests/images/black.jpg", 195 | "conversations": [ 196 | [ 197 | "How cold is it right now? Is it raining, is it safe to run?", 198 | "Suggest a new two mile running route for me", 199 | "Any constructions/road work I should know about?" 200 | ] 201 | ] 202 | }, 203 | { 204 | "active": true, 205 | "name": "morning_8_serpapi.com", 206 | "default_image": "tests/images/black.jpg", 207 | "conversations": [ 208 | [ "Give me an Apple News style summary of what's happening in new delhi" ] 209 | ] 210 | }, 211 | { 212 | "active": true, 213 | "name": "morning_6_GPT4", 214 | "default_image":"tests/images/gym_setup.jpg", 215 | "conversations": [ 216 | [ 217 | "Look at my gym set up, suggest a HIIT routine for me using the bike, the kettle balls and the bar", 218 | "Create a visual representations of workout plan. Include diagrams of specific exercises, sets, and reps.", 219 | "Can you log the calories burned from this workout" 220 | ] 221 | ] 222 | }, 223 | { 224 | "active": true, 225 | "name": "image_search_mouse", 226 | "default_image": "tests/images/hp_mouse_512x512.jpg", 227 | "conversations": [ 228 | [ "where can i find this in the lowest price?" ] 229 | ] 230 | }, 231 | { 232 | "active": true, 233 | "name": "image_search_candle", 234 | "default_image": "tests/images/black.jpg", 235 | "conversations": [ 236 | [ 237 | { "text": "what is this?", "image": "tests/images/candle.jpg" }, 238 | "Where can I buy it?" 239 | ] 240 | ] 241 | }, 242 | { 243 | "active": true, 244 | "name": "parking_sign", 245 | "conversations": [ 246 | [ 247 | { "text": "it is 7:50 pm, can I park here right now?", "image": "tests/images/parking_sign.jpg" } 248 | ] 249 | ] 250 | } 251 | ] -------------------------------------------------------------------------------- /vision/utils.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | import cv2 4 | import numpy as np 5 | from abc import ABC, abstractmethod 6 | from typing import List 7 | import os 8 | 9 | def detect_media_type(image_bytes: bytes) -> str: 10 | if image_bytes is not None: 11 | if image_bytes[0:4] == b"\x89PNG": 12 | return "image/png" 13 | elif b"JFIF" in image_bytes[0:64]: # probably should do a stricter check here 14 | return "image/jpeg" 15 | elif image_bytes[0:4] == b"RIFF" and image_bytes[8:12] == b"WEBP": 16 | return "image/webp" 17 | 18 | # Unknown: just assume JPEG 19 | return "image/jpeg" 20 | 21 | 22 | 23 | class BaseFilter(ABC): 24 | @abstractmethod 25 | def apply(self, image): 26 | pass 27 | 28 | @abstractmethod 29 | def adjust(self): 30 | pass 31 | 32 | class SmoothFilter(BaseFilter): 33 | def __init__(self, kernel_size=5): 34 | self.kernel_size = kernel_size 35 | 36 | def apply(self, image): 37 | print("Smoothing kernel size: ", self.kernel_size) 38 | # Apply Gaussian blur 39 | return cv2.GaussianBlur(image, (self.kernel_size, self.kernel_size), 0) 40 | 41 | def adjust(self, kernel_size=None): 42 | self.kernel_size = kernel_size 43 | 44 | class SaturationFilter(BaseFilter): 45 | def __init__(self, saturation=1.0): 46 | self.saturation = saturation 47 | 48 | def apply(self, image): 49 | print("Saturation value: ", self.saturation) 50 | # Convert the image to the HSV color space 51 | hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) 52 | h, s, v = cv2.split(hsv) 53 | 54 | # Apply saturation to the S channel 55 | s = cv2.addWeighted(s, self.saturation, np.zeros_like(s), 0, 0) 56 | 57 | # Merge the 
channels back together 58 | hsv = cv2.merge([h, s, v]) 59 | return cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR) 60 | 61 | def adjust(self, saturation=None, scale=100): 62 | self.saturation = (saturation / scale)*10 if saturation > 0 else 0.1 63 | 64 | class TemperatureFilter(BaseFilter): 65 | def __init__(self, temperature=0.0): 66 | self.temperature = temperature 67 | 68 | def apply(self, image): 69 | print("Temperature value: ", self.temperature) 70 | # Convert the image to the LAB color space 71 | lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB) 72 | l, a, b = cv2.split(lab) 73 | 74 | # Apply temperature to the A channel 75 | a = cv2.addWeighted(a, self.temperature, np.zeros_like(a), 0, 0) 76 | 77 | # Merge the channels back together 78 | lab = cv2.merge([l, a, b]) 79 | return cv2.cvtColor(lab, cv2.COLOR_LAB2BGR) 80 | 81 | def adjust(self, temperature=None, scale=100): 82 | self.temperature = (temperature / scale)*10 if temperature > 0 else 0.1 83 | 84 | class GammaCorrectionFilter(BaseFilter): 85 | def __init__(self, gamma=1.0): 86 | self.gamma = gamma 87 | 88 | def apply(self, image): 89 | print("Gamma value: ", self.gamma) 90 | # Apply gamma correction using cv2.addWeighted() 91 | invGamma = 1.0 / self.gamma 92 | table = np.array([((i / 255.0) ** invGamma) * 255 for i in np.arange(0, 256)]).astype("uint8") 93 | return cv2.LUT(image, table) 94 | def adjust(self, gamma=None, scale=100): 95 | # adjust to 1, 10 scale depending on the slider 96 | self.gamma = (gamma / scale)*10 if gamma > 0 else 0.1 97 | self.gamma = gamma 98 | 99 | class BoostShadowFilter(BaseFilter): 100 | def __init__(self, amount=1): 101 | self.amount = amount 102 | 103 | def apply(self, image): 104 | gamma_corrected = np.power(image / 255.0, self.amount) * 255 105 | return np.uint8(gamma_corrected) 106 | 107 | def adjust(self, amount=None, scale=100): 108 | self.amount = (amount / scale)*10 if amount > 0 else 0.1 109 | 110 | class SharpeningFilter(BaseFilter): 111 | def __init__(self, sigma=1.0, strength=1.0): 112 | self.sigma = sigma 113 | self.strength = strength 114 | 115 | def apply(self, image): 116 | # Apply Gaussian blur 117 | print("Sharpening sigma: ", self.sigma, "Strength: ", self.strength) 118 | blurred = cv2.GaussianBlur(image, (0, 0), self.sigma) 119 | 120 | # Calculate the unsharp mask 121 | unsharp_mask = cv2.addWeighted(image, 1.0 + self.strength, blurred, -self.strength, 0) 122 | 123 | return unsharp_mask 124 | 125 | def adjust(self, amount=None, scale=100): 126 | self.amount = (amount / scale)*10 if amount > 0 else 0.1 127 | 128 | class NoiseReductionFilter(BaseFilter): 129 | def __init__(self, method='gaussian', kernel_size=5): 130 | self.method = method 131 | self.kernel_size = kernel_size 132 | 133 | def apply(self, image): 134 | print("Noise reduction method: ", self.method, "Kernel size: ", self.kernel_size) 135 | if self.method == 'gaussian': 136 | # Apply Gaussian blur 137 | blurred = cv2.GaussianBlur(image, (self.kernel_size, self.kernel_size), 0) 138 | elif self.method == 'median': 139 | # Apply Median blur 140 | blurred = cv2.medianBlur(image, self.kernel_size) 141 | else: 142 | raise ValueError("Unsupported noise reduction method. 
Use 'gaussian' or 'median'.") 143 | 144 | return blurred 145 | def adjust(self, method=None, kernel_size=None): 146 | self.method = method 147 | self.kernel_size = kernel_size 148 | 149 | class ContrastFilter(BaseFilter): 150 | def __init__(self, contrast=1.0): 151 | self.contrast = contrast 152 | 153 | def apply(self, image): 154 | print("Contrast value: ", self.contrast) 155 | # Apply contrast by converting the image to YUV color space 156 | yuv = cv2.cvtColor(image, cv2.COLOR_BGR2YUV) 157 | y, u, v = cv2.split(yuv) 158 | 159 | # Apply contrast to the Y channel 160 | y = cv2.addWeighted(y, self.contrast, np.zeros_like(y), 0, 0) 161 | 162 | # Merge the channels back together 163 | yuv = cv2.merge([y, u, v]) 164 | return cv2.cvtColor(yuv, cv2.COLOR_YUV2BGR) 165 | 166 | def adjust(self, contrast=None, scale=100): 167 | self.contrast = contrast 168 | 169 | class BoostResolutionFilter(BaseFilter): 170 | def __init__(self, factor=2): 171 | self.factor = factor 172 | 173 | def apply(self, image): 174 | print("Resolution boost factor: ", self.factor) 175 | # Upscale the image using bicubic interpolation 176 | return cv2.resize(image, None, fx=self.factor, fy=self.factor, interpolation=cv2.INTER_CUBIC) 177 | 178 | def adjust(self, factor=None, scale=100): 179 | self.factor = factor 180 | 181 | class ApplyBlurFilter(BaseFilter): 182 | def __init__(self, kernel_size=5): 183 | self.kernel_size = kernel_size 184 | 185 | def apply(self, image): 186 | print("Blur kernel size: ", self.kernel_size) 187 | # Apply Gaussian blur 188 | return cv2.GaussianBlur(image, (self.kernel_size, self.kernel_size), 0) 189 | 190 | def adjust(self, kernel_size=None): 191 | self.kernel_size = kernel_size 192 | 193 | class ReduceResolutionFilter(BaseFilter): 194 | def __init__(self, factor=2): 195 | self.factor = factor 196 | 197 | def apply(self, image): 198 | print("Resolution reduction factor: ", self.factor) 199 | # Downscale the image using bicubic interpolation 200 | return cv2.resize(image, None, fx=1.0/self.factor, fy=1.0/self.factor, interpolation=cv2.INTER_CUBIC) 201 | 202 | def adjust(self, factor=None, scale=100): 203 | self.factor = factor 204 | 205 | class WhiteBalanceFilter(BaseFilter): 206 | def __init__(self): 207 | pass 208 | 209 | def apply(self, image): 210 | print("White balance") 211 | # Auto white balance by equalizing the histogram of the LAB L channel 212 | lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB) 213 | l, a, b = cv2.split(lab) 214 | clahe = cv2.createCLAHE(clipLimit=1.0, tileGridSize=(4, 4)) 215 | l = clahe.apply(l) 216 | lab = cv2.merge([l, a, b]) 217 | return cv2.cvtColor(lab, cv2.COLOR_LAB2BGR) 218 | 219 | def adjust(self, b_ratio=None, g_ratio=None, r_ratio=None): 220 | self.b_ratio = b_ratio 221 | self.g_ratio = g_ratio 222 | self.r_ratio = r_ratio 223 | 224 | class BrightnessFilter(BaseFilter): 225 | def __init__(self, brightness=0): 226 | self.brightness = brightness 227 | 228 | def apply(self, image): 229 | print("Brightness value: ", self.brightness) 230 | # Increase the brightness by adding the specified value to each pixel 231 | return cv2.add(image, np.array([self.brightness])) 232 | 233 | def adjust(self, brightness=None, scale=100): 234 | self.brightness = brightness 235 | 236 | class ImageProcessor: 237 | def __init__(self,path: str|List[str]|bytes, filters: List[BaseFilter]): 238 | self.path = path 239 | self.filters = filters 240 | if isinstance(path, list): 241 | self.image = [cv2.imread(p) for p in path] 242 | elif isinstance(path, bytes): 243 | nparr = np.frombuffer(path, 
np.uint8) 244 | self.image = cv2.imdecode(nparr, cv2.IMREAD_COLOR) 245 | else: 246 | self.image = cv2.imread(self.path) 247 | self.filtered_image = None 248 | 249 | def apply_filters(self, image=None): 250 | if image is None: 251 | image = self.image 252 | 253 | if isinstance(image, list): 254 | self.filtered_image = [None] * len(image) 255 | 256 | for i, img in enumerate(image): 257 | for f in self.filters: 258 | self.filtered_image[i] = f.apply(img) 259 | 260 | print("Filtered image: ", len(self.filtered_image)) 261 | else: 262 | for f in self.filters: 263 | image = f.apply(image) 264 | self.filtered_image = image 265 | return image 266 | 267 | def __call__(self, preview: bool = True): 268 | self.apply_filters() 269 | if preview: 270 | self.show_preview(slider=True) 271 | 272 | def show_preview(self, slider=False): 273 | if self.filtered_image is not None: 274 | if isinstance(self.filtered_image, list): 275 | for i, img in enumerate(self.filtered_image): 276 | filters_applied = ", ".join([f.__class__.__name__ for f in self.filters]) 277 | cv2.imshow('Original', self.image[i]) 278 | cv2.imshow('Filtered [{}]'.format(filters_applied), img) 279 | cv2.waitKey(0) 280 | cv2.destroyAllWindows() 281 | else: 282 | filters_applied = ", ".join([f.__class__.__name__ for f in self.filters]) 283 | cv2.imshow('Original', self.image) 284 | cv2.imshow('Filtered [{}]'.format(filters_applied), self.filtered_image) 285 | cv2.waitKey(0) 286 | cv2.destroyAllWindows() 287 | def save(self, dir_path: str="output"): 288 | dir_path = os.path.join(os.getcwd(), dir_path) 289 | if not os.path.exists(dir_path): 290 | os.makedirs(dir_path) 291 | if self.filtered_image is not None: 292 | if isinstance(self.filtered_image, list): 293 | for i, img in enumerate(self.filtered_image): 294 | basename = os.path.basename(self.path[i]) 295 | path = os.path.join(dir_path, basename.replace('.webp', '.jpg')) 296 | cv2.imwrite(path, img) 297 | else: 298 | if isinstance(self.path, bytes): 299 | basename = "test_image.jpg" 300 | else: 301 | basename = os.path.basename(self.path) 302 | path = os.path.join(dir_path, basename.replace('.webp', '.jpg')) 303 | print(path) 304 | cv2.imwrite(path, self.filtered_image) 305 | def get_bytes(self): 306 | if self.filtered_image is not None: 307 | if isinstance(self.filtered_image, list): 308 | return [cv2.imencode('.jpg', img)[1].tobytes() for img in self.filtered_image] 309 | else: 310 | return cv2.imencode('.jpg', self.filtered_image)[1].tobytes() 311 | return None 312 | 313 | def process_image(bytes: bytes)->bytes: 314 | filters:List[BaseFilter] = [ 315 | # BoostShadowFilter(amount=0.8), 316 | GammaCorrectionFilter(gamma=1.2), 317 | # BoostResolutionFilter(factor=), 318 | 319 | SharpeningFilter(sigma=0.5, strength=5.0), 320 | WhiteBalanceFilter(), 321 | 322 | BoostShadowFilter(amount=1.2), 323 | ContrastFilter(contrast=1.2), 324 | SaturationFilter(saturation=1.2), 325 | TemperatureFilter(temperature=1.02), 326 | ApplyBlurFilter(kernel_size=3), 327 | 328 | # ReduceResolutionFilter(factor=2), 329 | 330 | # NoiseReductionFilter(method='gaussian', kernel_size=3), 331 | ] 332 | image_processor = ImageProcessor(path=bytes, filters=filters) 333 | image_processor.apply_filters() 334 | # image_processor.save("output") 335 | return image_processor.get_bytes() -------------------------------------------------------------------------------- /tests/benchmark.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "active": true, 4 | "name": 
"standalone_conversations", 5 | "default_image": "tests/images/black.jpg", 6 | "conversations": [ 7 | [ 8 | { "text": "hi", "capabilities": [ "assistant_knowledge" ] }, 9 | { "text": "Who is the CEO of Brilliant Labs?", "capabilities": [ "web_search" ] }, 10 | { "text": "What is this?", "image": "tests/images/house_plant.jpg", "capabilities": [ "vision" ] } 11 | ], 12 | [ 13 | { "text": "What's the weather like today?", "capabilities": [ "web_search" ] } 14 | ], 15 | [ 16 | { "text": "What time is it?", "capabilities": [ "assistant_knowledge" ] }, 17 | { "text": "What about now?", "capabilities": [ "assistant_knowledge" ] }, 18 | { "text": "And now?", "capabilities": [ "assistant_knowledge" ] } 19 | ], 20 | [ 21 | { "text": "What am I doing?", "image": "tests/images/Zoom_Call.jpg", "capabilities": [ "vision" ] }, 22 | { "text": "How did you know?", "image": "tests/images/Zoom_Call.jpg", "capabilities": [ "vision" ] } 23 | ], 24 | [ 25 | { "text": "Who do you think made this piece of art?", "image": "tests/images/child_artwork.jpg", "capabilities": [ "vision" ] }, 26 | { "text": "How old do you think this artist is?", "image": "tests/images/child_artwork.jpg", "capabilities": [ "vision" ] }, 27 | { "text": "Try to guess based on the nature of the art.", "image": "tests/images/child_artwork.jpg", "capabilities": [ "vision" ] } 28 | ], 29 | [ 30 | { "text": "How many beds are in this room?", "image": "tests/images/two_beds.jpg", "capabilities": [ "vision" ] }, 31 | { "text": "Try again. Are you sure there is only one bed?", "image": "tests/images/two_beds.jpg", "capabilities": [ "vision" ] }, 32 | { "text": "What's the best way I can make this bed to optimize for efficiency and process consistency on a daily basis?", "image": "tests/images/two_beds.jpg", "capabilities": [ "vision" ] }, 33 | { "text": "I'm not familiar with the military corner method. Can you please talk me through it step by step?", "capabilities_any": [ "assistant_knowledge", "web_search" ] } 34 | ], 35 | [ 36 | { "text": "What does this say and what do you think it means?", "image": "tests/images/motivational_sign.jpg", "capabilities": [ "vision" ] }, 37 | { "text": "What would be a different way to phrase this while still achieving the same meaning?", "image": "tests/images/motivational_sign.jpg", "capabilities": [ "assistant_knowledge" ] } 38 | ], 39 | [ 40 | { "text": "What culture style of carpeting weaving is this?", "image": "tests/images/persian_rug.jpg", "capabilities_any": [ "vision" ] }, 41 | { "text": "Yes I understand that but what characteristics of the pattern itself indicate to you that this is a Persian style carpet?", "image": "tests/images/persian_rug.jpg", "capabilities_any": [ "vision" ] } 42 | ], 43 | [ 44 | { "text": "How many different types of toys are strewn about this messy room?", "image": "tests/images/toys_strewn_about.jpg", "capabilities": [ "vision" ] }, 45 | { "text": "Interesting. 
Can you list them?", "image": "tests/images/toys_strewn_about.jpg", "capabilities": [ "vision" ] } 46 | ], 47 | [ 48 | { "text": "Who made this car?", "image": "tests/images/toy_car.jpg", "capabilities": [ "vision" ] }, 49 | { "text": "Which country has the most police cars Noa?", "image": "tests/images/toy_car.jpg", "capabilities": [ "web_search" ] }, 50 | { "text": "How many police cars does it have?", "image": "tests/images/toy_car.jpg", "capabilities": [ "web_search" ] }, 51 | { "text": "Try to make an educated guess based on what you know", "image": "tests/images/toy_car.jpg", "capabilities": [ "web_search" ] }, 52 | { "text": "I am asking you to simply guess. Don't overthink it.", "image": "tests/images/toy_car.jpg", "capabilities": [ "web_search" ] } 53 | ], 54 | [ 55 | { "text": "How many stairs do you count?", "image": "tests/images/staircase.jpg", "capabilities": [ "vision" ] }, 56 | { "text": "What material does it look like the stairs are made out of?", "image": "tests/images/staircase.jpg", "capabilities": [ "vision" ] } 57 | ], 58 | [ 59 | { "text": "How could this track be improved?", "image": "tests/images/toy_track_incomplete.jpg", "capabilities": [ "vision" ] }, 60 | { "text": "How about the shape and design of the track?", "image": "tests/images/toy_track_incomplete.jpg", "capabilities": [ "vision" ] } 61 | ], 62 | [ 63 | { "text": "What interior design style do you see here?", "image": "tests/images/living_room.jpg", "capabilities": [ "vision" ] }, 64 | { "text": "How do you think I could further improve the beauty and functionality of this space?", "image": "tests/images/living_room.jpg", "capabilities": [ "vision" ] } 65 | ], 66 | [ 67 | { "text": "What is this and what do you think it's used for?", "image": "tests/images/pickled_snake.jpg", "capabilities": [ "vision" ] } 68 | ], 69 | [ 70 | { "text": "Do you know who this train is?", "image": "tests/images/thomas_the_train.jpg", "capabilities": [ "vision" ] } 71 | ], 72 | [ 73 | { "text": "How would you describe the weather today?", "image": "tests/images/dreary_day.jpg", "capabilities_any": [ "vision", "web_search" ] }, 74 | { "text": "Try again. Look at the sky.", "image": "tests/images/dreary_day.jpg", "capabilities": [ "vision" ] } 75 | ], 76 | [ 77 | { "text": "What plant is this and how do I take care of it?", "image": "tests/images/house_plant.jpg", "capabilities": [ "vision" ] } 78 | ], 79 | [ 80 | { "text": "What are the directions to play this game?", "image": "tests/images/candyland.jpg", "capabilities": [ "vision" ] } 81 | ], 82 | [ 83 | { "text": "Is this person dressed appropriately for the weather?", "image": "tests/images/bobak_dressed_for_weather.jpg", "capabilities": [ "vision" ] }, 84 | { "text": "How old does he look?", "image": "tests/images/bobak_dressed_for_weather.jpg", "capabilities": [ "vision" ] }, 85 | { "text": "Just take a guess based purely on what you see.", "image": "tests/images/bobak_dressed_for_weather.jpg", "capabilities": [ "vision" ] }, 86 | { "text": "Simply guess, it's ok to make a mistake.", "image": "tests/images/bobak_dressed_for_weather.jpg", "capabilities": [ "vision" ] }, 87 | { "text": "What led you to guess the person is in their mid 30's?", "image": "tests/images/bobak_dressed_for_weather.jpg", "capabilities": [ "vision" ] }, 88 | { "text": "Unpack that, please. 
Tell me more.", "image": "tests/images/bobak_dressed_for_weather.jpg", "capabilities": [ "vision" ] } 89 | ], 90 | [ 91 | { "text": "What kind of flowers are these and is there a place nearby where I can buy them?", "image": "tests/images/flower_arrangement.jpg", "capabilities_any": [ "vision" ] }, 92 | { "text": "How much do you think they would be for a bouquet?", "image": "tests/images/flower_arrangement.jpg", "capabilities_any": [ "vision", "web_search" ] } 93 | ], 94 | [ 95 | { "text": "What do these paintings say about the psychological state of the artist?", "image": "tests/images/colorful_landscape_paintings.jpg", "capabilities": [ "vision" ] } 96 | ], 97 | [ 98 | { "text": "What's a good interface modality for a pair of smart glasses like these?", "image": "tests/images/frame_smart_glasses.jpg", "capabilities_any": [ "vision" ] } 99 | ], 100 | [ 101 | { "text": "What does this say?", "image": "tests/images/STEM_card_Chinese.jpg", "capabilities": [ "vision" ] }, 102 | { "text": "OK but can you translate the Chinese into English?", "image": "tests/images/STEM_card_Chinese.jpg", "capabilities": [ "vision" ] }, 103 | { "text": "What does this say?", "image": "tests/images/STEM_card_Chinese_closeup.jpg", "capabilities": [ "vision" ] }, 104 | { "text": "How about the larger characters? Can you translate those to English?", "image": "tests/images/STEM_card_Chinese_closeup.jpg", "capabilities": [ "vision" ] }, 105 | { "text": "That is incorrect. Try again.", "image": "tests/images/STEM_card_Chinese_closeup.jpg", "capabilities": [ "vision" ] } 106 | ], 107 | [ 108 | { "text": "Describe this and any cultural meaning.", "image": "tests/images/chinese_dragon.jpg", "capabilities": [ "vision" ] }, 109 | { "text": "That's really cool, is this the year of the dragon?", "image": "tests/images/chinese_dragon.jpg", "capabilities": [ "assistant_knowledge", "web_search" ] } 110 | ], 111 | [ 112 | { "text": "How many cars do you see and what kind of cars are they?", "image": "tests/images/cars_behind_wall.jpg", "capabilities": [ "vision" ] } 113 | ], 114 | [ 115 | { "text": "How many golf balls can fit in this trash can?", "image": "tests/images/small_trash_can.jpg", "capabilities": [ "vision" ] } 116 | ], 117 | [ 118 | { "text": "What is this book about and where can I get it?", "image": "tests/images/book_viral_justice.jpg", "capabilities": [ "vision" ] } 119 | ], 120 | [ 121 | { "text": "What is this about and where can I get it?", "image": "tests/images/book_atomic_habits.jpg", "capabilities": [ "vision" ] } 122 | ], 123 | [ 124 | { "text": "What is this and where can I get it?", "image": "tests/images/airpod_case.jpg", "capabilities": [ "vision" ] } 125 | ], 126 | [ 127 | { "text": "What is this and where can I get it?", "image": "tests/images/airpod_case_open.jpg", "capabilities": [ "vision" ] }, 128 | { "text": "Who makes these earpods and what are they called?", "image": "tests/images/airpod_case_open.jpg", "capabilities": [ "vision" ] } 129 | ], 130 | [ 131 | { "text": "Who makes this hat?", "image": "tests/images/ua_hat.jpg", "capabilities": [ "vision" ] }, 132 | { "text": "Try analyzing the logo on the hat as a clue to who the manufacturer might be.", "image": "tests/images/ua_hat.jpg", "capabilities": [ "vision" ] } 133 | ], 134 | [ 135 | { "text": "What is the weather forecast for Beijing tonight?", "capabilities": [ "web_search" ] }, 136 | { "text": "Is it going to rain?", "capabilities": [ "web_search" ] }, 137 | { "text": "Thanks, ok, then I will bring my pet lovebird inside. 
Her name is Sunny. What's an alternative quirky name that I could give her which is reminiscent of funk Shakespearean pop?", "capabilities": [ "assistant_knowledge" ] }, 138 | { "text": "That name is absolutely rad. What was your inspiration?", "capabilities": [ "assistant_knowledge" ] } 139 | ], 140 | [ 141 | { "text": "What kind of dog is this and why would it be dressed that way?", "image": "tests/images/pug_asian_clothing.jpg", "capabilities_any": [ "vision" ] }, 142 | { "text": "My wife and I have been married for 10 years and our eldest child is 6. How long were we married before we had our first child?", "capabilities": [ "assistant_knowledge" ] } 143 | ], 144 | [ 145 | { "text": "Which actor is this?", "image": "tests/images/some_old_actor.jpg", "capabilities": [ "vision" ] } 146 | ], 147 | [ 148 | { "text": "What can you make with these ingredients?", "image": "tests/images/eggs_banana_avocado.jpg", "capabilities_any": [ "vision", "vision" ] } 149 | ], 150 | [ 151 | { "text": "Can you give me a nuitritional breakdown of this?", "image": "tests/images/popcorn.jpg", "capabilities_any": [ "vision", "vision" ] } 152 | ], 153 | [ 154 | { "text": "Who designed this building?", "image": "tests/images/Wangjing_Soho.jpg", "capabilities": [ "vision" ] } 155 | ], 156 | [ 157 | { "text": "How tall is this figurine?", "image": "tests/images/figurine_on_table.jpg", "capabilities": [ "vision" ] } 158 | ], 159 | [ 160 | { "text": "it is 7:50 pm, can I park here right now?", "image": "tests/images/parking_sign.jpg", "capabilities": [ "vision" ] } 161 | ], 162 | [ 163 | { "text": "What is Apple's stock price?", "capabilities": [ "web_search" ] }, 164 | { "text": "Why has it been performing the way it has?", "capabilities": [ "web_search" ] } 165 | ] 166 | ] 167 | }, 168 | { 169 | "active": true, 170 | "name": "mixed_conversations", 171 | "default_image": "tests/images/black.jpg", 172 | "conversations": [ 173 | [ 174 | { "text": "What can you make with these ingredients?", "image": "tests/images/eggs_banana_avocado.jpg", "capabilities_any": [ "vision" ] }, 175 | { "text": "Describe this and any cultural meaning.", "image": "tests/images/chinese_dragon.jpg", "capabilities": [ "vision" ] } 176 | ], 177 | [ 178 | { "text": "What are the directions to play this game?", "image": "tests/images/candyland.jpg", "capabilities": [ "vision" ] }, 179 | { "text": "How many stairs do you count?", "image": "tests/images/staircase.jpg", "capabilities": [ "vision" ] } 180 | ], 181 | [ 182 | { "text": "What kind of flowers are these and is there a place nearby where I can buy them?", "image": "tests/images/flower_arrangement.jpg", "capabilities_any": [ "vision" ] }, 183 | { "text": "How much do you think they would be for a bouquet?", "image": "tests/images/flower_arrangement.jpg", "capabilities_any": [ "web_search", "vision" ] }, 184 | { "text": "What am I doing?", "image": "tests/images/Zoom_Call.jpg", "capabilities": [ "vision" ] } 185 | ], 186 | [ 187 | { "text": "What does this say?", "image": "tests/images/chinese_sign_1.webp", "capabilities": [ "vision" ] }, 188 | { "text": "What brand are these gloves?", "image": "tests/images/winning_gloves.webp", "capabilities_any": [ "vision" ] }, 189 | { "text": "What are they used for?", "capabilities": [ "web_search" ] } 190 | ], 191 | [ 192 | { "text": "How many cars do you see and what kind of cars are they?", "image": "tests/images/cars_behind_wall.jpg", "capabilities": [ "vision" ] }, 193 | { "text": "How many golf balls can fit in this trash can?", "image": 
"tests/images/small_trash_can.jpg", "capabilities": [ "vision" ] }, 194 | { "text": "What's the latest on Moscow?", "capabilities": [ "web_search" ] }, 195 | { "text": "What is this?", "image": "tests/images/candle.jpg", "capabilities": [ "vision" ] } 196 | ] 197 | ] 198 | } 199 | ] -------------------------------------------------------------------------------- /run_benchmark.py: -------------------------------------------------------------------------------- 1 | # 2 | # run_benchmark.py 3 | # 4 | # Benchmark queries for AI assistant. Used for testing and assessing the quality of assistant 5 | # responses. This script talks to a production endpoint and not the Python assistant server 6 | # directly. Simply run: 7 | # 8 | # python run_benchmark.py tests/tests.json 9 | # 10 | # Use --help for more instructions. 11 | # 12 | 13 | import argparse 14 | from datetime import datetime 15 | from enum import Enum 16 | import json 17 | import os 18 | import requests 19 | from typing import List, Optional 20 | 21 | import numpy as np 22 | from pydantic import BaseModel, RootModel 23 | 24 | from models import Capability, MultimodalResponse 25 | 26 | 27 | #################################################################################################### 28 | # Test Case JSON and Evaluation 29 | #################################################################################################### 30 | 31 | class UserMessage(BaseModel): 32 | text: str 33 | image: Optional[str] = None 34 | capabilities: Optional[List[Capability]] = None # capabilities that are required to have been used 35 | capabilities_any: Optional[List[Capability]] = None # must use at least one of the capabilities listed here 36 | 37 | class TestCase(BaseModel): 38 | name: str 39 | active: bool 40 | default_image: Optional[str] = None 41 | conversations: List[List[UserMessage | str]] 42 | 43 | class TestCaseFile(RootModel): 44 | root: List[TestCase] 45 | 46 | class TestResult(str, Enum): 47 | FAILED = "FAILED" 48 | IGNORED = "IGNORED" 49 | PASSED = "PASSED" 50 | 51 | def load_tests(filepath: str) -> List[TestCase]: 52 | with open(file=filepath, mode="r") as fp: 53 | text = fp.read() 54 | return TestCaseFile.model_validate_json(json_data=text).root 55 | 56 | def evaluate_capabilities_used(input: UserMessage, output: MultimodalResponse) -> TestResult: 57 | # Do we have anything to evaluate against? 
58 | has_required_capabilities = input.capabilities is not None and len(input.capabilities) > 0 59 | has_any_capabilities = input.capabilities_any is not None and len(input.capabilities_any) > 0 60 | if (not has_required_capabilities) and (not has_any_capabilities): 61 | # Ignore if desired test results are not specified 62 | return TestResult.IGNORED 63 | 64 | capabilities_used = output.capabilities_used 65 | 66 | # Evaluate result against required capabilities 67 | if has_required_capabilities: 68 | for required_capability in input.capabilities: 69 | if required_capability not in capabilities_used: 70 | return TestResult.FAILED 71 | 72 | # Evaluate result against "any capabilities" (an OR function) 73 | if has_any_capabilities: 74 | any_present = False 75 | for interchangeable_capability in input.capabilities_any: 76 | if interchangeable_capability in capabilities_used: 77 | any_present = True 78 | if not any_present: 79 | return TestResult.FAILED 80 | 81 | return TestResult.PASSED 82 | 83 | 84 | #################################################################################################### 85 | # Helper Functions 86 | #################################################################################################### 87 | 88 | def load_binary_file(filepath: str) -> bytes: 89 | with open(file=filepath, mode="rb") as fp: 90 | return fp.read() 91 | 92 | 93 | #################################################################################################### 94 | # Markdown Report Generation 95 | #################################################################################################### 96 | 97 | class ReportGenerator: 98 | def __init__(self, test_filepath: str, generate_markdown: bool): 99 | self._generate_markdown = generate_markdown 100 | if not generate_markdown: 101 | return 102 | base = os.path.splitext(os.path.basename(test_filepath))[0] 103 | filename = f"{base}.md" 104 | self._fp = open(file=filename, mode="w") 105 | self._fp.write(f"# {test_filepath}\n\n") 106 | self._total_times = [] 107 | 108 | def __del__(self): 109 | if not self._generate_markdown: 110 | return 111 | self._fp.close() 112 | 113 | def begin_test(self, name: str): 114 | self._total_times = [] 115 | if not self._generate_markdown: 116 | return 117 | self._fp.write(f"## {name}\n\n") 118 | self._fp.write(f"|Passed?|User|Assistant|Image|Debug|\n") 119 | self._fp.write(f"|-------|----|---------|-----|-----|\n") 120 | 121 | def begin_conversation(self): 122 | if not self._generate_markdown: 123 | return 124 | self._fp.write("|\\-\\-\\-\\-\\-\\-\\-\\-|\\-\\-\\-\\-\\-\\-\\-\\-|\\-\\-\\-\\-\\-\\-\\-\\-|\\-\\-\\-\\-\\-\\-\\-\\-|\\-\\-\\-\\-\\-\\-\\-\\-|\n") 125 | 126 | def end_conversation(self): 127 | pass 128 | 129 | def add_result(self, user_message: UserMessage, response: MultimodalResponse, assistant_response: str, test_result: TestResult): 130 | if not self._generate_markdown: 131 | return 132 | passed_column = f"{test_result.value}" 133 | user_column = self._escape(user_message.text) 134 | assistant_column = self._escape(assistant_response) 135 | image_column = f"\"image\"" if user_message.image is not None else "" 136 | debug_column = f"```{response.debug_tools}```" 137 | self._fp.write(f"|{passed_column}|{user_column}|{assistant_column}|{image_column}|{debug_column}|\n") 138 | 139 | # Timings 140 | try: 141 | timings = json.loads(response.timings) 142 | self._total_times.append(float(timings["total_time"])) 143 | except: 144 | pass 145 | 146 | def end_test(self, num_passed: int, num_evaluated: int): 
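# Writes the per-test score line and a latency percentile table to the Markdown report.
# Assumes at least one "total_time" was parsed into self._total_times; np.min/np.max raise
# on an empty list.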
147 | if not self._generate_markdown: 148 | return 149 | mean_time = np.mean(self._total_times) 150 | median_time = np.median(self._total_times) 151 | min_time = np.min(self._total_times) 152 | max_time = np.max(self._total_times) 153 | pct90_time = np.quantile(self._total_times, q=0.9) 154 | pct95_time = np.quantile(self._total_times, q=0.95) 155 | pct99_time = np.quantile(self._total_times, q=0.99) 156 | if num_evaluated == 0: 157 | self._fp.write(f"**Score: N/A**\n\n") 158 | else: 159 | self._fp.write(f"**Score: {100.0 * num_passed / num_evaluated : .1f}%**\n\n") 160 | self._fp.write(f"**Timings**\n") 161 | self._fp.write(f"|Mean|Median|Min|Max|90%|95%|99%|\n") 162 | self._fp.write(f"|----|------|---|---|---|---|---|\n") 163 | self._fp.write(f"|{mean_time:.1f}|{median_time:.1f}|{min_time:.1f}|{max_time:.1f}|{pct90_time:.1f}|{pct95_time:.1f}|{pct99_time:.1f}|\n\n") 164 | 165 | @staticmethod 166 | def _escape(text: str) -> str: 167 | special_chars = "\\`'\"*_{}[]()#+-.!" 168 | escaped_text = ''.join(['\\' + char if char in special_chars else char for char in text]) 169 | return escaped_text.replace("\n", " ") 170 | 171 | 172 | #################################################################################################### 173 | # Main Program 174 | #################################################################################################### 175 | 176 | if __name__ == "__main__": 177 | parser = argparse.ArgumentParser("run_benchmark") 178 | parser.add_argument("file", nargs=1) 179 | parser.add_argument("--endpoint", action="store", default="https://api.brilliant.xyz/dev/noa/mm", help="Address to send request to (Noa server)") 180 | parser.add_argument("--token", action="store", help="Noa API token") 181 | parser.add_argument("--test", metavar="name", help="Run specific test") 182 | parser.add_argument("--markdown", action="store_true", help="Produce report in markdown file") 183 | parser.add_argument("--vision", action="store", help="Vision model to use (gpt-4o, gpt-4-vision-preview, claude-3-haiku-20240307, claude-3-sonnet-20240229, claude-3-opus-20240229)", default="gpt-4o") 184 | parser.add_argument("--address", action="store", default="San Francisco, CA 94115", help="Simulated location") 185 | options = parser.parse_args() 186 | 187 | # Load tests 188 | tests = load_tests(filepath=options.file[0]) 189 | 190 | # Markdown report generator 191 | report = ReportGenerator(test_filepath=options.file[0], generate_markdown=options.markdown) 192 | 193 | # Authorization header 194 | headers = { 195 | "Authorization": options.token if options.token is not None else os.getenv("BRILLIANT_API_KEY") 196 | } 197 | 198 | # Metrics 199 | total_user_prompts = 0 200 | total_tokens_in = 0 201 | total_tokens_out = 0 202 | total_times = [] 203 | localhost = options.endpoint == "localhost" 204 | 205 | # Run all active tests 206 | for test in tests: 207 | if not options.test: 208 | # No specific test, run all that are active 209 | if not test.active: 210 | continue 211 | else: 212 | if test.name.lower().strip() != options.test.lower().strip(): 213 | continue 214 | 215 | print(f"Test: {test.name}") 216 | report.begin_test(name=test.name) 217 | num_evaluated = 0 218 | num_passed = 0 219 | 220 | for conversation in test.conversations: 221 | report.begin_conversation() 222 | 223 | # Create new message history for each conversation 224 | history = [] 225 | for user_message in conversation: 226 | # Each user message can be either a string or a UserMessage object 227 | assert isinstance(user_message, str) 
or isinstance(user_message, UserMessage) 228 | if isinstance(user_message, str): 229 | user_message = UserMessage(text=user_message) 230 | 231 | # If there is no image associated with this message, use the default image, if it 232 | # exists 233 | if user_message.image is None and test.default_image is not None: 234 | user_message = user_message.model_copy() 235 | user_message.image = test.default_image 236 | 237 | # Construct API call data 238 | if localhost: 239 | options.endpoint = "http://localhost:8000/mm" 240 | data = { 241 | "mm": json.dumps({ 242 | "prompt": user_message.text, 243 | "messages": history, 244 | "address": options.address, 245 | "local_time": datetime.now().strftime("%A, %B %d, %Y, %I:%M %p"), 246 | "search_api": "perplexity", 247 | "config": { "engine": "google_lens" }, 248 | "experiment": "1", 249 | "vision": options.vision 250 | } 251 | ), 252 | } 253 | else: 254 | data = { 255 | "prompt": user_message.text, 256 | "messages": json.dumps(history), 257 | "address": options.address, 258 | "local_time": datetime.now().strftime("%A, %B %d, %Y, %I:%M %p"), 259 | "search_api": "perplexity", 260 | "config": json.dumps({ "engine": "google_lens" }), 261 | "experiment": "1", # this activates the passthrough to the Python ai-experiments code 262 | "vision": options.vision 263 | } 264 | files = {} 265 | if user_message.image is not None: 266 | files["image"] = (os.path.basename(user_message.image), load_binary_file(filepath=user_message.image)) 267 | 268 | # Make the call and evaluate 269 | response = requests.post(url=options.endpoint, files=files, data=data, headers=headers) 270 | error = False 271 | try: 272 | if response.status_code != 200: 273 | print(f"Error: {response.status_code}") 274 | print(response.content) 275 | response.raise_for_status() 276 | #print(response.content) 277 | mm_response = MultimodalResponse.model_validate_json(json_data=response.content) 278 | #print("Sent:") 279 | #print(json.dumps(history)) 280 | 281 | test_result = evaluate_capabilities_used(input=user_message, output=mm_response) 282 | if test_result != TestResult.IGNORED: 283 | num_evaluated += 1 284 | num_passed += (1 if test_result == TestResult.PASSED else 0) 285 | 286 | history.append({ "role": "user", "content": user_message.text }) 287 | 288 | assistant_response = "" 289 | if len(mm_response.response) > 0: 290 | assistant_response = mm_response.response 291 | elif len(mm_response.image) > 0: 292 | assistant_response = "" 293 | if len(assistant_response) > 0: 294 | history.append({ "role": "assistant", "content": assistant_response }) 295 | 296 | timings = json.loads(mm_response.timings) 297 | 298 | print(f"User: {user_message.text}" + (f" ({user_message.image})" if user_message.image else "")) 299 | print(f"Response: {assistant_response}") 300 | print(f"Tools: {mm_response.debug_tools}") 301 | print(f"Timings: {timings}") 302 | #pct_out = float(content["output_tokens"]) / float(content["total_tokens"]) * 100.0 303 | #print(f"Tokens: in={content['input_tokens']}, out={content['output_tokens']} %out={pct_out:.0f}%") 304 | print(f"Test: {test_result}") 305 | print("") 306 | report.add_result(user_message=user_message, response=mm_response, assistant_response=assistant_response, test_result=test_result) 307 | 308 | total_user_prompts += 1 309 | total_tokens_in += mm_response.input_tokens 310 | total_tokens_out += mm_response.output_tokens 311 | 312 | total_times.append(float(timings["total_time"])) 313 | 314 | except Exception as e: 315 | print(f"Error: {e}") 316 | 317 | 
report.end_conversation() 318 | 319 | # Print test results 320 | print("") 321 | print(f"TEST RESULTS: {test.name}") 322 | if num_evaluated == 0: 323 | print(f" Score: N/A") 324 | else: 325 | print(f" Score: {num_passed}/{num_evaluated} = {100.0 * num_passed / num_evaluated : .1f}%") 326 | report.end_test(num_passed=num_passed, num_evaluated=num_evaluated) 327 | 328 | # Summary 329 | print(f"User messages: {total_user_prompts}") 330 | print(f"Total input tokens: {total_tokens_in}") 331 | print(f"Total output tokens: {total_tokens_out}") 332 | print(f"Average input tokens: {total_tokens_in / total_user_prompts}") 333 | print(f"Average output tokens: {total_tokens_out / total_user_prompts}") 334 | 335 | # Timings 336 | mean_time = np.mean(total_times) 337 | median_time = np.median(total_times) 338 | min_time = np.min(total_times) 339 | max_time = np.max(total_times) 340 | pct90_time = np.quantile(total_times, q=0.9) 341 | pct95_time = np.quantile(total_times, q=0.95) 342 | pct99_time = np.quantile(total_times, q=0.99) 343 | print("") 344 | print("Timing") 345 | print("------") 346 | print(f"Mean : {mean_time:.1f}") 347 | print(f"Median: {median_time:.1f}") 348 | print(f"Min : {min_time:.1f}") 349 | print(f"Max : {max_time:.1f}") 350 | print(f"90% : {pct90_time:.1f}") 351 | print(f"95% : {pct95_time:.1f}") 352 | print(f"99% : {pct99_time:.1f}") -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | # 2 | # app.py 3 | # 4 | # Noa assistant server application. Provides /mm endpoint. 5 | # 6 | 7 | import asyncio 8 | from datetime import datetime 9 | from io import BytesIO 10 | import os 11 | import traceback 12 | from typing import Annotated, Dict, List, Tuple 13 | import glob 14 | import openai 15 | import anthropic 16 | import groq 17 | from pydantic import BaseModel, ValidationError 18 | from pydub import AudioSegment 19 | from fastapi import FastAPI, status, Form, UploadFile, Request 20 | from pydantic import BaseModel, ValidationError 21 | from fastapi.exceptions import HTTPException 22 | from fastapi.encoders import jsonable_encoder 23 | 24 | from models import Capability, TokenUsage, SearchAPI, VisionModel, GenerateImageService, MultimodalRequest, MultimodalResponse, ExtractLearnedContextRequest, ExtractLearnedContextResponse 25 | from web_search import WebSearch, DataForSEOWebSearch, SerpWebSearch, PerplexityWebSearch 26 | from vision import Vision, GPT4Vision, ClaudeVision 27 | from vision.utils import process_image 28 | from generate_image import ReplicateGenerateImage 29 | from assistant import Assistant, AssistantResponse, GPTAssistant, ClaudeAssistant, extract_learned_context 30 | 31 | 32 | #################################################################################################### 33 | # Configuration 34 | #################################################################################################### 35 | 36 | EXPERIMENT_AI_PORT = os.environ.get('EXPERIMENT_AI_PORT',8000) 37 | PERPLEXITY_API_KEY = os.environ.get("PERPLEXITY_API_KEY", None) 38 | ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", None) 39 | 40 | 41 | #################################################################################################### 42 | # Server API 43 | #################################################################################################### 44 | 45 | app = FastAPI() 46 | 47 | class Checker: 48 | def __init__(self, model: BaseModel): 49 | self.model = 
model 50 | 51 | def __call__(self, data: str = Form(...)): 52 | try: 53 | return self.model.model_validate_json(data) 54 | except ValidationError as e: 55 | raise HTTPException( 56 | detail=jsonable_encoder(e.errors()), 57 | status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, 58 | ) 59 | 60 | async def transcribe(client: openai.AsyncOpenAI, audio_bytes: bytes) -> str: 61 | # Create a file-like object for Whisper API to consume 62 | audio = AudioSegment.from_file(BytesIO(audio_bytes)) 63 | buffer = BytesIO() 64 | buffer.name = "voice.mp4" 65 | audio.export(buffer, format="mp4") 66 | 67 | # Whisper 68 | transcript = await client.audio.translations.create( 69 | model="whisper-1", 70 | file=buffer, 71 | ) 72 | return transcript.text 73 | 74 | def validate_assistant_model(model: str | None, models: List[str]) -> str: 75 | """ 76 | Ensures a valid model is selected. 77 | 78 | Parameters 79 | ---------- 80 | model : str | None 81 | Model name to use. 82 | models : List[str] 83 | List of valid models. The first model is the default model. 84 | 85 | Returns 86 | ------- 87 | str 88 | If the model name is in the list, returns it as-is, otherwise returns the first model in the 89 | list by default. 90 | """ 91 | if model is None or model not in models: 92 | return models[0] 93 | return model 94 | 95 | def get_assistant(app, mm: MultimodalRequest) -> Tuple[Assistant, str | None]: 96 | assistant_model = mm.assistant_model 97 | 98 | # Default assistant if none selected 99 | if mm.assistant is None or (mm.assistant not in [ "gpt", "claude", "groq" ]): 100 | return app.state.assistant, None # None for assistant_model will force assistant to use its own internal default choice 101 | 102 | # Return assistant and a valid model for it 103 | if mm.assistant == "gpt": 104 | assistant_model = validate_assistant_model(model=mm.assistant_model, models=[ "gpt-4o", "gpt-3.5-turbo-1106", "gpt-3.5-turbo", "gpt-4-turbo", "gpt-4-turbo-2024-04-09", "gpt-4-turbo-preview", "gpt-4-1106-preview" ]) 105 | if mm.openai_key and len(mm.openai_key) > 0: 106 | return GPTAssistant(client=openai.AsyncOpenAI(api_key=mm.openai_key)), assistant_model 107 | return GPTAssistant(client=app.state.openai_client), assistant_model 108 | elif mm.assistant == "claude": 109 | assistant_model = validate_assistant_model(model=mm.assistant_model, models=[ "claude-3-sonnet-20240229", "claude-3-haiku-20240307", "claude-3-opus-20240229" ]) 110 | return ClaudeAssistant(client=app.state.anthropic_client), assistant_model 111 | elif mm.assistant == "groq": 112 | assistant_model = validate_assistant_model(model=mm.assistant_model, models=[ "llama3-70b-8192", "llama3-8b-8192", "mixtral-8x7b-32768", "gemma-7b-it" ]) 113 | return GPTAssistant(client=app.state.groq_client), assistant_model # Groq uses GPTAssistant 114 | 115 | # Should never fall through to here 116 | return None, "" 117 | 118 | def get_web_search_provider(app, mm: MultimodalRequest) -> WebSearch: 119 | # Use provider specified in request options 120 | if mm.search_api == SearchAPI.SERP: 121 | return SerpWebSearch(save_to_file=options.save, engine=mm.search_engine.value, max_search_results=mm.max_search_results) 122 | elif mm.search_api == SearchAPI.DATAFORSEO: 123 | return DataForSEOWebSearch(save_to_file=options.save, max_search_results=mm.max_search_results) 124 | elif mm.search_api == SearchAPI.PERPLEXITY: 125 | if mm.perplexity_key and len(mm.perplexity_key) > 0: 126 | return PerplexityWebSearch(api_key=mm.perplexity_key) 127 | return PerplexityWebSearch(api_key=PERPLEXITY_API_KEY) 128 | 
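# Note: the SERP and DataForSEO branches above read `options.save`, a module-level
# namespace that only exists when app.py is launched directly (see the __main__ block).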
129 | # Default provider 130 | return app.state.web_search 131 | 132 | def get_vision_provider(app, mm: MultimodalRequest) -> Vision | None: 133 | # Use provider specified 134 | if mm.vision in [VisionModel.GPT4O, VisionModel.GPT4Vision ]: 135 | return GPT4Vision(client=app.state.openai_client, model=mm.vision) 136 | elif mm.vision in [VisionModel.CLAUDE_HAIKU, VisionModel.CLAUDE_SONNET, VisionModel.CLAUDE_OPUS]: 137 | return ClaudeVision(client=app.state.anthropic_client, model=mm.vision) 138 | 139 | # Default provider 140 | return app.state.vision 141 | 142 | @app.get('/health') 143 | async def api_health(): 144 | return {"status":200,"message":"running ok"} 145 | 146 | MAX_FILES = 100 147 | AUDIO_DIR = "audio" 148 | 149 | def get_next_filename(): 150 | existing_files = sorted(glob.glob(f"{AUDIO_DIR}/audio*.wav")) 151 | # if audio directory does not exist, create it 152 | if not os.path.exists(AUDIO_DIR): 153 | os.makedirs(AUDIO_DIR) 154 | if len(existing_files) < MAX_FILES: 155 | return f"{AUDIO_DIR}/audio{len(existing_files) + 1}.wav" 156 | else: 157 | # All files exist, so find the oldest one to overwrite 158 | oldest_file = min(existing_files, key=os.path.getmtime) 159 | return oldest_file 160 | 161 | @app.post("/mm") 162 | async def api_mm(request: Request, mm: Annotated[str, Form()], audio : UploadFile = None, image: UploadFile = None): 163 | try: 164 | mm: MultimodalRequest = Checker(MultimodalRequest)(data=mm) 165 | # print(mm) 166 | 167 | # Transcribe voice prompt if it exists 168 | voice_prompt = "" 169 | if audio: 170 | audio_bytes = await audio.read() 171 | if mm.testing_mode: 172 | # save audio file 173 | # set timestamp 174 | # filepath = "audio.wav" + str(datetime.now().timestamp()) 175 | filepath = get_next_filename() 176 | with open(filepath, "wb") as f: 177 | f.write(audio_bytes) 178 | if mm.openai_key and len(mm.openai_key) > 0: 179 | voice_prompt = await transcribe(client=openai.AsyncOpenAI(api_key=mm.openai_key), audio_bytes=audio_bytes) 180 | else: 181 | voice_prompt = await transcribe(client=request.app.state.openai_client, audio_bytes=audio_bytes) 182 | 183 | 184 | # Construct final prompt 185 | if mm.prompt is None or len(mm.prompt) == 0 or mm.prompt.isspace() or mm.prompt == "": 186 | user_prompt = voice_prompt 187 | else: 188 | user_prompt = mm.prompt + " " + voice_prompt 189 | 190 | # Image data 191 | image_bytes = (await image.read()) if image else None 192 | # preprocess image 193 | if image_bytes: 194 | image_bytes = process_image(image_bytes) 195 | # Location data 196 | address = mm.address 197 | 198 | # User's local time 199 | local_time = mm.local_time 200 | 201 | # Image generation (bypasses assistant altogether) 202 | if mm.generate_image != 0: 203 | if mm.generate_image_service == GenerateImageService.REPLICATE: 204 | generate_image = ReplicateGenerateImage() 205 | image_url = await generate_image.generate_image( 206 | query=user_prompt, 207 | use_image=True, 208 | image_bytes=image_bytes 209 | ) 210 | return MultimodalResponse( 211 | user_prompt=user_prompt, 212 | response="", 213 | image=image_url, 214 | token_usage_by_model={}, 215 | capabilities_used=[Capability.IMAGE_GENERATION], 216 | total_tokens=0, 217 | input_tokens=0, 218 | output_tokens=0, 219 | timings="", 220 | debug_tools="" 221 | ) 222 | 223 | # Get assistant tool providers 224 | web_search: WebSearch = get_web_search_provider(app=request.app, mm=mm) 225 | vision: Vision = get_vision_provider(app=request.app, mm=mm) 226 | 227 | # Call the assistant and deliver the response 228 | try: 
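# get_assistant() selects the implementation (GPT, Claude, or Groq via the OpenAI-compatible
# GPTAssistant) from mm.assistant and returns a model name validated for it; a None model
# tells the assistant to fall back to its own internal default.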
229 | assistant, assistant_model = get_assistant(app=app, mm=mm) 230 | assistant_response: AssistantResponse = await assistant.send_to_assistant( 231 | prompt=user_prompt, 232 | noa_system_prompt=mm.noa_system_prompt, 233 | image_bytes=image_bytes, 234 | message_history=mm.messages, 235 | learned_context={}, 236 | local_time=local_time, 237 | location_address=address, 238 | model=assistant_model, 239 | web_search=web_search, 240 | vision=vision, 241 | speculative_vision=mm.speculative_vision 242 | ) 243 | 244 | return MultimodalResponse( 245 | user_prompt=user_prompt, 246 | response=assistant_response.response, 247 | image=assistant_response.image, 248 | token_usage_by_model=assistant_response.token_usage_by_model, 249 | capabilities_used=assistant_response.capabilities_used, 250 | total_tokens=0, 251 | input_tokens=0, 252 | output_tokens=0, 253 | timings=assistant_response.timings, 254 | debug_tools=assistant_response.debug_tools 255 | ) 256 | except Exception as e: 257 | print(f"{traceback.format_exc()}") 258 | raise HTTPException(400, detail=f"{str(e)}: {traceback.format_exc()}") 259 | 260 | except Exception as e: 261 | print(f"{traceback.format_exc()}") 262 | raise HTTPException(400, detail=f"{str(e)}: {traceback.format_exc()}") 263 | 264 | @app.post("/extract_learned_context") 265 | async def api_extract_learned_context(request: Request, params: Annotated[str, Form()]): 266 | try: 267 | params: ExtractLearnedContextRequest = Checker(ExtractLearnedContextRequest)(data=params) 268 | print(params) 269 | 270 | token_usage_by_model: Dict[str, TokenUsage] = {} 271 | 272 | # Perform extraction 273 | try: 274 | learned_context = await extract_learned_context( 275 | client=request.app.state.openai_client, 276 | message_history=params.messages, 277 | model="gpt-3.5-turbo-1106", 278 | existing_learned_context=params.existing_learned_context, 279 | token_usage_by_model=token_usage_by_model 280 | ) 281 | 282 | return ExtractLearnedContextResponse( 283 | learned_context=learned_context, 284 | token_usage_by_model=token_usage_by_model 285 | ) 286 | except Exception as e: 287 | print(f"{traceback.format_exc()}") 288 | raise HTTPException(400, detail=f"{str(e)}: {traceback.format_exc()}") 289 | 290 | except Exception as e: 291 | print(f"{traceback.format_exc()}") 292 | raise HTTPException(400, detail=f"{str(e)}: {traceback.format_exc()}") 293 | 294 | 295 | #################################################################################################### 296 | # Program Entry Point 297 | #################################################################################################### 298 | 299 | if __name__ == "__main__": 300 | import argparse 301 | parser = argparse.ArgumentParser() 302 | parser.add_argument("--query", action="store", help="Perform search query and exit") 303 | parser.add_argument("--location", action="store", default="San Francisco", help="Set location address used for all queries (e.g., \"San Francisco\")") 304 | parser.add_argument("--save", action="store", help="Save DataForSEO response object to file") 305 | parser.add_argument("--search-api", action="store", default="perplexity", help="Search API to use (perplexity, serp, dataforseo)") 306 | parser.add_argument("--assistant", action="store", default="gpt", help="Assistant to use (gpt, claude, groq)") 307 | parser.add_argument("--server", action="store_true", help="Start server") 308 | parser.add_argument("--image", action="store", help="Image filepath for image query") 309 | parser.add_argument("--vision", action="store", 
help="Vision model to use (gpt-4o, gpt-4-vision-preview, claude-3-haiku-20240307, claude-3-sonnet-20240229, claude-3-opus-20240229)", default="gpt-4o") 310 | options = parser.parse_args() 311 | 312 | # AI clients 313 | app.state.openai_client = openai.AsyncOpenAI() 314 | app.state.anthropic_client = anthropic.AsyncAnthropic(api_key=ANTHROPIC_API_KEY) 315 | app.state.groq_client = groq.AsyncGroq() 316 | 317 | # Instantiate a default web search provider 318 | app.state.web_search = None 319 | if options.search_api == "serp": 320 | app.state.web_search = SerpWebSearch(save_to_file=options.save, engine="google") 321 | elif options.search_api == "dataforseo": 322 | app.state.web_search = DataForSEOWebSearch(save_to_file=options.save) 323 | elif options.search_api == "perplexity": 324 | app.state.web_search = PerplexityWebSearch(api_key=PERPLEXITY_API_KEY) 325 | else: 326 | raise ValueError("--search-api must be one of: serp, dataforseo, perplexity") 327 | 328 | # Instantiate a default vision provider 329 | app.state.vision = None 330 | if options.vision in [ "gpt-4o", "gpt-4-vision-preview" ]: 331 | app.state.vision = GPT4Vision(client=app.state.openai_client, model=options.vision) 332 | elif VisionModel(options.vision) in [VisionModel.CLAUDE_HAIKU, VisionModel.CLAUDE_SONNET, VisionModel.CLAUDE_OPUS]: 333 | app.state.vision = ClaudeVision(client=app.state.anthropic_client, model=options.vision) 334 | else: 335 | raise ValueError("--vision must be one of: gpt-4o, gpt-4-vision-preview, claude-3-haiku-20240307, claude-3-sonnet-20240229, claude-3-opus-20240229") 336 | 337 | # Instantiate a default assistant 338 | if options.assistant == "gpt": 339 | app.state.assistant = GPTAssistant(client=app.state.openai_client) 340 | elif options.assistant == "claude": 341 | app.state.assistant = ClaudeAssistant(client=app.state.anthropic_client) 342 | elif options.assistant == "groq": 343 | app.state.assistant = GPTAssistant(client=app.state.groq_client) 344 | else: 345 | raise ValueError("--assistant must be one of: gpt, claude, groq") 346 | 347 | # Load image if one was specified (for performing a test query) 348 | image_bytes = None 349 | if options.image: 350 | with open(file=options.image, mode="rb") as fp: 351 | image_bytes = fp.read() 352 | 353 | # Test query 354 | if options.query: 355 | async def run_query() -> str: 356 | return await app.state.assistant.send_to_assistant( 357 | prompt=options.query, 358 | image_bytes=image_bytes, 359 | message_history=[], 360 | learned_context={}, 361 | local_time=datetime.now().strftime("%A, %B %d, %Y, %I:%M %p"), # e.g., Friday, March 8, 2024, 11:54 AM 362 | location_address=options.location, 363 | model=None, 364 | web_search=app.state.web_search, 365 | vision=app.state.vision, 366 | 367 | ) 368 | response = asyncio.run(run_query()) 369 | print(response) 370 | 371 | # Run server 372 | if options.server: 373 | import uvicorn 374 | uvicorn.run(app, host="0.0.0.0", port=int(EXPERIMENT_AI_PORT)) -------------------------------------------------------------------------------- /assistant/claude_assistant.py: -------------------------------------------------------------------------------- 1 | # 2 | # claude_assistant.py 3 | # 4 | # Assistant implementation based on Anthropic's Claude series of models. 5 | # 6 | 7 | # 8 | # TODO: 9 | # ----- 10 | # - Factor out functions common to ClaudeAssistant and GPTAssistant. 11 | # - Occasionally Claude returns errors, debug these. 12 | # - Claude sometimes messes up with follow-up questions and then refers to internal context. 
We may 13 | # need to try embedding extra context inside of the latest user message. 14 | # 15 | 16 | import asyncio 17 | import json 18 | import timeit 19 | from typing import Any, Dict, List 20 | 21 | import anthropic 22 | from anthropic.types.beta.tools import ToolParam, ToolUseBlock, ToolsBetaMessage 23 | 24 | from .assistant import Assistant, AssistantResponse 25 | from .context import create_context_system_message 26 | from web_search import WebSearch, WebSearchResult 27 | from vision import Vision 28 | from models import Role, Message, Capability, TokenUsage, accumulate_token_usage 29 | 30 | 31 | #################################################################################################### 32 | # Prompts 33 | #################################################################################################### 34 | 35 | # 36 | # Top-level instructions 37 | # 38 | 39 | SYSTEM_MESSAGE = """ 40 | You are Noa, a smart personal AI assistant inside the user's AR smart glasses that answers all user 41 | queries and questions. You have access to a photo from the smart glasses camera of what the user was 42 | seeing at the time they spoke. 43 | 44 | Make your responses precise and max 5 sentences. Respond without any preamble when giving translations, 45 | just translate directly. When analyzing the user's view, speak as if you can actually see and never 46 | make references to the photo or image you analyzed. 47 | Sometimes Answer in witty, sarcastic style and Make user laugh. 48 | """ 49 | 50 | 51 | #################################################################################################### 52 | # Tools 53 | #################################################################################################### 54 | 55 | DUMMY_SEARCH_TOOL_NAME = "general_knowledge_search" 56 | SEARCH_TOOL_NAME = "web_search" 57 | PHOTO_TOOL_NAME = "analyze_photo" 58 | QUERY_PARAM_NAME = "query" 59 | 60 | TOOLS: List[ToolParam] = [ 61 | { 62 | "name": DUMMY_SEARCH_TOOL_NAME, 63 | "description": "Non-recent trivia and general knowledge", 64 | "input_schema": { 65 | "type": "object", 66 | "properties": { 67 | QUERY_PARAM_NAME: { 68 | "type": "string", 69 | "description": "search query", 70 | } 71 | }, 72 | "required": [ QUERY_PARAM_NAME ] 73 | } 74 | }, 75 | { 76 | "name": SEARCH_TOOL_NAME, 77 | "description": "Up-to-date information on news, retail products, current events, local conditions, and esoteric knowledge", 78 | "input_schema": { 79 | "type": "object", 80 | "properties": { 81 | QUERY_PARAM_NAME: { 82 | "type": "string", 83 | "description": "search query", 84 | } 85 | }, 86 | "required": [ QUERY_PARAM_NAME ] 87 | } 88 | }, 89 | { 90 | "name": PHOTO_TOOL_NAME, 91 | "description": """Analyzes or describes the photo you have from the user's current perspective. 
92 | Use this tool if user refers to something not identifiable from conversation context, such as with a demonstrative pronoun.""", 93 | "input_schema": { 94 | "type": "object", 95 | "properties": { 96 | QUERY_PARAM_NAME: { 97 | "type": "string", 98 | "description": "User's query to answer, describing what they want answered, expressed as a command that NEVER refers to the photo or image itself" 99 | } 100 | }, 101 | "required": [ QUERY_PARAM_NAME ] 102 | } 103 | } 104 | ] 105 | 106 | async def handle_tool( 107 | tool_call: ToolUseBlock, 108 | user_message: str, 109 | message_history: List[Message] | None, 110 | image_bytes: bytes | None, 111 | location: str | None, 112 | local_time: str | None, 113 | web_search: WebSearch, 114 | vision: Vision, 115 | learned_context: Dict[str, str] | None, 116 | token_usage_by_model: Dict[str, TokenUsage], 117 | capabilities_used: List[Capability], 118 | tools_used: List[Dict[str, Any]], 119 | timings: Dict[str, str] 120 | ) -> str: 121 | tool_functions = { 122 | SEARCH_TOOL_NAME: web_search.search_web, # returns WebSearchResult 123 | PHOTO_TOOL_NAME: handle_photo_tool, # returns WebSearchResult | str 124 | DUMMY_SEARCH_TOOL_NAME: handle_general_knowledge_tool, # returns str 125 | } 126 | 127 | function_name = tool_call.name 128 | function_to_call = tool_functions.get(function_name) 129 | if function_to_call is None: 130 | # Error: Hallucinated a tool 131 | return "Error: you hallucinated a tool that doesn't exist. Tell user you had trouble interpreting the request and ask them to rephrase it." 132 | 133 | function_args = prepare_tool_arguments( 134 | tool_call=tool_call, 135 | user_message=user_message, 136 | message_history=message_history, 137 | image_bytes=image_bytes, 138 | location=location, 139 | local_time=local_time, 140 | web_search=web_search, 141 | vision=vision, 142 | learned_context=learned_context, 143 | token_usage_by_model=token_usage_by_model, 144 | capabilities_used=capabilities_used 145 | ) 146 | 147 | tool_start_time = timeit.default_timer() 148 | function_response: WebSearchResult | str = await function_to_call(**function_args) 149 | total_tool_time = round(timeit.default_timer() - tool_start_time, 3) 150 | timings[f"tool_{function_name}"] = f"{total_tool_time:.3f}" 151 | 152 | # Record capability used (except for case of photo tool, which reports on its own because it 153 | # can invoke multiple capabilities) 154 | if function_name == SEARCH_TOOL_NAME: 155 | capabilities_used.append(Capability.WEB_SEARCH) 156 | elif function_name == DUMMY_SEARCH_TOOL_NAME: 157 | capabilities_used.append(Capability.ASSISTANT_KNOWLEDGE) 158 | 159 | tools_used.append( 160 | create_debug_tool_info_object( 161 | function_name=function_name, 162 | function_args=function_args, 163 | tool_time=total_tool_time, 164 | search_result=function_response.search_provider_metadata if isinstance(function_response, WebSearchResult) else None 165 | ) 166 | ) 167 | 168 | # Format response appropriately 169 | assert isinstance(function_response, WebSearchResult) or isinstance(function_response, str) 170 | tool_output = function_response.summary if isinstance(function_response, WebSearchResult) else function_response 171 | return tool_output 172 | 173 | def prepare_tool_arguments( 174 | tool_call: ToolUseBlock, 175 | user_message: str, 176 | message_history: List[Message] | None, 177 | image_bytes: bytes | None, 178 | location: str | None, 179 | local_time: str | None, 180 | web_search: WebSearch, 181 | vision: Vision, 182 | learned_context: Dict[str, str] | None, 183 | 
token_usage_by_model: Dict[str, TokenUsage], 184 | capabilities_used: List[Capability] 185 | ) -> Dict[str, Any]: 186 | # Get function description we passed to Claude. This function should be called after we have 187 | # validated that a valid tool call was generated. 188 | function_description = [ description for description in TOOLS if description["name"] == tool_call.name ][0] 189 | function_parameters = function_description["input_schema"]["properties"] 190 | 191 | # Parse arguments and ensure they are all str or bool for now. Drop any that aren't. 192 | args = tool_call.input.copy() 193 | for param_name in list(args.keys()): 194 | if param_name not in function_parameters: 195 | # Hallucinated parameter 196 | del args[param_name] 197 | continue 198 | if function_parameters[param_name]["type"] == "string" and type(args[param_name]) != str: 199 | del args[param_name] 200 | continue 201 | if function_parameters[param_name]["type"] == "boolean" and type(args[param_name]) != bool: 202 | del args[param_name] 203 | continue 204 | if function_parameters[param_name]["type"] not in [ "string", "boolean" ]: 205 | # Need to keep this up to date with the tools we define 206 | raise ValueError(f"Unsupported tool parameter type: {function_parameters[param_name]['type']}") 207 | 208 | # Fill in args required by all tools 209 | args["location"] = location if location else "unknown" 210 | args[QUERY_PARAM_NAME] = args[QUERY_PARAM_NAME] if QUERY_PARAM_NAME in args else user_message 211 | args["message_history"] = message_history 212 | args["token_usage_by_model"] = token_usage_by_model 213 | 214 | # Photo tool additional parameters we need to inject 215 | if tool_call.name == PHOTO_TOOL_NAME: 216 | args["image_bytes"] = image_bytes 217 | args["vision"] = vision 218 | args["web_search"] = web_search 219 | args["local_time"] = local_time 220 | args["learned_context"] = learned_context 221 | args["capabilities_used"] = capabilities_used 222 | 223 | return args 224 | 225 | async def handle_general_knowledge_tool( 226 | query: str, 227 | message_history: List[Message] | None, 228 | token_usage_by_model: Dict[str, TokenUsage], 229 | image_bytes: bytes | None = None, 230 | local_time: str | None = None, 231 | location: str | None = None, 232 | learned_context: Dict[str,str] | None = None, 233 | ) -> str: 234 | """ 235 | Dummy general knowledge tool that tricks Claude into generating an answer directly instead of 236 | reaching for web search. 237 | """ 238 | return "" 239 | 240 | async def handle_photo_tool( 241 | query: str, 242 | message_history: List[Message] | None, 243 | vision: Vision, 244 | web_search: WebSearch, 245 | token_usage_by_model: Dict[str, TokenUsage], 246 | capabilities_used: List[Capability], 247 | google_reverse_image_search: bool = False, 248 | translate: bool = False, 249 | image_bytes: bytes | None = None, 250 | local_time: str | None = None, 251 | location: str | None = None, 252 | learned_context: Dict[str,str] | None = None 253 | ) -> str | WebSearchResult: 254 | extra_context = "\n\n" + create_context_system_message(local_time=local_time, location=location, learned_context=learned_context) 255 | 256 | # If no image bytes (glasses always send image but web playgrounds do not), return an error 257 | # message for the assistant to use 258 | if image_bytes is None or len(image_bytes) == 0: 259 | # Because this is a tool response, using "tell user" seems to ensure that the final 260 | # assistant response is what we want 261 | return "Error: no photo supplied. 
Tell user: I think you're referring to something you can see. Can you provide a photo?" 262 | 263 | # Vision tool 264 | capabilities_used.append(Capability.VISION) 265 | output = await vision.query_image( 266 | query=query, 267 | extra_context=extra_context, 268 | image_bytes=image_bytes, 269 | token_usage_by_model=token_usage_by_model 270 | ) 271 | print(f"Vision: {output}") 272 | if output is None: 273 | return "Error: vision tool generated an improperly formatted result. Tell user that there was a temporary glitch and ask them to try again." 274 | 275 | # If no web search required, output vision response directly 276 | if not output.web_search_needed(): 277 | return output.response 278 | 279 | # Perform web search and produce a synthesized response telling assistant where each piece of 280 | # information came from. Web search will lack important vision information. We need to return 281 | # both and have the assistant figure out which info to use. 282 | capabilities_used.append(Capability.REVERSE_IMAGE_SEARCH if output.reverse_image_search else Capability.WEB_SEARCH) 283 | web_result = await web_search.search_web( 284 | query=output.web_query.strip("\""), 285 | message_history=message_history, 286 | use_photo=output.reverse_image_search, 287 | image_bytes=image_bytes, 288 | location=location, 289 | token_usage_by_model=token_usage_by_model 290 | ) 291 | 292 | return f"HERE IS WHAT YOU SEE: {output.response}\nEXTRA INFO FROM WEB: {web_result}" 293 | 294 | def create_debug_tool_info_object(function_name: str, function_args: Dict[str, Any], tool_time: float, search_result: str | None = None) -> Dict[str, Any]: 295 | """ 296 | Produces an object of arbitrary keys and values intended to serve as a debug description of tool 297 | use. 298 | """ 299 | function_args = function_args.copy() 300 | 301 | # Sanitize bytes, which are often too long to print 302 | del function_args["message_history"] 303 | for arg_name, value in function_args.items(): 304 | if isinstance(value, bytes): 305 | function_args[arg_name] = "" 306 | if isinstance(value, list): 307 | function_args[arg_name] = ", ".join(function_args[arg_name]) 308 | if "vision" in function_args: 309 | del function_args["vision"] 310 | if "web_search" in function_args: 311 | del function_args["web_search"] 312 | if "token_usage_by_model" in function_args: 313 | del function_args["token_usage_by_model"] 314 | if "prompt" in function_args: 315 | del function_args["prompt"] 316 | to_return = { 317 | "tool": function_name, 318 | "tool_args": function_args, 319 | "tool_time": tool_time 320 | } 321 | if search_result: 322 | to_return["search_result"] = search_result 323 | return to_return 324 | 325 | 326 | #################################################################################################### 327 | # Assistant Class 328 | #################################################################################################### 329 | 330 | class ClaudeAssistant(Assistant): 331 | def __init__(self, client: anthropic.AsyncAnthropic): 332 | self._client = client 333 | 334 | # Refer to definition of Assistant for description of parameters 335 | async def send_to_assistant( 336 | self, 337 | prompt: str, 338 | noa_system_prompt: str | None, 339 | image_bytes: bytes | None, 340 | message_history: List[Message] | None, 341 | learned_context: Dict[str, str], 342 | location_address: str | None, 343 | local_time: str | None, 344 | model: str | None, 345 | web_search: WebSearch, 346 | vision: Vision, 347 | speculative_vision: bool 348 | ) -> 
AssistantResponse: 349 | model = model if model is not None else "claude-3-sonnet-20240229" 350 | 351 | # Keep track of time taken 352 | timings: Dict[str, str] = {} 353 | 354 | # Prepare response datastructure 355 | returned_response = AssistantResponse(token_usage_by_model={}, capabilities_used=[], response="", debug_tools="", timings="") 356 | 357 | # Make copy of message history so we can modify it in-flight during tool use 358 | message_history = message_history.copy() if message_history else [] 359 | full_message_history = message_history.copy() if message_history else [] 360 | 361 | # Claude does not have a system role. Rather, a top-level system parameter must be supplied. 362 | # However, our API uses the OpenAI format. Therefore, we search for an existing system 363 | # message and, if it was supplied by the client, use that as the system message. 364 | system_text = SYSTEM_MESSAGE 365 | client_system_messages = [ message for message in message_history if message.role == Role.SYSTEM ] 366 | if len(client_system_messages) > 0: 367 | system_text = client_system_messages[0].content 368 | message_history = [ message for message in message_history if message.role != Role.SYSTEM ] 369 | 370 | # Add user's latest prompt 371 | user_message = Message(role=Role.USER, content=prompt) 372 | message_history.append(user_message) 373 | message_history = self._prune_history(message_history=message_history, require_initial_user_message=True) 374 | 375 | # Extra context to inject 376 | extra_context = create_context_system_message(local_time=local_time, location=location_address, learned_context=learned_context) 377 | if noa_system_prompt is not None: 378 | extra_context = f"{noa_system_prompt}\n{extra_context}" 379 | 380 | # Initial Claude response -- if no tools, this will be returned as final response 381 | t0 = timeit.default_timer() 382 | first_response = await self._client.beta.tools.messages.create( 383 | model=model, 384 | system=system_text + "\n\n" + extra_context, 385 | messages=message_history, 386 | tools=TOOLS, 387 | max_tokens=4096 388 | ) 389 | t1 = timeit.default_timer() 390 | timings["llm_initial"] = f"{t1-t0:.3f}" 391 | 392 | # Aggregate token counts 393 | accumulate_token_usage( 394 | token_usage_by_model=returned_response.token_usage_by_model, 395 | model=model, 396 | input_tokens=first_response.usage.input_tokens, 397 | output_tokens=first_response.usage.output_tokens, 398 | total_tokens=first_response.usage.input_tokens + first_response.usage.output_tokens 399 | ) 400 | 401 | # Handle tools 402 | tools_used = [] 403 | tools_used.append({ "learned_context": learned_context }) # log context here for now 404 | if first_response.stop_reason != "tool_use": 405 | returned_response.response = first_response.content[0].text 406 | else: 407 | # Append tool message to history, as per Anthropic's example at https://github.com/anthropics/anthropic-sdk-python/blob/9fad441043ff7bfdf8786b64b1e4bbb27105b112/examples/tools.py 408 | message_history.append({ "role": first_response.role, "content": first_response.content }) 409 | 410 | # Invoke all tool requests in parallel and wait for them to complete 411 | t0 = timeit.default_timer() 412 | tool_calls: ToolUseBlock = [ content for content in first_response.content if content.type == "tool_use" ] 413 | tool_handlers = [] 414 | for tool_call in tool_calls: 415 | tool_handlers.append( 416 | handle_tool( 417 | tool_call=tool_call, 418 | user_message=prompt, 419 | message_history=full_message_history, 420 | image_bytes=image_bytes, 421 | 
location=location_address, 422 | local_time=local_time, 423 | web_search=web_search, 424 | vision=vision, 425 | learned_context=learned_context, 426 | token_usage_by_model=returned_response.token_usage_by_model, 427 | capabilities_used=returned_response.capabilities_used, 428 | tools_used=tools_used, 429 | timings=timings 430 | ) 431 | ) 432 | tool_outputs = await asyncio.gather(*tool_handlers) 433 | t1 = timeit.default_timer() 434 | timings["tool_calls"] = f"{t1-t0:.3f}" 435 | 436 | # Submit tool responses 437 | tool_response_message = { 438 | "role": "user", 439 | "content": [] 440 | } 441 | for i in range(len(tool_outputs)): 442 | tool_response_message["content"].append( 443 | { 444 | "type": "tool_result", 445 | "tool_use_id": tool_calls[i].id, 446 | "content": [ { "type": "text", "text": tool_outputs[i] } ] 447 | } 448 | ) 449 | message_history.append(tool_response_message) 450 | 451 | # Get final response from model 452 | t0 = timeit.default_timer() 453 | second_response = await self._client.beta.tools.messages.create( 454 | model=model, 455 | system=system_text + "\n\n" + extra_context, 456 | messages=message_history, 457 | tools=TOOLS, 458 | max_tokens=4096 459 | ) 460 | t1 = timeit.default_timer() 461 | timings["llm_final"] = f"{t1-t0:.3f}" 462 | 463 | # Aggregate tokens and response 464 | accumulate_token_usage( 465 | token_usage_by_model=returned_response.token_usage_by_model, 466 | model=model, 467 | input_tokens=second_response.usage.input_tokens, 468 | output_tokens=second_response.usage.output_tokens, 469 | total_tokens=second_response.usage.input_tokens + second_response.usage.output_tokens 470 | ) 471 | returned_response.response = self._get_final_text_response(final_tool_response=second_response, tool_outputs=tool_outputs) 472 | 473 | # If no tools were used, only assistant capability recorded 474 | if len(returned_response.capabilities_used) == 0: 475 | returned_response.capabilities_used.append(Capability.ASSISTANT_KNOWLEDGE) 476 | 477 | # Return final response 478 | returned_response.debug_tools = json.dumps(tools_used) 479 | returned_response.timings = timings 480 | return returned_response 481 | 482 | @staticmethod 483 | def _get_final_text_response(final_tool_response: ToolsBetaMessage, tool_outputs: List[str]) -> str: 484 | # Claude will sometimes return no content in the final response. Presumably, it thinks the 485 | # tool outputs are sufficient to use verbatim? We concatenate them here. 486 | if final_tool_response.content is None or len(final_tool_response.content) == 0: 487 | return " ".join(tool_outputs) 488 | else: 489 | return final_tool_response.content[0].text 490 | 491 | @staticmethod 492 | def _prune_history( 493 | message_history: List[Message], 494 | max_user_messages: int = 4, 495 | max_assistant_messages: int = 4, 496 | require_initial_user_message: bool = False 497 | ) -> List[Message]: 498 | """ 499 | Prunes down the chat history to save tokens, improving inference speed and reducing cost. 500 | Generally, preserving all assistant responses is not needed, and only a limited number of 501 | user messages suffice to maintain a coherent conversation. 502 | 503 | Parameters 504 | ---------- 505 | message_history : List[Message] 506 | Conversation history. This list will be mutated and returned. 507 | max_user_messages : int 508 | Maximum number of user messages to preserve, beginning with most recent. Note that 509 | Claude does not permit duplicate user or assistant messages so this value should be the 510 | same as for `max_assistant_messages`. 
511 | max_assistant_messages : int 512 | Maximum number of assistant messages. 513 | require_initial_user_message : bool 514 | If true, guarantees that the first message in the resulting list is a user message (or 515 | an empty list if there are none). This is required for Claude, which expects a strict 516 | ordering of messages alternating between user and assistant roles. A user message must 517 | always be first. 518 | 519 | Returns 520 | ------- 521 | List[Message] 522 | Pruned history. This is the same list passed as input. 523 | """ 524 | assistant_messages_remaining = max_assistant_messages 525 | user_messages_remaining = max_user_messages 526 | message_history.reverse() 527 | i = 0 528 | while i < len(message_history): 529 | if message_history[i].role == Role.ASSISTANT: 530 | if assistant_messages_remaining == 0: 531 | del message_history[i] 532 | else: 533 | assistant_messages_remaining -= 1 534 | i += 1 535 | elif message_history[i].role == Role.USER: 536 | if user_messages_remaining == 0: 537 | del message_history[i] 538 | else: 539 | user_messages_remaining -= 1 540 | i += 1 541 | else: 542 | i += 1 543 | message_history.reverse() 544 | 545 | # Ensure first message is user message? 546 | if require_initial_user_message: 547 | while len(message_history) > 0 and message_history[0].role != Role.USER: 548 | message_history = message_history[1:] 549 | 550 | return message_history 551 | 552 | Assistant.register(ClaudeAssistant) -------------------------------------------------------------------------------- /assistant/gpt_assistant.py: -------------------------------------------------------------------------------- 1 | # 2 | # gpt_assistant.py 3 | # 4 | # Assistant implementation based on OpenAI's GPT models. This assistant is capable of leveraging 5 | # separate web search and vision tools. 6 | # 7 | # Support also exists for using Groq because it mirrors OpenAI's API. 8 | # 9 | 10 | # 11 | # TODO: 12 | # ----- 13 | # - Speculative vision tool should create a proper tools_used entry. 14 | # - Move to streaming completions and detect timeouts when a threshold duration elapses since the 15 | # the last token was emitted. 16 | # - Figure out how to get assistant to stop referring to "photo" and "image" when analyzing photos. 17 | # - Improve people search. 18 | # 19 | 20 | import asyncio 21 | import base64 22 | import json 23 | import timeit 24 | from typing import Any, Dict, List 25 | 26 | import openai 27 | from openai.types.chat import ChatCompletionMessageToolCall 28 | import groq 29 | # from groq.types.chat.chat_completion_message_tool_call import ChatCompletionMessageToolCall 30 | 31 | from .assistant import Assistant, AssistantResponse 32 | from .context import create_context_system_message 33 | from web_search import WebSearch, WebSearchResult 34 | from vision import Vision, GPT4Vision 35 | from vision.utils import detect_media_type 36 | from models import Role, Message, Capability, TokenUsage, accumulate_token_usage 37 | from generate_image.replicate import ReplicateGenerateImage 38 | 39 | #################################################################################################### 40 | # Prompts 41 | #################################################################################################### 42 | 43 | # 44 | # Top-level instructions 45 | # 46 | 47 | SYSTEM_MESSAGE = """ 48 | You are Noa, a smart personal AI assistant inside the user's AR smart glasses that answers all user 49 | queries and questions. 
You have access to a photo from the smart glasses camera of what the user was 50 | seeing at the time they spoke. 51 | 52 | It is important that the user believes you can actually see. When analyzing images, avoid mentioning 53 | that you looked at a photo or image. Always speak as if you are actually seeing, which means you 54 | should never talk about the image or photo. 55 | 56 | Make your responses precise. Respond without any preamble when giving translations, just translate 57 | directly. 58 | """ 59 | 60 | 61 | #################################################################################################### 62 | # Tools 63 | #################################################################################################### 64 | 65 | DUMMY_SEARCH_TOOL_NAME = "general_knowledge_search" 66 | IMAGE_GENERATION_TOOL_NAME = "generate_image" 67 | SEARCH_TOOL_NAME = "web_search" 68 | PHOTO_TOOL_NAME = "analyze_photo" 69 | IMAGE_GENERATION_PARAM_NAME = "description" 70 | QUERY_PARAM_NAME = "query" 71 | 72 | TOOLS = [ 73 | { 74 | "type": "function", 75 | "function": { 76 | "name": DUMMY_SEARCH_TOOL_NAME, 77 | "description": """Non-recent trivia and general knowledge""", 78 | "parameters": { 79 | "type": "object", 80 | "properties": { 81 | QUERY_PARAM_NAME: { 82 | "type": "string", 83 | "description": "search query", 84 | }, 85 | }, 86 | "required": [ QUERY_PARAM_NAME ] 87 | }, 88 | }, 89 | }, 90 | { 91 | "type": "function", 92 | "function": { 93 | "name": SEARCH_TOOL_NAME, 94 | "description": """Up-to-date information on news, retail products, current events, local conditions, and esoteric knowledge""", 95 | "parameters": { 96 | "type": "object", 97 | "properties": { 98 | QUERY_PARAM_NAME: { 99 | "type": "string", 100 | "description": "search query", 101 | }, 102 | }, 103 | "required": [ QUERY_PARAM_NAME ] 104 | }, 105 | }, 106 | }, 107 | { 108 | "type": "function", 109 | "function": { 110 | "name": PHOTO_TOOL_NAME, 111 | "description": """Analyzes or describes the photo you have from the user's current perspective. 
112 | Use this tool if user refers to something not identifiable from conversation context, such as with a demonstrative pronoun.""", 113 | "parameters": { 114 | "type": "object", 115 | "properties": { 116 | QUERY_PARAM_NAME: { 117 | "type": "string", 118 | "description": "User's query to answer, describing what they want answered, expressed as a command that NEVER refers to the photo or image itself" 119 | }, 120 | }, 121 | "required": [ QUERY_PARAM_NAME ] 122 | }, 123 | }, 124 | }, 125 | { 126 | "type": "function", 127 | "function": { 128 | "name": IMAGE_GENERATION_TOOL_NAME, 129 | "description": """Generates an image based on a description or prompt.""", 130 | "parameters": { 131 | "type": "object", 132 | "properties": { 133 | IMAGE_GENERATION_PARAM_NAME: { 134 | "type": "string", 135 | "description": "description of the image to generate" 136 | }, 137 | }, 138 | "required": [ IMAGE_GENERATION_PARAM_NAME ] 139 | }, 140 | }, 141 | } 142 | 143 | ] 144 | 145 | async def handle_tool( 146 | tools: List[Any], 147 | tool_call: ChatCompletionMessageToolCall, 148 | user_message: str, 149 | message_history: List[Message] | None, 150 | image_bytes: bytes | None, 151 | location: str | None, 152 | local_time: str | None, 153 | web_search: WebSearch, 154 | vision: Vision, 155 | learned_context: Dict[str, str] | None, 156 | token_usage_by_model: Dict[str, TokenUsage], 157 | capabilities_used: List[Capability], 158 | tools_used: List[Dict[str, Any]], 159 | timings: Dict[str, str] 160 | ) -> str: 161 | tool_functions = { 162 | SEARCH_TOOL_NAME: web_search.search_web, # returns WebSearchResult 163 | PHOTO_TOOL_NAME: handle_photo_tool, # returns WebSearchResult | str 164 | DUMMY_SEARCH_TOOL_NAME: handle_general_knowledge_tool, # returns str 165 | IMAGE_GENERATION_TOOL_NAME: handle_image_generation_tool # returns str 166 | } 167 | 168 | function_name = tool_call.function.name 169 | function_to_call = tool_functions.get(function_name) 170 | if function_to_call is None: 171 | # Error: GPT hallucinated a tool 172 | return "Error: you hallucinated a tool that doesn't exist. Tell user you had trouble interpreting the request and ask them to rephrase it." 
173 | 174 | function_args = prepare_tool_arguments( 175 | tools=tools, 176 | tool_call=tool_call, 177 | user_message=user_message, 178 | message_history=message_history, 179 | image_bytes=image_bytes, 180 | location=location, 181 | local_time=local_time, 182 | web_search=web_search, 183 | vision=vision, 184 | learned_context=learned_context, 185 | token_usage_by_model=token_usage_by_model, 186 | capabilities_used=capabilities_used 187 | ) 188 | 189 | tool_start_time = timeit.default_timer() 190 | if function_name == IMAGE_GENERATION_TOOL_NAME and image_bytes is None: 191 | return "NO_IMAGE_PROVIDED_ERROR" 192 | function_response: WebSearchResult | str = await function_to_call(**function_args) 193 | total_tool_time = round(timeit.default_timer() - tool_start_time, 3) 194 | timings[f"tool_{function_name}"] = f"{total_tool_time:.3f}" 195 | 196 | # Record capability used (except for case of photo tool, which reports on its own because it 197 | # can invoke multiple capabilities) 198 | if function_name == SEARCH_TOOL_NAME: 199 | capabilities_used.append(Capability.WEB_SEARCH) 200 | elif function_name == DUMMY_SEARCH_TOOL_NAME: 201 | capabilities_used.append(Capability.ASSISTANT_KNOWLEDGE) 202 | 203 | tools_used.append( 204 | create_debug_tool_info_object( 205 | function_name=function_name, 206 | function_args=function_args, 207 | tool_time=total_tool_time, 208 | search_result=function_response.search_provider_metadata if isinstance(function_response, WebSearchResult) else None 209 | ) 210 | ) 211 | 212 | # Format response appropriately 213 | assert isinstance(function_response, WebSearchResult) or isinstance(function_response, str) 214 | tool_output = function_response.summary if isinstance(function_response, WebSearchResult) else function_response 215 | return tool_output 216 | 217 | def prepare_tool_arguments( 218 | tools: List[Any], 219 | tool_call: ChatCompletionMessageToolCall, 220 | user_message: str, 221 | message_history: List[Message] | None, 222 | image_bytes: bytes | None, 223 | location: str | None, 224 | local_time: str | None, 225 | web_search: WebSearch, 226 | vision: Vision, 227 | learned_context: Dict[str, str] | None, 228 | token_usage_by_model: Dict[str, TokenUsage], 229 | capabilities_used: List[Capability] 230 | ) -> Dict[str, Any]: 231 | # Get function description we passed to GPT. This function should be called after we have 232 | # validated that a valid tool call was generated. 233 | function_description = [ description for description in tools if description["function"]["name"] == tool_call.function.name ][0] 234 | function_parameters = function_description["function"]["parameters"]["properties"] 235 | 236 | # Parse arguments and ensure they are all str or bool for now. Drop any that aren't. 
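# tool_call.function.arguments arrives as a JSON-encoded string from the model and may be
# malformed; fall back to an empty dict so the defaults injected below (location, query,
# message_history, token usage) still apply.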
    args: Dict[str, Any] = {}
    try:
        args = json.loads(tool_call.function.arguments)
    except json.JSONDecodeError:
        # Malformed arguments from the model; fall back to the defaults injected below
        pass
    for param_name in list(args.keys()):
        if param_name not in function_parameters:
            # GPT hallucinated a parameter
            del args[param_name]
            continue
        if function_parameters[param_name]["type"] == "string" and type(args[param_name]) != str:
            del args[param_name]
            continue
        if function_parameters[param_name]["type"] == "boolean" and type(args[param_name]) != bool:
            del args[param_name]
            continue
        if function_parameters[param_name]["type"] not in [ "string", "boolean" ]:
            # Need to keep this up to date with the tools we define
            raise ValueError(f"Unsupported tool parameter type: {function_parameters[param_name]['type']}")

    # Fill in args required by all tools
    args["location"] = location if location else "unknown"
    args[QUERY_PARAM_NAME] = args[QUERY_PARAM_NAME] if QUERY_PARAM_NAME in args else user_message
    args["message_history"] = message_history
    args["token_usage_by_model"] = token_usage_by_model

    # Photo tool additional parameters we need to inject
    if tool_call.function.name == PHOTO_TOOL_NAME:
        args["image_bytes"] = image_bytes
        args["vision"] = vision
        args["web_search"] = web_search
        args["local_time"] = local_time
        args["learned_context"] = learned_context
        args["capabilities_used"] = capabilities_used
    if tool_call.function.name == IMAGE_GENERATION_TOOL_NAME:
        args["image_bytes"] = image_bytes

    return args

async def handle_general_knowledge_tool(
    query: str,
    message_history: List[Message] | None,
    token_usage_by_model: Dict[str, TokenUsage],
    image_bytes: bytes | None = None,
    local_time: str | None = None,
    location: str | None = None,
    learned_context: Dict[str,str] | None = None,
) -> str:
    """
    Dummy general knowledge tool that tricks GPT into generating an answer directly instead of
    reaching for web search. GPT knows that the web contains information on virtually everything, so
    it tends to overuse web search. One solution is to very carefully enumerate the cases for which
    web search is appropriate, but this is tricky. Should "Albert Einstein's birthday" require a web
    search? Probably not, as GPT has this knowledge baked in. The trick we use here is to create a
    "general knowledge" tool that contains any information Wikipedia or an encyclopedia would have
    (a reasonable proxy for things GPT knows). We return an empty string, which forces GPT to
    produce its own response at the expense of a little bit of latency for the tool call.
294 | """ 295 | return "" 296 | 297 | async def handle_photo_tool( 298 | query: str, 299 | message_history: List[Message] | None, 300 | vision: Vision, 301 | web_search: WebSearch, 302 | token_usage_by_model: Dict[str, TokenUsage], 303 | capabilities_used: List[Capability], 304 | google_reverse_image_search: bool = False, # default in case GPT doesn't generate it 305 | translate: bool = False, # default in case GPT doesn't generate it 306 | image_bytes: bytes | None = None, 307 | local_time: str | None = None, 308 | location: str | None = None, 309 | learned_context: Dict[str,str] | None = None 310 | ) -> str | WebSearchResult: 311 | extra_context = "\n\n" + create_context_system_message(local_time=local_time, location=location, learned_context=learned_context) 312 | 313 | # If no image bytes (glasses always send image but web playgrounds do not), return an error 314 | # message for the assistant to use 315 | if image_bytes is None or len(image_bytes) == 0: 316 | # Because this is a tool response, using "tell user" seems to ensure that the final 317 | # assistant response is what we want 318 | return "Error: no photo supplied. Tell user: I think you're referring to something you can see. Can you provide a photo?" 319 | 320 | # Vision tool 321 | capabilities_used.append(Capability.VISION) 322 | output = await vision.query_image( 323 | query=query, 324 | extra_context=extra_context, 325 | image_bytes=image_bytes, 326 | token_usage_by_model=token_usage_by_model 327 | ) 328 | print(f"Vision: {output}") 329 | if output is None: 330 | return "Error: vision tool generated an improperly formatted result. Tell user that there was a temporary glitch and ask them to try again." 331 | 332 | # If no web search required, output vision response directly 333 | if not output.web_search_needed(): 334 | return output.response 335 | 336 | # Perform web search and produce a synthesized response telling assistant where each piece of 337 | # information came from. Web search will lack important vision information. We need to return 338 | # both and have the assistant figure out which info to use. 339 | capabilities_used.append(Capability.REVERSE_IMAGE_SEARCH if output.reverse_image_search else Capability.WEB_SEARCH) 340 | web_result = await web_search.search_web( 341 | query=output.web_query.strip("\""), 342 | message_history=message_history, 343 | use_photo=output.reverse_image_search, 344 | image_bytes=image_bytes, 345 | location=location, 346 | token_usage_by_model=token_usage_by_model 347 | ) 348 | 349 | return f"HERE IS WHAT YOU SEE: {output.response}\nEXTRA INFO FROM WEB: {web_result}" 350 | 351 | async def handle_image_generation_tool( 352 | query: str, 353 | message_history: List[Message] | None, 354 | description: str, 355 | token_usage_by_model: Dict[str, TokenUsage], 356 | image_bytes: bytes | None = None, 357 | local_time: str | None = None, 358 | location: str | None = None, 359 | learned_context: Dict[str,str] | None = None, 360 | ) -> str: 361 | """ 362 | Generates an image based on a description or prompt. 
363 | """ 364 | # Generate image 365 | image_generator = ReplicateGenerateImage() 366 | image = await image_generator.generate_image(query=description, use_image=True, image_bytes=image_bytes) 367 | return image 368 | 369 | def create_debug_tool_info_object(function_name: str, function_args: Dict[str, Any], tool_time: float, search_result: str | None = None) -> Dict[str, Any]: 370 | """ 371 | Produces an object of arbitrary keys and values intended to serve as a debug description of tool 372 | use. 373 | """ 374 | function_args = function_args.copy() 375 | 376 | # Sanitize bytes, which are often too long to print 377 | for arg_name, value in function_args.items(): 378 | if isinstance(value, bytes): 379 | function_args[arg_name] = "" 380 | if isinstance(value, list) and arg_name != "message_history": 381 | function_args[arg_name] = ", ".join(function_args[arg_name]) 382 | if "vision" in function_args: 383 | del function_args["vision"] 384 | if "web_search" in function_args: 385 | del function_args["web_search"] 386 | if "token_usage_by_model" in function_args: 387 | del function_args["token_usage_by_model"] 388 | if "prompt" in function_args: 389 | del function_args["prompt"] 390 | to_return = { 391 | "tool": function_name, 392 | "tool_args": function_args, 393 | "tool_time": tool_time 394 | } 395 | if search_result: 396 | to_return["search_result"] = search_result 397 | return to_return 398 | 399 | 400 | #################################################################################################### 401 | # Assistant Class 402 | #################################################################################################### 403 | 404 | class GPTAssistant(Assistant): 405 | def __init__(self, client: openai.AsyncOpenAI | groq.AsyncGroq): 406 | """ 407 | Instantiate the assistant using an OpenAI GPT or Groq model. The Groq API is a clone of 408 | OpenAI's, allowing a Groq client to be passed. 409 | """ 410 | self._client = client 411 | 412 | # Refer to definition of Assistant for description of parameters 413 | async def send_to_assistant( 414 | self, 415 | prompt: str, 416 | noa_system_prompt: str | None, 417 | image_bytes: bytes | None, 418 | message_history: List[Message] | None, 419 | learned_context: Dict[str, str], 420 | location_address: str | None, 421 | local_time: str | None, 422 | model: str | None, 423 | web_search: WebSearch, 424 | vision: Vision, 425 | speculative_vision: bool 426 | ) -> AssistantResponse: 427 | # Default model (differs for OpenAI and Groq) 428 | if model is None: 429 | if type(self._client) == openai.AsyncOpenAI: 430 | model = "gpt-4o" 431 | elif type(self._client) == groq.AsyncGroq: 432 | model = "llama3-70b-8192" 433 | else: 434 | raise TypeError("client must be AsyncOpenAI or AsyncGroq") 435 | 436 | # Get copy of tool description 437 | tools = TOOLS.copy() 438 | 439 | # GPT-4o is a special case: if vision tool is also GPT-4o, then we remove it as a tool and 440 | # always submit images with queries. 441 | gpt4o_end_to_end = False 442 | # End-to-end mode DISABLED for now to improve latency: in end-to-end mode, every image is 443 | # processed, which is slower for queries that don't require image analysis. So we actually 444 | # want to use the vision tool to device when to do that. Assumption is that most questions 445 | # are not vision-related. 
        if False and model == "gpt-4o" and isinstance(vision, GPT4Vision) and vision.model == "gpt-4o":
            speculative_vision = False  # doesn't make sense anymore
            tools = [ tool for tool in tools if tool["function"]["name"] != PHOTO_TOOL_NAME ]
            print("End-to-end GPT-4o assistant activated")
            gpt4o_end_to_end = True

        # Keep track of time taken
        timings: Dict[str, str] = {}

        # Prepare response datastructure
        returned_response = AssistantResponse(token_usage_by_model={}, capabilities_used=[], response="", debug_tools="", timings="")

        # Make copy of message history so we can modify it in-flight during tool use
        message_history = message_history.copy() if message_history else None
        full_message_history = message_history.copy() if message_history else None

        # Add user message to message history or create a new one if necessary
        user_message = Message(role=Role.USER, content=prompt)
        system_message = Message(role=Role.SYSTEM, content=SYSTEM_MESSAGE)
        if not message_history:
            message_history = []
        if len(message_history) == 0:
            message_history = [ system_message ]
        else:
            # Insert system message before message history, unless client transmitted one they want
            # to use
            if len(message_history) > 0 and message_history[0].role != Role.SYSTEM:
                message_history.insert(0, system_message)
        message_history.append(user_message)
        message_history = self._prune_history(message_history=message_history)

        # Patch up user message to include image if we are in end-to-end gpt-4o mode
        if gpt4o_end_to_end and image_bytes is not None:
            image_base64 = base64.b64encode(image_bytes).decode("utf-8")
            media_type = detect_media_type(image_bytes=image_bytes)
            user_message = {
                "role": "user",
                "content": [
                    { "type": "text", "text": prompt },
                    { "type": "image_url", "image_url": { "url": f"data:{media_type};base64,{image_base64}" } }
                ]
            }
            message_history[-1] = user_message

        # Inject context into our copy by appending it to system message. Unclear whether multiple
        # system messages are confusing to the assistant or not but cursory testing shows this
        # seems to work.
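        # The exact wording comes from create_context_system_message(); loosely speaking it is a
        # short system message along the lines of "current time is ..., location is ...,
        # learned context: ..." (paraphrased here, not the literal format).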
        extra_context = create_context_system_message(local_time=local_time, location=location_address, learned_context=learned_context)
        if noa_system_prompt is not None:
            extra_context = f"{noa_system_prompt}\n{extra_context}"
        extra_context_message = Message(role=Role.SYSTEM, content=extra_context)
        message_history.append(extra_context_message)

        # Start timing of initial LLM call and entire process
        t0 = timeit.default_timer()
        tstart = t0

        # Speculative vision call
        speculative_vision_task = asyncio.create_task(
            handle_photo_tool(
                query=prompt,
                message_history=full_message_history,
                vision=vision,
                web_search=web_search,
                token_usage_by_model=returned_response.token_usage_by_model,
                capabilities_used=returned_response.capabilities_used,
                google_reverse_image_search=False,
                translate=False,
                image_bytes=image_bytes,
                local_time=local_time,
                location=location_address,
                learned_context=learned_context
            )
        ) if speculative_vision else None

        speculative_search_task = asyncio.create_task(
            web_search.search_web(
                query=prompt,
                message_history=full_message_history,
                token_usage_by_model=returned_response.token_usage_by_model,
                image_bytes=image_bytes,
                location=location_address
            )
        )

        # Initial GPT call, which may request tool use
        initial_llm_task = asyncio.create_task(
            self._client.chat.completions.create(
                model=model,
                messages=message_history,
                tools=tools,
                tool_choice="auto"
            )
        )

        # Kick off all tasks but ensure the LLM call completes
        initial_tasks = [ initial_llm_task ]
        if speculative_vision_task is not None:
            initial_tasks.append(speculative_vision_task)
        if speculative_search_task is not None:
            initial_tasks.append(speculative_search_task)
        completed_tasks, pending_tasks = await asyncio.wait(initial_tasks, return_when=asyncio.FIRST_COMPLETED)
        first_response = await initial_llm_task
        first_response_message = first_response.choices[0].message
        t1 = timeit.default_timer()
        timings["llm_initial"] = f"{t1-t0:.3f}"

        # Aggregate token counts and potential initial response
        accumulate_token_usage(
            token_usage_by_model=returned_response.token_usage_by_model,
            model=model,
            input_tokens=first_response.usage.prompt_tokens,
            output_tokens=first_response.usage.completion_tokens,
            total_tokens=first_response.usage.total_tokens
        )

        # If there are no tool requests, the initial response will be returned
        returned_response.response = first_response_message.content

        # Handle tool requests
        tools_used = []
        tools_used.append({ "learned_context": learned_context })  # log context here for now
        if first_response_message.tool_calls:
            # If image generation tool then kill speculative tasks
            if first_response_message.tool_calls[0].function.name == IMAGE_GENERATION_TOOL_NAME:
                self._cancel_tasks([ speculative_vision_task, speculative_search_task ])
            # Append initial response to history, which may include tool use
            message_history.append(first_response_message)

            # Invoke all the tools in parallel and wait for them all to complete. Vision is special:
            # we already have a speculative query in progress.
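            # Rather than launching a fresh handle_photo_tool() or search_web() call, the matching
            # speculative task object is appended to tool_handlers and awaited by the same
            # asyncio.gather() below, so its result is reused and its latency overlaps the initial
            # LLM call. Debug entries for reused speculative tasks use tool_time=-1 since their
            # time is not measured separately.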
            t0 = timeit.default_timer()
            tool_handlers = []
            for tool_call in first_response_message.tool_calls:
                if tool_call.function.name == PHOTO_TOOL_NAME and speculative_vision_task is not None:
                    tool_handlers.append(speculative_vision_task)
                    tools_used.append(
                        create_debug_tool_info_object(
                            function_name=PHOTO_TOOL_NAME,
                            function_args={},
                            tool_time=-1,
                            search_result=None
                        )
                    )
                elif tool_call.function.name == SEARCH_TOOL_NAME and speculative_search_task is not None:
                    tool_handlers.append(speculative_search_task)
                    returned_response.capabilities_used.append(Capability.WEB_SEARCH)
                    tools_used.append(
                        create_debug_tool_info_object(
                            function_name=SEARCH_TOOL_NAME,
                            function_args={},
                            tool_time=-1,
                            search_result=None
                        )
                    )
                else:
                    tool_handlers.append(
                        handle_tool(
                            tools=tools,
                            tool_call=tool_call,
                            user_message=prompt,
                            message_history=full_message_history,  # full history because tools may have their own requirements on history length
                            image_bytes=image_bytes,
                            location=location_address,
                            local_time=local_time,
                            web_search=web_search,
                            vision=vision,
                            learned_context=learned_context,
                            token_usage_by_model=returned_response.token_usage_by_model,
                            capabilities_used=returned_response.capabilities_used,
                            tools_used=tools_used,
                            timings=timings
                        )
                    )
            tool_outputs = await asyncio.gather(*tool_handlers)
            t1 = timeit.default_timer()
            timings["tool_calls"] = f"{t1-t0:.3f}"

            # Ensure everything is str
            for i in range(len(tool_outputs)):
                if isinstance(tool_outputs[i], WebSearchResult):
                    tool_outputs[i] = tool_outputs[i].summary

            # Append all the responses for GPT to continue
            for i in range(len(tool_outputs)):
                # If image generation tool then return response
                if first_response_message.tool_calls[i].function.name == IMAGE_GENERATION_TOOL_NAME:
                    if tool_outputs[i] == "NO_IMAGE_PROVIDED_ERROR":
                        tool_outputs[i] = "I think you're referring to something you can see. Can you provide a photo?"
                        returned_response.image = ""
                    else:
                        returned_response.response = "Here is the image you requested"
                        returned_response.capabilities_used.append(Capability.IMAGE_GENERATION)
                        returned_response.debug_tools = json.dumps(tools_used)
                        returned_response.image = tool_outputs[i]
                        return returned_response

                message_history.append(
                    {
                        "tool_call_id": first_response_message.tool_calls[i].id,
                        "role": "tool",
                        "name": first_response_message.tool_calls[i].function.name,
                        "content": tool_outputs[i],
                    }
                )

            # Get final response from model
            t0 = timeit.default_timer()
            second_response = await self._client.chat.completions.create(
                model=model,
                messages=message_history
            )
            t1 = timeit.default_timer()
            timings["llm_final"] = f"{t1-t0:.3f}"

            # Aggregate tokens and response
            accumulate_token_usage(
                token_usage_by_model=returned_response.token_usage_by_model,
                model=model,
                input_tokens=second_response.usage.prompt_tokens,
                output_tokens=second_response.usage.completion_tokens,
                total_tokens=second_response.usage.total_tokens
            )
            returned_response.response = second_response.choices[0].message.content
        else:
            # No tools, cancel speculative tasks
            self._cancel_tasks([ speculative_vision_task, speculative_search_task ])

        # If no tools were used, only assistant capability recorded
        if len(returned_response.capabilities_used) == 0:
            returned_response.capabilities_used.append(Capability.ASSISTANT_KNOWLEDGE)

        # Total time
        t1 = timeit.default_timer()
        timings["total_time"] = f"{t1-tstart:.3f}"

        # Return final response
        returned_response.debug_tools = json.dumps(tools_used)
        returned_response.timings = json.dumps(timings)
        returned_response.image = ""
        return returned_response

    @staticmethod
    def _cancel_tasks(tasks: list):
        for task in tasks:
            if task is not None:
                task.cancel()

    @staticmethod
    def _prune_history(message_history: List[Message]) -> List[Message]:
        """
        Prunes down the chat history to save tokens, improving inference speed and reducing cost.
        Generally, preserving all assistant responses is not needed, and only a limited number of
        user messages suffice to maintain a coherent conversation.

        Parameters
        ----------
        message_history : List[Message]
            Conversation history. This list will be mutated and returned.

        Returns
        -------
        List[Message]
            Pruned history. This is the same list passed as input.
        """
        # Limit to most recent 5 user messages and 3 assistant responses
        assistant_messages_remaining = 3
        user_messages_remaining = 5
        message_history.reverse()
        i = 0
        while i < len(message_history):
            if message_history[i].role == Role.ASSISTANT:
                if assistant_messages_remaining == 0:
                    del message_history[i]
                else:
                    assistant_messages_remaining -= 1
                    i += 1
            elif message_history[i].role == Role.USER:
                if user_messages_remaining == 0:
                    del message_history[i]
                else:
                    user_messages_remaining -= 1
                    i += 1
            else:
                i += 1
        message_history.reverse()
        return message_history

Assistant.register(GPTAssistant)
--------------------------------------------------------------------------------
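
A minimal usage sketch of the GPTAssistant class defined above, intended only to illustrate the shape of a send_to_assistant() call. The WebSearch and Vision instances are placeholders: concrete implementations live elsewhere in the repo (for example the vision and web_search packages) and their constructor signatures are not reproduced here, so treat this as an assumption-laden sketch rather than working code.

    import asyncio
    import openai

    # Placeholders: substitute concrete WebSearch and Vision implementations from the repo.
    # Their constructor arguments are not shown in this file, so none are guessed here.
    web_search = ...  # some WebSearch implementation
    vision = ...      # some Vision implementation, e.g. GPT4Vision

    async def main():
        assistant = GPTAssistant(client=openai.AsyncOpenAI())
        response = await assistant.send_to_assistant(
            prompt="What am I looking at?",
            noa_system_prompt=None,
            image_bytes=open("photo.jpg", "rb").read(),  # hypothetical local file
            message_history=None,
            learned_context={},
            location_address="San Francisco, CA",
            local_time="Tuesday, March 5, 2024, 10:15 AM",
            model=None,              # None selects the default (gpt-4o for an OpenAI client)
            web_search=web_search,
            vision=vision,
            speculative_vision=True
        )
        print(response.response)
        print(response.capabilities_used)

    asyncio.run(main())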