├── aana_chat_with_video ├── __init__.py ├── core │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ └── video_status.py │ └── prompts │ │ ├── __init__.py │ │ ├── test.j2 │ │ └── loader.py ├── tests │ ├── __init__.py │ ├── conftest.py │ ├── test_extended_video_repo.py │ ├── test_extended_video_transcript_repo.py │ ├── test_extended_video_caption_repo.py │ └── test_app.py ├── utils │ ├── __init__.py │ └── core.py ├── alembic │ ├── __init__.py │ ├── script.py.mako │ ├── versions │ │ ├── b9860676dd49_set_server_default_for_task_completed_.py │ │ ├── d93a90261ee5_added_extended_video.py │ │ └── 5ad873484aa3_init.py │ └── env.py ├── configs │ ├── __init__.py │ ├── settings.py │ ├── endpoints.py │ └── deployments.py ├── endpoints │ ├── __init__.py │ ├── delete_video.py │ ├── get_video_status.py │ ├── load_video_metadata.py │ ├── video_chat.py │ └── index_video.py ├── storage │ ├── __init__.py │ ├── repository │ │ ├── __init__.py │ │ ├── extended_video.py │ │ ├── extended_video_transcript.py │ │ └── extended_video_caption.py │ ├── op.py │ └── models │ │ ├── __init__.py │ │ ├── extended_video_caption.py │ │ ├── extended_video.py │ │ └── extended_video_transcript.py ├── deployments │ └── __init__.py ├── app.py ├── exceptions │ └── core.py └── alembic.ini ├── volume.dstack.yaml ├── install.sh ├── .devcontainer ├── Dockerfile └── devcontainer.json ├── .vscode └── settings.json ├── .github └── workflows │ └── tests.yml ├── pyproject.toml ├── Dockerfile ├── app.dstack.yaml ├── docker-compose.yaml ├── .gitignore ├── README.md └── LICENSE /aana_chat_with_video/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aana_chat_with_video/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aana_chat_with_video/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aana_chat_with_video/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aana_chat_with_video/alembic/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aana_chat_with_video/configs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aana_chat_with_video/endpoints/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aana_chat_with_video/storage/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aana_chat_with_video/core/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aana_chat_with_video/core/prompts/__init__.py: -------------------------------------------------------------------------------- 
1 | -------------------------------------------------------------------------------- /aana_chat_with_video/deployments/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /aana_chat_with_video/storage/repository/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /volume.dstack.yaml: -------------------------------------------------------------------------------- 1 | type: volume 2 | name: demo-data 3 | 4 | backend: runpod 5 | region: EU-SE-1 6 | 7 | size: 100GB -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | poetry install 3 | poetry run pip install flash-attn --no-build-isolation # temporary fix for flash-attn bug in vLLM -------------------------------------------------------------------------------- /aana_chat_with_video/core/prompts/test.j2: -------------------------------------------------------------------------------- 1 | Define your prompts for LLMs here. Use jinja2 templating to include variables like {{ your_variable }}. -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04 2 | RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 ffmpeg 3 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.testing.pytestArgs": [ 3 | "." 
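// Discover tests from the workspace root; VS Code reads settings.json as JSONC, so comments are allowed here.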
4 | ], 5 | "python.testing.unittestEnabled": false, 6 | "python.testing.pytestEnabled": true, 7 | "python.analysis.packageIndexDepths": [ 8 | { 9 | "name": "aana", 10 | "depth": 10, 11 | } 12 | ], 13 | } -------------------------------------------------------------------------------- /aana_chat_with_video/configs/settings.py: -------------------------------------------------------------------------------- 1 | from aana.configs.settings import Settings as AanaSettings 2 | 3 | 4 | class Settings(AanaSettings): 5 | """A pydantic model for App settings.""" 6 | 7 | asr_model_name: str = "whisper_medium" 8 | captioning_model_name: str = "hf_blip2_opt_2_7b" 9 | max_video_len: int = 60 * 20 # 20 minutes 10 | 11 | 12 | settings = Settings() 13 | -------------------------------------------------------------------------------- /aana_chat_with_video/storage/op.py: -------------------------------------------------------------------------------- 1 | from aana.storage.op import run_alembic_migrations as run_alembic_migrations_aana 2 | from aana.utils.core import get_module_dir 3 | 4 | 5 | def run_alembic_migrations(settings): 6 | """Runs alembic migrations before starting up.""" 7 | root_path = get_module_dir("aana_chat_with_video") 8 | 9 | run_alembic_migrations_aana(settings, root_path) 10 | -------------------------------------------------------------------------------- /aana_chat_with_video/app.py: -------------------------------------------------------------------------------- 1 | from aana.sdk import AanaSDK 2 | from aana_chat_with_video.configs.deployments import deployments 3 | from aana_chat_with_video.configs.endpoints import endpoints 4 | from aana_chat_with_video.storage.op import run_alembic_migrations 5 | 6 | aana_app = AanaSDK(name="aana_chat_with_video", migration_func=run_alembic_migrations) 7 | 8 | for deployment in deployments: 9 | aana_app.register_deployment(**deployment) 10 | 11 | for endpoint in endpoints: 12 | aana_app.register_endpoint(**endpoint) 13 | 14 | if __name__ == "__main__": 15 | aana_app.connect() 16 | aana_app.migrate() 17 | aana_app.deploy() 18 | -------------------------------------------------------------------------------- /aana_chat_with_video/core/models/video_status.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated, Any 2 | 3 | from pydantic import Field, ValidationInfo, ValidatorFunctionWrapHandler, WrapValidator 4 | 5 | from aana_chat_with_video.storage.models.extended_video import VideoProcessingStatus 6 | 7 | 8 | def process_video_status( 9 | v: Any, handler: ValidatorFunctionWrapHandler, info: ValidationInfo 10 | ) -> VideoProcessingStatus: 11 | """Coerces a string value into a VideoProcessingStatus enum member.""" 12 | if isinstance(v, str): 13 | return VideoProcessingStatus(v) 14 | return handler(v) 15 | 16 | 17 | VideoStatus = Annotated[ 18 | VideoProcessingStatus, 19 | Field(description="Video processing status."), 20 | WrapValidator(process_video_status), 21 | ] 22 | """ 23 | Video processing status.
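Accepts either a VideoProcessingStatus member or its string value; the wrap
validator coerces strings. A minimal usage sketch (assumes a pydantic model
that uses this annotation):

    from pydantic import BaseModel

    class StatusResponse(BaseModel):
        status: VideoStatus

    StatusResponse(status="running").status  # -> VideoProcessingStatus.RUNNING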
24 | """ 25 | -------------------------------------------------------------------------------- /aana_chat_with_video/endpoints/delete_video.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict 2 | 3 | from aana.api.api_generation import Endpoint 4 | from aana.core.models.media import MediaId 5 | from aana.storage.session import get_session 6 | from aana_chat_with_video.storage.repository.extended_video import ( 7 | ExtendedVideoRepository, 8 | ) 9 | 10 | 11 | class DeleteVideoOutput(TypedDict): 12 | """The output of the delete media endpoint.""" 13 | 14 | media_id: MediaId 15 | 16 | 17 | class DeleteVideoEndpoint(Endpoint): 18 | """Delete video endpoint.""" 19 | 20 | async def run(self, media_id: MediaId) -> DeleteVideoOutput: 21 | """Delete video.""" 22 | with get_session() as session: 23 | ExtendedVideoRepository(session).delete(media_id) 24 | return DeleteVideoOutput(media_id=media_id) 25 | -------------------------------------------------------------------------------- /aana_chat_with_video/core/prompts/loader.py: -------------------------------------------------------------------------------- 1 | from jinja2 import Environment, PackageLoader, Template 2 | 3 | 4 | def get_prompt_template(name: str) -> Template: 5 | """Load a prompt template by name. 6 | 7 | Use this function to load a prompt templates for LLMs: 8 | 9 | ```python 10 | from aana_chat_with_video.core.prompts.loader import get_prompt_template 11 | 12 | template = get_prompt_template("test") 13 | prompt = template.render(your_variable="your_value") 14 | ``` 15 | 16 | Args: 17 | name (str): The name of the prompt template. 18 | 19 | Returns: 20 | Template: The prompt template. 21 | """ 22 | env = Environment( 23 | loader=PackageLoader("aana_chat_with_video.core", "prompts"), autoescape=True 24 | ) 25 | template = env.get_template(f"{name}.j2") 26 | return template 27 | -------------------------------------------------------------------------------- /aana_chat_with_video/alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from typing import Sequence 9 | 10 | from alembic import op 11 | import sqlalchemy as sa 12 | ${imports if imports else ""} 13 | 14 | # revision identifiers, used by Alembic. 
15 | revision: str = ${repr(up_revision)} 16 | down_revision: str | None = ${repr(down_revision)} 17 | branch_labels: str | Sequence[str] | None = ${repr(branch_labels)} 18 | depends_on: str | Sequence[str] | None = ${repr(depends_on)} 19 | 20 | 21 | def upgrade() -> None: 22 | """Upgrade database to this revision from previous.""" 23 | ${upgrades if upgrades else "pass"} 24 | 25 | 26 | def downgrade() -> None: 27 | """Downgrade database from this revision to previous.""" 28 | ${downgrades if downgrades else "pass"} 29 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - '**' # Runs on push to any branch 7 | pull_request: 8 | branches: 9 | - '**' # Runs on pull requests to any branch 10 | workflow_dispatch: # Allows for manual triggering 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | strategy: 17 | fail-fast: false 18 | matrix: 19 | python-version: ["3.10", "3.11", "3.12"] 20 | 21 | steps: 22 | - name: Checkout code 23 | uses: actions/checkout@v3 24 | - name: Bootstrap poetry 25 | run: | 26 | curl -sSL https://install.python-poetry.org | python - -y 27 | - name: Update PATH 28 | run: echo "$HOME/.local/bin" >> $GITHUB_PATH 29 | - name: Install dependencies 30 | run: | 31 | poetry install 32 | - name: Test with pytest 33 | run: poetry run pytest -vv 34 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "aana_chat_with_video" 3 | version = "0.1.0" 4 | description = "A multimodal chat application that allows users to upload a video and ask questions about the video content based on the visual and audio information" 5 | authors = ["Mobius Labs GmbH "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.10" 10 | aana = "0.2.3" 11 | vllm = "0.6.3.post1" 12 | transformers = ">=4.47.0" 13 | 14 | [tool.poetry.group.dev.dependencies] 15 | ipykernel = "^6.29.4" 16 | ruff = "^0.1.5" 17 | pytest-asyncio = "^0.23.6" 18 | pytest-dotenv = "^0.5.2" 19 | pytest-env = "^1.1.3" 20 | pytest-mock = "^3.12.0" 21 | pytest-postgresql = "6.0.0" 22 | pytest-timeout = "^2.2.0" 23 | 24 | [build-system] 25 | requires = ["poetry-core"] 26 | build-backend = "poetry.core.masonry.api" 27 | 28 | [tool.pytest.ini_options] 29 | timeout = 600 30 | env = [ 31 | "TEST_MODE=True" 32 | ] 33 | -------------------------------------------------------------------------------- /aana_chat_with_video/endpoints/get_video_status.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict 2 | 3 | from aana.api.api_generation import Endpoint 4 | from aana.core.models.media import MediaId 5 | from aana.storage.session import get_session 6 | from aana_chat_with_video.core.models.video_status import VideoStatus 7 | from aana_chat_with_video.storage.repository.extended_video import ( 8 | ExtendedVideoRepository, 9 | ) 10 | 11 | 12 | class VideoStatusOutput(TypedDict): 13 | """The output of the video status endpoint.""" 14 | 15 | status: VideoStatus 16 | 17 | 18 | class GetVideoStatusEndpoint(Endpoint): 19 | """Get video status endpoint.""" 20 | 21 | async def run(self, media_id: MediaId) -> VideoStatusOutput: 22 | """Get the processing status of a video.""" 23 | with get_session() as session: 24 | video_status
= ExtendedVideoRepository(session).get_status(media_id) 25 | return VideoStatusOutput(status=video_status) 26 | -------------------------------------------------------------------------------- /aana_chat_with_video/endpoints/load_video_metadata.py: -------------------------------------------------------------------------------- 1 | from typing import TypedDict 2 | 3 | from aana.api.api_generation import Endpoint 4 | from aana.core.models.media import MediaId 5 | from aana.core.models.video import VideoMetadata 6 | from aana.storage.session import get_session 7 | from aana_chat_with_video.storage.repository.extended_video import ( 8 | ExtendedVideoRepository, 9 | ) 10 | 11 | 12 | class LoadVideoMetadataOutput(TypedDict): 13 | """The output of the load video metadata endpoint.""" 14 | 15 | metadata: VideoMetadata 16 | 17 | 18 | class LoadVideoMetadataEndpoint(Endpoint): 19 | """Load video metadata endpoint.""" 20 | 21 | async def run(self, media_id: MediaId) -> LoadVideoMetadataOutput: 22 | """Load video metadata.""" 23 | with get_session() as session: 24 | video_metadata = ExtendedVideoRepository(session).get_metadata(media_id) 25 | return LoadVideoMetadataOutput(metadata=video_metadata) 26 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Ubuntu", 3 | "build": { 4 | "dockerfile": "Dockerfile" 5 | }, 6 | "capAdd": [ 7 | "SYS_PTRACE" 8 | ], 9 | "features": { 10 | "ghcr.io/devcontainers/features/python:1": { 11 | "installTools": true, 12 | "version": "3.10" 13 | }, 14 | "ghcr.io/devcontainers-contrib/features/poetry:2": { 15 | "version": "latest" 16 | } 17 | }, 18 | "hostRequirements": { 19 | "gpu": "optional" 20 | }, 21 | "securityOpt": [ 22 | "seccomp=unconfined" 23 | ], 24 | "postStartCommand": "git config --global --add safe.directory ${containerWorkspaceFolder}", 25 | "customizations": { 26 | "vscode": { 27 | "extensions": [ 28 | "charliermarsh.ruff", 29 | "ms-python.python", 30 | "ms-python.mypy-type-checker", 31 | "ms-toolsai.jupyter" 32 | ] 33 | } 34 | } 35 | } -------------------------------------------------------------------------------- /aana_chat_with_video/storage/models/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F401 2 | # We need to import all db models here and, other than in the class definitions 3 | # themselves, only import them from aana_chat_with_video.storage.models directly. The reason for 4 | # this is the way SQLAlchemy's declarative base works. You can use forward 5 | # references like `parent = relationship("Parent", back_populates="child")`, but the 6 | # forward reference needs to have been resolved before the first constructor 7 | # is called so that SqlAlchemy "knows" about it.
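# For example, ExtendedVideoEntity names "ExtendedVideoCaptionEntity" as a string in its
# captions relationship; that string only resolves because the class is imported below.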
8 | # See: 9 | # https://docs.pylonsproject.org/projects/pyramid_cookbook/en/latest/database/sqlalchemy.html#importing-all-sqlalchemy-models 10 | # (even if not using Pyramid) 11 | 12 | from aana_chat_with_video.storage.models.extended_video import ExtendedVideoEntity 13 | from aana_chat_with_video.storage.models.extended_video_caption import ( 14 | ExtendedVideoCaptionEntity, 15 | ) 16 | from aana_chat_with_video.storage.models.extended_video_transcript import ( 17 | ExtendedVideoTranscriptEntity, 18 | ) 19 | -------------------------------------------------------------------------------- /aana_chat_with_video/exceptions/core.py: -------------------------------------------------------------------------------- 1 | from aana.core.models.media import MediaId 2 | from aana.exceptions.core import BaseException  # Aana SDK exception base (assumed import path); the builtin BaseException would reject the keyword args passed to __init__ below 3 | from aana_chat_with_video.storage.models.extended_video import VideoProcessingStatus 4 | 5 | 6 | class UnfinishedVideoException(BaseException): 7 | """Exception raised when trying to fetch an unfinished video. 8 | 9 | Attributes: 10 | media_id (int | MediaId): The ID of the video. 11 | status (VideoProcessingStatus): The current video status. 12 | message (str): The error message. 13 | """ 14 | 15 | def __init__( 16 | self, media_id: int | MediaId, status: VideoProcessingStatus, message: str 17 | ): 18 | """Constructor. 19 | 20 | Args: 21 | media_id (int | MediaId): The ID of the video. 22 | status (VideoProcessingStatus): The current video status. 23 | message (str): The error message. 24 | """ 25 | super().__init__(media_id=media_id, status=status, message=message) 26 | self.media_id = media_id 27 | self.status = status 28 | self.message = message 29 | 30 | def __reduce__(self): 31 | """Used for pickling.""" 32 | return (self.__class__, (self.media_id, self.status, self.message)) 33 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Use NVIDIA CUDA as base image 2 | FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 3 | 4 | # Build args 5 | ARG INSTALL_FLASH_ATTENTION=false 6 | 7 | # Set working directory 8 | WORKDIR /app 9 | 10 | # Set environment variables to non-interactive (this prevents some prompts) 11 | ENV DEBIAN_FRONTEND=noninteractive 12 | 13 | # Install required libraries, tools, and Python3 14 | RUN apt-get update && apt-get install -y ffmpeg curl git python3.10 python3-pip 15 | 16 | # Install poetry 17 | RUN curl -sSL https://install.python-poetry.org | python3 - 18 | 19 | # Update PATH 20 | RUN echo 'export PATH="/root/.local/bin:$PATH"' >> /root/.bashrc 21 | ENV PATH="/root/.local/bin:$PATH" 22 | 23 | # Copy project files into the container 24 | COPY .
/app 26 | 27 | # Install the package with poetry 28 | RUN poetry install 29 | 30 | # Install flash attention 31 | RUN poetry run pip install torch --index-url https://download.pytorch.org/whl/cu121 32 | RUN if [ "$INSTALL_FLASH_ATTENTION" = "true" ] ; then \ 33 | poetry run pip install flash-attn --no-build-isolation; \ 34 | else \ 35 | echo "Skipping flash-attn installation" ; \ 36 | fi 37 | 38 | # Disable buffering for stdout and stderr to get the logs in real time 39 | ENV PYTHONUNBUFFERED=1 40 | 41 | # Expose the desired port 42 | EXPOSE 8000 43 | 44 | # Run the app 45 | CMD ["poetry", "run", "aana", "deploy", "aana_chat_with_video.app:aana_app", "--host", "0.0.0.0"] 46 | -------------------------------------------------------------------------------- /aana_chat_with_video/configs/endpoints.py: -------------------------------------------------------------------------------- 1 | from aana_chat_with_video.endpoints.delete_video import DeleteVideoEndpoint 2 | from aana_chat_with_video.endpoints.get_video_status import GetVideoStatusEndpoint 3 | from aana_chat_with_video.endpoints.index_video import IndexVideoEndpoint 4 | from aana_chat_with_video.endpoints.load_video_metadata import LoadVideoMetadataEndpoint 5 | from aana_chat_with_video.endpoints.video_chat import VideoChatEndpoint 6 | 7 | endpoints: list[dict] = [ 8 | { 9 | "name": "index_video_stream", 10 | "path": "/video/index_stream", 11 | "summary": "Index a video and return the captions and transcriptions (streaming)", 12 | "endpoint_cls": IndexVideoEndpoint, 13 | }, 14 | { 15 | "name": "video_metadata", 16 | "path": "/video/metadata", 17 | "summary": "Load video metadata", 18 | "endpoint_cls": LoadVideoMetadataEndpoint, 19 | }, 20 | { 21 | "name": "video_chat_stream", 22 | "path": "/video/chat_stream", 23 | "summary": "Chat with video (streaming)", 24 | "endpoint_cls": VideoChatEndpoint, 25 | }, 26 | { 27 | "name": "video_status", 28 | "path": "/video/status", 29 | "summary": "Get video status", 30 | "endpoint_cls": GetVideoStatusEndpoint, 31 | }, 32 | { 33 | "name": "delete_media", 34 | "path": "/video/delete", 35 | "summary": "Delete video", 36 | "endpoint_cls": DeleteVideoEndpoint, 37 | }, 38 | ] 39 | -------------------------------------------------------------------------------- /aana_chat_with_video/alembic/versions/b9860676dd49_set_server_default_for_task_completed_.py: -------------------------------------------------------------------------------- 1 | """Set server default for task.completed_at and task.assigned_at to none and add num_retries. 2 | Revision ID: b9860676dd49 3 | Revises: 5ad873484aa3 4 | Create Date: 2024-08-22 07:54:55.921710 5 | """ 6 | from collections.abc import Sequence 7 | 8 | import sqlalchemy as sa 9 | from alembic import op 10 | 11 | # revision identifiers, used by Alembic.
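# (This migration is chained after 5ad873484aa3, the initial schema revision.)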
12 | revision: str = "b9860676dd49" 13 | down_revision: str | None = "5ad873484aa3" 14 | branch_labels: str | Sequence[str] | None = None 15 | depends_on: str | Sequence[str] | None = None 16 | 17 | 18 | def upgrade() -> None: 19 | """Upgrade database to this revision from previous.""" 20 | with op.batch_alter_table("tasks", schema=None) as batch_op: 21 | batch_op.alter_column( 22 | "completed_at", 23 | server_default=None, 24 | ) 25 | batch_op.alter_column( 26 | "assigned_at", 27 | server_default=None, 28 | ) 29 | batch_op.add_column( 30 | sa.Column( 31 | "num_retries", 32 | sa.Integer(), 33 | nullable=False, 34 | comment="Number of retries", 35 | server_default=sa.text("0"), 36 | ) 37 | ) 38 | 39 | # ### end Alembic commands ### 40 | 41 | 42 | def downgrade() -> None: 43 | """Downgrade database from this revision to previous.""" 44 | with op.batch_alter_table("tasks", schema=None) as batch_op: 45 | batch_op.drop_column("num_retries") 46 | 47 | # ### end Alembic commands ### 48 | -------------------------------------------------------------------------------- /app.dstack.yaml: -------------------------------------------------------------------------------- 1 | type: service 2 | 3 | name: aana-chat-with-video 4 | 5 | image: nvidia/cuda:12.3.2-cudnn9-devel-ubuntu22.04 6 | 7 | env: 8 | - NUM_WORKERS=5 9 | - TASK_QUEUE__EXECUTION_TIMEOUT=10000 10 | - TASK_QUEUE__NUM_WORKERS=5 11 | - TMP_DATA_DIR=/demo_data/aana 12 | - IMAGE_DIR=/demo_data/aana/images 13 | - VIDEO_DIR=/demo_data/aana/videos 14 | - AUDIO_DIR=/demo_data/aana/audios 15 | - MODEL_DIR=/demo_data/aana/models 16 | - DB_CONFIG__DATASTORE_TYPE=sqlite 17 | - DB_CONFIG__DATASTORE_CONFIG__PATH=/demo_data/aana.db 18 | 19 | commands: 20 | - apt-get update 21 | - apt-get install -y libgl1 libglib2.0-0 ffmpeg python3 python3-dev git nvtop htop sqlite3 cron 22 | - curl -sSL https://install.python-poetry.org | python3 - 23 | - export PATH=$PATH:/root/.local/bin 24 | - sh install.sh 25 | - mkdir -p /demo_data 26 | - mkdir -p /demo_data/hf_cache 27 | - (crontab -l 2>/dev/null; echo "0 3 * * * find $TMP_DATA_DIR/videos/* -type f -atime +3 -exec rm -f {} \;") | sort -u | crontab - 28 | - (crontab -l 2>/dev/null; echo "0 3 * * * find $TMP_DATA_DIR/audios/* -type f -atime +3 -exec rm -f {} \;") | sort -u | crontab - 29 | - (crontab -l 2>/dev/null; echo "0 3 * * * find $TMP_DATA_DIR/images/* -type f -atime +3 -exec rm -f {} \;") | sort -u | crontab - 30 | - HF_HUB_CACHE="/demo_data/hf_cache" CUDA_VISIBLE_DEVICES="0" poetry run aana deploy aana_chat_with_video.app:aana_app 31 | 32 | port: 8000 33 | 34 | replicas: 1 35 | 36 | auth: False 37 | 38 | spot_policy: on-demand 39 | 40 | max_price: 0.5 41 | 42 | volumes: 43 | - name: demo-data 44 | path: /demo_data 45 | 46 | resources: 47 | gpu: 48GB.. 48 | cpu: 8.. 49 | memory: 50GB.. 50 | disk: 50GB.. 
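  # Note: "48GB.." etc. use dstack's open-ended range syntax, i.e. "at least this much".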
51 | -------------------------------------------------------------------------------- /aana_chat_with_video/storage/models/extended_video_caption.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations # Let classes use themselves in type annotations 2 | 3 | import typing 4 | 5 | from sqlalchemy import ForeignKey 6 | from sqlalchemy.orm import Mapped, mapped_column, relationship 7 | 8 | from aana.core.models.media import MediaId # noqa: TCH001 9 | from aana.storage.models.caption import CaptionEntity 10 | 11 | if typing.TYPE_CHECKING: 12 | from aana.core.models.captions import Caption 13 | 14 | 15 | class ExtendedVideoCaptionEntity(CaptionEntity): 16 | """ORM model for video captions in extended video.""" 17 | 18 | __tablename__ = "extended_video_caption" 19 | 20 | id: Mapped[int] = mapped_column(ForeignKey("caption.id"), primary_key=True) 21 | 22 | media_id: Mapped[MediaId] = mapped_column( 23 | ForeignKey("extended_video.id"), 24 | nullable=False, 25 | comment="Foreign key to video table", 26 | ) 27 | 28 | video = relationship( 29 | "ExtendedVideoEntity", back_populates="captions", uselist=False 30 | ) 31 | 32 | __mapper_args__ = { # noqa: RUF012 33 | "polymorphic_identity": "extended_video_caption", 34 | } 35 | 36 | @classmethod 37 | def from_caption_output( 38 | cls, 39 | model_name: str, 40 | caption: Caption, 41 | media_id: MediaId, 42 | frame_id: int, 43 | timestamp: float, 44 | ) -> ExtendedVideoCaptionEntity: 45 | """Converts a Caption pydantic model to a ExtendedVideoCaptionEntity.""" 46 | caption_entity = CaptionEntity.from_caption_output( 47 | model_name=model_name, 48 | frame_id=frame_id, 49 | timestamp=timestamp, 50 | caption=caption, 51 | ) 52 | return cls.from_parent(caption_entity, media_id=media_id) 53 | -------------------------------------------------------------------------------- /aana_chat_with_video/storage/models/extended_video.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | from sqlalchemy import ForeignKey 4 | from sqlalchemy.orm import Mapped, mapped_column, relationship 5 | 6 | from aana.core.models.media import MediaId 7 | from aana.storage.models.video import VideoEntity 8 | from aana_chat_with_video.storage.models.extended_video_caption import ( 9 | ExtendedVideoCaptionEntity, 10 | ) 11 | from aana_chat_with_video.storage.models.extended_video_transcript import ( 12 | ExtendedVideoTranscriptEntity, 13 | ) 14 | 15 | 16 | class VideoProcessingStatus(str, Enum): 17 | """Enum for video status.""" 18 | 19 | CREATED = "created" 20 | RUNNING = "running" 21 | COMPLETED = "completed" 22 | FAILED = "failed" 23 | 24 | 25 | class ExtendedVideoEntity(VideoEntity): 26 | """ORM class for videos with additional metadata.""" 27 | 28 | __tablename__ = "extended_video" 29 | 30 | id: Mapped[MediaId] = mapped_column(ForeignKey("video.id"), primary_key=True) 31 | duration: Mapped[float | None] = mapped_column(comment="Video duration in seconds") 32 | status: Mapped[VideoProcessingStatus] = mapped_column( 33 | nullable=False, 34 | default=VideoProcessingStatus.CREATED, 35 | comment="Processing status", 36 | ) 37 | 38 | captions: Mapped[list[ExtendedVideoCaptionEntity]] = relationship( 39 | "ExtendedVideoCaptionEntity", 40 | back_populates="video", 41 | cascade="all, delete", 42 | uselist=True, 43 | ) 44 | transcript: Mapped[list[ExtendedVideoTranscriptEntity]] = relationship( 45 | "ExtendedVideoTranscriptEntity", 46 | back_populates="video", 47 
| cascade="all, delete", 48 | uselist=True, 49 | ) 50 | 51 | __mapper_args__ = { # noqa: RUF012 52 | "polymorphic_identity": "extended_video", 53 | } 54 | -------------------------------------------------------------------------------- /aana_chat_with_video/storage/models/extended_video_transcript.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations # Let classes use themselves in type annotations 2 | 3 | from typing import TYPE_CHECKING 4 | 5 | from sqlalchemy import ForeignKey 6 | from sqlalchemy.orm import Mapped, mapped_column, relationship 7 | 8 | from aana.core.models.media import MediaId # noqa: TCH001 9 | from aana.storage.models.transcript import TranscriptEntity 10 | 11 | if TYPE_CHECKING: 12 | from aana.core.models.asr import ( 13 | AsrSegments, 14 | AsrTranscription, 15 | AsrTranscriptionInfo, 16 | ) 17 | 18 | 19 | class ExtendedVideoTranscriptEntity(TranscriptEntity): 20 | """ORM class for extended video transcripts.""" 21 | 22 | __tablename__ = "extended_video_transcript" 23 | 24 | id: Mapped[int] = mapped_column(ForeignKey("transcript.id"), primary_key=True) 25 | media_id: Mapped[MediaId] = mapped_column( 26 | ForeignKey("extended_video.id"), 27 | nullable=False, 28 | comment="Foreign key to video table", 29 | ) 30 | 31 | video = relationship( 32 | "ExtendedVideoEntity", back_populates="transcript", uselist=False 33 | ) 34 | 35 | __mapper_args__ = { # noqa: RUF012 36 | "polymorphic_identity": "extended_video_transcript", 37 | } 38 | 39 | @classmethod 40 | def from_asr_output( 41 | cls, 42 | model_name: str, 43 | media_id: MediaId, 44 | info: AsrTranscriptionInfo, 45 | transcription: AsrTranscription, 46 | segments: AsrSegments, 47 | ) -> ExtendedVideoTranscriptEntity: 48 | """Converts an AsrTranscriptionInfo and AsrTranscription to a single Transcript entity.""" 49 | transcript_entity = super().from_asr_output( 50 | model_name=model_name, 51 | info=info, 52 | transcription=transcription, 53 | segments=segments, 54 | ) 55 | return cls.from_parent(transcript_entity, media_id=media_id) 56 | -------------------------------------------------------------------------------- /aana_chat_with_video/tests/conftest.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: S101 2 | import os 3 | import tempfile 4 | 5 | import pytest 6 | from sqlalchemy.orm import Session 7 | 8 | from aana.configs.db import DbSettings, SQLiteConfig 9 | from aana.configs.settings import settings as aana_settings 10 | from aana.exceptions.runtime import EmptyMigrationsException 11 | from aana.storage.op import DbType, run_alembic_migrations 12 | from aana.tests.conftest import app_factory, call_endpoint # noqa: F401 13 | from aana.utils.json import jsonify 14 | from aana_chat_with_video.configs.settings import settings 15 | from aana_chat_with_video.storage.op import ( 16 | run_alembic_migrations as run_app_alembic_migrations, 17 | ) 18 | 19 | 20 | @pytest.fixture(scope="function") 21 | def db_session(): 22 | """Creates a new database file and session for each test.""" 23 | with tempfile.NamedTemporaryFile(dir=settings.tmp_data_dir) as tmp: 24 | db_config = DbSettings( 25 | datastore_type=DbType.SQLITE, 26 | datastore_config=SQLiteConfig(path=tmp.name), 27 | ) 28 | os.environ["DB_CONFIG"] = jsonify(db_config) 29 | 30 | settings.db_config = db_config 31 | settings.db_config._engine = None 32 | aana_settings.db_config = db_config 33 | aana_settings.db_config._engine = None 34 | 35 | try: 36 | 
run_app_alembic_migrations(settings) 37 | except EmptyMigrationsException: 38 | print( 39 | "No versions found in the custom migrations. Using default migrations." 40 | ) 41 | run_alembic_migrations(settings) 42 | 43 | # Create a new session 44 | engine = settings.db_config.get_engine() 45 | with Session(engine) as session: 46 | yield session 47 | 48 | 49 | @pytest.fixture(scope="module") 50 | def app_setup(app_factory): 51 | """Setup app for testing.""" 52 | app, tmp_database_path = app_factory("aana_chat_with_video.app", "aana_app") 53 | yield app 54 | tmp_database_path.unlink() 55 | app.shutdown() 56 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | postgres: 5 | restart: always 6 | container_name: aana_chat_with_video_db 7 | image: postgres 8 | command: postgres -c 'max_connections=1000' 9 | healthcheck: 10 | test: /usr/bin/pg_isready 11 | timeout: 45s 12 | interval: 10s 13 | retries: 10 14 | ports: 15 | - '15430:15430' 16 | expose: 17 | - 15430 18 | environment: 19 | PGPASSWORD: '${POSTGRES_PASSWORD:-Yf?5nX39}' 20 | PGUSER: '${POSTGRES_USER:-aana_db_user}' 21 | PGDATABASE: '${POSTGRES_DB:-aana_db}' 22 | POSTGRES_PASSWORD: '${POSTGRES_PASSWORD:-Yf?5nX39}' 23 | POSTGRES_USER: '${POSTGRES_USER:-aana_db_user}' 24 | POSTGRES_DB: '${POSTGRES_DB:-aana_db}' 25 | PGPORT: '15430' 26 | PGDATA: '/pgdata' 27 | volumes: 28 | - pg_data:/pgdata 29 | 30 | aana_chat_with_video_app: 31 | restart: always 32 | container_name: aana_chat_with_video_app 33 | depends_on: 34 | postgres: 35 | condition: service_healthy 36 | ports: 37 | - 8000:8000 # request server 38 | expose: 39 | - '8000' 40 | build: 41 | context: . 
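      # Build context is the repository root so that "COPY . /app" in the Dockerfile captures the whole project.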
42 | dockerfile: Dockerfile 43 | args: 44 | INSTALL_FLASH_ATTENTION: '${INSTALL_FLASH_ATTENTION:-false}' 45 | deploy: 46 | resources: 47 | reservations: 48 | devices: 49 | - capabilities: ["gpu"] 50 | environment: 51 | CUDA_VISIBLE_DEVICES: 52 | HF_HUB_ENABLE_HF_TRANSFER: '${HF_HUB_ENABLE_HF_TRANSFER:-1}' 53 | HF_TOKEN: '${HF_TOKEN}' 54 | HF_DATASETS_CACHE: /root/.cache/huggingface 55 | NUM_WORKERS: '${NUM_WORKERS:-2}' 56 | TMP_DATA_DIR: /tmp/aana_data 57 | DB_CONFIG: '{"datastore_type":"postgresql","datastore_config":{"host":"postgres","port":"15430","user":"${POSTGRES_USER:-aana_db_user}","password":"${POSTGRES_PASSWORD:-Yf?5nX39}","database":"${POSTGRES_DB:-aana_db}"}}' 58 | volumes: 59 | - app_data:/tmp/aana_data 60 | - hf_datasets_cache:/root/.cache/huggingface 61 | 62 | volumes: 63 | pg_data: 64 | name: aana_chat_with_video_postgres_data 65 | app_data: 66 | name: aana_chat_with_video_app_data 67 | hf_datasets_cache: 68 | name: hf_datasets_cache 69 | -------------------------------------------------------------------------------- /aana_chat_with_video/storage/repository/extended_video.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.orm import Session 2 | 3 | from aana.core.models.media import MediaId 4 | from aana.core.models.video import Video, VideoMetadata 5 | from aana.storage.repository.video import VideoRepository 6 | from aana_chat_with_video.storage.models.extended_video import ( 7 | ExtendedVideoEntity, 8 | VideoProcessingStatus, 9 | ) 10 | 11 | 12 | class ExtendedVideoRepository(VideoRepository[ExtendedVideoEntity]): 13 | """Repository for videos with additional metadata.""" 14 | 15 | def __init__(self, session: Session): 16 | """Constructor.""" 17 | super().__init__(session, ExtendedVideoEntity) 18 | 19 | def save(self, video: Video, duration: float | None = None) -> ExtendedVideoEntity: 20 | """Saves a video to datastore. 21 | 22 | Args: 23 | video (Video): The video object. 24 | duration (float | None): The duration of the video in seconds. 25 | 26 | Returns: 27 | ExtendedVideoEntity: The saved video entity. 28 | """ 29 | video_entity = ExtendedVideoEntity( 30 | id=video.media_id, 31 | path=str(video.path), 32 | url=video.url, 33 | title=video.title, 34 | description=video.description, 35 | duration=duration, 36 | ) 37 | self.create(video_entity) 38 | return video_entity 39 | 40 | def get_status(self, media_id: MediaId) -> VideoProcessingStatus: 41 | """Get the status of a video. 42 | 43 | Args: 44 | media_id (MediaId): The media ID. 45 | 46 | Returns: 47 | VideoProcessingStatus: The status of the video. 48 | """ 49 | entity: ExtendedVideoEntity = self.read(media_id) 50 | return entity.status 51 | 52 | def update_status(self, media_id: MediaId, status: VideoProcessingStatus): 53 | """Update the status of a video. 54 | 55 | Args: 56 | media_id (MediaId): The media ID. 57 | status (VideoProcessingStatus): The status of the video. 58 | """ 59 | entity: ExtendedVideoEntity = self.read(media_id) 60 | entity.status = status 61 | self.session.commit() 62 | 63 | def get_metadata(self, media_id: MediaId) -> VideoMetadata: 64 | """Get the metadata of a video. 65 | 66 | Args: 67 | media_id (MediaId): The media ID. 68 | 69 | Returns: 70 | VideoMetadata: The video metadata.
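        Raises:
            NotFoundException: If no video with the given media ID exists.

        Usage sketch (get_session comes from aana.storage.session):
            with get_session() as session:
                metadata = ExtendedVideoRepository(session).get_metadata(media_id)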
71 | """ 72 | entity: ExtendedVideoEntity = self.read(media_id) 73 | return VideoMetadata( 74 | title=entity.title, 75 | description=entity.description, 76 | duration=entity.duration, 77 | ) 78 | -------------------------------------------------------------------------------- /aana_chat_with_video/alembic/env.py: -------------------------------------------------------------------------------- 1 | from logging.config import fileConfig 2 | 3 | from alembic import context 4 | from sqlalchemy import engine_from_config, pool 5 | 6 | from aana.configs.settings import settings 7 | from aana.storage.models.base import BaseEntity 8 | # Import all models to be included in the migration 9 | import aana.storage.models # noqa: F401 10 | import aana_chat_with_video.storage.models # noqa: F401 11 | 12 | # this is the Alembic Config object, which provides 13 | # access to the values within the .ini file in use. 14 | config = context.config 15 | 16 | # Interpret the config file for Python logging. 17 | # This line sets up loggers basically. 18 | if config.config_file_name is not None: 19 | fileConfig(config.config_file_name) 20 | 21 | # add your model's MetaData object here 22 | # for 'autogenerate' support 23 | # from myapp import mymodel 24 | # target_metadata = mymodel.Base.metadata 25 | 26 | target_metadata = BaseEntity.metadata 27 | 28 | # other values from the config, defined by the needs of env.py, 29 | # can be acquired: 30 | # my_important_option = config.get_main_option("my_important_option") 31 | # ... etc. 32 | 33 | 34 | def run_migrations_offline() -> None: 35 | """Run migrations in 'offline' mode. 36 | 37 | Modified to use our existing db config module. 38 | 39 | Calls to context.execute() here emit the given string to the 40 | script output. 41 | 42 | """ 43 | engine = settings.db_config.get_engine() 44 | context.configure( 45 | url=engine.url, 46 | target_metadata=target_metadata, 47 | literal_binds=True, 48 | dialect_opts={"paramstyle": "named"}, 49 | render_as_batched=True, 50 | ) 51 | 52 | with context.begin_transaction(): 53 | context.run_migrations() 54 | 55 | 56 | def run_migrations_online() -> None: 57 | """Run migrations in 'online' mode. 58 | 59 | In this scenario we need to create an Engine 60 | and associate a connection with the context. 
61 | 62 | """ 63 | config_section = config.get_section(config.config_ini_section, {}) 64 | engine = settings.db_config.get_engine() 65 | config_section["sqlalchemy.url"] = engine.url 66 | connectable = engine_from_config( 67 | config_section, 68 | prefix="sqlalchemy.", 69 | poolclass=pool.NullPool, 70 | ) 71 | 72 | with connectable.connect() as connection: 73 | context.configure( 74 | connection=connection, target_metadata=target_metadata, render_as_batch=True 75 | ) 76 | 77 | with context.begin_transaction(): 78 | context.run_migrations() 79 | 80 | 81 | if context.is_offline_mode(): 82 | run_migrations_offline() 83 | else: 84 | run_migrations_online() 85 | -------------------------------------------------------------------------------- /aana_chat_with_video/tests/test_extended_video_repo.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: S101 2 | 3 | import uuid 4 | from importlib import resources 5 | 6 | import pytest 7 | 8 | from aana.core.models.video import Video, VideoMetadata 9 | from aana.exceptions.db import MediaIdAlreadyExistsException, NotFoundException 10 | from aana_chat_with_video.storage.models.extended_video import VideoProcessingStatus 11 | from aana_chat_with_video.storage.repository.extended_video import ( 12 | ExtendedVideoRepository, 13 | ) 14 | 15 | 16 | @pytest.fixture(scope="function") 17 | def dummy_video(): 18 | """Creates a dummy video for testing.""" 19 | media_id = str(uuid.uuid4()) 20 | path = resources.path("aana.tests.files.videos", "squirrel.mp4") 21 | video = Video( 22 | path=path, media_id=media_id, title="Squirrel", description="A squirrel video" 23 | ) 24 | return video 25 | 26 | 27 | def test_save_video(db_session, dummy_video): 28 | """Tests saving a video.""" 29 | video_repo = ExtendedVideoRepository(db_session) 30 | video_repo.save(dummy_video, duration=10) 31 | 32 | video_entity = video_repo.read(dummy_video.media_id) 33 | assert video_entity 34 | assert video_entity.id == dummy_video.media_id 35 | 36 | # Try to save the same video again 37 | with pytest.raises(MediaIdAlreadyExistsException): 38 | video_repo.save(dummy_video) 39 | 40 | video_repo.delete(dummy_video.media_id) 41 | with pytest.raises(NotFoundException): 42 | video_repo.read(dummy_video.media_id) 43 | 44 | 45 | def test_get_metadata(db_session, dummy_video): 46 | """Tests getting video metadata.""" 47 | video_repo = ExtendedVideoRepository(db_session) 48 | video_repo.save(dummy_video, duration=10) 49 | 50 | metadata = video_repo.get_metadata(dummy_video.media_id) 51 | assert isinstance(metadata, VideoMetadata) 52 | assert metadata.title == dummy_video.title 53 | assert metadata.description == dummy_video.description 54 | assert metadata.duration == 10 55 | 56 | video_repo.delete(dummy_video.media_id) 57 | with pytest.raises(NotFoundException): 58 | video_repo.get_metadata(dummy_video.media_id) 59 | 60 | 61 | def test_status(db_session, dummy_video): 62 | """Tests getting and updating video status.""" 63 | video_repo = ExtendedVideoRepository(db_session) 64 | video_repo.save(dummy_video, duration=10) 65 | 66 | assert video_repo.get_status(dummy_video.media_id) == VideoProcessingStatus.CREATED 67 | 68 | video_repo.update_status(dummy_video.media_id, VideoProcessingStatus.RUNNING) 69 | 70 | assert video_repo.get_status(dummy_video.media_id) == VideoProcessingStatus.RUNNING 71 | 72 | video_repo.delete(dummy_video.media_id) 73 | 74 | with pytest.raises(NotFoundException): 75 | video_repo.get_status(dummy_video.media_id) 76 | 
video_repo.update_status(dummy_video.media_id, VideoProcessingStatus.COMPLETED) 77 | -------------------------------------------------------------------------------- /aana_chat_with_video/configs/deployments.py: -------------------------------------------------------------------------------- 1 | from aana.core.models.sampling import SamplingParams 2 | from aana.core.models.types import Dtype 3 | from aana.deployments.vad_deployment import VadConfig, VadDeployment 4 | from aana.deployments.hf_blip2_deployment import HFBlip2Config, HFBlip2Deployment 5 | from aana.deployments.vllm_deployment import VLLMConfig, VLLMDeployment 6 | from aana.deployments.whisper_deployment import ( 7 | WhisperComputeType, 8 | WhisperConfig, 9 | WhisperDeployment, 10 | WhisperModelSize, 11 | ) 12 | 13 | deployments: list[dict] = [ 14 | { 15 | "name": "asr_deployment", 16 | "instance": WhisperDeployment.options( 17 | num_replicas=1, 18 | max_ongoing_requests=1000, 19 | ray_actor_options={"num_gpus": 0.25}, 20 | user_config=WhisperConfig( 21 | model_size=WhisperModelSize.TURBO, 22 | compute_type=WhisperComputeType.FLOAT16, 23 | ).model_dump(mode="json"), 24 | ), 25 | }, 26 | { 27 | "name": "vad_deployment", 28 | "instance": VadDeployment.options( 29 | num_replicas=1, 30 | max_ongoing_requests=1000, 31 | ray_actor_options={"num_gpus": 0.05}, 32 | user_config=VadConfig( 33 | model=( 34 | "https://whisperx.s3.eu-west-2.amazonaws.com/model_weights/segmentation/" 35 | "0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea/pytorch_model.bin" 36 | ), 37 | onset=0.5, 38 | sample_rate=16000, 39 | ).model_dump(mode="json"), 40 | ), 41 | }, 42 | { 43 | "name": "captioning_deployment", 44 | "instance": HFBlip2Deployment.options( 45 | num_replicas=1, 46 | max_ongoing_requests=1000, 47 | ray_actor_options={"num_gpus": 0.25}, 48 | user_config=HFBlip2Config( 49 | model="Salesforce/blip2-opt-2.7b", 50 | dtype=Dtype.FLOAT16, 51 | batch_size=2, 52 | num_processing_threads=2, 53 | ).model_dump(mode="json"), 54 | ), 55 | }, 56 | { 57 | "name": "llm_deployment", 58 | "instance": VLLMDeployment.options( 59 | num_replicas=1, 60 | ray_actor_options={"num_gpus": 0.45}, 61 | user_config=VLLMConfig( 62 | model="internlm/internlm2_5-7b-chat", 63 | dtype=Dtype.AUTO, 64 | gpu_memory_reserved=30000, 65 | max_model_len=50000, 66 | enforce_eager=True, 67 | default_sampling_params=SamplingParams( 68 | temperature=0.0, top_p=1.0, top_k=-1, max_tokens=1024 69 | ), 70 | engine_args={"trust_remote_code": True}, 71 | ).model_dump(mode="json"), 72 | ), 73 | }, 74 | ] 75 | -------------------------------------------------------------------------------- /aana_chat_with_video/alembic/versions/d93a90261ee5_added_extended_video.py: -------------------------------------------------------------------------------- 1 | """added extended video. 2 | 3 | Revision ID: d93a90261ee5 4 | Revises: b9860676dd49 5 | Create Date: 2024-07-29 12:41:04.976640 6 | 7 | """ 8 | from collections.abc import Sequence 9 | 10 | import sqlalchemy as sa 11 | from alembic import op 12 | 13 | # revision identifiers, used by Alembic. 14 | revision: str = 'd93a90261ee5' 15 | down_revision: str | None = 'b9860676dd49' 16 | branch_labels: str | Sequence[str] | None = None 17 | depends_on: str | Sequence[str] | None = None 18 | 19 | 20 | def upgrade() -> None: 21 | """Upgrade database to this revision from previous.""" 22 | # ### commands auto generated by Alembic - please adjust! 
### 23 | op.create_table('extended_video', 24 | sa.Column('id', sa.String(length=36), nullable=False), 25 | sa.Column('duration', sa.Float(), nullable=True, comment='Video duration in seconds'), 26 | sa.Column('status', sa.Enum('CREATED', 'RUNNING', 'COMPLETED', 'FAILED', name='videoprocessingstatus'), nullable=False, comment='Processing status'), 27 | sa.ForeignKeyConstraint(['id'], ['video.id'], name=op.f('fk_extended_video_id_video')), 28 | sa.PrimaryKeyConstraint('id', name=op.f('pk_extended_video')) 29 | ) 30 | op.create_table('extended_video_caption', 31 | sa.Column('id', sa.Integer(), nullable=False), 32 | sa.Column('media_id', sa.String(length=36), nullable=False, comment='Foreign key to video table'), 33 | sa.ForeignKeyConstraint(['id'], ['caption.id'], name=op.f('fk_extended_video_caption_id_caption')), 34 | sa.ForeignKeyConstraint(['media_id'], ['extended_video.id'], name=op.f('fk_extended_video_caption_media_id_extended_video')), 35 | sa.PrimaryKeyConstraint('id', name=op.f('pk_extended_video_caption')) 36 | ) 37 | op.create_table('extended_video_transcript', 38 | sa.Column('id', sa.Integer(), nullable=False), 39 | sa.Column('media_id', sa.String(length=36), nullable=False, comment='Foreign key to video table'), 40 | sa.ForeignKeyConstraint(['id'], ['transcript.id'], name=op.f('fk_extended_video_transcript_id_transcript')), 41 | sa.ForeignKeyConstraint(['media_id'], ['extended_video.id'], name=op.f('fk_extended_video_transcript_media_id_extended_video')), 42 | sa.PrimaryKeyConstraint('id', name=op.f('pk_extended_video_transcript')) 43 | ) 44 | with op.batch_alter_table('tasks', schema=None) as batch_op: 45 | batch_op.alter_column('id', 46 | existing_type=sa.NUMERIC(), 47 | type_=sa.UUID(), 48 | existing_nullable=False) 49 | 50 | # ### end Alembic commands ### 51 | 52 | 53 | def downgrade() -> None: 54 | """Downgrade database from this revision to previous.""" 55 | # ### commands auto generated by Alembic - please adjust! 
### 56 | with op.batch_alter_table('tasks', schema=None) as batch_op: 57 | batch_op.alter_column('id', 58 | existing_type=sa.UUID(), 59 | type_=sa.NUMERIC(), 60 | existing_nullable=False) 61 | 62 | op.drop_table('extended_video_transcript') 63 | op.drop_table('extended_video_caption') 64 | op.drop_table('extended_video') 65 | # ### end Alembic commands ### 66 | -------------------------------------------------------------------------------- /aana_chat_with_video/storage/repository/extended_video_transcript.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.orm import Session 2 | 3 | from aana.core.models.asr import ( 4 | AsrSegment, 5 | AsrSegments, 6 | AsrTranscription, 7 | AsrTranscriptionInfo, 8 | ) 9 | from aana.core.models.media import MediaId 10 | from aana.exceptions.db import NotFoundException 11 | from aana.storage.repository.transcript import TranscriptRepository 12 | from aana_chat_with_video.storage.models.extended_video_transcript import ( 13 | ExtendedVideoTranscriptEntity, 14 | ) 15 | 16 | 17 | class ExtendedVideoTranscriptRepository( 18 | TranscriptRepository[ExtendedVideoTranscriptEntity] 19 | ): 20 | """Repository for Transcripts.""" 21 | 22 | def __init__(self, session: Session): 23 | """Constructor.""" 24 | super().__init__(session, ExtendedVideoTranscriptEntity) 25 | 26 | def save( 27 | self, 28 | model_name: str, 29 | media_id: MediaId, 30 | transcription_info: AsrTranscriptionInfo, 31 | transcription: AsrTranscription, 32 | segments: AsrSegments, 33 | ) -> ExtendedVideoTranscriptEntity: 34 | """Save transcripts. 35 | 36 | Args: 37 | model_name (str): The name of the model used to generate the transcript. 38 | media_id (MediaId): The media id of the video 39 | transcription_info (AsrTranscriptionInfo): The ASR transcription info. 40 | transcription (AsrTranscription): The ASR transcription. 41 | segments (AsrSegments): The ASR segments. 42 | 43 | Returns: 44 | ExtendedVideoTranscriptEntity: The transcript entity. 45 | """ 46 | transcript_entity = ExtendedVideoTranscriptEntity.from_asr_output( 47 | model_name=model_name, 48 | media_id=media_id, 49 | transcription=transcription, 50 | segments=segments, 51 | info=transcription_info, 52 | ) 53 | self.session.add(transcript_entity) 54 | self.session.commit() 55 | return transcript_entity 56 | 57 | def get_transcript(self, model_name: str, media_id: MediaId) -> dict: 58 | """Get the transcript for a video. 59 | 60 | Args: 61 | model_name (str): The name of the model used to generate the transcript. 62 | media_id (MediaId): The media ID. 63 | 64 | Returns: 65 | dict: The dictionary with the transcript, segments, and info. 
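        Raises:
            NotFoundException: If no transcript exists for the given model name and media ID.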
66 | """ 67 | entity = ( 68 | self.session.query(self.model_class) 69 | .filter_by(model=model_name, media_id=media_id) 70 | .first() 71 | ) 72 | if not entity: 73 | raise NotFoundException(self.table_name, media_id) 74 | transcription = AsrTranscription(text=entity.transcript) 75 | segments = [AsrSegment(**s) for s in entity.segments] 76 | info = AsrTranscriptionInfo( 77 | language=entity.language, 78 | language_confidence=entity.language_confidence, 79 | ) 80 | return { 81 | "transcription": transcription, 82 | "segments": segments, 83 | "transcription_info": info, 84 | } 85 | -------------------------------------------------------------------------------- /aana_chat_with_video/tests/test_extended_video_transcript_repo.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: S101 2 | 3 | import pytest 4 | 5 | from aana.core.models.asr import AsrSegment, AsrTranscription, AsrTranscriptionInfo 6 | from aana.core.models.time import TimeInterval 7 | from aana.exceptions.db import NotFoundException 8 | from aana.storage.models.transcript import TranscriptEntity 9 | from aana_chat_with_video.storage.repository.extended_video_transcript import ( 10 | ExtendedVideoTranscriptRepository, 11 | ) 12 | 13 | transcript_entity = TranscriptEntity.from_asr_output( 14 | model_name="whisper", 15 | transcription=AsrTranscription(text="This is a transcript"), 16 | segments=[], 17 | info=AsrTranscriptionInfo(), 18 | ) 19 | 20 | 21 | @pytest.fixture(scope="function") 22 | def dummy_transcript(): 23 | """Creates a dummy transcript for testing.""" 24 | transcript = AsrTranscription(text="This is a transcript") 25 | segments = [ 26 | AsrSegment(text="This is a segment", time_interval=TimeInterval(start=0, end=1)) 27 | ] 28 | info = AsrTranscriptionInfo(language="en", language_confidence=0.9) 29 | return transcript, segments, info 30 | 31 | 32 | def test_save_transcript(db_session, dummy_transcript): 33 | """Tests saving a transcript.""" 34 | transcript, segments, info = dummy_transcript 35 | model_name = "whisper" 36 | media_id = "test_media_id" 37 | 38 | transcript_repo = ExtendedVideoTranscriptRepository(db_session) 39 | transcript_entity = transcript_repo.save( 40 | model_name=model_name, 41 | media_id=media_id, 42 | transcription_info=info, 43 | transcription=transcript, 44 | segments=segments, 45 | ) 46 | 47 | transcript_id = transcript_entity.id 48 | 49 | transcript_entity = transcript_repo.read(transcript_id) 50 | assert transcript_entity 51 | assert transcript_entity.id == transcript_id 52 | assert transcript_entity.media_id == media_id 53 | assert transcript_entity.model == model_name 54 | assert transcript_entity.transcript == transcript.text 55 | assert len(transcript_entity.segments) == len(segments) 56 | assert transcript_entity.language == info.language 57 | assert transcript_entity.language_confidence == info.language_confidence 58 | 59 | transcript_repo.delete(transcript_id) 60 | with pytest.raises(NotFoundException): 61 | transcript_repo.read(transcript_id) 62 | 63 | 64 | def test_get_transcript(db_session, dummy_transcript): 65 | """Tests getting a transcript.""" 66 | transcript, segments, info = dummy_transcript 67 | model_name = "whisper" 68 | media_id = "test_media_id" 69 | 70 | transcript_repo = ExtendedVideoTranscriptRepository(db_session) 71 | _ = transcript_repo.save( 72 | model_name=model_name, 73 | media_id=media_id, 74 | transcription_info=info, 75 | transcription=transcript, 76 | segments=segments, 77 | ) 78 | 79 | transcript = 
transcript_repo.get_transcript(model_name, media_id) 80 | assert "transcription" in transcript 81 | assert "segments" in transcript 82 | assert "transcription_info" in transcript 83 | 84 | assert isinstance(transcript["transcription"], AsrTranscription) 85 | assert isinstance(transcript["segments"], list) 86 | assert all(isinstance(s, AsrSegment) for s in transcript["segments"]) 87 | assert isinstance(transcript["transcription_info"], AsrTranscriptionInfo) 88 | -------------------------------------------------------------------------------- /aana_chat_with_video/endpoints/video_chat.py: -------------------------------------------------------------------------------- 1 | import json 2 | from collections.abc import AsyncGenerator 3 | from typing import Annotated, TypedDict 4 | 5 | from aana.storage.session import get_session 6 | from pydantic import Field 7 | 8 | from aana.api.api_generation import Endpoint 9 | from aana.core.models.chat import Question 10 | from aana.core.models.media import MediaId 11 | from aana.core.models.sampling import SamplingParams 12 | from aana.deployments.aana_deployment_handle import AanaDeploymentHandle 13 | from aana_chat_with_video.configs.settings import settings 14 | from aana_chat_with_video.exceptions.core import UnfinishedVideoException 15 | from aana_chat_with_video.storage.models.extended_video import VideoProcessingStatus 16 | from aana_chat_with_video.storage.repository.extended_video import ( 17 | ExtendedVideoRepository, 18 | ) 19 | from aana_chat_with_video.storage.repository.extended_video_caption import ( 20 | ExtendedVideoCaptionRepository, 21 | ) 22 | from aana_chat_with_video.storage.repository.extended_video_transcript import ( 23 | ExtendedVideoTranscriptRepository, 24 | ) 25 | from aana_chat_with_video.utils.core import generate_combined_timeline, generate_dialog 26 | 27 | 28 | class VideoChatEndpointOutput(TypedDict): 29 | """Video chat endpoint output.""" 30 | 31 | completion: Annotated[str, Field(description="Generated text.")] 32 | 33 | 34 | class VideoChatEndpoint(Endpoint): 35 | """Video chat endpoint.""" 36 | 37 | async def initialize(self): 38 | """Initialize the endpoint.""" 39 | await super().initialize() 40 | self.llm_handle = await AanaDeploymentHandle.create("llm_deployment") 41 | 42 | 43 | async def run( 44 | self, media_id: MediaId, question: Question, sampling_params: SamplingParams 45 | ) -> AsyncGenerator[VideoChatEndpointOutput, None]: 46 | """Run the video chat endpoint.""" 47 | with get_session() as session: 48 | transcript_repo = ExtendedVideoTranscriptRepository(session) 49 | caption_repo = ExtendedVideoCaptionRepository(session) 50 | video_repo = ExtendedVideoRepository(session) 51 | 52 | # check to see if video already processed 53 | video_status = video_repo.get_status(media_id) 54 | if video_status != VideoProcessingStatus.COMPLETED: 55 | raise UnfinishedVideoException( 56 | media_id=media_id, 57 | status=video_status, 58 | message=f"The video data is not available, status: {video_status}", 59 | ) 60 | 61 | video_metadata = video_repo.get_metadata(media_id) 62 | 63 | transcription_output = transcript_repo.get_transcript( 64 | model_name=settings.asr_model_name, media_id=media_id 65 | ) 66 | 67 | captions_output = caption_repo.get_captions( 68 | model_name=settings.captioning_model_name, media_id=media_id 69 | ) 70 | 71 | timeline_output = generate_combined_timeline( 72 | transcription_segments=transcription_output["segments"], 73 | captions=captions_output["captions"], 74 | 
caption_timestamps=captions_output["timestamps"], 75 | ) 76 | timeline_json = json.dumps( 77 | timeline_output["timeline"], indent=4, separators=(",", ": ") 78 | ) 79 | 80 | dialog = generate_dialog( 81 | metadata=video_metadata, 82 | timeline=timeline_json, 83 | question=question, 84 | ) 85 | async for item in self.llm_handle.chat_stream( 86 | dialog=dialog, sampling_params=sampling_params 87 | ): 88 | yield {"completion": item["text"]} 89 | -------------------------------------------------------------------------------- /aana_chat_with_video/storage/repository/extended_video_caption.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy.orm import Session 2 | 3 | from aana.core.models.captions import Caption, CaptionsList 4 | from aana.core.models.media import MediaId 5 | from aana.storage.repository.base import BaseRepository 6 | from aana_chat_with_video.storage.models.extended_video_caption import ( 7 | ExtendedVideoCaptionEntity, 8 | ) 9 | 10 | 11 | class ExtendedVideoCaptionRepository(BaseRepository[ExtendedVideoCaptionEntity]): 12 | """Repository for Captions.""" 13 | 14 | def __init__(self, session: Session): 15 | """Constructor.""" 16 | super().__init__(session, ExtendedVideoCaptionEntity) 17 | 18 | def save( 19 | self, 20 | model_name: str, 21 | media_id: MediaId, 22 | caption: Caption, 23 | timestamp: float, 24 | frame_id: int, 25 | ): 26 | """Save a caption. 27 | 28 | Args: 29 | model_name (str): The name of the model used to generate the caption. 30 | media_id (MediaId): The media ID. 31 | caption (Caption): The caption. 32 | timestamp (float): The timestamp. 33 | frame_id (int): The frame ID. 34 | """ 35 | entity = ExtendedVideoCaptionEntity.from_caption_output( 36 | model_name=model_name, 37 | media_id=media_id, 38 | frame_id=frame_id, 39 | timestamp=timestamp, 40 | caption=caption, 41 | ) 42 | self.create(entity) 43 | return entity 44 | 45 | def save_all( 46 | self, 47 | model_name: str, 48 | media_id: MediaId, 49 | captions: CaptionsList, 50 | timestamps: list[float], 51 | frame_ids: list[int], 52 | ) -> list[ExtendedVideoCaptionEntity]: 53 | """Save captions. 54 | 55 | Args: 56 | model_name (str): The name of the model used to generate the captions. 57 | media_id (MediaId): the media ID of the video. 58 | captions (CaptionsList): The captions. 59 | timestamps (list[float]): The timestamps. 60 | frame_ids (list[int]): The frame IDs. 61 | 62 | Returns: 63 | list[ExtendedVideoCaptionEntity]: The list of caption entities. 64 | """ 65 | entities = [ 66 | ExtendedVideoCaptionEntity.from_caption_output( 67 | model_name=model_name, 68 | media_id=media_id, 69 | frame_id=frame_id, 70 | timestamp=timestamp, 71 | caption=caption, 72 | ) 73 | for caption, timestamp, frame_id in zip( 74 | captions, timestamps, frame_ids, strict=True 75 | ) 76 | ] 77 | results = self.create_multiple(entities) 78 | return results 79 | 80 | def get_captions(self, model_name: str, media_id: MediaId) -> dict: 81 | """Get the captions for a video. 82 | 83 | Args: 84 | model_name (str): The model name. 85 | media_id (MediaId): The media ID. 86 | 87 | Returns: 88 | dict: The dictionary with the captions, timestamps, and frame IDs. 
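
        Example (illustrative sketch; the model name and media id are placeholders):
            repo = ExtendedVideoCaptionRepository(session)
            output = repo.get_captions(model_name="blip2", media_id="my_video")
            # output["captions"][i], output["timestamps"][i], and output["frame_ids"][i]
            # describe the same frame; results are ordered by frame id.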
89 | """ 90 | entities: list[ExtendedVideoCaptionEntity] = ( 91 | self.session.query(self.model_class) 92 | .filter_by(media_id=media_id, model=model_name) 93 | .order_by(self.model_class.frame_id) 94 | .all() 95 | ) 96 | captions = [c.caption for c in entities] 97 | timestamps = [c.timestamp for c in entities] 98 | frame_ids = [c.frame_id for c in entities] 99 | return { 100 | "captions": captions, 101 | "timestamps": timestamps, 102 | "frame_ids": frame_ids, 103 | } 104 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 110 | .pdm.toml 111 | .pdm-python 112 | .pdm-build/ 113 | 114 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 115 | __pypackages__/ 116 | 117 | # Celery stuff 118 | celerybeat-schedule 119 | celerybeat.pid 120 | 121 | # SageMath parsed files 122 | *.sage.py 123 | 124 | # Environments 125 | .env 126 | .venv 127 | env/ 128 | venv/ 129 | ENV/ 130 | env.bak/ 131 | venv.bak/ 132 | 133 | # Spyder project settings 134 | .spyderproject 135 | .spyproject 136 | 137 | # Rope project settings 138 | .ropeproject 139 | 140 | # mkdocs documentation 141 | /site 142 | 143 | # mypy 144 | .mypy_cache/ 145 | .dmypy.json 146 | dmypy.json 147 | 148 | # Pyre type checker 149 | .pyre/ 150 | 151 | # pytype static type analyzer 152 | .pytype/ 153 | 154 | # Cython debug symbols 155 | cython_debug/ 156 | 157 | # PyCharm 158 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 159 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 160 | # and can be added to the global gitignore or merged into this file. For a more nuclear 161 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 162 | #.idea/ 163 | -------------------------------------------------------------------------------- /aana_chat_with_video/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = alembic 6 | 7 | # template used to generate migration file names; The default value is %%(rev)s_%%(slug)s 8 | # Uncomment the line below if you want the files to be prepended with date and time 9 | # see https://alembic.sqlalchemy.org/en/latest/tutorial.html#editing-the-ini-file 10 | # for all available tokens 11 | # file_template = %%(year)d_%%(month).2d_%%(day).2d_%%(hour).2d%%(minute).2d-%%(rev)s_%%(slug)s 12 | 13 | # sys.path path, will be prepended to sys.path if present. 14 | # defaults to the current working directory. 15 | prepend_sys_path = . 16 | 17 | # timezone to use when rendering the date within the migration file 18 | # as well as the filename. 19 | # If specified, requires the python-dateutil library that can be 20 | # installed by adding `alembic[tz]` to the pip requirements 21 | # string value is passed to dateutil.tz.gettz() 22 | # leave blank for localtime 23 | # timezone = 24 | 25 | # max length of characters to apply to the 26 | # "slug" field 27 | # truncate_slug_length = 40 28 | 29 | # set to 'true' to run the environment during 30 | # the 'revision' command, regardless of autogenerate 31 | # revision_environment = false 32 | 33 | # set to 'true' to allow .pyc and .pyo files without 34 | # a source .py file to be detected as revisions in the 35 | # versions/ directory 36 | # sourceless = false 37 | 38 | # version location specification; This defaults 39 | # to alembic/versions. When using multiple version 40 | # directories, initial revisions must be specified with --version-path. 41 | # The path separator used here should be the separator specified by "version_path_separator" below. 42 | # version_locations = %(here)s/bar:%(here)s/bat:alembic/versions 43 | 44 | # version path separator; As mentioned above, this is the character used to split 45 | # version_locations. The default within new alembic.ini files is "os", which uses os.pathsep. 46 | # If this key is omitted entirely, it falls back to the legacy behavior of splitting on spaces and/or commas. 
47 | # Valid values for version_path_separator are: 48 | # 49 | # version_path_separator = : 50 | # version_path_separator = ; 51 | # version_path_separator = space 52 | version_path_separator = os # Use os.pathsep. Default configuration used for new projects. 53 | 54 | # set to 'true' to search source files recursively 55 | # in each "version_locations" directory 56 | # new in Alembic version 1.10 57 | # recursive_version_locations = false 58 | 59 | # the output encoding used when revision files 60 | # are written from script.py.mako 61 | # output_encoding = utf-8 62 | 63 | # sqlalchemy.url = driver://user:pass@localhost/dbname 64 | 65 | 66 | [post_write_hooks] 67 | # post_write_hooks defines scripts or Python functions that are run 68 | # on newly generated revision scripts. See the documentation for further 69 | # detail and examples 70 | 71 | # format using "black" - use the console_scripts runner, against the "black" entrypoint 72 | # hooks = black 73 | # black.type = console_scripts 74 | # black.entrypoint = black 75 | # black.options = -l 79 REVISION_SCRIPT_FILENAME 76 | 77 | # lint with attempts to fix using "ruff" - use the exec runner, execute a binary 78 | hooks = ruff 79 | ruff.type = exec 80 | ruff.executable = ruff 81 | ruff.options = --fix REVISION_SCRIPT_FILENAME 82 | 83 | # Logging configuration 84 | [loggers] 85 | keys = root,sqlalchemy,alembic 86 | 87 | [handlers] 88 | keys = console 89 | 90 | [formatters] 91 | keys = generic 92 | 93 | [logger_root] 94 | level = WARN 95 | handlers = console 96 | qualname = 97 | 98 | [logger_sqlalchemy] 99 | level = WARN 100 | handlers = 101 | qualname = sqlalchemy.engine 102 | 103 | [logger_alembic] 104 | level = INFO 105 | handlers = 106 | qualname = alembic 107 | 108 | [handler_console] 109 | class = StreamHandler 110 | args = (sys.stderr,) 111 | level = NOTSET 112 | formatter = generic 113 | 114 | [formatter_generic] 115 | format = %(levelname)-5.5s [%(name)s] %(message)s 116 | datefmt = %H:%M:%S 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Chat with Video App 2 | 3 | **Chat with Video App** is a multimodal chat application that allows users to upload a video and ask questions about the video content based on the visual and audio information. See [Chat with Video Demo notebook](notebooks/chat_with_video_demo.ipynb) for more information. 4 | 5 | ## Installation 6 | 7 | To install the project, follow these steps: 8 | 9 | 1. Clone the repository. 10 | 11 | 2. Install additional libraries. 12 | 13 | For optimal performance, you should also install [PyTorch](https://pytorch.org/get-started/locally/) version >=2.1 appropriate for your system. You can continue directly to the next step, but it will install a default version that may not make optimal use of your system's resources, for example, a GPU or even some SIMD operations. Therefore, we recommend choosing your PyTorch package carefully and installing it manually. 14 | 15 | Some models use Flash Attention. Install the Flash Attention library for better performance. See the [Flash Attention installation instructions](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features) for more details and supported GPUs. 16 | 17 | 3. Install the package with Poetry. 18 | 19 | The project is managed with [Poetry](https://python-poetry.org/docs/).
See the [Poetry installation instructions](https://python-poetry.org/docs/#installation) to learn how to install it on your system. 20 | 21 | It will install the package and all dependencies in a virtual environment. 22 | 23 | ```bash 24 | poetry install 25 | ``` 26 | 27 | 4. Run the app. 28 | 29 | ```bash 30 | CUDA_VISIBLE_DEVICES="0" aana deploy aana_chat_with_video.app:aana_app 31 | ``` 32 | 33 | ## Usage 34 | 35 | To use the project, follow these steps: 36 | 37 | 1. Run the app as described in the installation section. 38 | 39 | ```bash 40 | CUDA_VISIBLE_DEVICES="0" aana deploy aana_chat_with_video.app:aana_app 41 | ``` 42 | 43 | Once the application is running, you will see the message `Deployed successfully.` in the logs. It will also show the URL for the API documentation. 44 | 45 | > **⚠️ Warning** 46 | > 47 | > The application requires one large GPU with at least 48 GB of memory to run. 48 | > 49 | > The application will detect the available GPU automatically, but you need to make sure that `CUDA_VISIBLE_DEVICES` is set correctly. 50 | > 51 | > Sometimes `CUDA_VISIBLE_DEVICES` is set to an empty string, and the application will not be able to detect the GPU. Use `unset CUDA_VISIBLE_DEVICES` to unset the variable. 52 | > 53 | > You can also set the `CUDA_VISIBLE_DEVICES` environment variable to the GPU index you want to use: `export CUDA_VISIBLE_DEVICES=0`. 54 | 55 | 2. Send a POST request to the app. 56 | 57 | See [Chat with Video Demo notebook](notebooks/chat_with_video_demo.ipynb) for more information. A minimal request sketch is also provided in the [Example request](#example-request) section below. 58 | 59 | ## Running with Docker 60 | 61 | We provide a docker-compose configuration to run the application in a Docker container. 62 | 63 | Requirements: 64 | 65 | - Docker Engine >= 26.1.0 66 | - Docker Compose >= 1.29.2 67 | - NVIDIA Driver >= 525.60.13 68 | 69 | To run the application, simply run the following command: 70 | 71 | ```bash 72 | docker-compose up 73 | ``` 74 | 75 | The application will be accessible at `http://localhost:8000` on the host server. 76 | 77 | 78 | > **⚠️ Warning** 79 | > 80 | > The application requires one GPU to run. 81 | > 82 | > The application will detect the available GPU automatically, but you need to make sure that `CUDA_VISIBLE_DEVICES` is set correctly. 83 | > 84 | > Sometimes `CUDA_VISIBLE_DEVICES` is set to an empty string, and the application will not be able to detect the GPU. Use `unset CUDA_VISIBLE_DEVICES` to unset the variable. 85 | > 86 | > You can also set the `CUDA_VISIBLE_DEVICES` environment variable to the GPU index you want to use: `CUDA_VISIBLE_DEVICES=0 docker-compose up`. 87 | 88 | 89 | > **💡 Tip** 90 | > 91 | > Some models use Flash Attention for better performance. You can set the build argument `INSTALL_FLASH_ATTENTION` to `true` to install Flash Attention. 92 | > 93 | > ```bash 94 | > INSTALL_FLASH_ATTENTION=true docker-compose build 95 | > ``` 96 | > 97 | > After building the image, you can use the `docker-compose up` command to run the application. 98 | > 99 | > You can also set the `INSTALL_FLASH_ATTENTION` environment variable to `true` in the `docker-compose.yaml` file.
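
## Example request

The endpoint paths below are taken from the test suite (`/video/index_stream`, `/video/chat_stream`), and the request encoding assumes the Aana SDK convention of a JSON-encoded `body` form field. This is a minimal illustrative sketch, not the authoritative client; check the API documentation URL printed at startup for the exact schema.

```python
import json

import requests

BASE_URL = "http://localhost:8000"

# Index a video by URL. This is a streaming endpoint, so results arrive chunk by chunk.
payload = {
    "video": {
        "url": "https://mobius-public.s3.eu-west-1.amazonaws.com/squirrel.mp4",
        "media_id": "squirrel.mp4",
    },
    "whisper_params": {"temperature": 0.0},
}
with requests.post(
    f"{BASE_URL}/video/index_stream",
    data={"body": json.dumps(payload)},
    stream=True,
) as response:
    for chunk in response.iter_content(chunk_size=None):
        print(chunk.decode(), end="", flush=True)

# Once indexing has completed, chat with the video (also a streaming endpoint).
payload = {"media_id": "squirrel.mp4", "question": "Summarize the video"}
with requests.post(
    f"{BASE_URL}/video/chat_stream",
    data={"body": json.dumps(payload)},
    stream=True,
) as response:
    for chunk in response.iter_content(chunk_size=None):
        print(chunk.decode(), end="", flush=True)
```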
100 | -------------------------------------------------------------------------------- /aana_chat_with_video/tests/test_extended_video_caption_repo.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: S101 2 | 3 | import random 4 | import uuid 5 | 6 | import pytest 7 | 8 | from aana.core.models.captions import Caption 9 | from aana.exceptions.db import NotFoundException 10 | from aana_chat_with_video.storage.repository.extended_video_caption import ( 11 | ExtendedVideoCaptionRepository, 12 | ) 13 | 14 | 15 | @pytest.fixture(scope="function") 16 | def dummy_caption(): 17 | """Creates a dummy caption for testing.""" 18 | caption = Caption(f"This is a caption {uuid.uuid4()}") 19 | frame_id = random.randint(0, 100) # noqa: S311 20 | timestamp = random.random() # noqa: S311 21 | return caption, frame_id, timestamp 22 | 23 | 24 | def test_save_caption(db_session, dummy_caption): 25 | """Tests saving a caption.""" 26 | caption, frame_id, timestamp = dummy_caption 27 | model_name = "blip2" 28 | media_id = "test_media_id" 29 | 30 | caption_repo = ExtendedVideoCaptionRepository(db_session) 31 | caption_entity = caption_repo.save( 32 | model_name=model_name, 33 | media_id=media_id, 34 | caption=caption, 35 | frame_id=frame_id, 36 | timestamp=timestamp, 37 | ) 38 | caption_id = caption_entity.id 39 | 40 | caption_entity = caption_repo.read(caption_id) 41 | assert caption_entity.model == model_name 42 | assert caption_entity.media_id == media_id 43 | assert caption_entity.frame_id == frame_id 44 | assert caption_entity.timestamp == timestamp 45 | assert caption_entity.caption == caption 46 | 47 | caption_repo.delete(caption_id) 48 | with pytest.raises(NotFoundException): 49 | caption_repo.read(caption_id) 50 | 51 | 52 | def test_save_all_captions(db_session, dummy_caption): 53 | """Tests saving all captions.""" 54 | captions, frame_ids, timestamps = [], [], [] 55 | for _ in range(3): 56 | caption, frame_id, timestamp = dummy_caption 57 | captions.append(caption) 58 | frame_ids.append(frame_id) 59 | timestamps.append(timestamp) 60 | model_name = "blip2" 61 | media_id = "test_media_id_all" 62 | 63 | caption_repo = ExtendedVideoCaptionRepository(db_session) 64 | caption_entities = caption_repo.save_all( 65 | model_name=model_name, 66 | media_id=media_id, 67 | captions=captions, 68 | timestamps=timestamps, 69 | frame_ids=frame_ids, 70 | ) 71 | assert len(caption_entities) == len(captions) 72 | 73 | caption_ids = [caption_entity.id for caption_entity in caption_entities] 74 | for caption_id, caption, frame_id, timestamp in zip( 75 | caption_ids, captions, frame_ids, timestamps, strict=True 76 | ): 77 | caption_entity = caption_repo.read(caption_id) 78 | 79 | assert caption_entity.model == model_name 80 | assert caption_entity.media_id == media_id 81 | assert caption_entity.frame_id == frame_id 82 | assert caption_entity.timestamp == timestamp 83 | assert caption_entity.caption == caption 84 | 85 | # delete all captions 86 | for caption_id in caption_ids: 87 | caption_repo.delete(caption_id) 88 | with pytest.raises(NotFoundException): 89 | caption_repo.read(caption_id) 90 | 91 | 92 | def test_get_captions(db_session, dummy_caption): 93 | """Tests getting all captions.""" 94 | captions, frame_ids, timestamps = [], [], [] 95 | for _ in range(3): 96 | caption, frame_id, timestamp = dummy_caption 97 | captions.append(caption) 98 | frame_ids.append(frame_id) 99 | timestamps.append(timestamp) 100 | model_name = "blip2" 101 | media_id = "test_media_id_get_captions" 
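    # Note (editorial): `dummy_caption` is a function-scoped fixture, so it is
    # resolved once per test and the loop above appends the same
    # (caption, frame_id, timestamp) triple three times; the assertions below
    # still hold because all three stored rows are identical.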
102 | 103 | caption_repo = ExtendedVideoCaptionRepository(db_session) 104 | caption_entities = caption_repo.save_all( 105 | model_name=model_name, 106 | media_id=media_id, 107 | captions=captions, 108 | timestamps=timestamps, 109 | frame_ids=frame_ids, 110 | ) 111 | assert len(caption_entities) == len(captions) 112 | 113 | saved_captions = caption_repo.get_captions(model_name, media_id) 114 | 115 | assert saved_captions["captions"] == captions 116 | assert saved_captions["frame_ids"] == frame_ids 117 | assert saved_captions["timestamps"] == timestamps 118 | 119 | # delete all captions 120 | for caption_entity in caption_entities: 121 | caption_repo.delete(caption_entity.id) 122 | with pytest.raises(NotFoundException): 123 | caption_repo.read(caption_entity.id) 124 | -------------------------------------------------------------------------------- /aana_chat_with_video/tests/test_app.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: S101 2 | # Test chat with video endpoints. 3 | 4 | from importlib import resources 5 | 6 | import pytest 7 | 8 | from aana.tests.utils import is_gpu_available 9 | 10 | VIDEO_INDEX_ENDPOINT = "/video/index_stream" 11 | VIDEO_METADATA_ENDPOINT = "/video/metadata" 12 | VIDEO_CHAT_ENDPOINT = "/video/chat_stream" 13 | VIDEO_STATUS_ENDPOINT = "/video/status" 14 | VIDEO_DELETE_ENDPOINT = "/video/delete" 15 | 16 | 17 | @pytest.mark.skipif( 18 | not is_gpu_available(), 19 | reason="GPU is not available", 20 | ) 21 | @pytest.mark.parametrize( 22 | "video, whisper_params", 23 | [ 24 | ( 25 | { 26 | "url": "https://mobius-public.s3.eu-west-1.amazonaws.com/squirrel.mp4", 27 | "media_id": "squirrel.mp4", 28 | }, 29 | {"temperature": 0.0}, 30 | ), 31 | ( 32 | { 33 | "path": str( 34 | resources.path("aana.tests.files.videos", "physicsworks.webm") 35 | ), 36 | "media_id": "physicsworks.webm", 37 | }, 38 | {"temperature": 0.0}, 39 | ), 40 | ( 41 | { 42 | "path": str( 43 | resources.path("aana.tests.files.videos", "physicsworks_audio.webm") 44 | ), 45 | "media_id": "physicsworks_audio.webm", 46 | }, 47 | {"temperature": 0.0}, 48 | ), 49 | ], 50 | ) 51 | def test_chat_with_video(call_endpoint, video, whisper_params): 52 | """Test chat with video endpoint.""" 53 | media_id = video["media_id"] 54 | 55 | call_endpoint( 56 | VIDEO_INDEX_ENDPOINT, 57 | {"video": video, "whisper_params": whisper_params}, 58 | ) 59 | 60 | # if we try to index the same video again, we should get a MediaIdAlreadyExistsException error 61 | call_endpoint( 62 | VIDEO_INDEX_ENDPOINT, 63 | {"video": video, "whisper_params": whisper_params}, 64 | expected_error="MediaIdAlreadyExistsException", 65 | ) 66 | 67 | # load video metadata 68 | call_endpoint( 69 | VIDEO_METADATA_ENDPOINT, 70 | {"media_id": media_id}, 71 | ) 72 | 73 | # get video status 74 | call_endpoint( 75 | VIDEO_STATUS_ENDPOINT, 76 | {"media_id": media_id}, 77 | ) 78 | 79 | # delete video 80 | call_endpoint( 81 | VIDEO_DELETE_ENDPOINT, 82 | {"media_id": media_id}, 83 | ) 84 | 85 | # getting the video status should now fail 86 | call_endpoint( 87 | VIDEO_STATUS_ENDPOINT, 88 | {"media_id": media_id}, 89 | expected_error="NotFoundException", 90 | ) 91 | 92 | # after deleting the video, its metadata should not be available 93 | call_endpoint( 94 | VIDEO_METADATA_ENDPOINT, 95 | {"media_id": media_id}, 96 | expected_error="NotFoundException", 97 | ) 98 | 99 | # after deleting the video, we should be able to index it again 100 | call_endpoint( 101 | VIDEO_INDEX_ENDPOINT, 102 | {"video": video, "whisper_params": whisper_params},
103 | ) 104 | 105 | # load video metadata 106 | call_endpoint( 107 | VIDEO_METADATA_ENDPOINT, 108 | {"media_id": media_id}, 109 | ) 110 | 111 | # chat with video 112 | question = "Summarize the video" 113 | 114 | call_endpoint( 115 | VIDEO_CHAT_ENDPOINT, 116 | {"media_id": media_id, "question": question}, 117 | ) 118 | 119 | # delete video 120 | call_endpoint( 121 | VIDEO_DELETE_ENDPOINT, 122 | {"media_id": media_id}, 123 | ) 124 | 125 | # after deleting the video, we should not be able to chat with it 126 | call_endpoint( 127 | VIDEO_CHAT_ENDPOINT, 128 | {"media_id": media_id, "question": question}, 129 | expected_error="NotFoundException", 130 | ) 131 | 132 | 133 | @pytest.mark.skipif( 134 | not is_gpu_available(), 135 | reason="GPU is not available", 136 | ) 137 | @pytest.mark.parametrize( 138 | "endpoint, data", 139 | [ 140 | (VIDEO_METADATA_ENDPOINT, {}), 141 | (VIDEO_CHAT_ENDPOINT, {}), 142 | (VIDEO_CHAT_ENDPOINT, {"media_id": "squirrel.mp4"}), 143 | (VIDEO_CHAT_ENDPOINT, {"question": "Summarize the video"}), 144 | (VIDEO_INDEX_ENDPOINT, {}), 145 | (VIDEO_DELETE_ENDPOINT, {}), 146 | ], 147 | ) 148 | def test_missing_params(call_endpoint, endpoint, data): 149 | """Test missing params.""" 150 | call_endpoint( 151 | endpoint, 152 | data, 153 | expected_error="ValidationError", 154 | ) 155 | -------------------------------------------------------------------------------- /aana_chat_with_video/utils/core.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from math import floor 3 | 4 | from aana.core.models.asr import AsrSegments 5 | from aana.core.models.chat import ChatDialog, ChatMessage, Question 6 | from aana.core.models.video import VideoMetadata 7 | 8 | 9 | def generate_dialog( 10 | metadata: VideoMetadata, 11 | timeline: str, 12 | question: Question, 13 | ) -> ChatDialog: 14 | """Generates a dialog from the metadata and timeline of a video. 15 | 16 | Args: 17 | metadata (VideoMetadata): the metadata of the video 18 | timeline (str): the timeline of the video 19 | question (Question): the question to ask 20 | 21 | Returns: 22 | ChatDialog: the generated dialog 23 | """ 24 | system_prompt_preamble = """You are a helpful, respectful, and honest assistant. Always answer as helpfully as possible, while ensuring safety. You will be provided with a script in JSON format for a video containing information from visual captions and audio transcripts. Each entry in the script follows the format: 25 | 26 | {{ 27 | "start_time": "start_time_in_seconds", 28 | "end_time": "end_time_in_seconds", 29 | "audio_transcript": "the_transcript_from_automatic_speech_recognition_system", 30 | "visual_caption": "the_caption_of_the_visuals_using_computer_vision_system" 31 | }} 32 | Note that the audio_transcript can sometimes be empty. 33 | 34 | Ensure you do not introduce any new named entities in your output and maintain the utmost factual accuracy in your responses. 35 | 36 | In addition, you will be provided with the extracted title of the video. 37 | """ 38 | instruction = ( 39 | "Provide a short and concise answer to the following user's question. " 40 | "Avoid mentioning any details about the script in JSON format. " 41 | "For example, a good response would be: 'Based on the analysis, " 42 | "here are the most relevant/useful/aesthetic moments.' " 43 | "A less effective response would be: " 44 | "'Based on the provided visual caption/audio transcript, " 45 | "here are the most relevant/useful/aesthetic moments.' 
" 46 | ) 47 | 48 | user_prompt_template = ( 49 | "{instruction}" 50 | "Given the timeline of audio and visual activities in the video below, " 51 | "I want to find out the following: {question}\n" 52 | "The timeline is: " 53 | "{timeline}" 54 | "\n" 55 | "The title of the video is {video_title}" 56 | ) 57 | 58 | messages = [] 59 | messages.append(ChatMessage(content=system_prompt_preamble, role="system")) 60 | messages.append( 61 | ChatMessage( 62 | content=user_prompt_template.format( 63 | instruction=instruction, 64 | question=question, 65 | timeline=timeline, 66 | video_title=metadata.title, 67 | ), 68 | role="user", 69 | ) 70 | ) 71 | 72 | dialog = ChatDialog(messages=messages) 73 | return dialog 74 | 75 | 76 | def generate_combined_timeline( 77 | transcription_segments: AsrSegments, 78 | captions: list[str], 79 | caption_timestamps: list[float], 80 | chunk_size: float = 10.0, 81 | ): 82 | """Generates a combined timeline from the ASR segments and the captions. 83 | 84 | Args: 85 | transcription_segments (AsrSegments): the ASR segments 86 | captions (list[str]): the captions 87 | caption_timestamps (list[float]): the timestamps for the captions 88 | chunk_size (float, optional): the chunk size for the combined timeline in seconds. Defaults to 10.0. 89 | 90 | Returns: 91 | dict: dictionary containing one key, "timeline", which is a list of dictionaries with the following keys: 92 | "start_time": the start time of the chunk in seconds 93 | "end_time": the end time of the chunk in seconds 94 | "audio_transcript": the audio transcript for the chunk 95 | "visual_caption": the visual caption for the chunk 96 | """ 97 | timeline_dict: defaultdict[int, dict[str, list[str]]] = defaultdict( 98 | lambda: {"transcription": [], "captions": []} 99 | ) 100 | for segment in transcription_segments: 101 | segment_start = segment.time_interval.start 102 | chunk_index = floor(segment_start / chunk_size) 103 | timeline_dict[chunk_index]["transcription"].append(segment.text) 104 | 105 | if len(captions) != len(caption_timestamps): 106 | raise ValueError( # noqa: TRY003 107 | f"Length of captions ({len(captions)}) and timestamps ({len(caption_timestamps)}) do not match" 108 | ) 109 | 110 | for timestamp, caption in zip(caption_timestamps, captions, strict=True): 111 | chunk_index = floor(timestamp / chunk_size) 112 | timeline_dict[chunk_index]["captions"].append(caption) 113 | 114 | num_chunks = max(timeline_dict.keys(), default=-1) + 1 # default=-1 yields an empty timeline if there are no segments or captions 115 | 116 | timeline = [ 117 | { 118 | "start_time": chunk_index * chunk_size, 119 | "end_time": (chunk_index + 1) * chunk_size, 120 | "audio_transcript": "\n".join(timeline_dict[chunk_index]["transcription"]), 121 | "visual_caption": "\n".join(timeline_dict[chunk_index]["captions"]), 122 | } 123 | for chunk_index in range(num_chunks) 124 | ] 125 | 126 | return { 127 | "timeline": timeline, 128 | } 129 | -------------------------------------------------------------------------------- /aana_chat_with_video/alembic/versions/5ad873484aa3_init.py: -------------------------------------------------------------------------------- 1 | """init. 2 | 3 | Revision ID: 5ad873484aa3 4 | Revises: 5 | Create Date: 2024-07-25 13:09:44.450321 6 | 7 | """ 8 | from collections.abc import Sequence 9 | 10 | import sqlalchemy as sa 11 | from alembic import op 12 | 13 | # revision identifiers, used by Alembic.
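# Note (editorial): Alembic chains migrations through these identifiers.
# `down_revision = None` marks this script as the root of the migration
# history; each later revision points to its parent via `down_revision`,
# and `alembic upgrade head` walks the chain forward from the current state.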
14 | revision: str = "5ad873484aa3" 15 | down_revision: str | None = None 16 | branch_labels: str | Sequence[str] | None = None 17 | depends_on: str | Sequence[str] | None = None 18 | 19 | 20 | def upgrade() -> None: 21 | """Upgrade database to this revision from previous.""" 22 | # ### commands auto generated by Alembic - please adjust! ### 23 | op.create_table( 24 | "caption", 25 | sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), 26 | sa.Column( 27 | "model", 28 | sa.String(), 29 | nullable=False, 30 | comment="Name of model used to generate the caption", 31 | ), 32 | sa.Column( 33 | "frame_id", 34 | sa.Integer(), 35 | nullable=False, 36 | comment="The 0-based frame id of video for caption", 37 | ), 38 | sa.Column("caption", sa.String(), nullable=False, comment="Frame caption"), 39 | sa.Column( 40 | "timestamp", 41 | sa.Float(), 42 | nullable=False, 43 | comment="Frame timestamp in seconds", 44 | ), 45 | sa.Column( 46 | "caption_type", sa.String(), nullable=False, comment="The type of caption" 47 | ), 48 | sa.Column( 49 | "created_at", 50 | sa.DateTime(timezone=True), 51 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 52 | nullable=False, 53 | comment="Timestamp when row is inserted", 54 | ), 55 | sa.Column( 56 | "updated_at", 57 | sa.DateTime(timezone=True), 58 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 59 | nullable=False, 60 | comment="Timestamp when row is updated", 61 | ), 62 | sa.PrimaryKeyConstraint("id", name=op.f("pk_caption")), 63 | ) 64 | op.create_table( 65 | "media", 66 | sa.Column( 67 | "id", 68 | sa.String(length=36), 69 | nullable=False, 70 | comment="Unique identifier for the media", 71 | ), 72 | sa.Column( 73 | "media_type", sa.String(), nullable=False, comment="The type of media" 74 | ), 75 | sa.Column( 76 | "created_at", 77 | sa.DateTime(timezone=True), 78 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 79 | nullable=False, 80 | comment="Timestamp when row is inserted", 81 | ), 82 | sa.Column( 83 | "updated_at", 84 | sa.DateTime(timezone=True), 85 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 86 | nullable=False, 87 | comment="Timestamp when row is updated", 88 | ), 89 | sa.PrimaryKeyConstraint("id", name=op.f("pk_media")), 90 | ) 91 | op.create_table( 92 | "tasks", 93 | sa.Column("id", sa.UUID(), nullable=False, comment="Task ID"), 94 | sa.Column( 95 | "endpoint", 96 | sa.String(), 97 | nullable=False, 98 | comment="The endpoint to which the task is assigned", 99 | ), 100 | sa.Column("data", sa.PickleType(), nullable=False, comment="Data for the task"), 101 | sa.Column( 102 | "status", 103 | sa.Enum( 104 | "CREATED", 105 | "ASSIGNED", 106 | "COMPLETED", 107 | "RUNNING", 108 | "FAILED", 109 | "NOT_FINISHED", 110 | name="status", 111 | ), 112 | nullable=False, 113 | comment="Status of the task", 114 | ), 115 | sa.Column( 116 | "priority", 117 | sa.Integer(), 118 | nullable=False, 119 | comment="Priority of the task (0 is the lowest)", 120 | ), 121 | sa.Column( 122 | "assigned_at", 123 | sa.DateTime(timezone=True), 124 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 125 | nullable=True, 126 | comment="Timestamp when the task was assigned", 127 | ), 128 | sa.Column( 129 | "completed_at", 130 | sa.DateTime(timezone=True), 131 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 132 | nullable=True, 133 | comment="Timestamp when the task was completed", 134 | ), 135 | sa.Column( 136 | "progress", 137 | sa.Float(), 138 | nullable=False, 139 | comment="Progress of the task in percentage", 140 | ), 141 | sa.Column( 142 | "result", 143 | sa.JSON(), 
144 | nullable=True, 145 | comment="Result of the task in JSON format", 146 | ), 147 | sa.Column( 148 | "created_at", 149 | sa.DateTime(timezone=True), 150 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 151 | nullable=False, 152 | comment="Timestamp when row is inserted", 153 | ), 154 | sa.Column( 155 | "updated_at", 156 | sa.DateTime(timezone=True), 157 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 158 | nullable=False, 159 | comment="Timestamp when row is updated", 160 | ), 161 | sa.PrimaryKeyConstraint("id", name=op.f("pk_tasks")), 162 | ) 163 | op.create_table( 164 | "transcript", 165 | sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), 166 | sa.Column( 167 | "model", 168 | sa.String(), 169 | nullable=False, 170 | comment="Name of model used to generate transcript", 171 | ), 172 | sa.Column( 173 | "transcript", 174 | sa.String(), 175 | nullable=False, 176 | comment="Full text transcript of media", 177 | ), 178 | sa.Column( 179 | "segments", sa.JSON(), nullable=False, comment="Segments of the transcript" 180 | ), 181 | sa.Column( 182 | "language", 183 | sa.String(), 184 | nullable=False, 185 | comment="Language of the transcript as predicted by model", 186 | ), 187 | sa.Column( 188 | "language_confidence", 189 | sa.Float(), 190 | nullable=False, 191 | comment="Confidence score of language prediction", 192 | ), 193 | sa.Column( 194 | "transcript_type", 195 | sa.String(), 196 | nullable=False, 197 | comment="The type of transcript", 198 | ), 199 | sa.Column( 200 | "created_at", 201 | sa.DateTime(timezone=True), 202 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 203 | nullable=False, 204 | comment="Timestamp when row is inserted", 205 | ), 206 | sa.Column( 207 | "updated_at", 208 | sa.DateTime(timezone=True), 209 | server_default=sa.text("(CURRENT_TIMESTAMP)"), 210 | nullable=False, 211 | comment="Timestamp when row is updated", 212 | ), 213 | sa.PrimaryKeyConstraint("id", name=op.f("pk_transcript")), 214 | ) 215 | op.create_table( 216 | "video", 217 | sa.Column("id", sa.String(length=36), nullable=False), 218 | sa.Column("path", sa.String(), nullable=True, comment="Path"), 219 | sa.Column("url", sa.String(), nullable=True, comment="URL"), 220 | sa.Column("title", sa.String(), nullable=True, comment="Title"), 221 | sa.Column("description", sa.String(), nullable=True, comment="Description"), 222 | sa.ForeignKeyConstraint(["id"], ["media.id"], name=op.f("fk_video_id_media")), 223 | sa.PrimaryKeyConstraint("id", name=op.f("pk_video")), 224 | ) 225 | # ### end Alembic commands ### 226 | 227 | 228 | def downgrade() -> None: 229 | """Downgrade database from this revision to previous.""" 230 | # ### commands auto generated by Alembic - please adjust! 
### 231 | op.drop_table("video") 232 | op.drop_table("transcript") 233 | op.drop_table("tasks") 234 | op.drop_table("media") 235 | op.drop_table("caption") 236 | # ### end Alembic commands ### 237 | -------------------------------------------------------------------------------- /aana_chat_with_video/endpoints/index_video.py: -------------------------------------------------------------------------------- 1 | from collections.abc import AsyncGenerator 2 | from typing import TYPE_CHECKING, Annotated, TypedDict 3 | import asyncio 4 | from aana.storage.session import get_session 5 | from pydantic import Field 6 | 7 | from aana.api.api_generation import Endpoint 8 | from aana.core.models.asr import ( 9 | AsrSegments, 10 | AsrTranscription, 11 | AsrTranscriptionInfo, 12 | ) 13 | from aana.core.models.media import MediaId 14 | from aana.core.models.vad import VadParams 15 | from aana.core.models.video import VideoInput, VideoMetadata, VideoParams 16 | from aana.core.models.whisper import BatchedWhisperParams 17 | from aana.core.models.image_chat import ImageChatDialog 18 | 19 | from aana.deployments.aana_deployment_handle import AanaDeploymentHandle 20 | from aana.exceptions.db import MediaIdAlreadyExistsException 21 | from aana.exceptions.io import VideoTooLongException 22 | from aana.integrations.external.decord import generate_frames, get_video_duration 23 | from aana.integrations.external.yt_dlp import download_video, get_video_metadata 24 | from aana.processors.remote import run_remote 25 | from aana.processors.video import extract_audio 26 | from aana_chat_with_video.configs.settings import settings 27 | from aana_chat_with_video.storage.models.extended_video import VideoProcessingStatus 28 | from aana_chat_with_video.storage.repository.extended_video import ( 29 | ExtendedVideoRepository, 30 | ) 31 | from aana_chat_with_video.storage.repository.extended_video_caption import ( 32 | ExtendedVideoCaptionRepository, 33 | ) 34 | from aana_chat_with_video.storage.repository.extended_video_transcript import ( 35 | ExtendedVideoTranscriptRepository, 36 | ) 37 | 38 | if TYPE_CHECKING: 39 | from aana.core.models.audio import Audio 40 | from aana.core.models.video import Video 41 | 42 | 43 | class IndexVideoOutput(TypedDict): 44 | """The output of the transcribe video endpoint.""" 45 | 46 | media_id: MediaId 47 | metadata: VideoMetadata 48 | transcription: AsrTranscription 49 | transcription_info: AsrTranscriptionInfo 50 | segments: AsrSegments 51 | 52 | captions: Annotated[list[str], Field(..., description="Captions")] 53 | timestamps: Annotated[ 54 | list[float], Field(..., description="Timestamps for each caption in seconds") 55 | ] 56 | 57 | transcription_id: Annotated[int, Field(..., description="Transcription Id")] 58 | caption_ids: Annotated[list[int], Field(..., description="Caption Ids")] 59 | 60 | 61 | class IndexVideoEndpoint(Endpoint): 62 | """Transcribe video in chunks endpoint.""" 63 | 64 | async def initialize(self): 65 | """Initialize the endpoint.""" 66 | await super().initialize() 67 | self.asr_handle = await AanaDeploymentHandle.create("asr_deployment") 68 | self.vad_handle = await AanaDeploymentHandle.create("vad_deployment") 69 | self.captioning_handle = await AanaDeploymentHandle.create( 70 | "captioning_deployment" 71 | ) 72 | 73 | async def run( # noqa: C901 74 | self, 75 | video: VideoInput, 76 | video_params: VideoParams, 77 | whisper_params: BatchedWhisperParams, 78 | vad_params: VadParams, 79 | ) -> AsyncGenerator[IndexVideoOutput, None]: 80 | """Transcribe video in 
chunks.""" 81 | media_id = video.media_id 82 | with get_session() as session: 83 | if ExtendedVideoRepository(session).check_media_exists(media_id): 84 | raise MediaIdAlreadyExistsException(table_name="media", media_id=media_id) 85 | 86 | video_duration = None 87 | if video.url is not None: 88 | video_metadata = get_video_metadata(video.url) 89 | video_duration = video_metadata.duration 90 | 91 | # precheck the max video length before actually downloading the video, if possible 92 | if video_duration and video_duration > settings.max_video_len: 93 | raise VideoTooLongException( 94 | video=video, 95 | video_len=video_duration, 96 | max_len=settings.max_video_len, 97 | ) 98 | 99 | video_obj: Video = await run_remote(download_video)(video_input=video) 100 | if video_duration is None: 101 | video_duration = await run_remote(get_video_duration)(video=video_obj) 102 | 103 | if video_duration > settings.max_video_len: 104 | raise VideoTooLongException( 105 | video=video_obj, 106 | video_len=video_duration, 107 | max_len=settings.max_video_len, 108 | ) 109 | 110 | with get_session() as session: 111 | ExtendedVideoRepository(session).save(video=video_obj, duration=video_duration) 112 | 113 | yield { 114 | "media_id": media_id, 115 | "metadata": VideoMetadata( 116 | title=video_obj.title, 117 | description=video_obj.description, 118 | duration=video_duration, 119 | ), 120 | } 121 | 122 | try: 123 | with get_session() as session: 124 | ExtendedVideoRepository(session).update_status( 125 | media_id, VideoProcessingStatus.RUNNING 126 | ) 127 | audio: Audio = extract_audio(video=video_obj) 128 | 129 | # TODO: Update once batched whisper PR is merged 130 | # vad_output = await self.vad_handle.asr_preprocess_vad( 131 | # audio=audio, params=vad_params 132 | # ) 133 | # vad_segments = vad_output["segments"] 134 | 135 | transcription_list = [] 136 | segments_list = [] 137 | transcription_info_list = [] 138 | async for whisper_output in self.asr_handle.transcribe_stream( 139 | audio=audio, params=whisper_params 140 | ): 141 | transcription_list.append(whisper_output["transcription"]) 142 | segments_list.append(whisper_output["segments"]) 143 | transcription_info_list.append(whisper_output["transcription_info"]) 144 | yield { 145 | "transcription": whisper_output["transcription"], 146 | "segments": whisper_output["segments"], 147 | "transcription_info": whisper_output["transcription_info"], 148 | } 149 | transcription = sum(transcription_list, AsrTranscription()) 150 | segments = sum(segments_list, AsrSegments()) 151 | transcription_info = sum(transcription_info_list, AsrTranscriptionInfo()) 152 | 153 | captions = [] 154 | timestamps = [] 155 | frame_ids = [] 156 | 157 | async for frames_dict in run_remote(generate_frames)( 158 | video=video_obj, params=video_params 159 | ): 160 | if len(frames_dict["frames"]) == 0: 161 | break 162 | 163 | timestamps.extend(frames_dict["timestamps"]) 164 | frame_ids.extend(frames_dict["frame_ids"]) 165 | captioning_output = await self.captioning_handle.generate_batch( 166 | images=frames_dict["frames"] 167 | ) 168 | captions.extend(captioning_output["captions"]) 169 | 170 | yield { 171 | "captions": captioning_output["captions"], 172 | "timestamps": frames_dict["timestamps"], 173 | } 174 | 175 | with get_session() as session: 176 | transcription_entity = ExtendedVideoTranscriptRepository(session).save( 177 | model_name=settings.asr_model_name, 178 | media_id=video_obj.media_id, 179 | transcription=transcription, 180 | segments=segments, 181 | transcription_info=transcription_info,
182 | ) 183 | 184 | caption_entities = ExtendedVideoCaptionRepository(session).save_all( 185 | model_name=settings.captioning_model_name, 186 | media_id=video_obj.media_id, 187 | captions=captions, 188 | timestamps=timestamps, 189 | frame_ids=frame_ids, 190 | ) 191 | 192 | yield { 193 | "transcription_id": transcription_entity.id, 194 | "caption_ids": [c.id for c in caption_entities], 195 | } 196 | except BaseException: 197 | with get_session() as session: 198 | ExtendedVideoRepository(session).update_status( 199 | media_id, VideoProcessingStatus.FAILED 200 | ) 201 | raise 202 | else: 203 | with get_session() as session: 204 | ExtendedVideoRepository(session).update_status( 205 | media_id, VideoProcessingStatus.COMPLETED 206 | ) 207 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------