├── .dockerignore ├── .github └── workflows │ └── main.yml ├── .gitignore ├── .pre-commit-config.yaml ├── Dockerfile ├── Makefile ├── README.md ├── docker-compose.yml ├── otel-collector ├── Dockerfile └── otel-collector-config.yaml ├── pyproject.toml ├── render.yaml ├── src ├── __init__.py ├── __main__.py ├── common │ ├── __init__.py │ ├── db │ │ ├── __init__.py │ │ ├── github.py │ │ └── slack.py │ └── embeddings.py ├── tiling │ ├── __init__.py │ └── build_map.py ├── webui │ ├── __init__.py │ ├── llm.py │ ├── main.py │ ├── settings.py │ ├── shared.py │ ├── slack.py │ └── web_hooks.py └── worker │ ├── __init__.py │ ├── docs_embeddings.py │ ├── github_similar_content.py │ └── settings.py └── uv.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | .git 2 | dist 3 | .venv 4 | .logfire 5 | .github 6 | 7 | **/node_modules/ 8 | **/__pycache__/ 9 | scratch 10 | 11 | **/target/ 12 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | tags: 8 | - '**' 9 | pull_request: {} 10 | 11 | jobs: 12 | lint: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | 17 | - name: Install uv 18 | uses: astral-sh/setup-uv@v3 19 | with: 20 | version: "0.4.30" 21 | enable-cache: true 22 | 23 | - name: Install dependencies 24 | run: uv sync --python 3.12 --frozen 25 | 26 | - run: make lint 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # python generated files 2 | __pycache__/ 3 | *.py[oc] 4 | build/ 5 | dist/ 6 | wheels/ 7 | *.egg-info 8 | 9 | # venv 10 | .venv 11 | 12 | # misc space for stuff not in VCS 13 | /scratch/ 14 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v4.0.1 4 | hooks: 5 | - id: no-commit-to-branch 6 | - id: check-yaml 7 | - id: check-toml 8 | - id: end-of-file-fixer 9 | - id: trailing-whitespace 10 | - id: check-added-large-files 11 | 12 | - repo: local 13 | hooks: 14 | - id: format 15 | name: Format 16 | entry: make format 17 | types: [python] 18 | language: system 19 | pass_filenames: false 20 | - id: lint 21 | name: Lint 22 | entry: make lint 23 | types: [python] 24 | language: system 25 | pass_filenames: false 26 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-alpine AS build 2 | 3 | WORKDIR /app 4 | 5 | # required for logfire[system-metrics], which in turn requires psutil 6 | RUN apk add --no-cache gcc musl-dev linux-headers && rm -rf /var/cache/apk/* 7 | 8 | RUN pip install uv 9 | 10 | COPY pyproject.toml uv.lock ./ 11 | 12 | ENV UV_COMPILE_BYTECODE=1 13 | 14 | RUN uv sync --locked --no-install-project --no-dev 15 | 16 | COPY ./src /app/src 17 | 18 | ARG LOGFIRE_TOKEN 19 | ENV LOGFIRE_TOKEN=$LOGFIRE_TOKEN 20 | 21 | FROM python:3.12-alpine AS main 22 | 23 | COPY --from=build --chown=app:app /app /app 24 | WORKDIR /app 25 | ENV PATH="/app/.venv/bin:$PATH" 26 | 27 | CMD ["python", "-m", "src"] 28 |
-------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .DEFAULT_GOAL := all 2 | 3 | .PHONY: install 4 | install: 5 | uv sync --frozen 6 | pre-commit install 7 | 8 | .PHONY: build-docker 9 | build-docker: 10 | docker compose build 11 | 12 | .PHONY: up 13 | up: 14 | docker compose up --build 15 | 16 | .PHONY: format 17 | format: 18 | uv run ruff check --fix-only src 19 | uv run ruff format src 20 | 21 | .PHONY: lint 22 | lint: 23 | uv run ruff check src 24 | uv run ruff format --check src 25 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # logfire-demo 2 | 3 | This project demonstrates [Pydantic Logfire](https://pydantic.dev/logfire). 4 | 5 | It's designed to be a simple app with enough functionality to show some of the things Logfire can do. 6 | 7 | ## Running the demo 8 | 9 | 1. Follow [these](https://docs.pydantic.dev/logfire/guides/first_steps/) instructions to get set up with Logfire; export your Logfire write token as `LOGFIRE_TOKEN` so it can be used by Docker Compose. 10 | 2. Create a GitHub app and set the `GITHUB_APP_ID`, `GITHUB_APP_INSTALLATION_ID` and `GITHUB_APP_PRIVATE_KEY` environment variables (this is used for the "GitHub similar issues suggestion" demo). 11 | 3. Create an OpenAI API key and set the `OPENAI_API_KEY` environment variable (this is used for the "LLM Query" demo). 12 | 4. Run `make up`. 13 | 14 | Now you can go to the [Logfire demo page](http://localhost:8000/) and try the app. 15 | 16 | You can find your project's `Dashboard` link at the end of the page. Click on the dashboard link 17 | to see the live logs from the demo project. 18 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | webui: 3 | build: . 4 | platform: linux/amd64 5 | ports: 6 | - 8000:8000 7 | environment: 8 | SERVICE: webui 9 | PG_DSN: postgres://postgres:postgres@postgres/logfire_demo 10 | REDIS_DSN: redis://redis:6379/0 11 | TILING_SERVER: "http://tiling:8000" 12 | # forward the OpenAI API key from the host environment 13 | OPENAI_API_KEY: $OPENAI_API_KEY 14 | LOGFIRE_TOKEN: $LOGFIRE_TOKEN 15 | LOGFIRE_BASE_URL: $LOGFIRE_BASE_URL 16 | GITHUB_WEBHOOK_SECRET: $GITHUB_WEBHOOK_SECRET 17 | SLACK_SIGNING_SECRET: $SLACK_SIGNING_SECRET 18 | SLACK_CHANNEL: ${SLACK_CHANNEL:-{}} 19 | healthcheck: 20 | test: python -c "import urllib.request as r; assert r.urlopen('http://localhost:8000/health').status == 200" 21 | depends_on: 22 | - postgres 23 | - redis 24 | 25 | tiling: 26 | build: . 27 | platform: linux/amd64 28 | environment: 29 | SERVICE: tiling 30 | LOGFIRE_TOKEN: $LOGFIRE_TOKEN 31 | LOGFIRE_BASE_URL: $LOGFIRE_BASE_URL 32 | healthcheck: 33 | test: python -c "import urllib.request as r; assert r.urlopen('http://localhost:8000/health').status == 200" 34 | 35 | worker: 36 | build: .
37 | platform: linux/amd64 38 | environment: 39 | SERVICE: worker 40 | PG_DSN: postgres://postgres:postgres@postgres/logfire_demo 41 | REDIS_DSN: redis://redis:6379/0 42 | OPENAI_API_KEY: $OPENAI_API_KEY 43 | LOGFIRE_TOKEN: $LOGFIRE_TOKEN 44 | LOGFIRE_BASE_URL: $LOGFIRE_BASE_URL 45 | GITHUB_APP_ID: ${GITHUB_APP_ID:-0} 46 | GITHUB_APP_INSTALLATION_ID: ${GITHUB_APP_INSTALLATION_ID:-0} 47 | GITHUB_APP_PRIVATE_KEY: $GITHUB_APP_PRIVATE_KEY 48 | VECTOR_DISTANCE_THRESHOLD: 0.4 49 | AI_SIMILARITY_THRESHOLD: 85 50 | healthcheck: 51 | test: arq src.worker.WorkerSettings --check 52 | 53 | postgres: 54 | image: ankane/pgvector:latest 55 | container_name: logfire-demo-postgres 56 | environment: 57 | POSTGRES_USER: postgres 58 | POSTGRES_PASSWORD: postgres 59 | POSTGRES_DB: logfire_demo 60 | ports: 61 | # to connect: `pgcli postgres://postgres:postgres@localhost:54320/logfire_demo` 62 | - 54320:5432 63 | volumes: 64 | - postgres-data:/var/lib/postgresql/data 65 | restart: unless-stopped 66 | healthcheck: 67 | test: ["CMD-SHELL", "pg_isready -U postgres -d postgres"] 68 | 69 | redis: 70 | image: redis:latest 71 | container_name: logfire-demo-redis 72 | volumes: 73 | - redis-data:/data 74 | ports: 75 | - "63790:6379" 76 | healthcheck: 77 | test: ["CMD", "redis-cli", "ping"] 78 | 79 | volumes: 80 | postgres-data: 81 | redis-data: 82 | 83 | networks: 84 | default: 85 | name: logfire-demo-dev 86 | -------------------------------------------------------------------------------- /otel-collector/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM otel/opentelemetry-collector-contrib:0.114.0 2 | 3 | # Copy configuration and entry script 4 | COPY otel-collector-config.yaml /etc/otel-collector-config.yaml 5 | 6 | CMD [ "--config=/etc/otel-collector-config.yaml" ] 7 | -------------------------------------------------------------------------------- /otel-collector/otel-collector-config.yaml: -------------------------------------------------------------------------------- 1 | receivers: 2 | otlp: 3 | protocols: 4 | http: 5 | endpoint: "0.0.0.0:4318" 6 | 7 | exporters: 8 | debug: 9 | otlphttp/us: 10 | endpoint: https://api-us.pydantic.dev 11 | compression: none 12 | tls: 13 | insecure: true 14 | headers: 15 | Authorization: ${env:US_WRITE_TOKEN} 16 | otlphttp/eu: 17 | endpoint: https://api-eu.pydantic.dev 18 | compression: none 19 | tls: 20 | insecure: true 21 | headers: 22 | Authorization: ${env:EU_WRITE_TOKEN} 23 | processors: 24 | batch: 25 | timeout: 1s 26 | send_batch_size: 32768 27 | 28 | extensions: 29 | health_check: 30 | endpoint: "0.0.0.0:13133" 31 | 32 | service: 33 | extensions: [health_check] 34 | pipelines: 35 | traces: 36 | receivers: [otlp] 37 | processors: [batch] 38 | exporters: [debug, otlphttp/us, otlphttp/eu] 39 | metrics: 40 | receivers: [otlp] 41 | processors: [batch] 42 | exporters: [debug, otlphttp/us, otlphttp/eu] 43 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "logfire-demo" 3 | version = "0.0.0" 4 | description = "Demonstrate what Pydantic Logfire can do." 
5 | authors = [ 6 | { name = "Samuel Colvin", email = "s@muelcolvin.com" } 7 | ] 8 | requires-python = ">= 3.12" 9 | dependencies = [ 10 | "arq>=0.26.1", 11 | "asyncpg>=0.29.0", 12 | "cryptography>=44.0.1", 13 | "fastapi>=0.115.0", 14 | "fastui>=0.7.0", 15 | "httpx>=0.27.2", 16 | "logfire[asyncpg,fastapi,httpx,system-metrics]>=3.6.2", 17 | "openai>=1.47.1", 18 | "pillow>=10.4.0", 19 | "pydantic>=2.9.2", 20 | "pydantic-ai>=0.0.35", 21 | "pydantic-settings>=2.5.2", 22 | "pyjwt>=2.10.1", 23 | "python-multipart>=0.0.10", 24 | "tiktoken>=0.7.0", 25 | "uvicorn[standard]>=0.30.6", 26 | ] 27 | 28 | [tool.uv] 29 | dev-dependencies = [ 30 | "asyncpg-stubs>=0.29.1", 31 | "devtools>=0.12.2", 32 | "ipython>=8.27.0", 33 | "pyright>=1.1.382", 34 | "ruff>=0.6.7", 35 | "watchfiles>=0.24.0", 36 | ] 37 | 38 | [tool.ruff] 39 | line-length = 120 40 | target-version = "py312" 41 | lint.extend-select = ["Q", "RUF100", "C90", "UP", "I"] 42 | lint.flake8-quotes = {inline-quotes = "single", multiline-quotes = "double"} 43 | lint.mccabe = { max-complexity = 14 } 44 | format.quote-style = "single" 45 | -------------------------------------------------------------------------------- /render.yaml: -------------------------------------------------------------------------------- 1 | services: 2 | - type: web 3 | name: webui 4 | runtime: docker 5 | rootDir: . 6 | dockerfilePath: ./Dockerfile 7 | envVars: 8 | - key: SERVICE 9 | value: webui 10 | - key: LOGFIRE_TOKEN 11 | sync: false 12 | - key: OTEL_EXPORTER_OTLP_ENDPOINT 13 | sync: false 14 | - key: TILING_SERVER 15 | value: "https://tiling.onrender.com" 16 | - key: OPENAI_API_KEY 17 | sync: false 18 | - key: GITHUB_WEBHOOK_SECRET 19 | sync: false 20 | - key: SLACK_SIGNING_SECRET 21 | sync: false 22 | - key: SLACK_CHANNEL 23 | sync: false 24 | - key: CREATE_DATABASE 25 | value: "false" 26 | - key: PG_DSN 27 | fromDatabase: 28 | name: postgres 29 | property: connectionString 30 | - key: REDIS_DSN 31 | fromService: 32 | type: redis 33 | name: arq-redis 34 | property: connectionString 35 | - type: web 36 | name: tiling 37 | runtime: docker 38 | rootDir: . 39 | dockerfilePath: ./Dockerfile 40 | envVars: 41 | - key: SERVICE 42 | value: tiling 43 | - key: LOGFIRE_TOKEN 44 | sync: false 45 | - key: OTEL_EXPORTER_OTLP_ENDPOINT 46 | sync: false 47 | - type: worker 48 | name: arq-worker 49 | runtime: docker 50 | rootDir: . 
51 | dockerfilePath: ./Dockerfile 52 | envVars: 53 | - key: SERVICE 54 | value: worker 55 | - key: LOGFIRE_TOKEN 56 | sync: false 57 | - key: OTEL_EXPORTER_OTLP_ENDPOINT 58 | sync: false 59 | - key: OPENAI_API_KEY 60 | sync: false 61 | - key: PG_DSN 62 | fromDatabase: 63 | name: postgres 64 | property: connectionString 65 | - key: REDIS_DSN 66 | fromService: 67 | type: redis 68 | name: arq-redis 69 | property: connectionString 70 | - key: GITHUB_APP_ID 71 | sync: false 72 | - key: GITHUB_APP_INSTALLATION_ID 73 | sync: false 74 | - key: GITHUB_APP_PRIVATE_KEY 75 | sync: false 76 | - key: VECTOR_DISTANCE_THRESHOLD 77 | sync: false 78 | - key: AI_SIMILARITY_THRESHOLD 79 | sync: false 80 | - name: otel-collector 81 | type: pserv # Ensures it's not exposed to the internet 82 | runtime: docker 83 | dockerfilePath: ./otel-collector/Dockerfile 84 | envVars: 85 | - key: US_WRITE_TOKEN 86 | sync: false 87 | - key: EU_WRITE_TOKEN 88 | sync: false 89 | autoDeploy: true 90 | - type: redis 91 | name: arq-redis 92 | plan: starter 93 | ipAllowList: [] # only allow internal connections 94 | 95 | databases: 96 | - name: postgres 97 | plan: starter 98 | -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pydantic/logfire-demo/f7ce538b77174866c750c8187935a311a2764896/src/__init__.py -------------------------------------------------------------------------------- /src/__main__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | import sys 4 | 5 | import logfire 6 | from pydantic_ai import Agent 7 | 8 | service = sys.argv[1] if len(sys.argv) == 2 else os.getenv('SERVICE') 9 | services = 'webui', 'tiling', 'worker' 10 | # min duration is 100ms 11 | logfire.install_auto_tracing(modules=[f'src.{s}' for s in services], min_duration=0.1) 12 | if service is None: 13 | print('service argument variable not provided', file=sys.stderr) 14 | print('Available services:', ', '.join(services), file=sys.stderr) 15 | elif service in services: 16 | 17 | def scrubbing_callback(match: logfire.ScrubMatch): 18 | if ( 19 | match.path 20 | in [ 21 | ['message', 'gh_data'], 22 | ['message', 'prompt'], 23 | ['attributes', 'prompt'], 24 | ['attributes', 'result', 'reason'], 25 | ] 26 | or match.path[:2] 27 | in [ 28 | ['attributes', 'all_messages'], 29 | ['attributes', 'gh_data'], 30 | ] 31 | or match.path[:3] 32 | in [ 33 | ['attributes', 'response', 'parts'], 34 | ] 35 | ): 36 | return match.value 37 | 38 | logfire.configure( 39 | service_name=service, 40 | code_source=logfire.CodeSource( 41 | repository='https://github.com/pydantic/logfire-demo', 42 | revision='main', 43 | ), 44 | scrubbing=logfire.ScrubbingOptions(callback=scrubbing_callback), 45 | distributed_tracing=True, 46 | ) 47 | logfire.instrument_system_metrics() 48 | logfire.instrument_asyncpg() 49 | Agent.instrument_all() 50 | 51 | module = importlib.import_module(f'.{service}', package='src') 52 | module.run() 53 | else: 54 | print(f'Unknown service: {service}', file=sys.stderr) 55 | print('Available services:', ', '.join(services), file=sys.stderr) 56 | -------------------------------------------------------------------------------- /src/common/__init__.py: -------------------------------------------------------------------------------- 1 | import urllib.parse 2 | from typing import Annotated, Any 3 | 4 | from arq import ArqRedis 5 | from 
fastapi import Depends, Request 6 | from httpx import AsyncClient 7 | from pydantic_settings import BaseSettings 8 | 9 | 10 | def _get_http_client(request: Request) -> AsyncClient: 11 | return request.app.state.httpx_client 12 | 13 | 14 | AsyncClientDep = Annotated[AsyncClient, Depends(_get_http_client)] 15 | 16 | 17 | def build_params(**params: Any) -> str: 18 | return urllib.parse.urlencode({k: str(v) for k, v in params.items()}) 19 | 20 | 21 | def _arq_redis(request: Request) -> ArqRedis: 22 | return request.app.state.arq_redis 23 | 24 | 25 | ArqRedisDep = Annotated[ArqRedis, Depends(_arq_redis)] 26 | 27 | 28 | class GeneralSettings(BaseSettings): 29 | pg_dsn: str = 'postgres://postgres:postgres@localhost/logfire_demo' 30 | redis_dsn: str = 'redis://localhost:6379/0' 31 | -------------------------------------------------------------------------------- /src/common/db/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from collections.abc import AsyncIterator 3 | from contextlib import asynccontextmanager 4 | from dataclasses import dataclass 5 | from typing import Annotated, Self 6 | from urllib.parse import urlparse 7 | 8 | import asyncpg 9 | import logfire 10 | from asyncpg.connection import Connection 11 | from fastapi import Depends, Request 12 | 13 | __all__ = ('Database',) 14 | 15 | 16 | @dataclass 17 | class _Database: 18 | """ 19 | Wrapper for asyncpg with some utilities and usable as a fastapi dependency. 20 | """ 21 | 22 | _pool: asyncpg.Pool 23 | 24 | @classmethod 25 | @asynccontextmanager 26 | async def create(cls, dsn: str, prepare_db: bool = False, create_database: bool = False) -> AsyncIterator[Self]: 27 | if prepare_db: 28 | with logfire.span('prepare DB'): 29 | await _prepare_db(dsn, create_database) 30 | pool = await asyncpg.create_pool(dsn) 31 | if not pool: 32 | raise ValueError('Failed to create pool') 33 | try: 34 | yield cls(_pool=pool) 35 | finally: 36 | await asyncio.wait_for(pool.close(), timeout=2.0) 37 | 38 | @asynccontextmanager 39 | async def acquire(self) -> AsyncIterator[Connection]: 40 | con = await self._pool.acquire() 41 | try: 42 | yield con 43 | finally: 44 | await self._pool.release(con) 45 | 46 | @asynccontextmanager 47 | async def acquire_trans(self) -> AsyncIterator[Connection]: 48 | async with self._pool.acquire() as conn: 49 | async with conn.transaction(): 50 | yield conn 51 | 52 | 53 | def _get_db(request: Request) -> _Database: 54 | return request.app.state.db 55 | 56 | 57 | Database = Annotated[_Database, Depends(_get_db)] 58 | 59 | 60 | async def _prepare_db(dsn: str, create_database: bool) -> None: 61 | if create_database: 62 | with logfire.span('check and create DB'): 63 | parse_result = urlparse(dsn) 64 | database = parse_result.path.lstrip('/') 65 | server_dsn = dsn[: dsn.rindex('/')] 66 | conn = await asyncpg.connect(server_dsn) 67 | try: 68 | db_exists = await conn.fetchval('SELECT 1 FROM pg_database WHERE datname = $1', database) 69 | if not db_exists: 70 | await conn.execute(f'CREATE DATABASE {database}') 71 | finally: 72 | await conn.close() 73 | 74 | with logfire.span('create schema'): 75 | conn = await asyncpg.connect(dsn) 76 | try: 77 | async with conn.transaction(): 78 | await _create_schema(conn) 79 | finally: 80 | await conn.close() 81 | 82 | 83 | async def _create_schema(conn: Connection) -> None: 84 | await conn.execute(""" 85 | CREATE TABLE IF NOT EXISTS chats ( 86 | id UUID PRIMARY KEY DEFAULT gen_random_uuid(), 87 | created_at TIMESTAMPTZ NOT NULL DEFAULT 
NOW() 88 | ); 89 | CREATE INDEX IF NOT EXISTS chats_created_at_idx ON chats (created_at desc); 90 | 91 | CREATE TABLE IF NOT EXISTS messages ( 92 | id UUID PRIMARY KEY DEFAULT gen_random_uuid(), 93 | chat_id UUID NOT NULL REFERENCES chats(id) ON DELETE CASCADE, 94 | created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), 95 | role TEXT NOT NULL, 96 | message TEXT NOT NULL, 97 | cost INT 98 | ); 99 | CREATE INDEX IF NOT EXISTS messages_chat_id_idx ON messages (chat_id); 100 | CREATE INDEX IF NOT EXISTS messages_created_at_idx ON messages (created_at); 101 | 102 | CREATE TABLE IF NOT EXISTS llm_results ( 103 | questions_hash TEXT PRIMARY KEY, 104 | created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(), 105 | chunks JSON -- isn't filtered, so use JSON instead of JSONB 106 | ); 107 | 108 | CREATE EXTENSION IF NOT EXISTS vector; 109 | CREATE TABLE IF NOT EXISTS embeddings ( 110 | id SERIAL PRIMARY KEY, -- Unique ID for each entry 111 | source TEXT NOT NULL, -- "github_issue", "slack_message", "pydantic_docs", ... 112 | external_reference TEXT, -- GitHub link, Slack message ID 113 | parent TEXT, -- GitHub issue, Thread TS (for Slack threads) 114 | text TEXT NOT NULL, -- The actual text content 115 | hash TEXT UNIQUE NOT NULL, -- Hash of the text content 116 | author TEXT, -- Author of the message 117 | event_ts TIMESTAMPTZ DEFAULT NOW(), -- Timestamp of when the event occurred 118 | created_at TIMESTAMPTZ DEFAULT NOW(), -- Timestamp of when the entry was created 119 | embedding VECTOR(1536) -- For storing embeddings 120 | ); 121 | 122 | CREATE TABLE IF NOT EXISTS github_contents ( 123 | id SERIAL PRIMARY KEY, -- Unique ID for each entry 124 | project TEXT NOT NULL, -- "pydantic", "logfire" 125 | source TEXT NOT NULL, -- "issue" 126 | content_id BIGINT NOT NULL, -- GitHub content ID 127 | external_reference TEXT NOT NULL, -- GitHub link 128 | text TEXT NOT NULL, -- The actual text content 129 | event_ts TIMESTAMPTZ DEFAULT NOW(), -- Timestamp of when the event occurred 130 | created_at TIMESTAMPTZ DEFAULT NOW(), -- Timestamp of when the entry was created 131 | updated_at TIMESTAMPTZ DEFAULT NOW(), -- Timestamp of when the entry was last updated 132 | embedding VECTOR(1536), -- For storing embeddings 133 | similar_issues JSONB, -- Similar issues 134 | unique (project, source, content_id) -- Unique constraint 135 | ); 136 | 137 | CREATE TABLE IF NOT EXISTS slack_messages ( 138 | id SERIAL PRIMARY KEY, -- Unique ID for each entry 139 | channel TEXT NOT NULL, -- Slack channel 140 | author TEXT NOT NULL, -- Message author 141 | message_id TEXT NOT NULL, -- Slack message ID 142 | event_ts TEXT NOT NULL, -- Timestamp of when the event occurred (text) 143 | parent_event_ts TEXT, -- Slack message thread timestamp 144 | text TEXT NOT NULL, -- The actual text content 145 | ts TIMESTAMPTZ, -- Message timestamp 146 | created_at TIMESTAMPTZ DEFAULT NOW(), -- Timestamp of when the entry was created 147 | embedding VECTOR(1536) -- For storing embeddings 148 | ); 149 | """) 150 | -------------------------------------------------------------------------------- /src/common/db/github.py: -------------------------------------------------------------------------------- 1 | import json 2 | from datetime import datetime 3 | from typing import Any, Literal 4 | 5 | from asyncpg import Connection 6 | 7 | GithubContentProject = Literal['pydantic', 'logfire', 'pydantic-ai'] 8 | GithubContentSource = Literal['issue'] 9 | 10 | 11 | async def create_github_content( 12 | conn: Connection, 13 | project: GithubContentProject, 14 | source: 
GithubContentSource, 15 | content_id: int, 16 | external_reference: str, 17 | text: str, 18 | event_ts: datetime, 19 | embedding: list[float], 20 | ) -> None: 21 | """Save GitHub content to the database.""" 22 | embedding_str = '[' + ','.join(map(str, embedding)) + ']' 23 | await conn.execute( 24 | """ 25 | INSERT INTO github_contents (project, source, content_id, external_reference, text, event_ts, embedding) 26 | VALUES ($1, $2, $3, $4, $5, $6, $7) 27 | """, 28 | project, 29 | source, 30 | content_id, 31 | external_reference, 32 | text, 33 | event_ts, 34 | embedding_str, 35 | ) 36 | 37 | 38 | async def get_github_content( 39 | conn: Connection, 40 | project: GithubContentProject, 41 | source: GithubContentSource, 42 | content_id: int, 43 | ) -> dict[str, Any] | None: 44 | """Fetch GitHub content from the database by ID.""" 45 | return await conn.fetchrow( 46 | """ 47 | SELECT id, text, embedding FROM github_contents WHERE project=$1 AND source=$2 AND content_id=$3 48 | """, 49 | project, 50 | source, 51 | content_id, 52 | ) 53 | 54 | 55 | async def update_github_content( 56 | conn: Connection, 57 | project: GithubContentProject, 58 | source: GithubContentSource, 59 | content_id: int, 60 | text: str, 61 | embedding: list[float], 62 | ) -> None: 63 | """Update GitHub content in the database.""" 64 | embedding_str = '[' + ','.join(map(str, embedding)) + ']' 65 | await conn.execute( 66 | """ 67 | UPDATE github_contents SET text=$1, embedding=$2 WHERE project=$3 AND source=$4 AND content_id=$5 68 | """, 69 | text, 70 | embedding_str, 71 | project, 72 | source, 73 | content_id, 74 | ) 75 | 76 | 77 | async def fetch_issues_for_similarity_check(conn: Connection) -> list[dict[str, Any]]: 78 | """Fetch GitHub issues for similarity check.""" 79 | return await conn.fetch( 80 | """ 81 | SELECT 82 | id, 83 | project, 84 | text, 85 | external_reference 86 | FROM github_contents 87 | WHERE source='issue' AND similar_issues IS NULL 88 | """, 89 | ) 90 | 91 | 92 | async def find_similar_issues(conn: Connection, id: int, project: GithubContentProject) -> list[dict[str, Any]]: 93 | """Find similar GitHub issues by vector similarity.""" 94 | return await conn.fetch( 95 | """ 96 | SELECT 97 | text, 98 | external_reference, 99 | embedding <=> (SELECT embedding FROM github_contents WHERE id = $1) AS distance 100 | FROM github_contents 101 | WHERE source='issue' AND project=$2 AND id != $3 102 | ORDER BY distance 103 | LIMIT 3; 104 | """, 105 | id, 106 | project, 107 | id, 108 | ) 109 | 110 | 111 | async def update_similar_issues(conn: Connection, id: int, similar_issues_obj: list[dict[str, Any]]) -> None: 112 | await conn.execute( 113 | """ 114 | UPDATE github_contents SET similar_issues=$1 WHERE id=$2 115 | """, 116 | json.dumps(similar_issues_obj), 117 | id, 118 | ) 119 | -------------------------------------------------------------------------------- /src/common/db/slack.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Any 3 | 4 | from asyncpg import Connection 5 | 6 | 7 | async def create_slack_message( 8 | conn: Connection, 9 | channel: str, 10 | author: str, 11 | message_id: str, 12 | event_ts: str, 13 | parent_event_ts: str | None, 14 | text: str, 15 | ts: datetime, 16 | embedding: list[float], 17 | ) -> None: 18 | """Create a new slack message in the database""" 19 | embedding_str = '[' + ','.join(map(str, embedding)) + ']' 20 | await conn.execute( 21 | """ 22 | INSERT INTO slack_messages (channel, author, 
message_id, event_ts, parent_event_ts, text, ts, embedding) 23 | VALUES ($1, $2, $3, $4, $5, $6, $7, $8) 24 | """, 25 | channel, 26 | author, 27 | message_id, 28 | event_ts, 29 | parent_event_ts, 30 | text, 31 | ts, 32 | embedding_str, 33 | ) 34 | 35 | 36 | async def get_root_slack_messages(conn: Connection, channel_id: str, limit: int = 10) -> list[dict[str, Any]]: 37 | """Fetch the root slack messages from the database.""" 38 | return await conn.fetch( 39 | """ 40 | WITH messages AS ( 41 | SELECT s.id, s.author, s.text, s.ts, count(r.id) as replies_count 42 | FROM slack_messages s 43 | LEFT JOIN slack_messages r ON r.parent_event_ts = s.event_ts OR r.event_ts = s.event_ts 44 | WHERE s.parent_event_ts IS NULL AND s.channel = $1 45 | GROUP BY s.author, s.id, s.text, s.ts, s.event_ts 46 | ORDER BY s.ts DESC 47 | LIMIT $2 48 | ) 49 | SELECT * FROM messages ORDER BY ts 50 | """, 51 | channel_id, 52 | limit, 53 | ) 54 | 55 | 56 | async def get_slack_thread(conn: Connection, message_id: int) -> list[dict[str, Any]]: 57 | """Fetch a slack thread from the database.""" 58 | return await conn.fetch( 59 | """ 60 | SELECT author, text, ts 61 | FROM slack_messages WHERE parent_event_ts=(SELECT event_ts FROM slack_messages WHERE id = $1) 62 | ORDER BY ts 63 | """, 64 | message_id, 65 | ) 66 | -------------------------------------------------------------------------------- /src/common/embeddings.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from datetime import datetime 3 | from typing import Literal 4 | 5 | import logfire 6 | import tiktoken 7 | from asyncpg import Connection 8 | from openai import AsyncOpenAI 9 | 10 | TOKEN_LIMIT = 8192 # OpenAI embedding model token limit 11 | 12 | 13 | def count_tokens(text: str, model: str = 'text-embedding-ada-002') -> int: 14 | """Counts the number of tokens in a given text using OpenAI's tiktoken.""" 15 | encoding = tiktoken.encoding_for_model(model) 16 | return len(encoding.encode(text)) 17 | 18 | 19 | def truncate_text_to_token_limit(text: str, model: str = 'text-embedding-ada-002', max_tokens: int = 8192) -> str: 20 | """Truncate text to fit within the token limit for embeddings.""" 21 | encoding = tiktoken.encoding_for_model(model) 22 | tokens = encoding.encode(text) # Convert text to tokens 23 | 24 | if len(tokens) > max_tokens: 25 | tokens = tokens[:max_tokens] # Truncate to max tokens 26 | 27 | return encoding.decode(tokens) # Convert tokens back to text 28 | 29 | 30 | async def generate_embedding(openai_client: AsyncOpenAI, text: str) -> list[float]: 31 | with logfire.span('call openai'): 32 | response = await openai_client.embeddings.create(input=text, model='text-embedding-ada-002') 33 | return response.data[0].embedding 34 | 35 | 36 | def hash_text(text: str) -> str: 37 | return hashlib.md5(text.encode()).hexdigest() 38 | 39 | 40 | EmbeddingsSource = Literal['slack_message', 'github_issue', 'pydantic_docs', 'pydantic_ai_docs', 'logfire_docs'] 41 | 42 | 43 | async def get_stored_embeddings_hash_by_source(conn: Connection, source: EmbeddingsSource) -> set[str]: 44 | hashes = await conn.fetch('SELECT hash FROM embeddings WHERE source=$1', source) 45 | return {record['hash'] for record in hashes} 46 | 47 | 48 | async def create_embeddings( 49 | conn: Connection, 50 | source: EmbeddingsSource, 51 | text: str, 52 | text_hash: str, 53 | embedding: list[float], 54 | event_ts: datetime | None = None, 55 | external_reference: str | None = None, 56 | author: str | None = None, 57 | parent: str |
None = None, 58 | ) -> None: 59 | """Create a new embeddings in the database""" 60 | embedding_str = '[' + ','.join(map(str, embedding)) + ']' 61 | await conn.execute( 62 | """ 63 | INSERT INTO embeddings (source, external_reference, text, hash, author, event_ts, embedding, parent) 64 | VALUES ($1, $2, $3, $4, $5, $6, $7, $8) 65 | """, 66 | source, 67 | external_reference, 68 | text, 69 | text_hash, 70 | author, 71 | event_ts, 72 | embedding_str, 73 | parent, 74 | ) 75 | 76 | 77 | async def delete_embeddings_by_hash(conn: Connection, hashes: set[str], source: EmbeddingsSource) -> None: 78 | await conn.execute('DELETE FROM embeddings WHERE hash = ANY($1) AND source=$2', hashes, source) 79 | -------------------------------------------------------------------------------- /src/tiling/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations as _annotations 2 | 3 | from contextlib import AsyncExitStack, asynccontextmanager 4 | from typing import Annotated 5 | 6 | import logfire 7 | from annotated_types import Ge, Gt, Le, Lt 8 | from fastapi import FastAPI, Header, Response 9 | from fastapi.responses import PlainTextResponse 10 | from httpx import AsyncClient 11 | from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor 12 | 13 | from ..common import AsyncClientDep 14 | from .build_map import BuildMap 15 | 16 | 17 | @asynccontextmanager 18 | async def lifespan(app_: FastAPI): 19 | async with AsyncExitStack() as stack: 20 | app_.state.httpx_client = httpx_client = await stack.enter_async_context(AsyncClient()) 21 | HTTPXClientInstrumentor.instrument_client(httpx_client) 22 | yield 23 | 24 | 25 | app = FastAPI(lifespan=lifespan) 26 | logfire.instrument_fastapi(app, capture_headers=True) 27 | 28 | 29 | @app.get('/', response_class=PlainTextResponse) 30 | @app.head('/', include_in_schema=False) 31 | async def index() -> str: 32 | return 'Tiling service\n' 33 | 34 | 35 | @app.get('/robots.txt', response_class=PlainTextResponse) 36 | @app.head('/robots.txt', include_in_schema=False) 37 | async def robots_txt() -> str: 38 | return 'User-agent: *\nDisallow: /\n' 39 | 40 | 41 | @app.get('/health', response_class=PlainTextResponse) 42 | @app.head('/health', include_in_schema=False) 43 | async def health() -> str: 44 | return 'OK\n' 45 | 46 | 47 | @app.get('/favicon.ico', status_code=404, response_class=PlainTextResponse) 48 | @app.head('/favicon.ico', include_in_schema=False) 49 | async def favicon_ico() -> str: 50 | return 'page not found' 51 | 52 | 53 | @app.get('/map.jpg') 54 | async def get_map( 55 | http_client: AsyncClientDep, 56 | lat: Annotated[float, Ge(-85), Le(85)], 57 | lng: Annotated[float, Ge(-180), Le(180)], 58 | zoom: Annotated[int, Gt(0), Lt(20)] = 10, 59 | width: Annotated[int, Ge(95), Le(1000)] = 600, 60 | height: Annotated[int, Ge(60), Le(1000)] = 400, 61 | scale: Annotated[int, Ge(1), Le(2)] = 1, 62 | referer: Annotated[str | None, Header()] = None, 63 | ) -> Response: 64 | builder = BuildMap( 65 | http_client=http_client, referrer=referer, lat=lat, lng=lng, zoom=zoom, width=width, height=height, scale=scale 66 | ) 67 | image = await builder.run() 68 | return Response( 69 | content=image, 70 | media_type='image/jpeg', 71 | headers={'Cache-Control': 'max-age=1209600', 'X-Robots-Tag': 'noindex'}, # 1209600 is 14 days 72 | ) 73 | 74 | 75 | def run(): 76 | import uvicorn 77 | 78 | uvicorn.run(app, host='0.0.0.0', port=8000, log_level='info') 79 | 
-------------------------------------------------------------------------------- /src/tiling/build_map.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import io 3 | import math 4 | import random 5 | from asyncio import Semaphore 6 | from collections.abc import Awaitable, Iterator, Sequence 7 | from statistics import mean 8 | from time import time 9 | 10 | import logfire 11 | from httpx import AsyncClient 12 | from PIL import Image, ImageDraw 13 | 14 | __all__ = ('BuildMap',) 15 | 16 | SHARDS = 'a', 'b', 'c' 17 | TILE_SIZE = 256 18 | HEADERS = {'User-Agent': 'https://github.com/tutorcruncher/static-maps'} 19 | 20 | COPYRIGHT_MSG = '© OpenStreetMap contributors' 21 | 22 | OSM_ROOT = 'https://{shard}.tile.openstreetmap.org' 23 | 24 | URL_TEMPLATE = '{url_root}/{zoom:d}/{x:d}/{y:d}.png' 25 | OSM_SEMAPHORE = Semaphore(value=32) 26 | 27 | 28 | class BuildMap: 29 | __slots__ = 'http_client', 'lat', 'lng', 'zoom', 'w', 'h', 'no_tiles', 'tiles', 'times', 'headers', 'scale' 30 | 31 | def __init__( 32 | self, 33 | *, 34 | http_client: AsyncClient, 35 | referrer: str | None, 36 | lat: float, 37 | lng: float, 38 | zoom: int, 39 | width: int, 40 | height: int, 41 | scale: int, 42 | ): 43 | self.http_client = http_client 44 | self.lat = lat 45 | self.lng = lng 46 | self.zoom = zoom 47 | self.w = width * scale 48 | self.h = height * scale 49 | self.scale = scale 50 | self.no_tiles = 2**self.zoom 51 | 52 | self.tiles: set[tuple[bytes, int, int]] = set() 53 | self.times: list[float] = [] 54 | self.headers = HEADERS.copy() 55 | if referrer: 56 | self.headers['Referer'] = referrer 57 | 58 | async def run(self) -> bytes: 59 | # https://wiki.openstreetmap.org/wiki/Slippy_map_tilenames#Implementations 60 | x_tile = self.no_tiles * (self.lng + 180) / 360 61 | 62 | lat_rad = math.radians(self.lat) 63 | y_tile = self.no_tiles * (1 - math.log(math.tan(lat_rad) + 1 / math.cos(lat_rad)) / math.pi) / 2 64 | 65 | x_range, x_correction = self.range_correction(x_tile, self.w) 66 | y_range, y_correction = self.range_correction(y_tile, self.h) 67 | 68 | await asyncio.gather(*self.get_tiles(x_range, x_correction, y_range, y_correction)) 69 | 70 | logfire.info( 71 | '{lat=:0.6f} {lng=:0.6f} {zoom=} {tiles=} {avg_download_time=:0.3f}s', 72 | lat=self.lat, 73 | lng=self.lng, 74 | zoom=self.zoom, 75 | tiles=len(self.times), 76 | avg_download_time=mean(self.times), 77 | times=self.times, 78 | ) 79 | 80 | return await asyncio.get_event_loop().run_in_executor(None, self.stitch_tiles) 81 | 82 | @staticmethod 83 | def range_correction(tile_no: float, size: int) -> tuple[Sequence[int], int]: 84 | half_t = size / 2 / TILE_SIZE # half the width/height in tiles 85 | min_, max_ = int(math.floor(tile_no - half_t)), int(math.ceil(tile_no + half_t)) 86 | correction = (tile_no - min_) * TILE_SIZE - size / 2 87 | return range(min_, max_), intr(correction) 88 | 89 | def get_tiles( 90 | self, x_range: Sequence[int], x_correction: int, y_range: Sequence[int], y_correction: int 91 | ) -> Iterator[Awaitable[None]]: 92 | for col, x in enumerate(x_range): 93 | for row, y in enumerate(y_range): 94 | yield self.get_tile(x, y, col * TILE_SIZE - x_correction, row * TILE_SIZE - y_correction) 95 | 96 | async def get_tile(self, osm_x: int, osm_y: int, image_x: int, image_y: int) -> None: 97 | if not 0 <= osm_y < self.no_tiles: 98 | return 99 | # wraps map around at edges 100 | osm_x = osm_x % self.no_tiles 101 | root = OSM_ROOT.format(shard=random.choice(SHARDS)) 102 | url = 
URL_TEMPLATE.format(url_root=root, zoom=self.zoom, x=osm_x, y=osm_y) 103 | # debug(url, osm_x, osm_y, image_x, image_y) 104 | 105 | start = time() 106 | async with OSM_SEMAPHORE: 107 | r = await self.http_client.get(url, headers=self.headers) 108 | self.times.append(time() - start) 109 | if r.status_code != 200: 110 | data = {'content': r.content, 'response_headers': dict(r.headers)} 111 | logfire.warn('unexpected {status=} from {url!r}', status=r.status_code, url=url, data=data) 112 | else: 113 | self.tiles.add((r.content, image_x, image_y)) 114 | 115 | @logfire.instrument('stitch tiles together') 116 | def stitch_tiles(self) -> bytes: 117 | # the minimum image width is set to 95px to fit copyright text 118 | box_size_w, box_size_h = 95, 8 119 | text_pos_x, text_pos_y = 94, 8 120 | if self.w >= 205: 121 | box_size_w, box_size_h = 205, 20 122 | text_pos_x, text_pos_y = 200, 20 123 | 124 | img_bg = Image.new('RGBA', (self.w, self.h), (255, 255, 255, 255)) 125 | 126 | for content, x, y in self.tiles: 127 | img_bg.paste(Image.open(io.BytesIO(content)), (x, y)) 128 | 129 | self.tiles = set() 130 | img_fg = Image.new('RGBA', img_bg.size, (0, 0, 0, 0)) 131 | rect_box = self.w - box_size_w * self.scale, self.h - box_size_h * self.scale, self.w, self.h 132 | ImageDraw.Draw(img_fg).rectangle(rect_box, fill=(255, 255, 255, 128)) 133 | text_pos: tuple[int, int] = self.w - text_pos_x * self.scale, self.h - text_pos_y * self.scale 134 | ImageDraw.Draw(img_fg).text(text_pos, COPYRIGHT_MSG, fill=(0, 0, 0)) # type: ignore 135 | 136 | bio = io.BytesIO() 137 | Image.alpha_composite(img_bg, img_fg).convert('RGB').save(bio, format='jpeg', quality=95, optimize=True) 138 | return bio.getvalue() 139 | 140 | 141 | def intr(v: float) -> int: 142 | return int(round(v)) 143 | -------------------------------------------------------------------------------- /src/webui/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations as _annotations 2 | 3 | import sys 4 | from contextlib import AsyncExitStack, asynccontextmanager 5 | from typing import Annotated, Any 6 | 7 | import arq 8 | import logfire 9 | from annotated_types import Ge, Gt, Le, Lt 10 | from arq.connections import RedisSettings 11 | from fastapi import FastAPI 12 | from fastapi.responses import HTMLResponse, PlainTextResponse 13 | from fastui import prebuilt_html 14 | from fastui.auth import fastapi_auth_exception_handling 15 | from fastui.dev import dev_fastapi_app 16 | from httpx import AsyncClient 17 | from openai import AsyncOpenAI 18 | from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor 19 | from starlette.responses import StreamingResponse 20 | 21 | from ..common import AsyncClientDep 22 | from ..common.db import Database 23 | from .llm import router as llm_router 24 | from .main import router as main_router 25 | from .settings import settings 26 | from .slack import router as slack_router 27 | from .web_hooks import router as web_hooks_router 28 | 29 | 30 | @asynccontextmanager 31 | async def lifespan(app_: FastAPI): 32 | async with AsyncExitStack() as stack: 33 | app_.state.httpx_client = httpx_client = await stack.enter_async_context(AsyncClient()) 34 | HTTPXClientInstrumentor.instrument_client(httpx_client) 35 | app_.state.db = await stack.enter_async_context( 36 | Database.create(settings.pg_dsn, True, settings.create_database) 37 | ) 38 | app_.state.arq_redis = await arq.create_pool(RedisSettings.from_dsn(settings.redis_dsn)) 39 | app_.state.settings = 
settings 40 | app_.state.openai_client = openai_client = AsyncOpenAI(http_client=httpx_client) 41 | logfire.instrument_openai(openai_client=openai_client) 42 | yield 43 | 44 | 45 | # This doesn't have any effect yet, needs https://github.com/pydantic/FastUI/issues/198 46 | frontend_reload = '--reload' in sys.argv 47 | if frontend_reload: 48 | # dev_fastapi_app reloads in the browser when the Python source changes 49 | app = dev_fastapi_app(lifespan=lifespan) 50 | else: 51 | app = FastAPI(lifespan=lifespan) 52 | 53 | logfire.instrument_fastapi(app, capture_headers=True) 54 | 55 | fastapi_auth_exception_handling(app) 56 | app.include_router(llm_router, prefix='/api/llm') 57 | app.include_router(slack_router, prefix='/api/slack') 58 | app.include_router(main_router, prefix='/api') 59 | app.include_router(web_hooks_router, prefix='/webhooks') 60 | 61 | 62 | @app.get('/robots.txt', response_class=PlainTextResponse) 63 | @app.head('/robots.txt', include_in_schema=False) 64 | async def robots_txt() -> str: 65 | return 'User-agent: *\nDisallow: /\n' 66 | 67 | 68 | @app.get('/health', response_class=PlainTextResponse) 69 | @app.head('/health', include_in_schema=False) 70 | async def health(db: Database) -> str: 71 | async with db.acquire() as con: 72 | version = await con.fetchval('SELECT version()') 73 | return f'pg version: {version}' 74 | 75 | 76 | @app.get('/favicon.ico', status_code=404, response_class=PlainTextResponse) 77 | async def favicon_ico() -> str: 78 | return 'page not found' 79 | 80 | 81 | @app.get('/map.jpg') 82 | async def map_jpg( 83 | http_client: AsyncClientDep, 84 | # Show a map of London by default 85 | lat: Annotated[float, Ge(-85), Le(85)] = 51.5074, 86 | lng: Annotated[float, Ge(-180), Le(180)] = -0.1, 87 | zoom: Annotated[int, Gt(0), Lt(20)] = 10, 88 | width: Annotated[int, Ge(95), Le(1000)] = 600, 89 | height: Annotated[int, Ge(60), Le(1000)] = 400, 90 | scale: Annotated[int, Ge(1), Le(2)] = 1, 91 | ) -> StreamingResponse: 92 | params: dict[str, Any] = {'lat': lat, 'lng': lng, 'zoom': zoom, 'width': width, 'height': height, 'scale': scale} 93 | r = await http_client.get(f'{settings.tiling_server}/map.jpg', params=params) 94 | return StreamingResponse(r.aiter_bytes(), media_type='image/jpeg') 95 | 96 | 97 | @app.get('/{path:path}') 98 | @app.head('/{path:path}', include_in_schema=False) 99 | async def html_landing() -> HTMLResponse: 100 | return HTMLResponse(prebuilt_html(title='Logfire Demo')) 101 | 102 | 103 | def run(): 104 | import uvicorn 105 | 106 | uvicorn.run(app, host='0.0.0.0', port=8000, log_level='info') 107 | -------------------------------------------------------------------------------- /src/webui/llm.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import hashlib 3 | import json 4 | from collections.abc import AsyncIterable 5 | from random import random 6 | from typing import Annotated 7 | from uuid import UUID 8 | 9 | import logfire 10 | import tiktoken 11 | from fastapi import APIRouter 12 | from fastui import AnyComponent, FastUI, events 13 | from fastui import components as c 14 | from fastui.forms import fastui_form 15 | from openai import AsyncOpenAI 16 | from pydantic import BaseModel, Field 17 | from starlette.responses import StreamingResponse 18 | 19 | from ..common import AsyncClientDep 20 | from ..common.db import Database 21 | from .shared import demo_page 22 | 23 | router = APIRouter() 24 | 25 | # This is a workaround for the compatibility issue with the new version of Pydantic 26 | # 
https://github.com/pydantic/FastUI/issues/369 27 | c.Link.model_rebuild() 28 | 29 | 30 | class PromptModel(BaseModel): 31 | prompt: str | None = Field(title='Prompt', description='Ask me (almost) anything', max_length=300) 32 | 33 | 34 | def form_comp(chat_id: UUID) -> c.ModelForm: 35 | return c.ModelForm( 36 | model=PromptModel, 37 | method='POST', 38 | submit_url=f'/api/llm/ask/{chat_id}', 39 | footer=[c.Div(components=[c.Button(text='Ask')], class_name='text-end')], 40 | ) 41 | 42 | 43 | @router.get('', response_model=FastUI, response_model_exclude_none=True) 44 | async def llm_page(db: Database) -> list[AnyComponent]: 45 | async with db.acquire() as conn: 46 | # create a new chat row 47 | chat_id = await conn.fetchval('insert into chats DEFAULT VALUES RETURNING id') 48 | 49 | return demo_page( 50 | c.Link(components=[c.Text(text='back')], on_click=events.BackEvent()), 51 | c.Div( 52 | components=[c.Div(components=[form_comp(chat_id)], class_name='col-md-6')], 53 | class_name='row justify-content-center', 54 | ), 55 | title='LLM Query', 56 | ) 57 | 58 | 59 | @router.post('/ask/{chat_id}', response_model=FastUI, response_model_exclude_none=True) 60 | async def llm_ask( 61 | db: Database, prompt: Annotated[PromptModel, fastui_form(PromptModel)], chat_id: UUID 62 | ) -> list[AnyComponent]: 63 | async with db.acquire() as conn: 64 | # create a new message row 65 | await conn.execute( 66 | """ 67 | insert into messages (chat_id, role, message) VALUES ($1, 'user', $2) 68 | """, 69 | chat_id, 70 | prompt.prompt, 71 | ) 72 | return [ 73 | c.Markdown(text=f'**You asked:** {prompt.prompt}'), 74 | c.ServerLoad(path=f'/llm/ask/stream/{chat_id}', sse=True), 75 | form_comp(chat_id), 76 | ] 77 | 78 | 79 | OPENAI_MODEL = 'gpt-4' 80 | 81 | 82 | @router.get('/ask/stream/{chat_id}') 83 | async def llm_stream(db: Database, http_client: AsyncClientDep, chat_id: UUID) -> StreamingResponse: 84 | async with db.acquire() as conn: 85 | # count tokens used today 86 | tokens_used = await conn.fetchval( 87 | 'select sum(cost) from messages where created_at > current_date and cost is not null' 88 | ) 89 | logfire.info('{cost_today=}', cost_today=tokens_used) 90 | 91 | if tokens_used is not None and tokens_used > 500_000: 92 | content = [_sse_message('**Limit Exceeded**:\n\nDaily token limit exceeded.')] 93 | return StreamingResponse(content, media_type='text/event-stream') 94 | 95 | # get messages from this chat 96 | chat_messages = await conn.fetch( 97 | 'select role, message as content from messages where chat_id = $1 order by created_at', 98 | chat_id, 99 | ) 100 | 101 | questions = '|'.join(m['content'].lower() for m in chat_messages if m['role'] == 'user') 102 | questions_hash = hashlib.md5(questions.encode()).hexdigest() 103 | 104 | opt_chunks = await conn.fetchval('select chunks from llm_results where questions_hash = $1', questions_hash) 105 | 106 | messages = [{'role': 'system', 'content': 'Please respond in markdown only.'}, *map(dict, chat_messages)] 107 | 108 | async def gen_saved(chunks_json: str) -> AsyncIterable[str]: 109 | """ 110 | Generate a result based on previously saved chunks.
111 | """ 112 | chunks = json.loads(chunks_json) 113 | output = '' 114 | try: 115 | await asyncio.sleep(0.5 + random() * 0.5) 116 | with logfire.span('saved result {messages=}', messages=messages) as logfire_span: 117 | for chunk in chunks: 118 | if chunk is not None: 119 | output += chunk 120 | yield _sse_message(f'**{OPENAI_MODEL.upper()}s**:\n\n{output}') 121 | 122 | # 0.12s delay is taken roughly from 123 | # https://github.com/pydantic/FastUI/blob/196414360b69b3dab7012576f852229831307883/demo/sse.py#L66C1-L388C2 124 | await asyncio.sleep(random() * 0.12) 125 | logfire_span.set_attribute('output', output) 126 | finally: 127 | async with db.acquire() as conn: 128 | await conn.execute( 129 | "insert into messages (chat_id, role, message, cost) VALUES ($1, 'system', $2, 0)", 130 | chat_id, 131 | output, 132 | ) 133 | 134 | async def gen_openai() -> AsyncIterable[str]: 135 | output = '' 136 | input_usage = sum(_count_usage(m['content']) for m in messages if m['role'] in ('system', 'user')) 137 | output_usage = 0 138 | output_chunks = [] 139 | try: 140 | openai_client = AsyncOpenAI(http_client=http_client) 141 | logfire.instrument_openai(openai_client=openai_client) 142 | with logfire.span('call openai'): 143 | chunks = await openai_client.chat.completions.create( 144 | model=OPENAI_MODEL, 145 | messages=messages, 146 | stream=True, 147 | stream_options={'include_usage': True}, 148 | ) 149 | 150 | async for chunk in chunks: 151 | if not chunk.choices: 152 | # Ignore the usage chunk at the end 153 | continue 154 | text = chunk.choices[0].delta.content 155 | output_chunks.append(text) 156 | if text is not None: 157 | output += text 158 | yield _sse_message(f'**{OPENAI_MODEL.upper()}**:\n\n{output}') 159 | output_usage = _count_usage(output) 160 | async with db.acquire() as conn: 161 | await conn.execute( 162 | 'insert into llm_results (questions_hash, chunks) VALUES ($1, $2) ON CONFLICT DO NOTHING', 163 | questions_hash, 164 | json.dumps(output_chunks), 165 | ) 166 | finally: 167 | async with db.acquire() as conn: 168 | await conn.execute( 169 | "insert into messages (chat_id, role, message, cost) VALUES ($1, 'system', $2, $3)", 170 | chat_id, 171 | output, 172 | input_usage + output_usage, 173 | ) 174 | 175 | if opt_chunks: 176 | gen = gen_saved(opt_chunks) 177 | else: 178 | gen = gen_openai() 179 | return StreamingResponse(gen, media_type='text/event-stream') 180 | 181 | 182 | TOKEN_ENCODER = tiktoken.encoding_for_model(OPENAI_MODEL) 183 | 184 | 185 | def _count_usage(message: str) -> int: 186 | return len(TOKEN_ENCODER.encode(message)) 187 | 188 | 189 | def _sse_message(markdown: str) -> str: 190 | m = FastUI(root=[c.Markdown(text=markdown)]) 191 | return f'data: {m.model_dump_json(by_alias=True, exclude_none=True)}\n\n' 192 | -------------------------------------------------------------------------------- /src/webui/main.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations as _annotations 2 | 3 | from time import time 4 | 5 | from fastapi import APIRouter 6 | from fastui import AnyComponent, FastUI 7 | from fastui import components as c 8 | from fastui.events import GoToEvent 9 | 10 | from .settings import settings 11 | from .shared import demo_page 12 | 13 | router = APIRouter() 14 | 15 | 16 | @router.get('/', response_model=FastUI, response_model_exclude_none=True) 17 | def api_index() -> list[AnyComponent]: 18 | # language=markdown 19 | markdown = """\ 20 | This site demonstrates [Pydantic 
Logfire](https://docs.logfire.dev). 21 | 22 | You can use the sections below to see how different tasks are recorded by Logfire. 23 | """ 24 | slack_links = [ 25 | f'* [{channel_name}](/slack/{channel_id})' for channel_id, channel_name in settings.slack_channel.items() 26 | ] 27 | return demo_page( 28 | c.Markdown(text=markdown), 29 | c.Div( 30 | components=[ 31 | c.Heading(text='LLM Query', level=2), 32 | c.Link(components=[c.Text(text='Simple LLM question and answer.')], on_click=GoToEvent(url='/llm')), 33 | ], 34 | class_name='border-top mt-3 pt-1', 35 | ), 36 | c.Div( 37 | components=[ 38 | c.Heading(text='Slack Messages Archive', level=2), 39 | c.Markdown(text='\n'.join(slack_links)), 40 | ], 41 | class_name='border-top mt-3 pt-1', 42 | ), 43 | c.Div( 44 | components=[ 45 | c.Heading(text='Distributed Tracing', level=2), 46 | c.Paragraph(text="Here's an image generated by a separate tiling service."), 47 | c.Image(src=f'/map.jpg?v={time()}', alt='Map', width=600, height=400), 48 | ], 49 | class_name='border-top mt-3 pt-1', 50 | ), 51 | ) 52 | 53 | 54 | @router.get('/{path:path}', status_code=404) 55 | async def api_404(): 56 | # so we don't fall through to the index page 57 | return {'message': 'Not Found'} 58 | -------------------------------------------------------------------------------- /src/webui/settings.py: -------------------------------------------------------------------------------- 1 | from pydantic import SecretStr 2 | 3 | from ..common import GeneralSettings 4 | 5 | 6 | class Settings(GeneralSettings): 7 | create_database: bool = True 8 | tiling_server: str = 'http://localhost:8001' 9 | github_webhook_secret: SecretStr = 'test-github-secret' 10 | slack_signing_secret: SecretStr = 'test-slack-signing-secret' 11 | slack_channel: dict[str, str] = {} # mapping between Slack channel IDs and names 12 | 13 | 14 | settings = Settings() # type: ignore 15 | -------------------------------------------------------------------------------- /src/webui/shared.py: -------------------------------------------------------------------------------- 1 | from __future__ import annotations as _annotations 2 | 3 | from fastui import AnyComponent 4 | from fastui import components as c 5 | from fastui.events import GoToEvent 6 | 7 | 8 | def demo_page(*components: AnyComponent, title: str | None = None) -> list[AnyComponent]: 9 | return [ 10 | c.PageTitle(text=f'Logfire Demo — {title}' if title else 'Logfire Demo'), 11 | c.Navbar( 12 | title='Logfire Demo', 13 | title_event=GoToEvent(url='/'), 14 | end_links=[ 15 | c.Link( 16 | components=[c.Text(text='Login')], 17 | on_click=GoToEvent(url='/auth/login/password'), 18 | active='startswith:/auth', 19 | ), 20 | ], 21 | ), 22 | c.Page( 23 | components=[ 24 | *((c.Heading(text=title),) if title else ()), 25 | *components, 26 | ], 27 | ), 28 | c.Footer( 29 | extra_text='Logfire Demo', 30 | links=[ 31 | c.Link(components=[c.Text(text='Docs')], on_click=GoToEvent(url='https://docs.logfire.dev')), 32 | c.Link(components=[c.Text(text='Dashboard')], on_click=GoToEvent(url='https://dash.logfire.dev')), 33 | c.Link(components=[c.Text(text='PyPI')], on_click=GoToEvent(url='https://pypi.org/project/logfire/')), 34 | ], 35 | ), 36 | ] 37 | -------------------------------------------------------------------------------- /src/webui/slack.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, Request 2 | from fastui import FastUI, events 3 | from fastui import components as c 4 | 5 | from ..common.db 
import Database 6 | from ..common.db.slack import get_root_slack_messages, get_slack_thread 7 | from .shared import demo_page 8 | 9 | router = APIRouter() 10 | 11 | 12 | @router.get('/{channel_id}', response_model=FastUI, response_model_exclude_none=True) 13 | async def read_messages(request: Request, db: Database, channel_id: str): 14 | async with db.acquire() as conn: 15 | messages = await get_root_slack_messages(conn, channel_id) 16 | 17 | text = '' 18 | for msg in messages: 19 | text += f'- **@{msg["author"]}** ({msg["ts"]}): _{msg["text"][:50]}_ - [View Thread ({msg["replies_count"]})](/slack/thread/{msg["id"]}) \n\n' 20 | 21 | return demo_page( 22 | c.Link(components=[c.Text(text='back')], on_click=events.BackEvent()), 23 | c.Div(components=[c.Markdown(text=text)]), 24 | title='Logfire Slack Messages', 25 | ) 26 | 27 | 28 | @router.get('/thread/{message_id}', response_model=FastUI, response_model_exclude_none=True) 29 | async def read_thread(request: Request, db: Database, message_id: int): 30 | async with db.acquire() as conn: 31 | messages = await get_slack_thread(conn, message_id) 32 | 33 | text = '' 34 | for i, msg in enumerate(messages): 35 | text += f'{i + 1}. **@{msg["author"]}** ({msg["ts"]}): _{msg["text"]}_ \n\n' 36 | return demo_page( 37 | c.Link(components=[c.Text(text='back')], on_click=events.BackEvent()), 38 | c.Div(components=[c.Markdown(text=text)], class_name='col-md-6'), 39 | title=f'Logfire Slack Messages Thread {message_id}', 40 | ) 41 | -------------------------------------------------------------------------------- /src/webui/web_hooks.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import hmac 3 | import json 4 | from datetime import UTC, datetime 5 | from typing import Annotated, Any 6 | 7 | import logfire 8 | from fastapi import APIRouter, Depends, Header, HTTPException, Request 9 | from openai import AsyncOpenAI 10 | 11 | from ..common.db import Database 12 | from ..common.db.github import create_github_content, get_github_content, update_github_content 13 | from ..common.db.slack import create_slack_message 14 | from ..common.embeddings import generate_embedding, truncate_text_to_token_limit 15 | from .settings import settings 16 | 17 | router = APIRouter() 18 | 19 | 20 | def _get_openai_client(request: Request) -> AsyncOpenAI: 21 | return request.app.state.openai_client 22 | 23 | 24 | AsyncOpenAIClientDep = Annotated[AsyncOpenAI, Depends(_get_openai_client)] 25 | 26 | 27 | async def generate_github_content_embedding(openai_client: AsyncOpenAI, text: str) -> list[float]: 28 | """Generate an embedding for GitHub content.""" 29 | truncated_text = truncate_text_to_token_limit(text) 30 | return await generate_embedding(openai_client, truncated_text) 31 | 32 | 33 | def extract_data(issue: dict[str, Any]) -> tuple[int, str, str, datetime]: 34 | """Extract relevant information from a GitHub issue or comment.""" 35 | issue_id = issue.get('id') 36 | title = issue.get('title') 37 | text = issue.get('body') 38 | if title: 39 | text = f'{title}\n\n{text}' 40 | external_reference = issue.get('html_url') 41 | event_ts = datetime.fromisoformat(issue['created_at'].replace('Z', '+00:00')) 42 | return issue_id, text, external_reference, event_ts 43 | 44 | 45 | def verify_github_signature(secret: str, payload: bytes, signature: str) -> bool: 46 | """Verify GitHub webhook signature (HMAC SHA-256)""" 47 | mac = hmac.new(secret.encode(), msg=payload, digestmod=hashlib.sha256) 48 | expected_signature = 
f'sha256={mac.hexdigest()}' 49 | return hmac.compare_digest(expected_signature, signature) 50 | 51 | 52 | @router.post('/github') 53 | async def github_webhook( 54 | request: Request, 55 | db: Database, 56 | openai_client: AsyncOpenAIClientDep, 57 | x_hub_signature_256: str = Header(None), # GitHub sends signature in headers 58 | ): 59 | """Handle GitHub webhook events""" 60 | payload = await request.body() 61 | 62 | # Verify signature for security 63 | if not verify_github_signature(settings.github_webhook_secret.get_secret_value(), payload, x_hub_signature_256): 64 | raise HTTPException(status_code=403, detail='Invalid signature') 65 | 66 | data = await request.json() # Convert request payload to JSON 67 | event_type = request.headers.get('X-GitHub-Event') # GitHub event type 68 | 69 | if event_type not in ['issues', 'issue_comment']: 70 | logfire.debug('Event not supported: {event_type}', event_type=event_type) 71 | return {'message': 'Event not supported'} 72 | 73 | if event_type == 'issues': 74 | logfire.info('Received GitHub issue event: {gh_data}', gh_data=data) 75 | if data.get('action') == 'opened': 76 | issue = data.get('issue') 77 | if not issue: 78 | logfire.error('Invalid GitHub issue: {gh_data}', gh_data=data) 79 | return {'message': 'Invalid GitHub issue'} 80 | 81 | i_id, i_text, i_external_reference, event_ts = extract_data(issue) 82 | project = data.get('repository', {}).get('name') 83 | embeddings = await generate_github_content_embedding(openai_client, i_text) 84 | async with db.acquire() as conn: 85 | await create_github_content( 86 | conn, project, 'issue', i_id, i_external_reference, i_text, event_ts, embeddings 87 | ) 88 | else: 89 | logfire.debug('Action not supported: {gh_data}', gh_data=data) 90 | return {'message': 'Action not supported'} 91 | elif event_type == 'issue_comment': 92 | logfire.info('Received GitHub comment event: {gh_data}', gh_data=data) 93 | if data.get('action') == 'created': 94 | issue = data.get('issue') 95 | comment = data.get('comment') 96 | if not issue or not comment: 97 | logfire.error('Invalid GitHub issue comment: {gh_data}', gh_data=data) 98 | return {'message': 'Invalid GitHub issue comment'} 99 | 100 | if 'pull_request' in issue: # Ignore pull requests comments 101 | logfire.error('Ignoring comment on GitHub pull request: {gh_data}', gh_data=data) 102 | return {'message': 'Ignoring comment on GitHub pull request'} 103 | 104 | # Comment has to be added to the issue text 105 | project = data.get('repository', {}).get('name') 106 | i_id, _, i_external_reference, _ = extract_data(issue) 107 | async with db.acquire() as conn: 108 | saved_issue = await get_github_content(conn, project, 'issue', i_id) 109 | if not saved_issue: 110 | logfire.error( 111 | 'GitHub issue not found: {external_reference}', external_reference=i_external_reference 112 | ) 113 | return {'message': 'GitHub issue not found'} 114 | 115 | _, c_text, _, _ = extract_data(comment) 116 | text = f'{saved_issue["text"]}\n\n{c_text}' 117 | embeddings = await generate_github_content_embedding(openai_client, text) 118 | await update_github_content(conn, project, 'issue', i_id, text, embeddings) 119 | logfire.info('Updated GitHub issue: {external_reference}', external_reference=i_external_reference) 120 | else: 121 | logfire.debug('Action not supported: {gh_data}', gh_data=data) 122 | return {'message': 'Action not supported'} 123 | 124 | return {'message': 'Webhook received successfully!'} 125 | 126 | 127 | def verify_slack_signature(request: Request, body: bytes, 
slack_signing_secret: str) -> bool: 128 | """Verify Slack request signature for security""" 129 | timestamp = request.headers.get('X-Slack-Request-Timestamp') 130 | slack_signature = request.headers.get('X-Slack-Signature') 131 | 132 | if not timestamp or not slack_signature: 133 | return False 134 | 135 | # Slack signature format: v0=HMAC_SHA256(secret, "v0:{timestamp}:{body}") 136 | basestring = f'v0:{timestamp}:{body.decode("utf-8")}' 137 | calculated_signature = ( 138 | 'v0=' + hmac.new(slack_signing_secret.encode(), basestring.encode(), hashlib.sha256).hexdigest() 139 | ) 140 | 141 | return hmac.compare_digest(calculated_signature, slack_signature) 142 | 143 | 144 | @router.post('/slack/events') 145 | async def slack_events(request: Request, db: Database, openai_client: AsyncOpenAIClientDep): 146 | """Receive Slack messages via webhook""" 147 | body = await request.body() 148 | if not verify_slack_signature(request, body, settings.slack_signing_secret.get_secret_value()): 149 | raise HTTPException(status_code=403, detail='Invalid signature') 150 | 151 | data = json.loads(body) 152 | if data.get('type') == 'url_verification': 153 | # Slack sends a challenge code for verification 154 | return {'challenge': data['challenge']} 155 | 156 | if data.get('type') == 'event_callback': 157 | event = data.get('event', {}) 158 | 159 | logfire.info('Received Slack event: {event}', event=event) 160 | 161 | # Only process messages from allowed channels 162 | if (channel := event.get('channel')) not in settings.slack_channel: 163 | logfire.info('Invalid Slack channel: {channel}', channel=channel) 164 | return {'message': 'Invalid Slack channel'} 165 | 166 | if event.get('type') == 'message' and event.get('subtype') is None: 167 | author = event.get('user') 168 | text = event.get('text') 169 | message_id = event.get('client_msg_id') 170 | ts = datetime.fromtimestamp(float(event.get('ts')), tz=UTC) 171 | event_ts = event.get('event_ts') 172 | parent_event_ts = event.get('thread_ts') 173 | if not author or not text or not message_id or not event_ts: 174 | logfire.error('Invalid Slack message: {event}', event=event) 175 | return {'message': 'Invalid Slack message'} 176 | 177 | embedding = await generate_embedding(openai_client, text) 178 | 179 | async with db.acquire_trans() as conn: 180 | await create_slack_message( 181 | conn, 182 | channel=channel, 183 | author=author, 184 | message_id=message_id, 185 | event_ts=event_ts, 186 | parent_event_ts=parent_event_ts, 187 | text=text, 188 | ts=ts, 189 | embedding=embedding, 190 | ) 191 | 192 | logfire.info('Saved Slack message: {message_id}', message_id=message_id) 193 | 194 | return {'message': 'Event received'} 195 | -------------------------------------------------------------------------------- /src/worker/__init__.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import logging.config 3 | import time 4 | 5 | import asyncpg 6 | import logfire 7 | from arq import cron 8 | from arq.connections import RedisSettings 9 | from arq.worker import run_worker 10 | from httpx import AsyncClient 11 | from openai import AsyncOpenAI 12 | from pydantic_ai import Agent 13 | 14 | from .docs_embeddings import update_docs_embeddings 15 | from .github_similar_content import similar_issue_agent, suggest_similar_issues 16 | from .settings import settings 17 | 18 | 19 | async def startup(ctx): 20 | openai_client = AsyncOpenAI() 21 | 22 | ai_agent = Agent( 23 | 'openai:gpt-4o', 24 | result_type=str, 25 | system_prompt='Be 
concise, reply with maximum 50 tokens.', 26 | ) 27 | 28 | client = AsyncClient() 29 | 30 | ctx.update( 31 | client=client, 32 | pg_pool=await asyncpg.create_pool(settings.pg_dsn), 33 | openai_client=openai_client, 34 | ai_agent=ai_agent, 35 | similar_issue_agent=similar_issue_agent, 36 | ) 37 | 38 | 39 | async def shutdown(ctx): 40 | await ctx['client'].aclose() 41 | await ctx['openai_client'].close() 42 | await asyncio.wait_for(ctx['pg_pool'].close(), timeout=2.0) 43 | 44 | 45 | async def pydantic_doc_embeddings(ctx) -> None: 46 | """Update the embeddings for the pydantic documentation.""" 47 | with logfire.span('update pydantic ai docs embeddings'): 48 | await update_docs_embeddings( 49 | ctx['client'], 50 | ctx['pg_pool'], 51 | ctx['openai_client'], 52 | 'https://docs.pydantic.dev/dev/llms.txt', 53 | 'pydantic_docs', 54 | ) 55 | 56 | 57 | async def pydantic_ai_doc_embeddings(ctx) -> None: 58 | """Update the embeddings for the pydantic ai documentation.""" 59 | with logfire.span('update pydantic ai docs embeddings'): 60 | await update_docs_embeddings( 61 | ctx['client'], ctx['pg_pool'], ctx['openai_client'], 'https://ai.pydantic.dev/llms.txt', 'pydantic_ai_docs' 62 | ) 63 | 64 | 65 | async def logfire_doc_embeddings(ctx) -> None: 66 | """Update the embeddings for the logfire documentation.""" 67 | with logfire.span('update logfire docs embeddings'): 68 | await update_docs_embeddings( 69 | ctx['client'], 70 | ctx['pg_pool'], 71 | ctx['openai_client'], 72 | 'https://logfire.pydantic.dev/docs/llms.txt', 73 | 'logfire_docs', 74 | ) 75 | 76 | 77 | QUESTIONS = [ 78 | 'What is Pydantic?', 79 | 'What is PydanticAI?', 80 | 'What is Pydantic Logfire?', 81 | 'What are the main features of PydanticAI?', 82 | 'What are the main features of Pydantic Logfire?', 83 | 'Where is the documentation for the Pydantic Logfire schema?', 84 | 'What database does Pydantic Logfire use?', 85 | 'Where can I find the Pydantic public slack contact details?', 86 | "What's the url for the Pydantic Logfire docs?", 87 | 'How do I invite my team members to Logfire?', 88 | ] 89 | 90 | 91 | async def llm_query(ctx) -> None: 92 | """Query the LLM model with some questions.""" 93 | with logfire.span('query llm'): 94 | question_index = int(time.time() // (5 * 60)) % len(QUESTIONS) # Divide time into 5-minute intervals 95 | question = QUESTIONS[question_index] 96 | response = await ctx['ai_agent'].run(question) 97 | logfire.info('Question: {question} Answer: {response}', question=question, response=response.data) 98 | 99 | 100 | async def check_new_created_issues(ctx) -> None: 101 | """Suggest similar issues for new issues and post them as comments.""" 102 | with logfire.span('check new issues for similarity'): 103 | await suggest_similar_issues( 104 | ctx['pg_pool'], 105 | ctx['similar_issue_agent'], 106 | ctx['client'], 107 | settings.vector_distance_threshold, 108 | settings.ai_similarity_threshold, 109 | ) 110 | 111 | 112 | class WorkerSettings: 113 | functions = [ 114 | pydantic_doc_embeddings, 115 | pydantic_ai_doc_embeddings, 116 | logfire_doc_embeddings, 117 | llm_query, 118 | check_new_created_issues, 119 | ] 120 | on_startup = startup 121 | on_shutdown = shutdown 122 | redis_settings = RedisSettings.from_dsn(settings.redis_dsn) 123 | cron_jobs = [ 124 | cron(pydantic_ai_doc_embeddings, hour={10, 22}, minute=0), 125 | cron(logfire_doc_embeddings, hour={1, 13}, minute=0), 126 | cron(pydantic_doc_embeddings, hour={2, 14}, minute=0), 127 | cron(llm_query, minute={0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55}), 128 | 
cron(check_new_created_issues, minute={0, 10, 20, 30, 40, 50}), 129 | ] 130 | 131 | 132 | def run(): 133 | logging.config.dictConfig( 134 | { 135 | 'version': 1, 136 | 'disable_existing_loggers': False, 137 | 'handlers': { 138 | 'logfire': {'level': 'INFO', 'class': 'logfire.integrations.logging.LogfireLoggingHandler'}, 139 | }, 140 | 'loggers': {'arq': {'handlers': ['logfire'], 'level': 'INFO'}}, 141 | } 142 | ) 143 | 144 | run_worker(WorkerSettings) # type: ignore 145 | -------------------------------------------------------------------------------- /src/worker/docs_embeddings.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import asyncpg 4 | import logfire 5 | from httpx import AsyncClient 6 | from openai import AsyncOpenAI 7 | 8 | from ..common.embeddings import ( 9 | TOKEN_LIMIT, 10 | EmbeddingsSource, 11 | count_tokens, 12 | create_embeddings, 13 | delete_embeddings_by_hash, 14 | generate_embedding, 15 | get_stored_embeddings_hash_by_source, 16 | hash_text, 17 | ) 18 | 19 | 20 | async def get_content(client: AsyncClient, url: str) -> str: 21 | with logfire.span('Reading from {url=}', url=url): 22 | r = await client.get(url) 23 | r.raise_for_status() 24 | return r.content.decode() 25 | 26 | 27 | def split_large_text(text: str, max_tokens: int = TOKEN_LIMIT) -> list[str]: 28 | """Splits text into smaller chunks by paragraph if it exceeds max_tokens.""" 29 | paragraphs = text.split('\n\n') # Split by double newlines (paragraphs) 30 | chunks = [] 31 | current_chunk = [] 32 | 33 | for paragraph in paragraphs: 34 | current_chunk.append(paragraph) 35 | chunk_text = '\n\n'.join(current_chunk) 36 | 37 | if count_tokens(chunk_text) > max_tokens: 38 | # Remove the last added paragraph and store the chunk 39 | current_chunk.pop() 40 | chunks.append('\n\n'.join(current_chunk)) 41 | current_chunk = [paragraph] # Start new chunk 42 | 43 | # Add remaining content 44 | if current_chunk: 45 | chunks.append('\n\n'.join(current_chunk)) 46 | 47 | return chunks 48 | 49 | 50 | def split_markdown_sections(content: str) -> list[dict[str, str]]: 51 | """Splits a Markdown file into sections based on headers, ensuring each section is <= 8192 tokens.""" 52 | pattern = r'^(#{1,6})\s+(.*)$' 53 | matches = re.finditer(pattern, content, re.MULTILINE) 54 | 55 | sections = [] 56 | last_index = 0 57 | 58 | for match in matches: 59 | header, title = match.groups() 60 | start = match.start() 61 | 62 | if sections: 63 | section_content = content[last_index:start].strip() 64 | # Split if content exceeds token limit 65 | if count_tokens(section_content) > TOKEN_LIMIT: 66 | section_chunks = split_large_text(section_content, TOKEN_LIMIT) 67 | for chunk in section_chunks: 68 | sections.append( 69 | { 70 | 'level': sections[-1]['level'], 71 | 'title': sections[-1]['title'], 72 | 'content': chunk, 73 | } 74 | ) 75 | else: 76 | sections[-1]['content'] = section_content 77 | 78 | sections.append({'level': len(header), 'title': title.strip(), 'content': ''}) 79 | last_index = start 80 | 81 | # Process the last section 82 | if sections: 83 | last_content = content[last_index:].strip() 84 | if count_tokens(last_content) > TOKEN_LIMIT: 85 | section_chunks = split_large_text(last_content, TOKEN_LIMIT) 86 | for chunk in section_chunks: 87 | sections.append( 88 | { 89 | 'level': sections[-1]['level'], 90 | 'title': sections[-1]['title'], 91 | 'content': chunk, 92 | } 93 | ) 94 | else: 95 | sections[-1]['content'] = last_content 96 | 97 | return sections 98 | 99 | 100 | async def 
update_docs_embeddings(
101 |     client: AsyncClient, pg_pool: asyncpg.Pool, openai_client: AsyncOpenAI, url: str, source: EmbeddingsSource
102 | ) -> None:
103 |     content = await get_content(client, url)
104 |     sections = split_markdown_sections(content)
105 | 
106 |     async with pg_pool.acquire() as conn:
107 |         hashes: set[str] = set()
108 |         stored_hashes = await get_stored_embeddings_hash_by_source(conn, source)
109 | 
110 |         for section in sections:
111 |             try:
112 |                 section_content = f'{section["title"]} {section["content"]}'
113 |                 text_hash = hash_text(section_content)
114 |                 hashes.add(text_hash)
115 |                 if text_hash in stored_hashes:
116 |                     logfire.info('Skipping {text_hash=}', text_hash=text_hash)
117 |                     continue
118 |                 embeddings = await generate_embedding(openai_client, section_content)
119 |                 await create_embeddings(
120 |                     conn,
121 |                     source=source,
122 |                     text=section_content,
123 |                     text_hash=text_hash,
124 |                     embedding=embeddings,
125 |                 )
126 |             except Exception as exc:
127 |                 logfire.error('Failed to update docs embeddings {exc!r}', exc=exc)
128 | 
129 |         # Remove old embeddings that are not in the new content
130 |         hashes_to_delete = stored_hashes - hashes
131 |         if hashes_to_delete:
132 |             await delete_embeddings_by_hash(conn, hashes_to_delete, source)
133 | 
--------------------------------------------------------------------------------
/src/worker/github_similar_content.py:
--------------------------------------------------------------------------------
1 | import time
2 | from typing import Any
3 | 
4 | import asyncpg
5 | import jwt
6 | import logfire
7 | from httpx import AsyncClient
8 | from pydantic import BaseModel, Field
9 | from pydantic_ai import Agent
10 | from pydantic_ai.models import ModelSettings
11 | 
12 | from ..common.db.github import (
13 |     GithubContentProject,
14 |     fetch_issues_for_similarity_check,
15 |     find_similar_issues,
16 |     update_similar_issues,
17 | )
18 | from .settings import settings
19 | 
20 | 
21 | class SimilarityResult(BaseModel):
22 |     percentage: int = Field(description='Similarity of the issues', ge=0, le=100)
23 |     reason: str = Field(description='Reason for the similarity')
24 | 
25 | 
26 | similar_issue_agent = Agent(
27 |     'openai:gpt-4o',
28 |     result_type=SimilarityResult,
29 |     model_settings=ModelSettings(temperature=0.1),
30 |     system_prompt=(
31 |         """
32 | Your task is to provide a detailed similarity analysis while maintaining strict output format requirements.
33 | 
34 | ANALYSIS CRITERIA:
35 | 1. Semantic Similarity (40% weight)
36 |    - Core problem or feature request
37 |    - Technical domain and scope
38 |    - Expected behavior and outcomes
39 | 
40 | 2. Implementation Details (30% weight)
41 |    - Technical approach suggested
42 |    - Dependencies mentioned
43 |    - Code snippets or examples
44 | 
45 | 3. Context & Requirements (30% weight)
46 |    - Project context and constraints
47 |    - User impact and priorities
48 |    - Environment and version details
49 | 
50 | SIMILARITY SCALE:
51 | 0-20%: Fundamentally different issues
52 | 21-40%: Slight overlaps but largely distinct
53 | 41-60%: Moderate similarity in some aspects
54 | 61-80%: Significant overlap in core aspects
55 | 81-100%: Nearly identical issues
56 | 
57 | RULES:
58 | - Ignore superficial similarities (writing style, formatting)
59 | - Consider partial matches in technical requirements
60 | - Account for implicit similarities in problem domain
61 | - Look for shared root causes in bug reports
62 | - Consider related feature requests as partial matches
63 | 
64 | OUTPUT FORMAT:
65 | 1. 
Provide a single integer similarity score (0-100) 66 | 2. The score must be divisible by 5 (e.g., 75 not 77) 67 | 3. No explanation unless explicitly requested 68 | 69 | EXAMPLE PAIRS AND SCORES: 70 | 71 | # High Similarity (80-100%) 72 | Issue 1: "Error: Connection timeout when processing large files >500MB" 73 | Issue 2: "Timeout occurred during batch processing of files >1GB" 74 | Score: 85 75 | Reason: Nearly identical core issue (timeout during large file processing), same technical domain, similar scope 76 | 77 | Issue 1: "Add dark mode support to dashboard UI" 78 | Issue 2: "Implement dark theme for main dashboard" 79 | Score: 90 80 | Reason: Same feature request, same component, identical scope 81 | 82 | # Moderate Similarity (40-79%) 83 | Issue 1: "Redis connection fails with timeout after 30 seconds" 84 | Issue 2: "MongoDB connection timeout in high-load scenarios" 85 | Score: 60 86 | Reason: Similar problem (database timeout) but different databases and contexts 87 | 88 | Issue 1: "Add user authentication via Google OAuth" 89 | Issue 2: "Implement SSO support for Google accounts" 90 | Score: 75 91 | Reason: Related authentication features with overlapping implementation 92 | 93 | # Low Similarity (0-39%) 94 | Issue 1: "Browser crashes when uploading large files" 95 | Issue 2: "Timeout during large file upload" 96 | Score: 35 97 | Reason: Different core issues (crash vs timeout) despite similar trigger 98 | 99 | Issue 1: "Add PDF export functionality" 100 | Issue 2: "Fix PDF rendering bug in preview" 101 | Score: 25 102 | Reason: Same component (PDF) but different types of issues (feature vs bug) 103 | 104 | # Zero Similarity 105 | Issue 1: "Update documentation for API endpoints" 106 | Issue 2: "Fix memory leak in image processing" 107 | Score: 0 108 | Reason: Completely different domains, types, and purposes 109 | """ 110 | ), 111 | ) 112 | 113 | 114 | def _generate_query(issue_1_text: str, issue_2_text: str) -> str: 115 | return f""" 116 | Are these two GitHub issues similar? 117 | **Issue 1:** 118 | "{issue_1_text}" 119 | 120 | **Issue 2:** 121 | "{issue_2_text}" 122 | """ 123 | 124 | 125 | async def _generate_github_app_access_token( 126 | client: AsyncClient, app_id: int, installation_id: int, private_key: str 127 | ) -> str: 128 | """Generate a GitHub App access token.""" 129 | # Generate a GitHub App JWT 130 | now = int(time.time()) 131 | payload = {'iat': now, 'exp': now + 600, 'iss': app_id} 132 | jwt_token = jwt.encode(payload, private_key, algorithm='RS256') 133 | 134 | # Get Installation Access Token 135 | url = f'https://api.github.com/app/installations/{installation_id}/access_tokens' 136 | headers = {'Authorization': f'Bearer {jwt_token}', 'Accept': 'application/vnd.github.v3+json'} 137 | response = await client.post(url, headers=headers) 138 | return response.json().get('token') 139 | 140 | 141 | async def _post_github_comment( 142 | client: AsyncClient, 143 | access_token: str, 144 | project: GithubContentProject, 145 | issue_link: str, 146 | similar_issues: list[dict[str, Any]], 147 | ) -> None: 148 | # Find the issue number from the issue link 149 | issue_number = issue_link.split('/')[-1] 150 | url = f'https://api.github.com/repos/pydantic/{project}/issues/{issue_number}/comments' 151 | 152 | # Generate the comment body 153 | issue_links = '\n'.join( 154 | [ 155 | f'{i + 1}. 
"{similar_issue["link"]}" ({similar_issue["ai_similarity"]}% similar)' 156 | for i, similar_issue in enumerate(similar_issues) 157 | ] 158 | ) 159 | body = f'PydanticAI Github Bot Found {len(similar_issues)} issues similar to this one: \n{issue_links}' 160 | 161 | response = await client.post( 162 | url, 163 | json={'body': body}, 164 | headers={'Authorization': f'Bearer {access_token}', 'Accept': 'application/vnd.github.v3+json'}, 165 | ) 166 | response.raise_for_status() 167 | 168 | 169 | async def suggest_similar_issues( 170 | pg_pool: asyncpg.Pool, 171 | similar_issue_agent: Agent, 172 | client: AsyncClient, 173 | vector_distance_threshold: float, 174 | ai_similarity_threshold: int, 175 | ) -> None: 176 | github_access_token = None 177 | 178 | async with pg_pool.acquire() as conn: 179 | # Fetch new issues for similarity check 180 | issues = await fetch_issues_for_similarity_check(conn) 181 | if not issues: 182 | logfire.info('No new issues found') 183 | return 184 | logfire.info(f'Found {len(issues)} new issues') 185 | 186 | for issue in issues: 187 | issue_link = issue['external_reference'] 188 | with logfire.span(f'Checking issue {issue_link}'): 189 | # Fetch similar issues by vector similarity 190 | similar_issues = await find_similar_issues(conn, issue['id'], issue['project']) 191 | logfire.info(f'Found {len(similar_issues)} similar issues for issue {issue_link}') 192 | 193 | similar_issues_obj: list[dict[str, Any]] = [] 194 | for similar_issue in similar_issues: 195 | similar_issue_link = similar_issue['external_reference'] 196 | distance = similar_issue['distance'] 197 | obj = { 198 | 'link': similar_issue_link, 199 | 'distance': distance, 200 | 'ai_similarity': None, 201 | 'post_comment': False, 202 | } 203 | # Skip similar issues with distance > vector_distance_threshold 204 | # It could be done in database level, but we did it here to see some 205 | # similar issues in logs. This help us to adjust the threshold 206 | if distance <= vector_distance_threshold: 207 | # Get similarity percentage from the AI agent 208 | logfire.info( 209 | f'Checking similarity between issue {issue_link} and similar issue {similar_issue_link}' 210 | ) 211 | similarity_result = await similar_issue_agent.run( 212 | _generate_query(issue['text'], similar_issue['text']) 213 | ) 214 | obj['ai_similarity'] = similarity_result.data.percentage 215 | if similarity_result.data.percentage > ai_similarity_threshold: 216 | obj['post_comment'] = True 217 | else: 218 | logfire.info(f'Skipping similar issue {similar_issue_link} due to distance {distance}') 219 | 220 | similar_issues_obj.append(obj) 221 | 222 | # Filter similar issues to post comments 223 | issues_to_comment = [issue for issue in similar_issues_obj if issue['post_comment']] 224 | if not issues_to_comment: 225 | logfire.info(f'No similar issues found for {issue_link}') 226 | else: 227 | # Github access token is valid for 10 minutes. We need to generate a new one 228 | # if we don't have it. As the task runs every 10 minutes, we need to generate 229 | # a new token every time the task runs. 
230 | if not github_access_token: 231 | github_access_token = await _generate_github_app_access_token( 232 | client, 233 | settings.github_app_id, 234 | settings.github_app_installation_id, 235 | settings.github_app_private_key, 236 | ) 237 | await _post_github_comment( 238 | client, github_access_token, issue['project'], issue_link, issues_to_comment 239 | ) 240 | logfire.info(f'Posted similar issues for {issue_link}') 241 | 242 | # Update the similar issues in the database 243 | await update_similar_issues(conn, issue['id'], similar_issues_obj) 244 | -------------------------------------------------------------------------------- /src/worker/settings.py: -------------------------------------------------------------------------------- 1 | from pydantic import Field 2 | 3 | from ..common import GeneralSettings 4 | 5 | 6 | class Settings(GeneralSettings): 7 | github_app_id: int 8 | github_app_installation_id: int 9 | github_app_private_key: str 10 | vector_distance_threshold: float = Field(0.4, ge=0.0, le=1.0) 11 | ai_similarity_threshold: int = Field(85, ge=0, le=100) 12 | 13 | 14 | settings = Settings() # type: ignore 15 | --------------------------------------------------------------------------------
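An illustrative sketch for exercising the GitHub webhook locally: the handler in src/webui/web_hooks.py only accepts requests whose raw body is signed with HMAC SHA-256 using GITHUB_WEBHOOK_SECRET and sent in the X-Hub-Signature-256 header. The snippet below shows one way to post a signed test event; it assumes the compose stack is running with the default `test-github-secret` from src/webui/settings.py and guesses a `/webhooks/github` mount point for the router (check src/webui/main.py for the real prefix), and the handler will go on to call OpenAI and Postgres, so those services need to be reachable.

    import hashlib
    import hmac
    import json

    import httpx

    # Hypothetical test data; the secret matches the webui default, not a real credential.
    secret = 'test-github-secret'
    payload = json.dumps(
        {
            'action': 'opened',
            'issue': {
                'id': 1,
                'title': 'Example issue',
                'body': 'Something looks broken.',
                'html_url': 'https://github.com/example/repo/issues/1',
                'created_at': '2024-01-01T00:00:00Z',
            },
            'repository': {'name': 'repo'},
        }
    ).encode()

    # Same scheme as verify_github_signature: 'sha256=' + hex HMAC of the raw request body.
    signature = 'sha256=' + hmac.new(secret.encode(), payload, hashlib.sha256).hexdigest()

    response = httpx.post(
        'http://localhost:8000/webhooks/github',  # assumed mount point
        content=payload,
        headers={
            'X-Hub-Signature-256': signature,
            'X-GitHub-Event': 'issues',
            'Content-Type': 'application/json',
        },
    )
    print(response.status_code, response.json())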