├── ingest ├── .python-version ├── pyproject.toml ├── tiger_docs_config.toml ├── README.md ├── postgres_docs.py └── tiger_docs.py ├── .prettierignore ├── .dockerignore ├── src ├── config.ts ├── types.ts ├── serverInfo.ts ├── stdio.ts ├── index.ts ├── apis │ ├── index.ts │ ├── viewSkill.ts │ ├── kewordSearchTigerDocs.ts │ ├── semanticSearchTigerDocs.ts │ └── semanticSearchPostgresDocs.ts ├── httpServer.ts ├── util │ └── featureFlags.ts ├── migrate.ts └── skillutils │ └── index.ts ├── .prettierrc.mjs ├── .env.sample ├── .gitignore ├── bun ├── docker └── tsdb │ └── 100_setup_db.sql ├── tsconfig.json ├── .github └── workflows │ ├── deploy-feature-branch.yaml │ ├── lint.yml │ ├── build-on-feature-branch.yaml │ ├── build-and-deploy-on-merge.yaml │ ├── ingest-tiger-docs.yaml │ ├── ingest-postgres-docs.yaml │ └── publish.yml ├── NOTICE ├── Dockerfile ├── migrations ├── 1759241361471-add-version-index.js ├── 1759241172003-add-hnsw-indexes.js ├── 1759851009030-add-tiger-indexes.js └── 1756387543053-initial.js ├── docker-compose.yml ├── CLAUDE.md ├── eslint.config.mjs ├── .claude-plugin └── marketplace.json ├── package.json ├── API.md ├── generate-server.json.ts ├── DEVELOPMENT.md ├── LICENSE ├── README.md └── skills ├── find-hypertable-candidates └── SKILL.md ├── migrate-postgres-tables-to-hypertables └── SKILL.md ├── design-postgres-tables └── SKILL.md └── setup-timescaledb-hypertables └── SKILL.md /ingest/.python-version: -------------------------------------------------------------------------------- 1 | 3.13 2 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | dist/ 3 | .venv/ -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .github/ 2 | chart/ 3 | dist/ 4 | node_modules/ 5 | -------------------------------------------------------------------------------- /src/config.ts: -------------------------------------------------------------------------------- 1 | export const schema = process.env.DB_SCHEMA || 'docs'; 2 | -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | import type { Pool } from 'pg'; 2 | 3 | export interface ServerContext extends Record { 4 | pgPool: Pool; 5 | schema: string; 6 | } 7 | -------------------------------------------------------------------------------- /.prettierrc.mjs: -------------------------------------------------------------------------------- 1 | export default { 2 | singleQuote: true, 3 | tabWidth: 2, 4 | trailingComma: 'all', 5 | bracketSpacing: true, 6 | arrowParens: 'always', 7 | printWidth: 80, 8 | }; 9 | -------------------------------------------------------------------------------- /.env.sample: -------------------------------------------------------------------------------- 1 | # Database 2 | PGHOST=db 3 | PGPORT=5432 4 | PGDATABASE=tsdb 5 | PGUSER=tsdbadmin 6 | PGPASSWORD=password 7 | DB_SCHEMA=docs 8 | 9 | # OpenAI Embedding API Key 10 | OPENAI_API_KEY=sk- 11 | 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | node_modules 3 | .env 4 | ingest/build 5 | ingest/postgres 6 | .idea 7 | .migrate 8 | 
.mcpregistry_* 9 | download/ 10 | server.json 11 | 12 | # Temporary files directory 13 | tmp/ 14 | -------------------------------------------------------------------------------- /src/serverInfo.ts: -------------------------------------------------------------------------------- 1 | import { Pool } from 'pg'; 2 | 3 | import { schema } from './config.js'; 4 | import { ServerContext } from './types.js'; 5 | 6 | export const serverInfo = { 7 | name: 'pg-aiguide', 8 | version: '1.0.0', 9 | } as const; 10 | 11 | const pgPool = new Pool(); 12 | 13 | export const context: ServerContext = { pgPool, schema }; 14 | -------------------------------------------------------------------------------- /src/stdio.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import { stdioServerFactory } from '@tigerdata/mcp-boilerplate'; 3 | import { apiFactories } from './apis/index.js'; 4 | import { promptFactories } from './skillutils/index.js'; 5 | import { context, serverInfo } from './serverInfo.js'; 6 | 7 | stdioServerFactory({ 8 | ...serverInfo, 9 | context, 10 | apiFactories, 11 | promptFactories, 12 | }); 13 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import 'dotenv/config'; 3 | import { cliEntrypoint } from '@tigerdata/mcp-boilerplate'; 4 | 5 | import { dirname, join } from 'path'; 6 | import { fileURLToPath } from 'url'; 7 | 8 | const __dirname = dirname(fileURLToPath(import.meta.url)); 9 | 10 | cliEntrypoint( 11 | join(__dirname, 'stdio.js'), 12 | join(__dirname, 'httpServer.js'), 13 | ).catch(console.error); 14 | -------------------------------------------------------------------------------- /bun: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | version="bun-v1.3.3" 4 | downloadDir=$(pwd)/download/bun/${version} 5 | bunCmd="$downloadDir/bin/bun" 6 | 7 | if [ ! -f "$bunCmd" ]; then 8 | echo Installing bun to "$bunCmd" 9 | bashArgs=() 10 | if [ "$version" != "latest" ]; then 11 | bashArgs=(-s "$version") 12 | fi 13 | curl -fsSL https://bun.sh/install | BUN_INSTALL="$downloadDir" bash "${bashArgs[@]}" 14 | fi 15 | 16 | exec "$bunCmd" "$@" -------------------------------------------------------------------------------- /docker/tsdb/100_setup_db.sql: -------------------------------------------------------------------------------- 1 | -- Sets up database similar to how Tiger Cloud works where we have a 2 | -- tsdbadmin user that is not a superuser. 
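-- Note (editorial, assumption based on docker-compose.yml): this script is mounted into
-- /docker-entrypoint-initdb.d/, so it only runs when the local db_data volume is first
-- initialized; the hard-coded password is intended for local development only.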
3 | CREATE ROLE tsdbadmin 4 | WITH 5 | LOGIN PASSWORD 'password'; 6 | 7 | CREATE DATABASE tsdb 8 | WITH 9 | OWNER tsdbadmin; 10 | 11 | \c tsdb 12 | 13 | CREATE EXTENSION IF NOT EXISTS vector CASCADE; 14 | 15 | -- Create schema for docs 16 | CREATE SCHEMA IF NOT EXISTS docs AUTHORIZATION tsdbadmin; 17 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "outDir": "./dist", 4 | "rootDir": "./src", 5 | "target": "ES2022", 6 | "module": "Node16", 7 | "moduleResolution": "Node16", 8 | "strict": true, 9 | "esModuleInterop": true, 10 | "skipLibCheck": true, 11 | "forceConsistentCasingInFileNames": true, 12 | "resolveJsonModule": true 13 | }, 14 | "include": ["./src/**/*.ts"], 15 | "exclude": ["node_modules", "dist"] 16 | } 17 | -------------------------------------------------------------------------------- /ingest/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "docs-importer" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | dependencies = [ 8 | "beautifulsoup4>=4.13.5", 9 | "langchain-text-splitters>=0.3.9", 10 | "markdownify>=1.1.0", 11 | "openai>=1.97.1", 12 | "psycopg[binary,pool]>=3.2.9", 13 | "python-dotenv[cli]>=1.1.1", 14 | "scrapy>=2.13.3", 15 | "tiktoken>=0.11.0", 16 | ] 17 | -------------------------------------------------------------------------------- /src/apis/index.ts: -------------------------------------------------------------------------------- 1 | import { semanticSearchPostgresDocsFactory } from './semanticSearchPostgresDocs.js'; 2 | import { semanticSearchTigerDocsFactory } from './semanticSearchTigerDocs.js'; 3 | import { viewSkillFactory } from './viewSkill.js'; 4 | import { keywordSearchTigerDocsFactory } from './kewordSearchTigerDocs.js'; 5 | 6 | export const apiFactories = [ 7 | keywordSearchTigerDocsFactory, 8 | semanticSearchPostgresDocsFactory, 9 | semanticSearchTigerDocsFactory, 10 | viewSkillFactory, 11 | ] as const; 12 | -------------------------------------------------------------------------------- /ingest/tiger_docs_config.toml: -------------------------------------------------------------------------------- 1 | # Configuration for domain-specific element removal 2 | # Add CSS selectors to ignore for each domain 3 | 4 | [domain_selectors] 5 | "www.tigerdata.com" = [ 6 | "script", 7 | "style", 8 | "nav", 9 | "footer", 10 | "#plan-availability", 11 | ".sr-only", 12 | ".code-block-copy-button" 13 | ] 14 | # Add more domains as needed 15 | 16 | # Default selectors applied to all domains 17 | [default_selectors] 18 | selectors = [ 19 | "script", 20 | "style", 21 | "nav", 22 | "footer" 23 | ] 24 | -------------------------------------------------------------------------------- /.github/workflows/deploy-feature-branch.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy Feature Branch 2 | on: 3 | workflow_dispatch: 4 | 5 | jobs: 6 | deploy: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Dispatch Workflow 10 | uses: timescale/workflow-dispatch-action@main 11 | with: 12 | github-token: ${{ secrets.ORG_GITHUB_AGENTS_TOKEN }} 13 | owner: timescale 14 | repo: tiger-agents-deploy 15 | workflow_id: deploy.yaml 16 | ref: 'main' 17 | inputs: '{"repository": "pg-aiguide", "sha": "${{ github.sha }}"}' 18 | 
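# Editorial note: a feature-branch deploy can also be kicked off from a local machine,
# e.g. `gh workflow run deploy-feature-branch.yaml --ref <your-branch>`
# (assumes the GitHub CLI is installed and authenticated for this repository).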
-------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | 8 | jobs: 9 | lint: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | - uses: actions/setup-node@v5 15 | with: 16 | node-version: 22 17 | cache: npm 18 | - name: Install dependencies 19 | run: npm ci 20 | - name: Run ESLint 21 | run: npm run lint 22 | - name: Run Prettier 23 | run: npm run prettier:check 24 | - name: Run build 25 | run: npm run build --if-present 26 | -------------------------------------------------------------------------------- /.github/workflows/build-on-feature-branch.yaml: -------------------------------------------------------------------------------- 1 | name: Build Docker - Feature Branch Push 2 | on: 3 | push: 4 | branches-ignore: 5 | - main 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Dispatch Workflow 12 | uses: timescale/workflow-dispatch-action@main 13 | with: 14 | github-token: ${{ secrets.ORG_GITHUB_AGENTS_TOKEN }} 15 | owner: timescale 16 | repo: tiger-agents-deploy 17 | workflow_id: build.yaml 18 | ref: 'main' 19 | inputs: '{"repository": "pg-aiguide", "sha": "${{ github.sha }}"}' 20 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2025 Timescale, Inc., d/b/a Tiger Data 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /src/httpServer.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import { httpServerFactory, log } from '@tigerdata/mcp-boilerplate'; 3 | import { apiFactories } from './apis/index.js'; 4 | import { promptFactories } from './skillutils/index.js'; 5 | import { runMigrations } from './migrate.js'; 6 | import { context, serverInfo } from './serverInfo.js'; 7 | 8 | log.info('starting server...'); 9 | try { 10 | log.info('Running database migrations...'); 11 | await runMigrations(); 12 | log.info('Database migrations completed successfully'); 13 | } catch (error) { 14 | log.error('Database migration failed:', error as Error); 15 | throw error; 16 | } 17 | 18 | export const { registerCleanupFn } = httpServerFactory({ 19 | ...serverInfo, 20 | context, 21 | apiFactories, 22 | promptFactories, 23 | stateful: false, 24 | }); 25 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:22-alpine AS builder 2 | 3 | COPY package*.json /app/ 4 | COPY tsconfig.json /app/ 5 | COPY src /app/src 6 | COPY skills /app/skills 7 | COPY migrations /app/migrations 8 | 9 | WORKDIR /app 10 | 11 | RUN --mount=type=cache,target=/root/.npm npm install 12 | 13 | FROM node:22-alpine AS release 14 | 15 | LABEL io.modelcontextprotocol.server.name="io.github.timescale/pg-aiguide" 16 | 17 | WORKDIR /app 18 | 19 | COPY --from=builder /app/dist /app/dist 20 | COPY --from=builder /app/skills /app/skills 21 | COPY --from=builder /app/package.json /app/package.json 22 | COPY --from=builder /app/package-lock.json /app/package-lock.json 23 | COPY --from=builder /app/migrations /app/migrations 24 | 25 | ENV NODE_ENV=production 26 | 27 | RUN npm ci --ignore-scripts --omit-dev 28 | 29 | CMD ["node", "dist/index.js", "http"] 30 | -------------------------------------------------------------------------------- /migrations/1759241361471-add-version-index.js: -------------------------------------------------------------------------------- 1 | import 'dotenv/config'; 2 | import { Client } from 'pg'; 3 | 4 | const schema = process.env.DB_SCHEMA || 'docs'; 5 | 6 | export const description = 'Add index on postgres_pages.version'; 7 | 8 | export async function up() { 9 | const client = new Client(); 10 | 11 | try { 12 | await client.connect(); 13 | await client.query(/* sql */ ` 14 | CREATE INDEX CONCURRENTLY IF NOT EXISTS postgres_pages_version_idx 15 | ON ${schema}.postgres_pages (version); 16 | `); 17 | } catch (e) { 18 | throw e; 19 | } finally { 20 | await client.end(); 21 | } 22 | } 23 | 24 | export async function down() { 25 | const client = new Client(); 26 | 27 | try { 28 | await client.connect(); 29 | await client.query(/* sql */ ` 30 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.postgres_pages_version_idx; 31 | `); 32 | } catch (e) { 33 | throw e; 34 | } finally { 35 | await client.end(); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | db: 3 | image: timescale/timescaledb-ha:pg17 4 | environment: 5 | - POSTGRES_USER=postgres 6 | - POSTGRES_PASSWORD=postgres 7 | ports: 8 | - '5432:5432' 9 | volumes: 10 | - db_data:/home/postgres/pgdata/data 11 | - 
./docker/tsdb/100_setup_db.sql:/docker-entrypoint-initdb.d/100_setup_db.sql 12 | healthcheck: 13 | test: ['CMD-SHELL', 'pg_isready -U postgres -d postgres'] 14 | interval: 1s 15 | timeout: 5s 16 | retries: 50 17 | 18 | app: 19 | build: 20 | context: . 21 | target: builder 22 | depends_on: 23 | db: 24 | condition: service_healthy 25 | env_file: .env 26 | ports: 27 | - '3020:3001' 28 | volumes: 29 | - ./migrations:/app/migrations 30 | - ./src:/app/src 31 | - ./skills:/app/skills 32 | - ./package.json:/app/package.json 33 | - ./package-lock.json:/app/package-lock.json 34 | - ./tsconfig.json:/app/tsconfig.json 35 | command: npm run watch:http 36 | 37 | volumes: 38 | db_data: 39 | -------------------------------------------------------------------------------- /src/util/featureFlags.ts: -------------------------------------------------------------------------------- 1 | import type { McpFeatureFlags } from '@tigerdata/mcp-boilerplate'; 2 | 3 | export interface FeatureFlags { 4 | mcpSkillsEnabled: boolean; 5 | } 6 | 7 | /** 8 | * Parse feature flags from query parameters or environment variables 9 | * Supports both HTTP (?disable_mcp_skills=1) and stdio transport (env var) 10 | */ 11 | export const parseFeatureFlags = ( 12 | query?: McpFeatureFlags['query'], 13 | ): FeatureFlags => { 14 | // Default: skills enabled 15 | let mcpSkillsEnabled = true; 16 | 17 | // Check query parameters first (for HTTP transport) 18 | if (query) { 19 | if ( 20 | query.disable_mcp_skills === '1' || 21 | query.disable_mcp_skills === 'true' 22 | ) { 23 | mcpSkillsEnabled = false; 24 | } 25 | } 26 | // Fall back to environment variables (for stdio transport) 27 | else if (process.env.DISABLE_MCP_SKILLS) { 28 | if ( 29 | process.env.DISABLE_MCP_SKILLS === '1' || 30 | process.env.DISABLE_MCP_SKILLS === 'true' 31 | ) { 32 | mcpSkillsEnabled = false; 33 | } 34 | } 35 | 36 | return { 37 | mcpSkillsEnabled, 38 | }; 39 | }; 40 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # Tiger Docs MCP Server - Development Guidelines 2 | 3 | ## Build, Test & Run Commands 4 | 5 | - Build: `npm run build` - Compiles TypeScript to JavaScript 6 | - Watch mode: `npm run watch` - Watches for changes and rebuilds automatically 7 | - Run server: `npm run start` - Starts the MCP server using stdio transport 8 | - Prepare release: `npm run prepare` - Builds the project for publishing 9 | 10 | ## Code Style Guidelines 11 | 12 | - Use ES modules with `.js` extension in import paths 13 | - Strictly type all functions and variables with TypeScript 14 | - Follow zod schema patterns for tool input validation 15 | - Use `.nullable()` instead of `.optional()` for optional MCP tool parameters (required for gpt-5 compatibility) 16 | - Prefer async/await over callbacks and Promise chains 17 | - Place all imports at top of file, grouped by external then internal 18 | - Use descriptive variable names that clearly indicate purpose 19 | - Implement proper cleanup for timers and resources in server shutdown 20 | - Follow camelCase for variables/functions, PascalCase for types/classes, UPPER_CASE for constants 21 | - Handle errors with try/catch blocks and provide clear error messages 22 | - Use consistent indentation (2 spaces) and trailing commas in multi-line objects 23 | -------------------------------------------------------------------------------- /migrations/1759241172003-add-hnsw-indexes.js: 
-------------------------------------------------------------------------------- 1 | import 'dotenv/config'; 2 | import { Client } from 'pg'; 3 | 4 | const schema = process.env.DB_SCHEMA || 'docs'; 5 | 6 | export const description = 'Add HNSW indexes to embedding columns'; 7 | 8 | export async function up() { 9 | const client = new Client(); 10 | 11 | try { 12 | await client.connect(); 13 | await client.query(/* sql */ ` 14 | CREATE INDEX CONCURRENTLY IF NOT EXISTS postgres_chunks_embedding_idx 15 | ON ${schema}.postgres_chunks 16 | USING hnsw (embedding vector_cosine_ops); 17 | `); 18 | await client.query(/* sql */ ` 19 | CREATE INDEX CONCURRENTLY IF NOT EXISTS timescale_chunks_embedding_idx 20 | ON ${schema}.timescale_chunks 21 | USING hnsw (embedding vector_cosine_ops); 22 | `); 23 | } catch (e) { 24 | throw e; 25 | } finally { 26 | await client.end(); 27 | } 28 | } 29 | 30 | export async function down() { 31 | const client = new Client(); 32 | 33 | try { 34 | await client.connect(); 35 | await client.query(/* sql */ ` 36 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.postgres_chunks_embedding_idx; 37 | `); 38 | await client.query(/* sql */ ` 39 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.timescale_chunks_embedding_idx; 40 | `); 41 | } catch (e) { 42 | throw e; 43 | } finally { 44 | await client.end(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /eslint.config.mjs: -------------------------------------------------------------------------------- 1 | // @ts-check 2 | 3 | import boilerplatePlugin from '@tigerdata/mcp-boilerplate/eslintPlugin'; 4 | import eslint from '@eslint/js'; 5 | import { defineConfig } from 'eslint/config'; 6 | import { dirname } from 'path'; 7 | import tseslint from 'typescript-eslint'; 8 | import { fileURLToPath } from 'url'; 9 | 10 | const __dirname = dirname(fileURLToPath(import.meta.url)); 11 | 12 | export default defineConfig( 13 | eslint.configs.recommended, 14 | tseslint.configs.recommended, 15 | { 16 | files: ['src/**/*.ts'], 17 | plugins: { 18 | 'mcp-boilerplate': boilerplatePlugin, 19 | }, 20 | languageOptions: { 21 | parserOptions: { 22 | project: './tsconfig.json', 23 | tsconfigRootDir: __dirname, 24 | }, 25 | }, 26 | rules: { 27 | // Disable base rule for unused vars and use TypeScript-specific one 28 | 'no-unused-vars': 'off', 29 | '@typescript-eslint/no-unused-vars': [ 30 | 'error', 31 | { argsIgnorePattern: '^_' }, 32 | ], 33 | '@typescript-eslint/explicit-function-return-type': 'warn', 34 | '@typescript-eslint/no-inferrable-types': 'warn', 35 | 'prefer-const': 'error', 36 | // Custom rule to prevent .optional() in inputSchema 37 | 'mcp-boilerplate/no-optional-input-schema': 'error', 38 | }, 39 | }, 40 | { 41 | ignores: ['dist/', 'node_modules/', 'migrations/', 'skills/'], 42 | }, 43 | ); 44 | -------------------------------------------------------------------------------- /.claude-plugin/marketplace.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "aiguide", 3 | "owner": { 4 | "name": "TigerData", 5 | "url": "https://tigerdata.com", 6 | "email": "support@tigerdata.com" 7 | }, 8 | "metadata": { 9 | "description": "PostgreSQL documentation and ecosystem tools marketplace", 10 | "version": "1.0.0", 11 | "pluginRoot": "." 
12 | }, 13 | "plugins": [ 14 | { 15 | "name": "pg", 16 | "source": "./", 17 | "description": "Comprehensive PostgreSQL documentation and best practices through semantic search and curated skills, including ecosystem tools like TimescaleDB and Tiger Cloud", 18 | "version": "0.1.0", 19 | "author": { 20 | "name": "TigerData", 21 | "url": "https://tigerdata.com" 22 | }, 23 | "homepage": "https://tigerdata.com", 24 | "repository": "https://github.com/timescale/pg-aiguide", 25 | "license": "Apache-2.0", 26 | "keywords": [ 27 | "postgresql", 28 | "postgres", 29 | "database", 30 | "sql", 31 | "skills", 32 | "aiguide", 33 | "timescaledb", 34 | "documentation", 35 | "semantic-search", 36 | "best-practices" 37 | ], 38 | "category": "database", 39 | "mcpServers": { 40 | "pg-aiguide": { 41 | "type": "http", 42 | "url": "https://mcp.tigerdata.com/docs?disable_mcp_skills=1" 43 | } 44 | }, 45 | "strict": false 46 | } 47 | ] 48 | } 49 | -------------------------------------------------------------------------------- /.github/workflows/build-and-deploy-on-merge.yaml: -------------------------------------------------------------------------------- 1 | name: Build and Deploy - Merge main 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Dispatch Workflow 12 | uses: timescale/workflow-dispatch-action@main 13 | with: 14 | github-token: ${{ secrets.ORG_GITHUB_AGENTS_TOKEN }} 15 | owner: timescale 16 | repo: tiger-agents-deploy 17 | workflow_id: build.yaml 18 | ref: 'main' 19 | inputs: > 20 | { 21 | "repository": "pg-aiguide", 22 | "sha": "${{ github.sha }}", 23 | "latest": true 24 | } 25 | 26 | deploy: 27 | runs-on: ubuntu-latest 28 | needs: build 29 | strategy: 30 | matrix: 31 | include: 32 | - env: dev 33 | namespace: savannah-system 34 | - env: prod 35 | namespace: tiger-mcp 36 | steps: 37 | - name: Dispatch Workflow - ${{ matrix.env }} 38 | uses: timescale/workflow-dispatch-action@main 39 | with: 40 | github-token: ${{ secrets.ORG_GITHUB_AGENTS_TOKEN }} 41 | owner: timescale 42 | repo: tiger-agents-deploy 43 | workflow_id: deploy.yaml 44 | ref: 'main' 45 | inputs: > 46 | { 47 | "repository": "pg-aiguide", 48 | "sha": "${{ github.sha }}", 49 | "env": "${{ matrix.env }}", 50 | "namespace": "${{ matrix.namespace }}" 51 | } 52 | -------------------------------------------------------------------------------- /.github/workflows/ingest-tiger-docs.yaml: -------------------------------------------------------------------------------- 1 | name: Ingest Tiger Docs 2 | 3 | on: 4 | schedule: 5 | - cron: '0 2 * * 0' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | ingest-dev: 10 | name: Ingest Tiger docs for Dev 11 | runs-on: ubuntu-latest 12 | defaults: 13 | run: 14 | working-directory: ingest 15 | env: 16 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 17 | PGHOST: ${{ secrets.PGHOST }} 18 | PGPORT: ${{ secrets.PGPORT }} 19 | PGDATABASE: ${{ secrets.PGDATABASE }} 20 | PGUSER: ${{ secrets.PGUSER }} 21 | PGPASSWORD: ${{ secrets.PGPASSWORD }} 22 | steps: &ingest-steps 23 | - name: Checkout repository 24 | uses: actions/checkout@v5 25 | 26 | - name: Set up Python 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version-file: ./ingest/.python-version 30 | 31 | - name: Install uv 32 | uses: astral-sh/setup-uv@v6 33 | 34 | - name: Install python dependencies 35 | run: uv sync 36 | 37 | - name: Ingest Tiger docs 38 | run: uv run python tiger_docs.py 39 | 40 | ingest-prod: 41 | name: Ingest Tiger docs for Prod 42 | runs-on: ubuntu-latest 43 | defaults: 44 | 
run: 45 | working-directory: ingest 46 | env: 47 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 48 | PGHOST: ${{ secrets.PROD_PGHOST }} 49 | PGPORT: ${{ secrets.PROD_PGPORT }} 50 | PGDATABASE: ${{ secrets.PROD_PGDATABASE }} 51 | PGUSER: ${{ secrets.PROD_PGUSER }} 52 | PGPASSWORD: ${{ secrets.PROD_PGPASSWORD }} 53 | steps: *ingest-steps 54 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@tigerdata/pg-aiguide", 3 | "version": "0.2.4", 4 | "description": "Comprehensive PostgreSQL documentation and best practices through semantic search and curated skills, including ecosystem tools like TimescaleDB and Tiger Cloud", 5 | "license": "Apache-2.0", 6 | "author": "TigerData", 7 | "homepage": "https://tigerdata.com", 8 | "repository": { 9 | "type": "git", 10 | "url": "https://github.com/timescale/pg-aiguide" 11 | }, 12 | "mcpName": "io.github.timescale/pg-aiguide", 13 | "type": "module", 14 | "bin": { 15 | "pg-aiguide": "dist/index.js" 16 | }, 17 | "files": [ 18 | "dist" 19 | ], 20 | "scripts": { 21 | "build": "tsc && shx chmod +x dist/*.js", 22 | "prepare": "npm run build", 23 | "watch": "tsx watch src/index.ts stdio", 24 | "watch:http": "tsx watch src/index.ts http", 25 | "start": "node dist/index.js stdio", 26 | "start:http": "node dist/index.js http", 27 | "inspector": "npx @modelcontextprotocol/inspector", 28 | "lint": "eslint", 29 | "lint:fix": "eslint --fix", 30 | "prettier:check": "prettier --check .", 31 | "prettier:write": "prettier --write .", 32 | "migrate": "migrate" 33 | }, 34 | "dependencies": { 35 | "@ai-sdk/openai": "^2.0.80", 36 | "@tigerdata/mcp-boilerplate": "^0.8.0", 37 | "ai": "^5.0.108", 38 | "dotenv": "^17.2.3", 39 | "gray-matter": "^4.0.3", 40 | "migrate": "^2.1.0", 41 | "pg": "^8.16.3", 42 | "zod": "^3.25.76" 43 | }, 44 | "devDependencies": { 45 | "@eslint/js": "^9.39.1", 46 | "@types/node": "^22.19.2", 47 | "@types/pg": "^8.15.6", 48 | "eslint": "^9.39.1", 49 | "prettier": "^3.7.4", 50 | "shx": "^0.4.0", 51 | "tsx": "^4.21.0", 52 | "typescript": "^5.9.3", 53 | "typescript-eslint": "^8.49.0" 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /migrations/1759851009030-add-tiger-indexes.js: -------------------------------------------------------------------------------- 1 | import 'dotenv/config'; 2 | import { Client } from 'pg'; 3 | 4 | const schema = process.env.DB_SCHEMA || 'docs'; 5 | 6 | export const description = 'Add HNSW indexes to embedding columns'; 7 | 8 | export async function up() { 9 | const client = new Client(); 10 | 11 | try { 12 | await client.connect(); 13 | await client.query(/* sql */ ` 14 | CREATE INDEX CONCURRENTLY IF NOT EXISTS timescale_pages_domain_idx 15 | ON ${schema}.timescale_pages(domain); 16 | `); 17 | await client.query(/* sql */ ` 18 | CREATE INDEX CONCURRENTLY IF NOT EXISTS timescale_pages_url_idx 19 | ON ${schema}.timescale_pages(url); 20 | `); 21 | await client.query(/* sql */ ` 22 | CREATE INDEX CONCURRENTLY IF NOT EXISTS timescale_chunks_page_id_idx 23 | ON ${schema}.timescale_chunks(page_id); 24 | `); 25 | await client.query(/* sql */ ` 26 | CREATE INDEX CONCURRENTLY IF NOT EXISTS timescale_chunks_metadata_idx 27 | ON ${schema}.timescale_chunks 28 | USING gin(metadata); 29 | `); 30 | } catch (e) { 31 | throw e; 32 | } finally { 33 | await client.end(); 34 | } 35 | } 36 | 37 | export async function down() { 38 | const client = new Client(); 
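// Mirrors up(): each index is dropped CONCURRENTLY so reads and writes on the
// timescale_pages/timescale_chunks tables are not blocked while it is removed.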
39 | 40 | try { 41 | await client.connect(); 42 | await client.query(/* sql */ ` 43 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.timescale_pages_domain_idx; 44 | `); 45 | await client.query(/* sql */ ` 46 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.timescale_pages_url_idx; 47 | `); 48 | await client.query(/* sql */ ` 49 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.timescale_chunks_page_id_idx; 50 | `); 51 | await client.query(/* sql */ ` 52 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.timescale_chunks_metadata_idx; 53 | `); 54 | } catch (e) { 55 | throw e; 56 | } finally { 57 | await client.end(); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /.github/workflows/ingest-postgres-docs.yaml: -------------------------------------------------------------------------------- 1 | name: Ingest PostgreSQL Docs 2 | run-name: Ingest PostgreSQL ${{ inputs.version }} Docs 3 | 4 | on: 5 | workflow_dispatch: 6 | inputs: 7 | version: 8 | description: 'PostgreSQL version to ingest (e.g. 14, 15, etc.)' 9 | required: true 10 | default: '17' 11 | 12 | jobs: 13 | ingest-dev: 14 | name: Ingest PostgreSQL docs for Dev 15 | runs-on: ubuntu-latest 16 | defaults: 17 | run: 18 | working-directory: ingest 19 | env: 20 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 21 | PGHOST: ${{ secrets.PGHOST }} 22 | PGPORT: ${{ secrets.PGPORT }} 23 | PGDATABASE: ${{ secrets.PGDATABASE }} 24 | PGUSER: ${{ secrets.PGUSER }} 25 | PGPASSWORD: ${{ secrets.PGPASSWORD }} 26 | steps: &ingest-steps 27 | - name: Checkout repository 28 | uses: actions/checkout@v5 29 | 30 | - name: Set up Python 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version-file: ./ingest/.python-version 34 | 35 | - name: Install uv 36 | uses: astral-sh/setup-uv@v6 37 | 38 | - name: Install python dependencies 39 | run: uv sync 40 | 41 | - name: Install system dependencies 42 | run: | 43 | sudo apt-get update 44 | sudo apt-get install -y docbook-xml docbook-xsl libxml2-utils xsltproc fop 45 | 46 | - name: Ingest PostgreSQL ${{ github.event.inputs.version }} docs for dev 47 | run: uv run python postgres_docs.py ${{ github.event.inputs.version }} 48 | 49 | ingest-prod: 50 | name: Ingest PostgreSQL docs for Prod 51 | runs-on: ubuntu-latest 52 | defaults: 53 | run: 54 | working-directory: ingest 55 | env: 56 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 57 | PGHOST: ${{ secrets.PROD_PGHOST }} 58 | PGPORT: ${{ secrets.PROD_PGPORT }} 59 | PGDATABASE: ${{ secrets.PROD_PGDATABASE }} 60 | PGUSER: ${{ secrets.PROD_PGUSER }} 61 | PGPASSWORD: ${{ secrets.PROD_PGPASSWORD }} 62 | steps: *ingest-steps 63 | -------------------------------------------------------------------------------- /src/apis/viewSkill.ts: -------------------------------------------------------------------------------- 1 | import { ApiFactory, InferSchema } from '@tigerdata/mcp-boilerplate'; 2 | import { z } from 'zod'; 3 | import { ServerContext } from '../types.js'; 4 | import { skills, viewSkillContent } from '../skillutils/index.js'; 5 | import { parseFeatureFlags } from '../util/featureFlags.js'; 6 | 7 | // Create enum schema dynamically 8 | const inputSchema = { 9 | name: z 10 | .enum(Array.from(skills.keys()) as [string, ...string[]]) 11 | .describe('The name of the skill to retrieve'), 12 | } as const; 13 | 14 | // Path within the skill directory - currently fixed to SKILL.md 15 | const SKILL_PATH = 'SKILL.md'; 16 | 17 | const outputSchema = { 18 | name: z.string().describe('The name of the requested skill'), 19 | path: 
z.string().describe('The path within the skill (e.g., "SKILL.md")'), 20 | description: z.string().describe('Description of what this skill does'), 21 | content: z.string().describe('The full skill content'), 22 | } as const; 23 | 24 | type OutputSchema = InferSchema; 25 | 26 | export const viewSkillFactory: ApiFactory< 27 | ServerContext, 28 | typeof inputSchema, 29 | typeof outputSchema 30 | > = (_context, { query }) => { 31 | // Parse feature flags from query or environment 32 | const flags = parseFeatureFlags(query); 33 | 34 | return { 35 | name: 'view_skill', 36 | disabled: !flags.mcpSkillsEnabled, 37 | config: { 38 | title: 'View Skill', 39 | description: `Retrieve detailed skills for TimescaleDB operations and best practices. 40 | 41 | Available Skills: 42 | 43 | ${Array.from(skills.values()) 44 | .map((s) => `**${s.name}** - ${s.description}`) 45 | .join('\n\n')} 46 | `, 47 | inputSchema, 48 | outputSchema, 49 | }, 50 | fn: async ({ name }): Promise => { 51 | const skill = skills.get(name); 52 | 53 | if (!skill) { 54 | throw new Error(`Skill '${name}' not found`); 55 | } 56 | 57 | const content = await viewSkillContent(name, SKILL_PATH); 58 | 59 | return { 60 | name: skill.name, 61 | path: SKILL_PATH, 62 | description: skill.description || '', 63 | content, 64 | }; 65 | }, 66 | }; 67 | }; 68 | -------------------------------------------------------------------------------- /API.md: -------------------------------------------------------------------------------- 1 | # API 2 | 3 | All methods are exposed as MCP tools. 4 | 5 | ## Semantic Search 6 | 7 | ### `semantic_search_postgres_docs` 8 | 9 | Searches the PostgreSQL documentation for relevant entries based on semantic similarity to the search prompt. 10 | 11 | **MCP Tool**: `semantic_search_postgres_docs` 12 | 13 | #### Input 14 | 15 | ```jsonc 16 | { 17 | "prompt": "What is the SQL command to create a table?", 18 | "version": 17, // optional, default is 17 (supports versions 14-18) 19 | "limit": 10, // optional, default is 10 20 | } 21 | ``` 22 | 23 | #### Output 24 | 25 | ```jsonc 26 | { 27 | "results": [ 28 | { 29 | "id": 11716, 30 | "content": "CREATE TABLE ...", 31 | "metadata": "{...}", // JSON-encoded metadata 32 | "distance": 0.407, // lower = more relevant 33 | }, 34 | // ...more results 35 | ], 36 | } 37 | ``` 38 | 39 | ### `semantic_search_tiger_docs` 40 | 41 | Searches the TigerData and TimescaleDB documentation using semantic similarity. 42 | 43 | **MCP Tool**: `semantic_search_tiger_docs` 44 | 45 | #### Input 46 | 47 | ```jsonc 48 | { 49 | "prompt": "How do I set up continuous aggregates?", 50 | "limit": 10, // optional, default is 10 51 | } 52 | ``` 53 | 54 | #### Output 55 | 56 | Same format as PostgreSQL semantic search above. 57 | 58 | ## Skills 59 | 60 | ### `view_skill` 61 | 62 | Retrieves curated skills for common PostgreSQL and TimescaleDB tasks. This tool is disabled 63 | when deploying as a claude plugin (which use [agent skills ](https://www.claude.com/blog/skills) directly). 
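When running the server directly, the same behavior is controlled by a feature flag: the HTTP transport accepts a `disable_mcp_skills=1` query parameter (the Claude plugin marketplace entry points at `https://mcp.tigerdata.com/docs?disable_mcp_skills=1`), and the stdio transport honors the `DISABLE_MCP_SKILLS` environment variable; see `src/util/featureFlags.ts`.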
64 | 65 | **MCP Tool**: `view_skill` 66 | 67 | ### Input 68 | 69 | ```jsonc 70 | { 71 | "name": "setup-timescaledb-hypertables", // see available skills in tool description 72 | "path": "SKILL.md", // optional, defaults to "SKILL.md" 73 | } 74 | ``` 75 | 76 | ### Output 77 | 78 | ```jsonc 79 | { 80 | "name": "setup-timescaledb-hypertables", 81 | "path": "SKILL.md", 82 | "description": "Step-by-step instructions for designing table schemas and setting up TimescaleDB with hypertables, indexes, compression, retention policies, and continuous aggregates.", 83 | "content": "...", // full skill content 84 | } 85 | ``` 86 | 87 | **Available Skills**: Check the MCP tool description for the current list of available skills or look in the `skills` directory. 88 | -------------------------------------------------------------------------------- /src/apis/kewordSearchTigerDocs.ts: -------------------------------------------------------------------------------- 1 | import { ApiFactory, InferSchema } from '@tigerdata/mcp-boilerplate'; 2 | import { z } from 'zod'; 3 | import { ServerContext } from '../types.js'; 4 | 5 | const inputSchema = { 6 | limit: z.coerce 7 | .number() 8 | .int() 9 | .describe('The maximum number of matches to return. Defaults to 10.'), 10 | keywords: z.string().describe('The set of keywords to search for.'), 11 | } as const; 12 | 13 | const zEmbeddedDoc = z.object({ 14 | id: z 15 | .number() 16 | .int() 17 | .describe('The unique identifier of the documentation entry.'), 18 | content: z.string().describe('The content of the documentation entry.'), 19 | metadata: z 20 | .string() 21 | .describe( 22 | 'Additional metadata about the documentation entry, as a JSON encoded string.', 23 | ), 24 | score: z 25 | .number() 26 | .describe( 27 | 'The score indicating the relevance of the entry to the keywords. Higher values indicate higher relevance.', 28 | ), 29 | }); 30 | 31 | type EmbeddedDoc = z.infer; 32 | 33 | const outputSchema = { 34 | results: z.array(zEmbeddedDoc), 35 | } as const; 36 | 37 | type OutputSchema = InferSchema; 38 | 39 | export const keywordSearchTigerDocsFactory: ApiFactory< 40 | ServerContext, 41 | typeof inputSchema, 42 | typeof outputSchema, 43 | z.infer<(typeof outputSchema)['results']> 44 | > = ({ pgPool, schema }) => ({ 45 | name: 'keyword_search_tiger_docs', 46 | method: 'get', 47 | route: '/keyword-search/tiger-docs', 48 | config: { 49 | title: 'Keyword Search of Tiger Documentation', 50 | description: 51 | 'This retrieves relevancy ranked documentation entries based on a set of keywords, using a bm25 search. 
The content covers Tiger Cloud and TimescaleDB topics.', 52 | inputSchema, 53 | outputSchema, 54 | }, 55 | disabled: process.env.ENABLE_KEYWORD_SEARCH !== 'true', 56 | fn: async ({ keywords, limit }): Promise => { 57 | if (limit < 0) { 58 | throw new Error('Limit must be a non-negative integer.'); 59 | } 60 | if (!keywords.trim()) { 61 | throw new Error('Keywords must be a non-empty string.'); 62 | } 63 | 64 | const result = await pgPool.query( 65 | /* sql */ ` 66 | SELECT 67 | id::int, 68 | content, 69 | metadata::text, 70 | -(content <@> to_bm25query($1, 'docs.timescale_chunks_content_idx')) as score 71 | FROM ${schema}.timescale_chunks 72 | ORDER BY content <@> to_bm25query($1, 'docs.timescale_chunks_content_idx') 73 | LIMIT $2 74 | `, 75 | [keywords, limit || 10], 76 | ); 77 | 78 | return { 79 | results: result.rows, 80 | }; 81 | }, 82 | pickResult: (r) => r.results, 83 | }); 84 | -------------------------------------------------------------------------------- /generate-server.json.ts: -------------------------------------------------------------------------------- 1 | import { writeFile } from 'fs/promises'; 2 | 3 | const version = process.argv[2]?.trim().replace(/^v/, ''); 4 | if (!version) { 5 | console.error('Must provide version as first argument'); 6 | process.exit(1); 7 | } 8 | 9 | const environmentVariables = [ 10 | { 11 | description: 'Your API key for text embeddings via OpenAI', 12 | isRequired: true, 13 | format: 'string', 14 | isSecret: true, 15 | name: 'OPENAI_API_KEY', 16 | }, 17 | { 18 | description: 'PostgreSQL host to connect to', 19 | isRequired: true, 20 | format: 'string', 21 | isSecret: true, 22 | name: 'PGHOST', 23 | }, 24 | { 25 | description: 'PostgreSQL port to connect to', 26 | isRequired: true, 27 | format: 'number', 28 | isSecret: true, 29 | name: 'PGPORT', 30 | }, 31 | { 32 | description: 'PostgreSQL user to connect as', 33 | isRequired: true, 34 | format: 'string', 35 | isSecret: true, 36 | name: 'PGUSER', 37 | }, 38 | { 39 | description: 'PostgreSQL password to connect with', 40 | isRequired: true, 41 | format: 'string', 42 | isSecret: true, 43 | name: 'PGPASSWORD', 44 | }, 45 | { 46 | description: 'PostgreSQL database to connect to', 47 | isRequired: true, 48 | format: 'string', 49 | isSecret: true, 50 | name: 'PGDATABASE', 51 | }, 52 | { 53 | description: 'PostgreSQL database schema to use', 54 | isRequired: false, 55 | format: 'string', 56 | isSecret: true, 57 | name: 'DB_SCHEMA', 58 | }, 59 | ]; 60 | 61 | const output = { 62 | $schema: 63 | 'https://static.modelcontextprotocol.io/schemas/2025-10-17/server.schema.json', 64 | name: 'io.github.timescale/pg-aiguide', 65 | // max length 100 chars: 66 | description: 67 | 'Comprehensive PostgreSQL documentation and best practices, including ecosystem tools', 68 | repository: { 69 | url: 'https://github.com/timescale/pg-aiguide', 70 | source: 'github', 71 | }, 72 | version, 73 | remotes: [ 74 | { 75 | type: 'streamable-http', 76 | url: 'https://mcp.tigerdata.com/docs', 77 | }, 78 | ], 79 | packages: [ 80 | { 81 | registryType: 'npm', 82 | identifier: '@tigerdata/pg-aiguide', 83 | version, 84 | transport: { 85 | type: 'stdio', 86 | }, 87 | environmentVariables, 88 | }, 89 | { 90 | registryType: 'oci', 91 | identifier: `ghcr.io/timescale/pg-aiguide:${version}`, 92 | transport: { 93 | type: 'stdio', 94 | }, 95 | environmentVariables, 96 | }, 97 | ], 98 | }; 99 | 100 | await writeFile('server.json', JSON.stringify(output, null, 2)); 101 | 
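// Invoked from .github/workflows/publish.yml as `./bun ./generate-server.json.ts vX.Y.Z`;
// the generated server.json is what `mcp-publisher publish` uploads to the MCP Registry.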
-------------------------------------------------------------------------------- /src/apis/semanticSearchTigerDocs.ts: -------------------------------------------------------------------------------- 1 | import { openai } from '@ai-sdk/openai'; 2 | import { ApiFactory, InferSchema } from '@tigerdata/mcp-boilerplate'; 3 | import { embed } from 'ai'; 4 | import { z } from 'zod'; 5 | import { ServerContext } from '../types.js'; 6 | 7 | const inputSchema = { 8 | limit: z.coerce 9 | .number() 10 | .int() 11 | .describe('The maximum number of matches to return. Defaults to 10.'), 12 | prompt: z 13 | .string() 14 | .describe( 15 | 'The natural language query used to search the documentation for relevant information.', 16 | ), 17 | } as const; 18 | 19 | const zEmbeddedDoc = z.object({ 20 | id: z 21 | .number() 22 | .int() 23 | .describe('The unique identifier of the documentation entry.'), 24 | content: z.string().describe('The content of the documentation entry.'), 25 | metadata: z 26 | .string() 27 | .describe( 28 | 'Additional metadata about the documentation entry, as a JSON encoded string.', 29 | ), 30 | distance: z 31 | .number() 32 | .describe( 33 | 'The distance score indicating the relevance of the entry to the prompt. Lower values indicate higher relevance.', 34 | ), 35 | }); 36 | 37 | type EmbeddedDoc = z.infer; 38 | 39 | const outputSchema = { 40 | results: z.array(zEmbeddedDoc), 41 | } as const; 42 | 43 | type OutputSchema = InferSchema; 44 | 45 | export const semanticSearchTigerDocsFactory: ApiFactory< 46 | ServerContext, 47 | typeof inputSchema, 48 | typeof outputSchema, 49 | z.infer<(typeof outputSchema)['results']> 50 | > = ({ pgPool, schema }) => ({ 51 | name: 'semantic_search_tiger_docs', 52 | method: 'get', 53 | route: '/semantic-search/tiger-docs', 54 | config: { 55 | title: 'Semantic Search of Tiger Documentation Embeddings', 56 | description: 57 | 'This retrieves relevant documentation entries based on a natural language query. 
The content covers Tiger Cloud and TimescaleDB topics.', 58 | inputSchema, 59 | outputSchema, 60 | }, 61 | fn: async ({ prompt, limit }): Promise => { 62 | if (limit < 0) { 63 | throw new Error('Limit must be a non-negative integer.'); 64 | } 65 | if (!prompt.trim()) { 66 | throw new Error('Prompt must be a non-empty string.'); 67 | } 68 | 69 | const { embedding } = await embed({ 70 | model: openai.embedding('text-embedding-3-small'), 71 | value: prompt, 72 | }); 73 | 74 | const result = await pgPool.query( 75 | /* sql */ ` 76 | SELECT 77 | id::int, 78 | content, 79 | metadata::text, 80 | embedding <=> $1::vector(1536) AS distance 81 | FROM ${schema}.timescale_chunks 82 | ORDER BY distance 83 | LIMIT $2 84 | `, 85 | [JSON.stringify(embedding), limit || 10], 86 | ); 87 | 88 | return { 89 | results: result.rows, 90 | }; 91 | }, 92 | pickResult: (r) => r.results, 93 | }); 94 | -------------------------------------------------------------------------------- /migrations/1756387543053-initial.js: -------------------------------------------------------------------------------- 1 | import 'dotenv/config'; 2 | import { Client } from 'pg'; 3 | 4 | const schema = process.env.DB_SCHEMA || 'docs'; 5 | 6 | export const description = 'Create schema and docs tables'; 7 | 8 | export async function up() { 9 | const client = new Client(); 10 | 11 | try { 12 | await client.connect(); 13 | await client.query('BEGIN'); 14 | await client.query(/* sql */ ` 15 | CREATE EXTENSION IF NOT EXISTS vector; 16 | 17 | CREATE TABLE ${schema}.postgres_pages ( 18 | id int4 PRIMARY KEY generated by default as identity 19 | , version int2 NOT NULL 20 | , url TEXT UNIQUE NOT NULL 21 | , domain TEXT NOT NULL 22 | , filename TEXT NOT NULL 23 | , content_length INTEGER 24 | , scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP 25 | , chunking_method TEXT DEFAULT 'header' 26 | , chunks_count INTEGER DEFAULT 0 27 | ); 28 | 29 | CREATE TABLE IF NOT EXISTS ${schema}.postgres_chunks ( 30 | id int4 PRIMARY KEY generated by default as identity 31 | , page_id INTEGER REFERENCES ${schema}.postgres_pages(id) ON DELETE CASCADE 32 | , chunk_index INTEGER NOT NULL 33 | , sub_chunk_index INTEGER NOT NULL DEFAULT 0 34 | , content TEXT NOT NULL 35 | , metadata JSONB 36 | , embedding vector(1536) 37 | , created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP 38 | ); 39 | 40 | CREATE TABLE ${schema}.timescale_pages ( 41 | id int4 PRIMARY KEY generated by default as identity 42 | , url TEXT UNIQUE NOT NULL 43 | , domain TEXT NOT NULL 44 | , filename TEXT NOT NULL 45 | , content_length INTEGER 46 | , scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP 47 | , chunking_method TEXT DEFAULT 'header' 48 | , chunks_count INTEGER DEFAULT 0 49 | ); 50 | 51 | CREATE TABLE IF NOT EXISTS ${schema}.timescale_chunks ( 52 | id int4 PRIMARY KEY generated by default as identity 53 | , page_id INTEGER REFERENCES ${schema}.timescale_pages(id) ON DELETE CASCADE 54 | , chunk_index INTEGER NOT NULL 55 | , sub_chunk_index INTEGER NOT NULL DEFAULT 0 56 | , content TEXT NOT NULL 57 | , metadata JSONB 58 | , embedding vector(1536) 59 | , created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP 60 | ); 61 | `); 62 | 63 | await client.query('COMMIT'); 64 | } catch (e) { 65 | await client.query('ROLLBACK'); 66 | throw e; 67 | } finally { 68 | await client.end(); 69 | } 70 | } 71 | 72 | export async function down() { 73 | const client = new Client(); 74 | 75 | try { 76 | await client.connect(); 77 | await client.query(/* sql */ ` 78 | DROP TABLE IF EXISTS ${schema}.timescale_chunks; 79 | DROP TABLE IF 
EXISTS ${schema}.timescale_pages; 80 | DROP TABLE IF EXISTS ${schema}.postgres_chunks; 81 | DROP TABLE IF EXISTS ${schema}.postgres_pages; 82 | `); 83 | } finally { 84 | await client.end(); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /ingest/README.md: -------------------------------------------------------------------------------- 1 | # Ingest 2 | 3 | ## Setup 4 | 5 | ### Prerequisites 6 | 7 | - [`uv`](https://docs.astral.sh/uv/) 8 | - Docbook Toolsets for building PostgreSQL docs 9 | (see [this page](https://www.postgresql.org/docs/current/docguide-toolsets.html) 10 | for installing for specific platforms) 11 | 12 | ### Install Dependencies 13 | 14 | ```bash 15 | uv sync 16 | ``` 17 | 18 | ## Running the ingest 19 | 20 | ### PostgreSQL Documentation 21 | 22 | ```text 23 | $ uv run python postgres_docs.py --help 24 | usage: postgres_docs.py [-h] version 25 | 26 | Ingest Postgres documentation into the database. 27 | 28 | positional arguments: 29 | version Postgres version to ingest 30 | 31 | options: 32 | -h, --help show this help message and exit 33 | ``` 34 | 35 | ### Tiger Documentation 36 | 37 | ```text 38 | uv run python tiger_docs.py --help 39 | usage: tiger_docs.py [-h] [--domain DOMAIN] [-o OUTPUT_DIR] [-m MAX_PAGES] [--strip-images] [--no-strip-images] [--chunk] [--no-chunk] [--chunking {header,semantic}] [--storage-type {file,database}] [--database-uri DATABASE_URI] 40 | [--skip-indexes] [--delay DELAY] [--concurrent CONCURRENT] [--log-level {DEBUG,INFO,WARNING,ERROR}] [--user-agent USER_AGENT] 41 | 42 | Scrape websites using sitemaps and convert to chunked markdown for RAG applications 43 | 44 | options: 45 | -h, --help show this help message and exit 46 | --domain, -d DOMAIN Domain to scrape (e.g., docs.tigerdata.com) 47 | -o, --output-dir OUTPUT_DIR 48 | Output directory for scraped files (default: scraped_docs) 49 | -m, --max-pages MAX_PAGES 50 | Maximum number of pages to scrape (default: unlimited) 51 | --strip-images Strip data: images from content (default: True) 52 | --no-strip-images Keep data: images in content 53 | --chunk Enable content chunking (default: True) 54 | --no-chunk Disable content chunking 55 | --chunking {header,semantic} 56 | Chunking method: header (default) or semantic (requires OPENAI_API_KEY) 57 | --storage-type {file,database} 58 | Storage type: database (default) or file 59 | --database-uri DATABASE_URI 60 | PostgreSQL connection URI (default: uses DB_URL from environment) 61 | --skip-indexes Skip creating database indexes after import (for development/testing) 62 | --delay DELAY Download delay in seconds (default: 1.0) 63 | --concurrent CONCURRENT 64 | Maximum concurrent requests (default: 4) 65 | --log-level {DEBUG,INFO,WARNING,ERROR} 66 | Logging level (default: INFO) 67 | --user-agent USER_AGENT 68 | User agent string 69 | 70 | Examples: 71 | tiger_docs.py docs.tigerdata.com 72 | tiger_docs.py docs.tigerdata.com -o tiger_docs -m 50 73 | tiger_docs.py docs.tigerdata.com -o semantic_docs -m 5 --chunking semantic 74 | tiger_docs.py docs.tigerdata.com --no-chunk --no-strip-images -m 100 75 | tiger_docs.py docs.tigerdata.com --storage-type database --database-uri postgresql://user:pass@host:5432/dbname 76 | tiger_docs.py docs.tigerdata.com --storage-type database --chunking semantic -m 10 77 | ``` 78 | -------------------------------------------------------------------------------- /src/apis/semanticSearchPostgresDocs.ts: 
-------------------------------------------------------------------------------- 1 | import { ApiFactory, InferSchema } from '@tigerdata/mcp-boilerplate'; 2 | import { openai } from '@ai-sdk/openai'; 3 | import { embed } from 'ai'; 4 | import { z } from 'zod'; 5 | import { ServerContext } from '../types.js'; 6 | 7 | const inputSchema = { 8 | version: z 9 | .enum(['14', '15', '16', '17', '18']) 10 | .describe( 11 | 'The PostgreSQL major version to use for the query. Recommended to assume the latest version if unknown.', 12 | ), 13 | limit: z.coerce 14 | .number() 15 | .int() 16 | .describe('The maximum number of matches to return. Defaults to 10.'), 17 | prompt: z 18 | .string() 19 | .describe( 20 | 'The natural language query used to search the PostgreSQL documentation for relevant information.', 21 | ), 22 | } as const; 23 | 24 | const zEmbeddedDoc = z.object({ 25 | id: z 26 | .number() 27 | .int() 28 | .describe('The unique identifier of the documentation entry.'), 29 | content: z.string().describe('The content of the documentation entry.'), 30 | metadata: z 31 | .string() 32 | .describe( 33 | 'Additional metadata about the documentation entry, as a JSON encoded string.', 34 | ), 35 | distance: z 36 | .number() 37 | .describe( 38 | 'The distance score indicating the relevance of the entry to the prompt. Lower values indicate higher relevance.', 39 | ), 40 | }); 41 | 42 | type EmbeddedDoc = z.infer; 43 | 44 | const outputSchema = { 45 | results: z.array(zEmbeddedDoc), 46 | } as const; 47 | 48 | type OutputSchema = InferSchema; 49 | 50 | export const semanticSearchPostgresDocsFactory: ApiFactory< 51 | ServerContext, 52 | typeof inputSchema, 53 | typeof outputSchema, 54 | z.infer<(typeof outputSchema)['results']> 55 | > = ({ pgPool, schema }) => ({ 56 | name: 'semantic_search_postgres_docs', 57 | method: 'get', 58 | route: '/semantic-search/postgres-docs', 59 | config: { 60 | title: 'Semantic Search of PostgreSQL Documentation Embeddings', 61 | description: 62 | 'This retrieves relevant PostgreSQL documentation entries based on a natural language query.', 63 | inputSchema, 64 | outputSchema, 65 | }, 66 | fn: async ({ prompt, version, limit }): Promise => { 67 | if (limit < 0) { 68 | throw new Error('Limit must be a non-negative integer.'); 69 | } 70 | if (!prompt.trim()) { 71 | throw new Error('Prompt must be a non-empty string.'); 72 | } 73 | 74 | const { embedding } = await embed({ 75 | model: openai.embedding('text-embedding-3-small'), 76 | value: prompt, 77 | }); 78 | 79 | const result = await pgPool.query( 80 | /* sql */ ` 81 | SELECT 82 | c.id::int, 83 | c.content, 84 | c.metadata::text, 85 | c.embedding <=> $1::vector(1536) AS distance 86 | FROM ${schema}.postgres_chunks c 87 | JOIN ${schema}.postgres_pages p ON c.page_id = p.id 88 | WHERE p.version = $2 89 | ORDER BY distance 90 | LIMIT $3 91 | `, 92 | [JSON.stringify(embedding), version, limit || 10], 93 | ); 94 | 95 | return { 96 | results: result.rows, 97 | }; 98 | }, 99 | pickResult: (r) => r.results, 100 | }); 101 | -------------------------------------------------------------------------------- /src/migrate.ts: -------------------------------------------------------------------------------- 1 | import migrate from 'migrate'; 2 | import path from 'path'; 3 | import { Client } from 'pg'; 4 | import { createHash } from 'crypto'; 5 | import { fileURLToPath } from 'url'; 6 | import { schema } from './config.js'; 7 | 8 | // Use a hash of the project name 9 | const hash = createHash('sha256').update('pg-aiguide').digest('hex'); 10 | 
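// Only the first 15 hex digits are used so the derived key stays well inside
// PostgreSQL's 64-bit advisory lock key range while remaining deterministic:
// every server instance computes the same ID and contends on the same lock,
// ensuring a single instance runs migrations at a time.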
const MIGRATION_ADVISORY_LOCK_ID = parseInt(hash.substring(0, 15), 16); 11 | 12 | const __filename = fileURLToPath(import.meta.url); 13 | const __dirname = path.dirname(__filename); 14 | 15 | const createStateStore = (): { 16 | load(callback: (err: Error | null, set?: unknown) => void): Promise; 17 | save(set: unknown, callback: (err: Error | null) => void): Promise; 18 | close(): Promise; 19 | } => { 20 | let client: Client; 21 | 22 | return { 23 | async load( 24 | callback: (err: Error | null, set?: unknown) => void, 25 | ): Promise { 26 | try { 27 | client = new Client(); 28 | await client.connect(); 29 | 30 | // Acquire advisory lock to prevent concurrent migrations 31 | await client.query(/* sql */ `SELECT pg_advisory_lock($1)`, [ 32 | MIGRATION_ADVISORY_LOCK_ID, 33 | ]); 34 | 35 | // Ensure schema exists 36 | await client.query(/* sql */ ` 37 | CREATE SCHEMA IF NOT EXISTS ${schema}; 38 | `); 39 | 40 | // Ensure migrations table exists 41 | await client.query(/* sql */ ` 42 | CREATE TABLE IF NOT EXISTS ${schema}.migrations ( 43 | id SERIAL PRIMARY KEY, 44 | set JSONB NOT NULL, 45 | applied_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP 46 | ); 47 | `); 48 | 49 | // Load the most recent migration set 50 | const result = await client.query( 51 | /* sql */ `SELECT set FROM ${schema}.migrations ORDER BY applied_at DESC LIMIT 1`, 52 | ); 53 | 54 | const set = result.rows.length > 0 ? result.rows[0].set : {}; 55 | callback(null, set); 56 | } catch (error) { 57 | callback(error as Error); 58 | } 59 | }, 60 | 61 | async save( 62 | set: unknown, 63 | callback: (err: Error | null) => void, 64 | ): Promise { 65 | try { 66 | // Insert the entire set as JSONB 67 | await client.query( 68 | /* sql */ `INSERT INTO ${schema}.migrations (set) VALUES ($1)`, 69 | [JSON.stringify(set)], 70 | ); 71 | 72 | callback(null); 73 | } catch (error) { 74 | callback(error as Error); 75 | } 76 | }, 77 | 78 | async close(): Promise { 79 | if (client) { 80 | // Release advisory lock 81 | await client.query(/* sql */ `SELECT pg_advisory_unlock($1)`, [ 82 | MIGRATION_ADVISORY_LOCK_ID, 83 | ]); 84 | await client.end(); 85 | } 86 | }, 87 | }; 88 | }; 89 | 90 | export const runMigrations = async (): Promise => { 91 | return new Promise((resolve, reject) => { 92 | const stateStore = createStateStore(); 93 | 94 | migrate.load( 95 | { 96 | stateStore, 97 | migrationsDirectory: path.join(__dirname, '..', 'migrations'), 98 | }, 99 | (err, set) => { 100 | if (err) { 101 | stateStore.close().finally(() => reject(err)); 102 | return; 103 | } 104 | 105 | set.up((err) => { 106 | stateStore.close().finally(() => { 107 | if (err) { 108 | reject(err); 109 | } else { 110 | resolve(); 111 | } 112 | }); 113 | }); 114 | }, 115 | ); 116 | }); 117 | }; 118 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Package 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | permissions: 9 | # Required for OIDC authentication to npm (https://docs.npmjs.com/trusted-publishers) 10 | id-token: write 11 | packages: write 12 | contents: read 13 | 14 | jobs: 15 | publish-npm: 16 | name: Publish package to npm 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v5 21 | - name: Setup Node.js 22 | uses: actions/setup-node@v5 23 | with: 24 | node-version: 22 25 | cache: 'npm' 26 | - name: Update npm 27 | run: npm install -g npm@latest 28 | - name: Install 
dependencies 29 | run: npm ci 30 | - name: Build 31 | run: npm run build --if-present 32 | - name: Publish 33 | run: npm publish --access public 34 | 35 | publish-docker: 36 | name: Publish Docker Images 37 | runs-on: ubuntu-latest 38 | steps: 39 | - name: Checkout 40 | uses: actions/checkout@v5 41 | - name: Docker meta 42 | id: meta 43 | uses: docker/metadata-action@v5 44 | with: 45 | images: | 46 | docker.io/timescale/pg-aiguide 47 | ghcr.io/timescale/pg-aiguide 48 | tags: | 49 | type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'main') }} 50 | type=semver,pattern={{version}} 51 | type=semver,pattern={{major}}.{{minor}} 52 | type=semver,pattern={{major}} 53 | - name: Set up QEMU 54 | uses: docker/setup-qemu-action@v3 55 | - name: Set up Docker Buildx 56 | uses: docker/setup-buildx-action@v3 57 | - name: Login to Docker Hub 58 | uses: docker/login-action@v3 59 | with: 60 | username: ${{ secrets.ORG_DOCKER_HUB_USERNAME }} 61 | password: ${{ secrets.ORG_DOCKER_HUB_ACCESS_TOKEN }} 62 | - name: Login to GitHub Container Registry 63 | uses: docker/login-action@v3 64 | with: 65 | registry: ghcr.io 66 | username: ${{ github.repository_owner }} 67 | password: ${{ secrets.GITHUB_TOKEN }} 68 | - name: Build and push 69 | uses: docker/build-push-action@v6 70 | with: 71 | context: . 72 | platforms: linux/amd64,linux/arm64 73 | push: true 74 | tags: ${{ steps.meta.outputs.tags }} 75 | labels: ${{ steps.meta.outputs.labels }} 76 | 77 | publish-mcp: 78 | name: Publish package to MCP Registry 79 | runs-on: ubuntu-latest 80 | needs: 81 | - publish-npm 82 | - publish-docker 83 | steps: 84 | - name: Checkout 85 | uses: actions/checkout@v5 86 | - name: Install mcp-publisher 87 | run: | 88 | curl -L "https://github.com/modelcontextprotocol/registry/releases/latest/download/mcp-publisher_$(uname -s | tr '[:upper:]' '[:lower:]')_$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/').tar.gz" | tar xz mcp-publisher 89 | - name: Authenticate to MCP Registry 90 | run: ./mcp-publisher login github-oidc 91 | - name: Set version in server.json 92 | run: | 93 | ./bun ./generate-server.json.ts ${{ github.ref_name }} 94 | - name: Publish server to MCP Registry 95 | run: ./mcp-publisher publish 96 | 97 | notify-publish: 98 | name: Notify publish to Slack 99 | runs-on: ubuntu-latest 100 | needs: 101 | - publish-npm 102 | - publish-docker 103 | - publish-mcp 104 | if: success() 105 | steps: 106 | - name: Set version 107 | id: version 108 | run: echo "number=${GITHUB_REF_NAME#v}" >> $GITHUB_OUTPUT 109 | 110 | - name: Post to Slack 111 | uses: slackapi/slack-github-action@v2.1.1 112 | with: 113 | method: chat.postMessage 114 | token: ${{ secrets.SLACK_BOT_TOKEN }} 115 | payload: | 116 | channel: ${{ secrets.SLACK_CHANNEL_ID }} 117 | unfurl_links: false 118 | unfurl_media: false 119 | text: "pg-aiguide ${{ github.ref_name }} published" 120 | blocks: 121 | - type: "markdown" 122 | text: | 123 | **pg-aiguide ${{ github.ref_name }} published** 124 | [workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) | [npm](https://www.npmjs.com/package/@tigerdata/pg-aiguide/v/${{ steps.version.outputs.number }}) | [mcp registry](https://registry.modelcontextprotocol.io/?q=pg-aiguide) | [ghcr.io](https://ghcr.io/timescale/pg-aiguide) | [docker.io](https://hub.docker.com/r/timescale/pg-aiguide) 125 | -------------------------------------------------------------------------------- /DEVELOPMENT.md: -------------------------------------------------------------------------------- 1 | 
# Development Guide 2 | 3 | ## Getting Started 4 | 5 | Clone the repo. 6 | 7 | ```bash 8 | git clone git@github.com:timescale/pg-aiguide.git 9 | ``` 10 | 11 | ## Configuration 12 | 13 | Create a `.env` file based on the `.env.sample` file. 14 | 15 | ```bash 16 | cp .env.sample .env 17 | ``` 18 | 19 | Add your OPENAI_API_KEY to be used for generating embeddings. 20 | 21 | ### Configuration Parameters 22 | 23 | The server supports disabling MCP skills through different mechanisms for each transport: 24 | 25 | #### HTTP Transport 26 | 27 | Pass parameters as query strings: 28 | 29 | ``` 30 | https://mcp.tigerdata.com/docs?disable_mcp_skills=1 31 | ``` 32 | 33 | #### Stdio Transport 34 | 35 | Use environment variables in the connection configuration: 36 | 37 | ```json 38 | { 39 | "mcpServers": { 40 | "pg-aiguide": { 41 | "command": "node", 42 | "args": ["/path/to/dist/index.js", "stdio"], 43 | "env": { 44 | "DISABLE_MCP_SKILLS": "1" 45 | } 46 | } 47 | } 48 | } 49 | ``` 50 | 51 | Or when running directly: 52 | 53 | ```bash 54 | DISABLE_MCP_SKILLS=1 node dist/index.js stdio 55 | ``` 56 | 57 | #### Available Parameters 58 | 59 | | Parameter | HTTP Query | Stdio Env Var | Values | Description | 60 | | ------------------ | -------------------- | -------------------- | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | 61 | | Disable MCP Skills | `disable_mcp_skills` | `DISABLE_MCP_SKILLS` | 1 or true | Disable all MCP skills (tools and prompt templates). This removes the `view_skill` tool and all skill-based prompt templates from the available capabilities. | 62 | 63 | **Examples:** 64 | 65 | - HTTP: `?disable_mcp_skills=1` 66 | - Stdio: `DISABLE_MCP_SKILLS=1` 67 | - Default (skills enabled): No parameter needed 68 | 69 | ## Run a TimescaleDB Database 70 | 71 | You will need a database with the [pgvector extension](https://github.com/pgvector/pgvector). 72 | 73 | ### Using Tiger Cloud 74 | 75 | Use the [tiger CLI](https://github.com/timescale/tiger-cli) to create a Tiger Cloud service. 76 | 77 | ```bash 78 | tiger service create --free --with-password -o json 79 | ``` 80 | 81 | Copy your database connection parameters into your .env file. 82 | 83 | ### Using Docker 84 | 85 | Run the database in a docker container. 86 | 87 | ```bash 88 | # pull the latest image 89 | docker pull timescale/timescaledb-ha:pg17 90 | 91 | # run the database container 92 | docker run -d --name pg-aiguide \ 93 | -e POSTGRES_PASSWORD=password \ 94 | -e POSTGRES_DB=tsdb \ 95 | -e POSTGRES_USER=tsdbadmin \ 96 | -p 127.0.0.1:5432:5432 \ 97 | timescale/timescaledb-ha:pg17 98 | ``` 99 | 100 | Copy your database connection parameters to your .env file: 101 | 102 | ```dotenv 103 | PGHOST=localhost 104 | PGPORT=5432 105 | PGDATABASE=tsdb 106 | PGUSER=tsdbadmin 107 | PGPASSWORD=password 108 | ``` 109 | 110 | ## Building the MCP Server 111 | 112 | Run `npm i` to install dependencies and build the project. Use `npm run watch` to rebuild on changes. 113 | 114 | ## Loading the Database 115 | 116 | The database is NOT preloaded with the documentation. To make the MCP server usable, you need to scrape, chunk, embed, load, and index the documentation. 117 | Follow the [directions in the ingest directory](/ingest/README.md) to load the database. 118 | 119 | ## Testing 120 | 121 | The MCP Inspector is a very handy to exercise the MCP server from a web-based UI. 
122 | 123 | ```bash 124 | npm run inspector 125 | ``` 126 | 127 | | Field | Value | 128 | | -------------- | --------------- | 129 | | Transport Type | `STDIO` | 130 | | Command | `node` | 131 | | Arguments | `dist/index.js` | 132 | 133 | ### Testing in Claude Desktop 134 | 135 | Create/edit the file `~/Library/Application Support/Claude/claude_desktop_config.json` to add an entry like the following, making sure to use the absolute path to your local `pg-aiguide` project, and real database credentials. 136 | 137 | ```json 138 | { 139 | "mcpServers": { 140 | "pg-aiguide": { 141 | "command": "node", 142 | "args": ["/absolute/path/to/pg-aiguide/dist/index.js", "stdio"], 143 | "env": { 144 | "PGHOST": "x.y.tsdb.cloud.timescale.com", 145 | "PGDATABASE": "tsdb", 146 | "PGPORT": "32467", 147 | "PGUSER": "readonly_mcp_user", 148 | "PGPASSWORD": "abc123", 149 | "DB_SCHEMA": "docs", 150 | "OPENAI_API_KEY": "sk-svcacct" 151 | } 152 | } 153 | } 154 | } 155 | ``` 156 | -------------------------------------------------------------------------------- /src/skillutils/index.ts: -------------------------------------------------------------------------------- 1 | import { dirname, join } from 'path'; 2 | import { fileURLToPath } from 'url'; 3 | import { readdir, readFile } from 'fs/promises'; 4 | import matter from 'gray-matter'; 5 | import { z } from 'zod'; 6 | import { log, type PromptFactory } from '@tigerdata/mcp-boilerplate'; 7 | import { ServerContext } from '../types.js'; 8 | 9 | const __dirname = dirname(fileURLToPath(import.meta.url)); 10 | // Skills directory at repo root level 11 | const skillsDir = join(__dirname, '..', '..', 'skills'); 12 | 13 | // ===== Skill Types ===== 14 | 15 | export const zSkillMatter = z.object({ 16 | name: z.string().trim().min(1), 17 | description: z.string(), 18 | }); 19 | export type SkillMatter = z.infer; 20 | 21 | export const zSkill = z.object({ 22 | path: z.string(), 23 | name: z.string(), 24 | description: z.string(), 25 | }); 26 | export type Skill = z.infer; 27 | 28 | // ===== Skill Loading Implementation ===== 29 | 30 | // Cache for skill content 31 | const skillContentCache: Map = new Map(); 32 | let skillMapPromise: Promise> | null = null; 33 | 34 | /** 35 | * Parse a SKILL.md file and validate its metadata 36 | */ 37 | const parseSkillFile = async ( 38 | fileContent: string, 39 | ): Promise<{ 40 | matter: SkillMatter; 41 | content: string; 42 | }> => { 43 | const { data, content } = matter(fileContent); 44 | const skillMatter = zSkillMatter.parse(data); 45 | 46 | // Normalize skill name 47 | if (!/^[a-zA-Z0-9-_]+$/.test(skillMatter.name)) { 48 | const normalized = skillMatter.name 49 | .toLowerCase() 50 | .replace(/\s+/g, '-') 51 | .replace(/[^a-z0-9-_]/g, '_') 52 | .replace(/-[-_]+/g, '-') 53 | .replace(/_[_-]+/g, '_') 54 | .replace(/(^[-_]+)|([-_]+$)/g, ''); 55 | log.warn( 56 | `Skill name "${skillMatter.name}" contains invalid characters. Normalizing to "${normalized}".`, 57 | ); 58 | skillMatter.name = normalized; 59 | } 60 | 61 | return { 62 | matter: skillMatter, 63 | content: content.trim(), 64 | }; 65 | }; 66 | 67 | /** 68 | * Load all skills from the filesystem 69 | */ 70 | async function doLoadSkills(): Promise> { 71 | const skills = new Map(); 72 | skillContentCache.clear(); 73 | 74 | const alreadyExists = (name: string, path: string): boolean => { 75 | const existing = skills.get(name); 76 | if (existing) { 77 | log.warn( 78 | `Skill with name "${name}" already loaded from path "${existing.path}". 
Skipping duplicate at path "${path}".`, 79 | ); 80 | return true; 81 | } 82 | return false; 83 | }; 84 | 85 | const loadLocalPath = async (path: string): Promise => { 86 | const skillPath = join(path, 'SKILL.md'); 87 | try { 88 | const fileContent = await readFile(skillPath, 'utf-8'); 89 | const { 90 | matter: { name, description }, 91 | content, 92 | } = await parseSkillFile(fileContent); 93 | 94 | if (alreadyExists(name, path)) return; 95 | 96 | skills.set(name, { 97 | path, 98 | name, 99 | description, 100 | }); 101 | 102 | skillContentCache.set(`${name}/SKILL.md`, content); 103 | } catch (err) { 104 | log.error(`Failed to load skill at path: ${skillPath}`, err as Error); 105 | } 106 | }; 107 | 108 | try { 109 | // Load skills from subdirectories with SKILL.md files 110 | const dirEntries = await readdir(skillsDir, { withFileTypes: true }); 111 | for (const entry of dirEntries) { 112 | if (!entry.isDirectory()) continue; 113 | await loadLocalPath(join(skillsDir, entry.name)); 114 | } 115 | 116 | if (skills.size === 0) { 117 | log.warn( 118 | 'No skills found. Please add SKILL.md files to the skills/ subdirectories.', 119 | ); 120 | } else { 121 | log.info(`Successfully loaded ${skills.size} skill(s)`); 122 | } 123 | } catch (err) { 124 | log.error('Failed to load skills', err as Error); 125 | } 126 | 127 | return skills; 128 | } 129 | 130 | /** 131 | * Load skills with caching 132 | */ 133 | export const loadSkills = async ( 134 | force = false, 135 | ): Promise> => { 136 | if (skillMapPromise && !force) { 137 | return skillMapPromise; 138 | } 139 | 140 | skillMapPromise = doLoadSkills().catch((err) => { 141 | log.error('Failed to load skills', err as Error); 142 | skillMapPromise = null; 143 | return new Map(); 144 | }); 145 | 146 | return skillMapPromise; 147 | }; 148 | 149 | /** 150 | * View skill content 151 | */ 152 | export const viewSkillContent = async ( 153 | name: string, 154 | targetPath = 'SKILL.md', 155 | ): Promise => { 156 | const skillsMap = await loadSkills(); 157 | const skill = skillsMap.get(name); 158 | if (!skill) { 159 | throw new Error(`Skill not found: ${name}`); 160 | } 161 | 162 | const cacheKey = `${name}/${targetPath}`; 163 | const cached = skillContentCache.get(cacheKey); 164 | if (cached) { 165 | return cached; 166 | } 167 | 168 | // Read from filesystem 169 | try { 170 | const fullPath = join(skill.path, targetPath); 171 | const content = await readFile(fullPath, 'utf-8'); 172 | skillContentCache.set(cacheKey, content); 173 | return content; 174 | } catch { 175 | throw new Error(`Failed to read skill content: ${name}/${targetPath}`); 176 | } 177 | }; 178 | 179 | // Initialize skills on module load 180 | export const skills = await loadSkills(); 181 | 182 | interface PromptResult { 183 | [x: string]: unknown; 184 | description: string; 185 | messages: { 186 | role: 'user'; 187 | content: { 188 | type: 'text'; 189 | text: string; 190 | }; 191 | }[]; 192 | } 193 | 194 | // Export skills as prompt factories for MCP server 195 | export const promptFactories: PromptFactory< 196 | ServerContext, 197 | Record 198 | >[] = Array.from(skills.entries()).map(([name, skillData]) => () => ({ 199 | name, 200 | config: { 201 | // Using the dash-separated name as the title to work around a problem in Claude Code 202 | // See https://github.com/anthropics/claude-code/issues/7464 203 | title: name, 204 | description: skillData.description, 205 | inputSchema: {}, // No arguments for static skills 206 | }, 207 | fn: async (): Promise => { 208 | const content = await 
viewSkillContent(name); 209 | return { 210 | description: skillData.description || name, 211 | messages: [ 212 | { 213 | role: 'user' as const, 214 | content: { 215 | type: 'text' as const, 216 | text: content, 217 | }, 218 | }, 219 | ], 220 | }; 221 | }, 222 | })); 223 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pg-aiguide 2 | 3 | **AI-optimized PostgreSQL expertise for coding assistants** 4 | 5 | pg-aiguide helps AI coding tools write dramatically better PostgreSQL code. It provides: 6 | 7 | - **Semantic search** across the official PostgreSQL manual (version-aware) 8 | - **AI-optimized “skills”** — curated, opinionated Postgres best practices used automatically by AI agents 9 | - **Extension ecosystem docs**, starting with TimescaleDB, with more coming soon 10 | 11 | Use it either as: 12 | 13 | - a **public MCP server** that can be used with any AI coding agent, or 14 | - a **Claude Code plugin** optimized for use with Claude's native skill support. 15 | 16 | ## ⭐ Why pg-aiguide? 17 | 18 | AI coding tools often generate Postgres code that is: 19 | 20 | - outdated 21 | - missing constraints and indexes 22 | - unaware of modern PG features 23 | - inconsistent with real-world best practices 24 | 25 | pg-aiguide fixes that by giving AI agents deep, versioned PostgreSQL knowledge and proven patterns. 26 | 27 | ### See the difference 28 | 29 | https://github.com/user-attachments/assets/5a426381-09b5-4635-9050-f55422253a3d 30 | 31 |
32 | Video Transcript 33 | 34 | Prompt given to Claude Code: 35 | 36 | > Please describe the schema you would create for an e-commerce website two times, first with the tiger mcp server disabled, then with the tiger mcp server enabled. For each time, write the schema to its own file in the current working directory. Then compare the two files and let me know which approach generated the better schema, using both qualitative and quantitative reasons. For this example, only use standard Postgres. 37 | 38 | Result (summarized): 39 | 40 | - **4× more constraints** 41 | - **55% more indexes** (including partial/expression indexes) 42 | - **PG17-recommended patterns** 43 | - **Modern features** (`GENERATED ALWAYS AS IDENTITY`, `NULLS NOT DISTINCT`) 44 | - **Cleaner naming & documentation** 45 | 46 | Conclusion: _pg-aiguide produces more robust, performant, maintainable schemas._ 47 | 48 |
49 | 50 | ## 🚀 Quickstart 51 | 52 | pg-aiguide is available as a **public MCP server**: 53 | 54 | [https://mcp.tigerdata.com/docs](https://mcp.tigerdata.com/docs) 55 | 56 |
57 | Manual MCP configuration using JSON 58 | 59 | ```json 60 | { 61 | "mcpServers": { 62 | "pg-aiguide": { 63 | "url": "https://mcp.tigerdata.com/docs" 64 | } 65 | } 66 | } 67 | ``` 68 | 69 |
70 | 71 | Or it can be used as a **Claude Code Plugin**: 72 | 73 | ```bash 74 | claude plugin marketplace add timescale/pg-aiguide 75 | claude plugin install pg@aiguide 76 | ``` 77 | 78 | ### Install by environment 79 | 80 | #### One-click installs 81 | 82 | [![Install in Cursor](https://img.shields.io/badge/Install_in-Cursor-000000?style=flat-square&logoColor=white)](https://cursor.com/en/install-mcp?name=pg-aiguide&config=eyJuYW1lIjoicGctYWlndWlkZSIsInR5cGUiOiJodHRwIiwidXJsIjoiaHR0cHM6Ly9tY3AudGlnZXJkYXRhLmNvbS9kb2NzIn0=) 83 | [![Install in VS Code](https://img.shields.io/badge/Install_in-VS_Code-0098FF?style=flat-square&logo=visualstudiocode&logoColor=white)](https://vscode.dev/redirect/mcp/install?name=pg-aiguide&config=%7B%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fmcp.tigerdata.com%2Fdocs%22%7D) 84 | [![Install in VS Code Insiders](https://img.shields.io/badge/Install_in-VS_Code_Insiders-24bfa5?style=flat-square&logo=visualstudiocode&logoColor=white)](https://insiders.vscode.dev/redirect/mcp/install?name=pg-aiguide&config=%7B%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fmcp.tigerdata.com%2Fdocs%22%7D&quality=insiders) 85 | [![Install in Visual Studio](https://img.shields.io/badge/Install_in-Visual_Studio-C16FDE?style=flat-square&logo=visualstudio&logoColor=white)](https://vs-open.link/mcp-install?%7B%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fmcp.tigerdata.com%2Fdocs%22%7D) 86 | [![Install in Goose](https://block.github.io/goose/img/extension-install-dark.svg)](https://block.github.io/goose/extension?cmd=&arg=&id=pg-aiguide&name=pg-aiguide&description=MCP%20Server%20for%20pg-aiguide) 87 | [![Add MCP Server pg-aiguide to LM Studio](https://files.lmstudio.ai/deeplink/mcp-install-light.svg)](https://lmstudio.ai/install-mcp?name=pg-aiguide&config=eyJuYW1lIjoicGctYWlndWlkZSIsInR5cGUiOiJodHRwIiwidXJsIjoiaHR0cHM6Ly9tY3AudGlnZXJkYXRhLmNvbS9kb2NzIn0=) 88 | 89 |
90 | Claude Code 91 | 92 | This repo serves as a Claude Code plugin marketplace. To install, run: 93 | 94 | ```bash 95 | claude plugin marketplace add timescale/pg-aiguide 96 | claude plugin install pg@aiguide 97 | ``` 98 | 99 | This plugin uses the skills available in the `skills` directory as well as our 100 | publicly available MCP server endpoint hosted by TigerData for searching PostgreSQL documentation. 101 | 102 |
103 | 104 |
105 | Codex 106 | 107 | Run the following to add the MCP server to codex: 108 | 109 | ```bash 110 | codex mcp add --url "https://mcp.tigerdata.com/docs" pg-aiguide 111 | ``` 112 | 113 |
114 | 115 |
116 | Cursor 117 | 118 | One-click install: 119 | 120 | [![Install MCP Server](https://cursor.com/deeplink/mcp-install-dark.svg)](https://cursor.com/en-US/install-mcp?name=pg-aiguide&config=eyJ1cmwiOiJodHRwczovL21jcC50aWdlcmRhdGEuY29tL2RvY3MifQ%3D%3D) 121 | 122 | Or add the following to `.cursor/mcp.json` 123 | 124 | ```json 125 | { 126 | "mcpServers": { 127 | "pg-aiguide": { 128 | "url": "https://mcp.tigerdata.com/docs" 129 | } 130 | } 131 | } 132 | ``` 133 | 134 |
135 | 136 |
137 | Gemini CLI 138 | 139 | Run the following to add the MCP server to Gemini CLI: 140 | 141 | ```bash 142 | gemini mcp add -s user pg-aiguide "https://mcp.tigerdata.com/docs" -t http 143 | ``` 144 | 145 |
146 | 147 |
148 | Visual Studio 149 | 150 | Click the button to install: 151 | 152 | [![Install in Visual Studio](https://img.shields.io/badge/Install_in-Visual_Studio-C16FDE?style=flat-square&logo=visualstudio&logoColor=white)](https://vs-open.link/mcp-install?%7B%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fmcp.tigerdata.com%2Fdocs%22%7D) 153 | 154 |
155 | 156 |
157 | VS Code 158 | 159 | Click the button to install: 160 | 161 | [![Install in VS Code](https://img.shields.io/badge/Install_in-VS_Code-0098FF?style=flat-square&logo=visualstudiocode&logoColor=white)](https://vscode.dev/redirect/mcp/install?name=pg-aiguide&config=%7B%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fmcp.tigerdata.com%2Fdocs%22%7D) 162 | 163 | Alternatively, run the following to add the MCP server to VS Code: 164 | 165 | ```bash 166 | code --add-mcp '{"name":"pg-aiguide","type":"http","url":"https://mcp.tigerdata.com/docs"}' 167 | ``` 168 | 169 |
170 | 171 |
172 | VS Code Insiders 173 | 174 | Click the button to install: 175 | 176 | [![Install in VS Code Insiders](https://img.shields.io/badge/Install_in-VS_Code_Insiders-24bfa5?style=flat-square&logo=visualstudiocode&logoColor=white)](https://insiders.vscode.dev/redirect/mcp/install?name=pg-aiguide&config=%7B%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fmcp.tigerdata.com%2Fdocs%22%7D&quality=insiders) 177 | 178 | Alternatively, run the following to add the MCP server to VS Code Insiders: 179 | 180 | ```bash 181 | code-insiders --add-mcp '{"name":"pg-aiguide","type":"http","url":"https://mcp.tigerdata.com/docs"}' 182 | ``` 183 | 184 |
185 | 186 |
187 | Windsurf 188 | 189 | Add the following to `~/.codeium/windsurf/mcp_config.json` 190 | 191 | ```json 192 | { 193 | "mcpServers": { 194 | "pg-aiguide": { 195 | "serverUrl": "https://mcp.tigerdata.com/docs" 196 | } 197 | } 198 | } 199 | ``` 200 | 201 |
202 | 203 | ### 💡 Your First Prompt 204 | 205 | Once installed, pg-aiguide can answer Postgres questions or design schemas. 206 | 207 | **Simple schema example prompt** 208 | 209 | > Create a Postgres table schema for storing usernames and unique email addresses. 210 | 211 | **Complex schema example prompt** 212 | 213 | > You are a senior software engineer. You are given a task to generate a Postgres schema for an IoT device company. 214 | > The devices collect environmental data on a factory floor. The data includes temperature, humidity, pressure, as 215 | > the main data points as well as other measurements that vary from device to device. Each device has a unique id 216 | > and a human-readable name. We want to record the time the data was collected as well. Analysis for recent data 217 | > includes finding outliers and anomalies based on measurements, as well as analyzing the data of particular devices for ad-hoc analysis. Historical data analysis includes analyzing the history of data for one device or getting statistics for all devices over long periods of time. 218 | 219 | ## Features 220 | 221 | ### Semantic Search (MCP Tools) 222 | 223 | - [**`semantic_search_postgres_docs`**](API.md#semantic_search_postgres_docs) 224 | Performs semantic search over the official PostgreSQL manual, with results scoped to a specific Postgres version. 225 | 226 | - [**`semantic_search_tiger_docs`** ](API.md#semantic_search_tiger_docs) 227 | Searches Tiger Data’s documentation corpus, including TimescaleDB and future ecosystem extensions. 228 | 229 | ### Skills (AI-Optimized Best Practices) 230 | 231 | - **[`view_skill`](API.md#view_skill)** 232 | Exposes curated, opinionated PostgreSQL best-practice skills used automatically by AI coding assistants. 233 | 234 | These skills provide guidance on: 235 | - Schema design 236 | - Indexing strategies 237 | - Data types 238 | - Data integrity and constraints 239 | - Naming conventions 240 | - Performance tuning 241 | - Modern PostgreSQL features 242 | 243 | ## 🔌 Ecosystem Documentation 244 | 245 | Supported today: 246 | 247 | - **TimescaleDB** (docs + skills) 248 | 249 | Coming soon: 250 | 251 | - **pgvector** 252 | - **PostGIS** 253 | 254 | We welcome contributions for additional extensions and tools. 255 | 256 | ## 🛠 Development 257 | 258 | See [DEVELOPMENT.md](DEVELOPMENT.md) for: 259 | 260 | - running the MCP server locally 261 | - adding new skills 262 | - adding new docs 263 | 264 | ## 🤝 Contributing 265 | 266 | We welcome: 267 | 268 | - new Postgres best-practice skills 269 | - additional documentation corpora 270 | - search quality improvements 271 | - bug reports and feature ideas 272 | 273 | ## 📄 License 274 | 275 | Apache 2.0 276 | -------------------------------------------------------------------------------- /skills/find-hypertable-candidates/SKILL.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: find-hypertable-candidates 3 | description: Analyze an existing PostgreSQL database to identify tables that would benefit from conversion to TimescaleDB hypertables 4 | --- 5 | 6 | # PostgreSQL Hypertable Candidate Analysis 7 | 8 | Identify tables that would benefit from TimescaleDB hypertable conversion. After identification, use the companion "migrate-postgres-tables-to-hypertables" skill for configuration and migration. 
9 | 10 | ## TimescaleDB Benefits 11 | 12 | **Performance gains:** 90%+ compression, fast time-based queries, improved insert performance, efficient aggregations, continuous aggregates for materialization (dashboards, reports, analytics), automatic data management (retention, compression). 13 | 14 | **Best for insert-heavy patterns:** 15 | 16 | - Time-series data (sensors, metrics, monitoring) 17 | - Event logs (user events, audit trails, application logs) 18 | - Transaction records (orders, payments, financial) 19 | - Sequential data (auto-incrementing IDs with timestamps) 20 | - Append-only datasets (immutable records, historical) 21 | 22 | **Requirements:** Large volumes (1M+ rows), time-based queries, infrequent updates 23 | 24 | ## Step 1: Database Schema Analysis 25 | 26 | ### Option A: From Database Connection 27 | 28 | #### Table statistics and size 29 | 30 | ```sql 31 | -- Get all tables with row counts and insert/update patterns 32 | WITH table_stats AS ( 33 | SELECT 34 | schemaname, tablename, 35 | n_tup_ins as total_inserts, 36 | n_tup_upd as total_updates, 37 | n_tup_del as total_deletes, 38 | n_live_tup as live_rows, 39 | n_dead_tup as dead_rows 40 | FROM pg_stat_user_tables 41 | ), 42 | table_sizes AS ( 43 | SELECT 44 | schemaname, tablename, 45 | pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) as total_size, 46 | pg_total_relation_size(schemaname||'.'||tablename) as total_size_bytes 47 | FROM pg_tables 48 | WHERE schemaname NOT IN ('information_schema', 'pg_catalog') 49 | ) 50 | SELECT 51 | ts.schemaname, ts.tablename, ts.live_rows, 52 | tsize.total_size, tsize.total_size_bytes, 53 | ts.total_inserts, ts.total_updates, ts.total_deletes, 54 | ROUND(CASE WHEN ts.live_rows > 0 55 | THEN (ts.total_inserts::float / ts.live_rows) * 100 56 | ELSE 0 END, 2) as insert_ratio_pct 57 | FROM table_stats ts 58 | JOIN table_sizes tsize ON ts.schemaname = tsize.schemaname AND ts.tablename = tsize.tablename 59 | ORDER BY tsize.total_size_bytes DESC; 60 | ``` 61 | 62 | **Look for:** 63 | 64 | - mostly insert-heavy patterns (less updates/deletes) 65 | - big tables (1M+ rows or 100MB+) 66 | 67 | #### Index patterns 68 | 69 | ```sql 70 | -- Identify common query dimensions 71 | SELECT schemaname, tablename, indexname, indexdef 72 | FROM pg_indexes 73 | WHERE schemaname NOT IN ('information_schema', 'pg_catalog') 74 | ORDER BY tablename, indexname; 75 | ``` 76 | 77 | **Look for:** 78 | 79 | - Multiple indexes with timestamp/created_at columns → time-based queries 80 | - Composite (entity_id, timestamp) indexes → good candidates 81 | - Time-only indexes → time range filtering common 82 | 83 | #### Query patterns (if pg_stat_statements available) 84 | 85 | ```sql 86 | -- Check availability 87 | SELECT EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements'); 88 | 89 | -- Analyze expensive queries for candidate tables 90 | SELECT query, calls, mean_exec_time, total_exec_time 91 | FROM pg_stat_statements 92 | WHERE query ILIKE '%your_table_name%' 93 | ORDER BY total_exec_time DESC LIMIT 20; 94 | ``` 95 | 96 | **✅ Good patterns:** Time-based WHERE, entity filtering combined with time-based qualifiers, GROUP BY time_bucket, range queries over time 97 | **❌ Poor patterns:** Non-time lookups with no time-based qualifiers in same query (WHERE email = ...) 
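For example, the contrast might look like this (a sketch — `events`, `user_id`, `event_time`, and `email` are placeholder names, not columns assumed to exist in your schema):

```sql
-- ✅ Good: entity filter combined with a time-range qualifier; benefits from chunk exclusion
SELECT user_id, COUNT(*)
FROM events
WHERE event_time >= NOW() - INTERVAL '24 hours'
  AND user_id = 42
GROUP BY user_id;

-- ❌ Poor: point lookup on a non-time column with no time qualifier; partitioning cannot help
SELECT * FROM events WHERE email = 'person@example.com';
```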
98 | 99 | #### Constraints 100 | 101 | ```sql 102 | -- Check migration compatibility 103 | SELECT conname, contype, pg_get_constraintdef(oid) as definition 104 | FROM pg_constraint 105 | WHERE conrelid = 'your_table_name'::regclass; 106 | ``` 107 | 108 | **Compatibility:** 109 | 110 | - Primary keys (p): Must include partition column or ask user if can be modified 111 | - Foreign keys (f): Plain→Hypertable and Hypertable→Plain OK, Hypertable→Hypertable NOT supported 112 | - Unique constraints (u): Must include partition column or ask user if can be modified 113 | - Check constraints (c): Usually OK 114 | 115 | ### Option B: From Code Analysis 116 | 117 | #### ✅ GOOD Patterns 118 | 119 | ```python 120 | # Append-only logging 121 | INSERT INTO events (user_id, event_time, data) VALUES (...); 122 | # Time-series collection 123 | INSERT INTO metrics (device_id, timestamp, value) VALUES (...); 124 | # Time-based queries 125 | SELECT * FROM metrics WHERE timestamp >= NOW() - INTERVAL '24 hours'; 126 | # Time aggregations 127 | SELECT DATE_TRUNC('day', timestamp), COUNT(*) GROUP BY 1; 128 | ``` 129 | 130 | #### ❌ POOR Patterns 131 | 132 | ```python 133 | # Frequent updates to historical records 134 | UPDATE users SET email = ..., updated_at = NOW() WHERE id = ...; 135 | # Non-time lookups 136 | SELECT * FROM users WHERE email = ...; 137 | # Small reference tables 138 | SELECT * FROM countries ORDER BY name; 139 | ``` 140 | 141 | #### Schema Indicators 142 | 143 | **✅ GOOD:** 144 | 145 | - Has timestamp/timestamptz column 146 | - Multiple indexes with timestamp-based columns 147 | - Composite (entity_id, timestamp) indexes 148 | 149 | **❌ POOR:** 150 | 151 | - Mostly indexes with non-time-based columns (on columns like email, name, status, etc.) 152 | - Columns that you expect to be updated over time (updated_at, updated_by, status, etc.) 153 | - Unique constraints on non-time fields 154 | - Frequent updated_at modifications 155 | - Small static tables 156 | 157 | #### Special Case: ID-Based Tables 158 | 159 | Sequential ID tables can be candidates if: 160 | 161 | - Insert-mostly pattern / updates are either infrequent or only on recent records. 162 | - If updates do happen, they occur on recent records (such as an order status being updated orderered->processing->delivered. Note once an order is delivered, it is unlikely to be updated again.) 163 | - IDs correlate with time (as is the case for serial/auto-incrementing IDs/GENERATED ALWAYS AS IDENTITY) 164 | - ID is the primary query dimension 165 | - Recent data accessed more often (frequently the case in ecommerce, finance, etc.) 166 | - Time-based reporting common (e.g. monthly, daily summaries/analytics) 167 | 168 | ```sql 169 | CREATE TABLE orders ( 170 | id BIGSERIAL PRIMARY KEY, -- Can partition by ID 171 | user_id BIGINT, 172 | created_at TIMESTAMPTZ DEFAULT NOW() -- For sparse indexes 173 | ); 174 | ``` 175 | 176 | Note: For ID-based tables where there is also a time column (created_at, ordered_at, etc.), 177 | you can partition by ID and use sparse indexes on the time column. 178 | See the `migrate-postgres-tables-to-hypertables` skill for details. 
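If it is unclear whether IDs actually track insertion time, a quick check such as the following can help (a sketch — `orders`, `id`, and `created_at` are placeholder names; a result close to 1.0 suggests ID order closely follows time order, making ID-based partitioning viable):

```sql
-- Correlation between a sequential id and the event time column.
-- Values near 1.0 mean newer rows reliably have higher ids.
SELECT corr(id::double precision,
            EXTRACT(EPOCH FROM created_at)::double precision) AS id_time_correlation
FROM orders;
```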
179 | 180 | ## Step 2: Candidacy Scoring (8+ points = good candidate) 181 | 182 | ### Time-Series Characteristics (5+ points needed) 183 | 184 | - Has timestamp/timestamptz column: **3 points** 185 | - Data inserted chronologically: **2 points** 186 | - Queries filter by time: **2 points** 187 | - Time aggregations common: **2 points** 188 | 189 | ### Scale & Performance (3+ points recommended) 190 | 191 | - Large table (1M+ rows or 100MB+): **2 points** 192 | - High insert volume: **1 point** 193 | - Infrequent updates to historical: **1 point** 194 | - Range queries common: **1 point** 195 | - Aggregation queries: **2 points** 196 | 197 | ### Data Patterns (bonus) 198 | 199 | - Contains entity ID for segmentation (device_id, user_id, product_id, symbol, etc.): **1 point** 200 | - Numeric measurements: **1 point** 201 | - Log/event structure: **1 point** 202 | 203 | ## Common Patterns 204 | 205 | ### ✅ GOOD Candidates 206 | 207 | **✅ Event/Log Tables** (user_events, audit_logs) 208 | 209 | ```sql 210 | CREATE TABLE user_events ( 211 | id BIGSERIAL PRIMARY KEY, 212 | user_id BIGINT, 213 | event_type TEXT, 214 | event_time TIMESTAMPTZ DEFAULT NOW(), 215 | metadata JSONB 216 | ); 217 | -- Partition by id, segment by user_id, enable minmax sparse_index on event_time 218 | ``` 219 | 220 | **✅ Sensor/IoT Data** (sensor_readings, telemetry) 221 | 222 | ```sql 223 | CREATE TABLE sensor_readings ( 224 | device_id TEXT, 225 | timestamp TIMESTAMPTZ, 226 | temperature DOUBLE PRECISION, 227 | humidity DOUBLE PRECISION 228 | ); 229 | -- Partition by timestamp, segment by device_id, minmax sparse indexes on temperature and humidity 230 | ``` 231 | 232 | **✅ Financial/Trading** (stock_prices, transactions) 233 | 234 | ```sql 235 | CREATE TABLE stock_prices ( 236 | symbol VARCHAR(10), 237 | price_time TIMESTAMPTZ, 238 | open_price DECIMAL, 239 | close_price DECIMAL, 240 | volume BIGINT 241 | ); 242 | -- Partition by price_time, segment by symbol, minmax sparse indexes on open_price and close_price and volume 243 | ``` 244 | 245 | **✅ System Metrics** (monitoring_data) 246 | 247 | ```sql 248 | CREATE TABLE system_metrics ( 249 | hostname TEXT, 250 | metric_time TIMESTAMPTZ, 251 | cpu_usage DOUBLE PRECISION, 252 | memory_usage BIGINT 253 | ); 254 | -- Partition by metric_time, segment by hostname, minmax sparse indexes on cpu_usage and memory_usage 255 | ``` 256 | 257 | ### ❌ POOR Candidates 258 | 259 | **❌ Reference Tables** (countries, categories) 260 | 261 | ```sql 262 | CREATE TABLE countries ( 263 | id SERIAL PRIMARY KEY, 264 | name VARCHAR(100), 265 | code CHAR(2) 266 | ); 267 | -- Static data, no time component 268 | ``` 269 | 270 | **❌ User Profiles** (users, accounts) 271 | 272 | ```sql 273 | CREATE TABLE users ( 274 | id BIGSERIAL PRIMARY KEY, 275 | email VARCHAR(255), 276 | created_at TIMESTAMPTZ, 277 | updated_at TIMESTAMPTZ 278 | ); 279 | -- Accessed by ID, frequently updated, has timestamp but it's not the primary query dimension (the primary query dimension is id or email) 280 | ``` 281 | 282 | **❌ Settings/Config** (user_settings) 283 | 284 | ```sql 285 | CREATE TABLE user_settings ( 286 | user_id BIGINT PRIMARY KEY, 287 | theme VARCHAR(20), -- Changes: light -> dark -> auto 288 | language VARCHAR(10), -- Changes: en -> es -> fr 289 | notifications JSONB, -- Frequent preference updates 290 | updated_at TIMESTAMPTZ 291 | ); 292 | -- Accessed by user_id, frequently updated, has timestamp but it's not the primary query dimension (the primary query dimension is user_id) 293 | ``` 294 | 295 | ## 
Analysis Output Requirements 296 | 297 | For each candidate table provide: 298 | 299 | - **Score:** Based on criteria (8+ = strong candidate) 300 | - **Pattern:** Insert vs update ratio 301 | - **Access:** Time-based vs entity lookups 302 | - **Size:** Current size and growth rate 303 | - **Queries:** Time-range, aggregations, point lookups 304 | 305 | Focus on insert-heavy patterns with time-based or sequential access. Tables scoring 8+ points are strong candidates for conversion. 306 | -------------------------------------------------------------------------------- /skills/migrate-postgres-tables-to-hypertables/SKILL.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: migrate-postgres-tables-to-hypertables 3 | description: Comprehensive guide for migrating PostgreSQL tables to TimescaleDB hypertables with optimal configuration and performance validation 4 | --- 5 | 6 | # PostgreSQL to TimescaleDB Hypertable Migration 7 | 8 | Migrate identified PostgreSQL tables to TimescaleDB hypertables with optimal configuration, migration planning and validation. 9 | 10 | **Prerequisites**: Tables already identified as hypertable candidates (use companion "find-hypertable-candidates" skill if needed). 11 | 12 | ## Step 1: Optimal Configuration 13 | 14 | ### Partition Column Selection 15 | 16 | ```sql 17 | -- Find potential partition columns 18 | SELECT column_name, data_type, is_nullable 19 | FROM information_schema.columns 20 | WHERE table_name = 'your_table_name' 21 | AND data_type IN ('timestamp', 'timestamptz', 'bigint', 'integer', 'date') 22 | ORDER BY ordinal_position; 23 | ``` 24 | 25 | **Requirements:** Time-based (TIMESTAMP/TIMESTAMPTZ/DATE) or sequential integer (INT/BIGINT) 26 | 27 | Should represent when the event actually occurred or sequential ordering. 28 | 29 | **Common choices:** 30 | 31 | - `timestamp`, `created_at`, `event_time` - when event occurred 32 | - `id`, `sequence_number` - auto-increment (for sequential data without timestamps) 33 | - `ingested_at` - less ideal, only if primary query dimension 34 | - `updated_at` - AVOID (records updated out of order, breaks chunk distribution) unless primary query dimension 35 | 36 | #### Special Case: table with BOTH ID AND Timestamp 37 | 38 | When table has sequential ID (PK) AND timestamp that correlate: 39 | 40 | ```sql 41 | -- Partition by ID, enable minmax sparse indexes on timestamp 42 | SELECT create_hypertable('orders', 'id', chunk_time_interval => 1000000); 43 | ALTER TABLE orders SET ( 44 | timescaledb.sparse_index = 'minmax(created_at),...' 45 | ); 46 | ``` 47 | 48 | Sparse indexes on time column enable skipping compressed blocks outside queried time ranges. 
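For example, with the configuration above, a time-window query like the following can skip compressed batches whose `created_at` range falls entirely outside the window, even though `orders` is partitioned by `id` (a sketch; adjust the interval to your workload):

```sql
-- The minmax sparse index on created_at prunes compressed batches
-- outside the last 7 days, despite the hypertable being partitioned by id.
SELECT *
FROM orders
WHERE created_at >= NOW() - INTERVAL '7 days';
```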
49 | 50 | Use when: ID correlates with time (newer records have higher IDs), need ID-based lookups, time queries also common 51 | 52 | ### Chunk Interval Selection 53 | 54 | ```sql 55 | -- Ensure statistics are current 56 | ANALYZE your_table_name; 57 | 58 | -- Estimate index size per time unit 59 | WITH time_range AS ( 60 | SELECT 61 | MIN(timestamp_column) as min_time, 62 | MAX(timestamp_column) as max_time, 63 | EXTRACT(EPOCH FROM (MAX(timestamp_column) - MIN(timestamp_column)))/3600 as total_hours 64 | FROM your_table_name 65 | ), 66 | total_index_size AS ( 67 | SELECT SUM(pg_relation_size(indexname::regclass)) as total_index_bytes 68 | FROM pg_stat_user_indexes 69 | WHERE schemaname||'.'||tablename = 'your_schema.your_table_name' 70 | ) 71 | SELECT 72 | pg_size_pretty(tis.total_index_bytes / tr.total_hours) as index_size_per_hour 73 | FROM time_range tr, total_index_size tis; 74 | ``` 75 | 76 | **Target:** Indexes of recent chunks < 25% of RAM 77 | **Default:** IMPORTANT: Keep default of 7 days if unsure 78 | **Range:** 1 hour minimum, 30 days maximum 79 | 80 | **Example:** 32GB RAM → target 8GB for recent indexes. If index_size_per_hour = 200MB: 81 | 82 | - 1 hour chunks: 200MB chunk index size × 40 recent = 8GB ✓ 83 | - 6 hour chunks: 1.2GB chunk index size × 7 recent = 8.4GB ✓ 84 | - 1 day chunks: 4.8GB chunk index size × 2 recent = 9.6GB ⚠️ 85 | Choose largest interval keeping 2+ recent chunk indexes under target. 86 | 87 | ### Primary Key / Unique Constraints Compatibility 88 | 89 | ```sql 90 | -- Check existing primary key / unique constraints on the table 91 | SELECT conname, pg_get_constraintdef(oid) as definition 92 | FROM pg_constraint 93 | WHERE conrelid = 'your_table_name'::regclass AND contype IN ('p', 'u'); 94 | ``` 95 | 96 | **Rules:** PK/UNIQUE must include partition column 97 | 98 | **Actions:** 99 | 100 | 1. **No PK/UNIQUE:** No changes needed 101 | 2. **PK/UNIQUE includes partition column:** No changes needed 102 | 3. **PK/UNIQUE excludes partition column:** ⚠️ **ASK USER PERMISSION** to modify PK/UNIQUE 103 | 104 | **Example: user prompt if needed:** 105 | 106 | > "Primary key (id) doesn't include partition column (timestamp). Must modify to PRIMARY KEY (id, timestamp) to convert to hypertable. This may break application code. Is this acceptable?" 107 | > "Unique constraint (id) doesn't include partition column (timestamp). Must modify to UNIQUE (id, timestamp) to convert to hypertable. This may break application code. Is this acceptable?" 108 | 109 | If the user accepts, modify the constraint: 110 | 111 | ```sql 112 | BEGIN; 113 | ALTER TABLE your_table_name DROP CONSTRAINT existing_pk_name; 114 | ALTER TABLE your_table_name ADD PRIMARY KEY (existing_columns, partition_column); 115 | COMMIT; 116 | ``` 117 | 118 | If the user does not accept, you should NOT migrate the table. 119 | 120 | IMPORTANT: DO NOT modify the primary key/unique constraint without user permission. 121 | 122 | ### Compression Configuration 123 | 124 | For detailed segment_by and order_by selection, see "setup-timescaledb-hypertables" skill.
Quick reference: 125 | 126 | **segment_by:** Most common WHERE filter with >100 rows per value per chunk 127 | 128 | - IoT: `device_id` 129 | - Finance: `symbol` 130 | - Analytics: `user_id` or `session_id` 131 | 132 | ```sql 133 | -- Analyze cardinality for segment_by selection 134 | SELECT column_name, COUNT(DISTINCT column_name) as unique_values, 135 | ROUND(COUNT(*)::float / COUNT(DISTINCT column_name), 2) as avg_rows_per_value 136 | FROM your_table_name GROUP BY column_name; 137 | ``` 138 | 139 | **order_by:** Usually `timestamp DESC`. The (segment_by, order_by) combination should form a natural time-series progression. 140 | 141 | - If column has <100 rows/chunk (too low for segment_by), prepend to order_by: `order_by='low_density_col, timestamp DESC'` 142 | 143 | **sparse indexes:** add minmax on the columns that are used in the WHERE clauses but are not in the segment_by or order_by. Use minmax for columns used in range queries. 144 | 145 | ```sql 146 | ALTER TABLE your_table_name SET ( 147 | timescaledb.enable_columnstore, 148 | timescaledb.segmentby = 'entity_id', 149 | timescaledb.orderby = 'timestamp DESC' 150 | timescaledb.sparse_index = 'minmax(value_1),...' 151 | ); 152 | 153 | -- Compress after data unlikely to change (adjust `after` parameter based on update patterns) 154 | CALL add_columnstore_policy('your_table_name', after => INTERVAL '7 days'); 155 | ``` 156 | 157 | ## Step 2: Migration Planning 158 | 159 | ### Pre-Migration Checklist 160 | 161 | - [ ] Partition column selected 162 | - [ ] Chunk interval calculated (or using default) 163 | - [ ] PK includes partition column OR user approved modification 164 | - [ ] No Hypertable→Hypertable foreign keys 165 | - [ ] Unique constraints include partition column 166 | - [ ] Created compression configuration (segment_by, order_by, sparse indexes, compression policy) 167 | - [ ] Maintenance window scheduled / backup created. 168 | 169 | ### Migration Options 170 | 171 | #### Option 1: In-Place (Tables < 1GB) 172 | 173 | ```sql 174 | -- Enable extension 175 | CREATE EXTENSION IF NOT EXISTS timescaledb; 176 | 177 | -- Convert to hypertable (locks table) 178 | SELECT create_hypertable( 179 | 'your_table_name', 180 | 'timestamp_column', 181 | chunk_time_interval => INTERVAL '7 days', 182 | if_not_exists => TRUE 183 | ); 184 | 185 | -- Configure compression 186 | ALTER TABLE your_table_name SET ( 187 | timescaledb.enable_columnstore, 188 | timescaledb.segmentby = 'entity_id', 189 | timescaledb.orderby = 'timestamp DESC', 190 | timescaledb.sparse_index = 'minmax(value_1),...' 191 | ); 192 | 193 | -- Adjust `after` parameter based on update patterns 194 | CALL add_columnstore_policy('your_table_name', after => INTERVAL '7 days'); 195 | ``` 196 | 197 | #### Option 2: Blue-Green (Tables > 1GB) 198 | 199 | ```sql 200 | -- 1. Create new hypertable 201 | CREATE TABLE your_table_name_new (LIKE your_table_name INCLUDING ALL); 202 | 203 | -- 2. Convert to hypertable 204 | SELECT create_hypertable('your_table_name_new', 'timestamp_column'); 205 | 206 | -- 3. Configure compression 207 | ALTER TABLE your_table_name_new SET ( 208 | timescaledb.enable_columnstore, 209 | timescaledb.segmentby = 'entity_id', 210 | timescaledb.orderby = 'timestamp DESC' 211 | ); 212 | 213 | -- 4. Migrate data in batches 214 | INSERT INTO your_table_name_new 215 | SELECT * FROM your_table_name 216 | WHERE timestamp_column >= '2024-01-01' AND timestamp_column < '2024-02-01'; 217 | -- Repeat for each time range 218 | 219 | -- 4. 
Enter maintenance window and do the following: 220 | 221 | -- 5. Pause modification of the old table. 222 | 223 | -- 6. Copy over the most recent data from the old table to the new table. 224 | 225 | -- 7. Swap tables 226 | BEGIN; 227 | ALTER TABLE your_table_name RENAME TO your_table_name_old; 228 | ALTER TABLE your_table_name_new RENAME TO your_table_name; 229 | COMMIT; 230 | 231 | -- 8. Exit maintenance window. 232 | 233 | -- 9. (sometime much later) Drop old table after validation 234 | -- DROP TABLE your_table_name_old; 235 | ``` 236 | 237 | ### Common Issues 238 | 239 | #### Foreign Keys 240 | 241 | ```sql 242 | -- Check foreign keys 243 | SELECT conname, confrelid::regclass as referenced_table 244 | FROM pg_constraint 245 | WHERE (conrelid = 'your_table_name'::regclass 246 | OR confrelid = 'your_table_name'::regclass) 247 | AND contype = 'f'; 248 | ``` 249 | 250 | **Supported:** Plain→Hypertable, Hypertable→Plain 251 | **NOT supported:** Hypertable→Hypertable 252 | 253 | ⚠️ **CRITICAL:** Hypertable→Hypertable FKs must be dropped (enforce in application). **ASK USER PERMISSION**. If no, **STOP MIGRATION**. 254 | 255 | #### Large Table Migration Time 256 | 257 | ```sql 258 | -- Rough estimate: ~75k rows/second 259 | SELECT 260 | pg_size_pretty(pg_total_relation_size(tablename)) as size, 261 | n_live_tup as rows, 262 | ROUND(n_live_tup / 75000.0 / 60, 1) as estimated_minutes 263 | FROM pg_stat_user_tables 264 | WHERE tablename = 'your_table_name'; 265 | ``` 266 | 267 | **Solutions for large tables (>1GB/10M rows):** Use blue-green migration, migrate during off-peak, test on subset first 268 | 269 | ## Step 3: Performance Validation 270 | 271 | ### Chunk & Compression Analysis 272 | 273 | ```sql 274 | -- View chunks and compression 275 | SELECT 276 | chunk_name, 277 | pg_size_pretty(total_bytes) as size, 278 | pg_size_pretty(compressed_total_bytes) as compressed_size, 279 | ROUND((total_bytes - compressed_total_bytes::numeric) / total_bytes * 100, 1) as compression_pct, 280 | range_start, 281 | range_end 282 | FROM timescaledb_information.chunks 283 | WHERE hypertable_name = 'your_table_name' 284 | ORDER BY range_start DESC; 285 | ``` 286 | 287 | **Look for:** 288 | 289 | - Consistent chunk sizes (within 2x) 290 | - Compression >90% for time-series 291 | - Recent chunks uncompressed 292 | - Chunk indexes < 25% RAM 293 | 294 | ### Query Performance Tests 295 | 296 | ```sql 297 | -- 1. Time-range query (should show chunk exclusion) 298 | EXPLAIN (ANALYZE, BUFFERS) 299 | SELECT COUNT(*), AVG(value) 300 | FROM your_table_name 301 | WHERE timestamp >= NOW() - INTERVAL '1 day'; 302 | 303 | -- 2. Entity + time query (benefits from segment_by) 304 | EXPLAIN (ANALYZE, BUFFERS) 305 | SELECT * FROM your_table_name 306 | WHERE entity_id = 'X' AND timestamp >= NOW() - INTERVAL '1 week'; 307 | 308 | -- 3. 
Aggregation (benefits from columnstore) 309 | EXPLAIN (ANALYZE, BUFFERS) 310 | SELECT DATE_TRUNC('hour', timestamp), entity_id, COUNT(*), AVG(value) 311 | FROM your_table_name 312 | WHERE timestamp >= NOW() - INTERVAL '1 month' 313 | GROUP BY 1, 2; 314 | ``` 315 | 316 | **✅ Good signs:** 317 | 318 | - "Chunks excluded during startup: X" in EXPLAIN plan 319 | - "Custom Scan (ColumnarScan)" for compressed data 320 | - Lower "Buffers: shared read" in EXPLAIN ANALYZE plan than pre-migration 321 | - Faster execution times 322 | 323 | **❌ Bad signs:** 324 | 325 | - "Seq Scan" on large chunks 326 | - No chunk exclusion messages 327 | - Slower than before migration 328 | 329 | ### Storage Metrics 330 | 331 | ```sql 332 | -- Monitor compression effectiveness 333 | SELECT 334 | hypertable_name, 335 | pg_size_pretty(total_bytes) as total_size, 336 | pg_size_pretty(compressed_total_bytes) as compressed_size, 337 | ROUND(compressed_total_bytes::numeric / total_bytes * 100, 1) as compressed_pct_of_total, 338 | ROUND((uncompressed_total_bytes - compressed_total_bytes::numeric) / 339 | uncompressed_total_bytes * 100, 1) as compression_ratio_pct 340 | FROM timescaledb_information.hypertables 341 | WHERE hypertable_name = 'your_table_name'; 342 | ``` 343 | 344 | **Monitor:** 345 | 346 | - compression_ratio_pct >90% (typical time-series) 347 | - compressed_pct_of_total growing as data ages 348 | - Size growth slowing significantly vs pre-hypertable 349 | - Decreasing compression_ratio_pct = poor segment_by 350 | 351 | ### Troubleshooting 352 | 353 | #### Poor Chunk Exclusion 354 | 355 | ```sql 356 | -- Verify chunks are being excluded 357 | EXPLAIN (ANALYZE, BUFFERS) 358 | SELECT * FROM your_table_name 359 | WHERE timestamp >= '2024-01-01' AND timestamp < '2024-01-02'; 360 | -- Look for "Chunks excluded during startup: X" 361 | ``` 362 | 363 | #### Poor Compression 364 | 365 | ```sql 366 | -- Get newest compressed chunk name 367 | SELECT chunk_name FROM timescaledb_information.chunks 368 | WHERE hypertable_name = 'your_table_name' 369 | AND compressed_total_bytes IS NOT NULL 370 | ORDER BY range_start DESC LIMIT 1; 371 | 372 | -- Analyze segment distribution 373 | SELECT segment_by_column, COUNT(*) as rows_per_segment 374 | FROM _timescaledb_internal._hyper_X_Y_chunk -- Use actual chunk name 375 | GROUP BY 1 ORDER BY 2 DESC; 376 | ``` 377 | 378 | **Look for:** <20 rows per segment: Poor segment_by choice (should be >100) => Low compression potential. 379 | 380 | #### Poor insert performance 381 | 382 | Check that you don't have too many indexes. Unused indexes hurt insert performance and should be dropped. 383 | 384 | ```sql 385 | SELECT 386 | schemaname, 387 | relname, 388 | indexrelname, 389 | idx_tup_read, 390 | idx_tup_fetch, 391 | idx_scan 392 | FROM pg_stat_user_indexes 393 | WHERE relname LIKE '%your_table_name%' 394 | ORDER BY idx_scan DESC; 395 | ``` 396 | 397 | **Look for:** Unused indexes, indicated by an idx_scan value at or near zero. Drop such indexes (but ask user permission). 
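If the user agrees, a minimal sketch for removing one of the unused indexes identified above (the index name is illustrative, not taken from this document):

```sql
-- Hypothetical unused index on the hypertable; dropping it reduces
-- per-insert maintenance work. Run during a quiet period if the table is busy.
DROP INDEX IF EXISTS idx_your_table_name_unused;
```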
398 | 399 | ### Ongoing Monitoring 400 | 401 | ```sql 402 | -- Monitor chunk compression status 403 | CREATE OR REPLACE VIEW hypertable_compression_status AS 404 | SELECT 405 | h.hypertable_name, 406 | COUNT(c.chunk_name) as total_chunks, 407 | COUNT(c.chunk_name) FILTER (WHERE c.compressed_total_bytes IS NOT NULL) as compressed_chunks, 408 | ROUND( 409 | COUNT(c.chunk_name) FILTER (WHERE c.compressed_total_bytes IS NOT NULL)::numeric / 410 | COUNT(c.chunk_name) * 100, 1 411 | ) as compression_coverage_pct, 412 | pg_size_pretty(SUM(c.total_bytes)) as total_size, 413 | pg_size_pretty(SUM(c.compressed_total_bytes)) as compressed_size 414 | FROM timescaledb_information.hypertables h 415 | LEFT JOIN timescaledb_information.chunks c ON h.hypertable_name = c.hypertable_name 416 | GROUP BY h.hypertable_name; 417 | 418 | -- Query this view regularly to monitor compression progress 419 | SELECT * FROM hypertable_compression_status 420 | WHERE hypertable_name = 'your_table_name'; 421 | ``` 422 | 423 | **Look for:** 424 | 425 | - compression_coverage_pct should increase over time as data ages and gets compressed. 426 | - total_chunks should not grow too quickly (more than 10000 becomes a problem). 427 | - You should not see unexpected spikes in total_size or compressed_size. 428 | 429 | ## Success Criteria 430 | 431 | **✅ Migration successful when:** 432 | 433 | - All queries return correct results 434 | - Query performance equal or better 435 | - Compression >90% for older data 436 | - Chunk exclusion working for time queries 437 | - Insert performance acceptable 438 | 439 | **❌ Investigate if:** 440 | 441 | - Query performance >20% worse 442 | - Compression <80% 443 | - No chunk exclusion 444 | - Insert performance degraded 445 | - Increased error rates 446 | 447 | Focus on high-volume, insert-heavy workloads with time-based access patterns for best ROI. 448 | -------------------------------------------------------------------------------- /skills/design-postgres-tables/SKILL.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: design-postgres-tables 3 | description: Comprehensive PostgreSQL-specific table design reference covering data types, indexing, constraints, performance patterns, and advanced features 4 | --- 5 | 6 | # PostgreSQL Table Design 7 | 8 | ## Core Rules 9 | 10 | - Define a **PRIMARY KEY** for reference tables (users, orders, etc.). Not always needed for time-series/event/log data. When used, prefer `BIGINT GENERATED ALWAYS AS IDENTITY`; use `UUID` only when global uniqueness/opacity is needed. 11 | - **Normalize first (to 3NF)** to eliminate data redundancy and update anomalies; denormalize **only** for measured, high-ROI reads where join performance is proven problematic. Premature denormalization creates maintenance burden. 12 | - Add **NOT NULL** everywhere it’s semantically required; use **DEFAULT**s for common values. 13 | - Create **indexes for access paths you actually query**: PK/unique (auto), **FK columns (manual!)**, frequent filters/sorts, and join keys. 14 | - Prefer **TIMESTAMPTZ** for event time; **NUMERIC** for money; **TEXT** for strings; **BIGINT** for integer values, **DOUBLE PRECISION** for floats (or `NUMERIC` for exact decimal arithmetic). 15 | 16 | ## PostgreSQL “Gotchas” 17 | 18 | - **Identifiers**: unquoted → lowercased. Avoid quoted/mixed-case names. Convention: use `snake_case` for table/column names. 19 | - **Unique + NULLs**: UNIQUE allows multiple NULLs. Use `UNIQUE (...) 
NULLS NOT DISTINCT` (PG15+) to restrict to one NULL. 20 | - **FK indexes**: PostgreSQL **does not** auto-index FK columns. Add them. 21 | - **No silent coercions**: length/precision overflows error out (no truncation). Example: inserting 999 into `NUMERIC(2,0)` fails with error, unlike some databases that silently truncate or round. 22 | - **Sequences/identity have gaps** (normal; don't "fix"). Rollbacks, crashes, and concurrent transactions create gaps in ID sequences (1, 2, 5, 6...). This is expected behavior—don't try to make IDs consecutive. 23 | - **Heap storage**: no clustered PK by default (unlike SQL Server/MySQL InnoDB); `CLUSTER` is one-off reorganization, not maintained on subsequent inserts. Row order on disk is insertion order unless explicitly clustered. 24 | - **MVCC**: updates/deletes leave dead tuples; vacuum handles them—design to avoid hot wide-row churn. 25 | 26 | ## Data Types 27 | 28 | - **IDs**: `BIGINT GENERATED ALWAYS AS IDENTITY` preferred (`GENERATED BY DEFAULT` also fine); `UUID` when merging/federating/used in a distributed system or for opaque IDs. Generate with `uuidv7()` (preferred if using PG18+) or `gen_random_uuid()` (if using an older PG version). 29 | - **Integers**: prefer `BIGINT` unless storage space is critical; `INTEGER` for smaller ranges; avoid `SMALLINT` unless constrained. 30 | - **Floats**: prefer `DOUBLE PRECISION` over `REAL` unless storage space is critical. Use `NUMERIC` for exact decimal arithmetic. 31 | - **Strings**: prefer `TEXT`; if length limits needed, use `CHECK (LENGTH(col) <= n)` instead of `VARCHAR(n)`; avoid `CHAR(n)`. Use `BYTEA` for binary data. Large strings/binary (>2KB default threshold) automatically stored in TOAST with compression. TOAST storage: `PLAIN` (no TOAST), `EXTENDED` (compress + out-of-line), `EXTERNAL` (out-of-line, no compress), `MAIN` (compress, keep in-line if possible). Default `EXTENDED` usually optimal. Control with `ALTER TABLE tbl ALTER COLUMN col SET STORAGE strategy` and `ALTER TABLE tbl SET (toast_tuple_target = 4096)` for threshold. Case-insensitive: for locale/accent handling use non-deterministic collations; for plain ASCII use expression indexes on `LOWER(col)` (preferred unless column needs case-insensitive PK/FK/UNIQUE) or `CITEXT`. 32 | - **Money**: `NUMERIC(p,s)` (never float). 33 | - **Time**: `TIMESTAMPTZ` for timestamps; `DATE` for date-only; `INTERVAL` for durations. Avoid `TIMESTAMP` (without timezone). Use `now()` for transaction start time, `clock_timestamp()` for current wall-clock time. 34 | - **Booleans**: `BOOLEAN` with `NOT NULL` constraint unless tri-state values are required. 35 | - **Enums**: `CREATE TYPE ... AS ENUM` for small, stable sets (e.g. US states, days of week). For business-logic-driven and evolving values (e.g. order statuses) → use TEXT (or INT) + CHECK or lookup table. 36 | - **Arrays**: `TEXT[]`, `INTEGER[]`, etc. Use for ordered lists where you query elements. Index with **GIN** for containment (`@>`, `<@`) and overlap (`&&`) queries. Access: `arr[1]` (1-indexed), `arr[1:3]` (slicing). Good for tags, categories; avoid for relations—use junction tables instead. Literal syntax: `'{val1,val2}'` or `ARRAY[val1,val2]`. 37 | - **Range types**: `daterange`, `numrange`, `tstzrange` for intervals. Support overlap (`&&`), containment (`@>`), operators. Index with **GiST**. Good for scheduling, versioning, numeric ranges. Pick a bounds scheme and use it consistently; prefer `[)` (inclusive/exclusive) by default. 
38 | - **Network types**: `INET` for IP addresses, `CIDR` for network ranges, `MACADDR` for MAC addresses. Support network operators (`<<`, `>>`, `&&`). 39 | - **Geometric types**: avoid `POINT`, `LINE`, `POLYGON`, `CIRCLE`. Index with **GiST**. Consider **PostGIS** for spatial features. 40 | - **Text search**: `TSVECTOR` for full-text search documents, `TSQUERY` for search queries. Index `tsvector` with **GIN**. Always specify language: `to_tsvector('english', col)` and `to_tsquery('english', 'query')`. Never use single-argument versions. This applies to both index expressions and queries. 41 | - **Domain types**: `CREATE DOMAIN email AS TEXT CHECK (VALUE ~ '^[^@]+@[^@]+$')` for reusable custom types with validation. Enforces constraints across tables. 42 | - **Composite types**: `CREATE TYPE address AS (street TEXT, city TEXT, zip TEXT)` for structured data within columns. Access with `(col).field` syntax. 43 | - **JSONB**: preferred over JSON; index with **GIN**. Use only for optional/semi-structured attrs. ONLY use JSON if the original ordering of the contents MUST be preserved. 44 | - **Vector types**: `vector` type by `pgvector` for vector similarity search for embeddings. 45 | 46 | ### Do not use the following data types 47 | 48 | - DO NOT use `timestamp` (without time zone); DO use `timestamptz` instead. 49 | - DO NOT use `char(n)` or `varchar(n)`; DO use `text` instead. 50 | - DO NOT use `money` type; DO use `numeric` instead. 51 | - DO NOT use `timetz` type; DO use `timestamptz` instead. 52 | - DO NOT use `timestamptz(0)` or any other precision specification; DO use `timestamptz` instead 53 | - DO NOT use `serial` type; DO use `generated always as identity` instead. 54 | - DO NOT use `POINT`, `LINE`, `POLYGON`, `CIRCLE` built-in types, DO use `geometry` from postgis extension instead. 55 | 56 | ## Table Types 57 | 58 | - **Regular**: default; fully durable, logged. 59 | - **TEMPORARY**: session-scoped, auto-dropped, not logged. Faster for scratch work. 60 | - **UNLOGGED**: persistent but not crash-safe. Faster writes; good for caches/staging. 61 | 62 | ## Row-Level Security 63 | 64 | Enable with `ALTER TABLE tbl ENABLE ROW LEVEL SECURITY`. Create policies: `CREATE POLICY user_access ON orders FOR SELECT TO app_users USING (user_id = current_user_id())`. Built-in user-based access control at the row level. 65 | 66 | ## Constraints 67 | 68 | - **PK**: implicit UNIQUE + NOT NULL; creates a B-tree index. 69 | - **FK**: specify `ON DELETE/UPDATE` action (`CASCADE`, `RESTRICT`, `SET NULL`, `SET DEFAULT`). Add explicit index on referencing column—speeds up joins and prevents locking issues on parent deletes/updates. Use `DEFERRABLE INITIALLY DEFERRED` for circular FK dependencies checked at transaction end. 70 | - **UNIQUE**: creates a B-tree index; allows multiple NULLs unless `NULLS NOT DISTINCT` (PG15+). Standard behavior: `(1, NULL)` and `(1, NULL)` are allowed. With `NULLS NOT DISTINCT`: only one `(1, NULL)` allowed. Prefer `NULLS NOT DISTINCT` unless you specifically need duplicate NULLs. 71 | - **CHECK**: row-local constraints; NULL values pass the check (three-valued logic). Example: `CHECK (price > 0)` allows NULL prices. Combine with `NOT NULL` to enforce: `price NUMERIC NOT NULL CHECK (price > 0)`. 72 | - **EXCLUDE**: prevents overlapping values using operators. `EXCLUDE USING gist (room_id WITH =, booking_period WITH &&)` prevents double-booking rooms. Requires appropriate index type (often GiST). 
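Putting the constraint rules above together, a minimal sketch (the `rooms` table and all names are illustrative; `btree_gist` is assumed to be available so the scalar `=` can participate in the GiST exclusion constraint):

```sql
CREATE EXTENSION IF NOT EXISTS btree_gist;

CREATE TABLE bookings (
  booking_id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
  room_id    BIGINT NOT NULL REFERENCES rooms(room_id) ON DELETE RESTRICT,
  period     TSTZRANGE NOT NULL CHECK (NOT isempty(period)),
  price      NUMERIC(10,2) NOT NULL CHECK (price > 0),
  -- Prevent double-booking the same room for overlapping periods
  EXCLUDE USING gist (room_id WITH =, period WITH &&)
);

-- FK columns are not indexed automatically; add one for joins and parent deletes/updates
CREATE INDEX ON bookings (room_id);
```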
73 | 74 | ## Indexing 75 | 76 | - **B-tree**: default for equality/range queries (`=`, `<`, `>`, `BETWEEN`, `ORDER BY`) 77 | - **Composite**: order matters—index used if equality on leftmost prefix (`WHERE a = ? AND b > ?` uses index on `(a,b)`, but `WHERE b = ?` does not). Put most selective/frequently filtered columns first. 78 | - **Covering**: `CREATE INDEX ON tbl (id) INCLUDE (name, email)` - includes non-key columns for index-only scans without visiting table. 79 | - **Partial**: for hot subsets (`WHERE status = 'active'` → `CREATE INDEX ON tbl (user_id) WHERE status = 'active'`). Any query with `status = 'active'` can use this index. 80 | - **Expression**: for computed search keys (`CREATE INDEX ON tbl (LOWER(email))`). Expression must match exactly in WHERE clause: `WHERE LOWER(email) = 'user@example.com'`. 81 | - **GIN**: JSONB containment/existence, arrays (`@>`, `?`), full-text search (`@@`) 82 | - **GiST**: ranges, geometry, exclusion constraints 83 | - **BRIN**: very large, naturally ordered data (time-series)—minimal storage overhead. Effective when row order on disk correlates with indexed column (insertion order or after `CLUSTER`). 84 | 85 | ## Partitioning 86 | 87 | - Use for very large tables (>100M rows) where queries consistently filter on partition key (often time/date). 88 | - Alternate use: use for tables where data maintenance tasks dictates e.g. data pruned or bulk replaced periodically 89 | - **RANGE**: common for time-series (`PARTITION BY RANGE (created_at)`). Create partitions: `CREATE TABLE logs_2024_01 PARTITION OF logs FOR VALUES FROM ('2024-01-01') TO ('2024-02-01')`. **TimescaleDB** automates time-based or ID-based partitioning with retention policies and compression. 90 | - **LIST**: for discrete values (`PARTITION BY LIST (region)`). Example: `FOR VALUES IN ('us-east', 'us-west')`. 91 | - **HASH**: for even distribution when no natural key (`PARTITION BY HASH (user_id)`). Creates N partitions with modulus. 92 | - **Constraint exclusion**: requires `CHECK` constraints on partitions for query planner to prune. Auto-created for declarative partitioning (PG10+). 93 | - Prefer declarative partitioning or hypertables. Do NOT use table inheritance. 94 | - **Limitations**: no global UNIQUE constraints—include partition key in PK/UNIQUE. FKs from partitioned tables not supported; use triggers. 95 | 96 | ## Special Considerations 97 | 98 | ### Update-Heavy Tables 99 | 100 | - **Separate hot/cold columns**—put frequently updated columns in separate table to minimize bloat. 101 | - **Use `fillfactor=90`** to leave space for HOT updates that avoid index maintenance. 102 | - **Avoid updating indexed columns**—prevents beneficial HOT updates. 103 | - **Partition by update patterns**—separate frequently updated rows in a different partition from stable data. 104 | 105 | ### Insert-Heavy Workloads 106 | 107 | - **Minimize indexes**—only create what you query; every index slows inserts. 108 | - **Use `COPY` or multi-row `INSERT`** instead of single-row inserts. 109 | - **UNLOGGED tables** for rebuildable staging data—much faster writes. 110 | - **Defer index creation** for bulk loads—>drop index, load data, recreate indexes. 111 | - **Partition by time/hash** to distribute load. **TimescaleDB** automates partitioning and compression of insert-heavy data. 112 | - **Use a natural key for primary key** such as a (timestamp, device_id) if enforcing global uniqueness is important many insert-heavy tables don't need a primary key at all. 
113 | - If you do need a surrogate key, **Prefer `BIGINT GENERATED ALWAYS AS IDENTITY` over `UUID`**. 114 | 115 | ### Upsert-Friendly Design 116 | 117 | - **Requires UNIQUE index** on conflict target columns—`ON CONFLICT (col1, col2)` needs exact matching unique index (partial indexes don't work). 118 | - **Use `EXCLUDED.column`** to reference would-be-inserted values; only update columns that actually changed to reduce write overhead. 119 | - **`DO NOTHING` faster** than `DO UPDATE` when no actual update needed. 120 | 121 | ### Safe Schema Evolution 122 | 123 | - **Transactional DDL**: most DDL operations can run in transactions and be rolled back—`BEGIN; ALTER TABLE...; ROLLBACK;` for safe testing. 124 | - **Concurrent index creation**: `CREATE INDEX CONCURRENTLY` avoids blocking writes but can't run in transactions. 125 | - **Volatile defaults cause rewrites**: adding `NOT NULL` columns with volatile defaults (e.g., `now()`, `gen_random_uuid()`) rewrites entire table. Non-volatile defaults are fast. 126 | - **Drop constraints before columns**: `ALTER TABLE DROP CONSTRAINT` then `DROP COLUMN` to avoid dependency issues. 127 | - **Function signature changes**: `CREATE OR REPLACE` with different arguments creates overloads, not replacements. DROP old version if no overload desired. 128 | 129 | ## Generated Columns 130 | 131 | - `... GENERATED ALWAYS AS () STORED` for computed, indexable fields. PG18+ adds `VIRTUAL` columns (computed on read, not stored). 132 | 133 | ## Extensions 134 | 135 | - **`pgcrypto`**: `crypt()` for password hashing. 136 | - **`uuid-ossp`**: alternative UUID functions; prefer `pgcrypto` for new projects. 137 | - **`pg_trgm`**: fuzzy text search with `%` operator, `similarity()` function. Index with GIN for `LIKE '%pattern%'` acceleration. 138 | - **`citext`**: case-insensitive text type. Prefer expression indexes on `LOWER(col)` unless you need case-insensitive constraints. 139 | - **`btree_gin`/`btree_gist`**: enable mixed-type indexes (e.g., GIN index on both JSONB and text columns). 140 | - **`hstore`**: key-value pairs; mostly superseded by JSONB but useful for simple string mappings. 141 | - **`timescaledb`**: essential for time-series—automated partitioning, retention, compression, continuous aggregates. 142 | - **`postgis`**: comprehensive geospatial support beyond basic geometric types—essential for location-based applications. 143 | - **`pgvector`**: vector similarity search for embeddings. 144 | - **`pgaudit`**: audit logging for all database activity. 145 | 146 | ## JSONB Guidance 147 | 148 | - Prefer `JSONB` with **GIN** index. 149 | - Default: `CREATE INDEX ON tbl USING GIN (jsonb_col);` → accelerates: 150 | - **Containment** `jsonb_col @> '{"k":"v"}'` 151 | - **Key existence** `jsonb_col ? 
'k'`, **any/all keys** `?\|`, `?&` 152 | - **Path containment** on nested docs 153 | - **Disjunction** `jsonb_col @> ANY(ARRAY['{"status":"active"}', '{"status":"pending"}'])` 154 | - Heavy `@>` workloads: consider opclass `jsonb_path_ops` for smaller/faster containment-only indexes: 155 | - `CREATE INDEX ON tbl USING GIN (jsonb_col jsonb_path_ops);` 156 | - **Trade-off**: loses support for key existence (`?`, `?|`, `?&`) queries—only supports containment (`@>`) 157 | - Equality/range on a specific scalar field: extract and index with B-tree (generated column or expression): 158 | - `ALTER TABLE tbl ADD COLUMN price INT GENERATED ALWAYS AS ((jsonb_col->>'price')::INT) STORED;` 159 | - `CREATE INDEX ON tbl (price);` 160 | - Prefer queries like `WHERE price BETWEEN 100 AND 500` (uses B-tree) over `WHERE (jsonb_col->>'price')::INT BETWEEN 100 AND 500` without index. 161 | - Arrays inside JSONB: use GIN + `@>` for containment (e.g., tags). Consider `jsonb_path_ops` if only doing containment. 162 | - Keep core relations in tables; use JSONB for optional/variable attributes. 163 | - Use constraints to limit allowed JSONB values in a column e.g. `config JSONB NOT NULL CHECK(jsonb_typeof(config) = 'object')` 164 | 165 | ## Examples 166 | 167 | ### Users 168 | 169 | ```sql 170 | CREATE TABLE users ( 171 | user_id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, 172 | email TEXT NOT NULL UNIQUE, 173 | name TEXT NOT NULL, 174 | created_at TIMESTAMPTZ NOT NULL DEFAULT now() 175 | ); 176 | CREATE UNIQUE INDEX ON users (LOWER(email)); 177 | CREATE INDEX ON users (created_at); 178 | ``` 179 | 180 | ### Orders 181 | 182 | ```sql 183 | CREATE TABLE orders ( 184 | order_id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, 185 | user_id BIGINT NOT NULL REFERENCES users(user_id), 186 | status TEXT NOT NULL DEFAULT 'PENDING' CHECK (status IN ('PENDING','PAID','CANCELED')), 187 | total NUMERIC(10,2) NOT NULL CHECK (total > 0), 188 | created_at TIMESTAMPTZ NOT NULL DEFAULT now() 189 | ); 190 | CREATE INDEX ON orders (user_id); 191 | CREATE INDEX ON orders (created_at); 192 | ``` 193 | 194 | ### JSONB 195 | 196 | ```sql 197 | CREATE TABLE profiles ( 198 | user_id BIGINT PRIMARY KEY REFERENCES users(user_id), 199 | attrs JSONB NOT NULL DEFAULT '{}', 200 | theme TEXT GENERATED ALWAYS AS (attrs->>'theme') STORED 201 | ); 202 | CREATE INDEX profiles_attrs_gin ON profiles USING GIN (attrs); 203 | ``` 204 | -------------------------------------------------------------------------------- /skills/setup-timescaledb-hypertables/SKILL.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: setup-timescaledb-hypertables 3 | description: 'Step-by-step instructions for designing table schemas and setting up TimescaleDB with hypertables, indexes, compression, retention policies, and continuous aggregates. Instructions for selecting: partition columns, segment_by columns, order_by columns, chunk time interval, real-time aggregation.' 
4 | --- 5 | 6 | # TimescaleDB Complete Setup 7 | 8 | Instructions for insert-heavy data patterns where data is inserted but rarely changed: 9 | 10 | - **Time-series data** (sensors, metrics, system monitoring) 11 | - **Event logs** (user events, audit trails, application logs) 12 | - **Transaction records** (orders, payments, financial transactions) 13 | - **Sequential data** (records with auto-incrementing IDs and timestamps) 14 | - **Append-only datasets** (immutable records, historical data) 15 | 16 | ## Step 1: Create Hypertable 17 | 18 | ```sql 19 | CREATE TABLE your_table_name ( 20 | timestamp TIMESTAMPTZ NOT NULL, 21 | entity_id TEXT NOT NULL, -- device_id, user_id, symbol, etc. 22 | category TEXT, -- sensor_type, event_type, asset_class, etc. 23 | value_1 DOUBLE PRECISION, -- price, temperature, latency, etc. 24 | value_2 DOUBLE PRECISION, -- volume, humidity, throughput, etc. 25 | value_3 INTEGER, -- count, status, level, etc. 26 | metadata JSONB -- flexible additional data 27 | ) WITH ( 28 | tsdb.hypertable, 29 | tsdb.partition_column='timestamp', 30 | tsdb.enable_columnstore=true, -- Disable if table has vector columns 31 | tsdb.segmentby='entity_id', -- See selection guide below 32 | tsdb.orderby='timestamp DESC', -- See selection guide below 33 | tsdb.sparse_index='minmax(value_1),minmax(value_2),minmax(value_3)' -- see selection guide below 34 | ); 35 | ``` 36 | 37 | ### Compression Decision 38 | 39 | - **Enable by default** for insert-heavy patterns 40 | - **Disable** if table has vector type columns (pgvector) - indexes on vector columns incompatible with columnstore 41 | 42 | ### Partition Column Selection 43 | 44 | Must be time-based (TIMESTAMP/TIMESTAMPTZ/DATE) or integer (INT/BIGINT) with good temporal/sequential distribution. 45 | 46 | **Common patterns:** 47 | 48 | - TIME-SERIES: `timestamp`, `event_time`, `measured_at` 49 | - EVENT LOGS: `event_time`, `created_at`, `logged_at` 50 | - TRANSACTIONS: `created_at`, `transaction_time`, `processed_at` 51 | - SEQUENTIAL: `id` (auto-increment when no timestamp), `sequence_number` 52 | - APPEND-ONLY: `created_at`, `inserted_at`, `id` 53 | 54 | **Less ideal:** `ingested_at` (when data entered system - use only if it's your primary query dimension) 55 | **Avoid:** `updated_at` (breaks time ordering unless it's primary query dimension) 56 | 57 | ### Segment_By Column Selection 58 | 59 | **PREFER SINGLE COLUMN** - multi-column rarely optimal. Multi-column can only work for highly correlated columns (e.g., metric_name + metric_type) with sufficient row density. 60 | 61 | **Requirements:** 62 | 63 | - Frequently used in WHERE clauses (most common filter) 64 | - Good row density (>100 rows per value per chunk) 65 | - Primary logical partition/grouping 66 | 67 | **Examples:** 68 | 69 | - IoT: `device_id` 70 | - Finance: `symbol` 71 | - Metrics: `service_name`, `service_name, metric_type` (if sufficient row density), `metric_name, metric_type` (if sufficient row density) 72 | - Analytics: `user_id` if sufficient row density, otherwise `session_id` 73 | - E-commerce: `product_id` if sufficient row density, otherwise `category_id` 74 | 75 | **Row density guidelines:** 76 | 77 | - Target: >100 rows per segment_by value within each chunk. 78 | - Poor: <10 rows per segment_by value per chunk → choose less granular column 79 | - What to do with low-density columns: prepend to order_by column list. 80 | 81 | **Query pattern drives choice:** 82 | 83 | ```sql 84 | SELECT * FROM table WHERE entity_id = 'X' AND timestamp > ... 
85 | -- ↳ segment_by: entity_id (if >100 rows per chunk) 86 | ``` 87 | 88 | **Avoid:** timestamps, unique IDs, low-density columns (<100 rows/value/chunk), columns rarely used in filtering 89 | 90 | ### Order_By Column Selection 91 | 92 | Creates natural time-series progression when combined with segment_by for optimal compression. 93 | 94 | **Most common:** `timestamp DESC` 95 | 96 | **Examples:** 97 | 98 | - IoT/Finance/E-commerce: `timestamp DESC` 99 | - Metrics: `metric_name, timestamp DESC` (if metric_name has too low density for segment_by) 100 | - Analytics: `user_id, timestamp DESC` (user_id has too low density for segment_by) 101 | 102 | **Alternative patterns:** 103 | 104 | - `sequence_id DESC` for event streams with sequence numbers 105 | - `timestamp DESC, event_order DESC` for sub-ordering within same timestamp 106 | 107 | **Low-density column handling:** 108 | If a column has <100 rows per chunk (too low for segment_by), prepend it to order_by: 109 | 110 | - Example: `metric_name` has 20 rows/chunk → use `segment_by='service_name'`, `order_by='metric_name, timestamp DESC'` 111 | - Groups similar values together (all temperature readings, then pressure readings) for better compression 112 | 113 | **Good test:** ordering created by `(segment_by_column, order_by_column)` should form a natural time-series progression. Values close to each other in the progression should be similar. 114 | 115 | **Avoid in order_by:** random columns, columns with high variance between adjacent rows, columns unrelated to segment_by 116 | 117 | ### Compression Sparse Index Selection 118 | 119 | **Sparse indexes** enable query filtering on compressed data without decompression. Store metadata per batch (~1000 rows) to eliminate batches that don't match query predicates. 120 | 121 | **Types:** 122 | 123 | - **minmax:** Min/max values per batch - for range queries (>, <, BETWEEN) on numeric/temporal columns 124 | 125 | **Use minmax for:** price, temperature, measurement, timestamp (range filtering) 126 | 127 | **Use for:** 128 | 129 | - minmax for outlier detection (temperature > 90). 130 | - minmax for fields that are highly correlated with segmentby and orderby columns (e.g. if orderby includes `created_at`, minmax on `updated_at` is useful). 131 | 132 | **Avoid:** rarely filtered columns. 133 | 134 | IMPORTANT: NEVER index columns in segmentby or orderby. Orderby columns will always have minmax indexes without any configuration. 135 | 136 | **Configuration:** 137 | The format is a comma-separated list of type_of_index(column_name). 138 | 139 | ```sql 140 | ALTER TABLE table_name SET ( 141 | timescaledb.sparse_index = 'minmax(value_1),minmax(value_2)' 142 | ); 143 | ``` 144 | 145 | Explicit configuration available since v2.22.0 (was auto-created since v2.16.0). 146 | 147 | ### Chunk Time Interval (Optional) 148 | 149 | Default: 7 days (use if volume unknown, or ask user). Adjust based on volume: 150 | 151 | - High frequency: 1 hour - 1 day 152 | - Medium: 1 day - 1 week 153 | - Low: 1 week - 1 month 154 | 155 | ```sql 156 | SELECT set_chunk_time_interval('your_table_name', INTERVAL '1 day'); 157 | ``` 158 | 159 | **Good test:** recent chunk indexes should fit in less than 25% of RAM. 
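A rough way to check this, assuming the `chunks_detailed_size()` helper and the `timescaledb_information.chunks` view available in recent TimescaleDB versions (compare index sizes of the most recent chunks against ~25% of the instance's RAM):

```sql
-- Index and total size of the five most recent chunks
SELECT d.chunk_name,
       pg_size_pretty(d.index_bytes) AS index_size,
       pg_size_pretty(d.total_bytes) AS total_size
FROM chunks_detailed_size('your_table_name') d
JOIN timescaledb_information.chunks c ON c.chunk_name = d.chunk_name
WHERE c.hypertable_name = 'your_table_name'
ORDER BY c.range_start DESC
LIMIT 5;
```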
160 | 161 | ### Indexes & Primary Keys 162 | 163 | Common index patterns - composite indexes on an id and timestamp: 164 | 165 | ```sql 166 | CREATE INDEX idx_entity_timestamp ON your_table_name (entity_id, timestamp DESC); 167 | ``` 168 | 169 | **Important:** Only create indexes you'll actually use - each has maintenance overhead. 170 | 171 | **Primary key and unique constraints rules:** Must include partition column. 172 | 173 | **Option 1: Composite PK with partition column** 174 | 175 | ```sql 176 | ALTER TABLE your_table_name ADD PRIMARY KEY (entity_id, timestamp); 177 | ``` 178 | 179 | **Option 2: Single-column PK (only if it's the partition column)** 180 | 181 | ```sql 182 | CREATE TABLE ... (id BIGINT PRIMARY KEY, ...) WITH (tsdb.partition_column='id'); 183 | ``` 184 | 185 | **Option 3: No PK**: strict uniqueness is often not required for insert-heavy patterns. 186 | 187 | ## Step 2: Compression Policy 188 | 189 | Set `after` interval for when: data becomes mostly immutable (some updates/backfill OK) AND B-tree indexes aren't needed for queries (less common criterion). 190 | 191 | ```sql 192 | -- Adjust 'after' based on update patterns 193 | CALL add_columnstore_policy('your_table_name', after => INTERVAL '1 day'); 194 | ``` 195 | 196 | ## Step 3: Retention Policy 197 | 198 | IMPORTANT: Don't guess - ask user or comment out if unknown. 199 | 200 | ```sql 201 | -- Example - replace with requirements or comment out 202 | SELECT add_retention_policy('your_table_name', INTERVAL '365 days'); 203 | ``` 204 | 205 | ## Step 4: Create Continuous Aggregates 206 | 207 | Use different aggregation intervals for different uses. 208 | 209 | ### Short-term (Minutes/Hours) 210 | 211 | For up-to-the-minute dashboards on high-frequency data. 212 | 213 | ```sql 214 | CREATE MATERIALIZED VIEW your_table_hourly 215 | WITH (timescaledb.continuous) AS 216 | SELECT 217 | time_bucket(INTERVAL '1 hour', timestamp) AS bucket, 218 | entity_id, 219 | category, 220 | COUNT(*) as record_count, 221 | AVG(value_1) as avg_value_1, 222 | MIN(value_1) as min_value_1, 223 | MAX(value_1) as max_value_1, 224 | SUM(value_2) as sum_value_2 225 | FROM your_table_name 226 | GROUP BY bucket, entity_id, category; 227 | ``` 228 | 229 | ### Long-term (Days/Weeks/Months) 230 | 231 | For long-term reporting and analytics. 232 | 233 | ```sql 234 | CREATE MATERIALIZED VIEW your_table_daily 235 | WITH (timescaledb.continuous) AS 236 | SELECT 237 | time_bucket(INTERVAL '1 day', timestamp) AS bucket, 238 | entity_id, 239 | category, 240 | COUNT(*) as record_count, 241 | AVG(value_1) as avg_value_1, 242 | MIN(value_1) as min_value_1, 243 | MAX(value_1) as max_value_1, 244 | PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY value_1) as median_value_1, 245 | PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY value_1) as p95_value_1, 246 | SUM(value_2) as sum_value_2 247 | FROM your_table_name 248 | GROUP BY bucket, entity_id, category; 249 | ``` 250 | 251 | ## Step 5: Aggregate Refresh Policies 252 | 253 | Set up refresh policies based on your data freshness requirements. 254 | 255 | **start_offset:** Usually omit (refreshes all). Exception: If you don't care about refreshing data older than X (see below). With retention policy on raw data: match the retention policy. 256 | 257 | **end_offset:** Set beyond active update window (e.g., 15 min if data usually arrives within 10 min). Data newer than end_offset won't appear in queries without real-time aggregation. 
If you don't know your update window, use the size of the time_bucket in the query, but not less than 5 minutes. 258 | 259 | **schedule_interval:** Set to the same value as the end_offset but not more than 1 hour. 260 | 261 | **Hourly - frequent refresh for dashboards:** 262 | 263 | ```sql 264 | SELECT add_continuous_aggregate_policy('your_table_hourly', 265 | end_offset => INTERVAL '15 minutes', 266 | schedule_interval => INTERVAL '15 minutes'); 267 | ``` 268 | 269 | **Daily - less frequent for reports:** 270 | 271 | ```sql 272 | SELECT add_continuous_aggregate_policy('your_table_daily', 273 | end_offset => INTERVAL '1 hour', 274 | schedule_interval => INTERVAL '1 hour'); 275 | ``` 276 | 277 | **Use start_offset only if you don't care about refreshing old data** 278 | Use for high-volume systems where query accuracy on older data doesn't matter: 279 | 280 | ```sql 281 | -- the following aggregate can be stale for data older than 7 days 282 | -- SELECT add_continuous_aggregate_policy('aggregate_for_last_7_days', 283 | -- start_offset => INTERVAL '7 days', -- only refresh last 7 days 284 | -- end_offset => INTERVAL '15 minutes', 285 | -- schedule_interval => INTERVAL '15 minutes'); 286 | ``` 287 | 288 | IMPORTANT: you MUST set a start_offset to be less than the retention policy on raw data. By default, set the start_offset equal to the retention policy. 289 | If the retention policy is commented out, comment out the start_offset as well. like this: 290 | 291 | ```sql 292 | SELECT add_continuous_aggregate_policy('your_table_daily', 293 | -- start_offset => INTERVAL '', -- uncomment if retention policy is enabled on the raw data table 294 | end_offset => INTERVAL '1 hour', 295 | schedule_interval => INTERVAL '1 hour'); 296 | ``` 297 | 298 | ## Step 6: Real-Time Aggregation (Optional) 299 | 300 | Real-time combines materialized + recent raw data at query time. Provides up-to-date results at the cost of higher query latency. 301 | 302 | More useful for fine-grained aggregates (e.g., minutely) than coarse ones (e.g., daily/monthly) since large buckets will be mostly incomplete with recent data anyway. 303 | 304 | Disabled by default in v2.13+, before that it was enabled by default. 
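To check how an existing aggregate is currently configured, a small sketch (assumes the `materialized_only` column exposed by `timescaledb_information.continuous_aggregates`):

```sql
SELECT view_name, materialized_only
FROM timescaledb_information.continuous_aggregates
WHERE view_name IN ('your_table_hourly', 'your_table_daily');
```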
305 | 306 | **Use when:** Need data newer than end_offset, up-to-minute dashboards, can tolerate higher query latency 307 | **Disable when:** Performance critical, refresh policies sufficient, high query volume, missing and stale data for recent data is acceptable 308 | 309 | **Enable for current results (higher query cost):** 310 | 311 | ```sql 312 | ALTER MATERIALIZED VIEW your_table_hourly SET (timescaledb.materialized_only = false); 313 | ``` 314 | 315 | **Disable for performance (but with stale results):** 316 | 317 | ```sql 318 | ALTER MATERIALIZED VIEW your_table_hourly SET (timescaledb.materialized_only = true); 319 | ``` 320 | 321 | ## Step 7: Compress Aggregates 322 | 323 | Rule: segment_by = ALL GROUP BY columns except time_bucket, order_by = time_bucket DESC 324 | 325 | ```sql 326 | -- Hourly 327 | ALTER MATERIALIZED VIEW your_table_hourly SET ( 328 | timescaledb.enable_columnstore, 329 | timescaledb.segmentby = 'entity_id, category', 330 | timescaledb.orderby = 'bucket DESC' 331 | ); 332 | CALL add_columnstore_policy('your_table_hourly', after => INTERVAL '3 days'); 333 | 334 | -- Daily 335 | ALTER MATERIALIZED VIEW your_table_daily SET ( 336 | timescaledb.enable_columnstore, 337 | timescaledb.segmentby = 'entity_id, category', 338 | timescaledb.orderby = 'bucket DESC' 339 | ); 340 | CALL add_columnstore_policy('your_table_daily', after => INTERVAL '7 days'); 341 | ``` 342 | 343 | ## Step 8: Aggregate Retention 344 | 345 | Aggregates are typically kept longer than raw data. 346 | IMPORTANT: Don't guess - ask user or you **MUST comment out if unknown**. 347 | 348 | ```sql 349 | -- Example - replace or comment out 350 | SELECT add_retention_policy('your_table_hourly', INTERVAL '2 years'); 351 | SELECT add_retention_policy('your_table_daily', INTERVAL '5 years'); 352 | ``` 353 | 354 | ## Step 9: Performance Indexes on Continuous Aggregates 355 | 356 | **Index strategy:** Analyze WHERE clauses in common queries → Create indexes matching filter columns + time ordering 357 | 358 | **Pattern:** `(filter_column, bucket DESC)` supports `WHERE filter_column = X AND bucket >= Y ORDER BY bucket DESC` 359 | 360 | Examples: 361 | 362 | ```sql 363 | CREATE INDEX idx_hourly_entity_bucket ON your_table_hourly (entity_id, bucket DESC); 364 | CREATE INDEX idx_hourly_category_bucket ON your_table_hourly (category, bucket DESC); 365 | ``` 366 | 367 | **Multi-column filters:** Create composite indexes for `WHERE entity_id = X AND category = Y`: 368 | 369 | ```sql 370 | CREATE INDEX idx_hourly_entity_category_bucket ON your_table_hourly (entity_id, category, bucket DESC); 371 | ``` 372 | 373 | **Important:** Only create indexes you'll actually use - each has maintenance overhead. 374 | 375 | ## Step 10: Optional Enhancements 376 | 377 | ### Space Partitioning (NOT RECOMMENDED) 378 | 379 | Only for query patterns where you ALWAYS filter by the space-partition column with expert knowledge and extensive benchmarking. STRONGLY prefer time-only partitioning. 
380 | 381 | ## Step 11: Verify Configuration 382 | 383 | ```sql 384 | -- Check hypertable 385 | SELECT * FROM timescaledb_information.hypertables 386 | WHERE hypertable_name = 'your_table_name'; 387 | 388 | -- Check compression 389 | SELECT * FROM timescaledb_information.columnstore_settings 390 | WHERE hypertable_name LIKE 'your_table_name'; 391 | 392 | -- Check aggregates 393 | SELECT * FROM timescaledb_information.continuous_aggregates; 394 | 395 | -- Check policies 396 | SELECT * FROM timescaledb_information.jobs ORDER BY job_id; 397 | 398 | -- Monitor chunk information 399 | SELECT chunk_name, table_size, compressed_heap_size, compressed_index_size 400 | FROM timescaledb_information.chunks 401 | WHERE hypertable_name = 'your_table_name'; 402 | ``` 403 | 404 | ## Performance Guidelines 405 | 406 | - **Chunk size:** Recent chunk indexes should fit in less than 25% of RAM 407 | - **Compression:** Expect 90%+ reduction (10x) with proper columnstore config 408 | - **Query optimization:** Use continuous aggregates for historical queries and dashboards 409 | - **Memory:** Run `timescaledb-tune` for self-hosting (auto-configured on cloud) 410 | 411 | ## Schema Best Practices 412 | 413 | ### Do's and Don'ts 414 | 415 | - ✅ Use `TIMESTAMPTZ` NOT `timestamp` 416 | - ✅ Use `>=` and `<` NOT `BETWEEN` for timestamps 417 | - ✅ Use `TEXT` with constraints NOT `char(n)`/`varchar(n)` 418 | - ✅ Use `snake_case` NOT `CamelCase` 419 | - ✅ Use `BIGINT GENERATED ALWAYS AS IDENTITY` NOT `SERIAL` 420 | - ✅ Use `BIGINT` for IDs by default over `INTEGER` or `SMALLINT` 421 | - ✅ Use `DOUBLE PRECISION` by default over `REAL`/`FLOAT` 422 | - ✅ Use `NUMERIC` NOT `MONEY` 423 | - ✅ Use `NOT EXISTS` NOT `NOT IN` 424 | - ✅ Use `time_bucket()` or `date_trunc()` NOT `timestamp(0)` for truncation 425 | 426 | ## API Reference (Current vs Deprecated) 427 | 428 | **Deprecated Parameters → New Parameters:** 429 | 430 | - `timescaledb.compress` → `timescaledb.enable_columnstore` 431 | - `timescaledb.compress_segmentby` → `timescaledb.segmentby` 432 | - `timescaledb.compress_orderby` → `timescaledb.orderby` 433 | 434 | **Deprecated Functions → New Functions:** 435 | 436 | - `add_compression_policy()` → `add_columnstore_policy()` 437 | - `remove_compression_policy()` → `remove_columnstore_policy()` 438 | - `compress_chunk()` → `convert_to_columnstore()` 439 | - `decompress_chunk()` → `convert_to_rowstore()` 440 | 441 | **Deprecated Views → New Views:** 442 | 443 | - `compression_settings` → `columnstore_settings` 444 | - `hypertable_compression_settings` → `hypertable_columnstore_settings` 445 | - `chunk_compression_settings` → `chunk_columnstore_settings` 446 | 447 | **Deprecated Stats Functions → New Stats Functions:** 448 | 449 | - `hypertable_compression_stats()` → `hypertable_columnstore_stats()` 450 | - `chunk_compression_stats()` → `chunk_columnstore_stats()` 451 | 452 | ## Questions to Ask User 453 | 454 | 1. What kind of data will you be storing? 455 | 2. How do you expect to use the data? 456 | 3. What queries will you run? 457 | 4. How long to keep the data? 458 | 5. 
Column types if unclear 459 | -------------------------------------------------------------------------------- /ingest/postgres_docs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from dataclasses import dataclass 3 | from dotenv import load_dotenv 4 | from bs4 import BeautifulSoup, element as BeautifulSoupElement 5 | import json 6 | from markdownify import markdownify 7 | import openai 8 | import os 9 | from pathlib import Path 10 | import psycopg 11 | from psycopg.sql import SQL, Identifier 12 | import re 13 | import shutil 14 | import subprocess 15 | import tiktoken 16 | 17 | 18 | THIS_DIR = Path(__file__).parent.resolve() 19 | 20 | load_dotenv(dotenv_path=os.path.join(THIS_DIR, "..", ".env")) 21 | 22 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 23 | 24 | POSTGRES_DIR = THIS_DIR / "postgres" 25 | SMGL_DIR = POSTGRES_DIR / "doc" / "src" / "sgml" 26 | HTML_DIR = SMGL_DIR / "html" 27 | BUILD_DIR = THIS_DIR / "build" 28 | BUILD_DIR.mkdir(exist_ok=True) 29 | MD_DIR = BUILD_DIR / "md" 30 | 31 | POSTGRES_BASE_URL = "https://www.postgresql.org/docs" 32 | 33 | ENC = tiktoken.get_encoding("cl100k_base") 34 | MAX_CHUNK_TOKENS = 7000 35 | 36 | 37 | def update_repo(): 38 | if not POSTGRES_DIR.exists(): 39 | subprocess.run( 40 | "git clone https://github.com/postgres/postgres.git postgres", 41 | shell=True, 42 | check=True, 43 | env=os.environ, 44 | text=True, 45 | ) 46 | else: 47 | subprocess.run( 48 | "git fetch", 49 | shell=True, 50 | check=True, 51 | env=os.environ, 52 | text=True, 53 | cwd=POSTGRES_DIR, 54 | ) 55 | 56 | 57 | def get_version_tag(version: int) -> str: 58 | result = subprocess.run( 59 | ["git", "tag", "-l"], capture_output=True, text=True, cwd=POSTGRES_DIR 60 | ) 61 | if result.returncode != 0: 62 | raise RuntimeError("Failed to get git tags") 63 | 64 | tags = result.stdout.splitlines() 65 | 66 | candidate_tags = [] 67 | 68 | for version_type in ["", "RC", "BETA"]: 69 | pattern = re.compile(rf"REL_{version}_{version_type}(\d+)$") 70 | for tag in tags: 71 | match = pattern.match(tag) 72 | if match: 73 | minor_version = int(match.group(1)) 74 | candidate_tags.append((minor_version, tag)) 75 | if len(candidate_tags) > 0: 76 | break 77 | 78 | if not candidate_tags: 79 | raise ValueError(f"No tags found for Postgres version {version}") 80 | 81 | candidate_tags.sort(key=lambda x: x[0], reverse=True) 82 | return candidate_tags[0][1] 83 | 84 | 85 | def checkout_tag(tag: str) -> None: 86 | print(f"checking out {tag}...") 87 | subprocess.run( 88 | f"git checkout {tag}", 89 | shell=True, 90 | check=True, 91 | env=os.environ, 92 | text=True, 93 | cwd=POSTGRES_DIR, 94 | ) 95 | 96 | 97 | def build_html() -> None: 98 | html_stamp = SMGL_DIR / "html-stamp" 99 | 100 | # make uses the presence of html-stamp to determine if it needs to 101 | # rebuild the html docs. 102 | if html_stamp.exists(): 103 | html_stamp.unlink() 104 | 105 | if HTML_DIR.exists(): 106 | shutil.rmtree(HTML_DIR) 107 | 108 | print("configuring postgres build...") 109 | environ = os.environ.copy() 110 | # Shim for macOS and icu4c installed via homebrew, where it's not linked into 111 | # /usr/local by default. 
112 | if Path("/opt/homebrew/opt/icu4c/lib/pkgconfig").exists(): 113 | environ["PKG_CONFIG_PATH"] = "/opt/homebrew/opt/icu4c/lib/pkgconfig" 114 | subprocess.run( 115 | "./configure --without-readline --without-zlib", 116 | shell=True, 117 | check=True, 118 | env=environ, 119 | text=True, 120 | cwd=POSTGRES_DIR, 121 | ) 122 | 123 | print("building postgres docs...") 124 | subprocess.run( 125 | "make html", 126 | shell=True, 127 | check=True, 128 | env=os.environ, 129 | text=True, 130 | cwd=SMGL_DIR, 131 | ) 132 | 133 | 134 | def build_markdown() -> None: 135 | print("converting to markdown...") 136 | if MD_DIR.exists(): 137 | shutil.rmtree(MD_DIR) 138 | MD_DIR.mkdir() 139 | 140 | for html_file in HTML_DIR.glob("*.html"): 141 | # Skip files which are more metadata about the docs than actual docs 142 | # that people would ask questions about. 143 | if html_file.name in [ 144 | "legalnotice.html", 145 | "appendix-obsolete.md", 146 | "appendixes.md", 147 | "biblio.html", 148 | "bookindex.html", 149 | "bug-reporting.html", 150 | "source-format.html", 151 | "error-message-reporting.html", 152 | "error-style-guide.html", 153 | "source-conventions.html", 154 | "sourcerepo.html", 155 | ] or html_file.name.startswith("docguide"): 156 | continue 157 | md_file = MD_DIR / (html_file.stem + ".md") 158 | 159 | html_content = html_file.read_text(encoding="utf-8") 160 | html_content = html_content.replace( 161 | '', "" 162 | ) 163 | 164 | soup = BeautifulSoup(html_content, "html.parser") 165 | 166 | is_refentry = bool(soup.find("div", class_="refentry")) 167 | 168 | elem = soup.find("div", attrs={"id": True}) 169 | if elem and isinstance(elem, BeautifulSoupElement.Tag): 170 | slug = str(elem["id"]).lower() + ".html" 171 | else: 172 | raise SystemError(f"No div with id found in {html_file}") 173 | 174 | title = soup.find("title") 175 | title_text = ( 176 | str(title.string).strip() 177 | if title and isinstance(title, BeautifulSoupElement.Tag) 178 | else "PostgreSQL Documentation" 179 | ) 180 | if title: 181 | title.decompose() 182 | for class_name in ["navheader", "navfooter"]: 183 | for div in soup.find_all("div", class_=class_name): 184 | div.decompose() 185 | 186 | # Don't bother including refentry in the transform as we don't chunk 187 | # them by headers anyway. 188 | if not is_refentry: 189 | # Convert h3 headings in admonitions to h4 so that we avoid 190 | # chunking them. 
191 | for class_name in [ 192 | "caution", 193 | "important", 194 | "notice", 195 | "warning", 196 | "tip", 197 | "note", 198 | ]: 199 | for div in soup.find_all("div", class_=class_name): 200 | if div is None or not isinstance(div, BeautifulSoupElement.Tag): 201 | continue 202 | h3 = div.find("h3") 203 | if h3 and isinstance(h3, BeautifulSoupElement.Tag): 204 | h3.name = "h4" 205 | 206 | md_content = markdownify(str(soup), heading_style="ATX") 207 | md_content = f"""--- 208 | title: {title_text} 209 | slug: {slug} 210 | refentry: {is_refentry} 211 | --- 212 | {md_content}""" 213 | md_file.write_text(md_content, encoding="utf-8") 214 | 215 | 216 | @dataclass 217 | class Page: 218 | id: int 219 | version: int 220 | url: str 221 | domain: str 222 | filename: str 223 | 224 | 225 | @dataclass 226 | class Chunk: 227 | idx: int 228 | header: str 229 | header_path: list[str] 230 | content: str 231 | token_count: int = 0 232 | subindex: int = 0 233 | 234 | 235 | def insert_page( 236 | conn: psycopg.Connection, 237 | page: Page, 238 | ) -> None: 239 | print("inserting page", page.filename, page.url) 240 | result = conn.execute( 241 | "insert into docs.postgres_pages_tmp (version, url, domain, filename, content_length, chunks_count) values (%s,%s,%s,%s,%s,%s) RETURNING id", 242 | [ 243 | page.version, 244 | page.url, 245 | page.domain, 246 | page.filename, 247 | 0, 248 | 0, 249 | ], 250 | ) 251 | row = result.fetchone() 252 | assert row is not None 253 | page.id = row[0] 254 | 255 | 256 | def update_page_stats( 257 | conn: psycopg.Connection, 258 | page: Page, 259 | ) -> None: 260 | conn.execute( 261 | """ 262 | update docs.postgres_pages_tmp p 263 | set 264 | content_length = coalesce(chunks_stats.total_length, 0), 265 | chunks_count = coalesce(chunks_stats.chunks_count, 0) 266 | from ( 267 | select 268 | page_id, 269 | sum(char_length(content)) as total_length, 270 | count(*) as chunks_count 271 | from docs.postgres_chunks_tmp 272 | where page_id = %s 273 | group by page_id 274 | ) as chunks_stats 275 | where p.id = chunks_stats.page_id and p.id = %s 276 | """, 277 | [page.id, page.id], 278 | ) 279 | 280 | 281 | def insert_chunk( 282 | conn: psycopg.Connection, 283 | page: Page, 284 | chunk: Chunk, 285 | ) -> None: 286 | client = openai.OpenAI(api_key=OPENAI_API_KEY) 287 | content = "" 288 | for i in range(len(chunk.header_path)): 289 | content += ( 290 | "".join(["#" for _ in range(i + 1)]) + " " + chunk.header_path[i] + "\n\n" 291 | ) 292 | content += chunk.content 293 | embedding = ( 294 | client.embeddings.create( 295 | model="text-embedding-3-small", 296 | input=chunk.content, 297 | ) 298 | .data[0] 299 | .embedding 300 | ) 301 | content = chunk.content 302 | # token_count, embedding = embed(header_path, content) 303 | print(f"header: {chunk.header}") 304 | url = page.url 305 | if len(chunk.header_path) > 1: 306 | pattern = r"\((#\S+)\)" 307 | match = re.search(pattern, chunk.header_path[-1]) 308 | if match: 309 | url += match.group(1).lower() 310 | conn.execute( 311 | "insert into docs.postgres_chunks_tmp (page_id, chunk_index, sub_chunk_index, content, metadata, embedding) values (%s,%s,%s,%s,%s,%s)", 312 | [ 313 | page.id, 314 | chunk.idx, 315 | chunk.subindex, 316 | chunk.content, 317 | json.dumps( 318 | { 319 | "header": chunk.header, 320 | "header_path": chunk.header_path, 321 | "source_url": url, 322 | "token_count": chunk.token_count, 323 | } 324 | ), 325 | embedding, 326 | ], 327 | ) 328 | 329 | 330 | def split_chunk(chunk: Chunk) -> list[Chunk]: 331 | num_subchunks = 
(chunk.token_count // MAX_CHUNK_TOKENS) + 1 332 | input_ids = ENC.encode(chunk.content) 333 | 334 | tokens_per_chunk = len(input_ids) // num_subchunks 335 | 336 | subchunks = [] 337 | subindex = 0 338 | idx = 0 339 | while idx < len(input_ids): 340 | cur_idx = min(idx + tokens_per_chunk, len(input_ids)) 341 | chunk_ids = input_ids[idx:cur_idx] 342 | if not chunk_ids: 343 | break 344 | decoded = ENC.decode(chunk_ids) 345 | if decoded: 346 | subchunks.append( 347 | Chunk( 348 | idx=chunk.idx, 349 | header=chunk.header, 350 | header_path=chunk.header_path, 351 | content=decoded, 352 | token_count=len(chunk_ids), 353 | subindex=subindex, 354 | ) 355 | ) 356 | subindex += 1 357 | if cur_idx == len(input_ids): 358 | break 359 | idx += tokens_per_chunk 360 | return subchunks 361 | 362 | 363 | def process_chunk(conn: psycopg.Connection, page: Page, chunk: Chunk) -> None: 364 | if chunk.content == "": # discard empty chunks 365 | return 366 | 367 | chunk.token_count = len(ENC.encode(chunk.content)) 368 | if chunk.token_count < 10: # discard chunks that are too tiny to be useful 369 | return 370 | 371 | chunks = [chunk] 372 | 373 | if chunk.token_count > MAX_CHUNK_TOKENS: 374 | print( 375 | f"Chunk {chunk.header} too large ({chunk.token_count} tokens), splitting..." 376 | ) 377 | chunks = split_chunk(chunk) 378 | 379 | for chunk in chunks: 380 | insert_chunk(conn, page, chunk) 381 | conn.commit() 382 | 383 | 384 | def chunk_files(conn: psycopg.Connection, version: int) -> None: 385 | conn.execute("drop table if exists docs.postgres_chunks_tmp") 386 | conn.execute("drop table if exists docs.postgres_pages_tmp") 387 | conn.execute( 388 | "create table docs.postgres_pages_tmp (like docs.postgres_pages including all excluding constraints)" 389 | ) 390 | conn.execute( 391 | "insert into docs.postgres_pages_tmp select * from docs.postgres_pages where version != %s", 392 | [version], 393 | ) 394 | conn.execute( 395 | "create table docs.postgres_chunks_tmp (like docs.postgres_chunks including all excluding constraints)" 396 | ) 397 | conn.execute( 398 | "insert into docs.postgres_chunks_tmp select c.* from docs.postgres_chunks c inner join docs.postgres_pages p on c.page_id = p.id where p.version != %s", 399 | [version], 400 | ) 401 | conn.execute( 402 | "alter table docs.postgres_chunks_tmp add foreign key (page_id) references docs.postgres_pages_tmp(id) on delete cascade" 403 | ) 404 | conn.commit() 405 | 406 | # Reset the sequences for the temp tables 407 | conn.execute( 408 | "select setval(pg_get_serial_sequence('docs.postgres_chunks_tmp', 'id'), (select max(id) from docs.postgres_chunks_tmp))" 409 | ) 410 | conn.execute( 411 | "select setval(pg_get_serial_sequence('docs.postgres_pages_tmp', 'id'), (select max(id) from docs.postgres_pages_tmp))" 412 | ) 413 | conn.commit() 414 | 415 | header_pattern = re.compile("^(#{1,3}) .+$") 416 | codeblock_pattern = re.compile("^```") 417 | 418 | section_prefix = r"^[A-Za-z0-9.]+\.\s*" 419 | chapter_prefix = r"^Chapter\s+[0-9]+\.\s*" 420 | 421 | page_count = 0 422 | 423 | for md in MD_DIR.glob("*.md"): 424 | print(f"chunking {md}...") 425 | with md.open() as f: 426 | # process the frontmatter 427 | f.readline() 428 | f.readline() # title line 429 | slug = f.readline().split(":", 1)[1].strip() 430 | refentry = f.readline().split(":", 1)[1].strip().lower() == "true" 431 | f.readline() 432 | 433 | page = Page( 434 | id=0, 435 | version=version, 436 | url=f"{POSTGRES_BASE_URL}/{version}/{slug}", 437 | domain="postgresql.org", 438 | filename=md.name, 439 | ) 440 | 
page_count += 1 441 | 442 | insert_page(conn, page) 443 | 444 | header_path = [] 445 | idx = 0 446 | chunk: Chunk | None = None 447 | in_codeblock = False 448 | while True: 449 | line = f.readline() 450 | if line == "": 451 | if chunk is not None: 452 | process_chunk(conn, page, chunk) 453 | break 454 | match = header_pattern.match(line) 455 | if match is None or in_codeblock or (refentry and chunk is not None): 456 | assert chunk is not None 457 | if codeblock_pattern.match(line): 458 | in_codeblock = not in_codeblock 459 | chunk.content += line 460 | continue 461 | header_hases = match.group(1) 462 | depth = len(header_hases) 463 | header_path = header_path[: (depth - 1)] 464 | header = line.lstrip("#").strip() 465 | header = re.sub(section_prefix, "", header).strip() 466 | header = re.sub(chapter_prefix, "", header).strip() 467 | header_path.append(header) 468 | if chunk is not None: 469 | process_chunk(conn, page, chunk) 470 | chunk = Chunk( 471 | idx=idx, 472 | header=header, 473 | header_path=header_path.copy(), 474 | content="", 475 | ) 476 | idx += 1 477 | update_page_stats(conn, page) 478 | conn.commit() 479 | 480 | with conn.cursor() as cur: 481 | cur.execute("drop table docs.postgres_chunks") 482 | cur.execute("drop table docs.postgres_pages") 483 | cur.execute("alter table docs.postgres_chunks_tmp rename to postgres_chunks") 484 | cur.execute("alter table docs.postgres_pages_tmp rename to postgres_pages") 485 | 486 | # the auto create foreign key and index names include the _tmp_ bit in their 487 | # names, so we remove them so that they match the generated names for the 488 | # renamed tables. 489 | for table in ["postgres_pages", "postgres_chunks"]: 490 | cur.execute( 491 | """ 492 | select indexname 493 | from pg_indexes 494 | where schemaname = 'docs' 495 | and tablename = %s 496 | and indexname like %s 497 | """, 498 | [table, '%_tmp_%'], 499 | ) 500 | for row in cur.fetchall(): 501 | old_index_name = row[0] 502 | new_index_name = old_index_name.replace("_tmp_", "_") 503 | cur.execute( 504 | SQL( 505 | "alter index docs.{old_index_name} rename to {new_index_name}" 506 | ).format( 507 | old_index_name=Identifier(old_index_name), 508 | new_index_name=Identifier(new_index_name), 509 | ) 510 | ) 511 | 512 | cur.execute(""" 513 | select conname 514 | from pg_constraint 515 | where conrelid = to_regclass(%s) 516 | and contype = 'f' 517 | and conname like %s 518 | """, ['docs.postgres_chunks', '%_tmp_%']) 519 | for row in cur.fetchall(): 520 | old_fk_name = row[0] 521 | new_fk_name = old_fk_name.replace("_tmp_", "_") 522 | cur.execute( 523 | SQL( 524 | "alter table docs.postgres_chunks rename constraint {old_fk_name} to {new_fk_name}" 525 | ).format( 526 | old_fk_name=Identifier(old_fk_name), 527 | new_fk_name=Identifier(new_fk_name), 528 | ) 529 | ) 530 | 531 | conn.commit() 532 | 533 | print(f"Processed {page_count} pages.") 534 | 535 | 536 | def main(): 537 | parser = argparse.ArgumentParser( 538 | description="Ingest Postgres documentation into the database." 
539 | ) 540 | parser.add_argument("version", type=int, help="Postgres version to ingest") 541 | args = parser.parse_args() 542 | version = args.version 543 | update_repo() 544 | tag = get_version_tag(version) 545 | db_uri = f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}" 546 | with psycopg.connect(db_uri) as conn: 547 | print(f"Building Postgres {version} ({tag}) documentation...") 548 | checkout_tag(tag) 549 | build_html() 550 | build_markdown() 551 | chunk_files(conn, version) 552 | 553 | 554 | if __name__ == "__main__": 555 | main() 556 | -------------------------------------------------------------------------------- /ingest/tiger_docs.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import SitemapSpider 2 | from scrapy.crawler import CrawlerProcess 3 | from scrapy.utils.project import get_project_settings 4 | from bs4 import BeautifulSoup 5 | from markdownify import markdownify as md 6 | import os 7 | import re 8 | import sys 9 | import argparse 10 | import asyncio 11 | import time 12 | from urllib.parse import urlparse, urljoin 13 | import hashlib 14 | import requests 15 | import json 16 | import psycopg 17 | from psycopg.sql import SQL, Identifier 18 | import openai 19 | import tomllib 20 | from dotenv import load_dotenv, find_dotenv 21 | from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter 22 | 23 | script_dir = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | if not os.path.exists(os.path.join(script_dir, 'build')): 26 | os.makedirs(os.path.join(script_dir, 'build')) 27 | 28 | load_dotenv(dotenv_path=os.path.join(script_dir, '..', '.env')) 29 | schema = 'docs' 30 | 31 | with open(os.path.join(script_dir, 'tiger_docs_config.toml'), 'rb') as config_fp: 32 | config = tomllib.load(config_fp) 33 | DOMAIN_SELECTORS = config['domain_selectors'] 34 | DEFAULT_SELECTORS = config['default_selectors'] 35 | 36 | 37 | def add_header_breadcrumbs_to_content(content, metadata): 38 | """Add header breadcrumbs to content - shared utility function""" 39 | breadcrumbs = [] 40 | 41 | # Find the deepest header level present in metadata 42 | present_headers = [] 43 | for level in ['Header 1', 'Header 2', 'Header 3']: 44 | if level in metadata: 45 | present_headers.append(level) 46 | 47 | # Add all headers except the last one (to avoid duplication with chunk content) 48 | for level in present_headers[:-1]: 49 | header_level = level.split()[-1] # Get "1", "2", "3" 50 | header_prefix = '#' * int(header_level) 51 | breadcrumbs.append(f"{header_prefix} {metadata[level]}") 52 | 53 | # Combine breadcrumbs with chunk content 54 | if breadcrumbs: 55 | breadcrumb_text = '\n'.join(breadcrumbs) + '\n\n' 56 | return breadcrumb_text + content 57 | else: 58 | return content 59 | 60 | class DatabaseManager: 61 | """Handles PostgreSQL database interactions for storing scraped content""" 62 | 63 | def __init__(self, database_uri, embedding_model=None): 64 | self.database_uri = database_uri 65 | self.embedding_model = embedding_model 66 | self.finalize_queries: list[SQL] = [] 67 | 68 | try: 69 | self.connection = psycopg.connect(self.database_uri) 70 | except Exception as e: 71 | raise RuntimeError(f"Database connection failed: {e}") 72 | 73 | def initialize(self): 74 | with self.connection.cursor() as cursor: 75 | cursor.execute(SQL("DROP TABLE IF EXISTS {schema}.timescale_chunks_tmp").format(schema=Identifier(schema))) 76 | 
cursor.execute(SQL("DROP TABLE IF EXISTS {schema}.timescale_pages_tmp").format(schema=Identifier(schema))) 77 | cursor.execute(SQL("CREATE TABLE {schema}.timescale_pages_tmp (LIKE {schema}.timescale_pages INCLUDING ALL EXCLUDING CONSTRAINTS)").format(schema=Identifier(schema))) 78 | cursor.execute(SQL("CREATE TABLE {schema}.timescale_chunks_tmp (LIKE {schema}.timescale_chunks INCLUDING ALL EXCLUDING CONSTRAINTS)").format(schema=Identifier(schema))) 79 | cursor.execute(SQL("ALTER TABLE {schema}.timescale_chunks_tmp ADD FOREIGN KEY (page_id) REFERENCES {schema}.timescale_pages_tmp(id) ON DELETE CASCADE").format(schema=Identifier(schema))) 80 | 81 | # The bm25 indexes have a bug that prevent inserting data into a table 82 | # underneath non-public schemas that has them, so we need to make remove 83 | # them from the tmp tables and recreate them after renaming. 84 | cursor.execute( 85 | """ 86 | SELECT indexname, indexdef 87 | FROM pg_indexes 88 | WHERE schemaname = %s 89 | AND tablename LIKE %s 90 | AND indexdef LIKE %s 91 | """, 92 | ["docs", "timescale%_tmp%", "%bm25%"], 93 | ) 94 | rows = cursor.fetchall() 95 | for row in rows: 96 | index_name = row[0] 97 | index_def = row[1] 98 | tmp_index_def = index_def.replace("_tmp", "") 99 | cursor.execute( 100 | SQL("DROP INDEX IF EXISTS {schema}.{index_name}").format( 101 | schema=Identifier(schema), 102 | index_name=Identifier(index_name), 103 | ) 104 | ) 105 | self.finalize_queries.append(SQL(tmp_index_def)) 106 | self.connection.commit() 107 | 108 | def finalize(self): 109 | """Rename the temporary tables and their indexes to the final names, dropping the old tables if they exist""" 110 | with self.connection.cursor() as cursor: 111 | cursor.execute(SQL("DROP TABLE IF EXISTS {schema}.timescale_chunks").format(schema=Identifier(schema))) 112 | cursor.execute(SQL("DROP TABLE IF EXISTS {schema}.timescale_pages").format(schema=Identifier(schema))) 113 | cursor.execute(SQL("ALTER TABLE {schema}.timescale_chunks_tmp RENAME TO timescale_chunks").format(schema=Identifier(schema))) 114 | cursor.execute(SQL("ALTER TABLE {schema}.timescale_pages_tmp RENAME TO timescale_pages").format(schema=Identifier(schema))) 115 | 116 | # the auto create foreign key and index names include the _tmp_ bit in their 117 | # names, so we remove them so that they match the generated names for the 118 | # renamed tables. 
119 | for table in ["timescale_pages", "timescale_chunks"]: 120 | cursor.execute( 121 | """ 122 | select indexname 123 | from pg_indexes 124 | where schemaname = %s 125 | and tablename = %s 126 | and indexname like %s 127 | """, 128 | [schema, table, '%_tmp_%'], 129 | ) 130 | for row in cursor.fetchall(): 131 | old_index_name = row[0] 132 | new_index_name = old_index_name.replace("_tmp_", "_") 133 | cursor.execute( 134 | SQL( 135 | "alter index {schema}.{old_index_name} rename to {new_index_name}" 136 | ).format( 137 | schema=Identifier(schema), 138 | old_index_name=Identifier(old_index_name), 139 | new_index_name=Identifier(new_index_name), 140 | ) 141 | ) 142 | 143 | cursor.execute( 144 | SQL(""" 145 | select conname 146 | from pg_constraint 147 | where conrelid = to_regclass(%s) 148 | and contype = 'f' 149 | and conname like %s 150 | """).format(schema=Identifier(schema)), 151 | [f"{schema}.timescale_chunks", '%_tmp_%'], 152 | ) 153 | for row in cursor.fetchall(): 154 | old_fk_name = row[0] 155 | new_fk_name = old_fk_name.replace("_tmp_", "_") 156 | cursor.execute( 157 | SQL( 158 | "alter table {schema}.timescale_chunks rename constraint {old_fk_name} to {new_fk_name}" 159 | ).format( 160 | schema=Identifier(schema), 161 | old_fk_name=Identifier(old_fk_name), 162 | new_fk_name=Identifier(new_fk_name), 163 | ) 164 | ) 165 | 166 | for query in self.finalize_queries: 167 | cursor.execute(query) 168 | 169 | self.connection.commit() 170 | 171 | def save_page(self, url, domain, filename, content_length, chunking_method='header'): 172 | """Save page information and return the page ID""" 173 | try: 174 | with ( 175 | self.connection.cursor() as cursor, 176 | self.connection.transaction() as _, 177 | ): 178 | cursor.execute(SQL(""" 179 | INSERT INTO {schema}.timescale_pages_tmp (url, domain, filename, content_length, chunking_method) 180 | VALUES (%s, %s, %s, %s, %s) 181 | ON CONFLICT (url) DO UPDATE SET 182 | content_length = EXCLUDED.content_length, 183 | chunking_method = EXCLUDED.chunking_method, 184 | scraped_at = CURRENT_TIMESTAMP 185 | RETURNING id 186 | """).format(schema=Identifier(schema)), (url, domain, filename, content_length, chunking_method)) 187 | 188 | page_id = cursor.fetchone()[0] 189 | 190 | # Delete existing chunks for this page (in case of re-scraping) 191 | cursor.execute(SQL("DELETE FROM {schema}.timescale_chunks WHERE page_id = %s").format(schema=Identifier(schema)), (page_id,)) 192 | 193 | return page_id 194 | 195 | except Exception as e: 196 | raise RuntimeError(f"Failed to save page {url}: {e}") 197 | 198 | def generate_embeddings_batch(self, texts): 199 | """Generate embeddings for a batch of texts using the configured embedding model""" 200 | if self.embedding_model is None: 201 | return [None] * len(texts) 202 | 203 | try: 204 | # Clean texts for embedding 205 | clean_texts = [] 206 | for text in texts: 207 | clean_text = text.strip() if text else "" 208 | clean_texts.append(clean_text) 209 | 210 | # Generate embeddings in batch using the model 211 | embeddings = self.embedding_model.get_text_embeddings(clean_texts) 212 | return embeddings 213 | 214 | except Exception as e: 215 | print(f"Warning: Failed to generate batch embeddings: {e}") 216 | return [None] * len(texts) 217 | 218 | def save_chunks(self, page_id, chunks): 219 | """Save chunks for a page with batch embedding generation""" 220 | try: 221 | # Prepare content with breadcrumbs for all chunks 222 | processed_chunks = [] 223 | chunk_texts = [] 224 | 225 | for chunk in chunks: 226 | 
content_with_breadcrumbs = add_header_breadcrumbs_to_content( 227 | chunk['content'], 228 | chunk['metadata'] 229 | ) 230 | processed_chunks.append({ 231 | 'content': content_with_breadcrumbs, 232 | 'metadata': chunk['metadata'] 233 | }) 234 | chunk_texts.append(content_with_breadcrumbs) 235 | 236 | # Generate embeddings for all chunks in batch 237 | embeddings = self.generate_embeddings_batch(chunk_texts) 238 | 239 | with ( 240 | self.connection.cursor() as cursor, 241 | self.connection.transaction() as _, 242 | ): 243 | for chunk, embedding in zip(processed_chunks, embeddings): 244 | cursor.execute(SQL(""" 245 | INSERT INTO {schema}.timescale_chunks_tmp (page_id, chunk_index, sub_chunk_index, content, metadata, embedding) 246 | VALUES (%s, %s, %s, %s, %s, %s) 247 | """).format(schema=Identifier(schema)), ( 248 | page_id, 249 | chunk['metadata'].get('chunk_index', 0), 250 | chunk['metadata'].get('sub_chunk_index', 0), 251 | chunk['content'], 252 | json.dumps(chunk['metadata']), 253 | embedding 254 | )) 255 | 256 | # Update chunks count in pages table 257 | cursor.execute(SQL(""" 258 | UPDATE {schema}.timescale_pages_tmp 259 | SET chunks_count = %s 260 | WHERE id = %s 261 | """).format(schema=Identifier(schema)), (len(chunks), page_id)) 262 | 263 | except Exception as e: 264 | raise RuntimeError(f"Failed to save chunks for page {page_id}: {e}") 265 | 266 | def get_scraped_page_count(self): 267 | """Get the number of pages scraped into the temporary tables""" 268 | with self.connection.cursor() as cursor: 269 | cursor.execute(SQL("SELECT COUNT(*) FROM {schema}.timescale_pages_tmp").format(schema=Identifier(schema))) 270 | return cursor.fetchone()[0] 271 | 272 | def close(self): 273 | """Close database connection""" 274 | if self.connection: 275 | self.connection.close() 276 | 277 | class FileManager: 278 | """Handles file-based storage for scraped content""" 279 | 280 | def __init__(self, output_dir='scraped_docs'): 281 | self.output_dir = output_dir 282 | # Create output directory if it doesn't exist 283 | os.makedirs(self.output_dir, exist_ok=True) 284 | 285 | def save_chunked_content(self, url, filename, chunks): 286 | """Save chunked content to a markdown file with delimiters""" 287 | filepath = os.path.join(self.output_dir, filename) 288 | 289 | # Create markdown with chunk delimiters 290 | chunked_markdown = f"# Source: {url}\n\n" 291 | chunked_markdown += f"\n\n" 292 | 293 | for i, chunk in enumerate(chunks): 294 | # Add chunk delimiter 295 | chunked_markdown += f"---\n\n" 296 | 297 | # Add metadata as comments 298 | if chunk['metadata']: 299 | chunked_markdown += f"\n" 300 | 301 | chunked_markdown += "---\n\n" 302 | 303 | # Add header breadcrumbs and content 304 | content_with_breadcrumbs = add_header_breadcrumbs_to_content( 305 | chunk['content'], 306 | chunk['metadata'] 307 | ) 308 | chunked_markdown += content_with_breadcrumbs 309 | chunked_markdown += "\n\n" 310 | 311 | with open(filepath, 'w', encoding='utf-8') as f: 312 | f.write(chunked_markdown) 313 | 314 | return filepath 315 | 316 | def save_regular_content(self, url, filename, content): 317 | """Save regular markdown content to a file""" 318 | filepath = os.path.join(self.output_dir, filename) 319 | 320 | with open(filepath, 'w', encoding='utf-8') as f: 321 | f.write(f"# Source: {url}\n\n") 322 | f.write(content) 323 | 324 | return filepath 325 | 326 | class SitemapMarkdownSpider(SitemapSpider): 327 | name = 'sitemap_markdown' 328 | 329 | def __init__(self, domain=None, output_dir='scraped_docs', max_pages=None, 
strip_data_images=True, chunk_content=True, chunking_method='header', db_manager=None, file_manager=None, url_prefix=None, *args, **kwargs): 330 | super(SitemapMarkdownSpider, self).__init__(*args, **kwargs) 331 | 332 | if not domain: 333 | raise ValueError("domain parameter is required") 334 | 335 | self.domain = domain 336 | self.output_dir = output_dir 337 | self.max_pages = int(max_pages) if max_pages else None 338 | self.should_strip_data_images = strip_data_images if isinstance(strip_data_images, bool) else strip_data_images.lower() == 'true' 339 | self.should_chunk_content = chunk_content if isinstance(chunk_content, bool) else chunk_content.lower() == 'true' 340 | self.chunking_method = chunking_method # 'header' or 'semantic' 341 | self.allowed_domains = [domain] 342 | self.url_prefix = url_prefix # e.g., '/docs' to only scrape URLs under that path 343 | 344 | # Use passed-in storage managers 345 | self.db_manager = db_manager 346 | self.file_manager = file_manager 347 | 348 | # Get sitemap URLs from robots.txt or fallback to default 349 | self.sitemap_urls = self.get_sitemap_urls(domain) 350 | 351 | # Track processed URLs to avoid duplicates 352 | self.processed_urls = set() 353 | # Track number of pages processed 354 | self.pages_processed = 0 355 | 356 | # Configure domain-specific element removal 357 | self.ignore_selectors = self.get_ignore_selectors(domain) 358 | 359 | def _init_default_embedding_model(self): 360 | """Initialize OpenAI embedding model for database storage""" 361 | try: 362 | if not os.getenv('OPENAI_API_KEY'): 363 | raise ValueError("OPENAI_API_KEY environment variable is required for database storage with embeddings") 364 | 365 | self.logger.info("Initializing OpenAI embedding client") 366 | client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY')) 367 | 368 | # Create a simple wrapper class for the OpenAI client 369 | class OpenAIEmbeddingWrapper: 370 | def __init__(self, client): 371 | self.client = client 372 | self.model = "text-embedding-3-small" 373 | 374 | def get_text_embeddings(self, texts): 375 | """Generate embeddings for a batch of texts""" 376 | response = self.client.embeddings.create( 377 | input=texts, 378 | model=self.model 379 | ) 380 | return [embedding.embedding for embedding in response.data] 381 | 382 | return OpenAIEmbeddingWrapper(client) 383 | 384 | except Exception as e: 385 | raise RuntimeError(f"Failed to initialize OpenAI embeddings: {e}") 386 | 387 | def get_sitemap_urls(self, domain): 388 | """Get sitemap URLs from robots.txt, fallback to common locations""" 389 | sitemap_urls = [] 390 | 391 | # Try to get sitemaps from robots.txt 392 | robots_url = f'https://{domain}/robots.txt' 393 | try: 394 | self.logger.info(f'Checking robots.txt at: {robots_url}') 395 | response = requests.get(robots_url, timeout=10) 396 | response.raise_for_status() 397 | 398 | # Parse robots.txt for sitemap entries 399 | for line in response.text.split('\n'): 400 | line = line.strip() 401 | if line.lower().startswith('sitemap:'): 402 | sitemap_url = line.split(':', 1)[1].strip() 403 | # Handle relative URLs 404 | if not sitemap_url.startswith('http'): 405 | sitemap_url = urljoin(f'https://{domain}/', sitemap_url) 406 | # Filter to only include docs sitemaps if url_prefix is set 407 | if self.url_prefix: 408 | if self.url_prefix in sitemap_url: 409 | sitemap_urls.append(sitemap_url) 410 | self.logger.info(f'Found docs sitemap in robots.txt: {sitemap_url}') 411 | else: 412 | sitemap_urls.append(sitemap_url) 413 | self.logger.info(f'Found sitemap in 
robots.txt: {sitemap_url}') 414 | 415 | except Exception as e: 416 | self.logger.warning(f'Could not fetch robots.txt from {robots_url}: {e}') 417 | 418 | # If no sitemaps found in robots.txt, try common locations 419 | if not sitemap_urls: 420 | common_sitemap_locations = [ 421 | f'https://{domain}/sitemap.xml', 422 | f'https://{domain}/sitemap_index.xml', 423 | f'https://{domain}/sitemap.txt' 424 | ] 425 | # If url_prefix is set, also try prefix-specific sitemaps 426 | if self.url_prefix: 427 | common_sitemap_locations = [ 428 | f'https://{domain}{self.url_prefix}/sitemap.xml', 429 | f'https://{domain}{self.url_prefix}/sitemap-0.xml', 430 | ] + common_sitemap_locations 431 | 432 | for sitemap_url in common_sitemap_locations: 433 | try: 434 | self.logger.info(f'Trying common sitemap location: {sitemap_url}') 435 | response = requests.head(sitemap_url, timeout=10) 436 | if response.status_code == 200: 437 | sitemap_urls.append(sitemap_url) 438 | self.logger.info(f'Found sitemap at: {sitemap_url}') 439 | break 440 | except Exception as e: 441 | self.logger.debug(f'Sitemap not found at {sitemap_url}: {e}') 442 | 443 | # If still no sitemap found, return empty list and let Scrapy handle the error 444 | if not sitemap_urls: 445 | self.logger.error(f'No sitemap found for domain: {domain}') 446 | 447 | return sitemap_urls 448 | 449 | def get_ignore_selectors(self, domain): 450 | """Get CSS selectors to ignore for specific domains""" 451 | # Get domain-specific selectors, fallback to default 452 | selectors = DOMAIN_SELECTORS.get(domain, DEFAULT_SELECTORS.copy()) 453 | 454 | # Also check for subdomain matches (e.g., subdomain.readthedocs.io) 455 | if selectors == DEFAULT_SELECTORS: 456 | for known_domain, known_selectors in DOMAIN_SELECTORS.items(): 457 | if known_domain in domain: 458 | selectors = known_selectors.copy() 459 | break 460 | 461 | self.logger.info(f'Using ignore selectors for {domain}: {selectors}') 462 | return selectors 463 | 464 | def strip_data_images(self, soup): 465 | """Remove elements with data: src attributes""" 466 | data_images_removed = 0 467 | 468 | # Only remove img tags with data: src 469 | for img in soup.find_all('img', src=True): 470 | if img['src'].startswith('data:'): 471 | img.decompose() 472 | data_images_removed += 1 473 | 474 | if data_images_removed > 0: 475 | self.logger.debug(f'Removed {data_images_removed} data: images') 476 | 477 | return soup 478 | 479 | def convert_callouts_to_admonitions(self, soup): 480 | """Convert div.callout elements with h6 to admonition-style markdown callouts""" 481 | callouts_converted = 0 482 | 483 | # Map of h6 text to admonition types 484 | admonition_map = { 485 | 'warning': ':warning:', 486 | 'note': ':information_source:', 487 | 'tip': ':bulb:', 488 | 'important': ':exclamation:', 489 | 'caution': ':warning:', 490 | 'danger': ':no_entry:', 491 | 'info': ':information_source:', 492 | 'example': ':memo:', 493 | 'see also': ':point_right:', 494 | } 495 | 496 | for callout_div in soup.find_all('div', class_='callout'): 497 | h6 = callout_div.find('h6') 498 | if not h6: 499 | continue 500 | 501 | h6_text = h6.get_text().strip().lower() 502 | 503 | # Find matching admonition type 504 | admonition_icon = None 505 | for keyword, icon in admonition_map.items(): 506 | if keyword in h6_text: 507 | admonition_icon = icon 508 | break 509 | 510 | # Default to info if no match 511 | if not admonition_icon: 512 | admonition_icon = ':information_source:' 513 | 514 | # Create blockquote with icon and h6 text 515 | blockquote = 
soup.new_tag('blockquote') 516 | 517 | # Add the h6 text with icon as first paragraph 518 | header_p = soup.new_tag('p') 519 | header_p.string = f"{admonition_icon} {h6.get_text().strip()}" 520 | blockquote.append(header_p) 521 | 522 | # Remove the h6 from callout div 523 | h6.decompose() 524 | 525 | # Move all remaining content from callout div to blockquote 526 | for child in list(callout_div.children): 527 | if child.name: # Skip text nodes 528 | blockquote.append(child.extract()) 529 | 530 | # Replace callout div with blockquote 531 | callout_div.replace_with(blockquote) 532 | callouts_converted += 1 533 | 534 | if callouts_converted > 0: 535 | self.logger.debug(f'Converted {callouts_converted} callout divs to admonitions') 536 | 537 | return soup 538 | 539 | def clean_code_blocks(self, soup): 540 | """Clean up code block HTML structure before markdown conversion""" 541 | code_blocks_cleaned = 0 542 | 543 | # Find code blocks with token-line structure 544 | for code_container in soup.find_all(['pre', 'code']): 545 | token_lines = code_container.find_all('div', class_='token-line') 546 | 547 | if token_lines: 548 | # Extract text from each token line and join with newlines 549 | lines = [] 550 | for line_div in token_lines: 551 | # Get text content from line-content span or the div itself 552 | line_content = line_div.find(attrs={'data-line_content': 'true'}) 553 | if line_content: 554 | lines.append(line_content.get_text()) 555 | else: 556 | lines.append(line_div.get_text()) 557 | 558 | # Replace the complex structure with simple text 559 | code_container.clear() 560 | code_container.string = '\n'.join(lines) 561 | code_blocks_cleaned += 1 562 | 563 | if code_blocks_cleaned > 0: 564 | self.logger.debug(f'Cleaned {code_blocks_cleaned} code blocks') 565 | 566 | return soup 567 | 568 | def extract_anchor_links(self, text): 569 | """Extract markdown anchor links from text (only internal #anchors)""" 570 | import re 571 | 572 | # Pattern to match markdown links that are internal anchors: [text](#anchor) 573 | anchor_pattern = r'\[([^\]]+)\]\(#([^)]+)\)' 574 | 575 | anchors = [] 576 | for match in re.finditer(anchor_pattern, text): 577 | link_text = match.group(1) 578 | anchor_id = match.group(2) 579 | 580 | anchors.append({ 581 | 'text': link_text, 582 | 'anchor': anchor_id 583 | }) 584 | 585 | return anchors 586 | 587 | 588 | def semantic_chunk_with_openai(self, markdown_text, url): 589 | """Use OpenAI to identify semantic boundaries for chunking using split identifiers""" 590 | try: 591 | # Initialize OpenAI client 592 | client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY')) 593 | 594 | # Split text into lines for LLM processing 595 | lines = markdown_text.split('\n') 596 | small_chunks = [line for line in lines if line.strip()] # Filter out empty lines 597 | 598 | # Add chunk identifiers 599 | chunked_input = '' 600 | for i, chunk in enumerate(small_chunks): 601 | chunked_input += f"<|start_chunk_{i+1}|>{chunk}<|end_chunk_{i+1}|>" 602 | 603 | # Create prompt for semantic boundary identification 604 | system_prompt = """You are an assistant specialized in splitting text into thematically consistent sections. 605 | The text has been divided into chunks, each marked with <|start_chunk_X|> and <|end_chunk_X|> tags, where X is the chunk number. 606 | Your task is to identify the points where splits should occur, such that consecutive chunks of similar themes stay together. 
607 | 608 | Focus on: 609 | - Topic changes or conceptual shifts 610 | - Natural reading breaks that maintain context 611 | - Keeping related examples, tables, code blocks, and explanations together 612 | - Ensuring each chunk contains complete thoughts/concepts 613 | - Prefer to split at markdown headers 614 | 615 | Respond with a list of chunk IDs where you believe a split should be made. For example, if chunks 1 and 2 belong together but chunk 3 starts a new topic, you would suggest a split after chunk 2. THE CHUNKS MUST BE IN ASCENDING ORDER. 616 | Your response should be in the form: 'split_after: 2, 5, 8'.""" 617 | 618 | user_prompt = f"""CHUNKED_TEXT: {chunked_input} 619 | 620 | Respond only with the IDs of the chunks where you believe a split should occur. YOU MUST RESPOND WITH AT LEAST ONE SPLIT. THESE SPLITS MUST BE IN ASCENDING ORDER.""" 621 | 622 | # Call OpenAI API 623 | response = client.chat.completions.create( 624 | model="gpt-4o", # Use cost-effective model 625 | messages=[ 626 | {"role": "system", "content": system_prompt}, 627 | {"role": "user", "content": user_prompt} 628 | ], 629 | temperature=0.1, # Low temperature for consistent results 630 | max_tokens=300 631 | ) 632 | 633 | # Parse response to get split positions 634 | result_string = response.choices[0].message.content.strip() 635 | 636 | # Extract numbers from response 637 | try: 638 | # Find the line containing split_after 639 | split_after_lines = [line for line in result_string.split('\n') if 'split_after:' in line] 640 | if not split_after_lines: 641 | # Fallback: extract all numbers from response 642 | numbers = re.findall(r'\d+', result_string) 643 | else: 644 | numbers = re.findall(r'\d+', split_after_lines[0]) 645 | 646 | split_indices = list(map(int, numbers)) 647 | 648 | # Validate that numbers are in ascending order 649 | if split_indices != sorted(split_indices): 650 | raise ValueError(f"Split indices not in ascending order for {url}: {split_indices}") 651 | 652 | except Exception as e: 653 | raise ValueError(f"Could not parse OpenAI response for {url}: {e}") 654 | 655 | # Convert chunk IDs to split indices (0-based) 656 | chunks_to_split_after = [i - 1 for i in split_indices if i > 0 and i <= len(small_chunks)] 657 | 658 | # Create final chunks by combining lines based on split points 659 | final_chunks = [] 660 | current_chunk_lines = [] 661 | 662 | for i, line in enumerate(small_chunks): 663 | current_chunk_lines.append(line) 664 | if i in chunks_to_split_after or i == len(small_chunks) - 1: 665 | if current_chunk_lines: 666 | # Join lines back with newlines 667 | chunk_content = '\n'.join(current_chunk_lines) 668 | 669 | # Extract anchor links from chunk content 670 | content_anchors = self.extract_anchor_links(chunk_content) 671 | 672 | # Create metadata 673 | chunk_metadata = { 674 | 'source_url': url, 675 | 'chunk_index': len(final_chunks), 676 | 'sub_chunk_index': 0, 677 | 'chunking_method': 'semantic_openai', 678 | 'line_range': f"{i - len(current_chunk_lines) + 1}-{i}" 679 | } 680 | 681 | # Add anchor information to metadata 682 | if content_anchors: 683 | chunk_metadata['anchor_links'] = content_anchors 684 | chunk_metadata['anchor_count'] = len(content_anchors) 685 | chunk_metadata['anchor_ids'] = [a['anchor'] for a in content_anchors] 686 | 687 | final_chunks.append({ 688 | 'content': chunk_content, 689 | 'metadata': chunk_metadata 690 | }) 691 | current_chunk_lines = [] 692 | 693 | self.logger.debug(f'Created {len(final_chunks)} semantic chunks using OpenAI from {len(small_chunks)} 
lines') 694 | return final_chunks 695 | 696 | except Exception as e: 697 | raise RuntimeError(f"OpenAI semantic chunking failed for {url}: {e}") 698 | 699 | def chunk_markdown_content_header_based(self, markdown_text, url): 700 | """Original header-based chunking method""" 701 | chunks = [] 702 | 703 | # Define headers to split on (up to h3) 704 | headers_to_split_on = [ 705 | ("#", "Header 1"), 706 | ("##", "Header 2"), 707 | ("###", "Header 3"), 708 | ] 709 | 710 | # First pass: split by markdown headers 711 | markdown_splitter = MarkdownHeaderTextSplitter( 712 | headers_to_split_on=headers_to_split_on, 713 | strip_headers=False # Keep headers in the chunks 714 | ) 715 | 716 | header_splits = markdown_splitter.split_text(markdown_text) 717 | 718 | # Second pass: recursive character splitting for large chunks 719 | text_splitter = RecursiveCharacterTextSplitter( 720 | chunk_size=2000, 721 | chunk_overlap=200, 722 | length_function=len, 723 | separators=["```", "\n\n", "\n", " ", ""] 724 | ) 725 | 726 | for i, doc in enumerate(header_splits): 727 | # Get the header metadata 728 | metadata = doc.metadata.copy() if hasattr(doc, 'metadata') else {} 729 | metadata['source_url'] = url 730 | metadata['chunk_index'] = i 731 | metadata['chunking_method'] = 'header_based' 732 | 733 | # Extract anchor links from headers (breadcrumb context) 734 | header_anchors = [] 735 | for level in ['Header 1', 'Header 2', 'Header 3']: 736 | if level in metadata: 737 | header_anchors.extend(self.extract_anchor_links(metadata[level])) 738 | 739 | # Split large chunks further 740 | sub_chunks = text_splitter.split_text(doc.page_content) 741 | 742 | for j, chunk_text in enumerate(sub_chunks): 743 | chunk_metadata = metadata.copy() 744 | chunk_metadata['sub_chunk_index'] = j 745 | 746 | # Extract anchor links from chunk content 747 | content_anchors = self.extract_anchor_links(chunk_text) 748 | 749 | # Combine header and content anchors, removing duplicates 750 | all_anchors = header_anchors + content_anchors 751 | unique_anchors = [] 752 | seen_anchors = set() 753 | for anchor in all_anchors: 754 | anchor_key = (anchor['text'], anchor['anchor']) 755 | if anchor_key not in seen_anchors: 756 | unique_anchors.append(anchor) 757 | seen_anchors.add(anchor_key) 758 | 759 | # Add anchor information to metadata 760 | if unique_anchors: 761 | chunk_metadata['anchor_links'] = unique_anchors 762 | chunk_metadata['anchor_count'] = len(unique_anchors) 763 | # Also create a simple list of anchor IDs for easier searching 764 | chunk_metadata['anchor_ids'] = [a['anchor'] for a in unique_anchors] 765 | 766 | chunks.append({ 767 | 'content': chunk_text, 768 | 'metadata': chunk_metadata 769 | }) 770 | 771 | self.logger.debug(f'Created {len(chunks)} chunks using header-based method') 772 | return chunks 773 | 774 | def chunk_markdown_content(self, markdown_text, url): 775 | """Route to appropriate chunking method based on configuration""" 776 | if self.chunking_method == 'semantic': 777 | return self.semantic_chunk_with_openai(markdown_text, url) 778 | else: # Default to header-based 779 | return self.chunk_markdown_content_header_based(markdown_text, url) 780 | 781 | def sitemap_filter(self, entries): 782 | """Filter sitemap entries to only include HTML pages under the url_prefix""" 783 | for entry in entries: 784 | # Only process HTML pages, skip images, PDFs, etc. 
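# Entries are then further filtered by url_prefix when it is set (default
# '/docs' via SCRAPER_URL_PREFIX), e.g. https://www.tigerdata.com/docs/...
# entries are yielded while https://www.tigerdata.com/blog/... entries are skipped.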
785 | if any(ext in entry['loc'] for ext in ['.pdf', '.jpg', '.png', '.gif', '.css', '.js', '.xml']): 786 | continue 787 | # If url_prefix is set, only include URLs that match the prefix 788 | if self.url_prefix: 789 | parsed = urlparse(entry['loc']) 790 | if not parsed.path.startswith(self.url_prefix): 791 | continue 792 | yield entry 793 | 794 | def parse(self, response): 795 | """Parse each page from the sitemap""" 796 | url = response.url 797 | 798 | # Skip if already processed 799 | if url in self.processed_urls: 800 | return 801 | 802 | # Check if we've reached the maximum number of pages 803 | if self.max_pages and self.pages_processed >= self.max_pages: 804 | self.logger.info(f'Reached maximum pages limit ({self.max_pages}), stopping crawler') 805 | self.crawler.engine.close_spider(self, 'max_pages_reached') 806 | return 807 | 808 | self.processed_urls.add(url) 809 | self.pages_processed += 1 810 | 811 | # Log the URL being processed 812 | self.logger.info(f'Processing: {url}') 813 | 814 | try: 815 | # Parse HTML with BeautifulSoup 816 | soup = BeautifulSoup(response.body, 'html.parser') 817 | 818 | # Remove elements based on configured selectors 819 | for selector in self.ignore_selectors: 820 | elements = soup.select(selector) 821 | for element in elements: 822 | element.decompose() 823 | if elements: 824 | self.logger.debug(f'Removed {len(elements)} elements matching: {selector}') 825 | 826 | # Strip data: images if requested 827 | if self.should_strip_data_images: 828 | soup = self.strip_data_images(soup) 829 | 830 | # Convert callout divs to admonitions 831 | soup = self.convert_callouts_to_admonitions(soup) 832 | 833 | # Clean up code block structure 834 | soup = self.clean_code_blocks(soup) 835 | 836 | # Find main content 837 | main_content = soup.find("main") or soup 838 | html_content = str(main_content) 839 | 840 | # Convert to markdown 841 | markdown_output = md(html_content, heading_style="ATX") 842 | 843 | # Generate filename from URL 844 | filename = self.generate_filename(url) 845 | filepath = os.path.join(self.output_dir, filename) 846 | 847 | if self.should_chunk_content: 848 | # Chunk the content 849 | chunks = self.chunk_markdown_content(markdown_output, url) 850 | 851 | if self.db_manager is not None: 852 | # Save to database 853 | page_id = self.db_manager.save_page( 854 | url=url, 855 | domain=self.domain, 856 | filename=filename, 857 | content_length=len(markdown_output), 858 | chunking_method=self.chunking_method 859 | ) 860 | 861 | self.logger.info(f'Generating embeddings for {len(chunks)} chunks from: {url}') 862 | self.db_manager.save_chunks(page_id, chunks) 863 | 864 | self.logger.info(f'Saved {len(chunks)} chunks with embeddings to database: {url}') 865 | 866 | if self.file_manager is not None: 867 | # Save to file 868 | filepath = self.file_manager.save_chunked_content(url, filename, chunks) 869 | self.logger.info(f'Saved {len(chunks)} chunks: {filepath}') 870 | 871 | return { 872 | 'url': url, 873 | 'filename': filename, 874 | 'content_length': len(markdown_output), 875 | 'chunks_count': len(chunks) 876 | } 877 | else: 878 | if self.db_manager is not None: 879 | # Save to database without chunking 880 | page_id = self.db_manager.save_page( 881 | url=url, 882 | domain=self.domain, 883 | filename=filename, 884 | content_length=len(markdown_output), 885 | chunking_method='none' 886 | ) 887 | # Save entire content as single chunk 888 | single_chunk = [{ 889 | 'content': markdown_output, 890 | 'metadata': { 891 | 'source_url': url, 892 | 'chunk_index': 0, 
893 | 'sub_chunk_index': 0, 894 | 'chunking_method': 'none' 895 | } 896 | }] 897 | self.db_manager.save_chunks(page_id, single_chunk) 898 | 899 | self.logger.info(f'Saved to database: {url}') 900 | 901 | if self.file_manager is not None: 902 | # Save to file 903 | filepath = self.file_manager.save_regular_content(url, filename, markdown_output) 904 | self.logger.info(f'Saved: {filepath}') 905 | 906 | return { 907 | 'url': url, 908 | 'filename': filename, 909 | 'content_length': len(markdown_output) 910 | } 911 | 912 | except Exception as e: 913 | self.logger.error(f'Error processing {url}: {str(e)}') 914 | return None 915 | 916 | def generate_filename(self, url): 917 | """Generate a safe filename from URL""" 918 | parsed = urlparse(url) 919 | path = parsed.path 920 | 921 | # Remove leading/trailing slashes and replace path separators 922 | path = path.strip('/') 923 | if not path: 924 | path = 'index' 925 | 926 | # Replace problematic characters 927 | safe_path = re.sub(r'[^\w\-_/]', '_', path) 928 | safe_path = re.sub(r'_+', '_', safe_path) # Replace multiple underscores 929 | safe_path = safe_path.replace('/', '_') 930 | 931 | # Ensure filename isn't too long 932 | if len(safe_path) > 100: 933 | # Create hash of original path and truncate 934 | hash_suffix = hashlib.md5(path.encode()).hexdigest()[:8] 935 | safe_path = safe_path[:80] + '_' + hash_suffix 936 | 937 | return f"{safe_path}.md" 938 | 939 | 940 | # Standalone script to run the spider 941 | if __name__ == "__main__": 942 | import argparse 943 | import sys 944 | from scrapy.crawler import CrawlerProcess 945 | from scrapy.utils.project import get_project_settings 946 | 947 | parser = argparse.ArgumentParser( 948 | description='Scrape websites using sitemaps and convert to chunked markdown for RAG applications', 949 | formatter_class=argparse.RawDescriptionHelpFormatter, 950 | epilog='''Examples: 951 | %(prog)s www.tigerdata.com 952 | %(prog)s www.tigerdata.com -o tiger_docs -m 50 953 | %(prog)s www.tigerdata.com -o semantic_docs -m 5 --chunking semantic 954 | %(prog)s www.tigerdata.com --no-chunk --no-strip-images -m 100 955 | %(prog)s www.tigerdata.com --storage-type database --database-uri postgresql://user:pass@host:5432/dbname 956 | %(prog)s www.tigerdata.com --storage-type database --chunking semantic -m 10 957 | ''' 958 | ) 959 | 960 | # Optional arguments 961 | parser.add_argument('--domain', '-d', 962 | help='Domain to scrape (e.g., www.tigerdata.com)') 963 | 964 | parser.add_argument('-o', '--output-dir', 965 | default='scraped_docs', 966 | help='Output directory for scraped files (default: scraped_docs)') 967 | 968 | parser.add_argument('-m', '--max-pages', 969 | type=int, 970 | help='Maximum number of pages to scrape (default: unlimited)') 971 | 972 | parser.add_argument('--strip-images', 973 | action='store_true', 974 | default=True, 975 | help='Strip data: images from content (default: True)') 976 | 977 | parser.add_argument('--no-strip-images', 978 | dest='strip_images', 979 | action='store_false', 980 | help='Keep data: images in content') 981 | 982 | parser.add_argument('--chunk', 983 | action='store_true', 984 | default=True, 985 | help='Enable content chunking (default: True)') 986 | 987 | parser.add_argument('--no-chunk', 988 | dest='chunk', 989 | action='store_false', 990 | help='Disable content chunking') 991 | 992 | parser.add_argument('--chunking', 993 | choices=['header', 'semantic'], 994 | default='header', 995 | help='Chunking method: header (default) or semantic (requires OPENAI_API_KEY)') 996 | 997 | # 
Storage options 998 | parser.add_argument('--storage-type', 999 | choices=['file', 'database'], 1000 | default='database', 1001 | help='Storage type: database (default) or file') 1002 | 1003 | parser.add_argument('--database-uri', 1004 | help='PostgreSQL connection URI (default: uses DB_URL from environment)') 1005 | 1006 | parser.add_argument('--skip-indexes', 1007 | action='store_true', 1008 | help='Skip creating database indexes after import (for development/testing)') 1009 | 1010 | parser.add_argument('--delay', 1011 | type=float, 1012 | default=1.0, 1013 | help='Download delay in seconds (default: 1.0)') 1014 | 1015 | parser.add_argument('--concurrent', 1016 | type=int, 1017 | default=4, 1018 | help='Maximum concurrent requests (default: 4)') 1019 | 1020 | parser.add_argument('--url-prefix', 1021 | help='URL path prefix to filter pages (e.g., /docs to only scrape URLs under /docs)') 1022 | 1023 | parser.add_argument('--log-level', 1024 | choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], 1025 | default='INFO', 1026 | help='Logging level (default: INFO)') 1027 | 1028 | parser.add_argument('--user-agent', 1029 | default='Mozilla/5.0 (compatible; DocumentationScraper)', 1030 | help='User agent string') 1031 | 1032 | # Set defaults from environment variables 1033 | parser.set_defaults( 1034 | database_uri=os.environ.get('DB_URL', f'postgresql://{os.environ["PGUSER"]}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}'), 1035 | domain=os.environ.get('SCRAPER_DOMAIN', 'www.tigerdata.com'), 1036 | max_pages=int(os.environ.get('SCRAPER_MAX_PAGES', 0)) or None, 1037 | output_dir=os.environ.get('SCRAPER_OUTPUT_DIR', os.path.join(script_dir, 'build', 'scraped_docs')), 1038 | chunking=os.environ.get('SCRAPER_CHUNKING_METHOD', 'header'), 1039 | storage_type=os.environ.get('SCRAPER_STORAGE_TYPE', 'database'), 1040 | url_prefix=os.environ.get('SCRAPER_URL_PREFIX', '/docs') 1041 | ) 1042 | 1043 | args = parser.parse_args() 1044 | 1045 | # Validate semantic chunking requirements 1046 | if args.chunking == 'semantic': 1047 | if not os.getenv('OPENAI_API_KEY'): 1048 | print("Error: Semantic chunking requires OPENAI_API_KEY environment variable") 1049 | print("Set it with: export OPENAI_API_KEY=your_api_key") 1050 | print("Or create a .env file with: OPENAI_API_KEY=your_api_key") 1051 | sys.exit(1) 1052 | 1053 | # Configure Scrapy settings 1054 | settings = get_project_settings() 1055 | settings.update({ 1056 | 'USER_AGENT': args.user_agent, 1057 | 'ROBOTSTXT_OBEY': True, 1058 | 'DOWNLOAD_DELAY': args.delay, 1059 | 'RANDOMIZE_DOWNLOAD_DELAY': True, 1060 | 'CONCURRENT_REQUESTS': args.concurrent, 1061 | 'CONCURRENT_REQUESTS_PER_DOMAIN': min(args.concurrent, 2), 1062 | 'LOG_LEVEL': args.log_level, 1063 | }) 1064 | 1065 | print(f"Starting scraper for {args.domain}") 1066 | print(f"URL prefix: {args.url_prefix or 'none (all pages)'}") 1067 | print(f"Output directory: {args.output_dir}") 1068 | print(f"Max pages: {args.max_pages or 'unlimited'}") 1069 | print(f"Chunking: {'enabled' if args.chunk else 'disabled'} ({args.chunking})") 1070 | print(f"Strip images: {args.strip_images}") 1071 | print(f"Storage type: {args.storage_type}") 1072 | if args.storage_type == 'database': 1073 | print(f"Database URI: {args.database_uri}") 1074 | print() 1075 | 1076 | # Initialize storage managers 1077 | db_manager = None 1078 | file_manager = None 1079 | 1080 | if args.storage_type == 'database': 1081 | # Initialize embedding model for database storage (needed for both header and 
semantic) 1082 | client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY')) 1083 | 1084 | # Create embedding wrapper 1085 | class OpenAIEmbeddingWrapper: 1086 | def __init__(self, client): 1087 | self.client = client 1088 | self.model = "text-embedding-3-small" 1089 | 1090 | def get_text_embeddings(self, texts): 1091 | response = self.client.embeddings.create( 1092 | input=texts, 1093 | model=self.model 1094 | ) 1095 | return [embedding.embedding for embedding in response.data] 1096 | 1097 | embedding_model = OpenAIEmbeddingWrapper(client) 1098 | db_manager = DatabaseManager(database_uri=args.database_uri, embedding_model=embedding_model) 1099 | db_manager.initialize() 1100 | else: 1101 | file_manager = FileManager(args.output_dir) 1102 | 1103 | process = CrawlerProcess(settings) 1104 | process.crawl( 1105 | SitemapMarkdownSpider, 1106 | domain=args.domain, 1107 | output_dir=args.output_dir, 1108 | max_pages=args.max_pages, 1109 | strip_data_images=args.strip_images, 1110 | chunk_content=args.chunk, 1111 | chunking_method=args.chunking, 1112 | db_manager=db_manager, 1113 | file_manager=file_manager, 1114 | url_prefix=args.url_prefix 1115 | ) 1116 | process.start() 1117 | 1118 | # Create database indexes after scraping completes 1119 | if args.storage_type == 'database' and db_manager: 1120 | try: 1121 | # Check if any pages were scraped 1122 | page_count = db_manager.get_scraped_page_count() 1123 | print(f"Scraped {page_count} pages.") 1124 | 1125 | if page_count == 0: 1126 | print("Error: No pages were scraped. Aborting to preserve existing data.") 1127 | print("Check that the sitemap is accessible and the URL prefix is correct.") 1128 | raise SystemExit(1) 1129 | 1130 | if args.skip_indexes: 1131 | print("Skipping database finalization (--skip-indexes flag set).") 1132 | else: 1133 | print("Finalizing database...") 1134 | db_manager.finalize() 1135 | print("Database finalized successfully.") 1136 | except SystemExit: 1137 | raise 1138 | except Exception as e: 1139 | print(f"Failed to finish database: {e}") 1140 | raise SystemExit(1) 1141 | finally: 1142 | db_manager.close() 1143 | --------------------------------------------------------------------------------
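For reference, a minimal sketch (not a file in this repository) of how the chunks ingested by tiger_docs.py might be queried for semantic search, i.e. the data that backs the server's semanticSearchTigerDocs tool. It assumes the embedding column is a pgvector vector and that the <=> distance operator is available, as the add-hnsw-indexes migration suggests; the function name and query are illustrative, so adjust to the real schema if it differs.

import os
import openai
import psycopg

client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def search_tiger_docs(conn: psycopg.Connection, query: str, limit: int = 5):
    # Embed the query with the same model used at ingest time.
    embedding = client.embeddings.create(
        model="text-embedding-3-small", input=[query]
    ).data[0].embedding
    with conn.cursor() as cur:
        # Rank chunks by vector distance and join back to the page URL.
        cur.execute(
            """
            SELECT p.url, c.content
            FROM docs.timescale_chunks c
            JOIN docs.timescale_pages p ON p.id = c.page_id
            ORDER BY c.embedding <=> %s::vector
            LIMIT %s
            """,
            [str(embedding), limit],
        )
        return cur.fetchall()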