├── ingest ├── .python-version ├── pyproject.toml ├── tiger_docs_config.toml ├── README.md ├── postgres_docs.py └── tiger_docs.py ├── .prettierignore ├── .dockerignore ├── src ├── config.ts ├── types.ts ├── serverInfo.ts ├── stdio.ts ├── index.ts ├── apis │ ├── index.ts │ ├── viewSkill.ts │ ├── kewordSearchTigerDocs.ts │ ├── semanticSearchTigerDocs.ts │ └── semanticSearchPostgresDocs.ts ├── httpServer.ts ├── util │ └── featureFlags.ts ├── migrate.ts └── skillutils │ └── index.ts ├── .prettierrc.mjs ├── .env.sample ├── .gitignore ├── bun ├── docker └── tsdb │ └── 100_setup_db.sql ├── tsconfig.json ├── .github └── workflows │ ├── deploy-feature-branch.yaml │ ├── lint.yml │ ├── build-on-feature-branch.yaml │ ├── build-and-deploy-on-merge.yaml │ ├── ingest-tiger-docs.yaml │ ├── ingest-postgres-docs.yaml │ └── publish.yml ├── NOTICE ├── Dockerfile ├── migrations ├── 1759241361471-add-version-index.js ├── 1759241172003-add-hnsw-indexes.js ├── 1759851009030-add-tiger-indexes.js └── 1756387543053-initial.js ├── docker-compose.yml ├── CLAUDE.md ├── eslint.config.mjs ├── .claude-plugin └── marketplace.json ├── package.json ├── API.md ├── generate-server.json.ts ├── DEVELOPMENT.md ├── LICENSE ├── README.md └── skills ├── find-hypertable-candidates └── SKILL.md ├── migrate-postgres-tables-to-hypertables └── SKILL.md ├── design-postgres-tables └── SKILL.md └── setup-timescaledb-hypertables └── SKILL.md /ingest/.python-version: -------------------------------------------------------------------------------- 1 | 3.13 2 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | dist/ 3 | .venv/ -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .github/ 2 | chart/ 3 | dist/ 4 | node_modules/ 5 | -------------------------------------------------------------------------------- /src/config.ts: -------------------------------------------------------------------------------- 1 | export const schema = process.env.DB_SCHEMA || 'docs'; 2 | -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | import type { Pool } from 'pg'; 2 | 3 | export interface ServerContext extends Record { 4 | pgPool: Pool; 5 | schema: string; 6 | } 7 | -------------------------------------------------------------------------------- /.prettierrc.mjs: -------------------------------------------------------------------------------- 1 | export default { 2 | singleQuote: true, 3 | tabWidth: 2, 4 | trailingComma: 'all', 5 | bracketSpacing: true, 6 | arrowParens: 'always', 7 | printWidth: 80, 8 | }; 9 | -------------------------------------------------------------------------------- /.env.sample: -------------------------------------------------------------------------------- 1 | # Database 2 | PGHOST=db 3 | PGPORT=5432 4 | PGDATABASE=tsdb 5 | PGUSER=tsdbadmin 6 | PGPASSWORD=password 7 | DB_SCHEMA=docs 8 | 9 | # OpenAI Embedding API Key 10 | OPENAI_API_KEY=sk- 11 | 12 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | node_modules 3 | .env 4 | ingest/build 5 | ingest/postgres 6 | .idea 7 | .migrate 8 | 
.mcpregistry_* 9 | download/ 10 | server.json 11 | 12 | # Temporary files directory 13 | tmp/ 14 | -------------------------------------------------------------------------------- /src/serverInfo.ts: -------------------------------------------------------------------------------- 1 | import { Pool } from 'pg'; 2 | 3 | import { schema } from './config.js'; 4 | import { ServerContext } from './types.js'; 5 | 6 | export const serverInfo = { 7 | name: 'pg-aiguide', 8 | version: '1.0.0', 9 | } as const; 10 | 11 | const pgPool = new Pool(); 12 | 13 | export const context: ServerContext = { pgPool, schema }; 14 | -------------------------------------------------------------------------------- /src/stdio.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import { stdioServerFactory } from '@tigerdata/mcp-boilerplate'; 3 | import { apiFactories } from './apis/index.js'; 4 | import { promptFactories } from './skillutils/index.js'; 5 | import { context, serverInfo } from './serverInfo.js'; 6 | 7 | stdioServerFactory({ 8 | ...serverInfo, 9 | context, 10 | apiFactories, 11 | promptFactories, 12 | }); 13 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import 'dotenv/config'; 3 | import { cliEntrypoint } from '@tigerdata/mcp-boilerplate'; 4 | 5 | import { dirname, join } from 'path'; 6 | import { fileURLToPath } from 'url'; 7 | 8 | const __dirname = dirname(fileURLToPath(import.meta.url)); 9 | 10 | cliEntrypoint( 11 | join(__dirname, 'stdio.js'), 12 | join(__dirname, 'httpServer.js'), 13 | ).catch(console.error); 14 | -------------------------------------------------------------------------------- /bun: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | version="bun-v1.3.3" 4 | downloadDir=$(pwd)/download/bun/${version} 5 | bunCmd="$downloadDir/bin/bun" 6 | 7 | if [ ! -f "$bunCmd" ]; then 8 | echo Installing bun to "$bunCmd" 9 | bashArgs=() 10 | if [ "$version" != "latest" ]; then 11 | bashArgs=(-s "$version") 12 | fi 13 | curl -fsSL https://bun.sh/install | BUN_INSTALL="$downloadDir" bash "${bashArgs[@]}" 14 | fi 15 | 16 | exec "$bunCmd" "$@" -------------------------------------------------------------------------------- /docker/tsdb/100_setup_db.sql: -------------------------------------------------------------------------------- 1 | -- Sets up database similar to how Tiger Cloud works where we have a 2 | -- tsdbadmin user that is not a superuser. 
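-- Note (editorial, assumption based on docker-compose.yml): this script is mounted into
-- /docker-entrypoint-initdb.d/, so it only runs when the local db_data volume is first
-- initialized; the hard-coded password is intended for local development only.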
3 | CREATE ROLE tsdbadmin 4 | WITH 5 | LOGIN PASSWORD 'password'; 6 | 7 | CREATE DATABASE tsdb 8 | WITH 9 | OWNER tsdbadmin; 10 | 11 | \c tsdb 12 | 13 | CREATE EXTENSION IF NOT EXISTS vector CASCADE; 14 | 15 | -- Create schema for docs 16 | CREATE SCHEMA IF NOT EXISTS docs AUTHORIZATION tsdbadmin; 17 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "outDir": "./dist", 4 | "rootDir": "./src", 5 | "target": "ES2022", 6 | "module": "Node16", 7 | "moduleResolution": "Node16", 8 | "strict": true, 9 | "esModuleInterop": true, 10 | "skipLibCheck": true, 11 | "forceConsistentCasingInFileNames": true, 12 | "resolveJsonModule": true 13 | }, 14 | "include": ["./src/**/*.ts"], 15 | "exclude": ["node_modules", "dist"] 16 | } 17 | -------------------------------------------------------------------------------- /ingest/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "docs-importer" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.13" 7 | dependencies = [ 8 | "beautifulsoup4>=4.13.5", 9 | "langchain-text-splitters>=0.3.9", 10 | "markdownify>=1.1.0", 11 | "openai>=1.97.1", 12 | "psycopg[binary,pool]>=3.2.9", 13 | "python-dotenv[cli]>=1.1.1", 14 | "scrapy>=2.13.3", 15 | "tiktoken>=0.11.0", 16 | ] 17 | -------------------------------------------------------------------------------- /src/apis/index.ts: -------------------------------------------------------------------------------- 1 | import { semanticSearchPostgresDocsFactory } from './semanticSearchPostgresDocs.js'; 2 | import { semanticSearchTigerDocsFactory } from './semanticSearchTigerDocs.js'; 3 | import { viewSkillFactory } from './viewSkill.js'; 4 | import { keywordSearchTigerDocsFactory } from './kewordSearchTigerDocs.js'; 5 | 6 | export const apiFactories = [ 7 | keywordSearchTigerDocsFactory, 8 | semanticSearchPostgresDocsFactory, 9 | semanticSearchTigerDocsFactory, 10 | viewSkillFactory, 11 | ] as const; 12 | -------------------------------------------------------------------------------- /ingest/tiger_docs_config.toml: -------------------------------------------------------------------------------- 1 | # Configuration for domain-specific element removal 2 | # Add CSS selectors to ignore for each domain 3 | 4 | [domain_selectors] 5 | "www.tigerdata.com" = [ 6 | "script", 7 | "style", 8 | "nav", 9 | "footer", 10 | "#plan-availability", 11 | ".sr-only", 12 | ".code-block-copy-button" 13 | ] 14 | # Add more domains as needed 15 | 16 | # Default selectors applied to all domains 17 | [default_selectors] 18 | selectors = [ 19 | "script", 20 | "style", 21 | "nav", 22 | "footer" 23 | ] 24 | -------------------------------------------------------------------------------- /.github/workflows/deploy-feature-branch.yaml: -------------------------------------------------------------------------------- 1 | name: Deploy Feature Branch 2 | on: 3 | workflow_dispatch: 4 | 5 | jobs: 6 | deploy: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - name: Dispatch Workflow 10 | uses: timescale/workflow-dispatch-action@main 11 | with: 12 | github-token: ${{ secrets.ORG_GITHUB_AGENTS_TOKEN }} 13 | owner: timescale 14 | repo: tiger-agents-deploy 15 | workflow_id: deploy.yaml 16 | ref: 'main' 17 | inputs: '{"repository": "pg-aiguide", "sha": "${{ github.sha }}"}' 18 | 
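# Editorial note: a feature-branch deploy can also be kicked off from a local machine,
# e.g. `gh workflow run deploy-feature-branch.yaml --ref <your-branch>`
# (assumes the GitHub CLI is installed and authenticated for this repository).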
-------------------------------------------------------------------------------- /.github/workflows/lint.yml: -------------------------------------------------------------------------------- 1 | name: Lint 2 | 3 | on: 4 | push: 5 | branches: [main] 6 | pull_request: 7 | 8 | jobs: 9 | lint: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v4 14 | - uses: actions/setup-node@v5 15 | with: 16 | node-version: 22 17 | cache: npm 18 | - name: Install dependencies 19 | run: npm ci 20 | - name: Run ESLint 21 | run: npm run lint 22 | - name: Run Prettier 23 | run: npm run prettier:check 24 | - name: Run build 25 | run: npm run build --if-present 26 | -------------------------------------------------------------------------------- /.github/workflows/build-on-feature-branch.yaml: -------------------------------------------------------------------------------- 1 | name: Build Docker - Feature Branch Push 2 | on: 3 | push: 4 | branches-ignore: 5 | - main 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Dispatch Workflow 12 | uses: timescale/workflow-dispatch-action@main 13 | with: 14 | github-token: ${{ secrets.ORG_GITHUB_AGENTS_TOKEN }} 15 | owner: timescale 16 | repo: tiger-agents-deploy 17 | workflow_id: build.yaml 18 | ref: 'main' 19 | inputs: '{"repository": "pg-aiguide", "sha": "${{ github.sha }}"}' 20 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright 2025 Timescale, Inc., d/b/a Tiger Data 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | -------------------------------------------------------------------------------- /src/httpServer.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import { httpServerFactory, log } from '@tigerdata/mcp-boilerplate'; 3 | import { apiFactories } from './apis/index.js'; 4 | import { promptFactories } from './skillutils/index.js'; 5 | import { runMigrations } from './migrate.js'; 6 | import { context, serverInfo } from './serverInfo.js'; 7 | 8 | log.info('starting server...'); 9 | try { 10 | log.info('Running database migrations...'); 11 | await runMigrations(); 12 | log.info('Database migrations completed successfully'); 13 | } catch (error) { 14 | log.error('Database migration failed:', error as Error); 15 | throw error; 16 | } 17 | 18 | export const { registerCleanupFn } = httpServerFactory({ 19 | ...serverInfo, 20 | context, 21 | apiFactories, 22 | promptFactories, 23 | stateful: false, 24 | }); 25 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:22-alpine AS builder 2 | 3 | COPY package*.json /app/ 4 | COPY tsconfig.json /app/ 5 | COPY src /app/src 6 | COPY skills /app/skills 7 | COPY migrations /app/migrations 8 | 9 | WORKDIR /app 10 | 11 | RUN --mount=type=cache,target=/root/.npm npm install 12 | 13 | FROM node:22-alpine AS release 14 | 15 | LABEL io.modelcontextprotocol.server.name="io.github.timescale/pg-aiguide" 16 | 17 | WORKDIR /app 18 | 19 | COPY --from=builder /app/dist /app/dist 20 | COPY --from=builder /app/skills /app/skills 21 | COPY --from=builder /app/package.json /app/package.json 22 | COPY --from=builder /app/package-lock.json /app/package-lock.json 23 | COPY --from=builder /app/migrations /app/migrations 24 | 25 | ENV NODE_ENV=production 26 | 27 | RUN npm ci --ignore-scripts --omit-dev 28 | 29 | CMD ["node", "dist/index.js", "http"] 30 | -------------------------------------------------------------------------------- /migrations/1759241361471-add-version-index.js: -------------------------------------------------------------------------------- 1 | import 'dotenv/config'; 2 | import { Client } from 'pg'; 3 | 4 | const schema = process.env.DB_SCHEMA || 'docs'; 5 | 6 | export const description = 'Add index on postgres_pages.version'; 7 | 8 | export async function up() { 9 | const client = new Client(); 10 | 11 | try { 12 | await client.connect(); 13 | await client.query(/* sql */ ` 14 | CREATE INDEX CONCURRENTLY IF NOT EXISTS postgres_pages_version_idx 15 | ON ${schema}.postgres_pages (version); 16 | `); 17 | } catch (e) { 18 | throw e; 19 | } finally { 20 | await client.end(); 21 | } 22 | } 23 | 24 | export async function down() { 25 | const client = new Client(); 26 | 27 | try { 28 | await client.connect(); 29 | await client.query(/* sql */ ` 30 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.postgres_pages_version_idx; 31 | `); 32 | } catch (e) { 33 | throw e; 34 | } finally { 35 | await client.end(); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | db: 3 | image: timescale/timescaledb-ha:pg17 4 | environment: 5 | - POSTGRES_USER=postgres 6 | - POSTGRES_PASSWORD=postgres 7 | ports: 8 | - '5432:5432' 9 | volumes: 10 | - db_data:/home/postgres/pgdata/data 11 | - 
./docker/tsdb/100_setup_db.sql:/docker-entrypoint-initdb.d/100_setup_db.sql 12 | healthcheck: 13 | test: ['CMD-SHELL', 'pg_isready -U postgres -d postgres'] 14 | interval: 1s 15 | timeout: 5s 16 | retries: 50 17 | 18 | app: 19 | build: 20 | context: . 21 | target: builder 22 | depends_on: 23 | db: 24 | condition: service_healthy 25 | env_file: .env 26 | ports: 27 | - '3020:3001' 28 | volumes: 29 | - ./migrations:/app/migrations 30 | - ./src:/app/src 31 | - ./skills:/app/skills 32 | - ./package.json:/app/package.json 33 | - ./package-lock.json:/app/package-lock.json 34 | - ./tsconfig.json:/app/tsconfig.json 35 | command: npm run watch:http 36 | 37 | volumes: 38 | db_data: 39 | -------------------------------------------------------------------------------- /src/util/featureFlags.ts: -------------------------------------------------------------------------------- 1 | import type { McpFeatureFlags } from '@tigerdata/mcp-boilerplate'; 2 | 3 | export interface FeatureFlags { 4 | mcpSkillsEnabled: boolean; 5 | } 6 | 7 | /** 8 | * Parse feature flags from query parameters or environment variables 9 | * Supports both HTTP (?disable_mcp_skills=1) and stdio transport (env var) 10 | */ 11 | export const parseFeatureFlags = ( 12 | query?: McpFeatureFlags['query'], 13 | ): FeatureFlags => { 14 | // Default: skills enabled 15 | let mcpSkillsEnabled = true; 16 | 17 | // Check query parameters first (for HTTP transport) 18 | if (query) { 19 | if ( 20 | query.disable_mcp_skills === '1' || 21 | query.disable_mcp_skills === 'true' 22 | ) { 23 | mcpSkillsEnabled = false; 24 | } 25 | } 26 | // Fall back to environment variables (for stdio transport) 27 | else if (process.env.DISABLE_MCP_SKILLS) { 28 | if ( 29 | process.env.DISABLE_MCP_SKILLS === '1' || 30 | process.env.DISABLE_MCP_SKILLS === 'true' 31 | ) { 32 | mcpSkillsEnabled = false; 33 | } 34 | } 35 | 36 | return { 37 | mcpSkillsEnabled, 38 | }; 39 | }; 40 | -------------------------------------------------------------------------------- /CLAUDE.md: -------------------------------------------------------------------------------- 1 | # Tiger Docs MCP Server - Development Guidelines 2 | 3 | ## Build, Test & Run Commands 4 | 5 | - Build: `npm run build` - Compiles TypeScript to JavaScript 6 | - Watch mode: `npm run watch` - Watches for changes and rebuilds automatically 7 | - Run server: `npm run start` - Starts the MCP server using stdio transport 8 | - Prepare release: `npm run prepare` - Builds the project for publishing 9 | 10 | ## Code Style Guidelines 11 | 12 | - Use ES modules with `.js` extension in import paths 13 | - Strictly type all functions and variables with TypeScript 14 | - Follow zod schema patterns for tool input validation 15 | - Use `.nullable()` instead of `.optional()` for optional MCP tool parameters (required for gpt-5 compatibility) 16 | - Prefer async/await over callbacks and Promise chains 17 | - Place all imports at top of file, grouped by external then internal 18 | - Use descriptive variable names that clearly indicate purpose 19 | - Implement proper cleanup for timers and resources in server shutdown 20 | - Follow camelCase for variables/functions, PascalCase for types/classes, UPPER_CASE for constants 21 | - Handle errors with try/catch blocks and provide clear error messages 22 | - Use consistent indentation (2 spaces) and trailing commas in multi-line objects 23 | -------------------------------------------------------------------------------- /migrations/1759241172003-add-hnsw-indexes.js: 
-------------------------------------------------------------------------------- 1 | import 'dotenv/config'; 2 | import { Client } from 'pg'; 3 | 4 | const schema = process.env.DB_SCHEMA || 'docs'; 5 | 6 | export const description = 'Add HNSW indexes to embedding columns'; 7 | 8 | export async function up() { 9 | const client = new Client(); 10 | 11 | try { 12 | await client.connect(); 13 | await client.query(/* sql */ ` 14 | CREATE INDEX CONCURRENTLY IF NOT EXISTS postgres_chunks_embedding_idx 15 | ON ${schema}.postgres_chunks 16 | USING hnsw (embedding vector_cosine_ops); 17 | `); 18 | await client.query(/* sql */ ` 19 | CREATE INDEX CONCURRENTLY IF NOT EXISTS timescale_chunks_embedding_idx 20 | ON ${schema}.timescale_chunks 21 | USING hnsw (embedding vector_cosine_ops); 22 | `); 23 | } catch (e) { 24 | throw e; 25 | } finally { 26 | await client.end(); 27 | } 28 | } 29 | 30 | export async function down() { 31 | const client = new Client(); 32 | 33 | try { 34 | await client.connect(); 35 | await client.query(/* sql */ ` 36 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.postgres_chunks_embedding_idx; 37 | `); 38 | await client.query(/* sql */ ` 39 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.timescale_chunks_embedding_idx; 40 | `); 41 | } catch (e) { 42 | throw e; 43 | } finally { 44 | await client.end(); 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /eslint.config.mjs: -------------------------------------------------------------------------------- 1 | // @ts-check 2 | 3 | import boilerplatePlugin from '@tigerdata/mcp-boilerplate/eslintPlugin'; 4 | import eslint from '@eslint/js'; 5 | import { defineConfig } from 'eslint/config'; 6 | import { dirname } from 'path'; 7 | import tseslint from 'typescript-eslint'; 8 | import { fileURLToPath } from 'url'; 9 | 10 | const __dirname = dirname(fileURLToPath(import.meta.url)); 11 | 12 | export default defineConfig( 13 | eslint.configs.recommended, 14 | tseslint.configs.recommended, 15 | { 16 | files: ['src/**/*.ts'], 17 | plugins: { 18 | 'mcp-boilerplate': boilerplatePlugin, 19 | }, 20 | languageOptions: { 21 | parserOptions: { 22 | project: './tsconfig.json', 23 | tsconfigRootDir: __dirname, 24 | }, 25 | }, 26 | rules: { 27 | // Disable base rule for unused vars and use TypeScript-specific one 28 | 'no-unused-vars': 'off', 29 | '@typescript-eslint/no-unused-vars': [ 30 | 'error', 31 | { argsIgnorePattern: '^_' }, 32 | ], 33 | '@typescript-eslint/explicit-function-return-type': 'warn', 34 | '@typescript-eslint/no-inferrable-types': 'warn', 35 | 'prefer-const': 'error', 36 | // Custom rule to prevent .optional() in inputSchema 37 | 'mcp-boilerplate/no-optional-input-schema': 'error', 38 | }, 39 | }, 40 | { 41 | ignores: ['dist/', 'node_modules/', 'migrations/', 'skills/'], 42 | }, 43 | ); 44 | -------------------------------------------------------------------------------- /.claude-plugin/marketplace.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "aiguide", 3 | "owner": { 4 | "name": "TigerData", 5 | "url": "https://tigerdata.com", 6 | "email": "support@tigerdata.com" 7 | }, 8 | "metadata": { 9 | "description": "PostgreSQL documentation and ecosystem tools marketplace", 10 | "version": "1.0.0", 11 | "pluginRoot": "." 
12 | }, 13 | "plugins": [ 14 | { 15 | "name": "pg", 16 | "source": "./", 17 | "description": "Comprehensive PostgreSQL documentation and best practices through semantic search and curated skills, including ecosystem tools like TimescaleDB and Tiger Cloud", 18 | "version": "0.1.0", 19 | "author": { 20 | "name": "TigerData", 21 | "url": "https://tigerdata.com" 22 | }, 23 | "homepage": "https://tigerdata.com", 24 | "repository": "https://github.com/timescale/pg-aiguide", 25 | "license": "Apache-2.0", 26 | "keywords": [ 27 | "postgresql", 28 | "postgres", 29 | "database", 30 | "sql", 31 | "skills", 32 | "aiguide", 33 | "timescaledb", 34 | "documentation", 35 | "semantic-search", 36 | "best-practices" 37 | ], 38 | "category": "database", 39 | "mcpServers": { 40 | "pg-aiguide": { 41 | "type": "http", 42 | "url": "https://mcp.tigerdata.com/docs?disable_mcp_skills=1" 43 | } 44 | }, 45 | "strict": false 46 | } 47 | ] 48 | } 49 | -------------------------------------------------------------------------------- /.github/workflows/build-and-deploy-on-merge.yaml: -------------------------------------------------------------------------------- 1 | name: Build and Deploy - Merge main 2 | on: 3 | push: 4 | branches: 5 | - main 6 | 7 | jobs: 8 | build: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Dispatch Workflow 12 | uses: timescale/workflow-dispatch-action@main 13 | with: 14 | github-token: ${{ secrets.ORG_GITHUB_AGENTS_TOKEN }} 15 | owner: timescale 16 | repo: tiger-agents-deploy 17 | workflow_id: build.yaml 18 | ref: 'main' 19 | inputs: > 20 | { 21 | "repository": "pg-aiguide", 22 | "sha": "${{ github.sha }}", 23 | "latest": true 24 | } 25 | 26 | deploy: 27 | runs-on: ubuntu-latest 28 | needs: build 29 | strategy: 30 | matrix: 31 | include: 32 | - env: dev 33 | namespace: savannah-system 34 | - env: prod 35 | namespace: tiger-mcp 36 | steps: 37 | - name: Dispatch Workflow - ${{ matrix.env }} 38 | uses: timescale/workflow-dispatch-action@main 39 | with: 40 | github-token: ${{ secrets.ORG_GITHUB_AGENTS_TOKEN }} 41 | owner: timescale 42 | repo: tiger-agents-deploy 43 | workflow_id: deploy.yaml 44 | ref: 'main' 45 | inputs: > 46 | { 47 | "repository": "pg-aiguide", 48 | "sha": "${{ github.sha }}", 49 | "env": "${{ matrix.env }}", 50 | "namespace": "${{ matrix.namespace }}" 51 | } 52 | -------------------------------------------------------------------------------- /.github/workflows/ingest-tiger-docs.yaml: -------------------------------------------------------------------------------- 1 | name: Ingest Tiger Docs 2 | 3 | on: 4 | schedule: 5 | - cron: '0 2 * * 0' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | ingest-dev: 10 | name: Ingest Tiger docs for Dev 11 | runs-on: ubuntu-latest 12 | defaults: 13 | run: 14 | working-directory: ingest 15 | env: 16 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 17 | PGHOST: ${{ secrets.PGHOST }} 18 | PGPORT: ${{ secrets.PGPORT }} 19 | PGDATABASE: ${{ secrets.PGDATABASE }} 20 | PGUSER: ${{ secrets.PGUSER }} 21 | PGPASSWORD: ${{ secrets.PGPASSWORD }} 22 | steps: &ingest-steps 23 | - name: Checkout repository 24 | uses: actions/checkout@v5 25 | 26 | - name: Set up Python 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version-file: ./ingest/.python-version 30 | 31 | - name: Install uv 32 | uses: astral-sh/setup-uv@v6 33 | 34 | - name: Install python dependencies 35 | run: uv sync 36 | 37 | - name: Ingest Tiger docs 38 | run: uv run python tiger_docs.py 39 | 40 | ingest-prod: 41 | name: Ingest Tiger docs for Prod 42 | runs-on: ubuntu-latest 43 | defaults: 44 | 
run: 45 | working-directory: ingest 46 | env: 47 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 48 | PGHOST: ${{ secrets.PROD_PGHOST }} 49 | PGPORT: ${{ secrets.PROD_PGPORT }} 50 | PGDATABASE: ${{ secrets.PROD_PGDATABASE }} 51 | PGUSER: ${{ secrets.PROD_PGUSER }} 52 | PGPASSWORD: ${{ secrets.PROD_PGPASSWORD }} 53 | steps: *ingest-steps 54 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@tigerdata/pg-aiguide", 3 | "version": "0.2.4", 4 | "description": "Comprehensive PostgreSQL documentation and best practices through semantic search and curated skills, including ecosystem tools like TimescaleDB and Tiger Cloud", 5 | "license": "Apache-2.0", 6 | "author": "TigerData", 7 | "homepage": "https://tigerdata.com", 8 | "repository": { 9 | "type": "git", 10 | "url": "https://github.com/timescale/pg-aiguide" 11 | }, 12 | "mcpName": "io.github.timescale/pg-aiguide", 13 | "type": "module", 14 | "bin": { 15 | "pg-aiguide": "dist/index.js" 16 | }, 17 | "files": [ 18 | "dist" 19 | ], 20 | "scripts": { 21 | "build": "tsc && shx chmod +x dist/*.js", 22 | "prepare": "npm run build", 23 | "watch": "tsx watch src/index.ts stdio", 24 | "watch:http": "tsx watch src/index.ts http", 25 | "start": "node dist/index.js stdio", 26 | "start:http": "node dist/index.js http", 27 | "inspector": "npx @modelcontextprotocol/inspector", 28 | "lint": "eslint", 29 | "lint:fix": "eslint --fix", 30 | "prettier:check": "prettier --check .", 31 | "prettier:write": "prettier --write .", 32 | "migrate": "migrate" 33 | }, 34 | "dependencies": { 35 | "@ai-sdk/openai": "^2.0.80", 36 | "@tigerdata/mcp-boilerplate": "^0.8.0", 37 | "ai": "^5.0.108", 38 | "dotenv": "^17.2.3", 39 | "gray-matter": "^4.0.3", 40 | "migrate": "^2.1.0", 41 | "pg": "^8.16.3", 42 | "zod": "^3.25.76" 43 | }, 44 | "devDependencies": { 45 | "@eslint/js": "^9.39.1", 46 | "@types/node": "^22.19.2", 47 | "@types/pg": "^8.15.6", 48 | "eslint": "^9.39.1", 49 | "prettier": "^3.7.4", 50 | "shx": "^0.4.0", 51 | "tsx": "^4.21.0", 52 | "typescript": "^5.9.3", 53 | "typescript-eslint": "^8.49.0" 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /migrations/1759851009030-add-tiger-indexes.js: -------------------------------------------------------------------------------- 1 | import 'dotenv/config'; 2 | import { Client } from 'pg'; 3 | 4 | const schema = process.env.DB_SCHEMA || 'docs'; 5 | 6 | export const description = 'Add HNSW indexes to embedding columns'; 7 | 8 | export async function up() { 9 | const client = new Client(); 10 | 11 | try { 12 | await client.connect(); 13 | await client.query(/* sql */ ` 14 | CREATE INDEX CONCURRENTLY IF NOT EXISTS timescale_pages_domain_idx 15 | ON ${schema}.timescale_pages(domain); 16 | `); 17 | await client.query(/* sql */ ` 18 | CREATE INDEX CONCURRENTLY IF NOT EXISTS timescale_pages_url_idx 19 | ON ${schema}.timescale_pages(url); 20 | `); 21 | await client.query(/* sql */ ` 22 | CREATE INDEX CONCURRENTLY IF NOT EXISTS timescale_chunks_page_id_idx 23 | ON ${schema}.timescale_chunks(page_id); 24 | `); 25 | await client.query(/* sql */ ` 26 | CREATE INDEX CONCURRENTLY IF NOT EXISTS timescale_chunks_metadata_idx 27 | ON ${schema}.timescale_chunks 28 | USING gin(metadata); 29 | `); 30 | } catch (e) { 31 | throw e; 32 | } finally { 33 | await client.end(); 34 | } 35 | } 36 | 37 | export async function down() { 38 | const client = new Client(); 
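// Mirrors up(): each index is dropped CONCURRENTLY so reads and writes on the
// timescale_pages/timescale_chunks tables are not blocked while it is removed.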
39 | 40 | try { 41 | await client.connect(); 42 | await client.query(/* sql */ ` 43 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.timescale_pages_domain_idx; 44 | `); 45 | await client.query(/* sql */ ` 46 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.timescale_pages_url_idx; 47 | `); 48 | await client.query(/* sql */ ` 49 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.timescale_chunks_page_id_idx; 50 | `); 51 | await client.query(/* sql */ ` 52 | DROP INDEX CONCURRENTLY IF EXISTS ${schema}.timescale_chunks_metadata_idx; 53 | `); 54 | } catch (e) { 55 | throw e; 56 | } finally { 57 | await client.end(); 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /.github/workflows/ingest-postgres-docs.yaml: -------------------------------------------------------------------------------- 1 | name: Ingest PostgreSQL Docs 2 | run-name: Ingest PostgreSQL ${{ inputs.version }} Docs 3 | 4 | on: 5 | workflow_dispatch: 6 | inputs: 7 | version: 8 | description: 'PostgreSQL version to ingest (e.g. 14, 15, etc.)' 9 | required: true 10 | default: '17' 11 | 12 | jobs: 13 | ingest-dev: 14 | name: Ingest PostgreSQL docs for Dev 15 | runs-on: ubuntu-latest 16 | defaults: 17 | run: 18 | working-directory: ingest 19 | env: 20 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 21 | PGHOST: ${{ secrets.PGHOST }} 22 | PGPORT: ${{ secrets.PGPORT }} 23 | PGDATABASE: ${{ secrets.PGDATABASE }} 24 | PGUSER: ${{ secrets.PGUSER }} 25 | PGPASSWORD: ${{ secrets.PGPASSWORD }} 26 | steps: &ingest-steps 27 | - name: Checkout repository 28 | uses: actions/checkout@v5 29 | 30 | - name: Set up Python 31 | uses: actions/setup-python@v5 32 | with: 33 | python-version-file: ./ingest/.python-version 34 | 35 | - name: Install uv 36 | uses: astral-sh/setup-uv@v6 37 | 38 | - name: Install python dependencies 39 | run: uv sync 40 | 41 | - name: Install system dependencies 42 | run: | 43 | sudo apt-get update 44 | sudo apt-get install -y docbook-xml docbook-xsl libxml2-utils xsltproc fop 45 | 46 | - name: Ingest PostgreSQL ${{ github.event.inputs.version }} docs for dev 47 | run: uv run python postgres_docs.py ${{ github.event.inputs.version }} 48 | 49 | ingest-prod: 50 | name: Ingest PostgreSQL docs for Prod 51 | runs-on: ubuntu-latest 52 | defaults: 53 | run: 54 | working-directory: ingest 55 | env: 56 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 57 | PGHOST: ${{ secrets.PROD_PGHOST }} 58 | PGPORT: ${{ secrets.PROD_PGPORT }} 59 | PGDATABASE: ${{ secrets.PROD_PGDATABASE }} 60 | PGUSER: ${{ secrets.PROD_PGUSER }} 61 | PGPASSWORD: ${{ secrets.PROD_PGPASSWORD }} 62 | steps: *ingest-steps 63 | -------------------------------------------------------------------------------- /src/apis/viewSkill.ts: -------------------------------------------------------------------------------- 1 | import { ApiFactory, InferSchema } from '@tigerdata/mcp-boilerplate'; 2 | import { z } from 'zod'; 3 | import { ServerContext } from '../types.js'; 4 | import { skills, viewSkillContent } from '../skillutils/index.js'; 5 | import { parseFeatureFlags } from '../util/featureFlags.js'; 6 | 7 | // Create enum schema dynamically 8 | const inputSchema = { 9 | name: z 10 | .enum(Array.from(skills.keys()) as [string, ...string[]]) 11 | .describe('The name of the skill to retrieve'), 12 | } as const; 13 | 14 | // Path within the skill directory - currently fixed to SKILL.md 15 | const SKILL_PATH = 'SKILL.md'; 16 | 17 | const outputSchema = { 18 | name: z.string().describe('The name of the requested skill'), 19 | path: 
z.string().describe('The path within the skill (e.g., "SKILL.md")'), 20 | description: z.string().describe('Description of what this skill does'), 21 | content: z.string().describe('The full skill content'), 22 | } as const; 23 | 24 | type OutputSchema = InferSchema; 25 | 26 | export const viewSkillFactory: ApiFactory< 27 | ServerContext, 28 | typeof inputSchema, 29 | typeof outputSchema 30 | > = (_context, { query }) => { 31 | // Parse feature flags from query or environment 32 | const flags = parseFeatureFlags(query); 33 | 34 | return { 35 | name: 'view_skill', 36 | disabled: !flags.mcpSkillsEnabled, 37 | config: { 38 | title: 'View Skill', 39 | description: `Retrieve detailed skills for TimescaleDB operations and best practices. 40 | 41 | Available Skills: 42 | 43 | ${Array.from(skills.values()) 44 | .map((s) => `**${s.name}** - ${s.description}`) 45 | .join('\n\n')} 46 | `, 47 | inputSchema, 48 | outputSchema, 49 | }, 50 | fn: async ({ name }): Promise => { 51 | const skill = skills.get(name); 52 | 53 | if (!skill) { 54 | throw new Error(`Skill '${name}' not found`); 55 | } 56 | 57 | const content = await viewSkillContent(name, SKILL_PATH); 58 | 59 | return { 60 | name: skill.name, 61 | path: SKILL_PATH, 62 | description: skill.description || '', 63 | content, 64 | }; 65 | }, 66 | }; 67 | }; 68 | -------------------------------------------------------------------------------- /API.md: -------------------------------------------------------------------------------- 1 | # API 2 | 3 | All methods are exposed as MCP tools. 4 | 5 | ## Semantic Search 6 | 7 | ### `semantic_search_postgres_docs` 8 | 9 | Searches the PostgreSQL documentation for relevant entries based on semantic similarity to the search prompt. 10 | 11 | **MCP Tool**: `semantic_search_postgres_docs` 12 | 13 | #### Input 14 | 15 | ```jsonc 16 | { 17 | "prompt": "What is the SQL command to create a table?", 18 | "version": 17, // optional, default is 17 (supports versions 14-18) 19 | "limit": 10, // optional, default is 10 20 | } 21 | ``` 22 | 23 | #### Output 24 | 25 | ```jsonc 26 | { 27 | "results": [ 28 | { 29 | "id": 11716, 30 | "content": "CREATE TABLE ...", 31 | "metadata": "{...}", // JSON-encoded metadata 32 | "distance": 0.407, // lower = more relevant 33 | }, 34 | // ...more results 35 | ], 36 | } 37 | ``` 38 | 39 | ### `semantic_search_tiger_docs` 40 | 41 | Searches the TigerData and TimescaleDB documentation using semantic similarity. 42 | 43 | **MCP Tool**: `semantic_search_tiger_docs` 44 | 45 | #### Input 46 | 47 | ```jsonc 48 | { 49 | "prompt": "How do I set up continuous aggregates?", 50 | "limit": 10, // optional, default is 10 51 | } 52 | ``` 53 | 54 | #### Output 55 | 56 | Same format as PostgreSQL semantic search above. 57 | 58 | ## Skills 59 | 60 | ### `view_skill` 61 | 62 | Retrieves curated skills for common PostgreSQL and TimescaleDB tasks. This tool is disabled 63 | when deploying as a claude plugin (which use [agent skills ](https://www.claude.com/blog/skills) directly). 
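When running the server directly, the same behavior is controlled by a feature flag: the HTTP transport accepts a `disable_mcp_skills=1` query parameter (the Claude plugin marketplace entry points at `https://mcp.tigerdata.com/docs?disable_mcp_skills=1`), and the stdio transport honors the `DISABLE_MCP_SKILLS` environment variable; see `src/util/featureFlags.ts`.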
64 | 65 | **MCP Tool**: `view_skill` 66 | 67 | ### Input 68 | 69 | ```jsonc 70 | { 71 | "name": "setup-timescaledb-hypertables", // see available skills in tool description 72 | "path": "SKILL.md", // optional, defaults to "SKILL.md" 73 | } 74 | ``` 75 | 76 | ### Output 77 | 78 | ```jsonc 79 | { 80 | "name": "setup-timescaledb-hypertables", 81 | "path": "SKILL.md", 82 | "description": "Step-by-step instructions for designing table schemas and setting up TimescaleDB with hypertables, indexes, compression, retention policies, and continuous aggregates.", 83 | "content": "...", // full skill content 84 | } 85 | ``` 86 | 87 | **Available Skills**: Check the MCP tool description for the current list of available skills or look in the `skills` directory. 88 | -------------------------------------------------------------------------------- /src/apis/kewordSearchTigerDocs.ts: -------------------------------------------------------------------------------- 1 | import { ApiFactory, InferSchema } from '@tigerdata/mcp-boilerplate'; 2 | import { z } from 'zod'; 3 | import { ServerContext } from '../types.js'; 4 | 5 | const inputSchema = { 6 | limit: z.coerce 7 | .number() 8 | .int() 9 | .describe('The maximum number of matches to return. Defaults to 10.'), 10 | keywords: z.string().describe('The set of keywords to search for.'), 11 | } as const; 12 | 13 | const zEmbeddedDoc = z.object({ 14 | id: z 15 | .number() 16 | .int() 17 | .describe('The unique identifier of the documentation entry.'), 18 | content: z.string().describe('The content of the documentation entry.'), 19 | metadata: z 20 | .string() 21 | .describe( 22 | 'Additional metadata about the documentation entry, as a JSON encoded string.', 23 | ), 24 | score: z 25 | .number() 26 | .describe( 27 | 'The score indicating the relevance of the entry to the keywords. Higher values indicate higher relevance.', 28 | ), 29 | }); 30 | 31 | type EmbeddedDoc = z.infer; 32 | 33 | const outputSchema = { 34 | results: z.array(zEmbeddedDoc), 35 | } as const; 36 | 37 | type OutputSchema = InferSchema; 38 | 39 | export const keywordSearchTigerDocsFactory: ApiFactory< 40 | ServerContext, 41 | typeof inputSchema, 42 | typeof outputSchema, 43 | z.infer<(typeof outputSchema)['results']> 44 | > = ({ pgPool, schema }) => ({ 45 | name: 'keyword_search_tiger_docs', 46 | method: 'get', 47 | route: '/keyword-search/tiger-docs', 48 | config: { 49 | title: 'Keyword Search of Tiger Documentation', 50 | description: 51 | 'This retrieves relevancy ranked documentation entries based on a set of keywords, using a bm25 search. 
The content covers Tiger Cloud and TimescaleDB topics.', 52 | inputSchema, 53 | outputSchema, 54 | }, 55 | disabled: process.env.ENABLE_KEYWORD_SEARCH !== 'true', 56 | fn: async ({ keywords, limit }): Promise => { 57 | if (limit < 0) { 58 | throw new Error('Limit must be a non-negative integer.'); 59 | } 60 | if (!keywords.trim()) { 61 | throw new Error('Keywords must be a non-empty string.'); 62 | } 63 | 64 | const result = await pgPool.query( 65 | /* sql */ ` 66 | SELECT 67 | id::int, 68 | content, 69 | metadata::text, 70 | -(content <@> to_bm25query($1, 'docs.timescale_chunks_content_idx')) as score 71 | FROM ${schema}.timescale_chunks 72 | ORDER BY content <@> to_bm25query($1, 'docs.timescale_chunks_content_idx') 73 | LIMIT $2 74 | `, 75 | [keywords, limit || 10], 76 | ); 77 | 78 | return { 79 | results: result.rows, 80 | }; 81 | }, 82 | pickResult: (r) => r.results, 83 | }); 84 | -------------------------------------------------------------------------------- /generate-server.json.ts: -------------------------------------------------------------------------------- 1 | import { writeFile } from 'fs/promises'; 2 | 3 | const version = process.argv[2]?.trim().replace(/^v/, ''); 4 | if (!version) { 5 | console.error('Must provide version as first argument'); 6 | process.exit(1); 7 | } 8 | 9 | const environmentVariables = [ 10 | { 11 | description: 'Your API key for text embeddings via OpenAI', 12 | isRequired: true, 13 | format: 'string', 14 | isSecret: true, 15 | name: 'OPENAI_API_KEY', 16 | }, 17 | { 18 | description: 'PostgreSQL host to connect to', 19 | isRequired: true, 20 | format: 'string', 21 | isSecret: true, 22 | name: 'PGHOST', 23 | }, 24 | { 25 | description: 'PostgreSQL port to connect to', 26 | isRequired: true, 27 | format: 'number', 28 | isSecret: true, 29 | name: 'PGPORT', 30 | }, 31 | { 32 | description: 'PostgreSQL user to connect as', 33 | isRequired: true, 34 | format: 'string', 35 | isSecret: true, 36 | name: 'PGUSER', 37 | }, 38 | { 39 | description: 'PostgreSQL password to connect with', 40 | isRequired: true, 41 | format: 'string', 42 | isSecret: true, 43 | name: 'PGPASSWORD', 44 | }, 45 | { 46 | description: 'PostgreSQL database to connect to', 47 | isRequired: true, 48 | format: 'string', 49 | isSecret: true, 50 | name: 'PGDATABASE', 51 | }, 52 | { 53 | description: 'PostgreSQL database schema to use', 54 | isRequired: false, 55 | format: 'string', 56 | isSecret: true, 57 | name: 'DB_SCHEMA', 58 | }, 59 | ]; 60 | 61 | const output = { 62 | $schema: 63 | 'https://static.modelcontextprotocol.io/schemas/2025-10-17/server.schema.json', 64 | name: 'io.github.timescale/pg-aiguide', 65 | // max length 100 chars: 66 | description: 67 | 'Comprehensive PostgreSQL documentation and best practices, including ecosystem tools', 68 | repository: { 69 | url: 'https://github.com/timescale/pg-aiguide', 70 | source: 'github', 71 | }, 72 | version, 73 | remotes: [ 74 | { 75 | type: 'streamable-http', 76 | url: 'https://mcp.tigerdata.com/docs', 77 | }, 78 | ], 79 | packages: [ 80 | { 81 | registryType: 'npm', 82 | identifier: '@tigerdata/pg-aiguide', 83 | version, 84 | transport: { 85 | type: 'stdio', 86 | }, 87 | environmentVariables, 88 | }, 89 | { 90 | registryType: 'oci', 91 | identifier: `ghcr.io/timescale/pg-aiguide:${version}`, 92 | transport: { 93 | type: 'stdio', 94 | }, 95 | environmentVariables, 96 | }, 97 | ], 98 | }; 99 | 100 | await writeFile('server.json', JSON.stringify(output, null, 2)); 101 | 
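// Invoked from .github/workflows/publish.yml as `./bun ./generate-server.json.ts vX.Y.Z`;
// the generated server.json is what `mcp-publisher publish` uploads to the MCP Registry.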
-------------------------------------------------------------------------------- /src/apis/semanticSearchTigerDocs.ts: -------------------------------------------------------------------------------- 1 | import { openai } from '@ai-sdk/openai'; 2 | import { ApiFactory, InferSchema } from '@tigerdata/mcp-boilerplate'; 3 | import { embed } from 'ai'; 4 | import { z } from 'zod'; 5 | import { ServerContext } from '../types.js'; 6 | 7 | const inputSchema = { 8 | limit: z.coerce 9 | .number() 10 | .int() 11 | .describe('The maximum number of matches to return. Defaults to 10.'), 12 | prompt: z 13 | .string() 14 | .describe( 15 | 'The natural language query used to search the documentation for relevant information.', 16 | ), 17 | } as const; 18 | 19 | const zEmbeddedDoc = z.object({ 20 | id: z 21 | .number() 22 | .int() 23 | .describe('The unique identifier of the documentation entry.'), 24 | content: z.string().describe('The content of the documentation entry.'), 25 | metadata: z 26 | .string() 27 | .describe( 28 | 'Additional metadata about the documentation entry, as a JSON encoded string.', 29 | ), 30 | distance: z 31 | .number() 32 | .describe( 33 | 'The distance score indicating the relevance of the entry to the prompt. Lower values indicate higher relevance.', 34 | ), 35 | }); 36 | 37 | type EmbeddedDoc = z.infer; 38 | 39 | const outputSchema = { 40 | results: z.array(zEmbeddedDoc), 41 | } as const; 42 | 43 | type OutputSchema = InferSchema; 44 | 45 | export const semanticSearchTigerDocsFactory: ApiFactory< 46 | ServerContext, 47 | typeof inputSchema, 48 | typeof outputSchema, 49 | z.infer<(typeof outputSchema)['results']> 50 | > = ({ pgPool, schema }) => ({ 51 | name: 'semantic_search_tiger_docs', 52 | method: 'get', 53 | route: '/semantic-search/tiger-docs', 54 | config: { 55 | title: 'Semantic Search of Tiger Documentation Embeddings', 56 | description: 57 | 'This retrieves relevant documentation entries based on a natural language query. 
The content covers Tiger Cloud and TimescaleDB topics.', 58 | inputSchema, 59 | outputSchema, 60 | }, 61 | fn: async ({ prompt, limit }): Promise => { 62 | if (limit < 0) { 63 | throw new Error('Limit must be a non-negative integer.'); 64 | } 65 | if (!prompt.trim()) { 66 | throw new Error('Prompt must be a non-empty string.'); 67 | } 68 | 69 | const { embedding } = await embed({ 70 | model: openai.embedding('text-embedding-3-small'), 71 | value: prompt, 72 | }); 73 | 74 | const result = await pgPool.query( 75 | /* sql */ ` 76 | SELECT 77 | id::int, 78 | content, 79 | metadata::text, 80 | embedding <=> $1::vector(1536) AS distance 81 | FROM ${schema}.timescale_chunks 82 | ORDER BY distance 83 | LIMIT $2 84 | `, 85 | [JSON.stringify(embedding), limit || 10], 86 | ); 87 | 88 | return { 89 | results: result.rows, 90 | }; 91 | }, 92 | pickResult: (r) => r.results, 93 | }); 94 | -------------------------------------------------------------------------------- /migrations/1756387543053-initial.js: -------------------------------------------------------------------------------- 1 | import 'dotenv/config'; 2 | import { Client } from 'pg'; 3 | 4 | const schema = process.env.DB_SCHEMA || 'docs'; 5 | 6 | export const description = 'Create schema and docs tables'; 7 | 8 | export async function up() { 9 | const client = new Client(); 10 | 11 | try { 12 | await client.connect(); 13 | await client.query('BEGIN'); 14 | await client.query(/* sql */ ` 15 | CREATE EXTENSION IF NOT EXISTS vector; 16 | 17 | CREATE TABLE ${schema}.postgres_pages ( 18 | id int4 PRIMARY KEY generated by default as identity 19 | , version int2 NOT NULL 20 | , url TEXT UNIQUE NOT NULL 21 | , domain TEXT NOT NULL 22 | , filename TEXT NOT NULL 23 | , content_length INTEGER 24 | , scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP 25 | , chunking_method TEXT DEFAULT 'header' 26 | , chunks_count INTEGER DEFAULT 0 27 | ); 28 | 29 | CREATE TABLE IF NOT EXISTS ${schema}.postgres_chunks ( 30 | id int4 PRIMARY KEY generated by default as identity 31 | , page_id INTEGER REFERENCES ${schema}.postgres_pages(id) ON DELETE CASCADE 32 | , chunk_index INTEGER NOT NULL 33 | , sub_chunk_index INTEGER NOT NULL DEFAULT 0 34 | , content TEXT NOT NULL 35 | , metadata JSONB 36 | , embedding vector(1536) 37 | , created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP 38 | ); 39 | 40 | CREATE TABLE ${schema}.timescale_pages ( 41 | id int4 PRIMARY KEY generated by default as identity 42 | , url TEXT UNIQUE NOT NULL 43 | , domain TEXT NOT NULL 44 | , filename TEXT NOT NULL 45 | , content_length INTEGER 46 | , scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP 47 | , chunking_method TEXT DEFAULT 'header' 48 | , chunks_count INTEGER DEFAULT 0 49 | ); 50 | 51 | CREATE TABLE IF NOT EXISTS ${schema}.timescale_chunks ( 52 | id int4 PRIMARY KEY generated by default as identity 53 | , page_id INTEGER REFERENCES ${schema}.timescale_pages(id) ON DELETE CASCADE 54 | , chunk_index INTEGER NOT NULL 55 | , sub_chunk_index INTEGER NOT NULL DEFAULT 0 56 | , content TEXT NOT NULL 57 | , metadata JSONB 58 | , embedding vector(1536) 59 | , created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP 60 | ); 61 | `); 62 | 63 | await client.query('COMMIT'); 64 | } catch (e) { 65 | await client.query('ROLLBACK'); 66 | throw e; 67 | } finally { 68 | await client.end(); 69 | } 70 | } 71 | 72 | export async function down() { 73 | const client = new Client(); 74 | 75 | try { 76 | await client.connect(); 77 | await client.query(/* sql */ ` 78 | DROP TABLE IF EXISTS ${schema}.timescale_chunks; 79 | DROP TABLE IF 
EXISTS ${schema}.timescale_pages; 80 | DROP TABLE IF EXISTS ${schema}.postgres_chunks; 81 | DROP TABLE IF EXISTS ${schema}.postgres_pages; 82 | `); 83 | } finally { 84 | await client.end(); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /ingest/README.md: -------------------------------------------------------------------------------- 1 | # Ingest 2 | 3 | ## Setup 4 | 5 | ### Prerequisites 6 | 7 | - [`uv`](https://docs.astral.sh/uv/) 8 | - Docbook Toolsets for building PostgreSQL docs 9 | (see [this page](https://www.postgresql.org/docs/current/docguide-toolsets.html) 10 | for installing for specific platforms) 11 | 12 | ### Install Dependencies 13 | 14 | ```bash 15 | uv sync 16 | ``` 17 | 18 | ## Running the ingest 19 | 20 | ### PostgreSQL Documentation 21 | 22 | ```text 23 | $ uv run python postgres_docs.py --help 24 | usage: postgres_docs.py [-h] version 25 | 26 | Ingest Postgres documentation into the database. 27 | 28 | positional arguments: 29 | version Postgres version to ingest 30 | 31 | options: 32 | -h, --help show this help message and exit 33 | ``` 34 | 35 | ### Tiger Documentation 36 | 37 | ```text 38 | uv run python tiger_docs.py --help 39 | usage: tiger_docs.py [-h] [--domain DOMAIN] [-o OUTPUT_DIR] [-m MAX_PAGES] [--strip-images] [--no-strip-images] [--chunk] [--no-chunk] [--chunking {header,semantic}] [--storage-type {file,database}] [--database-uri DATABASE_URI] 40 | [--skip-indexes] [--delay DELAY] [--concurrent CONCURRENT] [--log-level {DEBUG,INFO,WARNING,ERROR}] [--user-agent USER_AGENT] 41 | 42 | Scrape websites using sitemaps and convert to chunked markdown for RAG applications 43 | 44 | options: 45 | -h, --help show this help message and exit 46 | --domain, -d DOMAIN Domain to scrape (e.g., docs.tigerdata.com) 47 | -o, --output-dir OUTPUT_DIR 48 | Output directory for scraped files (default: scraped_docs) 49 | -m, --max-pages MAX_PAGES 50 | Maximum number of pages to scrape (default: unlimited) 51 | --strip-images Strip data: images from content (default: True) 52 | --no-strip-images Keep data: images in content 53 | --chunk Enable content chunking (default: True) 54 | --no-chunk Disable content chunking 55 | --chunking {header,semantic} 56 | Chunking method: header (default) or semantic (requires OPENAI_API_KEY) 57 | --storage-type {file,database} 58 | Storage type: database (default) or file 59 | --database-uri DATABASE_URI 60 | PostgreSQL connection URI (default: uses DB_URL from environment) 61 | --skip-indexes Skip creating database indexes after import (for development/testing) 62 | --delay DELAY Download delay in seconds (default: 1.0) 63 | --concurrent CONCURRENT 64 | Maximum concurrent requests (default: 4) 65 | --log-level {DEBUG,INFO,WARNING,ERROR} 66 | Logging level (default: INFO) 67 | --user-agent USER_AGENT 68 | User agent string 69 | 70 | Examples: 71 | tiger_docs.py docs.tigerdata.com 72 | tiger_docs.py docs.tigerdata.com -o tiger_docs -m 50 73 | tiger_docs.py docs.tigerdata.com -o semantic_docs -m 5 --chunking semantic 74 | tiger_docs.py docs.tigerdata.com --no-chunk --no-strip-images -m 100 75 | tiger_docs.py docs.tigerdata.com --storage-type database --database-uri postgresql://user:pass@host:5432/dbname 76 | tiger_docs.py docs.tigerdata.com --storage-type database --chunking semantic -m 10 77 | ``` 78 | -------------------------------------------------------------------------------- /src/apis/semanticSearchPostgresDocs.ts: 
-------------------------------------------------------------------------------- 1 | import { ApiFactory, InferSchema } from '@tigerdata/mcp-boilerplate'; 2 | import { openai } from '@ai-sdk/openai'; 3 | import { embed } from 'ai'; 4 | import { z } from 'zod'; 5 | import { ServerContext } from '../types.js'; 6 | 7 | const inputSchema = { 8 | version: z 9 | .enum(['14', '15', '16', '17', '18']) 10 | .describe( 11 | 'The PostgreSQL major version to use for the query. Recommended to assume the latest version if unknown.', 12 | ), 13 | limit: z.coerce 14 | .number() 15 | .int() 16 | .describe('The maximum number of matches to return. Defaults to 10.'), 17 | prompt: z 18 | .string() 19 | .describe( 20 | 'The natural language query used to search the PostgreSQL documentation for relevant information.', 21 | ), 22 | } as const; 23 | 24 | const zEmbeddedDoc = z.object({ 25 | id: z 26 | .number() 27 | .int() 28 | .describe('The unique identifier of the documentation entry.'), 29 | content: z.string().describe('The content of the documentation entry.'), 30 | metadata: z 31 | .string() 32 | .describe( 33 | 'Additional metadata about the documentation entry, as a JSON encoded string.', 34 | ), 35 | distance: z 36 | .number() 37 | .describe( 38 | 'The distance score indicating the relevance of the entry to the prompt. Lower values indicate higher relevance.', 39 | ), 40 | }); 41 | 42 | type EmbeddedDoc = z.infer; 43 | 44 | const outputSchema = { 45 | results: z.array(zEmbeddedDoc), 46 | } as const; 47 | 48 | type OutputSchema = InferSchema; 49 | 50 | export const semanticSearchPostgresDocsFactory: ApiFactory< 51 | ServerContext, 52 | typeof inputSchema, 53 | typeof outputSchema, 54 | z.infer<(typeof outputSchema)['results']> 55 | > = ({ pgPool, schema }) => ({ 56 | name: 'semantic_search_postgres_docs', 57 | method: 'get', 58 | route: '/semantic-search/postgres-docs', 59 | config: { 60 | title: 'Semantic Search of PostgreSQL Documentation Embeddings', 61 | description: 62 | 'This retrieves relevant PostgreSQL documentation entries based on a natural language query.', 63 | inputSchema, 64 | outputSchema, 65 | }, 66 | fn: async ({ prompt, version, limit }): Promise => { 67 | if (limit < 0) { 68 | throw new Error('Limit must be a non-negative integer.'); 69 | } 70 | if (!prompt.trim()) { 71 | throw new Error('Prompt must be a non-empty string.'); 72 | } 73 | 74 | const { embedding } = await embed({ 75 | model: openai.embedding('text-embedding-3-small'), 76 | value: prompt, 77 | }); 78 | 79 | const result = await pgPool.query( 80 | /* sql */ ` 81 | SELECT 82 | c.id::int, 83 | c.content, 84 | c.metadata::text, 85 | c.embedding <=> $1::vector(1536) AS distance 86 | FROM ${schema}.postgres_chunks c 87 | JOIN ${schema}.postgres_pages p ON c.page_id = p.id 88 | WHERE p.version = $2 89 | ORDER BY distance 90 | LIMIT $3 91 | `, 92 | [JSON.stringify(embedding), version, limit || 10], 93 | ); 94 | 95 | return { 96 | results: result.rows, 97 | }; 98 | }, 99 | pickResult: (r) => r.results, 100 | }); 101 | -------------------------------------------------------------------------------- /src/migrate.ts: -------------------------------------------------------------------------------- 1 | import migrate from 'migrate'; 2 | import path from 'path'; 3 | import { Client } from 'pg'; 4 | import { createHash } from 'crypto'; 5 | import { fileURLToPath } from 'url'; 6 | import { schema } from './config.js'; 7 | 8 | // Use a hash of the project name 9 | const hash = createHash('sha256').update('pg-aiguide').digest('hex'); 10 | 
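// Only the first 15 hex digits are used so the derived key stays well inside
// PostgreSQL's 64-bit advisory lock key range while remaining deterministic:
// every server instance computes the same ID and contends on the same lock,
// ensuring a single instance runs migrations at a time.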
const MIGRATION_ADVISORY_LOCK_ID = parseInt(hash.substring(0, 15), 16); 11 | 12 | const __filename = fileURLToPath(import.meta.url); 13 | const __dirname = path.dirname(__filename); 14 | 15 | const createStateStore = (): { 16 | load(callback: (err: Error | null, set?: unknown) => void): Promise; 17 | save(set: unknown, callback: (err: Error | null) => void): Promise; 18 | close(): Promise; 19 | } => { 20 | let client: Client; 21 | 22 | return { 23 | async load( 24 | callback: (err: Error | null, set?: unknown) => void, 25 | ): Promise { 26 | try { 27 | client = new Client(); 28 | await client.connect(); 29 | 30 | // Acquire advisory lock to prevent concurrent migrations 31 | await client.query(/* sql */ `SELECT pg_advisory_lock($1)`, [ 32 | MIGRATION_ADVISORY_LOCK_ID, 33 | ]); 34 | 35 | // Ensure schema exists 36 | await client.query(/* sql */ ` 37 | CREATE SCHEMA IF NOT EXISTS ${schema}; 38 | `); 39 | 40 | // Ensure migrations table exists 41 | await client.query(/* sql */ ` 42 | CREATE TABLE IF NOT EXISTS ${schema}.migrations ( 43 | id SERIAL PRIMARY KEY, 44 | set JSONB NOT NULL, 45 | applied_at TIMESTAMPTZ DEFAULT CURRENT_TIMESTAMP 46 | ); 47 | `); 48 | 49 | // Load the most recent migration set 50 | const result = await client.query( 51 | /* sql */ `SELECT set FROM ${schema}.migrations ORDER BY applied_at DESC LIMIT 1`, 52 | ); 53 | 54 | const set = result.rows.length > 0 ? result.rows[0].set : {}; 55 | callback(null, set); 56 | } catch (error) { 57 | callback(error as Error); 58 | } 59 | }, 60 | 61 | async save( 62 | set: unknown, 63 | callback: (err: Error | null) => void, 64 | ): Promise { 65 | try { 66 | // Insert the entire set as JSONB 67 | await client.query( 68 | /* sql */ `INSERT INTO ${schema}.migrations (set) VALUES ($1)`, 69 | [JSON.stringify(set)], 70 | ); 71 | 72 | callback(null); 73 | } catch (error) { 74 | callback(error as Error); 75 | } 76 | }, 77 | 78 | async close(): Promise { 79 | if (client) { 80 | // Release advisory lock 81 | await client.query(/* sql */ `SELECT pg_advisory_unlock($1)`, [ 82 | MIGRATION_ADVISORY_LOCK_ID, 83 | ]); 84 | await client.end(); 85 | } 86 | }, 87 | }; 88 | }; 89 | 90 | export const runMigrations = async (): Promise => { 91 | return new Promise((resolve, reject) => { 92 | const stateStore = createStateStore(); 93 | 94 | migrate.load( 95 | { 96 | stateStore, 97 | migrationsDirectory: path.join(__dirname, '..', 'migrations'), 98 | }, 99 | (err, set) => { 100 | if (err) { 101 | stateStore.close().finally(() => reject(err)); 102 | return; 103 | } 104 | 105 | set.up((err) => { 106 | stateStore.close().finally(() => { 107 | if (err) { 108 | reject(err); 109 | } else { 110 | resolve(); 111 | } 112 | }); 113 | }); 114 | }, 115 | ); 116 | }); 117 | }; 118 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | name: Publish Package 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'v*.*.*' 7 | 8 | permissions: 9 | # Required for OIDC authentication to npm (https://docs.npmjs.com/trusted-publishers) 10 | id-token: write 11 | packages: write 12 | contents: read 13 | 14 | jobs: 15 | publish-npm: 16 | name: Publish package to npm 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v5 21 | - name: Setup Node.js 22 | uses: actions/setup-node@v5 23 | with: 24 | node-version: 22 25 | cache: 'npm' 26 | - name: Update npm 27 | run: npm install -g npm@latest 28 | - name: Install 
dependencies 29 | run: npm ci 30 | - name: Build 31 | run: npm run build --if-present 32 | - name: Publish 33 | run: npm publish --access public 34 | 35 | publish-docker: 36 | name: Publish Docker Images 37 | runs-on: ubuntu-latest 38 | steps: 39 | - name: Checkout 40 | uses: actions/checkout@v5 41 | - name: Docker meta 42 | id: meta 43 | uses: docker/metadata-action@v5 44 | with: 45 | images: | 46 | docker.io/timescale/pg-aiguide 47 | ghcr.io/timescale/pg-aiguide 48 | tags: | 49 | type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', 'main') }} 50 | type=semver,pattern={{version}} 51 | type=semver,pattern={{major}}.{{minor}} 52 | type=semver,pattern={{major}} 53 | - name: Set up QEMU 54 | uses: docker/setup-qemu-action@v3 55 | - name: Set up Docker Buildx 56 | uses: docker/setup-buildx-action@v3 57 | - name: Login to Docker Hub 58 | uses: docker/login-action@v3 59 | with: 60 | username: ${{ secrets.ORG_DOCKER_HUB_USERNAME }} 61 | password: ${{ secrets.ORG_DOCKER_HUB_ACCESS_TOKEN }} 62 | - name: Login to GitHub Container Registry 63 | uses: docker/login-action@v3 64 | with: 65 | registry: ghcr.io 66 | username: ${{ github.repository_owner }} 67 | password: ${{ secrets.GITHUB_TOKEN }} 68 | - name: Build and push 69 | uses: docker/build-push-action@v6 70 | with: 71 | context: . 72 | platforms: linux/amd64,linux/arm64 73 | push: true 74 | tags: ${{ steps.meta.outputs.tags }} 75 | labels: ${{ steps.meta.outputs.labels }} 76 | 77 | publish-mcp: 78 | name: Publish package to MCP Registry 79 | runs-on: ubuntu-latest 80 | needs: 81 | - publish-npm 82 | - publish-docker 83 | steps: 84 | - name: Checkout 85 | uses: actions/checkout@v5 86 | - name: Install mcp-publisher 87 | run: | 88 | curl -L "https://github.com/modelcontextprotocol/registry/releases/latest/download/mcp-publisher_$(uname -s | tr '[:upper:]' '[:lower:]')_$(uname -m | sed 's/x86_64/amd64/;s/aarch64/arm64/').tar.gz" | tar xz mcp-publisher 89 | - name: Authenticate to MCP Registry 90 | run: ./mcp-publisher login github-oidc 91 | - name: Set version in server.json 92 | run: | 93 | ./bun ./generate-server.json.ts ${{ github.ref_name }} 94 | - name: Publish server to MCP Registry 95 | run: ./mcp-publisher publish 96 | 97 | notify-publish: 98 | name: Notify publish to Slack 99 | runs-on: ubuntu-latest 100 | needs: 101 | - publish-npm 102 | - publish-docker 103 | - publish-mcp 104 | if: success() 105 | steps: 106 | - name: Set version 107 | id: version 108 | run: echo "number=${GITHUB_REF_NAME#v}" >> $GITHUB_OUTPUT 109 | 110 | - name: Post to Slack 111 | uses: slackapi/slack-github-action@v2.1.1 112 | with: 113 | method: chat.postMessage 114 | token: ${{ secrets.SLACK_BOT_TOKEN }} 115 | payload: | 116 | channel: ${{ secrets.SLACK_CHANNEL_ID }} 117 | unfurl_links: false 118 | unfurl_media: false 119 | text: "pg-aiguide ${{ github.ref_name }} published" 120 | blocks: 121 | - type: "markdown" 122 | text: | 123 | **pg-aiguide ${{ github.ref_name }} published** 124 | [workflow run](${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}) | [npm](https://www.npmjs.com/package/@tigerdata/pg-aiguide/v/${{ steps.version.outputs.number }}) | [mcp registry](https://registry.modelcontextprotocol.io/?q=pg-aiguide) | [ghcr.io](https://ghcr.io/timescale/pg-aiguide) | [docker.io](https://hub.docker.com/r/timescale/pg-aiguide) 125 | -------------------------------------------------------------------------------- /DEVELOPMENT.md: -------------------------------------------------------------------------------- 1 | 
# Development Guide 2 | 3 | ## Getting Started 4 | 5 | Clone the repo. 6 | 7 | ```bash 8 | git clone git@github.com:timescale/pg-aiguide.git 9 | ``` 10 | 11 | ## Configuration 12 | 13 | Create a `.env` file based on the `.env.sample` file. 14 | 15 | ```bash 16 | cp .env.sample .env 17 | ``` 18 | 19 | Add your OPENAI_API_KEY to be used for generating embeddings. 20 | 21 | ### Configuration Parameters 22 | 23 | The server supports disabling MCP skills through different mechanisms for each transport: 24 | 25 | #### HTTP Transport 26 | 27 | Pass parameters as query strings: 28 | 29 | ``` 30 | https://mcp.tigerdata.com/docs?disable_mcp_skills=1 31 | ``` 32 | 33 | #### Stdio Transport 34 | 35 | Use environment variables in the connection configuration: 36 | 37 | ```json 38 | { 39 | "mcpServers": { 40 | "pg-aiguide": { 41 | "command": "node", 42 | "args": ["/path/to/dist/index.js", "stdio"], 43 | "env": { 44 | "DISABLE_MCP_SKILLS": "1" 45 | } 46 | } 47 | } 48 | } 49 | ``` 50 | 51 | Or when running directly: 52 | 53 | ```bash 54 | DISABLE_MCP_SKILLS=1 node dist/index.js stdio 55 | ``` 56 | 57 | #### Available Parameters 58 | 59 | | Parameter | HTTP Query | Stdio Env Var | Values | Description | 60 | | ------------------ | -------------------- | -------------------- | --------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | 61 | | Disable MCP Skills | `disable_mcp_skills` | `DISABLE_MCP_SKILLS` | 1 or true | Disable all MCP skills (tools and prompt templates). This removes the `view_skill` tool and all skill-based prompt templates from the available capabilities. | 62 | 63 | **Examples:** 64 | 65 | - HTTP: `?disable_mcp_skills=1` 66 | - Stdio: `DISABLE_MCP_SKILLS=1` 67 | - Default (skills enabled): No parameter needed 68 | 69 | ## Run a TimescaleDB Database 70 | 71 | You will need a database with the [pgvector extension](https://github.com/pgvector/pgvector). 72 | 73 | ### Using Tiger Cloud 74 | 75 | Use the [tiger CLI](https://github.com/timescale/tiger-cli) to create a Tiger Cloud service. 76 | 77 | ```bash 78 | tiger service create --free --with-password -o json 79 | ``` 80 | 81 | Copy your database connection parameters into your .env file. 82 | 83 | ### Using Docker 84 | 85 | Run the database in a docker container. 86 | 87 | ```bash 88 | # pull the latest image 89 | docker pull timescale/timescaledb-ha:pg17 90 | 91 | # run the database container 92 | docker run -d --name pg-aiguide \ 93 | -e POSTGRES_PASSWORD=password \ 94 | -e POSTGRES_DB=tsdb \ 95 | -e POSTGRES_USER=tsdbadmin \ 96 | -p 127.0.0.1:5432:5432 \ 97 | timescale/timescaledb-ha:pg17 98 | ``` 99 | 100 | Copy your database connection parameters to your .env file: 101 | 102 | ```dotenv 103 | PGHOST=localhost 104 | PGPORT=5432 105 | PGDATABASE=tsdb 106 | PGUSER=tsdbadmin 107 | PGPASSWORD=password 108 | ``` 109 | 110 | ## Building the MCP Server 111 | 112 | Run `npm i` to install dependencies and build the project. Use `npm run watch` to rebuild on changes. 113 | 114 | ## Loading the Database 115 | 116 | The database is NOT preloaded with the documentation. To make the MCP server usable, you need to scrape, chunk, embed, load, and index the documentation. 117 | Follow the [directions in the ingest directory](/ingest/README.md) to load the database. 118 | 119 | ## Testing 120 | 121 | The MCP Inspector is a very handy to exercise the MCP server from a web-based UI. 
122 | 123 | ```bash 124 | npm run inspector 125 | ``` 126 | 127 | | Field | Value | 128 | | -------------- | --------------- | 129 | | Transport Type | `STDIO` | 130 | | Command | `node` | 131 | | Arguments | `dist/index.js` | 132 | 133 | ### Testing in Claude Desktop 134 | 135 | Create/edit the file `~/Library/Application Support/Claude/claude_desktop_config.json` to add an entry like the following, making sure to use the absolute path to your local `pg-aiguide` project, and real database credentials. 136 | 137 | ```json 138 | { 139 | "mcpServers": { 140 | "pg-aiguide": { 141 | "command": "node", 142 | "args": ["/absolute/path/to/pg-aiguide/dist/index.js", "stdio"], 143 | "env": { 144 | "PGHOST": "x.y.tsdb.cloud.timescale.com", 145 | "PGDATABASE": "tsdb", 146 | "PGPORT": "32467", 147 | "PGUSER": "readonly_mcp_user", 148 | "PGPASSWORD": "abc123", 149 | "DB_SCHEMA": "docs", 150 | "OPENAI_API_KEY": "sk-svcacct" 151 | } 152 | } 153 | } 154 | } 155 | ``` 156 | -------------------------------------------------------------------------------- /src/skillutils/index.ts: -------------------------------------------------------------------------------- 1 | import { dirname, join } from 'path'; 2 | import { fileURLToPath } from 'url'; 3 | import { readdir, readFile } from 'fs/promises'; 4 | import matter from 'gray-matter'; 5 | import { z } from 'zod'; 6 | import { log, type PromptFactory } from '@tigerdata/mcp-boilerplate'; 7 | import { ServerContext } from '../types.js'; 8 | 9 | const __dirname = dirname(fileURLToPath(import.meta.url)); 10 | // Skills directory at repo root level 11 | const skillsDir = join(__dirname, '..', '..', 'skills'); 12 | 13 | // ===== Skill Types ===== 14 | 15 | export const zSkillMatter = z.object({ 16 | name: z.string().trim().min(1), 17 | description: z.string(), 18 | }); 19 | export type SkillMatter = z.infer; 20 | 21 | export const zSkill = z.object({ 22 | path: z.string(), 23 | name: z.string(), 24 | description: z.string(), 25 | }); 26 | export type Skill = z.infer; 27 | 28 | // ===== Skill Loading Implementation ===== 29 | 30 | // Cache for skill content 31 | const skillContentCache: Map = new Map(); 32 | let skillMapPromise: Promise> | null = null; 33 | 34 | /** 35 | * Parse a SKILL.md file and validate its metadata 36 | */ 37 | const parseSkillFile = async ( 38 | fileContent: string, 39 | ): Promise<{ 40 | matter: SkillMatter; 41 | content: string; 42 | }> => { 43 | const { data, content } = matter(fileContent); 44 | const skillMatter = zSkillMatter.parse(data); 45 | 46 | // Normalize skill name 47 | if (!/^[a-zA-Z0-9-_]+$/.test(skillMatter.name)) { 48 | const normalized = skillMatter.name 49 | .toLowerCase() 50 | .replace(/\s+/g, '-') 51 | .replace(/[^a-z0-9-_]/g, '_') 52 | .replace(/-[-_]+/g, '-') 53 | .replace(/_[_-]+/g, '_') 54 | .replace(/(^[-_]+)|([-_]+$)/g, ''); 55 | log.warn( 56 | `Skill name "${skillMatter.name}" contains invalid characters. Normalizing to "${normalized}".`, 57 | ); 58 | skillMatter.name = normalized; 59 | } 60 | 61 | return { 62 | matter: skillMatter, 63 | content: content.trim(), 64 | }; 65 | }; 66 | 67 | /** 68 | * Load all skills from the filesystem 69 | */ 70 | async function doLoadSkills(): Promise> { 71 | const skills = new Map(); 72 | skillContentCache.clear(); 73 | 74 | const alreadyExists = (name: string, path: string): boolean => { 75 | const existing = skills.get(name); 76 | if (existing) { 77 | log.warn( 78 | `Skill with name "${name}" already loaded from path "${existing.path}". 
Skipping duplicate at path "${path}".`, 79 | ); 80 | return true; 81 | } 82 | return false; 83 | }; 84 | 85 | const loadLocalPath = async (path: string): Promise => { 86 | const skillPath = join(path, 'SKILL.md'); 87 | try { 88 | const fileContent = await readFile(skillPath, 'utf-8'); 89 | const { 90 | matter: { name, description }, 91 | content, 92 | } = await parseSkillFile(fileContent); 93 | 94 | if (alreadyExists(name, path)) return; 95 | 96 | skills.set(name, { 97 | path, 98 | name, 99 | description, 100 | }); 101 | 102 | skillContentCache.set(`${name}/SKILL.md`, content); 103 | } catch (err) { 104 | log.error(`Failed to load skill at path: ${skillPath}`, err as Error); 105 | } 106 | }; 107 | 108 | try { 109 | // Load skills from subdirectories with SKILL.md files 110 | const dirEntries = await readdir(skillsDir, { withFileTypes: true }); 111 | for (const entry of dirEntries) { 112 | if (!entry.isDirectory()) continue; 113 | await loadLocalPath(join(skillsDir, entry.name)); 114 | } 115 | 116 | if (skills.size === 0) { 117 | log.warn( 118 | 'No skills found. Please add SKILL.md files to the skills/ subdirectories.', 119 | ); 120 | } else { 121 | log.info(`Successfully loaded ${skills.size} skill(s)`); 122 | } 123 | } catch (err) { 124 | log.error('Failed to load skills', err as Error); 125 | } 126 | 127 | return skills; 128 | } 129 | 130 | /** 131 | * Load skills with caching 132 | */ 133 | export const loadSkills = async ( 134 | force = false, 135 | ): Promise> => { 136 | if (skillMapPromise && !force) { 137 | return skillMapPromise; 138 | } 139 | 140 | skillMapPromise = doLoadSkills().catch((err) => { 141 | log.error('Failed to load skills', err as Error); 142 | skillMapPromise = null; 143 | return new Map(); 144 | }); 145 | 146 | return skillMapPromise; 147 | }; 148 | 149 | /** 150 | * View skill content 151 | */ 152 | export const viewSkillContent = async ( 153 | name: string, 154 | targetPath = 'SKILL.md', 155 | ): Promise => { 156 | const skillsMap = await loadSkills(); 157 | const skill = skillsMap.get(name); 158 | if (!skill) { 159 | throw new Error(`Skill not found: ${name}`); 160 | } 161 | 162 | const cacheKey = `${name}/${targetPath}`; 163 | const cached = skillContentCache.get(cacheKey); 164 | if (cached) { 165 | return cached; 166 | } 167 | 168 | // Read from filesystem 169 | try { 170 | const fullPath = join(skill.path, targetPath); 171 | const content = await readFile(fullPath, 'utf-8'); 172 | skillContentCache.set(cacheKey, content); 173 | return content; 174 | } catch { 175 | throw new Error(`Failed to read skill content: ${name}/${targetPath}`); 176 | } 177 | }; 178 | 179 | // Initialize skills on module load 180 | export const skills = await loadSkills(); 181 | 182 | interface PromptResult { 183 | [x: string]: unknown; 184 | description: string; 185 | messages: { 186 | role: 'user'; 187 | content: { 188 | type: 'text'; 189 | text: string; 190 | }; 191 | }[]; 192 | } 193 | 194 | // Export skills as prompt factories for MCP server 195 | export const promptFactories: PromptFactory< 196 | ServerContext, 197 | Record 198 | >[] = Array.from(skills.entries()).map(([name, skillData]) => () => ({ 199 | name, 200 | config: { 201 | // Using the dash-separated name as the title to work around a problem in Claude Code 202 | // See https://github.com/anthropics/claude-code/issues/7464 203 | title: name, 204 | description: skillData.description, 205 | inputSchema: {}, // No arguments for static skills 206 | }, 207 | fn: async (): Promise => { 208 | const content = await 
viewSkillContent(name); 209 | return { 210 | description: skillData.description || name, 211 | messages: [ 212 | { 213 | role: 'user' as const, 214 | content: { 215 | type: 'text' as const, 216 | text: content, 217 | }, 218 | }, 219 | ], 220 | }; 221 | }, 222 | })); 223 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pg-aiguide 2 | 3 | **AI-optimized PostgreSQL expertise for coding assistants** 4 | 5 | pg-aiguide helps AI coding tools write dramatically better PostgreSQL code. It provides: 6 | 7 | - **Semantic search** across the official PostgreSQL manual (version-aware) 8 | - **AI-optimized “skills”** — curated, opinionated Postgres best practices used automatically by AI agents 9 | - **Extension ecosystem docs**, starting with TimescaleDB, with more coming soon 10 | 11 | Use it either as: 12 | 13 | - a **public MCP server** that can be used with any AI coding agent, or 14 | - a **Claude Code plugin** optimized for use with Claude's native skill support. 15 | 16 | ## ⭐ Why pg-aiguide? 17 | 18 | AI coding tools often generate Postgres code that is: 19 | 20 | - outdated 21 | - missing constraints and indexes 22 | - unaware of modern PG features 23 | - inconsistent with real-world best practices 24 | 25 | pg-aiguide fixes that by giving AI agents deep, versioned PostgreSQL knowledge and proven patterns. 26 | 27 | ### See the difference 28 | 29 | https://github.com/user-attachments/assets/5a426381-09b5-4635-9050-f55422253a3d 30 | 31 |
32 | Video Transcript 33 | 34 | Prompt given to Claude Code: 35 | 36 | > Please describe the schema you would create for an e-commerce website two times, first with the tiger mcp server disabled, then with the tiger mcp server enabled. For each time, write the schema to its own file in the current working directory. Then compare the two files and let me know which approach generated the better schema, using both qualitative and quantitative reasons. For this example, only use standard Postgres. 37 | 38 | Result (summarized): 39 | 40 | - **4× more constraints** 41 | - **55% more indexes** (including partial/expression indexes) 42 | - **PG17-recommended patterns** 43 | - **Modern features** (`GENERATED ALWAYS AS IDENTITY`, `NULLS NOT DISTINCT`) 44 | - **Cleaner naming & documentation** 45 | 46 | Conclusion: _pg-aiguide produces more robust, performant, maintainable schemas._ 47 | 48 |
49 | 50 | ## 🚀 Quickstart 51 | 52 | pg-aiguide is available as a **public MCP server**: 53 | 54 | [https://mcp.tigerdata.com/docs](https://mcp.tigerdata.com/docs) 55 | 56 |
57 | Manual MCP configuration using JSON 58 | 59 | ```json 60 | { 61 | "mcpServers": { 62 | "pg-aiguide": { 63 | "url": "https://mcp.tigerdata.com/docs" 64 | } 65 | } 66 | } 67 | ``` 68 | 69 |
70 | 71 | Or it can be used as a **Claude Code Plugin**: 72 | 73 | ```bash 74 | claude plugin marketplace add timescale/pg-aiguide 75 | claude plugin install pg@aiguide 76 | ``` 77 | 78 | ### Install by environment 79 | 80 | #### One-click installs 81 | 82 | [![Install in Cursor](https://img.shields.io/badge/Install_in-Cursor-000000?style=flat-square&logoColor=white)](https://cursor.com/en/install-mcp?name=pg-aiguide&config=eyJuYW1lIjoicGctYWlndWlkZSIsInR5cGUiOiJodHRwIiwidXJsIjoiaHR0cHM6Ly9tY3AudGlnZXJkYXRhLmNvbS9kb2NzIn0=) 83 | [![Install in VS Code](https://img.shields.io/badge/Install_in-VS_Code-0098FF?style=flat-square&logo=visualstudiocode&logoColor=white)](https://vscode.dev/redirect/mcp/install?name=pg-aiguide&config=%7B%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fmcp.tigerdata.com%2Fdocs%22%7D) 84 | [![Install in VS Code Insiders](https://img.shields.io/badge/Install_in-VS_Code_Insiders-24bfa5?style=flat-square&logo=visualstudiocode&logoColor=white)](https://insiders.vscode.dev/redirect/mcp/install?name=pg-aiguide&config=%7B%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fmcp.tigerdata.com%2Fdocs%22%7D&quality=insiders) 85 | [![Install in Visual Studio](https://img.shields.io/badge/Install_in-Visual_Studio-C16FDE?style=flat-square&logo=visualstudio&logoColor=white)](https://vs-open.link/mcp-install?%7B%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fmcp.tigerdata.com%2Fdocs%22%7D) 86 | [![Install in Goose](https://block.github.io/goose/img/extension-install-dark.svg)](https://block.github.io/goose/extension?cmd=&arg=&id=pg-aiguide&name=pg-aiguide&description=MCP%20Server%20for%20pg-aiguide) 87 | [![Add MCP Server pg-aiguide to LM Studio](https://files.lmstudio.ai/deeplink/mcp-install-light.svg)](https://lmstudio.ai/install-mcp?name=pg-aiguide&config=eyJuYW1lIjoicGctYWlndWlkZSIsInR5cGUiOiJodHRwIiwidXJsIjoiaHR0cHM6Ly9tY3AudGlnZXJkYXRhLmNvbS9kb2NzIn0=) 88 | 89 |
90 | Claude Code 91 | 92 | This repo serves as a Claude Code plugin marketplace. To install, run: 93 | 94 | ```bash 95 | claude plugin marketplace add timescale/pg-aiguide 96 | claude plugin install pg@aiguide 97 | ``` 98 | 99 | This plugin uses the skills available in the `skills` directory as well as our 100 | publicly available MCP server endpoint hosted by TigerData for searching PostgreSQL documentation. 101 | 102 |
103 | 104 |
105 | Codex 106 | 107 | Run the following to add the MCP server to codex: 108 | 109 | ```bash 110 | codex mcp add --url "https://mcp.tigerdata.com/docs" pg-aiguide 111 | ``` 112 | 113 |
114 | 115 |
116 | Cursor 117 | 118 | One-click install: 119 | 120 | [![Install MCP Server](https://cursor.com/deeplink/mcp-install-dark.svg)](https://cursor.com/en-US/install-mcp?name=pg-aiguide&config=eyJ1cmwiOiJodHRwczovL21jcC50aWdlcmRhdGEuY29tL2RvY3MifQ%3D%3D) 121 | 122 | Or add the following to `.cursor/mcp.json` 123 | 124 | ```json 125 | { 126 | "mcpServers": { 127 | "pg-aiguide": { 128 | "url": "https://mcp.tigerdata.com/docs" 129 | } 130 | } 131 | } 132 | ``` 133 | 134 |
135 | 136 |
137 | Gemini CLI 138 | 139 | Run the following to add the MCP server to Gemini CLI: 140 | 141 | ```bash 142 | gemini mcp add -s user pg-aiguide "https://mcp.tigerdata.com/docs" -t http 143 | ``` 144 | 145 |
146 | 147 |
148 | Visual Studio 149 | 150 | Click the button to install: 151 | 152 | [![Install in Visual Studio](https://img.shields.io/badge/Install_in-Visual_Studio-C16FDE?style=flat-square&logo=visualstudio&logoColor=white)](https://vs-open.link/mcp-install?%7B%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fmcp.tigerdata.com%2Fdocs%22%7D) 153 | 154 |
155 | 156 |
157 | VS Code 158 | 159 | Click the button to install: 160 | 161 | [![Install in VS Code](https://img.shields.io/badge/Install_in-VS_Code-0098FF?style=flat-square&logo=visualstudiocode&logoColor=white)](https://vscode.dev/redirect/mcp/install?name=pg-aiguide&config=%7B%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fmcp.tigerdata.com%2Fdocs%22%7D) 162 | 163 | Alternatively, run the following to add the MCP server to VS Code: 164 | 165 | ```bash 166 | code --add-mcp '{"name":"pg-aiguide","type":"http","url":"https://mcp.tigerdata.com/docs"}' 167 | ``` 168 | 169 |
170 | 171 |
172 | VS Code Insiders 173 | 174 | Click the button to install: 175 | 176 | [![Install in VS Code Insiders](https://img.shields.io/badge/Install_in-VS_Code_Insiders-24bfa5?style=flat-square&logo=visualstudiocode&logoColor=white)](https://insiders.vscode.dev/redirect/mcp/install?name=pg-aiguide&config=%7B%22type%22%3A%22http%22%2C%22url%22%3A%22https%3A%2F%2Fmcp.tigerdata.com%2Fdocs%22%7D&quality=insiders) 177 | 178 | Alternatively, run the following to add the MCP server to VS Code Insiders: 179 | 180 | ```bash 181 | code-insiders --add-mcp '{"name":"pg-aiguide","type":"http","url":"https://mcp.tigerdata.com/docs"}' 182 | ``` 183 | 184 |
185 | 186 |
187 | Windsurf 188 | 189 | Add the following to `~/.codeium/windsurf/mcp_config.json` 190 | 191 | ```json 192 | { 193 | "mcpServers": { 194 | "pg-aiguide": { 195 | "serverUrl": "https://mcp.tigerdata.com/docs" 196 | } 197 | } 198 | } 199 | ``` 200 | 201 |
202 | 203 | ### 💡 Your First Prompt 204 | 205 | Once installed, pg-aiguide can answer Postgres questions or design schemas. 206 | 207 | **Simple schema example prompt** 208 | 209 | > Create a Postgres table schema for storing usernames and unique email addresses. 210 | 211 | **Complex schema example prompt** 212 | 213 | > You are a senior software engineer. You are given a task to generate a Postgres schema for an IoT device company. 214 | > The devices collect environmental data on a factory floor. The data includes temperature, humidity, pressure, as 215 | > the main data points as well as other measurements that vary from device to device. Each device has a unique id 216 | > and a human-readable name. We want to record the time the data was collected as well. Analysis for recent data 217 | > includes finding outliers and anomalies based on measurements, as well as analyzing the data of particular devices for ad-hoc analysis. Historical data analysis includes analyzing the history of data for one device or getting statistics for all devices over long periods of time. 218 | 219 | ## Features 220 | 221 | ### Semantic Search (MCP Tools) 222 | 223 | - [**`semantic_search_postgres_docs`**](API.md#semantic_search_postgres_docs) 224 | Performs semantic search over the official PostgreSQL manual, with results scoped to a specific Postgres version. 225 | 226 | - [**`semantic_search_tiger_docs`** ](API.md#semantic_search_tiger_docs) 227 | Searches Tiger Data’s documentation corpus, including TimescaleDB and future ecosystem extensions. 228 | 229 | ### Skills (AI-Optimized Best Practices) 230 | 231 | - **[`view_skill`](API.md#view_skill)** 232 | Exposes curated, opinionated PostgreSQL best-practice skills used automatically by AI coding assistants. 233 | 234 | These skills provide guidance on: 235 | - Schema design 236 | - Indexing strategies 237 | - Data types 238 | - Data integrity and constraints 239 | - Naming conventions 240 | - Performance tuning 241 | - Modern PostgreSQL features 242 | 243 | ## 🔌 Ecosystem Documentation 244 | 245 | Supported today: 246 | 247 | - **TimescaleDB** (docs + skills) 248 | 249 | Coming soon: 250 | 251 | - **pgvector** 252 | - **PostGIS** 253 | 254 | We welcome contributions for additional extensions and tools. 255 | 256 | ## 🛠 Development 257 | 258 | See [DEVELOPMENT.md](DEVELOPMENT.md) for: 259 | 260 | - running the MCP server locally 261 | - adding new skills 262 | - adding new docs 263 | 264 | ## 🤝 Contributing 265 | 266 | We welcome: 267 | 268 | - new Postgres best-practice skills 269 | - additional documentation corpora 270 | - search quality improvements 271 | - bug reports and feature ideas 272 | 273 | ## 📄 License 274 | 275 | Apache 2.0 276 | -------------------------------------------------------------------------------- /skills/find-hypertable-candidates/SKILL.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: find-hypertable-candidates 3 | description: Analyze an existing PostgreSQL database to identify tables that would benefit from conversion to TimescaleDB hypertables 4 | --- 5 | 6 | # PostgreSQL Hypertable Candidate Analysis 7 | 8 | Identify tables that would benefit from TimescaleDB hypertable conversion. After identification, use the companion "migrate-postgres-tables-to-hypertables" skill for configuration and migration. 
9 | 10 | ## TimescaleDB Benefits 11 | 12 | **Performance gains:** 90%+ compression, fast time-based queries, improved insert performance, efficient aggregations, continuous aggregates for materialization (dashboards, reports, analytics), automatic data management (retention, compression). 13 | 14 | **Best for insert-heavy patterns:** 15 | 16 | - Time-series data (sensors, metrics, monitoring) 17 | - Event logs (user events, audit trails, application logs) 18 | - Transaction records (orders, payments, financial) 19 | - Sequential data (auto-incrementing IDs with timestamps) 20 | - Append-only datasets (immutable records, historical) 21 | 22 | **Requirements:** Large volumes (1M+ rows), time-based queries, infrequent updates 23 | 24 | ## Step 1: Database Schema Analysis 25 | 26 | ### Option A: From Database Connection 27 | 28 | #### Table statistics and size 29 | 30 | ```sql 31 | -- Get all tables with row counts and insert/update patterns 32 | WITH table_stats AS ( 33 | SELECT 34 | schemaname, tablename, 35 | n_tup_ins as total_inserts, 36 | n_tup_upd as total_updates, 37 | n_tup_del as total_deletes, 38 | n_live_tup as live_rows, 39 | n_dead_tup as dead_rows 40 | FROM pg_stat_user_tables 41 | ), 42 | table_sizes AS ( 43 | SELECT 44 | schemaname, tablename, 45 | pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) as total_size, 46 | pg_total_relation_size(schemaname||'.'||tablename) as total_size_bytes 47 | FROM pg_tables 48 | WHERE schemaname NOT IN ('information_schema', 'pg_catalog') 49 | ) 50 | SELECT 51 | ts.schemaname, ts.tablename, ts.live_rows, 52 | tsize.total_size, tsize.total_size_bytes, 53 | ts.total_inserts, ts.total_updates, ts.total_deletes, 54 | ROUND(CASE WHEN ts.live_rows > 0 55 | THEN (ts.total_inserts::float / ts.live_rows) * 100 56 | ELSE 0 END, 2) as insert_ratio_pct 57 | FROM table_stats ts 58 | JOIN table_sizes tsize ON ts.schemaname = tsize.schemaname AND ts.tablename = tsize.tablename 59 | ORDER BY tsize.total_size_bytes DESC; 60 | ``` 61 | 62 | **Look for:** 63 | 64 | - mostly insert-heavy patterns (less updates/deletes) 65 | - big tables (1M+ rows or 100MB+) 66 | 67 | #### Index patterns 68 | 69 | ```sql 70 | -- Identify common query dimensions 71 | SELECT schemaname, tablename, indexname, indexdef 72 | FROM pg_indexes 73 | WHERE schemaname NOT IN ('information_schema', 'pg_catalog') 74 | ORDER BY tablename, indexname; 75 | ``` 76 | 77 | **Look for:** 78 | 79 | - Multiple indexes with timestamp/created_at columns → time-based queries 80 | - Composite (entity_id, timestamp) indexes → good candidates 81 | - Time-only indexes → time range filtering common 82 | 83 | #### Query patterns (if pg_stat_statements available) 84 | 85 | ```sql 86 | -- Check availability 87 | SELECT EXISTS (SELECT 1 FROM pg_extension WHERE extname = 'pg_stat_statements'); 88 | 89 | -- Analyze expensive queries for candidate tables 90 | SELECT query, calls, mean_exec_time, total_exec_time 91 | FROM pg_stat_statements 92 | WHERE query ILIKE '%your_table_name%' 93 | ORDER BY total_exec_time DESC LIMIT 20; 94 | ``` 95 | 96 | **✅ Good patterns:** Time-based WHERE, entity filtering combined with time-based qualifiers, GROUP BY time_bucket, range queries over time 97 | **❌ Poor patterns:** Non-time lookups with no time-based qualifiers in same query (WHERE email = ...) 
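For example, the contrast might look like this (a sketch — `events`, `user_id`, `event_time`, and `email` are placeholder names, not columns assumed to exist in your schema):

```sql
-- ✅ Good: entity filter combined with a time-range qualifier; benefits from chunk exclusion
SELECT user_id, COUNT(*)
FROM events
WHERE event_time >= NOW() - INTERVAL '24 hours'
  AND user_id = 42
GROUP BY user_id;

-- ❌ Poor: point lookup on a non-time column with no time qualifier; partitioning cannot help
SELECT * FROM events WHERE email = 'person@example.com';
```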
98 | 99 | #### Constraints 100 | 101 | ```sql 102 | -- Check migration compatibility 103 | SELECT conname, contype, pg_get_constraintdef(oid) as definition 104 | FROM pg_constraint 105 | WHERE conrelid = 'your_table_name'::regclass; 106 | ``` 107 | 108 | **Compatibility:** 109 | 110 | - Primary keys (p): Must include partition column or ask user if can be modified 111 | - Foreign keys (f): Plain→Hypertable and Hypertable→Plain OK, Hypertable→Hypertable NOT supported 112 | - Unique constraints (u): Must include partition column or ask user if can be modified 113 | - Check constraints (c): Usually OK 114 | 115 | ### Option B: From Code Analysis 116 | 117 | #### ✅ GOOD Patterns 118 | 119 | ```python 120 | # Append-only logging 121 | INSERT INTO events (user_id, event_time, data) VALUES (...); 122 | # Time-series collection 123 | INSERT INTO metrics (device_id, timestamp, value) VALUES (...); 124 | # Time-based queries 125 | SELECT * FROM metrics WHERE timestamp >= NOW() - INTERVAL '24 hours'; 126 | # Time aggregations 127 | SELECT DATE_TRUNC('day', timestamp), COUNT(*) GROUP BY 1; 128 | ``` 129 | 130 | #### ❌ POOR Patterns 131 | 132 | ```python 133 | # Frequent updates to historical records 134 | UPDATE users SET email = ..., updated_at = NOW() WHERE id = ...; 135 | # Non-time lookups 136 | SELECT * FROM users WHERE email = ...; 137 | # Small reference tables 138 | SELECT * FROM countries ORDER BY name; 139 | ``` 140 | 141 | #### Schema Indicators 142 | 143 | **✅ GOOD:** 144 | 145 | - Has timestamp/timestamptz column 146 | - Multiple indexes with timestamp-based columns 147 | - Composite (entity_id, timestamp) indexes 148 | 149 | **❌ POOR:** 150 | 151 | - Mostly indexes with non-time-based columns (on columns like email, name, status, etc.) 152 | - Columns that you expect to be updated over time (updated_at, updated_by, status, etc.) 153 | - Unique constraints on non-time fields 154 | - Frequent updated_at modifications 155 | - Small static tables 156 | 157 | #### Special Case: ID-Based Tables 158 | 159 | Sequential ID tables can be candidates if: 160 | 161 | - Insert-mostly pattern / updates are either infrequent or only on recent records. 162 | - If updates do happen, they occur on recent records (such as an order status being updated orderered->processing->delivered. Note once an order is delivered, it is unlikely to be updated again.) 163 | - IDs correlate with time (as is the case for serial/auto-incrementing IDs/GENERATED ALWAYS AS IDENTITY) 164 | - ID is the primary query dimension 165 | - Recent data accessed more often (frequently the case in ecommerce, finance, etc.) 166 | - Time-based reporting common (e.g. monthly, daily summaries/analytics) 167 | 168 | ```sql 169 | CREATE TABLE orders ( 170 | id BIGSERIAL PRIMARY KEY, -- Can partition by ID 171 | user_id BIGINT, 172 | created_at TIMESTAMPTZ DEFAULT NOW() -- For sparse indexes 173 | ); 174 | ``` 175 | 176 | Note: For ID-based tables where there is also a time column (created_at, ordered_at, etc.), 177 | you can partition by ID and use sparse indexes on the time column. 178 | See the `migrate-postgres-tables-to-hypertables` skill for details. 
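If it is unclear whether IDs actually track insertion time, a quick check such as the following can help (a sketch — `orders`, `id`, and `created_at` are placeholder names; a result close to 1.0 suggests ID order closely follows time order, making ID-based partitioning viable):

```sql
-- Correlation between a sequential id and the event time column.
-- Values near 1.0 mean newer rows reliably have higher ids.
SELECT corr(id::double precision,
            EXTRACT(EPOCH FROM created_at)::double precision) AS id_time_correlation
FROM orders;
```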
179 | 180 | ## Step 2: Candidacy Scoring (8+ points = good candidate) 181 | 182 | ### Time-Series Characteristics (5+ points needed) 183 | 184 | - Has timestamp/timestamptz column: **3 points** 185 | - Data inserted chronologically: **2 points** 186 | - Queries filter by time: **2 points** 187 | - Time aggregations common: **2 points** 188 | 189 | ### Scale & Performance (3+ points recommended) 190 | 191 | - Large table (1M+ rows or 100MB+): **2 points** 192 | - High insert volume: **1 point** 193 | - Infrequent updates to historical: **1 point** 194 | - Range queries common: **1 point** 195 | - Aggregation queries: **2 points** 196 | 197 | ### Data Patterns (bonus) 198 | 199 | - Contains entity ID for segmentation (device_id, user_id, product_id, symbol, etc.): **1 point** 200 | - Numeric measurements: **1 point** 201 | - Log/event structure: **1 point** 202 | 203 | ## Common Patterns 204 | 205 | ### ✅ GOOD Candidates 206 | 207 | **✅ Event/Log Tables** (user_events, audit_logs) 208 | 209 | ```sql 210 | CREATE TABLE user_events ( 211 | id BIGSERIAL PRIMARY KEY, 212 | user_id BIGINT, 213 | event_type TEXT, 214 | event_time TIMESTAMPTZ DEFAULT NOW(), 215 | metadata JSONB 216 | ); 217 | -- Partition by id, segment by user_id, enable minmax sparse_index on event_time 218 | ``` 219 | 220 | **✅ Sensor/IoT Data** (sensor_readings, telemetry) 221 | 222 | ```sql 223 | CREATE TABLE sensor_readings ( 224 | device_id TEXT, 225 | timestamp TIMESTAMPTZ, 226 | temperature DOUBLE PRECISION, 227 | humidity DOUBLE PRECISION 228 | ); 229 | -- Partition by timestamp, segment by device_id, minmax sparse indexes on temperature and humidity 230 | ``` 231 | 232 | **✅ Financial/Trading** (stock_prices, transactions) 233 | 234 | ```sql 235 | CREATE TABLE stock_prices ( 236 | symbol VARCHAR(10), 237 | price_time TIMESTAMPTZ, 238 | open_price DECIMAL, 239 | close_price DECIMAL, 240 | volume BIGINT 241 | ); 242 | -- Partition by price_time, segment by symbol, minmax sparse indexes on open_price and close_price and volume 243 | ``` 244 | 245 | **✅ System Metrics** (monitoring_data) 246 | 247 | ```sql 248 | CREATE TABLE system_metrics ( 249 | hostname TEXT, 250 | metric_time TIMESTAMPTZ, 251 | cpu_usage DOUBLE PRECISION, 252 | memory_usage BIGINT 253 | ); 254 | -- Partition by metric_time, segment by hostname, minmax sparse indexes on cpu_usage and memory_usage 255 | ``` 256 | 257 | ### ❌ POOR Candidates 258 | 259 | **❌ Reference Tables** (countries, categories) 260 | 261 | ```sql 262 | CREATE TABLE countries ( 263 | id SERIAL PRIMARY KEY, 264 | name VARCHAR(100), 265 | code CHAR(2) 266 | ); 267 | -- Static data, no time component 268 | ``` 269 | 270 | **❌ User Profiles** (users, accounts) 271 | 272 | ```sql 273 | CREATE TABLE users ( 274 | id BIGSERIAL PRIMARY KEY, 275 | email VARCHAR(255), 276 | created_at TIMESTAMPTZ, 277 | updated_at TIMESTAMPTZ 278 | ); 279 | -- Accessed by ID, frequently updated, has timestamp but it's not the primary query dimension (the primary query dimension is id or email) 280 | ``` 281 | 282 | **❌ Settings/Config** (user_settings) 283 | 284 | ```sql 285 | CREATE TABLE user_settings ( 286 | user_id BIGINT PRIMARY KEY, 287 | theme VARCHAR(20), -- Changes: light -> dark -> auto 288 | language VARCHAR(10), -- Changes: en -> es -> fr 289 | notifications JSONB, -- Frequent preference updates 290 | updated_at TIMESTAMPTZ 291 | ); 292 | -- Accessed by user_id, frequently updated, has timestamp but it's not the primary query dimension (the primary query dimension is user_id) 293 | ``` 294 | 295 | ## 
Analysis Output Requirements 296 | 297 | For each candidate table provide: 298 | 299 | - **Score:** Based on criteria (8+ = strong candidate) 300 | - **Pattern:** Insert vs update ratio 301 | - **Access:** Time-based vs entity lookups 302 | - **Size:** Current size and growth rate 303 | - **Queries:** Time-range, aggregations, point lookups 304 | 305 | Focus on insert-heavy patterns with time-based or sequential access. Tables scoring 8+ points are strong candidates for conversion. 306 | -------------------------------------------------------------------------------- /skills/migrate-postgres-tables-to-hypertables/SKILL.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: migrate-postgres-tables-to-hypertables 3 | description: Comprehensive guide for migrating PostgreSQL tables to TimescaleDB hypertables with optimal configuration and performance validation 4 | --- 5 | 6 | # PostgreSQL to TimescaleDB Hypertable Migration 7 | 8 | Migrate identified PostgreSQL tables to TimescaleDB hypertables with optimal configuration, migration planning and validation. 9 | 10 | **Prerequisites**: Tables already identified as hypertable candidates (use companion "find-hypertable-candidates" skill if needed). 11 | 12 | ## Step 1: Optimal Configuration 13 | 14 | ### Partition Column Selection 15 | 16 | ```sql 17 | -- Find potential partition columns 18 | SELECT column_name, data_type, is_nullable 19 | FROM information_schema.columns 20 | WHERE table_name = 'your_table_name' 21 | AND data_type IN ('timestamp', 'timestamptz', 'bigint', 'integer', 'date') 22 | ORDER BY ordinal_position; 23 | ``` 24 | 25 | **Requirements:** Time-based (TIMESTAMP/TIMESTAMPTZ/DATE) or sequential integer (INT/BIGINT) 26 | 27 | Should represent when the event actually occurred or sequential ordering. 28 | 29 | **Common choices:** 30 | 31 | - `timestamp`, `created_at`, `event_time` - when event occurred 32 | - `id`, `sequence_number` - auto-increment (for sequential data without timestamps) 33 | - `ingested_at` - less ideal, only if primary query dimension 34 | - `updated_at` - AVOID (records updated out of order, breaks chunk distribution) unless primary query dimension 35 | 36 | #### Special Case: table with BOTH ID AND Timestamp 37 | 38 | When table has sequential ID (PK) AND timestamp that correlate: 39 | 40 | ```sql 41 | -- Partition by ID, enable minmax sparse indexes on timestamp 42 | SELECT create_hypertable('orders', 'id', chunk_time_interval => 1000000); 43 | ALTER TABLE orders SET ( 44 | timescaledb.sparse_index = 'minmax(created_at),...' 45 | ); 46 | ``` 47 | 48 | Sparse indexes on time column enable skipping compressed blocks outside queried time ranges. 
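For example, with the configuration above, a time-window query like the following can skip compressed batches whose `created_at` range falls entirely outside the window, even though `orders` is partitioned by `id` (a sketch; adjust the interval to your workload):

```sql
-- The minmax sparse index on created_at prunes compressed batches
-- outside the last 7 days, despite the hypertable being partitioned by id.
SELECT *
FROM orders
WHERE created_at >= NOW() - INTERVAL '7 days';
```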
49 | 50 | Use when: ID correlates with time (newer records have higher IDs), need ID-based lookups, time queries also common 51 | 52 | ### Chunk Interval Selection 53 | 54 | ```sql 55 | -- Ensure statistics are current 56 | ANALYZE your_table_name; 57 | 58 | -- Estimate index size per time unit 59 | WITH time_range AS ( 60 | SELECT 61 | MIN(timestamp_column) as min_time, 62 | MAX(timestamp_column) as max_time, 63 | EXTRACT(EPOCH FROM (MAX(timestamp_column) - MIN(timestamp_column)))/3600 as total_hours 64 | FROM your_table_name 65 | ), 66 | total_index_size AS ( 67 | SELECT SUM(pg_relation_size(indexname::regclass)) as total_index_bytes 68 | FROM pg_stat_user_indexes 69 | WHERE schemaname||'.'||tablename = 'your_schema.your_table_name' 70 | ) 71 | SELECT 72 | pg_size_pretty(tis.total_index_bytes / tr.total_hours) as index_size_per_hour 73 | FROM time_range tr, total_index_size tis; 74 | ``` 75 | 76 | **Target:** Indexes of recent chunks < 25% of RAM 77 | **Default:** IMPORTANT: Keep default of 7 days if unsure 78 | **Range:** 1 hour minimum, 30 days maximum 79 | 80 | **Example:** 32GB RAM → target 8GB for recent indexes. If index_size_per_hour = 200MB: 81 | 82 | - 1 hour chunks: 200MB chunk index size × 40 recent = 8GB ✓ 83 | - 6 hour chunks: 1.2GB chunk index size × 7 recent = 8.4GB ✓ 84 | - 1 day chunks: 4.8GB chunk index size × 2 recent = 9.6GB ⚠️ 85 | Choose largest interval keeping 2+ recent chunk indexes under target. 86 | 87 | ### Primary Key / Unique Constraints Compatibility 88 | 89 | ```sql 90 | -- Check existing primary key / unique constraints on the table 91 | SELECT conname, pg_get_constraintdef(oid) as definition 92 | FROM pg_constraint 93 | WHERE conrelid = 'your_table_name'::regclass AND contype IN ('p', 'u'); 94 | ``` 95 | 96 | **Rules:** PK/UNIQUE must include partition column 97 | 98 | **Actions:** 99 | 100 | 1. **No PK/UNIQUE:** No changes needed 101 | 2. **PK/UNIQUE includes partition column:** No changes needed 102 | 3. **PK/UNIQUE excludes partition column:** ⚠️ **ASK USER PERMISSION** to modify PK/UNIQUE 103 | 104 | **Example: user prompt if needed:** 105 | 106 | > "Primary key (id) doesn't include partition column (timestamp). Must modify to PRIMARY KEY (id, timestamp) to convert to hypertable. This may break application code. Is this acceptable?" 107 | > "Unique constraint (id) doesn't include partition column (timestamp). Must modify to UNIQUE (id, timestamp) to convert to hypertable. This may break application code. Is this acceptable?" 108 | 109 | If the user accepts, modify the constraint: 110 | 111 | ```sql 112 | BEGIN; 113 | ALTER TABLE your_table_name DROP CONSTRAINT existing_pk_name; 114 | ALTER TABLE your_table_name ADD PRIMARY KEY (existing_columns, partition_column); 115 | COMMIT; 116 | ``` 117 | 118 | If the user does not accept, you should NOT migrate the table. 119 | 120 | IMPORTANT: DO NOT modify the primary key/unique constraint without user permission. 121 | 122 | ### Compression Configuration 123 | 124 | For detailed segment_by and order_by selection, see "setup-timescaledb-hypertables" skill.
Quick reference: 125 | 126 | **segment_by:** Most common WHERE filter with >100 rows per value per chunk 127 | 128 | - IoT: `device_id` 129 | - Finance: `symbol` 130 | - Analytics: `user_id` or `session_id` 131 | 132 | ```sql 133 | -- Analyze cardinality for segment_by selection 134 | SELECT column_name, COUNT(DISTINCT column_name) as unique_values, 135 | ROUND(COUNT(*)::float / COUNT(DISTINCT column_name), 2) as avg_rows_per_value 136 | FROM your_table_name GROUP BY column_name; 137 | ``` 138 | 139 | **order_by:** Usually `timestamp DESC`. The (segment_by, order_by) combination should form a natural time-series progression. 140 | 141 | - If column has <100 rows/chunk (too low for segment_by), prepend to order_by: `order_by='low_density_col, timestamp DESC'` 142 | 143 | **sparse indexes:** add minmax on the columns that are used in the WHERE clauses but are not in the segment_by or order_by. Use minmax for columns used in range queries. 144 | 145 | ```sql 146 | ALTER TABLE your_table_name SET ( 147 | timescaledb.enable_columnstore, 148 | timescaledb.segmentby = 'entity_id', 149 | timescaledb.orderby = 'timestamp DESC' 150 | timescaledb.sparse_index = 'minmax(value_1),...' 151 | ); 152 | 153 | -- Compress after data unlikely to change (adjust `after` parameter based on update patterns) 154 | CALL add_columnstore_policy('your_table_name', after => INTERVAL '7 days'); 155 | ``` 156 | 157 | ## Step 2: Migration Planning 158 | 159 | ### Pre-Migration Checklist 160 | 161 | - [ ] Partition column selected 162 | - [ ] Chunk interval calculated (or using default) 163 | - [ ] PK includes partition column OR user approved modification 164 | - [ ] No Hypertable→Hypertable foreign keys 165 | - [ ] Unique constraints include partition column 166 | - [ ] Created compression configuration (segment_by, order_by, sparse indexes, compression policy) 167 | - [ ] Maintenance window scheduled / backup created. 168 | 169 | ### Migration Options 170 | 171 | #### Option 1: In-Place (Tables < 1GB) 172 | 173 | ```sql 174 | -- Enable extension 175 | CREATE EXTENSION IF NOT EXISTS timescaledb; 176 | 177 | -- Convert to hypertable (locks table) 178 | SELECT create_hypertable( 179 | 'your_table_name', 180 | 'timestamp_column', 181 | chunk_time_interval => INTERVAL '7 days', 182 | if_not_exists => TRUE 183 | ); 184 | 185 | -- Configure compression 186 | ALTER TABLE your_table_name SET ( 187 | timescaledb.enable_columnstore, 188 | timescaledb.segmentby = 'entity_id', 189 | timescaledb.orderby = 'timestamp DESC', 190 | timescaledb.sparse_index = 'minmax(value_1),...' 191 | ); 192 | 193 | -- Adjust `after` parameter based on update patterns 194 | CALL add_columnstore_policy('your_table_name', after => INTERVAL '7 days'); 195 | ``` 196 | 197 | #### Option 2: Blue-Green (Tables > 1GB) 198 | 199 | ```sql 200 | -- 1. Create new hypertable 201 | CREATE TABLE your_table_name_new (LIKE your_table_name INCLUDING ALL); 202 | 203 | -- 2. Convert to hypertable 204 | SELECT create_hypertable('your_table_name_new', 'timestamp_column'); 205 | 206 | -- 3. Configure compression 207 | ALTER TABLE your_table_name_new SET ( 208 | timescaledb.enable_columnstore, 209 | timescaledb.segmentby = 'entity_id', 210 | timescaledb.orderby = 'timestamp DESC' 211 | ); 212 | 213 | -- 4. Migrate data in batches 214 | INSERT INTO your_table_name_new 215 | SELECT * FROM your_table_name 216 | WHERE timestamp_column >= '2024-01-01' AND timestamp_column < '2024-02-01'; 217 | -- Repeat for each time range 218 | 219 | -- 4. 
Enter maintenance window and do the following: 220 | 221 | -- 5. Pause modification of the old table. 222 | 223 | -- 6. Copy over the most recent data from the old table to the new table. 224 | 225 | -- 7. Swap tables 226 | BEGIN; 227 | ALTER TABLE your_table_name RENAME TO your_table_name_old; 228 | ALTER TABLE your_table_name_new RENAME TO your_table_name; 229 | COMMIT; 230 | 231 | -- 8. Exit maintenance window. 232 | 233 | -- 9. (sometime much later) Drop old table after validation 234 | -- DROP TABLE your_table_name_old; 235 | ``` 236 | 237 | ### Common Issues 238 | 239 | #### Foreign Keys 240 | 241 | ```sql 242 | -- Check foreign keys 243 | SELECT conname, confrelid::regclass as referenced_table 244 | FROM pg_constraint 245 | WHERE (conrelid = 'your_table_name'::regclass 246 | OR confrelid = 'your_table_name'::regclass) 247 | AND contype = 'f'; 248 | ``` 249 | 250 | **Supported:** Plain→Hypertable, Hypertable→Plain 251 | **NOT supported:** Hypertable→Hypertable 252 | 253 | ⚠️ **CRITICAL:** Hypertable→Hypertable FKs must be dropped (enforce in application). **ASK USER PERMISSION**. If no, **STOP MIGRATION**. 254 | 255 | #### Large Table Migration Time 256 | 257 | ```sql 258 | -- Rough estimate: ~75k rows/second 259 | SELECT 260 | pg_size_pretty(pg_total_relation_size(tablename)) as size, 261 | n_live_tup as rows, 262 | ROUND(n_live_tup / 75000.0 / 60, 1) as estimated_minutes 263 | FROM pg_stat_user_tables 264 | WHERE tablename = 'your_table_name'; 265 | ``` 266 | 267 | **Solutions for large tables (>1GB/10M rows):** Use blue-green migration, migrate during off-peak, test on subset first 268 | 269 | ## Step 3: Performance Validation 270 | 271 | ### Chunk & Compression Analysis 272 | 273 | ```sql 274 | -- View chunks and compression 275 | SELECT 276 | chunk_name, 277 | pg_size_pretty(total_bytes) as size, 278 | pg_size_pretty(compressed_total_bytes) as compressed_size, 279 | ROUND((total_bytes - compressed_total_bytes::numeric) / total_bytes * 100, 1) as compression_pct, 280 | range_start, 281 | range_end 282 | FROM timescaledb_information.chunks 283 | WHERE hypertable_name = 'your_table_name' 284 | ORDER BY range_start DESC; 285 | ``` 286 | 287 | **Look for:** 288 | 289 | - Consistent chunk sizes (within 2x) 290 | - Compression >90% for time-series 291 | - Recent chunks uncompressed 292 | - Chunk indexes < 25% RAM 293 | 294 | ### Query Performance Tests 295 | 296 | ```sql 297 | -- 1. Time-range query (should show chunk exclusion) 298 | EXPLAIN (ANALYZE, BUFFERS) 299 | SELECT COUNT(*), AVG(value) 300 | FROM your_table_name 301 | WHERE timestamp >= NOW() - INTERVAL '1 day'; 302 | 303 | -- 2. Entity + time query (benefits from segment_by) 304 | EXPLAIN (ANALYZE, BUFFERS) 305 | SELECT * FROM your_table_name 306 | WHERE entity_id = 'X' AND timestamp >= NOW() - INTERVAL '1 week'; 307 | 308 | -- 3. 
Aggregation (benefits from columnstore) 309 | EXPLAIN (ANALYZE, BUFFERS) 310 | SELECT DATE_TRUNC('hour', timestamp), entity_id, COUNT(*), AVG(value) 311 | FROM your_table_name 312 | WHERE timestamp >= NOW() - INTERVAL '1 month' 313 | GROUP BY 1, 2; 314 | ``` 315 | 316 | **✅ Good signs:** 317 | 318 | - "Chunks excluded during startup: X" in EXPLAIN plan 319 | - "Custom Scan (ColumnarScan)" for compressed data 320 | - Lower "Buffers: shared read" in EXPLAIN ANALYZE plan than pre-migration 321 | - Faster execution times 322 | 323 | **❌ Bad signs:** 324 | 325 | - "Seq Scan" on large chunks 326 | - No chunk exclusion messages 327 | - Slower than before migration 328 | 329 | ### Storage Metrics 330 | 331 | ```sql 332 | -- Monitor compression effectiveness 333 | SELECT 334 | hypertable_name, 335 | pg_size_pretty(total_bytes) as total_size, 336 | pg_size_pretty(compressed_total_bytes) as compressed_size, 337 | ROUND(compressed_total_bytes::numeric / total_bytes * 100, 1) as compressed_pct_of_total, 338 | ROUND((uncompressed_total_bytes - compressed_total_bytes::numeric) / 339 | uncompressed_total_bytes * 100, 1) as compression_ratio_pct 340 | FROM timescaledb_information.hypertables 341 | WHERE hypertable_name = 'your_table_name'; 342 | ``` 343 | 344 | **Monitor:** 345 | 346 | - compression_ratio_pct >90% (typical time-series) 347 | - compressed_pct_of_total growing as data ages 348 | - Size growth slowing significantly vs pre-hypertable 349 | - Decreasing compression_ratio_pct = poor segment_by 350 | 351 | ### Troubleshooting 352 | 353 | #### Poor Chunk Exclusion 354 | 355 | ```sql 356 | -- Verify chunks are being excluded 357 | EXPLAIN (ANALYZE, BUFFERS) 358 | SELECT * FROM your_table_name 359 | WHERE timestamp >= '2024-01-01' AND timestamp < '2024-01-02'; 360 | -- Look for "Chunks excluded during startup: X" 361 | ``` 362 | 363 | #### Poor Compression 364 | 365 | ```sql 366 | -- Get newest compressed chunk name 367 | SELECT chunk_name FROM timescaledb_information.chunks 368 | WHERE hypertable_name = 'your_table_name' 369 | AND compressed_total_bytes IS NOT NULL 370 | ORDER BY range_start DESC LIMIT 1; 371 | 372 | -- Analyze segment distribution 373 | SELECT segment_by_column, COUNT(*) as rows_per_segment 374 | FROM _timescaledb_internal._hyper_X_Y_chunk -- Use actual chunk name 375 | GROUP BY 1 ORDER BY 2 DESC; 376 | ``` 377 | 378 | **Look for:** <20 rows per segment: Poor segment_by choice (should be >100) => Low compression potential. 379 | 380 | #### Poor insert performance 381 | 382 | Check that you don't have too many indexes. Unused indexes hurt insert performance and should be dropped. 383 | 384 | ```sql 385 | SELECT 386 | schemaname, 387 | relname, 388 | indexrelname, 389 | idx_tup_read, 390 | idx_tup_fetch, 391 | idx_scan 392 | FROM pg_stat_user_indexes 393 | WHERE relname LIKE '%your_table_name%' 394 | ORDER BY idx_scan DESC; 395 | ``` 396 | 397 | **Look for:** Unused indexes, indicated by an idx_scan value at or near zero. Drop such indexes (but ask user permission). 
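If the user agrees, a minimal sketch for removing one of the unused indexes identified above (the index name is illustrative, not taken from this document):

```sql
-- Hypothetical unused index on the hypertable; dropping it reduces
-- per-insert maintenance work. Run during a quiet period if the table is busy.
DROP INDEX IF EXISTS idx_your_table_name_unused;
```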
398 | 399 | ### Ongoing Monitoring 400 | 401 | ```sql 402 | -- Monitor chunk compression status 403 | CREATE OR REPLACE VIEW hypertable_compression_status AS 404 | SELECT 405 | h.hypertable_name, 406 | COUNT(c.chunk_name) as total_chunks, 407 | COUNT(c.chunk_name) FILTER (WHERE c.compressed_total_bytes IS NOT NULL) as compressed_chunks, 408 | ROUND( 409 | COUNT(c.chunk_name) FILTER (WHERE c.compressed_total_bytes IS NOT NULL)::numeric / 410 | COUNT(c.chunk_name) * 100, 1 411 | ) as compression_coverage_pct, 412 | pg_size_pretty(SUM(c.total_bytes)) as total_size, 413 | pg_size_pretty(SUM(c.compressed_total_bytes)) as compressed_size 414 | FROM timescaledb_information.hypertables h 415 | LEFT JOIN timescaledb_information.chunks c ON h.hypertable_name = c.hypertable_name 416 | GROUP BY h.hypertable_name; 417 | 418 | -- Query this view regularly to monitor compression progress 419 | SELECT * FROM hypertable_compression_status 420 | WHERE hypertable_name = 'your_table_name'; 421 | ``` 422 | 423 | **Look for:** 424 | 425 | - compression_coverage_pct should increase over time as data ages and gets compressed. 426 | - total_chunks should not grow too quickly (more than 10000 becomes a problem). 427 | - You should not see unexpected spikes in total_size or compressed_size. 428 | 429 | ## Success Criteria 430 | 431 | **✅ Migration successful when:** 432 | 433 | - All queries return correct results 434 | - Query performance equal or better 435 | - Compression >90% for older data 436 | - Chunk exclusion working for time queries 437 | - Insert performance acceptable 438 | 439 | **❌ Investigate if:** 440 | 441 | - Query performance >20% worse 442 | - Compression <80% 443 | - No chunk exclusion 444 | - Insert performance degraded 445 | - Increased error rates 446 | 447 | Focus on high-volume, insert-heavy workloads with time-based access patterns for best ROI. 448 | -------------------------------------------------------------------------------- /skills/design-postgres-tables/SKILL.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: design-postgres-tables 3 | description: Comprehensive PostgreSQL-specific table design reference covering data types, indexing, constraints, performance patterns, and advanced features 4 | --- 5 | 6 | # PostgreSQL Table Design 7 | 8 | ## Core Rules 9 | 10 | - Define a **PRIMARY KEY** for reference tables (users, orders, etc.). Not always needed for time-series/event/log data. When used, prefer `BIGINT GENERATED ALWAYS AS IDENTITY`; use `UUID` only when global uniqueness/opacity is needed. 11 | - **Normalize first (to 3NF)** to eliminate data redundancy and update anomalies; denormalize **only** for measured, high-ROI reads where join performance is proven problematic. Premature denormalization creates maintenance burden. 12 | - Add **NOT NULL** everywhere it’s semantically required; use **DEFAULT**s for common values. 13 | - Create **indexes for access paths you actually query**: PK/unique (auto), **FK columns (manual!)**, frequent filters/sorts, and join keys. 14 | - Prefer **TIMESTAMPTZ** for event time; **NUMERIC** for money; **TEXT** for strings; **BIGINT** for integer values, **DOUBLE PRECISION** for floats (or `NUMERIC` for exact decimal arithmetic). 15 | 16 | ## PostgreSQL “Gotchas” 17 | 18 | - **Identifiers**: unquoted → lowercased. Avoid quoted/mixed-case names. Convention: use `snake_case` for table/column names. 19 | - **Unique + NULLs**: UNIQUE allows multiple NULLs. Use `UNIQUE (...) 
NULLS NOT DISTINCT` (PG15+) to restrict to one NULL. 20 | - **FK indexes**: PostgreSQL **does not** auto-index FK columns. Add them. 21 | - **No silent coercions**: length/precision overflows error out (no truncation). Example: inserting 999 into `NUMERIC(2,0)` fails with error, unlike some databases that silently truncate or round. 22 | - **Sequences/identity have gaps** (normal; don't "fix"). Rollbacks, crashes, and concurrent transactions create gaps in ID sequences (1, 2, 5, 6...). This is expected behavior—don't try to make IDs consecutive. 23 | - **Heap storage**: no clustered PK by default (unlike SQL Server/MySQL InnoDB); `CLUSTER` is one-off reorganization, not maintained on subsequent inserts. Row order on disk is insertion order unless explicitly clustered. 24 | - **MVCC**: updates/deletes leave dead tuples; vacuum handles them—design to avoid hot wide-row churn. 25 | 26 | ## Data Types 27 | 28 | - **IDs**: `BIGINT GENERATED ALWAYS AS IDENTITY` preferred (`GENERATED BY DEFAULT` also fine); `UUID` when merging/federating/used in a distributed system or for opaque IDs. Generate with `uuidv7()` (preferred if using PG18+) or `gen_random_uuid()` (if using an older PG version). 29 | - **Integers**: prefer `BIGINT` unless storage space is critical; `INTEGER` for smaller ranges; avoid `SMALLINT` unless constrained. 30 | - **Floats**: prefer `DOUBLE PRECISION` over `REAL` unless storage space is critical. Use `NUMERIC` for exact decimal arithmetic. 31 | - **Strings**: prefer `TEXT`; if length limits needed, use `CHECK (LENGTH(col) <= n)` instead of `VARCHAR(n)`; avoid `CHAR(n)`. Use `BYTEA` for binary data. Large strings/binary (>2KB default threshold) automatically stored in TOAST with compression. TOAST storage: `PLAIN` (no TOAST), `EXTENDED` (compress + out-of-line), `EXTERNAL` (out-of-line, no compress), `MAIN` (compress, keep in-line if possible). Default `EXTENDED` usually optimal. Control with `ALTER TABLE tbl ALTER COLUMN col SET STORAGE strategy` and `ALTER TABLE tbl SET (toast_tuple_target = 4096)` for threshold. Case-insensitive: for locale/accent handling use non-deterministic collations; for plain ASCII use expression indexes on `LOWER(col)` (preferred unless column needs case-insensitive PK/FK/UNIQUE) or `CITEXT`. 32 | - **Money**: `NUMERIC(p,s)` (never float). 33 | - **Time**: `TIMESTAMPTZ` for timestamps; `DATE` for date-only; `INTERVAL` for durations. Avoid `TIMESTAMP` (without timezone). Use `now()` for transaction start time, `clock_timestamp()` for current wall-clock time. 34 | - **Booleans**: `BOOLEAN` with `NOT NULL` constraint unless tri-state values are required. 35 | - **Enums**: `CREATE TYPE ... AS ENUM` for small, stable sets (e.g. US states, days of week). For business-logic-driven and evolving values (e.g. order statuses) → use TEXT (or INT) + CHECK or lookup table. 36 | - **Arrays**: `TEXT[]`, `INTEGER[]`, etc. Use for ordered lists where you query elements. Index with **GIN** for containment (`@>`, `<@`) and overlap (`&&`) queries. Access: `arr[1]` (1-indexed), `arr[1:3]` (slicing). Good for tags, categories; avoid for relations—use junction tables instead. Literal syntax: `'{val1,val2}'` or `ARRAY[val1,val2]`. 37 | - **Range types**: `daterange`, `numrange`, `tstzrange` for intervals. Support overlap (`&&`), containment (`@>`), operators. Index with **GiST**. Good for scheduling, versioning, numeric ranges. Pick a bounds scheme and use it consistently; prefer `[)` (inclusive/exclusive) by default. 
38 | - **Network types**: `INET` for IP addresses, `CIDR` for network ranges, `MACADDR` for MAC addresses. Support network operators (`<<`, `>>`, `&&`). 39 | - **Geometric types**: avoid `POINT`, `LINE`, `POLYGON`, `CIRCLE`. Index with **GiST**. Consider **PostGIS** for spatial features. 40 | - **Text search**: `TSVECTOR` for full-text search documents, `TSQUERY` for search queries. Index `tsvector` with **GIN**. Always specify language: `to_tsvector('english', col)` and `to_tsquery('english', 'query')`. Never use single-argument versions. This applies to both index expressions and queries. 41 | - **Domain types**: `CREATE DOMAIN email AS TEXT CHECK (VALUE ~ '^[^@]+@[^@]+$')` for reusable custom types with validation. Enforces constraints across tables. 42 | - **Composite types**: `CREATE TYPE address AS (street TEXT, city TEXT, zip TEXT)` for structured data within columns. Access with `(col).field` syntax. 43 | - **JSONB**: preferred over JSON; index with **GIN**. Use only for optional/semi-structured attrs. ONLY use JSON if the original ordering of the contents MUST be preserved. 44 | - **Vector types**: `vector` type by `pgvector` for vector similarity search for embeddings. 45 | 46 | ### Do not use the following data types 47 | 48 | - DO NOT use `timestamp` (without time zone); DO use `timestamptz` instead. 49 | - DO NOT use `char(n)` or `varchar(n)`; DO use `text` instead. 50 | - DO NOT use `money` type; DO use `numeric` instead. 51 | - DO NOT use `timetz` type; DO use `timestamptz` instead. 52 | - DO NOT use `timestamptz(0)` or any other precision specification; DO use `timestamptz` instead 53 | - DO NOT use `serial` type; DO use `generated always as identity` instead. 54 | - DO NOT use `POINT`, `LINE`, `POLYGON`, `CIRCLE` built-in types, DO use `geometry` from postgis extension instead. 55 | 56 | ## Table Types 57 | 58 | - **Regular**: default; fully durable, logged. 59 | - **TEMPORARY**: session-scoped, auto-dropped, not logged. Faster for scratch work. 60 | - **UNLOGGED**: persistent but not crash-safe. Faster writes; good for caches/staging. 61 | 62 | ## Row-Level Security 63 | 64 | Enable with `ALTER TABLE tbl ENABLE ROW LEVEL SECURITY`. Create policies: `CREATE POLICY user_access ON orders FOR SELECT TO app_users USING (user_id = current_user_id())`. Built-in user-based access control at the row level. 65 | 66 | ## Constraints 67 | 68 | - **PK**: implicit UNIQUE + NOT NULL; creates a B-tree index. 69 | - **FK**: specify `ON DELETE/UPDATE` action (`CASCADE`, `RESTRICT`, `SET NULL`, `SET DEFAULT`). Add explicit index on referencing column—speeds up joins and prevents locking issues on parent deletes/updates. Use `DEFERRABLE INITIALLY DEFERRED` for circular FK dependencies checked at transaction end. 70 | - **UNIQUE**: creates a B-tree index; allows multiple NULLs unless `NULLS NOT DISTINCT` (PG15+). Standard behavior: `(1, NULL)` and `(1, NULL)` are allowed. With `NULLS NOT DISTINCT`: only one `(1, NULL)` allowed. Prefer `NULLS NOT DISTINCT` unless you specifically need duplicate NULLs. 71 | - **CHECK**: row-local constraints; NULL values pass the check (three-valued logic). Example: `CHECK (price > 0)` allows NULL prices. Combine with `NOT NULL` to enforce: `price NUMERIC NOT NULL CHECK (price > 0)`. 72 | - **EXCLUDE**: prevents overlapping values using operators. `EXCLUDE USING gist (room_id WITH =, booking_period WITH &&)` prevents double-booking rooms. Requires appropriate index type (often GiST). 
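Putting the constraint rules above together, a minimal sketch (the `rooms` table and all names are illustrative; `btree_gist` is assumed to be available so the scalar `=` can participate in the GiST exclusion constraint):

```sql
CREATE EXTENSION IF NOT EXISTS btree_gist;

CREATE TABLE bookings (
  booking_id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
  room_id    BIGINT NOT NULL REFERENCES rooms(room_id) ON DELETE RESTRICT,
  period     TSTZRANGE NOT NULL CHECK (NOT isempty(period)),
  price      NUMERIC(10,2) NOT NULL CHECK (price > 0),
  -- Prevent double-booking the same room for overlapping periods
  EXCLUDE USING gist (room_id WITH =, period WITH &&)
);

-- FK columns are not indexed automatically; add one for joins and parent deletes/updates
CREATE INDEX ON bookings (room_id);
```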
73 | 74 | ## Indexing 75 | 76 | - **B-tree**: default for equality/range queries (`=`, `<`, `>`, `BETWEEN`, `ORDER BY`) 77 | - **Composite**: order matters—index used if equality on leftmost prefix (`WHERE a = ? AND b > ?` uses index on `(a,b)`, but `WHERE b = ?` does not). Put most selective/frequently filtered columns first. 78 | - **Covering**: `CREATE INDEX ON tbl (id) INCLUDE (name, email)` - includes non-key columns for index-only scans without visiting table. 79 | - **Partial**: for hot subsets (`WHERE status = 'active'` → `CREATE INDEX ON tbl (user_id) WHERE status = 'active'`). Any query with `status = 'active'` can use this index. 80 | - **Expression**: for computed search keys (`CREATE INDEX ON tbl (LOWER(email))`). Expression must match exactly in WHERE clause: `WHERE LOWER(email) = 'user@example.com'`. 81 | - **GIN**: JSONB containment/existence, arrays (`@>`, `?`), full-text search (`@@`) 82 | - **GiST**: ranges, geometry, exclusion constraints 83 | - **BRIN**: very large, naturally ordered data (time-series)—minimal storage overhead. Effective when row order on disk correlates with indexed column (insertion order or after `CLUSTER`). 84 | 85 | ## Partitioning 86 | 87 | - Use for very large tables (>100M rows) where queries consistently filter on partition key (often time/date). 88 | - Alternate use: use for tables where data maintenance tasks dictates e.g. data pruned or bulk replaced periodically 89 | - **RANGE**: common for time-series (`PARTITION BY RANGE (created_at)`). Create partitions: `CREATE TABLE logs_2024_01 PARTITION OF logs FOR VALUES FROM ('2024-01-01') TO ('2024-02-01')`. **TimescaleDB** automates time-based or ID-based partitioning with retention policies and compression. 90 | - **LIST**: for discrete values (`PARTITION BY LIST (region)`). Example: `FOR VALUES IN ('us-east', 'us-west')`. 91 | - **HASH**: for even distribution when no natural key (`PARTITION BY HASH (user_id)`). Creates N partitions with modulus. 92 | - **Constraint exclusion**: requires `CHECK` constraints on partitions for query planner to prune. Auto-created for declarative partitioning (PG10+). 93 | - Prefer declarative partitioning or hypertables. Do NOT use table inheritance. 94 | - **Limitations**: no global UNIQUE constraints—include partition key in PK/UNIQUE. FKs from partitioned tables not supported; use triggers. 95 | 96 | ## Special Considerations 97 | 98 | ### Update-Heavy Tables 99 | 100 | - **Separate hot/cold columns**—put frequently updated columns in separate table to minimize bloat. 101 | - **Use `fillfactor=90`** to leave space for HOT updates that avoid index maintenance. 102 | - **Avoid updating indexed columns**—prevents beneficial HOT updates. 103 | - **Partition by update patterns**—separate frequently updated rows in a different partition from stable data. 104 | 105 | ### Insert-Heavy Workloads 106 | 107 | - **Minimize indexes**—only create what you query; every index slows inserts. 108 | - **Use `COPY` or multi-row `INSERT`** instead of single-row inserts. 109 | - **UNLOGGED tables** for rebuildable staging data—much faster writes. 110 | - **Defer index creation** for bulk loads—>drop index, load data, recreate indexes. 111 | - **Partition by time/hash** to distribute load. **TimescaleDB** automates partitioning and compression of insert-heavy data. 112 | - **Use a natural key for primary key** such as a (timestamp, device_id) if enforcing global uniqueness is important many insert-heavy tables don't need a primary key at all. 
113 | - If you do need a surrogate key, **Prefer `BIGINT GENERATED ALWAYS AS IDENTITY` over `UUID`**. 114 | 115 | ### Upsert-Friendly Design 116 | 117 | - **Requires UNIQUE index** on conflict target columns—`ON CONFLICT (col1, col2)` needs exact matching unique index (partial indexes don't work). 118 | - **Use `EXCLUDED.column`** to reference would-be-inserted values; only update columns that actually changed to reduce write overhead. 119 | - **`DO NOTHING` faster** than `DO UPDATE` when no actual update needed. 120 | 121 | ### Safe Schema Evolution 122 | 123 | - **Transactional DDL**: most DDL operations can run in transactions and be rolled back—`BEGIN; ALTER TABLE...; ROLLBACK;` for safe testing. 124 | - **Concurrent index creation**: `CREATE INDEX CONCURRENTLY` avoids blocking writes but can't run in transactions. 125 | - **Volatile defaults cause rewrites**: adding `NOT NULL` columns with volatile defaults (e.g., `now()`, `gen_random_uuid()`) rewrites entire table. Non-volatile defaults are fast. 126 | - **Drop constraints before columns**: `ALTER TABLE DROP CONSTRAINT` then `DROP COLUMN` to avoid dependency issues. 127 | - **Function signature changes**: `CREATE OR REPLACE` with different arguments creates overloads, not replacements. DROP old version if no overload desired. 128 | 129 | ## Generated Columns 130 | 131 | - `... GENERATED ALWAYS AS () STORED` for computed, indexable fields. PG18+ adds `VIRTUAL` columns (computed on read, not stored). 132 | 133 | ## Extensions 134 | 135 | - **`pgcrypto`**: `crypt()` for password hashing. 136 | - **`uuid-ossp`**: alternative UUID functions; prefer `pgcrypto` for new projects. 137 | - **`pg_trgm`**: fuzzy text search with `%` operator, `similarity()` function. Index with GIN for `LIKE '%pattern%'` acceleration. 138 | - **`citext`**: case-insensitive text type. Prefer expression indexes on `LOWER(col)` unless you need case-insensitive constraints. 139 | - **`btree_gin`/`btree_gist`**: enable mixed-type indexes (e.g., GIN index on both JSONB and text columns). 140 | - **`hstore`**: key-value pairs; mostly superseded by JSONB but useful for simple string mappings. 141 | - **`timescaledb`**: essential for time-series—automated partitioning, retention, compression, continuous aggregates. 142 | - **`postgis`**: comprehensive geospatial support beyond basic geometric types—essential for location-based applications. 143 | - **`pgvector`**: vector similarity search for embeddings. 144 | - **`pgaudit`**: audit logging for all database activity. 145 | 146 | ## JSONB Guidance 147 | 148 | - Prefer `JSONB` with **GIN** index. 149 | - Default: `CREATE INDEX ON tbl USING GIN (jsonb_col);` → accelerates: 150 | - **Containment** `jsonb_col @> '{"k":"v"}'` 151 | - **Key existence** `jsonb_col ? 
'k'`, **any/all keys** `?\|`, `?&` 152 | - **Path containment** on nested docs 153 | - **Disjunction** `jsonb_col @> ANY(ARRAY['{"status":"active"}', '{"status":"pending"}'])` 154 | - Heavy `@>` workloads: consider opclass `jsonb_path_ops` for smaller/faster containment-only indexes: 155 | - `CREATE INDEX ON tbl USING GIN (jsonb_col jsonb_path_ops);` 156 | - **Trade-off**: loses support for key existence (`?`, `?|`, `?&`) queries—only supports containment (`@>`) 157 | - Equality/range on a specific scalar field: extract and index with B-tree (generated column or expression): 158 | - `ALTER TABLE tbl ADD COLUMN price INT GENERATED ALWAYS AS ((jsonb_col->>'price')::INT) STORED;` 159 | - `CREATE INDEX ON tbl (price);` 160 | - Prefer queries like `WHERE price BETWEEN 100 AND 500` (uses B-tree) over `WHERE (jsonb_col->>'price')::INT BETWEEN 100 AND 500` without index. 161 | - Arrays inside JSONB: use GIN + `@>` for containment (e.g., tags). Consider `jsonb_path_ops` if only doing containment. 162 | - Keep core relations in tables; use JSONB for optional/variable attributes. 163 | - Use constraints to limit allowed JSONB values in a column e.g. `config JSONB NOT NULL CHECK(jsonb_typeof(config) = 'object')` 164 | 165 | ## Examples 166 | 167 | ### Users 168 | 169 | ```sql 170 | CREATE TABLE users ( 171 | user_id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, 172 | email TEXT NOT NULL UNIQUE, 173 | name TEXT NOT NULL, 174 | created_at TIMESTAMPTZ NOT NULL DEFAULT now() 175 | ); 176 | CREATE UNIQUE INDEX ON users (LOWER(email)); 177 | CREATE INDEX ON users (created_at); 178 | ``` 179 | 180 | ### Orders 181 | 182 | ```sql 183 | CREATE TABLE orders ( 184 | order_id BIGINT GENERATED ALWAYS AS IDENTITY PRIMARY KEY, 185 | user_id BIGINT NOT NULL REFERENCES users(user_id), 186 | status TEXT NOT NULL DEFAULT 'PENDING' CHECK (status IN ('PENDING','PAID','CANCELED')), 187 | total NUMERIC(10,2) NOT NULL CHECK (total > 0), 188 | created_at TIMESTAMPTZ NOT NULL DEFAULT now() 189 | ); 190 | CREATE INDEX ON orders (user_id); 191 | CREATE INDEX ON orders (created_at); 192 | ``` 193 | 194 | ### JSONB 195 | 196 | ```sql 197 | CREATE TABLE profiles ( 198 | user_id BIGINT PRIMARY KEY REFERENCES users(user_id), 199 | attrs JSONB NOT NULL DEFAULT '{}', 200 | theme TEXT GENERATED ALWAYS AS (attrs->>'theme') STORED 201 | ); 202 | CREATE INDEX profiles_attrs_gin ON profiles USING GIN (attrs); 203 | ``` 204 | -------------------------------------------------------------------------------- /skills/setup-timescaledb-hypertables/SKILL.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: setup-timescaledb-hypertables 3 | description: 'Step-by-step instructions for designing table schemas and setting up TimescaleDB with hypertables, indexes, compression, retention policies, and continuous aggregates. Instructions for selecting: partition columns, segment_by columns, order_by columns, chunk time interval, real-time aggregation.' 
4 | --- 5 | 6 | # TimescaleDB Complete Setup 7 | 8 | Instructions for insert-heavy data patterns where data is inserted but rarely changed: 9 | 10 | - **Time-series data** (sensors, metrics, system monitoring) 11 | - **Event logs** (user events, audit trails, application logs) 12 | - **Transaction records** (orders, payments, financial transactions) 13 | - **Sequential data** (records with auto-incrementing IDs and timestamps) 14 | - **Append-only datasets** (immutable records, historical data) 15 | 16 | ## Step 1: Create Hypertable 17 | 18 | ```sql 19 | CREATE TABLE your_table_name ( 20 | timestamp TIMESTAMPTZ NOT NULL, 21 | entity_id TEXT NOT NULL, -- device_id, user_id, symbol, etc. 22 | category TEXT, -- sensor_type, event_type, asset_class, etc. 23 | value_1 DOUBLE PRECISION, -- price, temperature, latency, etc. 24 | value_2 DOUBLE PRECISION, -- volume, humidity, throughput, etc. 25 | value_3 INTEGER, -- count, status, level, etc. 26 | metadata JSONB -- flexible additional data 27 | ) WITH ( 28 | tsdb.hypertable, 29 | tsdb.partition_column='timestamp', 30 | tsdb.enable_columnstore=true, -- Disable if table has vector columns 31 | tsdb.segmentby='entity_id', -- See selection guide below 32 | tsdb.orderby='timestamp DESC', -- See selection guide below 33 | tsdb.sparse_index='minmax(value_1),minmax(value_2),minmax(value_3)' -- see selection guide below 34 | ); 35 | ``` 36 | 37 | ### Compression Decision 38 | 39 | - **Enable by default** for insert-heavy patterns 40 | - **Disable** if table has vector type columns (pgvector) - indexes on vector columns incompatible with columnstore 41 | 42 | ### Partition Column Selection 43 | 44 | Must be time-based (TIMESTAMP/TIMESTAMPTZ/DATE) or integer (INT/BIGINT) with good temporal/sequential distribution. 45 | 46 | **Common patterns:** 47 | 48 | - TIME-SERIES: `timestamp`, `event_time`, `measured_at` 49 | - EVENT LOGS: `event_time`, `created_at`, `logged_at` 50 | - TRANSACTIONS: `created_at`, `transaction_time`, `processed_at` 51 | - SEQUENTIAL: `id` (auto-increment when no timestamp), `sequence_number` 52 | - APPEND-ONLY: `created_at`, `inserted_at`, `id` 53 | 54 | **Less ideal:** `ingested_at` (when data entered system - use only if it's your primary query dimension) 55 | **Avoid:** `updated_at` (breaks time ordering unless it's primary query dimension) 56 | 57 | ### Segment_By Column Selection 58 | 59 | **PREFER SINGLE COLUMN** - multi-column rarely optimal. Multi-column can only work for highly correlated columns (e.g., metric_name + metric_type) with sufficient row density. 60 | 61 | **Requirements:** 62 | 63 | - Frequently used in WHERE clauses (most common filter) 64 | - Good row density (>100 rows per value per chunk) 65 | - Primary logical partition/grouping 66 | 67 | **Examples:** 68 | 69 | - IoT: `device_id` 70 | - Finance: `symbol` 71 | - Metrics: `service_name`, `service_name, metric_type` (if sufficient row density), `metric_name, metric_type` (if sufficient row density) 72 | - Analytics: `user_id` if sufficient row density, otherwise `session_id` 73 | - E-commerce: `product_id` if sufficient row density, otherwise `category_id` 74 | 75 | **Row density guidelines:** 76 | 77 | - Target: >100 rows per segment_by value within each chunk. 78 | - Poor: <10 rows per segment_by value per chunk → choose less granular column 79 | - What to do with low-density columns: prepend to order_by column list. 80 | 81 | **Query pattern drives choice:** 82 | 83 | ```sql 84 | SELECT * FROM table WHERE entity_id = 'X' AND timestamp > ... 
85 | -- ↳ segment_by: entity_id (if >100 rows per chunk) 86 | ``` 87 | 88 | **Avoid:** timestamps, unique IDs, low-density columns (<100 rows/value/chunk), columns rarely used in filtering 89 | 90 | ### Order_By Column Selection 91 | 92 | Creates natural time-series progression when combined with segment_by for optimal compression. 93 | 94 | **Most common:** `timestamp DESC` 95 | 96 | **Examples:** 97 | 98 | - IoT/Finance/E-commerce: `timestamp DESC` 99 | - Metrics: `metric_name, timestamp DESC` (if metric_name has too low density for segment_by) 100 | - Analytics: `user_id, timestamp DESC` (user_id has too low density for segment_by) 101 | 102 | **Alternative patterns:** 103 | 104 | - `sequence_id DESC` for event streams with sequence numbers 105 | - `timestamp DESC, event_order DESC` for sub-ordering within same timestamp 106 | 107 | **Low-density column handling:** 108 | If a column has <100 rows per chunk (too low for segment_by), prepend it to order_by: 109 | 110 | - Example: `metric_name` has 20 rows/chunk → use `segment_by='service_name'`, `order_by='metric_name, timestamp DESC'` 111 | - Groups similar values together (all temperature readings, then pressure readings) for better compression 112 | 113 | **Good test:** ordering created by `(segment_by_column, order_by_column)` should form a natural time-series progression. Values close to each other in the progression should be similar. 114 | 115 | **Avoid in order_by:** random columns, columns with high variance between adjacent rows, columns unrelated to segment_by 116 | 117 | ### Compression Sparse Index Selection 118 | 119 | **Sparse indexes** enable query filtering on compressed data without decompression. Store metadata per batch (~1000 rows) to eliminate batches that don't match query predicates. 120 | 121 | **Types:** 122 | 123 | - **minmax:** Min/max values per batch - for range queries (>, <, BETWEEN) on numeric/temporal columns 124 | 125 | **Use minmax for:** price, temperature, measurement, timestamp (range filtering) 126 | 127 | **Use for:** 128 | 129 | - minmax for outlier detection (temperature > 90). 130 | - minmax for fields that are highly correlated with segmentby and orderby columns (e.g. if orderby includes `created_at`, minmax on `updated_at` is useful). 131 | 132 | **Avoid:** rarely filtered columns. 133 | 134 | IMPORTANT: NEVER index columns in segmentby or orderby. Orderby columns will always have minmax indexes without any configuration. 135 | 136 | **Configuration:** 137 | The format is a comma-separated list of type_of_index(column_name). 138 | 139 | ```sql 140 | ALTER TABLE table_name SET ( 141 | timescaledb.sparse_index = 'minmax(value_1),minmax(value_2)' 142 | ); 143 | ``` 144 | 145 | Explicit configuration available since v2.22.0 (was auto-created since v2.16.0). 146 | 147 | ### Chunk Time Interval (Optional) 148 | 149 | Default: 7 days (use if volume unknown, or ask user). Adjust based on volume: 150 | 151 | - High frequency: 1 hour - 1 day 152 | - Medium: 1 day - 1 week 153 | - Low: 1 week - 1 month 154 | 155 | ```sql 156 | SELECT set_chunk_time_interval('your_table_name', INTERVAL '1 day'); 157 | ``` 158 | 159 | **Good test:** recent chunk indexes should fit in less than 25% of RAM. 
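A rough way to check this, assuming the `chunks_detailed_size()` helper and the `timescaledb_information.chunks` view available in recent TimescaleDB versions (compare index sizes of the most recent chunks against ~25% of the instance's RAM):

```sql
-- Index and total size of the five most recent chunks
SELECT d.chunk_name,
       pg_size_pretty(d.index_bytes) AS index_size,
       pg_size_pretty(d.total_bytes) AS total_size
FROM chunks_detailed_size('your_table_name') d
JOIN timescaledb_information.chunks c ON c.chunk_name = d.chunk_name
WHERE c.hypertable_name = 'your_table_name'
ORDER BY c.range_start DESC
LIMIT 5;
```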
160 | 161 | ### Indexes & Primary Keys 162 | 163 | Common index patterns - composite indexes on an id and timestamp: 164 | 165 | ```sql 166 | CREATE INDEX idx_entity_timestamp ON your_table_name (entity_id, timestamp DESC); 167 | ``` 168 | 169 | **Important:** Only create indexes you'll actually use - each has maintenance overhead. 170 | 171 | **Primary key and unique constraints rules:** Must include partition column. 172 | 173 | **Option 1: Composite PK with partition column** 174 | 175 | ```sql 176 | ALTER TABLE your_table_name ADD PRIMARY KEY (entity_id, timestamp); 177 | ``` 178 | 179 | **Option 2: Single-column PK (only if it's the partition column)** 180 | 181 | ```sql 182 | CREATE TABLE ... (id BIGINT PRIMARY KEY, ...) WITH (tsdb.partition_column='id'); 183 | ``` 184 | 185 | **Option 3: No PK**: strict uniqueness is often not required for insert-heavy patterns. 186 | 187 | ## Step 2: Compression Policy 188 | 189 | Set `after` interval for when: data becomes mostly immutable (some updates/backfill OK) AND B-tree indexes aren't needed for queries (less common criterion). 190 | 191 | ```sql 192 | -- Adjust 'after' based on update patterns 193 | CALL add_columnstore_policy('your_table_name', after => INTERVAL '1 day'); 194 | ``` 195 | 196 | ## Step 3: Retention Policy 197 | 198 | IMPORTANT: Don't guess - ask user or comment out if unknown. 199 | 200 | ```sql 201 | -- Example - replace with requirements or comment out 202 | SELECT add_retention_policy('your_table_name', INTERVAL '365 days'); 203 | ``` 204 | 205 | ## Step 4: Create Continuous Aggregates 206 | 207 | Use different aggregation intervals for different uses. 208 | 209 | ### Short-term (Minutes/Hours) 210 | 211 | For up-to-the-minute dashboards on high-frequency data. 212 | 213 | ```sql 214 | CREATE MATERIALIZED VIEW your_table_hourly 215 | WITH (timescaledb.continuous) AS 216 | SELECT 217 | time_bucket(INTERVAL '1 hour', timestamp) AS bucket, 218 | entity_id, 219 | category, 220 | COUNT(*) as record_count, 221 | AVG(value_1) as avg_value_1, 222 | MIN(value_1) as min_value_1, 223 | MAX(value_1) as max_value_1, 224 | SUM(value_2) as sum_value_2 225 | FROM your_table_name 226 | GROUP BY bucket, entity_id, category; 227 | ``` 228 | 229 | ### Long-term (Days/Weeks/Months) 230 | 231 | For long-term reporting and analytics. 232 | 233 | ```sql 234 | CREATE MATERIALIZED VIEW your_table_daily 235 | WITH (timescaledb.continuous) AS 236 | SELECT 237 | time_bucket(INTERVAL '1 day', timestamp) AS bucket, 238 | entity_id, 239 | category, 240 | COUNT(*) as record_count, 241 | AVG(value_1) as avg_value_1, 242 | MIN(value_1) as min_value_1, 243 | MAX(value_1) as max_value_1, 244 | PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY value_1) as median_value_1, 245 | PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY value_1) as p95_value_1, 246 | SUM(value_2) as sum_value_2 247 | FROM your_table_name 248 | GROUP BY bucket, entity_id, category; 249 | ``` 250 | 251 | ## Step 5: Aggregate Refresh Policies 252 | 253 | Set up refresh policies based on your data freshness requirements. 254 | 255 | **start_offset:** Usually omit (refreshes all). Exception: If you don't care about refreshing data older than X (see below). With retention policy on raw data: match the retention policy. 256 | 257 | **end_offset:** Set beyond active update window (e.g., 15 min if data usually arrives within 10 min). Data newer than end_offset won't appear in queries without real-time aggregation. 
If you don't know your update window, use the size of the time_bucket in the query, but not less than 5 minutes. 258 | 259 | **schedule_interval:** Set to the same value as the end_offset but not more than 1 hour. 260 | 261 | **Hourly - frequent refresh for dashboards:** 262 | 263 | ```sql 264 | SELECT add_continuous_aggregate_policy('your_table_hourly', 265 | end_offset => INTERVAL '15 minutes', 266 | schedule_interval => INTERVAL '15 minutes'); 267 | ``` 268 | 269 | **Daily - less frequent for reports:** 270 | 271 | ```sql 272 | SELECT add_continuous_aggregate_policy('your_table_daily', 273 | end_offset => INTERVAL '1 hour', 274 | schedule_interval => INTERVAL '1 hour'); 275 | ``` 276 | 277 | **Use start_offset only if you don't care about refreshing old data** 278 | Use for high-volume systems where query accuracy on older data doesn't matter: 279 | 280 | ```sql 281 | -- the following aggregate can be stale for data older than 7 days 282 | -- SELECT add_continuous_aggregate_policy('aggregate_for_last_7_days', 283 | -- start_offset => INTERVAL '7 days', -- only refresh last 7 days 284 | -- end_offset => INTERVAL '15 minutes', 285 | -- schedule_interval => INTERVAL '15 minutes'); 286 | ``` 287 | 288 | IMPORTANT: you MUST set a start_offset to be less than the retention policy on raw data. By default, set the start_offset equal to the retention policy. 289 | If the retention policy is commented out, comment out the start_offset as well. like this: 290 | 291 | ```sql 292 | SELECT add_continuous_aggregate_policy('your_table_daily', 293 | -- start_offset => INTERVAL '', -- uncomment if retention policy is enabled on the raw data table 294 | end_offset => INTERVAL '1 hour', 295 | schedule_interval => INTERVAL '1 hour'); 296 | ``` 297 | 298 | ## Step 6: Real-Time Aggregation (Optional) 299 | 300 | Real-time combines materialized + recent raw data at query time. Provides up-to-date results at the cost of higher query latency. 301 | 302 | More useful for fine-grained aggregates (e.g., minutely) than coarse ones (e.g., daily/monthly) since large buckets will be mostly incomplete with recent data anyway. 303 | 304 | Disabled by default in v2.13+, before that it was enabled by default. 
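To check how an existing aggregate is currently configured, a small sketch (assumes the `materialized_only` column exposed by `timescaledb_information.continuous_aggregates`):

```sql
SELECT view_name, materialized_only
FROM timescaledb_information.continuous_aggregates
WHERE view_name IN ('your_table_hourly', 'your_table_daily');
```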
305 | 306 | **Use when:** Need data newer than end_offset, up-to-minute dashboards, can tolerate higher query latency 307 | **Disable when:** Performance critical, refresh policies sufficient, high query volume, missing and stale data for recent data is acceptable 308 | 309 | **Enable for current results (higher query cost):** 310 | 311 | ```sql 312 | ALTER MATERIALIZED VIEW your_table_hourly SET (timescaledb.materialized_only = false); 313 | ``` 314 | 315 | **Disable for performance (but with stale results):** 316 | 317 | ```sql 318 | ALTER MATERIALIZED VIEW your_table_hourly SET (timescaledb.materialized_only = true); 319 | ``` 320 | 321 | ## Step 7: Compress Aggregates 322 | 323 | Rule: segment_by = ALL GROUP BY columns except time_bucket, order_by = time_bucket DESC 324 | 325 | ```sql 326 | -- Hourly 327 | ALTER MATERIALIZED VIEW your_table_hourly SET ( 328 | timescaledb.enable_columnstore, 329 | timescaledb.segmentby = 'entity_id, category', 330 | timescaledb.orderby = 'bucket DESC' 331 | ); 332 | CALL add_columnstore_policy('your_table_hourly', after => INTERVAL '3 days'); 333 | 334 | -- Daily 335 | ALTER MATERIALIZED VIEW your_table_daily SET ( 336 | timescaledb.enable_columnstore, 337 | timescaledb.segmentby = 'entity_id, category', 338 | timescaledb.orderby = 'bucket DESC' 339 | ); 340 | CALL add_columnstore_policy('your_table_daily', after => INTERVAL '7 days'); 341 | ``` 342 | 343 | ## Step 8: Aggregate Retention 344 | 345 | Aggregates are typically kept longer than raw data. 346 | IMPORTANT: Don't guess - ask user or you **MUST comment out if unknown**. 347 | 348 | ```sql 349 | -- Example - replace or comment out 350 | SELECT add_retention_policy('your_table_hourly', INTERVAL '2 years'); 351 | SELECT add_retention_policy('your_table_daily', INTERVAL '5 years'); 352 | ``` 353 | 354 | ## Step 9: Performance Indexes on Continuous Aggregates 355 | 356 | **Index strategy:** Analyze WHERE clauses in common queries → Create indexes matching filter columns + time ordering 357 | 358 | **Pattern:** `(filter_column, bucket DESC)` supports `WHERE filter_column = X AND bucket >= Y ORDER BY bucket DESC` 359 | 360 | Examples: 361 | 362 | ```sql 363 | CREATE INDEX idx_hourly_entity_bucket ON your_table_hourly (entity_id, bucket DESC); 364 | CREATE INDEX idx_hourly_category_bucket ON your_table_hourly (category, bucket DESC); 365 | ``` 366 | 367 | **Multi-column filters:** Create composite indexes for `WHERE entity_id = X AND category = Y`: 368 | 369 | ```sql 370 | CREATE INDEX idx_hourly_entity_category_bucket ON your_table_hourly (entity_id, category, bucket DESC); 371 | ``` 372 | 373 | **Important:** Only create indexes you'll actually use - each has maintenance overhead. 374 | 375 | ## Step 10: Optional Enhancements 376 | 377 | ### Space Partitioning (NOT RECOMMENDED) 378 | 379 | Only for query patterns where you ALWAYS filter by the space-partition column with expert knowledge and extensive benchmarking. STRONGLY prefer time-only partitioning. 
380 | 381 | ## Step 11: Verify Configuration 382 | 383 | ```sql 384 | -- Check hypertable 385 | SELECT * FROM timescaledb_information.hypertables 386 | WHERE hypertable_name = 'your_table_name'; 387 | 388 | -- Check compression 389 | SELECT * FROM timescaledb_information.columnstore_settings 390 | WHERE hypertable_name LIKE 'your_table_name'; 391 | 392 | -- Check aggregates 393 | SELECT * FROM timescaledb_information.continuous_aggregates; 394 | 395 | -- Check policies 396 | SELECT * FROM timescaledb_information.jobs ORDER BY job_id; 397 | 398 | -- Monitor chunk information 399 | SELECT chunk_name, table_size, compressed_heap_size, compressed_index_size 400 | FROM timescaledb_information.chunks 401 | WHERE hypertable_name = 'your_table_name'; 402 | ``` 403 | 404 | ## Performance Guidelines 405 | 406 | - **Chunk size:** Recent chunk indexes should fit in less than 25% of RAM 407 | - **Compression:** Expect 90%+ reduction (10x) with proper columnstore config 408 | - **Query optimization:** Use continuous aggregates for historical queries and dashboards 409 | - **Memory:** Run `timescaledb-tune` for self-hosting (auto-configured on cloud) 410 | 411 | ## Schema Best Practices 412 | 413 | ### Do's and Don'ts 414 | 415 | - ✅ Use `TIMESTAMPTZ` NOT `timestamp` 416 | - ✅ Use `>=` and `<` NOT `BETWEEN` for timestamps 417 | - ✅ Use `TEXT` with constraints NOT `char(n)`/`varchar(n)` 418 | - ✅ Use `snake_case` NOT `CamelCase` 419 | - ✅ Use `BIGINT GENERATED ALWAYS AS IDENTITY` NOT `SERIAL` 420 | - ✅ Use `BIGINT` for IDs by default over `INTEGER` or `SMALLINT` 421 | - ✅ Use `DOUBLE PRECISION` by default over `REAL`/`FLOAT` 422 | - ✅ Use `NUMERIC` NOT `MONEY` 423 | - ✅ Use `NOT EXISTS` NOT `NOT IN` 424 | - ✅ Use `time_bucket()` or `date_trunc()` NOT `timestamp(0)` for truncation 425 | 426 | ## API Reference (Current vs Deprecated) 427 | 428 | **Deprecated Parameters → New Parameters:** 429 | 430 | - `timescaledb.compress` → `timescaledb.enable_columnstore` 431 | - `timescaledb.compress_segmentby` → `timescaledb.segmentby` 432 | - `timescaledb.compress_orderby` → `timescaledb.orderby` 433 | 434 | **Deprecated Functions → New Functions:** 435 | 436 | - `add_compression_policy()` → `add_columnstore_policy()` 437 | - `remove_compression_policy()` → `remove_columnstore_policy()` 438 | - `compress_chunk()` → `convert_to_columnstore()` 439 | - `decompress_chunk()` → `convert_to_rowstore()` 440 | 441 | **Deprecated Views → New Views:** 442 | 443 | - `compression_settings` → `columnstore_settings` 444 | - `hypertable_compression_settings` → `hypertable_columnstore_settings` 445 | - `chunk_compression_settings` → `chunk_columnstore_settings` 446 | 447 | **Deprecated Stats Functions → New Stats Functions:** 448 | 449 | - `hypertable_compression_stats()` → `hypertable_columnstore_stats()` 450 | - `chunk_compression_stats()` → `chunk_columnstore_stats()` 451 | 452 | ## Questions to Ask User 453 | 454 | 1. What kind of data will you be storing? 455 | 2. How do you expect to use the data? 456 | 3. What queries will you run? 457 | 4. How long to keep the data? 458 | 5. 
Column types if unclear 459 | -------------------------------------------------------------------------------- /ingest/postgres_docs.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from dataclasses import dataclass 3 | from dotenv import load_dotenv 4 | from bs4 import BeautifulSoup, element as BeautifulSoupElement 5 | import json 6 | from markdownify import markdownify 7 | import openai 8 | import os 9 | from pathlib import Path 10 | import psycopg 11 | from psycopg.sql import SQL, Identifier 12 | import re 13 | import shutil 14 | import subprocess 15 | import tiktoken 16 | 17 | 18 | THIS_DIR = Path(__file__).parent.resolve() 19 | 20 | load_dotenv(dotenv_path=os.path.join(THIS_DIR, "..", ".env")) 21 | 22 | OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") 23 | 24 | POSTGRES_DIR = THIS_DIR / "postgres" 25 | SMGL_DIR = POSTGRES_DIR / "doc" / "src" / "sgml" 26 | HTML_DIR = SMGL_DIR / "html" 27 | BUILD_DIR = THIS_DIR / "build" 28 | BUILD_DIR.mkdir(exist_ok=True) 29 | MD_DIR = BUILD_DIR / "md" 30 | 31 | POSTGRES_BASE_URL = "https://www.postgresql.org/docs" 32 | 33 | ENC = tiktoken.get_encoding("cl100k_base") 34 | MAX_CHUNK_TOKENS = 7000 35 | 36 | 37 | def update_repo(): 38 | if not POSTGRES_DIR.exists(): 39 | subprocess.run( 40 | "git clone https://github.com/postgres/postgres.git postgres", 41 | shell=True, 42 | check=True, 43 | env=os.environ, 44 | text=True, 45 | ) 46 | else: 47 | subprocess.run( 48 | "git fetch", 49 | shell=True, 50 | check=True, 51 | env=os.environ, 52 | text=True, 53 | cwd=POSTGRES_DIR, 54 | ) 55 | 56 | 57 | def get_version_tag(version: int) -> str: 58 | result = subprocess.run( 59 | ["git", "tag", "-l"], capture_output=True, text=True, cwd=POSTGRES_DIR 60 | ) 61 | if result.returncode != 0: 62 | raise RuntimeError("Failed to get git tags") 63 | 64 | tags = result.stdout.splitlines() 65 | 66 | candidate_tags = [] 67 | 68 | for version_type in ["", "RC", "BETA"]: 69 | pattern = re.compile(rf"REL_{version}_{version_type}(\d+)$") 70 | for tag in tags: 71 | match = pattern.match(tag) 72 | if match: 73 | minor_version = int(match.group(1)) 74 | candidate_tags.append((minor_version, tag)) 75 | if len(candidate_tags) > 0: 76 | break 77 | 78 | if not candidate_tags: 79 | raise ValueError(f"No tags found for Postgres version {version}") 80 | 81 | candidate_tags.sort(key=lambda x: x[0], reverse=True) 82 | return candidate_tags[0][1] 83 | 84 | 85 | def checkout_tag(tag: str) -> None: 86 | print(f"checking out {tag}...") 87 | subprocess.run( 88 | f"git checkout {tag}", 89 | shell=True, 90 | check=True, 91 | env=os.environ, 92 | text=True, 93 | cwd=POSTGRES_DIR, 94 | ) 95 | 96 | 97 | def build_html() -> None: 98 | html_stamp = SMGL_DIR / "html-stamp" 99 | 100 | # make uses the presence of html-stamp to determine if it needs to 101 | # rebuild the html docs. 102 | if html_stamp.exists(): 103 | html_stamp.unlink() 104 | 105 | if HTML_DIR.exists(): 106 | shutil.rmtree(HTML_DIR) 107 | 108 | print("configuring postgres build...") 109 | environ = os.environ.copy() 110 | # Shim for macOS and icu4c installed via homebrew, where it's not linked into 111 | # /usr/local by default. 
112 | if Path("/opt/homebrew/opt/icu4c/lib/pkgconfig").exists(): 113 | environ["PKG_CONFIG_PATH"] = "/opt/homebrew/opt/icu4c/lib/pkgconfig" 114 | subprocess.run( 115 | "./configure --without-readline --without-zlib", 116 | shell=True, 117 | check=True, 118 | env=environ, 119 | text=True, 120 | cwd=POSTGRES_DIR, 121 | ) 122 | 123 | print("building postgres docs...") 124 | subprocess.run( 125 | "make html", 126 | shell=True, 127 | check=True, 128 | env=os.environ, 129 | text=True, 130 | cwd=SMGL_DIR, 131 | ) 132 | 133 | 134 | def build_markdown() -> None: 135 | print("converting to markdown...") 136 | if MD_DIR.exists(): 137 | shutil.rmtree(MD_DIR) 138 | MD_DIR.mkdir() 139 | 140 | for html_file in HTML_DIR.glob("*.html"): 141 | # Skip files which are more metadata about the docs than actual docs 142 | # that people would ask questions about. 143 | if html_file.name in [ 144 | "legalnotice.html", 145 | "appendix-obsolete.md", 146 | "appendixes.md", 147 | "biblio.html", 148 | "bookindex.html", 149 | "bug-reporting.html", 150 | "source-format.html", 151 | "error-message-reporting.html", 152 | "error-style-guide.html", 153 | "source-conventions.html", 154 | "sourcerepo.html", 155 | ] or html_file.name.startswith("docguide"): 156 | continue 157 | md_file = MD_DIR / (html_file.stem + ".md") 158 | 159 | html_content = html_file.read_text(encoding="utf-8") 160 | html_content = html_content.replace( 161 | '', "" 162 | ) 163 | 164 | soup = BeautifulSoup(html_content, "html.parser") 165 | 166 | is_refentry = bool(soup.find("div", class_="refentry")) 167 | 168 | elem = soup.find("div", attrs={"id": True}) 169 | if elem and isinstance(elem, BeautifulSoupElement.Tag): 170 | slug = str(elem["id"]).lower() + ".html" 171 | else: 172 | raise SystemError(f"No div with id found in {html_file}") 173 | 174 | title = soup.find("title") 175 | title_text = ( 176 | str(title.string).strip() 177 | if title and isinstance(title, BeautifulSoupElement.Tag) 178 | else "PostgreSQL Documentation" 179 | ) 180 | if title: 181 | title.decompose() 182 | for class_name in ["navheader", "navfooter"]: 183 | for div in soup.find_all("div", class_=class_name): 184 | div.decompose() 185 | 186 | # Don't bother including refentry in the transform as we don't chunk 187 | # them by headers anyway. 188 | if not is_refentry: 189 | # Convert h3 headings in admonitions to h4 so that we avoid 190 | # chunking them. 
191 | for class_name in [ 192 | "caution", 193 | "important", 194 | "notice", 195 | "warning", 196 | "tip", 197 | "note", 198 | ]: 199 | for div in soup.find_all("div", class_=class_name): 200 | if div is None or not isinstance(div, BeautifulSoupElement.Tag): 201 | continue 202 | h3 = div.find("h3") 203 | if h3 and isinstance(h3, BeautifulSoupElement.Tag): 204 | h3.name = "h4" 205 | 206 | md_content = markdownify(str(soup), heading_style="ATX") 207 | md_content = f"""--- 208 | title: {title_text} 209 | slug: {slug} 210 | refentry: {is_refentry} 211 | --- 212 | {md_content}""" 213 | md_file.write_text(md_content, encoding="utf-8") 214 | 215 | 216 | @dataclass 217 | class Page: 218 | id: int 219 | version: int 220 | url: str 221 | domain: str 222 | filename: str 223 | 224 | 225 | @dataclass 226 | class Chunk: 227 | idx: int 228 | header: str 229 | header_path: list[str] 230 | content: str 231 | token_count: int = 0 232 | subindex: int = 0 233 | 234 | 235 | def insert_page( 236 | conn: psycopg.Connection, 237 | page: Page, 238 | ) -> None: 239 | print("inserting page", page.filename, page.url) 240 | result = conn.execute( 241 | "insert into docs.postgres_pages_tmp (version, url, domain, filename, content_length, chunks_count) values (%s,%s,%s,%s,%s,%s) RETURNING id", 242 | [ 243 | page.version, 244 | page.url, 245 | page.domain, 246 | page.filename, 247 | 0, 248 | 0, 249 | ], 250 | ) 251 | row = result.fetchone() 252 | assert row is not None 253 | page.id = row[0] 254 | 255 | 256 | def update_page_stats( 257 | conn: psycopg.Connection, 258 | page: Page, 259 | ) -> None: 260 | conn.execute( 261 | """ 262 | update docs.postgres_pages_tmp p 263 | set 264 | content_length = coalesce(chunks_stats.total_length, 0), 265 | chunks_count = coalesce(chunks_stats.chunks_count, 0) 266 | from ( 267 | select 268 | page_id, 269 | sum(char_length(content)) as total_length, 270 | count(*) as chunks_count 271 | from docs.postgres_chunks_tmp 272 | where page_id = %s 273 | group by page_id 274 | ) as chunks_stats 275 | where p.id = chunks_stats.page_id and p.id = %s 276 | """, 277 | [page.id, page.id], 278 | ) 279 | 280 | 281 | def insert_chunk( 282 | conn: psycopg.Connection, 283 | page: Page, 284 | chunk: Chunk, 285 | ) -> None: 286 | client = openai.OpenAI(api_key=OPENAI_API_KEY) 287 | content = "" 288 | for i in range(len(chunk.header_path)): 289 | content += ( 290 | "".join(["#" for _ in range(i + 1)]) + " " + chunk.header_path[i] + "\n\n" 291 | ) 292 | content += chunk.content 293 | embedding = ( 294 | client.embeddings.create( 295 | model="text-embedding-3-small", 296 | input=chunk.content, 297 | ) 298 | .data[0] 299 | .embedding 300 | ) 301 | content = chunk.content 302 | # token_count, embedding = embed(header_path, content) 303 | print(f"header: {chunk.header}") 304 | url = page.url 305 | if len(chunk.header_path) > 1: 306 | pattern = r"\((#\S+)\)" 307 | match = re.search(pattern, chunk.header_path[-1]) 308 | if match: 309 | url += match.group(1).lower() 310 | conn.execute( 311 | "insert into docs.postgres_chunks_tmp (page_id, chunk_index, sub_chunk_index, content, metadata, embedding) values (%s,%s,%s,%s,%s,%s)", 312 | [ 313 | page.id, 314 | chunk.idx, 315 | chunk.subindex, 316 | chunk.content, 317 | json.dumps( 318 | { 319 | "header": chunk.header, 320 | "header_path": chunk.header_path, 321 | "source_url": url, 322 | "token_count": chunk.token_count, 323 | } 324 | ), 325 | embedding, 326 | ], 327 | ) 328 | 329 | 330 | def split_chunk(chunk: Chunk) -> list[Chunk]: 331 | num_subchunks = 
(chunk.token_count // MAX_CHUNK_TOKENS) + 1 332 | input_ids = ENC.encode(chunk.content) 333 | 334 | tokens_per_chunk = len(input_ids) // num_subchunks 335 | 336 | subchunks = [] 337 | subindex = 0 338 | idx = 0 339 | while idx < len(input_ids): 340 | cur_idx = min(idx + tokens_per_chunk, len(input_ids)) 341 | chunk_ids = input_ids[idx:cur_idx] 342 | if not chunk_ids: 343 | break 344 | decoded = ENC.decode(chunk_ids) 345 | if decoded: 346 | subchunks.append( 347 | Chunk( 348 | idx=chunk.idx, 349 | header=chunk.header, 350 | header_path=chunk.header_path, 351 | content=decoded, 352 | token_count=len(chunk_ids), 353 | subindex=subindex, 354 | ) 355 | ) 356 | subindex += 1 357 | if cur_idx == len(input_ids): 358 | break 359 | idx += tokens_per_chunk 360 | return subchunks 361 | 362 | 363 | def process_chunk(conn: psycopg.Connection, page: Page, chunk: Chunk) -> None: 364 | if chunk.content == "": # discard empty chunks 365 | return 366 | 367 | chunk.token_count = len(ENC.encode(chunk.content)) 368 | if chunk.token_count < 10: # discard chunks that are too tiny to be useful 369 | return 370 | 371 | chunks = [chunk] 372 | 373 | if chunk.token_count > MAX_CHUNK_TOKENS: 374 | print( 375 | f"Chunk {chunk.header} too large ({chunk.token_count} tokens), splitting..." 376 | ) 377 | chunks = split_chunk(chunk) 378 | 379 | for chunk in chunks: 380 | insert_chunk(conn, page, chunk) 381 | conn.commit() 382 | 383 | 384 | def chunk_files(conn: psycopg.Connection, version: int) -> None: 385 | conn.execute("drop table if exists docs.postgres_chunks_tmp") 386 | conn.execute("drop table if exists docs.postgres_pages_tmp") 387 | conn.execute( 388 | "create table docs.postgres_pages_tmp (like docs.postgres_pages including all excluding constraints)" 389 | ) 390 | conn.execute( 391 | "insert into docs.postgres_pages_tmp select * from docs.postgres_pages where version != %s", 392 | [version], 393 | ) 394 | conn.execute( 395 | "create table docs.postgres_chunks_tmp (like docs.postgres_chunks including all excluding constraints)" 396 | ) 397 | conn.execute( 398 | "insert into docs.postgres_chunks_tmp select c.* from docs.postgres_chunks c inner join docs.postgres_pages p on c.page_id = p.id where p.version != %s", 399 | [version], 400 | ) 401 | conn.execute( 402 | "alter table docs.postgres_chunks_tmp add foreign key (page_id) references docs.postgres_pages_tmp(id) on delete cascade" 403 | ) 404 | conn.commit() 405 | 406 | # Reset the sequences for the temp tables 407 | conn.execute( 408 | "select setval(pg_get_serial_sequence('docs.postgres_chunks_tmp', 'id'), (select max(id) from docs.postgres_chunks_tmp))" 409 | ) 410 | conn.execute( 411 | "select setval(pg_get_serial_sequence('docs.postgres_pages_tmp', 'id'), (select max(id) from docs.postgres_pages_tmp))" 412 | ) 413 | conn.commit() 414 | 415 | header_pattern = re.compile("^(#{1,3}) .+$") 416 | codeblock_pattern = re.compile("^```") 417 | 418 | section_prefix = r"^[A-Za-z0-9.]+\.\s*" 419 | chapter_prefix = r"^Chapter\s+[0-9]+\.\s*" 420 | 421 | page_count = 0 422 | 423 | for md in MD_DIR.glob("*.md"): 424 | print(f"chunking {md}...") 425 | with md.open() as f: 426 | # process the frontmatter 427 | f.readline() 428 | f.readline() # title line 429 | slug = f.readline().split(":", 1)[1].strip() 430 | refentry = f.readline().split(":", 1)[1].strip().lower() == "true" 431 | f.readline() 432 | 433 | page = Page( 434 | id=0, 435 | version=version, 436 | url=f"{POSTGRES_BASE_URL}/{version}/{slug}", 437 | domain="postgresql.org", 438 | filename=md.name, 439 | ) 440 | 
page_count += 1 441 | 442 | insert_page(conn, page) 443 | 444 | header_path = [] 445 | idx = 0 446 | chunk: Chunk | None = None 447 | in_codeblock = False 448 | while True: 449 | line = f.readline() 450 | if line == "": 451 | if chunk is not None: 452 | process_chunk(conn, page, chunk) 453 | break 454 | match = header_pattern.match(line) 455 | if match is None or in_codeblock or (refentry and chunk is not None): 456 | assert chunk is not None 457 | if codeblock_pattern.match(line): 458 | in_codeblock = not in_codeblock 459 | chunk.content += line 460 | continue 461 | header_hases = match.group(1) 462 | depth = len(header_hases) 463 | header_path = header_path[: (depth - 1)] 464 | header = line.lstrip("#").strip() 465 | header = re.sub(section_prefix, "", header).strip() 466 | header = re.sub(chapter_prefix, "", header).strip() 467 | header_path.append(header) 468 | if chunk is not None: 469 | process_chunk(conn, page, chunk) 470 | chunk = Chunk( 471 | idx=idx, 472 | header=header, 473 | header_path=header_path.copy(), 474 | content="", 475 | ) 476 | idx += 1 477 | update_page_stats(conn, page) 478 | conn.commit() 479 | 480 | with conn.cursor() as cur: 481 | cur.execute("drop table docs.postgres_chunks") 482 | cur.execute("drop table docs.postgres_pages") 483 | cur.execute("alter table docs.postgres_chunks_tmp rename to postgres_chunks") 484 | cur.execute("alter table docs.postgres_pages_tmp rename to postgres_pages") 485 | 486 | # the auto create foreign key and index names include the _tmp_ bit in their 487 | # names, so we remove them so that they match the generated names for the 488 | # renamed tables. 489 | for table in ["postgres_pages", "postgres_chunks"]: 490 | cur.execute( 491 | """ 492 | select indexname 493 | from pg_indexes 494 | where schemaname = 'docs' 495 | and tablename = %s 496 | and indexname like %s 497 | """, 498 | [table, '%_tmp_%'], 499 | ) 500 | for row in cur.fetchall(): 501 | old_index_name = row[0] 502 | new_index_name = old_index_name.replace("_tmp_", "_") 503 | cur.execute( 504 | SQL( 505 | "alter index docs.{old_index_name} rename to {new_index_name}" 506 | ).format( 507 | old_index_name=Identifier(old_index_name), 508 | new_index_name=Identifier(new_index_name), 509 | ) 510 | ) 511 | 512 | cur.execute(""" 513 | select conname 514 | from pg_constraint 515 | where conrelid = to_regclass(%s) 516 | and contype = 'f' 517 | and conname like %s 518 | """, ['docs.postgres_chunks', '%_tmp_%']) 519 | for row in cur.fetchall(): 520 | old_fk_name = row[0] 521 | new_fk_name = old_fk_name.replace("_tmp_", "_") 522 | cur.execute( 523 | SQL( 524 | "alter table docs.postgres_chunks rename constraint {old_fk_name} to {new_fk_name}" 525 | ).format( 526 | old_fk_name=Identifier(old_fk_name), 527 | new_fk_name=Identifier(new_fk_name), 528 | ) 529 | ) 530 | 531 | conn.commit() 532 | 533 | print(f"Processed {page_count} pages.") 534 | 535 | 536 | def main(): 537 | parser = argparse.ArgumentParser( 538 | description="Ingest Postgres documentation into the database." 
539 | ) 540 | parser.add_argument("version", type=int, help="Postgres version to ingest") 541 | args = parser.parse_args() 542 | version = args.version 543 | update_repo() 544 | tag = get_version_tag(version) 545 | db_uri = f"postgresql://{os.environ['PGUSER']}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}" 546 | with psycopg.connect(db_uri) as conn: 547 | print(f"Building Postgres {version} ({tag}) documentation...") 548 | checkout_tag(tag) 549 | build_html() 550 | build_markdown() 551 | chunk_files(conn, version) 552 | 553 | 554 | if __name__ == "__main__": 555 | main() 556 | -------------------------------------------------------------------------------- /ingest/tiger_docs.py: -------------------------------------------------------------------------------- 1 | from scrapy.spiders import SitemapSpider 2 | from scrapy.crawler import CrawlerProcess 3 | from scrapy.utils.project import get_project_settings 4 | from bs4 import BeautifulSoup 5 | from markdownify import markdownify as md 6 | import os 7 | import re 8 | import sys 9 | import argparse 10 | import asyncio 11 | import time 12 | from urllib.parse import urlparse, urljoin 13 | import hashlib 14 | import requests 15 | import json 16 | import psycopg 17 | from psycopg.sql import SQL, Identifier 18 | import openai 19 | import tomllib 20 | from dotenv import load_dotenv, find_dotenv 21 | from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter 22 | 23 | script_dir = os.path.dirname(os.path.abspath(__file__)) 24 | 25 | if not os.path.exists(os.path.join(script_dir, 'build')): 26 | os.makedirs(os.path.join(script_dir, 'build')) 27 | 28 | load_dotenv(dotenv_path=os.path.join(script_dir, '..', '.env')) 29 | schema = 'docs' 30 | 31 | with open(os.path.join(script_dir, 'tiger_docs_config.toml'), 'rb') as config_fp: 32 | config = tomllib.load(config_fp) 33 | DOMAIN_SELECTORS = config['domain_selectors'] 34 | DEFAULT_SELECTORS = config['default_selectors'] 35 | 36 | 37 | def add_header_breadcrumbs_to_content(content, metadata): 38 | """Add header breadcrumbs to content - shared utility function""" 39 | breadcrumbs = [] 40 | 41 | # Find the deepest header level present in metadata 42 | present_headers = [] 43 | for level in ['Header 1', 'Header 2', 'Header 3']: 44 | if level in metadata: 45 | present_headers.append(level) 46 | 47 | # Add all headers except the last one (to avoid duplication with chunk content) 48 | for level in present_headers[:-1]: 49 | header_level = level.split()[-1] # Get "1", "2", "3" 50 | header_prefix = '#' * int(header_level) 51 | breadcrumbs.append(f"{header_prefix} {metadata[level]}") 52 | 53 | # Combine breadcrumbs with chunk content 54 | if breadcrumbs: 55 | breadcrumb_text = '\n'.join(breadcrumbs) + '\n\n' 56 | return breadcrumb_text + content 57 | else: 58 | return content 59 | 60 | class DatabaseManager: 61 | """Handles PostgreSQL database interactions for storing scraped content""" 62 | 63 | def __init__(self, database_uri, embedding_model=None): 64 | self.database_uri = database_uri 65 | self.embedding_model = embedding_model 66 | self.finalize_queries: list[SQL] = [] 67 | 68 | try: 69 | self.connection = psycopg.connect(self.database_uri) 70 | except Exception as e: 71 | raise RuntimeError(f"Database connection failed: {e}") 72 | 73 | def initialize(self): 74 | with self.connection.cursor() as cursor: 75 | cursor.execute(SQL("DROP TABLE IF EXISTS {schema}.timescale_chunks_tmp").format(schema=Identifier(schema))) 76 | 
cursor.execute(SQL("DROP TABLE IF EXISTS {schema}.timescale_pages_tmp").format(schema=Identifier(schema))) 77 | cursor.execute(SQL("CREATE TABLE {schema}.timescale_pages_tmp (LIKE {schema}.timescale_pages INCLUDING ALL EXCLUDING CONSTRAINTS)").format(schema=Identifier(schema))) 78 | cursor.execute(SQL("CREATE TABLE {schema}.timescale_chunks_tmp (LIKE {schema}.timescale_chunks INCLUDING ALL EXCLUDING CONSTRAINTS)").format(schema=Identifier(schema))) 79 | cursor.execute(SQL("ALTER TABLE {schema}.timescale_chunks_tmp ADD FOREIGN KEY (page_id) REFERENCES {schema}.timescale_pages_tmp(id) ON DELETE CASCADE").format(schema=Identifier(schema))) 80 | 81 | # The bm25 indexes have a bug that prevent inserting data into a table 82 | # underneath non-public schemas that has them, so we need to make remove 83 | # them from the tmp tables and recreate them after renaming. 84 | cursor.execute( 85 | """ 86 | SELECT indexname, indexdef 87 | FROM pg_indexes 88 | WHERE schemaname = %s 89 | AND tablename LIKE %s 90 | AND indexdef LIKE %s 91 | """, 92 | ["docs", "timescale%_tmp%", "%bm25%"], 93 | ) 94 | rows = cursor.fetchall() 95 | for row in rows: 96 | index_name = row[0] 97 | index_def = row[1] 98 | tmp_index_def = index_def.replace("_tmp", "") 99 | cursor.execute( 100 | SQL("DROP INDEX IF EXISTS {schema}.{index_name}").format( 101 | schema=Identifier(schema), 102 | index_name=Identifier(index_name), 103 | ) 104 | ) 105 | self.finalize_queries.append(SQL(tmp_index_def)) 106 | self.connection.commit() 107 | 108 | def finalize(self): 109 | """Rename the temporary tables and their indexes to the final names, dropping the old tables if they exist""" 110 | with self.connection.cursor() as cursor: 111 | cursor.execute(SQL("DROP TABLE IF EXISTS {schema}.timescale_chunks").format(schema=Identifier(schema))) 112 | cursor.execute(SQL("DROP TABLE IF EXISTS {schema}.timescale_pages").format(schema=Identifier(schema))) 113 | cursor.execute(SQL("ALTER TABLE {schema}.timescale_chunks_tmp RENAME TO timescale_chunks").format(schema=Identifier(schema))) 114 | cursor.execute(SQL("ALTER TABLE {schema}.timescale_pages_tmp RENAME TO timescale_pages").format(schema=Identifier(schema))) 115 | 116 | # the auto create foreign key and index names include the _tmp_ bit in their 117 | # names, so we remove them so that they match the generated names for the 118 | # renamed tables. 
119 | for table in ["timescale_pages", "timescale_chunks"]: 120 | cursor.execute( 121 | """ 122 | select indexname 123 | from pg_indexes 124 | where schemaname = %s 125 | and tablename = %s 126 | and indexname like %s 127 | """, 128 | [schema, table, '%_tmp_%'], 129 | ) 130 | for row in cursor.fetchall(): 131 | old_index_name = row[0] 132 | new_index_name = old_index_name.replace("_tmp_", "_") 133 | cursor.execute( 134 | SQL( 135 | "alter index {schema}.{old_index_name} rename to {new_index_name}" 136 | ).format( 137 | schema=Identifier(schema), 138 | old_index_name=Identifier(old_index_name), 139 | new_index_name=Identifier(new_index_name), 140 | ) 141 | ) 142 | 143 | cursor.execute( 144 | SQL(""" 145 | select conname 146 | from pg_constraint 147 | where conrelid = to_regclass(%s) 148 | and contype = 'f' 149 | and conname like %s 150 | """).format(schema=Identifier(schema)), 151 | [f"{schema}.timescale_chunks", '%_tmp_%'], 152 | ) 153 | for row in cursor.fetchall(): 154 | old_fk_name = row[0] 155 | new_fk_name = old_fk_name.replace("_tmp_", "_") 156 | cursor.execute( 157 | SQL( 158 | "alter table {schema}.timescale_chunks rename constraint {old_fk_name} to {new_fk_name}" 159 | ).format( 160 | schema=Identifier(schema), 161 | old_fk_name=Identifier(old_fk_name), 162 | new_fk_name=Identifier(new_fk_name), 163 | ) 164 | ) 165 | 166 | for query in self.finalize_queries: 167 | cursor.execute(query) 168 | 169 | self.connection.commit() 170 | 171 | def save_page(self, url, domain, filename, content_length, chunking_method='header'): 172 | """Save page information and return the page ID""" 173 | try: 174 | with ( 175 | self.connection.cursor() as cursor, 176 | self.connection.transaction() as _, 177 | ): 178 | cursor.execute(SQL(""" 179 | INSERT INTO {schema}.timescale_pages_tmp (url, domain, filename, content_length, chunking_method) 180 | VALUES (%s, %s, %s, %s, %s) 181 | ON CONFLICT (url) DO UPDATE SET 182 | content_length = EXCLUDED.content_length, 183 | chunking_method = EXCLUDED.chunking_method, 184 | scraped_at = CURRENT_TIMESTAMP 185 | RETURNING id 186 | """).format(schema=Identifier(schema)), (url, domain, filename, content_length, chunking_method)) 187 | 188 | page_id = cursor.fetchone()[0] 189 | 190 | # Delete existing chunks for this page (in case of re-scraping) 191 | cursor.execute(SQL("DELETE FROM {schema}.timescale_chunks WHERE page_id = %s").format(schema=Identifier(schema)), (page_id,)) 192 | 193 | return page_id 194 | 195 | except Exception as e: 196 | raise RuntimeError(f"Failed to save page {url}: {e}") 197 | 198 | def generate_embeddings_batch(self, texts): 199 | """Generate embeddings for a batch of texts using the configured embedding model""" 200 | if self.embedding_model is None: 201 | return [None] * len(texts) 202 | 203 | try: 204 | # Clean texts for embedding 205 | clean_texts = [] 206 | for text in texts: 207 | clean_text = text.strip() if text else "" 208 | clean_texts.append(clean_text) 209 | 210 | # Generate embeddings in batch using the model 211 | embeddings = self.embedding_model.get_text_embeddings(clean_texts) 212 | return embeddings 213 | 214 | except Exception as e: 215 | print(f"Warning: Failed to generate batch embeddings: {e}") 216 | return [None] * len(texts) 217 | 218 | def save_chunks(self, page_id, chunks): 219 | """Save chunks for a page with batch embedding generation""" 220 | try: 221 | # Prepare content with breadcrumbs for all chunks 222 | processed_chunks = [] 223 | chunk_texts = [] 224 | 225 | for chunk in chunks: 226 | 
content_with_breadcrumbs = add_header_breadcrumbs_to_content( 227 | chunk['content'], 228 | chunk['metadata'] 229 | ) 230 | processed_chunks.append({ 231 | 'content': content_with_breadcrumbs, 232 | 'metadata': chunk['metadata'] 233 | }) 234 | chunk_texts.append(content_with_breadcrumbs) 235 | 236 | # Generate embeddings for all chunks in batch 237 | embeddings = self.generate_embeddings_batch(chunk_texts) 238 | 239 | with ( 240 | self.connection.cursor() as cursor, 241 | self.connection.transaction() as _, 242 | ): 243 | for chunk, embedding in zip(processed_chunks, embeddings): 244 | cursor.execute(SQL(""" 245 | INSERT INTO {schema}.timescale_chunks_tmp (page_id, chunk_index, sub_chunk_index, content, metadata, embedding) 246 | VALUES (%s, %s, %s, %s, %s, %s) 247 | """).format(schema=Identifier(schema)), ( 248 | page_id, 249 | chunk['metadata'].get('chunk_index', 0), 250 | chunk['metadata'].get('sub_chunk_index', 0), 251 | chunk['content'], 252 | json.dumps(chunk['metadata']), 253 | embedding 254 | )) 255 | 256 | # Update chunks count in pages table 257 | cursor.execute(SQL(""" 258 | UPDATE {schema}.timescale_pages_tmp 259 | SET chunks_count = %s 260 | WHERE id = %s 261 | """).format(schema=Identifier(schema)), (len(chunks), page_id)) 262 | 263 | except Exception as e: 264 | raise RuntimeError(f"Failed to save chunks for page {page_id}: {e}") 265 | 266 | def get_scraped_page_count(self): 267 | """Get the number of pages scraped into the temporary tables""" 268 | with self.connection.cursor() as cursor: 269 | cursor.execute(SQL("SELECT COUNT(*) FROM {schema}.timescale_pages_tmp").format(schema=Identifier(schema))) 270 | return cursor.fetchone()[0] 271 | 272 | def close(self): 273 | """Close database connection""" 274 | if self.connection: 275 | self.connection.close() 276 | 277 | class FileManager: 278 | """Handles file-based storage for scraped content""" 279 | 280 | def __init__(self, output_dir='scraped_docs'): 281 | self.output_dir = output_dir 282 | # Create output directory if it doesn't exist 283 | os.makedirs(self.output_dir, exist_ok=True) 284 | 285 | def save_chunked_content(self, url, filename, chunks): 286 | """Save chunked content to a markdown file with delimiters""" 287 | filepath = os.path.join(self.output_dir, filename) 288 | 289 | # Create markdown with chunk delimiters 290 | chunked_markdown = f"# Source: {url}\n\n" 291 | chunked_markdown += f"\n\n" 292 | 293 | for i, chunk in enumerate(chunks): 294 | # Add chunk delimiter 295 | chunked_markdown += f"---\n\n" 296 | 297 | # Add metadata as comments 298 | if chunk['metadata']: 299 | chunked_markdown += f"\n" 300 | 301 | chunked_markdown += "---\n\n" 302 | 303 | # Add header breadcrumbs and content 304 | content_with_breadcrumbs = add_header_breadcrumbs_to_content( 305 | chunk['content'], 306 | chunk['metadata'] 307 | ) 308 | chunked_markdown += content_with_breadcrumbs 309 | chunked_markdown += "\n\n" 310 | 311 | with open(filepath, 'w', encoding='utf-8') as f: 312 | f.write(chunked_markdown) 313 | 314 | return filepath 315 | 316 | def save_regular_content(self, url, filename, content): 317 | """Save regular markdown content to a file""" 318 | filepath = os.path.join(self.output_dir, filename) 319 | 320 | with open(filepath, 'w', encoding='utf-8') as f: 321 | f.write(f"# Source: {url}\n\n") 322 | f.write(content) 323 | 324 | return filepath 325 | 326 | class SitemapMarkdownSpider(SitemapSpider): 327 | name = 'sitemap_markdown' 328 | 329 | def __init__(self, domain=None, output_dir='scraped_docs', max_pages=None, 
strip_data_images=True, chunk_content=True, chunking_method='header', db_manager=None, file_manager=None, url_prefix=None, *args, **kwargs): 330 | super(SitemapMarkdownSpider, self).__init__(*args, **kwargs) 331 | 332 | if not domain: 333 | raise ValueError("domain parameter is required") 334 | 335 | self.domain = domain 336 | self.output_dir = output_dir 337 | self.max_pages = int(max_pages) if max_pages else None 338 | self.should_strip_data_images = strip_data_images if isinstance(strip_data_images, bool) else strip_data_images.lower() == 'true' 339 | self.should_chunk_content = chunk_content if isinstance(chunk_content, bool) else chunk_content.lower() == 'true' 340 | self.chunking_method = chunking_method # 'header' or 'semantic' 341 | self.allowed_domains = [domain] 342 | self.url_prefix = url_prefix # e.g., '/docs' to only scrape URLs under that path 343 | 344 | # Use passed-in storage managers 345 | self.db_manager = db_manager 346 | self.file_manager = file_manager 347 | 348 | # Get sitemap URLs from robots.txt or fallback to default 349 | self.sitemap_urls = self.get_sitemap_urls(domain) 350 | 351 | # Track processed URLs to avoid duplicates 352 | self.processed_urls = set() 353 | # Track number of pages processed 354 | self.pages_processed = 0 355 | 356 | # Configure domain-specific element removal 357 | self.ignore_selectors = self.get_ignore_selectors(domain) 358 | 359 | def _init_default_embedding_model(self): 360 | """Initialize OpenAI embedding model for database storage""" 361 | try: 362 | if not os.getenv('OPENAI_API_KEY'): 363 | raise ValueError("OPENAI_API_KEY environment variable is required for database storage with embeddings") 364 | 365 | self.logger.info("Initializing OpenAI embedding client") 366 | client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY')) 367 | 368 | # Create a simple wrapper class for the OpenAI client 369 | class OpenAIEmbeddingWrapper: 370 | def __init__(self, client): 371 | self.client = client 372 | self.model = "text-embedding-3-small" 373 | 374 | def get_text_embeddings(self, texts): 375 | """Generate embeddings for a batch of texts""" 376 | response = self.client.embeddings.create( 377 | input=texts, 378 | model=self.model 379 | ) 380 | return [embedding.embedding for embedding in response.data] 381 | 382 | return OpenAIEmbeddingWrapper(client) 383 | 384 | except Exception as e: 385 | raise RuntimeError(f"Failed to initialize OpenAI embeddings: {e}") 386 | 387 | def get_sitemap_urls(self, domain): 388 | """Get sitemap URLs from robots.txt, fallback to common locations""" 389 | sitemap_urls = [] 390 | 391 | # Try to get sitemaps from robots.txt 392 | robots_url = f'https://{domain}/robots.txt' 393 | try: 394 | self.logger.info(f'Checking robots.txt at: {robots_url}') 395 | response = requests.get(robots_url, timeout=10) 396 | response.raise_for_status() 397 | 398 | # Parse robots.txt for sitemap entries 399 | for line in response.text.split('\n'): 400 | line = line.strip() 401 | if line.lower().startswith('sitemap:'): 402 | sitemap_url = line.split(':', 1)[1].strip() 403 | # Handle relative URLs 404 | if not sitemap_url.startswith('http'): 405 | sitemap_url = urljoin(f'https://{domain}/', sitemap_url) 406 | # Filter to only include docs sitemaps if url_prefix is set 407 | if self.url_prefix: 408 | if self.url_prefix in sitemap_url: 409 | sitemap_urls.append(sitemap_url) 410 | self.logger.info(f'Found docs sitemap in robots.txt: {sitemap_url}') 411 | else: 412 | sitemap_urls.append(sitemap_url) 413 | self.logger.info(f'Found sitemap in 
robots.txt: {sitemap_url}') 414 | 415 | except Exception as e: 416 | self.logger.warning(f'Could not fetch robots.txt from {robots_url}: {e}') 417 | 418 | # If no sitemaps found in robots.txt, try common locations 419 | if not sitemap_urls: 420 | common_sitemap_locations = [ 421 | f'https://{domain}/sitemap.xml', 422 | f'https://{domain}/sitemap_index.xml', 423 | f'https://{domain}/sitemap.txt' 424 | ] 425 | # If url_prefix is set, also try prefix-specific sitemaps 426 | if self.url_prefix: 427 | common_sitemap_locations = [ 428 | f'https://{domain}{self.url_prefix}/sitemap.xml', 429 | f'https://{domain}{self.url_prefix}/sitemap-0.xml', 430 | ] + common_sitemap_locations 431 | 432 | for sitemap_url in common_sitemap_locations: 433 | try: 434 | self.logger.info(f'Trying common sitemap location: {sitemap_url}') 435 | response = requests.head(sitemap_url, timeout=10) 436 | if response.status_code == 200: 437 | sitemap_urls.append(sitemap_url) 438 | self.logger.info(f'Found sitemap at: {sitemap_url}') 439 | break 440 | except Exception as e: 441 | self.logger.debug(f'Sitemap not found at {sitemap_url}: {e}') 442 | 443 | # If still no sitemap found, return empty list and let Scrapy handle the error 444 | if not sitemap_urls: 445 | self.logger.error(f'No sitemap found for domain: {domain}') 446 | 447 | return sitemap_urls 448 | 449 | def get_ignore_selectors(self, domain): 450 | """Get CSS selectors to ignore for specific domains""" 451 | # Get domain-specific selectors, fallback to default 452 | selectors = DOMAIN_SELECTORS.get(domain, DEFAULT_SELECTORS.copy()) 453 | 454 | # Also check for subdomain matches (e.g., subdomain.readthedocs.io) 455 | if selectors == DEFAULT_SELECTORS: 456 | for known_domain, known_selectors in DOMAIN_SELECTORS.items(): 457 | if known_domain in domain: 458 | selectors = known_selectors.copy() 459 | break 460 | 461 | self.logger.info(f'Using ignore selectors for {domain}: {selectors}') 462 | return selectors 463 | 464 | def strip_data_images(self, soup): 465 | """Remove elements with data: src attributes""" 466 | data_images_removed = 0 467 | 468 | # Only remove img tags with data: src 469 | for img in soup.find_all('img', src=True): 470 | if img['src'].startswith('data:'): 471 | img.decompose() 472 | data_images_removed += 1 473 | 474 | if data_images_removed > 0: 475 | self.logger.debug(f'Removed {data_images_removed} data: images') 476 | 477 | return soup 478 | 479 | def convert_callouts_to_admonitions(self, soup): 480 | """Convert div.callout elements with h6 to admonition-style markdown callouts""" 481 | callouts_converted = 0 482 | 483 | # Map of h6 text to admonition types 484 | admonition_map = { 485 | 'warning': ':warning:', 486 | 'note': ':information_source:', 487 | 'tip': ':bulb:', 488 | 'important': ':exclamation:', 489 | 'caution': ':warning:', 490 | 'danger': ':no_entry:', 491 | 'info': ':information_source:', 492 | 'example': ':memo:', 493 | 'see also': ':point_right:', 494 | } 495 | 496 | for callout_div in soup.find_all('div', class_='callout'): 497 | h6 = callout_div.find('h6') 498 | if not h6: 499 | continue 500 | 501 | h6_text = h6.get_text().strip().lower() 502 | 503 | # Find matching admonition type 504 | admonition_icon = None 505 | for keyword, icon in admonition_map.items(): 506 | if keyword in h6_text: 507 | admonition_icon = icon 508 | break 509 | 510 | # Default to info if no match 511 | if not admonition_icon: 512 | admonition_icon = ':information_source:' 513 | 514 | # Create blockquote with icon and h6 text 515 | blockquote = 
soup.new_tag('blockquote') 516 | 517 | # Add the h6 text with icon as first paragraph 518 | header_p = soup.new_tag('p') 519 | header_p.string = f"{admonition_icon} {h6.get_text().strip()}" 520 | blockquote.append(header_p) 521 | 522 | # Remove the h6 from callout div 523 | h6.decompose() 524 | 525 | # Move all remaining content from callout div to blockquote 526 | for child in list(callout_div.children): 527 | if child.name: # Skip text nodes 528 | blockquote.append(child.extract()) 529 | 530 | # Replace callout div with blockquote 531 | callout_div.replace_with(blockquote) 532 | callouts_converted += 1 533 | 534 | if callouts_converted > 0: 535 | self.logger.debug(f'Converted {callouts_converted} callout divs to admonitions') 536 | 537 | return soup 538 | 539 | def clean_code_blocks(self, soup): 540 | """Clean up code block HTML structure before markdown conversion""" 541 | code_blocks_cleaned = 0 542 | 543 | # Find code blocks with token-line structure 544 | for code_container in soup.find_all(['pre', 'code']): 545 | token_lines = code_container.find_all('div', class_='token-line') 546 | 547 | if token_lines: 548 | # Extract text from each token line and join with newlines 549 | lines = [] 550 | for line_div in token_lines: 551 | # Get text content from line-content span or the div itself 552 | line_content = line_div.find(attrs={'data-line_content': 'true'}) 553 | if line_content: 554 | lines.append(line_content.get_text()) 555 | else: 556 | lines.append(line_div.get_text()) 557 | 558 | # Replace the complex structure with simple text 559 | code_container.clear() 560 | code_container.string = '\n'.join(lines) 561 | code_blocks_cleaned += 1 562 | 563 | if code_blocks_cleaned > 0: 564 | self.logger.debug(f'Cleaned {code_blocks_cleaned} code blocks') 565 | 566 | return soup 567 | 568 | def extract_anchor_links(self, text): 569 | """Extract markdown anchor links from text (only internal #anchors)""" 570 | import re 571 | 572 | # Pattern to match markdown links that are internal anchors: [text](#anchor) 573 | anchor_pattern = r'\[([^\]]+)\]\(#([^)]+)\)' 574 | 575 | anchors = [] 576 | for match in re.finditer(anchor_pattern, text): 577 | link_text = match.group(1) 578 | anchor_id = match.group(2) 579 | 580 | anchors.append({ 581 | 'text': link_text, 582 | 'anchor': anchor_id 583 | }) 584 | 585 | return anchors 586 | 587 | 588 | def semantic_chunk_with_openai(self, markdown_text, url): 589 | """Use OpenAI to identify semantic boundaries for chunking using split identifiers""" 590 | try: 591 | # Initialize OpenAI client 592 | client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY')) 593 | 594 | # Split text into lines for LLM processing 595 | lines = markdown_text.split('\n') 596 | small_chunks = [line for line in lines if line.strip()] # Filter out empty lines 597 | 598 | # Add chunk identifiers 599 | chunked_input = '' 600 | for i, chunk in enumerate(small_chunks): 601 | chunked_input += f"<|start_chunk_{i+1}|>{chunk}<|end_chunk_{i+1}|>" 602 | 603 | # Create prompt for semantic boundary identification 604 | system_prompt = """You are an assistant specialized in splitting text into thematically consistent sections. 605 | The text has been divided into chunks, each marked with <|start_chunk_X|> and <|end_chunk_X|> tags, where X is the chunk number. 606 | Your task is to identify the points where splits should occur, such that consecutive chunks of similar themes stay together. 
607 | 608 | Focus on: 609 | - Topic changes or conceptual shifts 610 | - Natural reading breaks that maintain context 611 | - Keeping related examples, tables, code blocks, and explanations together 612 | - Ensuring each chunk contains complete thoughts/concepts 613 | - Prefer to split at markdown headers 614 | 615 | Respond with a list of chunk IDs where you believe a split should be made. For example, if chunks 1 and 2 belong together but chunk 3 starts a new topic, you would suggest a split after chunk 2. THE CHUNKS MUST BE IN ASCENDING ORDER. 616 | Your response should be in the form: 'split_after: 2, 5, 8'.""" 617 | 618 | user_prompt = f"""CHUNKED_TEXT: {chunked_input} 619 | 620 | Respond only with the IDs of the chunks where you believe a split should occur. YOU MUST RESPOND WITH AT LEAST ONE SPLIT. THESE SPLITS MUST BE IN ASCENDING ORDER.""" 621 | 622 | # Call OpenAI API 623 | response = client.chat.completions.create( 624 | model="gpt-4o", # Use cost-effective model 625 | messages=[ 626 | {"role": "system", "content": system_prompt}, 627 | {"role": "user", "content": user_prompt} 628 | ], 629 | temperature=0.1, # Low temperature for consistent results 630 | max_tokens=300 631 | ) 632 | 633 | # Parse response to get split positions 634 | result_string = response.choices[0].message.content.strip() 635 | 636 | # Extract numbers from response 637 | try: 638 | # Find the line containing split_after 639 | split_after_lines = [line for line in result_string.split('\n') if 'split_after:' in line] 640 | if not split_after_lines: 641 | # Fallback: extract all numbers from response 642 | numbers = re.findall(r'\d+', result_string) 643 | else: 644 | numbers = re.findall(r'\d+', split_after_lines[0]) 645 | 646 | split_indices = list(map(int, numbers)) 647 | 648 | # Validate that numbers are in ascending order 649 | if split_indices != sorted(split_indices): 650 | raise ValueError(f"Split indices not in ascending order for {url}: {split_indices}") 651 | 652 | except Exception as e: 653 | raise ValueError(f"Could not parse OpenAI response for {url}: {e}") 654 | 655 | # Convert chunk IDs to split indices (0-based) 656 | chunks_to_split_after = [i - 1 for i in split_indices if i > 0 and i <= len(small_chunks)] 657 | 658 | # Create final chunks by combining lines based on split points 659 | final_chunks = [] 660 | current_chunk_lines = [] 661 | 662 | for i, line in enumerate(small_chunks): 663 | current_chunk_lines.append(line) 664 | if i in chunks_to_split_after or i == len(small_chunks) - 1: 665 | if current_chunk_lines: 666 | # Join lines back with newlines 667 | chunk_content = '\n'.join(current_chunk_lines) 668 | 669 | # Extract anchor links from chunk content 670 | content_anchors = self.extract_anchor_links(chunk_content) 671 | 672 | # Create metadata 673 | chunk_metadata = { 674 | 'source_url': url, 675 | 'chunk_index': len(final_chunks), 676 | 'sub_chunk_index': 0, 677 | 'chunking_method': 'semantic_openai', 678 | 'line_range': f"{i - len(current_chunk_lines) + 1}-{i}" 679 | } 680 | 681 | # Add anchor information to metadata 682 | if content_anchors: 683 | chunk_metadata['anchor_links'] = content_anchors 684 | chunk_metadata['anchor_count'] = len(content_anchors) 685 | chunk_metadata['anchor_ids'] = [a['anchor'] for a in content_anchors] 686 | 687 | final_chunks.append({ 688 | 'content': chunk_content, 689 | 'metadata': chunk_metadata 690 | }) 691 | current_chunk_lines = [] 692 | 693 | self.logger.debug(f'Created {len(final_chunks)} semantic chunks using OpenAI from {len(small_chunks)} 
lines') 694 | return final_chunks 695 | 696 | except Exception as e: 697 | raise RuntimeError(f"OpenAI semantic chunking failed for {url}: {e}") 698 | 699 | def chunk_markdown_content_header_based(self, markdown_text, url): 700 | """Original header-based chunking method""" 701 | chunks = [] 702 | 703 | # Define headers to split on (up to h3) 704 | headers_to_split_on = [ 705 | ("#", "Header 1"), 706 | ("##", "Header 2"), 707 | ("###", "Header 3"), 708 | ] 709 | 710 | # First pass: split by markdown headers 711 | markdown_splitter = MarkdownHeaderTextSplitter( 712 | headers_to_split_on=headers_to_split_on, 713 | strip_headers=False # Keep headers in the chunks 714 | ) 715 | 716 | header_splits = markdown_splitter.split_text(markdown_text) 717 | 718 | # Second pass: recursive character splitting for large chunks 719 | text_splitter = RecursiveCharacterTextSplitter( 720 | chunk_size=2000, 721 | chunk_overlap=200, 722 | length_function=len, 723 | separators=["```", "\n\n", "\n", " ", ""] 724 | ) 725 | 726 | for i, doc in enumerate(header_splits): 727 | # Get the header metadata 728 | metadata = doc.metadata.copy() if hasattr(doc, 'metadata') else {} 729 | metadata['source_url'] = url 730 | metadata['chunk_index'] = i 731 | metadata['chunking_method'] = 'header_based' 732 | 733 | # Extract anchor links from headers (breadcrumb context) 734 | header_anchors = [] 735 | for level in ['Header 1', 'Header 2', 'Header 3']: 736 | if level in metadata: 737 | header_anchors.extend(self.extract_anchor_links(metadata[level])) 738 | 739 | # Split large chunks further 740 | sub_chunks = text_splitter.split_text(doc.page_content) 741 | 742 | for j, chunk_text in enumerate(sub_chunks): 743 | chunk_metadata = metadata.copy() 744 | chunk_metadata['sub_chunk_index'] = j 745 | 746 | # Extract anchor links from chunk content 747 | content_anchors = self.extract_anchor_links(chunk_text) 748 | 749 | # Combine header and content anchors, removing duplicates 750 | all_anchors = header_anchors + content_anchors 751 | unique_anchors = [] 752 | seen_anchors = set() 753 | for anchor in all_anchors: 754 | anchor_key = (anchor['text'], anchor['anchor']) 755 | if anchor_key not in seen_anchors: 756 | unique_anchors.append(anchor) 757 | seen_anchors.add(anchor_key) 758 | 759 | # Add anchor information to metadata 760 | if unique_anchors: 761 | chunk_metadata['anchor_links'] = unique_anchors 762 | chunk_metadata['anchor_count'] = len(unique_anchors) 763 | # Also create a simple list of anchor IDs for easier searching 764 | chunk_metadata['anchor_ids'] = [a['anchor'] for a in unique_anchors] 765 | 766 | chunks.append({ 767 | 'content': chunk_text, 768 | 'metadata': chunk_metadata 769 | }) 770 | 771 | self.logger.debug(f'Created {len(chunks)} chunks using header-based method') 772 | return chunks 773 | 774 | def chunk_markdown_content(self, markdown_text, url): 775 | """Route to appropriate chunking method based on configuration""" 776 | if self.chunking_method == 'semantic': 777 | return self.semantic_chunk_with_openai(markdown_text, url) 778 | else: # Default to header-based 779 | return self.chunk_markdown_content_header_based(markdown_text, url) 780 | 781 | def sitemap_filter(self, entries): 782 | """Filter sitemap entries to only include HTML pages under the url_prefix""" 783 | for entry in entries: 784 | # Only process HTML pages, skip images, PDFs, etc. 
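# Entries are then further filtered by url_prefix when it is set (default
# '/docs' via SCRAPER_URL_PREFIX), e.g. https://www.tigerdata.com/docs/...
# entries are yielded while https://www.tigerdata.com/blog/... entries are skipped.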
785 | if any(ext in entry['loc'] for ext in ['.pdf', '.jpg', '.png', '.gif', '.css', '.js', '.xml']): 786 | continue 787 | # If url_prefix is set, only include URLs that match the prefix 788 | if self.url_prefix: 789 | parsed = urlparse(entry['loc']) 790 | if not parsed.path.startswith(self.url_prefix): 791 | continue 792 | yield entry 793 | 794 | def parse(self, response): 795 | """Parse each page from the sitemap""" 796 | url = response.url 797 | 798 | # Skip if already processed 799 | if url in self.processed_urls: 800 | return 801 | 802 | # Check if we've reached the maximum number of pages 803 | if self.max_pages and self.pages_processed >= self.max_pages: 804 | self.logger.info(f'Reached maximum pages limit ({self.max_pages}), stopping crawler') 805 | self.crawler.engine.close_spider(self, 'max_pages_reached') 806 | return 807 | 808 | self.processed_urls.add(url) 809 | self.pages_processed += 1 810 | 811 | # Log the URL being processed 812 | self.logger.info(f'Processing: {url}') 813 | 814 | try: 815 | # Parse HTML with BeautifulSoup 816 | soup = BeautifulSoup(response.body, 'html.parser') 817 | 818 | # Remove elements based on configured selectors 819 | for selector in self.ignore_selectors: 820 | elements = soup.select(selector) 821 | for element in elements: 822 | element.decompose() 823 | if elements: 824 | self.logger.debug(f'Removed {len(elements)} elements matching: {selector}') 825 | 826 | # Strip data: images if requested 827 | if self.should_strip_data_images: 828 | soup = self.strip_data_images(soup) 829 | 830 | # Convert callout divs to admonitions 831 | soup = self.convert_callouts_to_admonitions(soup) 832 | 833 | # Clean up code block structure 834 | soup = self.clean_code_blocks(soup) 835 | 836 | # Find main content 837 | main_content = soup.find("main") or soup 838 | html_content = str(main_content) 839 | 840 | # Convert to markdown 841 | markdown_output = md(html_content, heading_style="ATX") 842 | 843 | # Generate filename from URL 844 | filename = self.generate_filename(url) 845 | filepath = os.path.join(self.output_dir, filename) 846 | 847 | if self.should_chunk_content: 848 | # Chunk the content 849 | chunks = self.chunk_markdown_content(markdown_output, url) 850 | 851 | if self.db_manager is not None: 852 | # Save to database 853 | page_id = self.db_manager.save_page( 854 | url=url, 855 | domain=self.domain, 856 | filename=filename, 857 | content_length=len(markdown_output), 858 | chunking_method=self.chunking_method 859 | ) 860 | 861 | self.logger.info(f'Generating embeddings for {len(chunks)} chunks from: {url}') 862 | self.db_manager.save_chunks(page_id, chunks) 863 | 864 | self.logger.info(f'Saved {len(chunks)} chunks with embeddings to database: {url}') 865 | 866 | if self.file_manager is not None: 867 | # Save to file 868 | filepath = self.file_manager.save_chunked_content(url, filename, chunks) 869 | self.logger.info(f'Saved {len(chunks)} chunks: {filepath}') 870 | 871 | return { 872 | 'url': url, 873 | 'filename': filename, 874 | 'content_length': len(markdown_output), 875 | 'chunks_count': len(chunks) 876 | } 877 | else: 878 | if self.db_manager is not None: 879 | # Save to database without chunking 880 | page_id = self.db_manager.save_page( 881 | url=url, 882 | domain=self.domain, 883 | filename=filename, 884 | content_length=len(markdown_output), 885 | chunking_method='none' 886 | ) 887 | # Save entire content as single chunk 888 | single_chunk = [{ 889 | 'content': markdown_output, 890 | 'metadata': { 891 | 'source_url': url, 892 | 'chunk_index': 0, 
893 | 'sub_chunk_index': 0, 894 | 'chunking_method': 'none' 895 | } 896 | }] 897 | self.db_manager.save_chunks(page_id, single_chunk) 898 | 899 | self.logger.info(f'Saved to database: {url}') 900 | 901 | if self.file_manager is not None: 902 | # Save to file 903 | filepath = self.file_manager.save_regular_content(url, filename, markdown_output) 904 | self.logger.info(f'Saved: {filepath}') 905 | 906 | return { 907 | 'url': url, 908 | 'filename': filename, 909 | 'content_length': len(markdown_output) 910 | } 911 | 912 | except Exception as e: 913 | self.logger.error(f'Error processing {url}: {str(e)}') 914 | return None 915 | 916 | def generate_filename(self, url): 917 | """Generate a safe filename from URL""" 918 | parsed = urlparse(url) 919 | path = parsed.path 920 | 921 | # Remove leading/trailing slashes and replace path separators 922 | path = path.strip('/') 923 | if not path: 924 | path = 'index' 925 | 926 | # Replace problematic characters 927 | safe_path = re.sub(r'[^\w\-_/]', '_', path) 928 | safe_path = re.sub(r'_+', '_', safe_path) # Replace multiple underscores 929 | safe_path = safe_path.replace('/', '_') 930 | 931 | # Ensure filename isn't too long 932 | if len(safe_path) > 100: 933 | # Create hash of original path and truncate 934 | hash_suffix = hashlib.md5(path.encode()).hexdigest()[:8] 935 | safe_path = safe_path[:80] + '_' + hash_suffix 936 | 937 | return f"{safe_path}.md" 938 | 939 | 940 | # Standalone script to run the spider 941 | if __name__ == "__main__": 942 | import argparse 943 | import sys 944 | from scrapy.crawler import CrawlerProcess 945 | from scrapy.utils.project import get_project_settings 946 | 947 | parser = argparse.ArgumentParser( 948 | description='Scrape websites using sitemaps and convert to chunked markdown for RAG applications', 949 | formatter_class=argparse.RawDescriptionHelpFormatter, 950 | epilog='''Examples: 951 | %(prog)s www.tigerdata.com 952 | %(prog)s www.tigerdata.com -o tiger_docs -m 50 953 | %(prog)s www.tigerdata.com -o semantic_docs -m 5 --chunking semantic 954 | %(prog)s www.tigerdata.com --no-chunk --no-strip-images -m 100 955 | %(prog)s www.tigerdata.com --storage-type database --database-uri postgresql://user:pass@host:5432/dbname 956 | %(prog)s www.tigerdata.com --storage-type database --chunking semantic -m 10 957 | ''' 958 | ) 959 | 960 | # Optional arguments 961 | parser.add_argument('--domain', '-d', 962 | help='Domain to scrape (e.g., www.tigerdata.com)') 963 | 964 | parser.add_argument('-o', '--output-dir', 965 | default='scraped_docs', 966 | help='Output directory for scraped files (default: scraped_docs)') 967 | 968 | parser.add_argument('-m', '--max-pages', 969 | type=int, 970 | help='Maximum number of pages to scrape (default: unlimited)') 971 | 972 | parser.add_argument('--strip-images', 973 | action='store_true', 974 | default=True, 975 | help='Strip data: images from content (default: True)') 976 | 977 | parser.add_argument('--no-strip-images', 978 | dest='strip_images', 979 | action='store_false', 980 | help='Keep data: images in content') 981 | 982 | parser.add_argument('--chunk', 983 | action='store_true', 984 | default=True, 985 | help='Enable content chunking (default: True)') 986 | 987 | parser.add_argument('--no-chunk', 988 | dest='chunk', 989 | action='store_false', 990 | help='Disable content chunking') 991 | 992 | parser.add_argument('--chunking', 993 | choices=['header', 'semantic'], 994 | default='header', 995 | help='Chunking method: header (default) or semantic (requires OPENAI_API_KEY)') 996 | 997 | # 
Storage options 998 | parser.add_argument('--storage-type', 999 | choices=['file', 'database'], 1000 | default='database', 1001 | help='Storage type: database (default) or file') 1002 | 1003 | parser.add_argument('--database-uri', 1004 | help='PostgreSQL connection URI (default: uses DB_URL from environment)') 1005 | 1006 | parser.add_argument('--skip-indexes', 1007 | action='store_true', 1008 | help='Skip creating database indexes after import (for development/testing)') 1009 | 1010 | parser.add_argument('--delay', 1011 | type=float, 1012 | default=1.0, 1013 | help='Download delay in seconds (default: 1.0)') 1014 | 1015 | parser.add_argument('--concurrent', 1016 | type=int, 1017 | default=4, 1018 | help='Maximum concurrent requests (default: 4)') 1019 | 1020 | parser.add_argument('--url-prefix', 1021 | help='URL path prefix to filter pages (e.g., /docs to only scrape URLs under /docs)') 1022 | 1023 | parser.add_argument('--log-level', 1024 | choices=['DEBUG', 'INFO', 'WARNING', 'ERROR'], 1025 | default='INFO', 1026 | help='Logging level (default: INFO)') 1027 | 1028 | parser.add_argument('--user-agent', 1029 | default='Mozilla/5.0 (compatible; DocumentationScraper)', 1030 | help='User agent string') 1031 | 1032 | # Set defaults from environment variables 1033 | parser.set_defaults( 1034 | database_uri=os.environ.get('DB_URL', f'postgresql://{os.environ["PGUSER"]}:{os.environ['PGPASSWORD']}@{os.environ['PGHOST']}:{os.environ['PGPORT']}/{os.environ['PGDATABASE']}'), 1035 | domain=os.environ.get('SCRAPER_DOMAIN', 'www.tigerdata.com'), 1036 | max_pages=int(os.environ.get('SCRAPER_MAX_PAGES', 0)) or None, 1037 | output_dir=os.environ.get('SCRAPER_OUTPUT_DIR', os.path.join(script_dir, 'build', 'scraped_docs')), 1038 | chunking=os.environ.get('SCRAPER_CHUNKING_METHOD', 'header'), 1039 | storage_type=os.environ.get('SCRAPER_STORAGE_TYPE', 'database'), 1040 | url_prefix=os.environ.get('SCRAPER_URL_PREFIX', '/docs') 1041 | ) 1042 | 1043 | args = parser.parse_args() 1044 | 1045 | # Validate semantic chunking requirements 1046 | if args.chunking == 'semantic': 1047 | if not os.getenv('OPENAI_API_KEY'): 1048 | print("Error: Semantic chunking requires OPENAI_API_KEY environment variable") 1049 | print("Set it with: export OPENAI_API_KEY=your_api_key") 1050 | print("Or create a .env file with: OPENAI_API_KEY=your_api_key") 1051 | sys.exit(1) 1052 | 1053 | # Configure Scrapy settings 1054 | settings = get_project_settings() 1055 | settings.update({ 1056 | 'USER_AGENT': args.user_agent, 1057 | 'ROBOTSTXT_OBEY': True, 1058 | 'DOWNLOAD_DELAY': args.delay, 1059 | 'RANDOMIZE_DOWNLOAD_DELAY': True, 1060 | 'CONCURRENT_REQUESTS': args.concurrent, 1061 | 'CONCURRENT_REQUESTS_PER_DOMAIN': min(args.concurrent, 2), 1062 | 'LOG_LEVEL': args.log_level, 1063 | }) 1064 | 1065 | print(f"Starting scraper for {args.domain}") 1066 | print(f"URL prefix: {args.url_prefix or 'none (all pages)'}") 1067 | print(f"Output directory: {args.output_dir}") 1068 | print(f"Max pages: {args.max_pages or 'unlimited'}") 1069 | print(f"Chunking: {'enabled' if args.chunk else 'disabled'} ({args.chunking})") 1070 | print(f"Strip images: {args.strip_images}") 1071 | print(f"Storage type: {args.storage_type}") 1072 | if args.storage_type == 'database': 1073 | print(f"Database URI: {args.database_uri}") 1074 | print() 1075 | 1076 | # Initialize storage managers 1077 | db_manager = None 1078 | file_manager = None 1079 | 1080 | if args.storage_type == 'database': 1081 | # Initialize embedding model for database storage (needed for both header and 
semantic) 1082 | client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY')) 1083 | 1084 | # Create embedding wrapper 1085 | class OpenAIEmbeddingWrapper: 1086 | def __init__(self, client): 1087 | self.client = client 1088 | self.model = "text-embedding-3-small" 1089 | 1090 | def get_text_embeddings(self, texts): 1091 | response = self.client.embeddings.create( 1092 | input=texts, 1093 | model=self.model 1094 | ) 1095 | return [embedding.embedding for embedding in response.data] 1096 | 1097 | embedding_model = OpenAIEmbeddingWrapper(client) 1098 | db_manager = DatabaseManager(database_uri=args.database_uri, embedding_model=embedding_model) 1099 | db_manager.initialize() 1100 | else: 1101 | file_manager = FileManager(args.output_dir) 1102 | 1103 | process = CrawlerProcess(settings) 1104 | process.crawl( 1105 | SitemapMarkdownSpider, 1106 | domain=args.domain, 1107 | output_dir=args.output_dir, 1108 | max_pages=args.max_pages, 1109 | strip_data_images=args.strip_images, 1110 | chunk_content=args.chunk, 1111 | chunking_method=args.chunking, 1112 | db_manager=db_manager, 1113 | file_manager=file_manager, 1114 | url_prefix=args.url_prefix 1115 | ) 1116 | process.start() 1117 | 1118 | # Create database indexes after scraping completes 1119 | if args.storage_type == 'database' and db_manager: 1120 | try: 1121 | # Check if any pages were scraped 1122 | page_count = db_manager.get_scraped_page_count() 1123 | print(f"Scraped {page_count} pages.") 1124 | 1125 | if page_count == 0: 1126 | print("Error: No pages were scraped. Aborting to preserve existing data.") 1127 | print("Check that the sitemap is accessible and the URL prefix is correct.") 1128 | raise SystemExit(1) 1129 | 1130 | if args.skip_indexes: 1131 | print("Skipping database finalization (--skip-indexes flag set).") 1132 | else: 1133 | print("Finalizing database...") 1134 | db_manager.finalize() 1135 | print("Database finalized successfully.") 1136 | except SystemExit: 1137 | raise 1138 | except Exception as e: 1139 | print(f"Failed to finish database: {e}") 1140 | raise SystemExit(1) 1141 | finally: 1142 | db_manager.close() 1143 | --------------------------------------------------------------------------------
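For reference, a minimal sketch (not a file in this repository) of how the chunks ingested by tiger_docs.py might be queried for semantic search, i.e. the data that backs the server's semanticSearchTigerDocs tool. It assumes the embedding column is a pgvector vector and that the <=> distance operator is available, as the add-hnsw-indexes migration suggests; the function name and query are illustrative, so adjust to the real schema if it differs.

import os
import openai
import psycopg

client = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])

def search_tiger_docs(conn: psycopg.Connection, query: str, limit: int = 5):
    # Embed the query with the same model used at ingest time.
    embedding = client.embeddings.create(
        model="text-embedding-3-small", input=[query]
    ).data[0].embedding
    with conn.cursor() as cur:
        # Rank chunks by vector distance and join back to the page URL.
        cur.execute(
            """
            SELECT p.url, c.content
            FROM docs.timescale_chunks c
            JOIN docs.timescale_pages p ON p.id = c.page_id
            ORDER BY c.embedding <=> %s::vector
            LIMIT %s
            """,
            [str(embedding), limit],
        )
        return cur.fetchall()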