├── .clineignore ├── .clinerules ├── .dockerignore ├── .env.example ├── .github └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .husky ├── commit-msg └── pre-commit ├── .releaserc.json ├── ARCHITECTURE.md ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── README.md ├── biome.json ├── commitlint.config.js ├── db └── migrations │ ├── 000-initial-schema.sql │ └── 001-add-indexed-at-column.sql ├── docker-compose.yml ├── docs └── docs-mcp-server.png ├── package-lock.json ├── package.json ├── postcss.config.cjs ├── src ├── index.ts ├── mcp │ ├── index.ts │ ├── mcpServer.ts │ ├── startHttpServer.ts │ ├── startStdioServer.ts │ ├── tools.ts │ └── utils.ts ├── pipeline │ ├── PipelineManager.test.ts │ ├── PipelineManager.ts │ ├── PipelineWorker.test.ts │ ├── PipelineWorker.ts │ ├── errors.ts │ ├── index.ts │ └── types.ts ├── scraper │ ├── ScraperRegistry.test.ts │ ├── ScraperRegistry.ts │ ├── ScraperService.test.ts │ ├── ScraperService.ts │ ├── fetcher │ │ ├── FileFetcher.test.ts │ │ ├── FileFetcher.ts │ │ ├── FingerprintGenerator.test.ts │ │ ├── FingerprintGenerator.ts │ │ ├── HttpFetcher.test.ts │ │ ├── HttpFetcher.ts │ │ ├── index.ts │ │ └── types.ts │ ├── index.ts │ ├── middleware │ │ ├── HtmlCheerioParserMiddleware.test.ts │ │ ├── HtmlCheerioParserMiddleware.ts │ │ ├── HtmlJsExecutorMiddleware.test.ts │ │ ├── HtmlJsExecutorMiddleware.ts │ │ ├── HtmlLinkExtractorMiddleware.test.ts │ │ ├── HtmlLinkExtractorMiddleware.ts │ │ ├── HtmlMetadataExtractorMiddleware.test.ts │ │ ├── HtmlMetadataExtractorMiddleware.ts │ │ ├── HtmlPlaywrightMiddleware.test.ts │ │ ├── HtmlPlaywrightMiddleware.ts │ │ ├── HtmlSanitizerMiddleware.test.ts │ │ ├── HtmlSanitizerMiddleware.ts │ │ ├── HtmlToMarkdownMiddleware.test.ts │ │ ├── HtmlToMarkdownMiddleware.ts │ │ ├── MarkdownLinkExtractorMiddleware.test.ts │ │ ├── MarkdownLinkExtractorMiddleware.ts │ │ ├── MarkdownMetadataExtractorMiddleware.test.ts │ │ ├── MarkdownMetadataExtractorMiddleware.ts │ │ ├── index.ts │ │ └── types.ts │ ├── 
pipelines │ │ ├── BasePipeline.test.ts │ │ ├── BasePipeline.ts │ │ ├── HtmlPipeline.test.ts │ │ ├── HtmlPipeline.ts │ │ ├── MarkdownPipeline.test.ts │ │ ├── MarkdownPipeline.ts │ │ └── types.ts │ ├── strategies │ │ ├── BaseScraperStrategy.test.ts │ │ ├── BaseScraperStrategy.ts │ │ ├── GitHubScraperStrategy.ts │ │ ├── LocalFileStrategy.test.ts │ │ ├── LocalFileStrategy.ts │ │ ├── NpmScraperStrategy.ts │ │ ├── PyPiScraperStrategy.ts │ │ ├── WebScraperStrategy.test.ts │ │ └── WebScraperStrategy.ts │ ├── types.ts │ └── utils │ │ ├── buffer.test.ts │ │ ├── buffer.ts │ │ ├── patternMatcher.test.ts │ │ ├── patternMatcher.ts │ │ ├── sandbox.test.ts │ │ ├── sandbox.ts │ │ ├── scope.test.ts │ │ └── scope.ts ├── splitter │ ├── GreedySplitter.test.ts │ ├── GreedySplitter.ts │ ├── SemanticMarkdownSplitter.test.ts │ ├── SemanticMarkdownSplitter.ts │ ├── errors.ts │ ├── index.ts │ ├── splitters │ │ ├── CodeContentSplitter.test.ts │ │ ├── CodeContentSplitter.ts │ │ ├── TableContentSplitter.test.ts │ │ ├── TableContentSplitter.ts │ │ ├── TextContentSplitter.test.ts │ │ ├── TextContentSplitter.ts │ │ └── types.ts │ └── types.ts ├── store │ ├── DocumentManagementService.test.ts │ ├── DocumentManagementService.ts │ ├── DocumentRetrieverService.test.ts │ ├── DocumentRetrieverService.ts │ ├── DocumentStore.test.ts │ ├── DocumentStore.ts │ ├── applyMigrations.ts │ ├── embeddings │ │ ├── EmbeddingFactory.test.ts │ │ ├── EmbeddingFactory.ts │ │ ├── FixedDimensionEmbeddings.test.ts │ │ └── FixedDimensionEmbeddings.ts │ ├── errors.ts │ ├── index.ts │ └── types.ts ├── tools │ ├── CancelJobTool.test.ts │ ├── CancelJobTool.ts │ ├── ClearCompletedJobsTool.test.ts │ ├── ClearCompletedJobsTool.ts │ ├── FetchUrlTool.test.ts │ ├── FetchUrlTool.ts │ ├── FindVersionTool.test.ts │ ├── FindVersionTool.ts │ ├── GetJobInfoTool.test.ts │ ├── GetJobInfoTool.ts │ ├── ListJobsTool.test.ts │ ├── ListJobsTool.ts │ ├── ListLibrariesTool.test.ts │ ├── ListLibrariesTool.ts │ ├── RemoveTool.test.ts │ ├── 
RemoveTool.ts │ ├── ScrapeTool.test.ts │ ├── ScrapeTool.ts │ ├── SearchTool.test.ts │ ├── SearchTool.ts │ ├── errors.test.ts │ ├── errors.ts │ └── index.ts ├── types │ └── index.ts ├── utils │ ├── config.ts │ ├── dom.ts │ ├── errors.ts │ ├── logger.ts │ ├── mimeTypeUtils.ts │ ├── paths.ts │ ├── string.ts │ ├── url.test.ts │ └── url.ts └── web │ ├── components │ ├── Alert.tsx │ ├── JobItem.tsx │ ├── JobList.tsx │ ├── Layout.tsx │ ├── LibraryDetailCard.tsx │ ├── LibraryItem.tsx │ ├── LibraryList.tsx │ ├── LibrarySearchCard.tsx │ ├── LoadingSpinner.tsx │ ├── ScrapeForm.tsx │ ├── ScrapeFormContent.tsx │ ├── SearchResultItem.tsx │ ├── SearchResultList.tsx │ ├── SearchResultSkeletonItem.tsx │ ├── Tooltip.tsx │ ├── VersionBadge.tsx │ ├── VersionDetailsRow.tsx │ └── utils.ts │ ├── main.client.ts │ ├── routes │ ├── index.tsx │ ├── jobs │ │ ├── cancel.tsx │ │ ├── clear-completed.tsx │ │ ├── list.tsx │ │ └── new.tsx │ └── libraries │ │ ├── detail.tsx │ │ └── list.tsx │ ├── styles │ └── main.css │ └── web.ts ├── tsconfig.json ├── vite.config.ts └── vite.config.web.ts /.clineignore: -------------------------------------------------------------------------------- 1 | package-lock.json 2 | dist/ 3 | .git/ 4 | *.log 5 | .env.* 6 | !.env.example 7 | !.github/ 8 | -------------------------------------------------------------------------------- /.clinerules: -------------------------------------------------------------------------------- 1 | # Cline Custom Instructions 2 | 3 | - You must read the `README.md` to understand the project structure and setup. 4 | - You must read the `ARCHITECTURE.md` file before making changes across multiple services. 5 | - You must follow DRY, KISS, YAGNI, and SOLID principles. 6 | - You must use the latest version of the programming language and libraries. 7 | - Prefer the simplest solution. 8 | - Never commit secrets, credentials, or sensitive data to the repository. 
9 | - When importing a relative path, avoid using file extensions like ".js" and ".ts". 10 | - Update TSDoc for all classes, methods and functions. Focus on functionality and reasoning. 11 | - NEVER document individual parameters or return values if their use can easily be derived from their name. 12 | - When asked to check the documentation of a library, use the `search_docs` tool. 13 | 14 | ## Architecture 15 | 16 | - Focus on system concepts and component relationships. 17 | - Put implementation details in source code. 18 | - Update `ARCHITECTURE.md` when the architecture changes. 19 | - Do not use special characters like braces in mermaid diagram titles or names. Quote them if necessary. 20 | 21 | ## Git 22 | 23 | - Branches must be created locally before pushing. 24 | - Branch names must be prefixed with type (`feature/`, `bugfix/`, `chore/`) and include the issue number if available (e.g., `feature/1234-description`). 25 | - All commit messages must use Conventional Commits (`feat:`, `fix:`, etc.). 26 | - Commit subject must be imperative mood and ≤72 characters. 27 | - If a commit body is present, add a blank line before it. 28 | - Commit body (for non-trivial changes) must explain what and why, not how. 29 | - Reference related issues in commit messages when relevant (e.g., `Closes #123`). 30 | - Do not include unrelated changes in a single commit. 31 | - Do not use vague or generic commit messages. 32 | - Pull request descriptions must summarize the what and why of all changes in the branch (not just a list of commits or the how). 33 | - Pull requests must target `main` unless specified otherwise. 34 | 35 | ## Typescript 36 | 37 | - Install dependencies using `npm install` 38 | - Prefer a specific type or `unknown` over `any`. 39 | - Do not use non-null assertions (`!`). Use optional chaining (`?.`) or nullish coalescing (`??`). 40 | 41 | ## Logging Guidelines 42 | 43 | - Use `console.*` for CLI user output (results, direct feedback). 
44 | - Use `logger.info/warn/error` for meaningful application events; prefix with a relevant emoji. 45 | - Use `logger.debug` for detailed developer/tracing logs; no emoji prefix. 46 | - Prefer `logger.debug` over `logger.info` for granular internal steps to reduce log verbosity. 47 | 48 | ## Web UI 49 | 50 | - Use AlpineJS for frontend components and TailwindCSS for styling. 51 | - Use TSX with kitajs for AlpineJS components. 52 | - Use HTMX for server-side interactions. 53 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Docker specific 2 | Dockerfile 3 | .dockerignore 4 | 5 | # Version control 6 | .git 7 | .github 8 | .husky 9 | 10 | # Already in gitignore but explicitly listed for Docker context 11 | node_modules 12 | dist 13 | *.log 14 | .env* 15 | 16 | # Other 17 | .store 18 | README.md 19 | ARCHITECTURE.md 20 | CHANGELOG.md 21 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Embedding Model Configuration 2 | # Optional: Format is "provider:model_name" or just "model_name" for OpenAI (default) 3 | # Examples: 4 | # - openai:text-embedding-3-small (default if no provider specified) 5 | # - vertex:text-embedding-004 (Google Cloud Vertex AI) 6 | # - gemini:gemini-embedding-exp-03-07 (Google Generative AI) 7 | # - aws:amazon.titan-embed-text-v1 8 | # - microsoft:text-embedding-ada-002 9 | DOCS_MCP_EMBEDDING_MODEL= 10 | 11 | # OpenAI Provider Configuration (Default) 12 | # Required for OpenAI provider or as fallback 13 | OPENAI_API_KEY=your-key-here 14 | # Optional: Your OpenAI Organization ID 15 | OPENAI_ORG_ID= 16 | # Optional: Custom base URL for OpenAI-compatible APIs (e.g., Ollama, Azure OpenAI) 17 | OPENAI_API_BASE= 18 | 19 | # Google Cloud Vertex AI Configuration 20 | # Required for 
vertex provider: Path to service account JSON key file 21 | GOOGLE_APPLICATION_CREDENTIALS=/path/to/gcp-key.json 22 | 23 | # Google Generative AI (Gemini) Configuration 24 | # Required for gemini provider: Google API key 25 | GOOGLE_API_KEY=your-google-api-key 26 | 27 | # AWS Bedrock Configuration 28 | # Required for aws provider 29 | AWS_ACCESS_KEY_ID=your-aws-key 30 | AWS_SECRET_ACCESS_KEY=your-aws-secret 31 | AWS_REGION=us-east-1 32 | # Optional: Use BEDROCK_AWS_REGION instead of AWS_REGION if needed 33 | # BEDROCK_AWS_REGION=us-east-1 34 | 35 | # Azure OpenAI Configuration 36 | # Required for microsoft provider 37 | AZURE_OPENAI_API_KEY=your-azure-key 38 | AZURE_OPENAI_API_INSTANCE_NAME=your-instance 39 | AZURE_OPENAI_API_DEPLOYMENT_NAME=your-deployment 40 | AZURE_OPENAI_API_VERSION=2024-02-01 41 | 42 | # Optional: Specify a custom directory to store the SQLite database file (documents.db). 43 | # If set, this path takes precedence over the default locations. 44 | # Default behavior (if unset): 45 | # 1. Uses './.store/' in the project root if it exists (legacy). 46 | # 2. Falls back to OS-specific data directory (e.g., ~/Library/Application Support/docs-mcp-server on macOS). 
47 | # DOCS_MCP_STORE_PATH=/path/to/your/desired/storage/directory 48 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | lint: 11 | name: Lint 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout code 15 | uses: actions/checkout@v4 16 | 17 | - name: Set up Node.js 18 | uses: actions/setup-node@v4 19 | with: 20 | node-version: '>=20.0.0' # Match engines requirement in package.json 21 | cache: 'npm' 22 | 23 | - name: Install dependencies 24 | run: npm ci 25 | 26 | - name: Run linter 27 | run: npm run lint 28 | 29 | test: 30 | name: Test 31 | runs-on: ubuntu-latest 32 | needs: lint # Run after linting passes 33 | steps: 34 | - name: Checkout code 35 | uses: actions/checkout@v4 36 | 37 | - name: Set up Node.js 38 | uses: actions/setup-node@v4 39 | with: 40 | node-version: '>=20.0.0' 41 | cache: 'npm' 42 | 43 | - name: Install dependencies 44 | run: npm ci 45 | 46 | - name: Install Playwright browsers 47 | run: npx playwright install --no-shell --with-deps chromium 48 | 49 | - name: Run tests 50 | run: npx vitest run 51 | 52 | build: 53 | name: Build 54 | runs-on: ubuntu-latest 55 | needs: test # Run after tests pass 56 | steps: 57 | - name: Checkout code 58 | uses: actions/checkout@v4 59 | 60 | - name: Set up Node.js 61 | uses: actions/setup-node@v4 62 | with: 63 | node-version: '>=20.0.0' 64 | cache: 'npm' 65 | 66 | - name: Install dependencies 67 | run: npm ci 68 | 69 | - name: Run build 70 | run: npm run build 71 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | workflow_dispatch: # Allows manual triggering 5 | 6 | jobs: 7 | 
release: 8 | name: Release to npm and GitHub 9 | runs-on: ubuntu-latest 10 | # Permissions needed for semantic-release to commit/tag/release 11 | permissions: 12 | contents: write 13 | issues: write 14 | pull-requests: write 15 | # id-token: write # Needed for OIDC trusted publishing (if not using NPM_TOKEN) 16 | outputs: 17 | # Output whether a new release was published 18 | new_release_published: ${{ steps.semantic.outputs.new_release_published }} 19 | new_release_version: ${{ steps.semantic.outputs.new_release_version }} 20 | steps: 21 | - name: Checkout code 22 | # Need fetch-depth: 0 for semantic-release to analyze all relevant commits 23 | # and commit package.json/CHANGELOG.md changes 24 | uses: actions/checkout@v4 25 | with: 26 | fetch-depth: 0 27 | 28 | - name: Set up Node.js 29 | uses: actions/setup-node@v4 30 | with: 31 | node-version: '>=20.0.0' # Match engines requirement in package.json 32 | registry-url: 'https://registry.npmjs.org' # Specify npm registry 33 | cache: 'npm' 34 | 35 | - name: Install dependencies 36 | run: npm ci 37 | 38 | - name: Run build 39 | run: npm run build 40 | 41 | - name: Run semantic-release 42 | id: semantic # Give step an ID to reference its outputs 43 | uses: cycjimmy/semantic-release-action@v4 44 | env: 45 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 46 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} 47 | 48 | docker_publish: 49 | name: Build and Push Docker Image to GHCR 50 | # Run only after the release job completes successfully 51 | needs: release 52 | # Run only if semantic-release actually published a new version 53 | if: needs.release.outputs.new_release_published == 'true' 54 | runs-on: ubuntu-latest 55 | permissions: 56 | contents: read # Needed to check out the code 57 | packages: write # Needed to push Docker image to GHCR 58 | attestations: write # Needed for build attestations 59 | id-token: write # Needed for OIDC (good practice) 60 | 61 | steps: 62 | - name: Checkout code 63 | # Checkout the specific commit 
tagged by semantic-release 64 | uses: actions/checkout@v4 65 | with: 66 | # Use the tag name determined by the release job 67 | ref: v${{ needs.release.outputs.new_release_version }} 68 | 69 | - name: Set up Docker Buildx 70 | uses: docker/setup-buildx-action@v3 71 | 72 | - name: Log in to GitHub Container Registry 73 | uses: docker/login-action@v3 74 | with: 75 | registry: ghcr.io 76 | username: ${{ github.actor }} 77 | password: ${{ secrets.GITHUB_TOKEN }} 78 | 79 | - name: Extract Docker metadata 80 | id: meta 81 | uses: docker/metadata-action@v5 82 | with: 83 | images: ghcr.io/${{ github.repository }} 84 | # Use the version from the semantic-release output 85 | tags: | 86 | type=raw,value=${{ needs.release.outputs.new_release_version }} # e.g., v1.4.1 87 | type=semver,pattern={{version}},value=${{ needs.release.outputs.new_release_version }} # e.g., 1.4.1 88 | type=semver,pattern=v{{major}}.{{minor}},value=${{ needs.release.outputs.new_release_version }} # e.g., v1.4 89 | type=semver,pattern=v{{major}},value=${{ needs.release.outputs.new_release_version }} # e.g., v1 90 | type=raw,value=latest,enable=true # Always tag latest on main branch release 91 | 92 | - name: Build and push Docker image 93 | id: push 94 | uses: docker/build-push-action@v6 95 | with: 96 | context: . 
97 | push: true 98 | tags: ${{ steps.meta.outputs.tags }} 99 | labels: ${{ steps.meta.outputs.labels }} 100 | cache-from: type=gha 101 | cache-to: type=gha,mode=max 102 | platforms: linux/amd64,linux/arm64 # Build for both x86_64 and arm64 (Mac Silicon) 103 | 104 | - name: Generate artifact attestation 105 | uses: actions/attest-build-provenance@v1 106 | with: 107 | subject-name: ghcr.io/${{ github.repository }} 108 | subject-digest: ${{ steps.push.outputs.digest }} 109 | push-to-registry: true 110 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | dist/ 3 | .store/ 4 | public/assets/ 5 | *.log 6 | .env* 7 | !*.env.example 8 | *.code-workspace 9 | -------------------------------------------------------------------------------- /.husky/commit-msg: -------------------------------------------------------------------------------- 1 | npx commitlint --edit $1 2 | -------------------------------------------------------------------------------- /.husky/pre-commit: -------------------------------------------------------------------------------- 1 | npx lint-staged 2 | -------------------------------------------------------------------------------- /.releaserc.json: -------------------------------------------------------------------------------- 1 | { 2 | "branches": ["main"], 3 | "plugins": [ 4 | "@semantic-release/commit-analyzer", 5 | "@semantic-release/release-notes-generator", 6 | [ 7 | "@semantic-release/changelog", 8 | { 9 | "changelogFile": "CHANGELOG.md" 10 | } 11 | ], 12 | [ 13 | "@semantic-release/npm", 14 | { 15 | "npmPublish": true, 16 | "pkgRoot": "." 
17 | } 18 | ], 19 | [ 20 | "@semantic-release/git", 21 | { 22 | "assets": ["package.json", "CHANGELOG.md"], 23 | "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}" 24 | } 25 | ], 26 | [ 27 | "@semantic-release/github", 28 | { 29 | "assets": [] 30 | } 31 | ] 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build stage 2 | FROM node:22-slim AS builder 3 | 4 | WORKDIR /app 5 | 6 | # Copy package files 7 | COPY package*.json ./ 8 | 9 | # Install dependencies 10 | RUN npm ci 11 | 12 | # Copy source code 13 | COPY . . 14 | 15 | # Build application 16 | RUN npm run build 17 | 18 | # Production stage 19 | FROM node:22-slim 20 | 21 | WORKDIR /app 22 | 23 | # Copy package files 24 | COPY package*.json . 25 | COPY db db 26 | 27 | # Install production dependencies only 28 | RUN npm ci --omit=dev 29 | 30 | # Install system Chromium and required dependencies 31 | RUN apt-get update \ 32 | && apt-get install -y --no-install-recommends chromium \ 33 | && apt-get clean \ 34 | && rm -rf /var/lib/apt/lists/* /tmp/* \ 35 | && CHROMIUM_PATH=$(command -v chromium || command -v chromium-browser) \ 36 | && if [ -z "$CHROMIUM_PATH" ]; then echo "Chromium executable not found!" 
&& exit 1; fi \ 37 | && if [ "$CHROMIUM_PATH" != "/usr/bin/chromium" ]; then echo "Unexpected Chromium path: $CHROMIUM_PATH" && exit 1; fi \ 38 | && echo "Chromium installed at $CHROMIUM_PATH" 39 | 40 | # Set Playwright to use system Chromium (hardcoded path, as ENV cannot use shell vars) 41 | ENV PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH=/usr/bin/chromium 42 | 43 | # Copy built files from builder 44 | COPY --from=builder /app/dist ./dist 45 | COPY --from=builder /app/public ./public 46 | 47 | # Set data directory for the container 48 | ENV DOCS_MCP_STORE_PATH=/data 49 | 50 | # Define volumes 51 | VOLUME /data 52 | 53 | # Expose the ports the applications listen on 54 | EXPOSE 6280 55 | EXPOSE 6281 56 | 57 | # Set the command to run the application 58 | ENTRYPOINT ["node", "dist/index.js"] 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Andre Rabold 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /biome.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://biomejs.dev/schema.json", 3 | "formatter": { 4 | "enabled": true, 5 | "formatWithErrors": false, 6 | "ignore": [], 7 | "attributePosition": "auto", 8 | "indentStyle": "space", 9 | "indentWidth": 2, 10 | "lineWidth": 90, 11 | "lineEnding": "lf" 12 | }, 13 | "files": { 14 | "include": ["src/**/*.ts"] 15 | }, 16 | "overrides": [ 17 | { 18 | "include": ["src/**/*.test.ts"], 19 | "linter": { 20 | "rules": { 21 | "style": { 22 | "noNonNullAssertion": "off" 23 | } 24 | } 25 | } 26 | } 27 | ] 28 | } 29 | -------------------------------------------------------------------------------- /commitlint.config.js: -------------------------------------------------------------------------------- 1 | // commitlint.config.js 2 | /** @type {import('@commitlint/types').UserConfig} */ 3 | export default { 4 | extends: ["@commitlint/config-conventional"], 5 | rules: { 6 | "body-max-line-length": [0, "always"], 7 | "footer-max-line-length": [0, "always"], 8 | }, 9 | }; 10 | -------------------------------------------------------------------------------- /db/migrations/000-initial-schema.sql: -------------------------------------------------------------------------------- 1 | -- Initial database schema setup 2 | 3 | -- Documents table 4 | CREATE TABLE IF NOT EXISTS documents( 5 | id INTEGER PRIMARY KEY AUTOINCREMENT, 6 | library TEXT NOT NULL, 7 | version TEXT NOT NULL DEFAULT '', 8 | url TEXT NOT NULL, 9 | content TEXT, 10 | metadata JSON, 11 | sort_order INTEGER NOT NULL, 12 | UNIQUE(url, library, version, 
sort_order) 13 | ); 14 | 15 | -- Indexes 16 | CREATE INDEX IF NOT EXISTS idx_documents_library_lower ON documents(lower(library)); 17 | CREATE INDEX IF NOT EXISTS idx_documents_version_lower ON documents(lower(library), lower(version)); 18 | 19 | -- Create Embeddings virtual table 20 | -- Note: Dimension is hardcoded here based on the value in schema.ts at the time of creation. 21 | -- If VECTOR_DIMENSION changes, a separate migration would be needed to update/recreate this table. 22 | CREATE VIRTUAL TABLE IF NOT EXISTS documents_vec USING vec0( 23 | library TEXT NOT NULL, 24 | version TEXT NOT NULL, 25 | embedding FLOAT[1536] 26 | ); 27 | 28 | -- Create FTS5 virtual table 29 | CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5( 30 | content, 31 | title, 32 | url, 33 | path, 34 | tokenize='porter unicode61', 35 | content='documents', 36 | content_rowid='id' 37 | ); 38 | 39 | -- Delete trigger to maintain FTS index 40 | CREATE TRIGGER IF NOT EXISTS documents_fts_after_delete AFTER DELETE ON documents BEGIN 41 | INSERT INTO documents_fts(documents_fts, rowid, content, title, url, path) 42 | VALUES('delete', old.id, old.content, json_extract(old.metadata, '$.title'), old.url, json_extract(old.metadata, '$.path')); 43 | END; 44 | 45 | -- Update trigger to maintain FTS index 46 | CREATE TRIGGER IF NOT EXISTS documents_fts_after_update AFTER UPDATE ON documents BEGIN 47 | INSERT INTO documents_fts(documents_fts, rowid, content, title, url, path) 48 | VALUES('delete', old.id, old.content, json_extract(old.metadata, '$.title'), old.url, json_extract(old.metadata, '$.path')); 49 | INSERT INTO documents_fts(rowid, content, title, url, path) 50 | VALUES(new.id, new.content, json_extract(new.metadata, '$.title'), new.url, json_extract(new.metadata, '$.path')); 51 | END; 52 | 53 | -- Insert trigger to maintain FTS index 54 | CREATE TRIGGER IF NOT EXISTS documents_fts_after_insert AFTER INSERT ON documents BEGIN 55 | INSERT INTO documents_fts(rowid, content, title, url, 
path) 56 | VALUES(new.id, new.content, json_extract(new.metadata, '$.title'), new.url, json_extract(new.metadata, '$.path')); 57 | END; 58 | -------------------------------------------------------------------------------- /db/migrations/001-add-indexed-at-column.sql: -------------------------------------------------------------------------------- 1 | -- Add indexed_at column to track when documents were last indexed 2 | -- Step 1: Add the column allowing NULLs (SQLite limitation workaround) 3 | ALTER TABLE documents ADD COLUMN indexed_at DATETIME; 4 | 5 | -- Step 2: Update existing rows to set the timestamp 6 | UPDATE documents SET indexed_at = CURRENT_TIMESTAMP WHERE indexed_at IS NULL; 7 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | docs-mcp-server: 3 | image: ghcr.io/arabold/docs-mcp-server:latest 4 | build: 5 | context: . 6 | dockerfile: Dockerfile 7 | command: ["--protocol", "http", "--port", "6280"] 8 | # platform: linux/amd64 9 | container_name: docs-mcp-server 10 | ports: 11 | - "6280:6280" 12 | env_file: 13 | - .env 14 | environment: 15 | - MCP_PORT=6280 16 | volumes: 17 | - docs-mcp-data:/data 18 | 19 | docs-mcp-web: 20 | image: ghcr.io/arabold/docs-mcp-server:latest 21 | build: 22 | context: . 
23 | dockerfile: Dockerfile 24 | command: ["web", "--port", "6281"] 25 | # platform: linux/amd64 26 | container_name: docs-mcp-web 27 | ports: 28 | - "6281:6281" 29 | env_file: 30 | - .env 31 | environment: 32 | - WEB_PORT=6281 33 | volumes: 34 | - docs-mcp-data:/data 35 | 36 | volumes: 37 | docs-mcp-data: 38 | name: docs-mcp-data 39 | -------------------------------------------------------------------------------- /docs/docs-mcp-server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arabold/docs-mcp-server/2c6fb88ac09b82baea2068e878855d5b78e2a6e2/docs/docs-mcp-server.png -------------------------------------------------------------------------------- /postcss.config.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: { 3 | "@tailwindcss/postcss": {}, // Use the dedicated PostCSS plugin 4 | autoprefixer: {}, 5 | }, 6 | }; 7 | -------------------------------------------------------------------------------- /src/mcp/index.ts: -------------------------------------------------------------------------------- 1 | import "dotenv/config"; 2 | import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; 3 | import type { PipelineManager } from "../pipeline/PipelineManager"; 4 | import type { DocumentManagementService } from "../store/DocumentManagementService"; 5 | import { logger } from "../utils/logger"; 6 | import { startHttpServer } from "./startHttpServer"; 7 | import { startStdioServer } from "./startStdioServer"; 8 | import { type McpServerTools, initializeTools } from "./tools"; 9 | 10 | // Variables to hold server instances for cleanup 11 | let runningServer: McpServer | null = null; 12 | 13 | export async function startServer( 14 | protocol: "stdio" | "http", 15 | docService: DocumentManagementService, // NEW PARAM 16 | pipelineManager: PipelineManager, // NEW PARAM 17 | port?: number, // Existing optional param 
18 | ) { 19 | try { 20 | // Initialize and get shared tools 21 | const tools: McpServerTools = await initializeTools(docService, pipelineManager); // Pass instances 22 | 23 | let serverInstance: McpServer; 24 | if (protocol === "stdio") { 25 | serverInstance = await startStdioServer(tools); // startStdioServer needs to return McpServer 26 | } else if (protocol === "http") { 27 | if (port === undefined) { 28 | logger.error("❌ HTTP protocol requires a port."); 29 | process.exit(1); 30 | } 31 | serverInstance = await startHttpServer(tools, port); // startHttpServer needs to return McpServer 32 | } else { 33 | // This case should be caught by src/server.ts, but handle defensively 34 | logger.error(`❌ Unknown protocol: ${protocol}`); 35 | process.exit(1); 36 | } 37 | 38 | // Capture the running server instance 39 | runningServer = serverInstance; 40 | } catch (error) { 41 | logger.error(`❌ Fatal Error during server startup: ${error}`); 42 | // Attempt cleanup even if startup failed partially 43 | await stopServer(); 44 | process.exit(1); 45 | } 46 | } 47 | 48 | /** 49 | * Stops the MCP server instance gracefully. 50 | * Shared services (PipelineManager, DocumentManagementService) are shut down 51 | * separately by the caller (e.g., src/index.ts). 52 | */ 53 | export async function stopServer() { 54 | logger.debug("Attempting to close MCP Server instance..."); 55 | let hadError = false; 56 | try { 57 | if (runningServer) { 58 | logger.debug("Closing MCP Server instance (McpServer/McpHttpServer)..."); 59 | await runningServer.close(); 60 | logger.debug("MCP Server instance closed."); 61 | } else { 62 | logger.debug("MCP Server instance was not running or already null."); 63 | } 64 | } catch (e) { 65 | logger.error(`❌ Error closing MCP Server instance: ${e}`); 66 | hadError = true; 67 | } 68 | 69 | runningServer = null; 70 | // DocumentManagementService and PipelineManager instances are managed and shut down by src/index.ts. 
71 | 72 | if (hadError) { 73 | logger.warn("⚠️ MCP Server instance stopped with errors."); 74 | } else { 75 | logger.info("✅ MCP Server instance stopped."); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/mcp/startStdioServer.ts: -------------------------------------------------------------------------------- 1 | import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; 2 | import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; 3 | import { LogLevel, logger, setLogLevel } from "../utils/logger"; 4 | import { createMcpServerInstance } from "./mcpServer"; 5 | import type { McpServerTools } from "./tools"; 6 | 7 | /** 8 | * Starts the MCP server using the Stdio transport. 9 | * @param tools The shared tool instances. 10 | * @returns The created McpServer instance. 11 | */ 12 | export async function startStdioServer(tools: McpServerTools): Promise { 13 | setLogLevel(LogLevel.ERROR); 14 | 15 | // Create a server instance using the factory and shared tools 16 | const server = createMcpServerInstance(tools); 17 | 18 | // Start server with Stdio transport 19 | const transport = new StdioServerTransport(); 20 | await server.connect(transport); 21 | logger.info("🤖 MCP server listening on stdio"); 22 | 23 | // Return the server instance 24 | return server; 25 | } 26 | -------------------------------------------------------------------------------- /src/mcp/tools.ts: -------------------------------------------------------------------------------- 1 | import type { PipelineManager } from "../pipeline/PipelineManager"; 2 | import { FileFetcher, HttpFetcher } from "../scraper/fetcher"; 3 | import type { DocumentManagementService } from "../store/DocumentManagementService"; 4 | import { 5 | CancelJobTool, 6 | FetchUrlTool, 7 | FindVersionTool, 8 | GetJobInfoTool, 9 | ListJobsTool, 10 | ListLibrariesTool, 11 | RemoveTool, 12 | ScrapeTool, 13 | SearchTool, 14 | } from "../tools"; 15 | 
16 | /** 17 | * Interface for the shared tool instances. 18 | */ 19 | export interface McpServerTools { 20 | listLibraries: ListLibrariesTool; 21 | findVersion: FindVersionTool; 22 | scrape: ScrapeTool; 23 | search: SearchTool; 24 | listJobs: ListJobsTool; 25 | getJobInfo: GetJobInfoTool; 26 | cancelJob: CancelJobTool; 27 | remove: RemoveTool; 28 | fetchUrl: FetchUrlTool; 29 | } 30 | 31 | /** 32 | * Initializes and returns the shared tool instances. 33 | * This should be called after initializeServices has completed. 34 | * @param docService The initialized DocumentManagementService instance. 35 | * @param pipelineManager The initialized PipelineManager instance. 36 | * @returns An object containing all instantiated tool instances. 37 | */ 38 | export async function initializeTools( 39 | docService: DocumentManagementService, 40 | pipelineManager: PipelineManager, 41 | ): Promise<McpServerTools> { 42 | const tools: McpServerTools = { 43 | listLibraries: new ListLibrariesTool(docService), 44 | findVersion: new FindVersionTool(docService), 45 | scrape: new ScrapeTool(docService, pipelineManager), 46 | search: new SearchTool(docService), 47 | listJobs: new ListJobsTool(pipelineManager), 48 | getJobInfo: new GetJobInfoTool(pipelineManager), 49 | cancelJob: new CancelJobTool(pipelineManager), 50 | // clearCompletedJobs: new ClearCompletedJobsTool(pipelineManager), 51 | remove: new RemoveTool(docService, pipelineManager), 52 | fetchUrl: new FetchUrlTool(new HttpFetcher(), new FileFetcher()), 53 | }; 54 | 55 | return tools; 56 | } 57 | -------------------------------------------------------------------------------- /src/mcp/utils.ts: -------------------------------------------------------------------------------- 1 | import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; 2 | 3 | /** 4 | * Creates a success response object in the format expected by the MCP server. 5 | * @param text The text content of the response. 6 | * @returns The response object. 
7 | */ 8 | export function createResponse(text: string): CallToolResult { 9 | return { 10 | content: [ 11 | { 12 | type: "text", 13 | text, 14 | }, 15 | ], 16 | isError: false, 17 | }; 18 | } 19 | 20 | /** 21 | * Creates an error response object in the format expected by the MCP server. 22 | * @param text The error message. 23 | * @returns The response object. 24 | */ 25 | export function createError(text: string): CallToolResult { 26 | return { 27 | content: [ 28 | { 29 | type: "text", 30 | text, 31 | }, 32 | ], 33 | isError: true, 34 | }; 35 | } 36 | -------------------------------------------------------------------------------- /src/pipeline/PipelineWorker.ts: -------------------------------------------------------------------------------- 1 | import type { ScraperService } from "../scraper"; 2 | import type { ScraperProgress } from "../scraper/types"; 3 | import type { DocumentManagementService } from "../store"; 4 | import { logger } from "../utils/logger"; 5 | import { CancellationError } from "./errors"; 6 | import type { PipelineJob, PipelineManagerCallbacks } from "./types"; 7 | 8 | /** 9 | * Executes a single document processing job. 10 | * Handles scraping, storing documents, and reporting progress/errors via callbacks. 11 | */ 12 | export class PipelineWorker { 13 | // Dependencies are passed in, making the worker stateless regarding specific jobs 14 | private readonly store: DocumentManagementService; 15 | private readonly scraperService: ScraperService; 16 | 17 | // Constructor accepts dependencies needed for execution 18 | constructor(store: DocumentManagementService, scraperService: ScraperService) { 19 | this.store = store; 20 | this.scraperService = scraperService; 21 | } 22 | 23 | /** 24 | * Executes the given pipeline job. 25 | * @param job - The job to execute. 26 | * @param callbacks - Callbacks provided by the manager for reporting. 
27 | */ 28 | async executeJob(job: PipelineJob, callbacks: PipelineManagerCallbacks): Promise<void> { 29 | const { id: jobId, library, version, options, abortController } = job; 30 | const signal = abortController.signal; 31 | 32 | logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`); 33 | 34 | try { 35 | // --- Core Job Logic --- 36 | await this.scraperService.scrape( 37 | options, 38 | async (progress: ScraperProgress) => { 39 | // Check for cancellation signal before processing each document 40 | if (signal.aborted) { 41 | throw new CancellationError("Job cancelled during scraping progress"); 42 | } 43 | 44 | // Update job object directly (manager holds the reference) 45 | job.progress = progress; 46 | // Report progress via manager's callback 47 | await callbacks.onJobProgress?.(job, progress); 48 | 49 | if (progress.document) { 50 | try { 51 | // TODO: Pass signal to store.addDocument if it supports it 52 | await this.store.addDocument(library, version, { 53 | pageContent: progress.document.content, 54 | metadata: progress.document.metadata, 55 | }); 56 | logger.debug( 57 | `[${jobId}] Stored document: ${progress.document.metadata.url}`, 58 | ); 59 | } catch (docError) { 60 | logger.error( 61 | `❌ [${jobId}] Failed to store document ${progress.document.metadata.url}: ${docError}`, 62 | ); 63 | // Report document-specific errors via manager's callback 64 | await callbacks.onJobError?.( 65 | job, 66 | docError instanceof Error ? docError : new Error(String(docError)), 67 | progress.document, 68 | ); 69 | // Decide if a single document error should fail the whole job 70 | // For now, we log and continue. To fail, re-throw here. 
71 | } 72 | } 73 | }, 74 | signal, // Pass signal to scraper service 75 | ); 76 | // --- End Core Job Logic --- 77 | 78 | // Check signal one last time after scrape finishes 79 | if (signal.aborted) { 80 | throw new CancellationError("Job cancelled"); 81 | } 82 | 83 | // If successful and not cancelled, the manager will handle status update 84 | logger.debug(`[${jobId}] Worker finished job successfully.`); 85 | } catch (error) { 86 | // Re-throw error to be caught by the manager in _runJob 87 | logger.warn(`⚠️ [${jobId}] Worker encountered error: ${error}`); 88 | throw error; 89 | } 90 | // Note: The manager (_runJob) is responsible for updating final job status (COMPLETED/FAILED/CANCELLED) 91 | // and resolving/rejecting the completion promise based on the outcome here. 92 | } 93 | 94 | // --- Old methods removed --- 95 | // process() 96 | // stop() 97 | // setCallbacks() 98 | // handleScrapingProgress() 99 | } 100 | -------------------------------------------------------------------------------- /src/pipeline/errors.ts: -------------------------------------------------------------------------------- 1 | export class PipelineError extends Error { 2 | constructor( 3 | message: string, 4 | public readonly cause?: Error, 5 | ) { 6 | super(message); 7 | this.name = this.constructor.name; 8 | if (cause?.stack) { 9 | this.stack = `${this.stack}\nCaused by: ${cause.stack}`; 10 | } 11 | } 12 | } 13 | 14 | export class DocumentProcessingError extends PipelineError { 15 | constructor( 16 | message: string, 17 | public readonly documentId: string, 18 | cause?: Error, 19 | ) { 20 | super(`Failed to process document ${documentId}: ${message}`, cause); 21 | } 22 | } 23 | 24 | export class PipelineStateError extends PipelineError {} 25 | 26 | /** 27 | * Error indicating that an operation was cancelled. 
28 | */ 29 | export class CancellationError extends PipelineError { 30 | constructor(message = "Operation cancelled") { 31 | super(message); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/pipeline/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./PipelineManager"; 2 | export * from "./PipelineWorker"; 3 | export * from "./errors"; 4 | -------------------------------------------------------------------------------- /src/pipeline/types.ts: -------------------------------------------------------------------------------- 1 | import type { ScraperOptions, ScraperProgress } from "../scraper/types"; 2 | import type { Document } from "../types"; // Use local Document type 3 | 4 | /** 5 | * Represents the possible states of a pipeline job. 6 | */ 7 | export enum PipelineJobStatus { 8 | QUEUED = "queued", 9 | RUNNING = "running", 10 | COMPLETED = "completed", 11 | FAILED = "failed", 12 | CANCELLING = "cancelling", 13 | CANCELLED = "cancelled", 14 | } 15 | 16 | /** 17 | * Represents a single document processing job within the pipeline. 18 | */ 19 | export interface PipelineJob { 20 | /** Unique identifier for the job. */ 21 | id: string; 22 | /** The library name associated with the job. */ 23 | library: string; 24 | /** The library version associated with the job. */ 25 | version: string; 26 | /** Options provided for the scraper. */ 27 | options: ScraperOptions; 28 | /** Current status of the job. */ 29 | status: PipelineJobStatus; 30 | /** Detailed progress information. */ 31 | progress: ScraperProgress | null; 32 | /** Error object if the job failed. */ 33 | error: Error | null; 34 | /** Timestamp when the job was created. */ 35 | createdAt: Date; 36 | /** Timestamp when the job started running. */ 37 | startedAt: Date | null; 38 | /** Timestamp when the job finished (completed, failed, or cancelled). 
*/ 39 | finishedAt: Date | null; 40 | /** AbortController to signal cancellation. */ 41 | abortController: AbortController; 42 | /** Promise that resolves/rejects when the job finishes. */ 43 | completionPromise: Promise<void>; 44 | /** Resolver function for the completion promise. */ 45 | resolveCompletion: () => void; 46 | /** Rejector function for the completion promise. */ 47 | rejectCompletion: (reason?: unknown) => void; 48 | } 49 | 50 | /** 51 | * Defines the structure for callback functions used with the PipelineManager. 52 | * Allows external components to hook into job lifecycle events. 53 | */ 54 | export interface PipelineManagerCallbacks { 55 | /** Callback triggered when a job's status changes. */ 56 | onJobStatusChange?: (job: PipelineJob) => Promise<void>; 57 | /** Callback triggered when a job makes progress. */ 58 | onJobProgress?: (job: PipelineJob, progress: ScraperProgress) => Promise<void>; 59 | /** Callback triggered when a job encounters an error during processing (e.g., storing a doc). 
*/ 60 | onJobError?: (job: PipelineJob, error: Error, document?: Document) => Promise<void>; 61 | } 62 | -------------------------------------------------------------------------------- /src/scraper/ScraperRegistry.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it, vi } from "vitest"; 2 | import { ScraperError } from "../utils/errors"; 3 | import { ScraperRegistry } from "./ScraperRegistry"; 4 | import { GitHubScraperStrategy } from "./strategies/GitHubScraperStrategy"; 5 | import { LocalFileStrategy } from "./strategies/LocalFileStrategy"; 6 | import { NpmScraperStrategy } from "./strategies/NpmScraperStrategy"; 7 | import { PyPiScraperStrategy } from "./strategies/PyPiScraperStrategy"; 8 | 9 | vi.mock("../utils/logger"); 10 | 11 | describe("ScraperRegistry", () => { 12 | it("should throw error for unknown URLs", () => { 13 | const registry = new ScraperRegistry(); 14 | expect(() => registry.getStrategy("invalid://example.com")).toThrow(ScraperError); 15 | expect(() => registry.getStrategy("invalid://example.com")).toThrow( 16 | "No strategy found for URL", 17 | ); 18 | }); 19 | 20 | it("should return LocalFileStrategy for file:// URLs", () => { 21 | const registry = new ScraperRegistry(); 22 | const strategy = registry.getStrategy("file:///path/to/file.txt"); 23 | expect(strategy).toBeInstanceOf(LocalFileStrategy); 24 | }); 25 | 26 | it("should return GitHubScraperStrategy for GitHub URLs", () => { 27 | const registry = new ScraperRegistry(); 28 | const strategy = registry.getStrategy("https://github.com/user/repo"); 29 | expect(strategy).toBeInstanceOf(GitHubScraperStrategy); 30 | }); 31 | 32 | it("should return NpmScraperStrategy for NPM URLs", () => { 33 | const registry = new ScraperRegistry(); 34 | const strategy = registry.getStrategy("https://npmjs.com/package/test"); 35 | expect(strategy).toBeInstanceOf(NpmScraperStrategy); 36 | }); 37 | 38 | it("should return PyPiScraperStrategy for PyPI 
URLs", () => { 39 | const registry = new ScraperRegistry(); 40 | const strategy = registry.getStrategy("https://pypi.org/project/test"); 41 | expect(strategy).toBeInstanceOf(PyPiScraperStrategy); 42 | }); 43 | }); 44 | -------------------------------------------------------------------------------- /src/scraper/ScraperRegistry.ts: -------------------------------------------------------------------------------- 1 | import { ScraperError } from "../utils/errors"; 2 | import { validateUrl } from "../utils/url"; 3 | import { GitHubScraperStrategy } from "./strategies/GitHubScraperStrategy"; 4 | import { LocalFileStrategy } from "./strategies/LocalFileStrategy"; 5 | import { NpmScraperStrategy } from "./strategies/NpmScraperStrategy"; 6 | import { PyPiScraperStrategy } from "./strategies/PyPiScraperStrategy"; 7 | import { WebScraperStrategy } from "./strategies/WebScraperStrategy"; 8 | import type { ScraperStrategy } from "./types"; 9 | 10 | export class ScraperRegistry { 11 | private strategies: ScraperStrategy[]; 12 | 13 | constructor() { 14 | this.strategies = [ 15 | new NpmScraperStrategy(), 16 | new PyPiScraperStrategy(), 17 | new GitHubScraperStrategy(), 18 | new WebScraperStrategy(), 19 | new LocalFileStrategy(), 20 | ]; 21 | } 22 | 23 | getStrategy(url: string): ScraperStrategy { 24 | validateUrl(url); 25 | const strategy = this.strategies.find((s) => s.canHandle(url)); 26 | if (!strategy) { 27 | throw new ScraperError(`No strategy found for URL: ${url}`); 28 | } 29 | return strategy; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/scraper/ScraperService.ts: -------------------------------------------------------------------------------- 1 | import type { ProgressCallback } from "../types"; 2 | import { ScraperError } from "../utils/errors"; 3 | import type { ScraperRegistry } from "./ScraperRegistry"; 4 | import type { ScraperOptions, ScraperProgress } from "./types"; 5 | 6 | /** 7 | * Orchestrates 
document scraping operations using registered scraping strategies. 8 | * Automatically selects appropriate strategy based on URL patterns. 9 | */ 10 | export class ScraperService { 11 | private registry: ScraperRegistry; 12 | 13 | constructor(registry: ScraperRegistry) { 14 | this.registry = registry; 15 | } 16 | 17 | /** 18 | * Scrapes content from the provided URL using the appropriate strategy. 19 | * Reports progress via callback and handles errors. 20 | */ 21 | async scrape( 22 | options: ScraperOptions, 23 | progressCallback: ProgressCallback, 24 | signal?: AbortSignal, // Add optional signal parameter 25 | ): Promise { 26 | // Find strategy for this URL 27 | const strategy = this.registry.getStrategy(options.url); 28 | if (!strategy) { 29 | throw new ScraperError(`No scraper strategy found for URL: ${options.url}`, false); 30 | } 31 | 32 | // Pass the signal down to the strategy 33 | await strategy.scrape(options, progressCallback, signal); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/scraper/fetcher/FileFetcher.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs/promises"; 2 | import path from "node:path"; 3 | import * as mime from "mime-types"; 4 | import { ScraperError } from "../../utils/errors"; 5 | import { logger } from "../../utils/logger"; 6 | import type { ContentFetcher, FetchOptions, RawContent } from "./types"; 7 | 8 | /** 9 | * Fetches content from local file system. 10 | */ 11 | export class FileFetcher implements ContentFetcher { 12 | canFetch(source: string): boolean { 13 | return source.startsWith("file://"); 14 | } 15 | 16 | /** 17 | * Fetches the content of a file given a file:// URL, decoding percent-encoded paths as needed. 18 | * Only HTML and Markdown files are processed. 
19 | */ 20 | async fetch(source: string, options?: FetchOptions): Promise<RawContent> { 21 | // Always decode the file path from file:// URL 22 | const rawPath = source.replace("file://", ""); 23 | const filePath = decodeURIComponent(rawPath); 24 | 25 | try { 26 | const content = await fs.readFile(filePath); 27 | const ext = path.extname(filePath).toLowerCase(); 28 | const mimeType = mime.lookup(ext) || "application/octet-stream"; 29 | return { 30 | content, 31 | mimeType, 32 | source, 33 | encoding: "utf-8", // Assume UTF-8 for text files 34 | }; 35 | } catch (error: unknown) { 36 | throw new ScraperError( 37 | `Failed to read file ${filePath}: ${ 38 | (error as { message?: string }).message ?? "Unknown error" 39 | }`, 40 | false, 41 | error instanceof Error ? error : undefined, 42 | ); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/scraper/fetcher/FingerprintGenerator.test.ts: -------------------------------------------------------------------------------- 1 | import type { HeaderGeneratorOptions } from "header-generator"; 2 | import { describe, expect, it } from "vitest"; 3 | import { FingerprintGenerator } from "./FingerprintGenerator"; 4 | 5 | describe("FingerprintGenerator", () => { 6 | it("should be instantiated without options", () => { 7 | const generator = new FingerprintGenerator(); 8 | expect(generator).toBeInstanceOf(FingerprintGenerator); 9 | }); 10 | 11 | it("should be instantiated with options", () => { 12 | const options: Partial<HeaderGeneratorOptions> = { 13 | browsers: ["firefox"], 14 | }; 15 | const generator = new FingerprintGenerator(options); 16 | expect(generator).toBeInstanceOf(FingerprintGenerator); 17 | }); 18 | 19 | it("should generate headers", () => { 20 | const generator = new FingerprintGenerator(); 21 | const headers = generator.generateHeaders(); 22 | expect(headers).toBeDefined(); 23 | expect(typeof headers).toBe("object"); 24 | expect(Object.keys(headers).length).toBeGreaterThan(0); 25 | 
expect(headers["user-agent"]).toBeDefined(); 26 | expect(headers.accept).toBeDefined(); 27 | expect(headers["accept-language"]).toBeDefined(); 28 | }); 29 | }); 30 | -------------------------------------------------------------------------------- /src/scraper/fetcher/FingerprintGenerator.ts: -------------------------------------------------------------------------------- 1 | import { HeaderGenerator, type HeaderGeneratorOptions } from "header-generator"; 2 | 3 | /** 4 | * Generates realistic browser-like HTTP headers to help avoid bot detection. 5 | * Uses the `header-generator` library for header generation. 6 | */ 7 | export class FingerprintGenerator { 8 | private headerGenerator: HeaderGenerator; 9 | 10 | /** 11 | * Creates an instance of FingerprintGenerator. 12 | * @param options Optional configuration for the header generator. 13 | */ 14 | constructor(options?: Partial<HeaderGeneratorOptions>) { 15 | // Default options for a broad range of realistic headers 16 | const defaultOptions: Partial<HeaderGeneratorOptions> = { 17 | browsers: [{ name: "chrome", minVersion: 100 }, "firefox", "safari"], 18 | devices: ["desktop", "mobile"], 19 | operatingSystems: ["windows", "linux", "macos", "android", "ios"], 20 | locales: ["en-US", "en"], 21 | httpVersion: "2", 22 | }; 23 | 24 | this.headerGenerator = new HeaderGenerator({ 25 | ...defaultOptions, 26 | ...options, 27 | }); 28 | } 29 | 30 | /** 31 | * Generates a set of realistic HTTP headers. 32 | * @returns A set of realistic HTTP headers. 
33 | */ 34 | generateHeaders(): Record<string, string> { 35 | return this.headerGenerator.getHeaders(); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/scraper/fetcher/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./types"; 2 | export * from "./HttpFetcher"; 3 | export * from "./FileFetcher"; 4 | -------------------------------------------------------------------------------- /src/scraper/fetcher/types.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Raw content fetched from a source before processing. 3 | * Includes metadata about the content for proper processing. 4 | */ 5 | export interface RawContent { 6 | /** Raw content as string or buffer */ 7 | content: string | Buffer; 8 | /** 9 | * MIME type of the content (e.g., "text/html", "application/json"). 10 | * Does not include parameters like charset. 11 | */ 12 | mimeType: string; 13 | /** 14 | * Character set of the content (e.g., "utf-8"), extracted from Content-Type header. 15 | */ 16 | charset?: string; 17 | /** 18 | * Content encoding (e.g., "gzip", "deflate"), from Content-Encoding header. 
19 | */ 20 | encoding?: string; 21 | /** Original source location */ 22 | source: string; 23 | } 24 | 25 | /** 26 | * Options for configuring content fetching behavior 27 | */ 28 | export interface FetchOptions { 29 | /** Maximum retry attempts for failed fetches */ 30 | maxRetries?: number; 31 | /** Base delay between retries in milliseconds */ 32 | retryDelay?: number; 33 | /** Additional headers for HTTP requests */ 34 | headers?: Record<string, string>; 35 | /** Timeout in milliseconds */ 36 | timeout?: number; 37 | /** AbortSignal for cancellation */ 38 | signal?: AbortSignal; 39 | /** Whether to follow HTTP redirects (3xx responses) */ 40 | followRedirects?: boolean; 41 | } 42 | 43 | /** 44 | * Interface for fetching content from different sources 45 | */ 46 | export interface ContentFetcher { 47 | /** 48 | * Check if this fetcher can handle the given source 49 | */ 50 | canFetch(source: string): boolean; 51 | 52 | /** 53 | * Fetch content from the source 54 | */ 55 | fetch(source: string, options?: FetchOptions): Promise<RawContent>; 56 | } 57 | -------------------------------------------------------------------------------- /src/scraper/index.ts: -------------------------------------------------------------------------------- 1 | // Re-export strategies for external use if needed 2 | export { WebScraperStrategy } from "./strategies/WebScraperStrategy"; 3 | export { GitHubScraperStrategy } from "./strategies/GitHubScraperStrategy"; 4 | export { LocalFileStrategy } from "./strategies/LocalFileStrategy"; 5 | export { NpmScraperStrategy } from "./strategies/NpmScraperStrategy"; 6 | export { PyPiScraperStrategy } from "./strategies/PyPiScraperStrategy"; 7 | export { ScraperRegistry } from "./ScraperRegistry"; 8 | export { ScraperService } from "./ScraperService"; 9 | -------------------------------------------------------------------------------- /src/scraper/middleware/HtmlCheerioParserMiddleware.ts: -------------------------------------------------------------------------------- 1 | 
import * as cheerio from "cheerio"; 2 | import { logger } from "../../utils/logger"; 3 | import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; 4 | 5 | /** 6 | * Middleware to parse HTML string/buffer content into a Cheerio object. 7 | * It populates the `context.dom` property. 8 | * Assumes the input HTML in `context.content` is the final version to be parsed 9 | * (e.g., after potential rendering by Playwright or modification by JS execution). 10 | */ 11 | export class HtmlCheerioParserMiddleware implements ContentProcessorMiddleware { 12 | async process(context: MiddlewareContext, next: () => Promise): Promise { 13 | try { 14 | logger.debug(`Parsing HTML content with Cheerio from ${context.source}`); 15 | // Load the HTML string using Cheerio 16 | const $ = cheerio.load(context.content); 17 | 18 | // Add the Cheerio API object to the context 19 | context.dom = $; 20 | 21 | // Proceed to the next middleware 22 | await next(); 23 | } catch (error) { 24 | logger.error( 25 | `❌ Failed to parse HTML with Cheerio for ${context.source}: ${error}`, 26 | ); 27 | context.errors.push( 28 | error instanceof Error 29 | ? error 30 | : new Error(`Cheerio HTML parsing failed: ${String(error)}`), 31 | ); 32 | // Do not proceed further down the pipeline if parsing fails 33 | return; 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/scraper/middleware/HtmlLinkExtractorMiddleware.ts: -------------------------------------------------------------------------------- 1 | import { logger } from "../../utils/logger"; 2 | import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; 3 | 4 | /** 5 | * Middleware to extract links (href attributes from tags) from HTML content using Cheerio. 6 | * It expects the Cheerio API object to be available in `context.dom`. 7 | * This should run *after* parsing but *before* conversion to Markdown. 
8 | */ 9 | export class HtmlLinkExtractorMiddleware implements ContentProcessorMiddleware { 10 | /** 11 | * Processes the context to extract links from the sanitized HTML body. 12 | * @param context The current middleware context. 13 | * @param next Function to call the next middleware. 14 | */ 15 | async process(context: MiddlewareContext, next: () => Promise): Promise { 16 | // Check if we have a Cheerio object from a previous step 17 | const $ = context.dom; 18 | if (!$) { 19 | logger.warn( 20 | `⏭️ Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`, 21 | ); 22 | await next(); 23 | return; 24 | } 25 | 26 | try { 27 | const linkElements = $("a[href]"); 28 | logger.debug(`Found ${linkElements.length} potential links in ${context.source}`); 29 | 30 | const extractedLinks: string[] = []; 31 | linkElements.each((index, element) => { 32 | const href = $(element).attr("href"); 33 | if (href && href.trim() !== "") { 34 | try { 35 | const urlObj = new URL(href, context.source); 36 | if (!["http:", "https:", "file:"].includes(urlObj.protocol)) { 37 | logger.debug(`Ignoring link with invalid protocol: ${href}`); 38 | return; 39 | } 40 | extractedLinks.push(urlObj.href); 41 | } catch (e) { 42 | logger.debug(`Ignoring invalid URL syntax: ${href}`); 43 | } 44 | } 45 | }); 46 | 47 | context.links = [...new Set(extractedLinks)]; 48 | logger.debug( 49 | `Extracted ${context.links.length} unique, valid links from ${context.source}`, 50 | ); 51 | } catch (error) { 52 | logger.error(`❌ Error extracting links from ${context.source}: ${error}`); 53 | context.errors.push( 54 | new Error( 55 | `Failed to extract links from HTML: ${error instanceof Error ? 
error.message : String(error)}`, 56 | ), 57 | ); 58 | } 59 | 60 | await next(); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/scraper/middleware/HtmlMetadataExtractorMiddleware.ts: -------------------------------------------------------------------------------- 1 | import { logger } from "../../utils/logger"; 2 | import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; 3 | 4 | /** 5 | * Middleware to extract the title from HTML content using Cheerio. 6 | * Assumes context.dom (Cheerio API object) is populated by a preceding middleware 7 | * (e.g., HtmlCheerioParserMiddleware). 8 | */ 9 | export class HtmlMetadataExtractorMiddleware implements ContentProcessorMiddleware { 10 | /** 11 | * Processes the context to extract the HTML title. 12 | * @param context The current processing context. 13 | * @param next Function to call the next middleware. 14 | */ 15 | async process(context: MiddlewareContext, next: () => Promise): Promise { 16 | // Check if Cheerio DOM exists from previous middleware 17 | const $ = context.dom; 18 | if (!$) { 19 | logger.warn( 20 | `⏭️ Skipping ${this.constructor.name}: context.dom is missing. 
Ensure HtmlCheerioParserMiddleware runs before this.`, 21 | ); 22 | await next(); 23 | return; 24 | } 25 | 26 | // Only process if we have a Cheerio object (implicitly means it's HTML) 27 | try { 28 | // Extract title (using title tag, fallback to h1 if title is empty/missing) 29 | let title = $("title").first().text().trim(); 30 | 31 | if (!title) { 32 | // Fallback to the first H1 if title is empty 33 | title = $("h1").first().text().trim(); 34 | } 35 | 36 | // Default to "Untitled" if both are empty 37 | title = title || "Untitled"; 38 | 39 | // Basic cleanup (replace multiple spaces with single space) 40 | title = title.replace(/\s+/g, " ").trim(); 41 | 42 | context.metadata.title = title; 43 | logger.debug(`Extracted title: "${title}" from ${context.source}`); 44 | } catch (error) { 45 | logger.error(`❌ Error extracting metadata from ${context.source}: ${error}`); 46 | context.errors.push( 47 | new Error( 48 | `Failed to extract metadata from HTML: ${error instanceof Error ? error.message : String(error)}`, 49 | ), 50 | ); 51 | // Optionally decide whether to stop the pipeline here 52 | } 53 | 54 | // Call the next middleware in the chain 55 | await next(); 56 | 57 | // No cleanup needed for Cheerio 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/scraper/middleware/HtmlSanitizerMiddleware.ts: -------------------------------------------------------------------------------- 1 | import { logger } from "../../utils/logger"; 2 | import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; 3 | 4 | /** 5 | * Options for HtmlSanitizerMiddleware. 6 | */ 7 | export interface HtmlSanitizerOptions { 8 | /** CSS selectors for elements to remove *in addition* to the defaults. */ 9 | excludeSelectors?: string[]; 10 | } 11 | 12 | /** 13 | * Middleware to remove unwanted elements from parsed HTML content using Cheerio. 
14 | * It expects the Cheerio API object (`context.dom`) to be populated by a preceding middleware 15 | * (e.g., HtmlCheerioParserMiddleware). 16 | * It modifies the `context.dom` object in place. 17 | */ 18 | export class HtmlSanitizerMiddleware implements ContentProcessorMiddleware { 19 | // Default selectors to remove 20 | private readonly defaultSelectorsToRemove = [ 21 | "nav", 22 | "footer", 23 | "script", 24 | "style", 25 | "noscript", 26 | "svg", 27 | "link", 28 | "meta", 29 | "iframe", 30 | "header", 31 | "button", 32 | "input", 33 | "textarea", 34 | "select", 35 | // "form", // Keep commented 36 | ".ads", 37 | ".advertisement", 38 | ".banner", 39 | ".cookie-banner", 40 | ".cookie-consent", 41 | ".hidden", 42 | ".hide", 43 | ".modal", 44 | ".nav-bar", 45 | ".overlay", 46 | ".popup", 47 | ".promo", 48 | ".mw-editsection", 49 | ".side-bar", 50 | ".social-share", 51 | ".sticky", 52 | "#ads", 53 | "#banner", 54 | "#cookieBanner", 55 | "#modal", 56 | "#nav", 57 | "#overlay", 58 | "#popup", 59 | "#sidebar", 60 | "#socialMediaBox", 61 | "#stickyHeader", 62 | "#ad-container", 63 | ".ad-container", 64 | ".login-form", 65 | ".signup-form", 66 | ".tooltip", 67 | ".dropdown-menu", 68 | // ".alert", // Keep commented 69 | ".breadcrumb", 70 | ".pagination", 71 | // '[role="alert"]', // Keep commented 72 | '[role="banner"]', 73 | '[role="dialog"]', 74 | '[role="alertdialog"]', 75 | '[role="region"][aria-label*="skip" i]', 76 | '[aria-modal="true"]', 77 | ".noprint", 78 | ]; 79 | 80 | async process(context: MiddlewareContext, next: () => Promise): Promise { 81 | // Check if Cheerio DOM exists 82 | const $ = context.dom; 83 | if (!$) { 84 | logger.warn( 85 | `⏭️ Skipping ${this.constructor.name}: context.dom is missing. 
Ensure HtmlCheerioParserMiddleware runs before this.`, 86 | ); 87 | await next(); 88 | return; 89 | } 90 | 91 | try { 92 | // Remove unwanted elements using Cheerio 93 | const selectorsToRemove = [ 94 | ...(context.options.excludeSelectors || []), // Use options from the context 95 | ...this.defaultSelectorsToRemove, 96 | ]; 97 | logger.debug( 98 | `Removing elements matching ${selectorsToRemove.length} selectors for ${context.source}`, 99 | ); 100 | let removedCount = 0; 101 | for (const selector of selectorsToRemove) { 102 | try { 103 | const elements = $(selector); // Use Cheerio selector 104 | const count = elements.length; 105 | if (count > 0) { 106 | elements.remove(); // Use Cheerio remove 107 | removedCount += count; 108 | } 109 | } catch (selectorError) { 110 | // Log invalid selectors but continue with others 111 | // Cheerio is generally more tolerant of invalid selectors than querySelectorAll 112 | logger.warn( 113 | `⚠️ Potentially invalid selector "${selector}" during element removal: ${selectorError}`, 114 | ); 115 | context.errors.push( 116 | new Error(`Invalid selector "${selector}": ${selectorError}`), 117 | ); 118 | } 119 | } 120 | logger.debug(`Removed ${removedCount} elements for ${context.source}`); 121 | 122 | // The context.dom object ($) has been modified in place. 123 | } catch (error) { 124 | logger.error( 125 | `❌ Error during HTML element removal for ${context.source}: ${error}`, 126 | ); 127 | context.errors.push( 128 | error instanceof Error 129 | ? error 130 | : new Error(`HTML element removal failed: ${String(error)}`), 131 | ); 132 | // Decide if pipeline should stop? For now, continue. 
133 | } 134 | 135 | // Proceed to the next middleware 136 | await next(); 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it, vi } from "vitest"; 2 | import type { ScraperOptions } from "../types"; 3 | import { MarkdownLinkExtractorMiddleware } from "./MarkdownLinkExtractorMiddleware"; 4 | import type { MiddlewareContext } from "./types"; 5 | 6 | // Suppress logger output during tests 7 | vi.mock("../../utils/logger"); 8 | 9 | // Helper to create a minimal valid ScraperOptions object 10 | const createMockScraperOptions = (url = "http://example.com"): ScraperOptions => ({ 11 | url, 12 | library: "test-lib", 13 | version: "1.0.0", 14 | maxDepth: 0, 15 | maxPages: 1, 16 | maxConcurrency: 1, 17 | scope: "subpages", 18 | followRedirects: true, 19 | excludeSelectors: [], 20 | ignoreErrors: false, 21 | }); 22 | 23 | const createMockContext = ( 24 | markdownContent: string, 25 | source = "http://example.com", 26 | initialLinks: string[] = [], 27 | options?: Partial, 28 | ): MiddlewareContext => { 29 | return { 30 | content: markdownContent, 31 | source, 32 | metadata: {}, 33 | links: initialLinks, 34 | errors: [], 35 | options: { ...createMockScraperOptions(source), ...options }, 36 | }; 37 | }; 38 | 39 | describe("MarkdownLinkExtractorMiddleware", () => { 40 | it("should initialize context.links to an empty array if it is undefined", async () => { 41 | const middleware = new MarkdownLinkExtractorMiddleware(); 42 | // Create context with undefined links 43 | const context = createMockContext( 44 | "Some markdown content", 45 | "http://example.com", 46 | undefined, 47 | ); 48 | const next = vi.fn().mockResolvedValue(undefined); 49 | 50 | await middleware.process(context, next); 51 | 52 | expect(next).toHaveBeenCalledOnce(); 53 | 
expect(context.links).toBeDefined(); 54 | expect(Array.isArray(context.links)).toBe(true); 55 | expect(context.links).toHaveLength(0); 56 | }); 57 | 58 | it("should not modify context.links if it is already an array", async () => { 59 | const middleware = new MarkdownLinkExtractorMiddleware(); 60 | const existingLinks = ["https://example.com/link1", "https://example.com/link2"]; 61 | const context = createMockContext( 62 | "Some markdown content", 63 | "http://example.com", 64 | existingLinks, 65 | ); 66 | const next = vi.fn().mockResolvedValue(undefined); 67 | 68 | await middleware.process(context, next); 69 | 70 | expect(next).toHaveBeenCalledOnce(); 71 | expect(context.links).toBe(existingLinks); // Should be the same array instance 72 | expect(context.links).toEqual(existingLinks); // Should have the same content 73 | }); 74 | 75 | it("should always call the next middleware", async () => { 76 | const middleware = new MarkdownLinkExtractorMiddleware(); 77 | // Test with null links to ensure it's handled properly 78 | const context = createMockContext("Some markdown content") as MiddlewareContext; 79 | // @ts-expect-error 80 | context.links = null; // Deliberately set to null to test robustness 81 | const next = vi.fn().mockResolvedValue(undefined); 82 | 83 | await middleware.process(context, next); 84 | 85 | expect(next).toHaveBeenCalledOnce(); 86 | expect(context.links).toBeDefined(); 87 | expect(Array.isArray(context.links)).toBe(true); 88 | }); 89 | 90 | // Note: Since the current implementation is a placeholder and doesn't actually 91 | // extract links, we don't test link extraction functionality yet. 92 | // When the TODO is implemented, additional tests should be added to verify 93 | // that links are correctly extracted from markdown content. 
94 | }); 95 | -------------------------------------------------------------------------------- /src/scraper/middleware/MarkdownLinkExtractorMiddleware.ts: -------------------------------------------------------------------------------- 1 | import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; 2 | 3 | /** 4 | * Placeholder middleware for extracting links from Markdown content. 5 | * Currently, it does not implement link extraction, matching the 6 | * original MarkdownProcessor's TODO status. 7 | */ 8 | export class MarkdownLinkExtractorMiddleware implements ContentProcessorMiddleware { 9 | /** 10 | * Processes the context. Currently a no-op regarding link extraction. 11 | * @param context The current processing context. 12 | * @param next Function to call the next middleware. 13 | */ 14 | async process(context: MiddlewareContext, next: () => Promise): Promise { 15 | // TODO: Implement Markdown link extraction (e.g., using regex or a Markdown parser) 16 | // For now, ensure context.links exists, defaulting to empty array if not set. 17 | if (!Array.isArray(context.links)) { 18 | context.links = []; 19 | } 20 | // No links are added here yet. 21 | 22 | await next(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/scraper/middleware/MarkdownMetadataExtractorMiddleware.ts: -------------------------------------------------------------------------------- 1 | import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; 2 | 3 | /** 4 | * Middleware to extract the title (first H1 heading) from Markdown content. 5 | */ 6 | export class MarkdownMetadataExtractorMiddleware implements ContentProcessorMiddleware { 7 | /** 8 | * Processes the context to extract the title from Markdown. 9 | * @param context The current processing context. 10 | * @param next Function to call the next middleware. 
11 | */ 12 | async process(context: MiddlewareContext, next: () => Promise): Promise { 13 | try { 14 | let title = "Untitled"; 15 | const match = context.content.match(/^#\s+(.*)$/m); 16 | if (match?.[1]) { 17 | title = match[1].trim(); 18 | } 19 | context.metadata.title = title; 20 | } catch (error) { 21 | context.errors.push( 22 | new Error( 23 | `Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`, 24 | ), 25 | ); 26 | } 27 | 28 | await next(); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/scraper/middleware/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./HtmlCheerioParserMiddleware"; 2 | export * from "./HtmlJsExecutorMiddleware"; 3 | export * from "./HtmlLinkExtractorMiddleware"; 4 | export * from "./HtmlMetadataExtractorMiddleware"; 5 | export * from "./HtmlPlaywrightMiddleware"; 6 | export * from "./HtmlSanitizerMiddleware"; 7 | export * from "./HtmlToMarkdownMiddleware"; 8 | export * from "./MarkdownLinkExtractorMiddleware"; 9 | export * from "./MarkdownMetadataExtractorMiddleware"; 10 | -------------------------------------------------------------------------------- /src/scraper/middleware/types.ts: -------------------------------------------------------------------------------- 1 | import type * as cheerio from "cheerio"; 2 | import type { ContentFetcher } from "../fetcher/types"; 3 | import type { ScraperOptions } from "../types"; 4 | 5 | /** 6 | * Represents the context passed through the middleware pipeline. 7 | */ 8 | export interface MiddlewareContext { 9 | /** The content being processed (always a string in middleware). */ 10 | content: string; 11 | /** The original source URL of the content. */ 12 | readonly source: string; 13 | /** Extracted metadata (e.g., title). */ 14 | metadata: Record; 15 | /** Extracted links from the content. 
*/ 16 | links: string[]; 17 | /** Errors encountered during processing. */ 18 | errors: Error[]; 19 | /** Job-specific options influencing processing. */ 20 | readonly options: ScraperOptions; 21 | 22 | /** Optional Cheerio root object for HTML processing. */ 23 | dom?: cheerio.CheerioAPI; 24 | 25 | /** Optional fetcher instance for resolving resources relative to the source. */ 26 | fetcher?: ContentFetcher; 27 | } 28 | 29 | /** 30 | * Defines the interface for a middleware component. 31 | */ 32 | export interface ContentProcessorMiddleware { 33 | /** 34 | * Processes the middleware context asynchronously. 35 | * @param context The current middleware context. 36 | * @param next A function to call to pass control to the next middleware in the pipeline. 37 | */ 38 | process(context: MiddlewareContext, next: () => Promise): Promise; 39 | } 40 | -------------------------------------------------------------------------------- /src/scraper/pipelines/BasePipeline.ts: -------------------------------------------------------------------------------- 1 | import type { ContentFetcher, RawContent } from "../fetcher/types"; 2 | import type { ContentProcessorMiddleware, MiddlewareContext } from "../middleware/types"; 3 | import type { ScraperOptions } from "../types"; 4 | import type { ContentPipeline, ProcessedContent } from "./types"; 5 | 6 | /** 7 | * Base class for content processing pipelines. 8 | * Provides common functionality for executing middleware stacks. 9 | */ 10 | export class BasePipeline implements ContentPipeline { 11 | /** 12 | * Determines if this pipeline can process the given content. 13 | * Must be implemented by derived classes. 14 | */ 15 | public canProcess(_rawContent: RawContent): boolean { 16 | throw new Error("Method not implemented."); 17 | } 18 | 19 | /** 20 | * Processes the raw content through the pipeline. 21 | * Must be implemented by derived classes. 
22 | */ 23 | public async process( 24 | _rawContent: RawContent, 25 | _options: ScraperOptions, 26 | _fetcher?: ContentFetcher, 27 | ): Promise { 28 | throw new Error("Method not implemented."); 29 | } 30 | 31 | /** 32 | * Executes a middleware stack on the given context. 33 | * This is a utility method used by derived pipeline classes. 34 | * 35 | * @param middleware - The middleware stack to execute 36 | * @param context - The context to process 37 | */ 38 | protected async executeMiddlewareStack( 39 | middleware: ContentProcessorMiddleware[], 40 | context: MiddlewareContext, 41 | ): Promise { 42 | let index = -1; 43 | const dispatch = async (i: number): Promise => { 44 | if (i <= index) throw new Error("next() called multiple times"); 45 | index = i; 46 | const mw = middleware[i]; 47 | if (!mw) return; 48 | await mw.process(context, dispatch.bind(null, i + 1)); 49 | }; 50 | 51 | try { 52 | await dispatch(0); 53 | } catch (error) { 54 | context.errors.push(error instanceof Error ? error : new Error(String(error))); 55 | } 56 | } 57 | 58 | /** 59 | * Cleans up resources when the pipeline is no longer needed. 60 | * Default implementation does nothing. 
61 | */ 62 | public async close(): Promise { 63 | // Default implementation does nothing 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/scraper/pipelines/HtmlPipeline.ts: -------------------------------------------------------------------------------- 1 | import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; 2 | import type { RawContent } from "../fetcher/types"; 3 | import type { ContentFetcher } from "../fetcher/types"; 4 | import { HtmlSanitizerMiddleware } from "../middleware"; 5 | import { HtmlCheerioParserMiddleware } from "../middleware/HtmlCheerioParserMiddleware"; 6 | import { HtmlLinkExtractorMiddleware } from "../middleware/HtmlLinkExtractorMiddleware"; 7 | import { HtmlMetadataExtractorMiddleware } from "../middleware/HtmlMetadataExtractorMiddleware"; 8 | import { HtmlPlaywrightMiddleware } from "../middleware/HtmlPlaywrightMiddleware"; 9 | import { HtmlToMarkdownMiddleware } from "../middleware/HtmlToMarkdownMiddleware"; 10 | import type { ContentProcessorMiddleware, MiddlewareContext } from "../middleware/types"; 11 | import type { ScraperOptions } from "../types"; 12 | import { convertToString } from "../utils/buffer"; 13 | import { BasePipeline } from "./BasePipeline"; 14 | import type { ProcessedContent } from "./types"; 15 | 16 | /** 17 | * Pipeline for processing HTML content using middleware. 
18 | */ 19 | export class HtmlPipeline extends BasePipeline { 20 | private readonly playwrightMiddleware: HtmlPlaywrightMiddleware; 21 | private readonly standardMiddleware: ContentProcessorMiddleware[]; 22 | 23 | constructor() { 24 | super(); 25 | this.playwrightMiddleware = new HtmlPlaywrightMiddleware(); 26 | this.standardMiddleware = [ 27 | new HtmlCheerioParserMiddleware(), 28 | new HtmlMetadataExtractorMiddleware(), 29 | new HtmlLinkExtractorMiddleware(), 30 | new HtmlSanitizerMiddleware(), 31 | new HtmlToMarkdownMiddleware(), 32 | ]; 33 | } 34 | 35 | canProcess(rawContent: RawContent): boolean { 36 | return MimeTypeUtils.isHtml(rawContent.mimeType); 37 | } 38 | 39 | async process( 40 | rawContent: RawContent, 41 | options: ScraperOptions, 42 | fetcher?: ContentFetcher, 43 | ): Promise { 44 | const contentString = convertToString(rawContent.content, rawContent.charset); 45 | 46 | const context: MiddlewareContext = { 47 | content: contentString, 48 | source: rawContent.source, 49 | metadata: {}, 50 | links: [], 51 | errors: [], 52 | options, 53 | fetcher, 54 | }; 55 | 56 | // Build middleware stack dynamically based on scrapeMode 57 | let middleware: ContentProcessorMiddleware[] = [...this.standardMiddleware]; 58 | if (options.scrapeMode === "playwright" || options.scrapeMode === "auto") { 59 | middleware = [this.playwrightMiddleware, ...middleware]; 60 | } 61 | 62 | // Execute the middleware stack using the base class method 63 | await this.executeMiddlewareStack(middleware, context); 64 | 65 | return { 66 | textContent: typeof context.content === "string" ? 
context.content : "", 67 | metadata: context.metadata, 68 | links: context.links, 69 | errors: context.errors, 70 | }; 71 | } 72 | 73 | async close(): Promise { 74 | await this.playwrightMiddleware.closeBrowser(); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/scraper/pipelines/MarkdownPipeline.ts: -------------------------------------------------------------------------------- 1 | import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; 2 | import type { RawContent } from "../fetcher/types"; 3 | import type { ContentFetcher } from "../fetcher/types"; 4 | import { MarkdownLinkExtractorMiddleware } from "../middleware/MarkdownLinkExtractorMiddleware"; 5 | import { MarkdownMetadataExtractorMiddleware } from "../middleware/MarkdownMetadataExtractorMiddleware"; 6 | import type { ContentProcessorMiddleware, MiddlewareContext } from "../middleware/types"; 7 | import type { ScraperOptions } from "../types"; 8 | import { convertToString } from "../utils/buffer"; 9 | import { BasePipeline } from "./BasePipeline"; 10 | import type { ProcessedContent } from "./types"; 11 | 12 | /** 13 | * Pipeline for processing Markdown content using middleware. 
14 | */ 15 | export class MarkdownPipeline extends BasePipeline { 16 | private readonly middleware: ContentProcessorMiddleware[]; 17 | 18 | constructor() { 19 | super(); 20 | this.middleware = [ 21 | new MarkdownMetadataExtractorMiddleware(), 22 | new MarkdownLinkExtractorMiddleware(), 23 | ]; 24 | } 25 | 26 | canProcess(rawContent: RawContent): boolean { 27 | if (!rawContent.mimeType) return false; 28 | return ( 29 | MimeTypeUtils.isMarkdown(rawContent.mimeType) || 30 | MimeTypeUtils.isText(rawContent.mimeType) 31 | ); 32 | } 33 | 34 | async process( 35 | rawContent: RawContent, 36 | options: ScraperOptions, 37 | fetcher?: ContentFetcher, 38 | ): Promise { 39 | const contentString = convertToString(rawContent.content, rawContent.charset); 40 | 41 | const context: MiddlewareContext = { 42 | content: contentString, 43 | source: rawContent.source, 44 | metadata: {}, 45 | links: [], 46 | errors: [], 47 | options, 48 | fetcher, 49 | }; 50 | 51 | // Execute the middleware stack using the base class method 52 | await this.executeMiddlewareStack(this.middleware, context); 53 | 54 | return { 55 | textContent: typeof context.content === "string" ? context.content : "", 56 | metadata: context.metadata, 57 | links: context.links, 58 | errors: context.errors, 59 | }; 60 | } 61 | 62 | async close(): Promise {} 63 | } 64 | -------------------------------------------------------------------------------- /src/scraper/pipelines/types.ts: -------------------------------------------------------------------------------- 1 | import type { RawContent } from "../fetcher/types"; 2 | import type { ContentFetcher } from "../fetcher/types"; 3 | import type { ScraperOptions } from "../types"; 4 | 5 | /** 6 | * Represents the successfully processed content from a pipeline. 7 | */ 8 | export interface ProcessedContent { 9 | /** The final processed content, typically as a string (e.g., Markdown). */ 10 | textContent: string; 11 | /** Extracted metadata (e.g., title, description). 
*/ 12 | metadata: Record; 13 | /** Extracted links from the content. */ 14 | links: string[]; 15 | /** Any non-critical errors encountered during processing. */ 16 | errors: Error[]; 17 | } 18 | 19 | /** 20 | * Interface for a content processing pipeline. 21 | * Each pipeline is specialized for a certain type of content (e.g., HTML, Markdown). 22 | */ 23 | export interface ContentPipeline { 24 | /** 25 | * Determines if this pipeline can process the given raw content. 26 | * @param rawContent The raw content fetched from a source. 27 | * @returns True if the pipeline can process the content, false otherwise. 28 | */ 29 | canProcess(rawContent: RawContent): boolean; 30 | 31 | /** 32 | * Processes the raw content. 33 | * @param rawContent The raw content to process. 34 | * @param options Scraper options that might influence processing. 35 | * @param fetcher An optional ContentFetcher for resolving relative resources. 36 | * @returns A promise that resolves with the ProcessedContent. 37 | */ 38 | process( 39 | rawContent: RawContent, 40 | options: ScraperOptions, 41 | fetcher?: ContentFetcher, 42 | ): Promise; 43 | 44 | /** 45 | * Closes any resources or connections used by the pipeline. 
46 | */ 47 | close(): Promise; 48 | } 49 | -------------------------------------------------------------------------------- /src/scraper/strategies/GitHubScraperStrategy.ts: -------------------------------------------------------------------------------- 1 | import type { ProgressCallback } from "../../types"; 2 | import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; 3 | import { WebScraperStrategy } from "./WebScraperStrategy"; 4 | 5 | export class GitHubScraperStrategy implements ScraperStrategy { 6 | private defaultStrategy: WebScraperStrategy; 7 | 8 | canHandle(url: string): boolean { 9 | const { hostname } = new URL(url); 10 | return ["github.com", "www.github.com"].includes(hostname); 11 | } 12 | 13 | constructor() { 14 | const shouldFollowLink = (baseUrl: URL, targetUrl: URL) => { 15 | // Must be in same repository 16 | if (this.getRepoPath(baseUrl) !== this.getRepoPath(targetUrl)) { 17 | return false; 18 | } 19 | 20 | const path = targetUrl.pathname; 21 | 22 | // Root README (repository root) 23 | if (path === this.getRepoPath(targetUrl)) { 24 | return true; 25 | } 26 | 27 | // Wiki pages 28 | if (path.startsWith(`${this.getRepoPath(targetUrl)}/wiki`)) { 29 | return true; 30 | } 31 | 32 | // Markdown files under /blob/ 33 | if ( 34 | path.startsWith(`${this.getRepoPath(targetUrl)}/blob/`) && 35 | path.endsWith(".md") 36 | ) { 37 | return true; 38 | } 39 | 40 | return false; 41 | }; 42 | 43 | this.defaultStrategy = new WebScraperStrategy({ 44 | urlNormalizerOptions: { 45 | ignoreCase: true, 46 | removeHash: true, 47 | removeTrailingSlash: true, 48 | removeQuery: true, // Remove query parameters like ?tab=readme-ov-file 49 | }, 50 | shouldFollowLink, 51 | }); 52 | } 53 | 54 | private getRepoPath(url: URL): string { 55 | // Extract // from github.com///... 
56 | const match = url.pathname.match(/^\/[^/]+\/[^/]+/); 57 | return match?.[0] || ""; 58 | } 59 | 60 | async scrape( 61 | options: ScraperOptions, 62 | progressCallback: ProgressCallback, 63 | signal?: AbortSignal, 64 | ): Promise { 65 | // Validate it's a GitHub URL 66 | const url = new URL(options.url); 67 | if (!url.hostname.includes("github.com")) { 68 | throw new Error("URL must be a GitHub URL"); 69 | } 70 | 71 | // Pass signal down to the delegated strategy 72 | await this.defaultStrategy.scrape(options, progressCallback, signal); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/scraper/strategies/LocalFileStrategy.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs/promises"; 2 | import path from "node:path"; 3 | import type { Document, ProgressCallback } from "../../types"; 4 | import { logger } from "../../utils/logger"; 5 | import { FileFetcher } from "../fetcher"; 6 | import type { RawContent } from "../fetcher/types"; 7 | import { HtmlPipeline } from "../pipelines/HtmlPipeline"; 8 | import { MarkdownPipeline } from "../pipelines/MarkdownPipeline"; 9 | import type { ScraperOptions, ScraperProgress } from "../types"; 10 | import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy"; 11 | 12 | /** 13 | * LocalFileStrategy handles crawling and scraping of local files and folders using file:// URLs. 14 | * 15 | * All files with a MIME type of `text/*` are processed. This includes HTML, Markdown, plain text, and source code files such as `.js`, `.ts`, `.tsx`, `.css`, etc. Binary files, PDFs, images, and other non-text formats are ignored. 16 | * 17 | * Supports include/exclude filters and percent-encoded paths. 
18 | */ 19 | export class LocalFileStrategy extends BaseScraperStrategy { 20 | private readonly fileFetcher = new FileFetcher(); 21 | private readonly htmlPipeline: HtmlPipeline; 22 | private readonly markdownPipeline: MarkdownPipeline; 23 | private readonly pipelines: [HtmlPipeline, MarkdownPipeline]; 24 | 25 | constructor() { 26 | super(); 27 | this.htmlPipeline = new HtmlPipeline(); 28 | this.markdownPipeline = new MarkdownPipeline(); 29 | this.pipelines = [this.htmlPipeline, this.markdownPipeline]; 30 | } 31 | 32 | canHandle(url: string): boolean { 33 | return url.startsWith("file://"); 34 | } 35 | 36 | protected async processItem( 37 | item: QueueItem, 38 | options: ScraperOptions, 39 | _progressCallback?: ProgressCallback, 40 | _signal?: AbortSignal, 41 | ): Promise<{ document?: Document; links?: string[] }> { 42 | // Always decode the file path from file:// URL 43 | const filePath = decodeURIComponent(item.url.replace(/^file:\/\//, "")); 44 | const stats = await fs.stat(filePath); 45 | 46 | if (stats.isDirectory()) { 47 | const contents = await fs.readdir(filePath); 48 | // Only return links that pass shouldProcessUrl 49 | const links = contents 50 | .map((name) => `file://${path.join(filePath, name)}`) 51 | .filter((url) => this.shouldProcessUrl(url, options)); 52 | return { links }; 53 | } 54 | 55 | logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`); 56 | 57 | const rawContent: RawContent = await this.fileFetcher.fetch(item.url); 58 | 59 | let processed: Awaited> | undefined; 60 | 61 | for (const pipeline of this.pipelines) { 62 | if (pipeline.canProcess(rawContent)) { 63 | processed = await pipeline.process(rawContent, options, this.fileFetcher); 64 | break; 65 | } 66 | } 67 | 68 | if (!processed) { 69 | logger.warn( 70 | `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. 
Skipping processing.`, 71 | ); 72 | return { document: undefined, links: [] }; 73 | } 74 | 75 | for (const err of processed.errors) { 76 | logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`); 77 | } 78 | 79 | return { 80 | document: { 81 | content: typeof processed.textContent === "string" ? processed.textContent : "", 82 | metadata: { 83 | url: rawContent.source, 84 | title: 85 | typeof processed.metadata.title === "string" 86 | ? processed.metadata.title 87 | : "Untitled", 88 | library: options.library, 89 | version: options.version, 90 | }, 91 | } satisfies Document, 92 | }; 93 | } 94 | 95 | async scrape( 96 | options: ScraperOptions, 97 | progressCallback: ProgressCallback, 98 | signal?: AbortSignal, 99 | ): Promise { 100 | try { 101 | await super.scrape(options, progressCallback, signal); 102 | } finally { 103 | await this.htmlPipeline.close(); 104 | await this.markdownPipeline.close(); 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/scraper/strategies/NpmScraperStrategy.ts: -------------------------------------------------------------------------------- 1 | import type { ProgressCallback } from "../../types"; 2 | import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; 3 | import { WebScraperStrategy } from "./WebScraperStrategy"; 4 | 5 | export class NpmScraperStrategy implements ScraperStrategy { 6 | private defaultStrategy: WebScraperStrategy; 7 | 8 | canHandle(url: string): boolean { 9 | const { hostname } = new URL(url); 10 | return ["npmjs.org", "npmjs.com", "www.npmjs.com"].includes(hostname); 11 | } 12 | 13 | constructor() { 14 | this.defaultStrategy = new WebScraperStrategy({ 15 | urlNormalizerOptions: { 16 | ignoreCase: true, 17 | removeHash: true, 18 | removeTrailingSlash: true, 19 | removeQuery: true, // Enable removeQuery for NPM packages 20 | }, 21 | }); 22 | } 23 | 24 | async scrape( 25 | options: ScraperOptions, 26 | progressCallback: 
ProgressCallback, 27 | signal?: AbortSignal, 28 | ): Promise { 29 | // Use default strategy with our configuration, passing the signal 30 | await this.defaultStrategy.scrape(options, progressCallback, signal); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/scraper/strategies/PyPiScraperStrategy.ts: -------------------------------------------------------------------------------- 1 | import type { ProgressCallback } from "../../types"; 2 | import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; 3 | import { WebScraperStrategy } from "./WebScraperStrategy"; 4 | 5 | export class PyPiScraperStrategy implements ScraperStrategy { 6 | private defaultStrategy: WebScraperStrategy; 7 | 8 | canHandle(url: string): boolean { 9 | const { hostname } = new URL(url); 10 | return ["pypi.org", "www.pypi.org"].includes(hostname); 11 | } 12 | 13 | constructor() { 14 | this.defaultStrategy = new WebScraperStrategy({ 15 | urlNormalizerOptions: { 16 | ignoreCase: true, 17 | removeHash: true, 18 | removeTrailingSlash: true, 19 | removeQuery: true, // Enable removeQuery for PyPI packages 20 | }, 21 | }); 22 | } 23 | 24 | async scrape( 25 | options: ScraperOptions, 26 | progressCallback: ProgressCallback, 27 | signal?: AbortSignal, 28 | ): Promise { 29 | // Use default strategy with our configuration, passing the signal 30 | await this.defaultStrategy.scrape(options, progressCallback, signal); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/scraper/types.ts: -------------------------------------------------------------------------------- 1 | import type { Document, ProgressCallback } from "../types"; 2 | 3 | /** 4 | * Enum defining the available HTML processing strategies. 
5 | */ 6 | export enum ScrapeMode { 7 | Fetch = "fetch", 8 | Playwright = "playwright", 9 | Auto = "auto", 10 | } 11 | 12 | /** 13 | * Strategy interface for implementing different scraping behaviors 14 | */ 15 | export interface ScraperStrategy { 16 | canHandle(url: string): boolean; 17 | scrape( 18 | options: ScraperOptions, 19 | progressCallback: ProgressCallback, 20 | signal?: AbortSignal, // Add optional signal 21 | ): Promise; 22 | } 23 | 24 | /** 25 | * Options for configuring the scraping process 26 | */ 27 | export interface ScraperOptions { 28 | url: string; 29 | library: string; 30 | version: string; 31 | maxPages?: number; 32 | maxDepth?: number; 33 | /** 34 | * Defines the allowed crawling boundary relative to the starting URL 35 | * - 'subpages': Only crawl URLs on the same hostname and within the same starting path (default) 36 | * - 'hostname': Crawl any URL on the same exact hostname, regardless of path 37 | * - 'domain': Crawl any URL on the same top-level domain, including subdomains 38 | */ 39 | scope?: "subpages" | "hostname" | "domain"; 40 | /** 41 | * Controls whether HTTP redirects (3xx responses) should be followed 42 | * - When true: Redirects are followed automatically (default) 43 | * - When false: A RedirectError is thrown when a 3xx response is received 44 | */ 45 | followRedirects?: boolean; 46 | maxConcurrency?: number; 47 | ignoreErrors?: boolean; 48 | /** CSS selectors for elements to exclude during HTML processing */ 49 | excludeSelectors?: string[]; 50 | /** 51 | * Determines the HTML processing strategy. 52 | * - 'fetch': Use a simple DOM parser (faster, less JS support). 53 | * - 'playwright': Use a headless browser (slower, full JS support). 54 | * - 'auto': Automatically select the best strategy (currently defaults to 'playwright'). 
55 | * @default ScrapeMode.Auto 56 | */ 57 | scrapeMode?: ScrapeMode; 58 | /** Optional AbortSignal for cancellation */ 59 | signal?: AbortSignal; 60 | /** 61 | * Patterns for including URLs during scraping. If not set, all are included by default. 62 | */ 63 | includePatterns?: string[]; 64 | /** 65 | * Patterns for excluding URLs during scraping. Exclude takes precedence over include. 66 | */ 67 | excludePatterns?: string[]; 68 | /** 69 | * Custom HTTP headers to send with each HTTP request (e.g., for authentication). 70 | * Keys are header names, values are header values. 71 | */ 72 | headers?: Record; 73 | } 74 | 75 | /** 76 | * Result of scraping a single page. Used internally by HtmlScraper. 77 | */ 78 | export interface ScrapedPage { 79 | content: string; 80 | title: string; 81 | url: string; 82 | /** URLs extracted from page links, used for recursive scraping */ 83 | links: string[]; 84 | } 85 | 86 | /** 87 | * Progress information during scraping 88 | */ 89 | export interface ScraperProgress { 90 | pagesScraped: number; 91 | maxPages: number; 92 | currentUrl: string; 93 | depth: number; 94 | maxDepth: number; 95 | document?: Document; 96 | } 97 | -------------------------------------------------------------------------------- /src/scraper/utils/buffer.test.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 2 | import { describe, expect, it } from "vitest"; 3 | import { convertToString } from "./buffer"; 4 | 5 | describe("buffer utilities", () => { 6 | describe("convertToString", () => { 7 | it("returns string content unchanged", () => { 8 | const input = "Hello, world!"; 9 | expect(convertToString(input)).toBe(input); 10 | }); 11 | 12 | it("converts Buffer to string with default UTF-8 charset", () => { 13 | const input = Buffer.from("Hello, world!", "utf-8"); 14 | expect(convertToString(input)).toBe("Hello, world!"); 15 | }); 16 | 17 | it("converts Buffer to string with specified UTF-8 charset", 
() => { 18 | const input = Buffer.from("Hello, world!", "utf-8"); 19 | expect(convertToString(input, "utf-8")).toBe("Hello, world!"); 20 | }); 21 | 22 | it("converts Buffer to string with ISO-8859-1 charset", () => { 23 | // Create a buffer with ISO-8859-1 encoding (Latin-1) 24 | // This contains characters that would be encoded differently in UTF-8 25 | const input = Buffer.from("Café", "latin1"); 26 | expect(convertToString(input, "iso-8859-1")).toBe("Café"); 27 | }); 28 | 29 | it("handles special characters correctly with different charsets", () => { 30 | // Test with a string containing various special characters 31 | const specialChars = "äöüßéèêëàáâãåçñ¿¡"; 32 | 33 | // Create buffer with ISO-8859-1 encoding 34 | const latinBuffer = Buffer.from(specialChars, "latin1"); 35 | expect(convertToString(latinBuffer, "iso-8859-1")).toBe(specialChars); 36 | 37 | // Create buffer with UTF-8 encoding 38 | const utf8Buffer = Buffer.from(specialChars, "utf-8"); 39 | expect(convertToString(utf8Buffer, "utf-8")).toBe(specialChars); 40 | }); 41 | 42 | it("defaults to UTF-8 when charset is not specified", () => { 43 | const input = Buffer.from("Hello, world!", "utf-8"); 44 | expect(convertToString(input, undefined)).toBe("Hello, world!"); 45 | }); 46 | 47 | it("handles empty buffer correctly", () => { 48 | const input = Buffer.from([]); 49 | expect(convertToString(input)).toBe(""); 50 | }); 51 | 52 | it("converts Buffer to string with UTF-16LE BOM", () => { 53 | // UTF-16LE BOM: 0xFF 0xFE 54 | const utf16le = Buffer.from([0xff, 0xfe, 0x68, 0x00, 0x69, 0x00]); // 'hi' in UTF-16LE 55 | // Node TextDecoder supports BOM-aware decoding 56 | expect(convertToString(utf16le, "utf-16le")).toBe("hi"); 57 | }); 58 | 59 | it("converts Buffer to string with UTF-16BE BOM", () => { 60 | // UTF-16BE BOM: 0xFE 0xFF 61 | const utf16be = Buffer.from([0xfe, 0xff, 0x00, 0x68, 0x00, 0x69]); // 'hi' in UTF-16BE 62 | // Node TextDecoder does not natively support utf-16be, so skip if not supported 63 
| let decoded: string | undefined; 64 | try { 65 | decoded = convertToString(utf16be, "utf-16be"); 66 | } catch { 67 | decoded = undefined; 68 | } 69 | // Accept either 'hi' or undefined if not supported 70 | expect(["hi", undefined]).toContain(decoded); 71 | }); 72 | 73 | it("converts Buffer to string with UTF-8 BOM", () => { 74 | // UTF-8 BOM: 0xEF 0xBB 0xBF 75 | const utf8bom = Buffer.from([0xef, 0xbb, 0xbf, 0x68, 0x69]); // '\uFEFFhi' in UTF-8 76 | // Node TextDecoder strips BOM by default, so accept both with and without BOM 77 | const result = convertToString(utf8bom, "utf-8"); 78 | expect(["hi", "\uFEFFhi"]).toContain(result); 79 | }); 80 | }); 81 | }); 82 | -------------------------------------------------------------------------------- /src/scraper/utils/buffer.ts: -------------------------------------------------------------------------------- 1 | import iconv from "iconv-lite"; 2 | 3 | /** 4 | * Decodes a Buffer or string to a JavaScript string using the specified charset. 5 | * The charset should be the encoding as reported by the source (e.g., HTTP header). 6 | * The result is always a valid JS string (Unicode/UTF-16). 7 | * 8 | * If the charset is missing or unsupported, falls back to UTF-8. 9 | * 10 | * @param content The content to decode (Buffer or string) 11 | * @param charset The source encoding (e.g., 'utf-8', 'iso-8859-1', 'utf-16le', etc.) 
12 | * @returns The decoded string 13 | */ 14 | export function convertToString(content: string | Buffer, charset?: string): string { 15 | if (typeof content === "string") return content; 16 | try { 17 | return iconv.decode(content, charset || "utf-8"); 18 | } catch { 19 | // Fallback to utf-8 if decoding fails 20 | return iconv.decode(content, "utf-8"); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/scraper/utils/patternMatcher.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from "vitest"; 2 | import { 3 | extractPathAndQuery, 4 | isRegexPattern, 5 | matchesAnyPattern, 6 | patternToRegExp, 7 | shouldIncludeUrl, 8 | } from "./patternMatcher"; 9 | 10 | describe("patternMatcher", () => { 11 | it("isRegexPattern detects regex", () => { 12 | expect(isRegexPattern("/foo.*/")).toBe(true); 13 | expect(isRegexPattern("foo.*/")).toBe(false); 14 | expect(isRegexPattern("/foo.*/")).toBe(true); 15 | expect(isRegexPattern("foo.*")).toBe(false); 16 | }); 17 | 18 | it("patternToRegExp auto-detects regex and glob", () => { 19 | expect(patternToRegExp("/foo.*/").test("foo123")).toBe(true); 20 | expect(patternToRegExp("foo*bar").test("fooxbar")).toBe(true); 21 | expect(patternToRegExp("foo*bar").test("fooyyybar")).toBe(true); 22 | expect(patternToRegExp("foo*bar").test("foo/bar")).toBe(false); 23 | }); 24 | 25 | it("matchesAnyPattern works for globs and regex", () => { 26 | expect(matchesAnyPattern("foo/abc/bar", ["foo/*/bar"])).toBe(true); 27 | expect(matchesAnyPattern("foo/abc/bar", ["/foo/.*/bar/"])).toBe(true); 28 | expect(matchesAnyPattern("foo/abc/bar", ["baz/*"])).toBe(false); 29 | }); 30 | 31 | it("extractPathAndQuery extracts path and query", () => { 32 | expect(extractPathAndQuery("https://example.com/foo/bar?x=1")).toBe("/foo/bar?x=1"); 33 | expect(extractPathAndQuery("/foo/bar?x=1")).toBe("/foo/bar?x=1"); 34 | }); 35 | 36 | 
it("shouldIncludeUrl applies exclude over include", () => { 37 | // Exclude wins 38 | expect(shouldIncludeUrl("https://x.com/foo", ["foo*"], ["/foo/"])).toBe(false); 39 | // Include only 40 | expect(shouldIncludeUrl("https://x.com/foo", ["foo*"], undefined)).toBe(true); 41 | // No include/exclude 42 | expect(shouldIncludeUrl("https://x.com/foo", undefined, undefined)).toBe(true); 43 | // Exclude only 44 | expect(shouldIncludeUrl("https://x.com/foo", undefined, ["foo*"])).toBe(false); 45 | }); 46 | }); 47 | -------------------------------------------------------------------------------- /src/scraper/utils/patternMatcher.ts: -------------------------------------------------------------------------------- 1 | import { minimatch } from "minimatch"; 2 | 3 | /** 4 | * Utility functions for pattern matching (glob and regex) for URL filtering. 5 | * Supports auto-detection and conversion of glob patterns to RegExp. 6 | * 7 | * Patterns starting and ending with '/' are treated as regex, otherwise as glob (minimatch syntax). 8 | * Glob wildcards supported: '*' (any chars except '/'), '**' (any chars, including '/'). 9 | * 10 | * @module patternMatcher 11 | */ 12 | 13 | /** 14 | * Detects if a pattern is a regex (starts and ends with '/') 15 | */ 16 | export function isRegexPattern(pattern: string): boolean { 17 | return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/"); 18 | } 19 | 20 | /** 21 | * Converts a pattern string to a RegExp instance (auto-detects glob/regex). 22 | * For globs, uses minimatch's internal conversion. 
23 | */ 24 | export function patternToRegExp(pattern: string): RegExp { 25 | if (isRegexPattern(pattern)) { 26 | return new RegExp(pattern.slice(1, -1)); 27 | } 28 | // For globs, minimatch.makeRe returns a RegExp 29 | const re = minimatch.makeRe(pattern, { dot: true }); 30 | if (!re) throw new Error(`Invalid glob pattern: ${pattern}`); 31 | return re; 32 | } 33 | 34 | /** 35 | * Checks if a given path matches any pattern in the list. 36 | * For globs, uses minimatch. For regex, uses RegExp. 37 | */ 38 | export function matchesAnyPattern(path: string, patterns?: string[]): boolean { 39 | if (!patterns || patterns.length === 0) return false; 40 | // Always match from a leading slash for path-based globs 41 | const normalizedPath = path.startsWith("/") ? path : `/${path}`; 42 | return patterns.some((pattern) => { 43 | if (isRegexPattern(pattern)) { 44 | return patternToRegExp(pattern).test(normalizedPath); 45 | } 46 | // minimatch expects no leading slash for relative globs, but we keep it for consistency 47 | // so we strip the leading slash for minimatch 48 | return minimatch(normalizedPath.replace(/^\//, ""), pattern, { dot: true }); 49 | }); 50 | } 51 | 52 | /** 53 | * Extracts the path and query from a URL string (no domain). 54 | */ 55 | export function extractPathAndQuery(url: string): string { 56 | try { 57 | const u = new URL(url); 58 | return u.pathname + (u.search || ""); 59 | } catch { 60 | return url; // fallback: return as-is 61 | } 62 | } 63 | 64 | /** 65 | * Determines if a URL should be included based on include/exclude patterns. 66 | * Exclude patterns take precedence. If no include patterns, all are included by default. 67 | */ 68 | export function shouldIncludeUrl( 69 | url: string, 70 | includePatterns?: string[], 71 | excludePatterns?: string[], 72 | ): boolean { 73 | // Always match from a leading slash for path-based globs 74 | const path = extractPathAndQuery(url); 75 | const normalizedPath = path.startsWith("/") ? 
path : `/${path}`; 76 | // For file:// URLs, also match against the basename (strip leading slash from pattern for basename matching) 77 | let basename: string | undefined; 78 | if (url.startsWith("file://")) { 79 | try { 80 | const u = new URL(url); 81 | basename = u.pathname ? u.pathname.split("/").pop() : undefined; 82 | } catch {} 83 | } 84 | // Helper to strip leading slash from patterns for basename matching 85 | const stripSlash = (patterns?: string[]) => 86 | patterns?.map((p) => (p.startsWith("/") ? p.slice(1) : p)); 87 | // Exclude patterns take precedence 88 | if ( 89 | matchesAnyPattern(normalizedPath, excludePatterns) || 90 | (basename && matchesAnyPattern(basename, stripSlash(excludePatterns))) 91 | ) 92 | return false; 93 | if (!includePatterns || includePatterns.length === 0) return true; 94 | return ( 95 | matchesAnyPattern(normalizedPath, includePatterns) || 96 | (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false) 97 | ); 98 | } 99 | -------------------------------------------------------------------------------- /src/scraper/utils/scope.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from "vitest"; 2 | import { isInScope } from "./scope"; 3 | 4 | describe("isInScope", () => { 5 | const base = new URL("https://docs.example.com/docs/start"); 6 | 7 | it("returns true for subpages in subpages scope", () => { 8 | expect( 9 | isInScope(base, new URL("https://docs.example.com/docs/intro"), "subpages"), 10 | ).toBe(true); 11 | expect( 12 | isInScope(base, new URL("https://docs.example.com/docs/start/child"), "subpages"), 13 | ).toBe(true); 14 | expect(isInScope(base, new URL("https://docs.example.com/docs"), "subpages")).toBe( 15 | false, 16 | ); 17 | expect(isInScope(base, new URL("https://docs.example.com/api"), "subpages")).toBe( 18 | false, 19 | ); 20 | expect(isInScope(base, new URL("https://other.com/docs/start"), "subpages")).toBe( 21 | false, 
22 | ); 23 | }); 24 | 25 | it("returns true for same hostname in hostname scope", () => { 26 | expect( 27 | isInScope(base, new URL("https://docs.example.com/docs/intro"), "hostname"), 28 | ).toBe(true); 29 | expect(isInScope(base, new URL("https://docs.example.com/api"), "hostname")).toBe( 30 | true, 31 | ); 32 | expect(isInScope(base, new URL("https://other.com/docs/start"), "hostname")).toBe( 33 | false, 34 | ); 35 | }); 36 | 37 | it("returns true for same domain in domain scope", () => { 38 | expect( 39 | isInScope(base, new URL("https://docs.example.com/docs/intro"), "domain"), 40 | ).toBe(true); 41 | expect(isInScope(base, new URL("https://api.example.com/"), "domain")).toBe(true); 42 | expect(isInScope(base, new URL("https://other.com/docs/start"), "domain")).toBe( 43 | false, 44 | ); 45 | expect(isInScope(base, new URL("https://example.com/"), "domain")).toBe(true); 46 | }); 47 | 48 | it("returns false for different protocol", () => { 49 | expect( 50 | isInScope(base, new URL("http://docs.example.com/docs/intro"), "hostname"), 51 | ).toBe(false); 52 | expect( 53 | isInScope(base, new URL("ftp://docs.example.com/docs/intro"), "hostname"), 54 | ).toBe(false); 55 | }); 56 | }); 57 | -------------------------------------------------------------------------------- /src/scraper/utils/scope.ts: -------------------------------------------------------------------------------- 1 | // Utility for scope filtering, extracted from WebScraperStrategy 2 | import type { URL } from "node:url"; 3 | 4 | /** 5 | * Returns true if the targetUrl is in scope of the baseUrl for the given scope. 6 | * - "subpages": same hostname, and target path starts with the parent directory of the base path 7 | * - "hostname": same hostname 8 | * - "domain": same top-level domain (e.g. 
example.com) 9 | */ 10 | export function isInScope( 11 | baseUrl: URL, 12 | targetUrl: URL, 13 | scope: "subpages" | "hostname" | "domain", 14 | ): boolean { 15 | if (baseUrl.protocol !== targetUrl.protocol) return false; 16 | switch (scope) { 17 | case "subpages": { 18 | if (baseUrl.hostname !== targetUrl.hostname) return false; 19 | // Use the parent directory of the base path 20 | const baseDir = baseUrl.pathname.endsWith("/") 21 | ? baseUrl.pathname 22 | : baseUrl.pathname.replace(/\/[^/]*$/, "/"); 23 | return targetUrl.pathname.startsWith(baseDir); 24 | } 25 | case "hostname": 26 | return baseUrl.hostname === targetUrl.hostname; 27 | case "domain": { 28 | // Compare the last two segments of the hostname (e.g. example.com) 29 | const getDomain = (host: string) => host.split(".").slice(-2).join("."); 30 | return getDomain(baseUrl.hostname) === getDomain(targetUrl.hostname); 31 | } 32 | default: 33 | return false; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/splitter/errors.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Base error class for all splitter-related errors 3 | */ 4 | export class SplitterError extends Error {} 5 | 6 | /** 7 | * Thrown when content cannot be split further while maintaining its validity 8 | * (e.g., markdown tables require headers, code blocks require language and backticks) 9 | */ 10 | export class MinimumChunkSizeError extends SplitterError { 11 | constructor(size: number, maxSize: number) { 12 | super( 13 | `Cannot split content any further. 
Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`,
    );
  }
}

/**
 * Generic error for content splitting failures
 */
export class ContentSplitterError extends SplitterError {}
--------------------------------------------------------------------------------
/src/splitter/index.ts:
--------------------------------------------------------------------------------
export * from "./SemanticMarkdownSplitter";
export * from "./GreedySplitter";
export * from "./errors";
--------------------------------------------------------------------------------
/src/splitter/splitters/CodeContentSplitter.test.ts:
--------------------------------------------------------------------------------
import { describe, expect, it, vi } from "vitest";
import { CodeContentSplitter } from "./CodeContentSplitter";
import type { ContentSplitterOptions } from "./types";

vi.mock("../../utils/logger");

describe("CodeContentSplitter", () => {
  const options = {
    chunkSize: 100,
  } satisfies ContentSplitterOptions;
  const splitter = new CodeContentSplitter(options);

  it("should preserve language in code blocks", async () => {
    const code = `function test() {
  console.log("Hello");
}`;
    const markdown = `\`\`\`typescript\n${code}\n\`\`\``;
    const chunks = await splitter.split(markdown);
    expect(chunks.length).toBe(1);
    expect(chunks[0]).toBe(markdown);
  });

  it("should handle code without language", async () => {
    const code = `const x = 1;
const y = 2;`;
    const markdown = `\`\`\`\n${code}\n\`\`\``;
    const chunks = await splitter.split(markdown);
    expect(chunks.length).toBe(1);
    expect(chunks[0]).toBe(markdown);
  });

  it("should split large code blocks by lines", async () => {
    const longLine =
      "console.log('This is a very long line of code that should be split.');";
    const code = Array(10).fill(longLine).join("\n");

    const markdown = `\`\`\`javascript\n${code}\n\`\`\``;
    const chunks = await splitter.split(markdown);
    expect(chunks.length).toBeGreaterThan(1);
    for (const chunk of chunks) {
      expect(chunk.length).toBeLessThanOrEqual(options.chunkSize);
      expect(chunk.startsWith("```javascript\n")).toBe(true);
      expect(chunk.endsWith("\n```")).toBe(true);
    }
  });

  it("should handle empty code blocks", async () => {
    const markdown = "```python\n\n```";
    const chunks = await splitter.split(markdown);
    expect(chunks.length).toBe(1);
    expect(chunks[0]).toBe(markdown);
  });

  it("should preserve indentation", async () => {
    const code = `function test() {
  if (condition) {
    for (let i = 0; i < 10; i++) {
      console.log(i);
    }
  }
}`;
    const markdown = `\`\`\`typescript\n${code}\n\`\`\``;
    const chunks = await splitter.split(markdown);
    for (const chunk of chunks) {
      // Check if indentation is preserved within the chunk
      const lines = chunk.split("\n");
      for (let i = 1; i < lines.length - 1; i++) {
        // Skip the first (```typescript) and last (```) lines
        if (lines[i].includes("if")) {
          expect(lines[i].startsWith("  "));
        } else if (lines[i].includes("for")) {
          expect(lines[i].startsWith("    "));
        } else if (lines[i].includes("console")) {
          expect(lines[i].startsWith("      "));
        }
      }
    }
  });
});
--------------------------------------------------------------------------------
/src/splitter/splitters/CodeContentSplitter.ts:
--------------------------------------------------------------------------------
import { MinimumChunkSizeError } from "../errors";
import type { ContentSplitter, ContentSplitterOptions } from "./types";

/**
 * Splits code content while preserving language information and formatting.
 * Uses line boundaries for splitting and ensures each chunk is properly
 * wrapped with language-specific code block markers.
 */
export class CodeContentSplitter implements ContentSplitter {
  constructor(private options: ContentSplitterOptions) {}

  /**
   * Splits a fenced code block into chunks no larger than chunkSize,
   * re-wrapping each chunk in a fence that repeats the original language.
   *
   * @throws MinimumChunkSizeError when a single wrapped line already exceeds chunkSize
   */
  async split(content: string): Promise<string[]> {
    // Determine language and strip triple backticks from content
    // NOTE(review): `\w+` will not match languages containing '+', '-' or '#'
    // (e.g. "c++", "objective-c") — confirm whether such fences can reach here.
    const language = content.match(/^```(\w+)\n/)?.[1];
    const strippedContent = content.replace(/^```(\w*)\n/, "").replace(/```\s*$/, "");

    const lines = strippedContent.split("\n");
    const chunks: string[] = [];
    let currentChunkLines: string[] = [];

    for (const line of lines) {
      // Check if a single line with code block markers exceeds chunkSize
      const singleLineSize = this.wrap(line, language).length;
      if (singleLineSize > this.options.chunkSize) {
        throw new MinimumChunkSizeError(singleLineSize, this.options.chunkSize);
      }

      currentChunkLines.push(line);
      const newChunkContent = this.wrap(currentChunkLines.join("\n"), language);
      const newChunkSize = newChunkContent.length;

      if (newChunkSize > this.options.chunkSize && currentChunkLines.length > 1) {
        // Overflow: emit everything except the line just added, then start
        // the next chunk with that line.
        const lastLine = currentChunkLines.pop();
        chunks.push(this.wrap(currentChunkLines.join("\n"), language));
        currentChunkLines = [lastLine as string];
      }
    }

    if (currentChunkLines.length > 0) {
      chunks.push(this.wrap(currentChunkLines.join("\n"), language));
    }

    return chunks;
  }

  /** Re-wraps chunk content in a code fence, trimming trailing newlines. */
  protected wrap(content: string, language?: string | null): string {
    return `\`\`\`${language || ""}\n${content.replace(/\n+$/, "")}\n\`\`\``;
  }
}
--------------------------------------------------------------------------------
/src/splitter/splitters/TableContentSplitter.test.ts:
-------------------------------------------------------------------------------- 1 | import { describe, expect, it, vi } from "vitest"; 2 | import { MinimumChunkSizeError } from "../errors"; 3 | import { TableContentSplitter } from "./TableContentSplitter"; 4 | import type { ContentSplitterOptions } from "./types"; 5 | 6 | vi.mock("../../utils/logger"); 7 | 8 | describe("TableContentSplitter", () => { 9 | const options = { 10 | chunkSize: 100, 11 | } satisfies ContentSplitterOptions; 12 | const splitter = new TableContentSplitter(options); 13 | 14 | it("should preserve table headers in each chunk", async () => { 15 | const table = `| Column 1 | Column 2 | Column 3 | 16 | |----------|-----------|-----------| 17 | | Data A1 | Data A2 | Data A3 | 18 | | Data B1 | Data B2 | Data B3 |`; 19 | 20 | const chunks = await splitter.split(table); 21 | 22 | for (const chunk of chunks) { 23 | const lines = chunk.split("\n"); 24 | expect(lines[0]).toBe("| Column 1 | Column 2 | Column 3 |"); 25 | expect(lines[1]).toBe("|---|---|---|"); 26 | } 27 | }); 28 | 29 | it("should split large tables by rows", async () => { 30 | // Create a large table that *might* exceed chunkSize, depending on header length 31 | const rows = Array(20) 32 | .fill(0) 33 | .map((_, i) => `| Data ${i}A | Data ${i}B |`); 34 | const table = `| Header A | Header B | 35 | |----------|-----------| 36 | ${rows.join("\n")}`; 37 | 38 | const chunks = await splitter.split(table); 39 | expect(chunks.length).toBeGreaterThan(0); // It will split, even if not > 1 40 | for (const chunk of chunks) { 41 | const lines = chunk.split("\n"); 42 | expect(lines[0]).toBe("| Header A | Header B |"); 43 | expect(lines[1]).toBe("|---|---|"); 44 | } 45 | }); 46 | 47 | it("should throw MinimumChunkSizeError if single row with headers exceeds chunkSize", async () => { 48 | const splitter = new TableContentSplitter({ 49 | chunkSize: 50, // Small size for testing 50 | }); 51 | const table = `| Header A | Header B | Header C | 52 | 
|----------|-----------|-----------|
| Very long data that exceeds max chunk size with headers | More data | And more |`;

    await expect(splitter.split(table)).rejects.toThrow(MinimumChunkSizeError);

    await expect(splitter.split(table)).rejects.toThrowError(
      "Cannot split content any further",
    );
  });

  it("should handle empty table", async () => {
    const splitter = new TableContentSplitter(options);
    const table = "";
    const chunks = await splitter.split(table);
    expect(chunks.length).toBe(1);
    expect(chunks[0]).toBe("");
  });

  it("should preserve special characters", async () => {
    const splitter = new TableContentSplitter(options);
    const table = `| Symbol | Description |
|---------|-------------|
| → | Arrow |
| 👋 | Wave |
| © | Copyright |
| | HTML Tag |`;

    const chunks = await splitter.split(table);
    const allContent = chunks.join("");
    expect(allContent).toContain("→");
    expect(allContent).toContain("👋");
    expect(allContent).toContain("©");
    expect(allContent).toContain("");
  });
});
--------------------------------------------------------------------------------
/src/splitter/splitters/TableContentSplitter.ts:
--------------------------------------------------------------------------------
import { MinimumChunkSizeError } from "../errors";
import type { ContentSplitter, ContentSplitterOptions } from "./types";

/**
 * Interface representing the structure of a parsed markdown table
 */
interface ParsedTable {
  headers: string[];
  separator: string;
  rows: string[];
}

/**
 * Splits table content while preserving headers and table formatting.
 * Each chunk maintains the table structure with headers and separator row.
 */
export class TableContentSplitter implements ContentSplitter {
  constructor(private options: ContentSplitterOptions) {}

  /**
   * Splits table content into chunks while preserving table structure.
   * Every emitted chunk is a complete markdown table (header + separator + rows).
   *
   * @param content Raw markdown table text.
   * @returns The chunks; content that does not parse as a table is returned
   *   unchanged as a single chunk.
   * @throws {MinimumChunkSizeError} If a single row (with headers) already
   *   exceeds the configured chunkSize — such content cannot be split further.
   */
  async split(content: string): Promise<string[]> {
    const parsedTable = this.parseTable(content);
    if (!parsedTable) {
      // Not a well-formed table; pass through untouched.
      return [content];
    }

    const { headers, rows } = parsedTable;

    const chunks: string[] = [];
    let currentRows: string[] = [];

    for (const row of rows) {
      // A row that does not fit even on its own cannot be split further.
      const singleRowSize = this.wrap(row, headers).length;
      if (singleRowSize > this.options.chunkSize) {
        throw new MinimumChunkSizeError(singleRowSize, this.options.chunkSize);
      }

      const newChunkContent = this.wrap([...currentRows, row].join("\n"), headers);
      const newChunkSize = newChunkContent.length;
      if (newChunkSize > this.options.chunkSize && currentRows.length > 0) {
        // Adding this row would overflow: flush the current chunk, start a new one.
        chunks.push(this.wrap(currentRows.join("\n"), headers));
        currentRows = [row];
      } else {
        currentRows.push(row);
      }
    }

    if (currentRows.length > 0) {
      chunks.push(this.wrap(currentRows.join("\n"), headers));
    }

    // No merging of table chunks
    return chunks;
  }

  /**
   * Prepends the header and separator rows so each chunk is a valid table.
   */
  protected wrap(content: string, headers: string[]): string {
    const headerRow = `| ${headers.join(" | ")} |`;
    const separatorRow = `|${headers.map(() => "---").join("|")}|`;
    return [headerRow, separatorRow, content].join("\n");
  }

  /**
   * Parses the table into headers, separator and raw data-row strings.
   * Returns null when the content is not recognizable as a markdown table.
   */
  private parseTable(content: string): ParsedTable | null {
    const lines = content.trim().split("\n");
    if (lines.length < 3) return null; // Need at least headers, separator, and one data row

    const headers = this.parseRow(lines[0]);
    if (!headers) return null;

    const separator = lines[1];
    if (!this.isValidSeparator(separator)) return null;

    const rows = lines.slice(2).filter((row) => row.trim() !== "");

    return { headers, separator, rows };
  }

  /**
   * Parses a table row into cells.
   *
   * Only the empty entries produced by the leading/trailing `|` delimiters are
   * removed; interior empty cells are preserved. (Previously ALL empty cells
   * were filtered out, so a header like `| A | | B |` lost its middle column
   * and every wrapped chunk ended up with misaligned headers.)
   */
  private parseRow(row: string): string[] | null {
    const trimmed = row.trim();
    if (!trimmed.includes("|")) return null;
    const cells = trimmed.split("|").map((cell) => cell.trim());
    // `| a | b |`.split("|") yields "" at both ends; drop only those.
    if (trimmed.startsWith("|")) cells.shift();
    if (trimmed.endsWith("|")) cells.pop();
    return cells;
  }

  /**
   * Validates the separator row of the table
   */
  private isValidSeparator(separator: string): boolean {
    return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
  }
}
--------------------------------------------------------------------------------
/src/splitter/splitters/TextContentSplitter.test.ts:
--------------------------------------------------------------------------------
import { describe, expect, it, vi } from "vitest";
import { TextContentSplitter } from "./TextContentSplitter";
import type { ContentSplitterOptions } from "./types";

vi.mock("../../utils/logger");

describe("TextContentSplitter", () => {
  const options = {
    chunkSize: 100,
  } satisfies ContentSplitterOptions;
  const splitter = new TextContentSplitter(options);

  it("should split on paragraph boundaries when possible", async () => {
    const text = `First paragraph with some content.

Second paragraph that continues the text.

Third paragraph to complete the example.`;

    const chunks = await splitter.split(text);

    expect(chunks.length).toBe(3);
    expect(chunks[0]).toBe("First paragraph with some content.");
    expect(chunks[1]).toBe("Second paragraph that continues the text.");
    expect(chunks[2]).toBe("Third paragraph to complete the example.");
  });

  it("should fall back to line breaks when paragraphs too large", async () => {
    // Create a paragraph larger than preferredChunkSize
    const longParagraph = Array(5)
      .fill("This is a very long line of text that should be split.")
      .join(" ");

    const text = `${longParagraph}
Line two of the text.
Line three continues here.
And line four finishes it.`;

    const chunks = await splitter.split(text);

    // Should split into multiple chunks at line boundaries
    expect(chunks.length).toBeGreaterThan(1);
    for (const chunk of chunks) {
      expect(chunk.length).toBeLessThanOrEqual(options.chunkSize);
    }
  });

  it("should merge small chunks when possible", async () => {
    const text =
      "Short line 1.\nShort line 2.\nShort line 3.\n\nAnother short one.\nAnd another.";

    const chunks = await splitter.split(text);

    // Small consecutive lines should be merged
    expect(chunks.length).toBeLessThan(6); // Less than total number of lines
    for (const chunk of chunks) {
      expect(chunk.length).toBeLessThanOrEqual(options.chunkSize);
    }
  });

  it("should handle empty content gracefully", async () => {
    // Both truly empty and whitespace-only input collapse to a single empty chunk.
    const emptyChunks = await splitter.split("");
    expect(emptyChunks.length).toBe(1);
    expect(emptyChunks[0]).toBe("");

    const whitespaceChunks = await splitter.split(" \n \n ");
    expect(whitespaceChunks.length).toBe(1);
    expect(whitespaceChunks[0]).toBe("");
  });

  it("should split words as last resort", async () => {
    // Dedicated splitter instance: chunkSize far below any sentence length.
    const splitter = new TextContentSplitter({
      chunkSize: 20, // Very small for testing word splitting
    });

    const text =
      "This is a very long sentence that needs to be split into smaller chunks";

    const chunks = await splitter.split(text);

    expect(chunks.length).toBeGreaterThan(1);
    for (const chunk of chunks) {
      expect(chunk.length).toBeLessThanOrEqual(20);
    }
  });
});
--------------------------------------------------------------------------------
/src/splitter/splitters/types.ts:
--------------------------------------------------------------------------------
/**
 * Common configuration options for content splitters
 */
export interface ContentSplitterOptions {
  /** Maximum characters per chunk */
  chunkSize: number;
}

/**
 * Core interface for content splitters
 */
export interface ContentSplitter {
  /** Split content into chunks respecting size constraints */
  split(content: string): Promise<string[]>;
}
--------------------------------------------------------------------------------
/src/splitter/types.ts:
--------------------------------------------------------------------------------
/**
 * Types of content within a document section
 */
export type SectionContentType = "text" | "code" | "table" | "heading";

/**
 * Final output chunk after processing and size-based splitting
 */
export interface ContentChunk {
  // All content types present in this chunk (a chunk may mix e.g. text + code).
  types: SectionContentType[];
  content: string;
  // Position of the chunk within the document's heading hierarchy.
  section: {
    level: number;
    path: string[];
  };
}

/**
 * Interface for a splitter that processes markdown content into chunks
 */
export interface DocumentSplitter {
  splitText(markdown: string): Promise<ContentChunk[]>;
}
--------------------------------------------------------------------------------
/src/store/embeddings/FixedDimensionEmbeddings.test.ts:
--------------------------------------------------------------------------------
import { Embeddings } from "@langchain/core/embeddings";
import { describe, expect, test, vi } from "vitest";
import { DimensionError } from "../errors";
import { VECTOR_DIMENSION } from "../types";
import { FixedDimensionEmbeddings } from "./FixedDimensionEmbeddings";

// Suppress logger output during tests
vi.mock("../../utils/logger");

// Mock embedding models that produce vectors of different sizes
class MockBaseEmbeddings extends Embeddings {
  constructor(private dimension: number) {
    super({});
  }

  // Returns a single vector of `dimension` ones.
  async embedQuery(_text: string): Promise<number[]> {
    return Array(this.dimension).fill(1);
  }

  // Always returns exactly ONE vector regardless of input count (kept simple on purpose).
  async embedDocuments(_documents: string[]): Promise<number[][]> {
    return [Array(this.dimension).fill(1)];
  }
}

describe("FixedDimensionEmbeddings", () => {
  const targetDimension = VECTOR_DIMENSION;

  test("should pass through vectors of correct dimension", async () => {
    const base = new MockBaseEmbeddings(targetDimension);
    const wrapper = new FixedDimensionEmbeddings(base, targetDimension, "test:model");

    const vector = await wrapper.embedQuery("test");
    expect(vector.length).toBe(targetDimension);
  });

  test("should pad vectors that are too short", async () => {
    const shortDimension = 1024;
    const base = new MockBaseEmbeddings(shortDimension);
    const wrapper = new FixedDimensionEmbeddings(base, targetDimension, "test:model");

    const vector = await wrapper.embedQuery("test");
    expect(vector.length).toBe(targetDimension);
    // Check that first part contains the original values
    expect(vector.slice(0, shortDimension)).toEqual(Array(shortDimension).fill(1));
    // Check that padding is zeros
    expect(vector.slice(shortDimension)).toEqual(
      Array(targetDimension - shortDimension).fill(0),
    );
  });

  test("should truncate oversized vectors when allowTruncate is true", async () => {
    const largeDimension = 2048;
    const base = new MockBaseEmbeddings(largeDimension);
    const wrapper = new FixedDimensionEmbeddings(
      base,
      targetDimension,
      "test:model",
      true, // allowTruncate
    );

    const vector = await wrapper.embedQuery("test");
    expect(vector.length).toBe(targetDimension);
    expect(vector).toEqual(Array(targetDimension).fill(1));
  });

  test("should throw DimensionError for oversized vectors when allowTruncate is false", async () => {
    const largeDimension = 3072;
    const base = new MockBaseEmbeddings(largeDimension);
    const wrapper = new FixedDimensionEmbeddings(base, targetDimension, "test:model");

    await expect(() => wrapper.embedQuery("test")).rejects.toThrow(DimensionError);
  });

  test("should process multiple documents correctly", async () => {
    const shortDimension = 1024;
    const base = new MockBaseEmbeddings(shortDimension);
    const wrapper = new FixedDimensionEmbeddings(base, targetDimension, "test:model");

    const vectors = await wrapper.embedDocuments(["test1", "test2"]);
    expect(vectors.length).toBe(1); // Our mock returns just one vector
    expect(vectors[0].length).toBe(targetDimension);
    // Check padding
    expect(vectors[0].slice(shortDimension)).toEqual(
      Array(targetDimension - shortDimension).fill(0),
    );
  });
});
--------------------------------------------------------------------------------
/src/store/embeddings/FixedDimensionEmbeddings.ts:
--------------------------------------------------------------------------------
import { Embeddings } from "@langchain/core/embeddings";
import { DimensionError } from "../errors";

/**
 * Wrapper around an Embeddings implementation that ensures vectors have a fixed dimension.
6 | * - If a vector's dimension is greater than the target and truncation is allowed, 7 | * the vector is truncated (e.g., for models that support MRL - Matryoshka 8 | * Representation Learning). 9 | * - If a vector's dimension is greater than the target and truncation is not 10 | * allowed, a DimensionError is thrown. 11 | * - If a vector's dimension is less than the target, it is padded with zeros. 12 | */ 13 | export class FixedDimensionEmbeddings extends Embeddings { 14 | private provider: string; 15 | private model: string; 16 | 17 | constructor( 18 | private readonly embeddings: Embeddings, 19 | private readonly targetDimension: number, 20 | providerAndModel: string, 21 | private readonly allowTruncate: boolean = false, 22 | ) { 23 | super({}); 24 | // Parse provider and model from string (e.g., "gemini:embedding-001" or just "text-embedding-3-small") 25 | const [providerOrModel, modelName] = providerAndModel.split(":"); 26 | this.provider = modelName ? providerOrModel : "openai"; // Default to openai if no provider specified 27 | this.model = modelName || providerOrModel; 28 | } 29 | 30 | /** 31 | * Normalize a vector to the target dimension by truncating (for MRL models) or padding. 
32 | * @throws {DimensionError} If vector is too large and provider doesn't support MRL 33 | */ 34 | private normalizeVector(vector: number[]): number[] { 35 | const dimension = vector.length; 36 | 37 | if (dimension > this.targetDimension) { 38 | // If truncation is allowed (e.g., for MRL models like Gemini), truncate the vector 39 | if (this.allowTruncate) { 40 | return vector.slice(0, this.targetDimension); 41 | } 42 | // Otherwise, throw an error 43 | throw new DimensionError( 44 | `${this.provider}:${this.model}`, 45 | dimension, 46 | this.targetDimension, 47 | ); 48 | } 49 | 50 | if (dimension < this.targetDimension) { 51 | // Pad with zeros to reach target dimension 52 | return [...vector, ...new Array(this.targetDimension - dimension).fill(0)]; 53 | } 54 | 55 | return vector; 56 | } 57 | 58 | async embedQuery(text: string): Promise { 59 | const vector = await this.embeddings.embedQuery(text); 60 | return this.normalizeVector(vector); 61 | } 62 | 63 | async embedDocuments(documents: string[]): Promise { 64 | const vectors = await this.embeddings.embedDocuments(documents); 65 | return vectors.map((vector) => this.normalizeVector(vector)); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/store/errors.ts: -------------------------------------------------------------------------------- 1 | class StoreError extends Error { 2 | constructor( 3 | message: string, 4 | public readonly cause?: unknown, 5 | ) { 6 | super(cause ? `${message} caused by ${cause}` : message); 7 | this.name = this.constructor.name; 8 | 9 | const causeError = 10 | cause instanceof Error ? cause : cause ? 
new Error(String(cause)) : undefined;
    // NOTE(review): adopting the cause's stack replaces this error's own
    // stack trace — presumably intentional to surface the original failure
    // site; confirm before changing.
    if (causeError?.stack) {
      this.stack = causeError.stack;
    }
  }
}

// Raised when an embedding model's output exceeds the database's fixed vector size.
class DimensionError extends StoreError {
  constructor(
    public readonly modelName: string,
    public readonly modelDimension: number,
    public readonly dbDimension: number,
  ) {
    super(
      `Model "${modelName}" produces ${modelDimension}-dimensional vectors, ` +
        `which exceeds the database's fixed dimension of ${dbDimension}. ` +
        `Please use a model with dimension ≤ ${dbDimension}.`,
    );
  }
}

class ConnectionError extends StoreError {}

class DocumentNotFoundError extends StoreError {
  constructor(public readonly id: string) {
    super(`Document ${id} not found`);
  }
}

export { StoreError, ConnectionError, DocumentNotFoundError, DimensionError };
--------------------------------------------------------------------------------
/src/store/index.ts:
--------------------------------------------------------------------------------
export * from "./DocumentStore";
export * from "./DocumentManagementService";
export * from "./errors";
--------------------------------------------------------------------------------
/src/store/types.ts:
--------------------------------------------------------------------------------
import type { DocumentMetadata } from "../types";

/** Default vector dimension used across the application */
export const VECTOR_DIMENSION = 1536;

/**
 * Database document record type matching the documents table schema
 */
export interface DbDocument {
  id: string;
  library: string;
  version: string;
  url: string;
  content: string;
  metadata: string; // JSON string of DocumentMetadata
  embedding: string | null; // JSON string of number[]
  sort_order: number;
  score: number | null;
}

/**
 * Utility type for handling SQLite query results that may be undefined
 */
export type DbQueryResult<T> = T | undefined;

/**
 * Maps raw database document to the Document type used by the application.
 * Note: metadata is parsed from its JSON string form; a malformed string
 * will throw from JSON.parse.
 */
export function mapDbDocumentToDocument(doc: DbDocument) {
  return {
    id: doc.id,
    pageContent: doc.content,
    metadata: JSON.parse(doc.metadata) as DocumentMetadata,
  };
}

/**
 * Search result type returned by the DocumentRetrieverService
 */
export interface StoreSearchResult {
  url: string;
  content: string;
  score: number | null;
}

/**
 * Represents a library and its indexed versions.
 */
export interface LibraryVersion {
  version: string;
}

/**
 * Detailed information about a specific indexed library version.
 */
export interface LibraryVersionDetails {
  version: string;
  documentCount: number;
  uniqueUrlCount: number;
  indexedAt: string | null; // ISO 8601 format from MIN(indexed_at)
}

/**
 * Result type for findBestVersion, indicating the best semver match
 * and whether unversioned documents exist.
 */
export interface FindVersionResult {
  bestMatch: string | null;
  hasUnversioned: boolean;
}
--------------------------------------------------------------------------------
/src/tools/CancelJobTool.ts:
--------------------------------------------------------------------------------
import type { PipelineManager } from "../pipeline/PipelineManager";
import { PipelineJobStatus } from "../pipeline/types";
import { logger } from "../utils/logger";

/**
 * Input parameters for the CancelJobTool.
 */
export interface CancelJobInput {
  /** The ID of the job to cancel. */
  jobId: string;
}

/**
 * Output result for the CancelJobTool.
15 | */ 16 | export interface CancelJobResult { 17 | /** A message indicating the outcome of the cancellation attempt. */ 18 | message: string; 19 | /** Indicates if the cancellation request was successfully initiated or if the job was already finished/cancelled. */ 20 | success: boolean; 21 | } 22 | 23 | /** 24 | * Tool for attempting to cancel a pipeline job. 25 | */ 26 | export class CancelJobTool { 27 | private manager: PipelineManager; 28 | 29 | /** 30 | * Creates an instance of CancelJobTool. 31 | * @param manager The PipelineManager instance. 32 | */ 33 | constructor(manager: PipelineManager) { 34 | this.manager = manager; 35 | } 36 | 37 | /** 38 | * Executes the tool to attempt cancellation of a specific job. 39 | * @param input - The input parameters, containing the jobId. 40 | * @returns A promise that resolves with the outcome message. 41 | */ 42 | async execute(input: CancelJobInput): Promise { 43 | try { 44 | // Retrieve the job first to check its status before attempting cancellation 45 | const job = await this.manager.getJob(input.jobId); 46 | 47 | if (!job) { 48 | logger.warn(`❓ [CancelJobTool] Job not found: ${input.jobId}`); 49 | return { 50 | message: `Job with ID ${input.jobId} not found.`, 51 | success: false, 52 | }; 53 | } 54 | 55 | // Check if the job is already in a final state 56 | if ( 57 | job.status === PipelineJobStatus.COMPLETED || // Use enum member 58 | job.status === PipelineJobStatus.FAILED || // Use enum member 59 | job.status === PipelineJobStatus.CANCELLED // Use enum member 60 | ) { 61 | logger.debug(`Job ${input.jobId} is already in a final state: ${job.status}.`); 62 | return { 63 | message: `Job ${input.jobId} is already ${job.status}. 
No action taken.`, 64 | success: true, // Considered success as no cancellation needed 65 | }; 66 | } 67 | 68 | // Attempt cancellation 69 | await this.manager.cancelJob(input.jobId); 70 | 71 | // Re-fetch the job to confirm status change (or check status directly if cancelJob returned it) 72 | // PipelineManager.cancelJob doesn't return status, so re-fetch is needed for confirmation. 73 | const updatedJob = await this.manager.getJob(input.jobId); 74 | const finalStatus = updatedJob?.status ?? "UNKNOWN (job disappeared?)"; 75 | 76 | logger.debug( 77 | `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}`, 78 | ); 79 | return { 80 | message: `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}.`, 81 | success: true, 82 | }; 83 | } catch (error) { 84 | logger.error(`❌ Error cancelling job ${input.jobId}: ${error}`); 85 | return { 86 | message: `Failed to cancel job ${input.jobId}: ${ 87 | error instanceof Error ? error.message : String(error) 88 | }`, 89 | success: false, 90 | }; 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/tools/ClearCompletedJobsTool.test.ts: -------------------------------------------------------------------------------- 1 | import { type Mock, beforeEach, describe, expect, it, vi } from "vitest"; 2 | import type { PipelineManager } from "../pipeline/PipelineManager"; 3 | import { ClearCompletedJobsTool } from "./ClearCompletedJobsTool"; 4 | 5 | // Mock dependencies 6 | vi.mock("../pipeline/PipelineManager"); 7 | vi.mock("../utils/logger"); 8 | 9 | describe("ClearCompletedJobsTool", () => { 10 | let mockManagerInstance: Partial; 11 | let clearCompletedJobsTool: ClearCompletedJobsTool; 12 | 13 | beforeEach(() => { 14 | vi.resetAllMocks(); 15 | 16 | // Define the mock implementation for the manager instance 17 | mockManagerInstance = { 18 | clearCompletedJobs: vi.fn().mockResolvedValue(0), // Default to no jobs cleared 19 | }; 20 | 
    // Instantiate the tool with the correctly typed mock instance
    clearCompletedJobsTool = new ClearCompletedJobsTool(
      mockManagerInstance as PipelineManager,
    );
  });

  it("should call manager.clearCompletedJobs", async () => {
    await clearCompletedJobsTool.execute({});
    expect(mockManagerInstance.clearCompletedJobs).toHaveBeenCalledOnce();
  });

  it("should return success: true with count when jobs are cleared", async () => {
    const clearedCount = 3;
    (mockManagerInstance.clearCompletedJobs as Mock).mockResolvedValue(clearedCount);

    const result = await clearCompletedJobsTool.execute({});

    expect(mockManagerInstance.clearCompletedJobs).toHaveBeenCalledOnce();
    expect(result.success).toBe(true);
    expect(result.clearedCount).toBe(clearedCount);
    expect(result.message).toContain("Successfully cleared 3 completed jobs");
  });

  it("should return success: true with singular message when 1 job is cleared", async () => {
    const clearedCount = 1;
    (mockManagerInstance.clearCompletedJobs as Mock).mockResolvedValue(clearedCount);

    const result = await clearCompletedJobsTool.execute({});

    expect(result.success).toBe(true);
    expect(result.clearedCount).toBe(clearedCount);
    expect(result.message).toContain("Successfully cleared 1 completed job");
    expect(result.message).not.toContain("jobs"); // Should be singular
  });

  it("should return success: true with appropriate message when no jobs are cleared", async () => {
    const clearedCount = 0;
    (mockManagerInstance.clearCompletedJobs as Mock).mockResolvedValue(clearedCount);

    const result = await clearCompletedJobsTool.execute({});

    expect(result.success).toBe(true);
    expect(result.clearedCount).toBe(clearedCount);
    expect(result.message).toBe("No completed jobs to clear.");
  });

  it("should return success: false if clearCompletedJobs throws an error", async () => {
    const clearError = new Error("Clear operation failed");
    (mockManagerInstance.clearCompletedJobs as Mock).mockRejectedValue(clearError);

    const result = await clearCompletedJobsTool.execute({});

    expect(mockManagerInstance.clearCompletedJobs).toHaveBeenCalledOnce();
    expect(result.success).toBe(false);
    expect(result.clearedCount).toBe(0);
    expect(result.message).toContain("Failed to clear completed jobs");
    expect(result.message).toContain(clearError.message);
  });

  it("should handle non-Error exceptions gracefully", async () => {
    // Rejecting with a plain string exercises the String(error) fallback path.
    const clearError = "String error message";
    (mockManagerInstance.clearCompletedJobs as Mock).mockRejectedValue(clearError);

    const result = await clearCompletedJobsTool.execute({});

    expect(result.success).toBe(false);
    expect(result.clearedCount).toBe(0);
    expect(result.message).toContain("Failed to clear completed jobs");
    expect(result.message).toContain(clearError);
  });
});
--------------------------------------------------------------------------------
/src/tools/ClearCompletedJobsTool.ts:
--------------------------------------------------------------------------------
import type { PipelineManager } from "../pipeline/PipelineManager";
import { logger } from "../utils/logger";

/**
 * Input parameters for the ClearCompletedJobsTool.
 */
// biome-ignore lint/suspicious/noEmptyInterface: No input parameters needed for this tool
export interface ClearCompletedJobsInput {
  // No input parameters needed for this tool
}

/**
 * Output result for the ClearCompletedJobsTool.
 */
export interface ClearCompletedJobsResult {
  /** A message indicating the outcome of the clear operation. */
  message: string;
  /** Indicates if the clear operation was successful. */
  success: boolean;
  /** The number of jobs that were cleared.
*/ 21 | clearedCount: number; 22 | } 23 | 24 | /** 25 | * Tool for clearing all completed, cancelled, and failed jobs from the pipeline. 26 | * This helps keep the job queue clean by removing jobs that are no longer active. 27 | */ 28 | export class ClearCompletedJobsTool { 29 | private manager: PipelineManager; 30 | 31 | /** 32 | * Creates an instance of ClearCompletedJobsTool. 33 | * @param manager The PipelineManager instance. 34 | */ 35 | constructor(manager: PipelineManager) { 36 | this.manager = manager; 37 | } 38 | 39 | /** 40 | * Executes the tool to clear all completed jobs from the pipeline. 41 | * @param input - The input parameters (currently unused). 42 | * @returns A promise that resolves with the outcome of the clear operation. 43 | */ 44 | async execute(input: ClearCompletedJobsInput): Promise { 45 | try { 46 | const clearedCount = await this.manager.clearCompletedJobs(); 47 | 48 | const message = 49 | clearedCount > 0 50 | ? `Successfully cleared ${clearedCount} completed job${clearedCount === 1 ? "" : "s"} from the queue.` 51 | : "No completed jobs to clear."; 52 | 53 | logger.debug(`[ClearCompletedJobsTool] ${message}`); 54 | 55 | return { 56 | message, 57 | success: true, 58 | clearedCount, 59 | }; 60 | } catch (error) { 61 | const errorMessage = `Failed to clear completed jobs: ${ 62 | error instanceof Error ? 
error.message : String(error) 63 | }`; 64 | 65 | logger.error(`❌ [ClearCompletedJobsTool] ${errorMessage}`); 66 | 67 | return { 68 | message: errorMessage, 69 | success: false, 70 | clearedCount: 0, 71 | }; 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/tools/FindVersionTool.ts: -------------------------------------------------------------------------------- 1 | import type { DocumentManagementService } from "../store"; 2 | import { logger } from "../utils/logger"; 3 | import { VersionNotFoundError } from "./errors"; 4 | 5 | export interface FindVersionToolOptions { 6 | library: string; 7 | targetVersion?: string; 8 | } 9 | 10 | /** 11 | * Tool for finding the best matching version of a library in the store. 12 | * Supports exact version matches and X-Range patterns (e.g., '5.x', '5.2.x'). 13 | */ 14 | export class FindVersionTool { 15 | private docService: DocumentManagementService; 16 | 17 | constructor(docService: DocumentManagementService) { 18 | this.docService = docService; 19 | } 20 | 21 | /** 22 | * Executes the tool to find the best matching version and checks for unversioned docs. 23 | * @returns A descriptive string indicating the best match and unversioned status, or an error message. 24 | */ 25 | async execute(options: FindVersionToolOptions): Promise { 26 | const { library, targetVersion } = options; 27 | const targetVersionString = targetVersion ? 
`@${targetVersion}` : "";

    try {
      const { bestMatch, hasUnversioned } = await this.docService.findBestVersion(
        library,
        targetVersion,
      );

      // Build a human-readable summary covering every match/unversioned combination.
      let message = "";
      if (bestMatch) {
        message = `Best match: ${bestMatch}.`;
        if (hasUnversioned) {
          message += " Unversioned docs also available.";
        }
      } else if (hasUnversioned) {
        message = `No matching version found for ${library}${targetVersionString}, but unversioned docs exist.`;
      } else {
        // This case should ideally be caught by VersionNotFoundError below,
        // but added for completeness.
        message = `No matching version or unversioned documents found for ${library}${targetVersionString}.`;
      }
      return message;
    } catch (error) {
      if (error instanceof VersionNotFoundError) {
        // This error is thrown when no semver versions AND no unversioned docs exist.
        logger.info(`ℹ️ Version not found: ${error.message}`);
        return `No matching version or unversioned documents found for ${library}${targetVersionString}. Available: ${
          error.availableVersions.length > 0
            ? error.availableVersions.map((v) => v.version).join(", ")
            : "None"
        }.`;
      }
      // Re-throw unexpected errors
      logger.error(
        `❌ Error finding version for ${library}${targetVersionString}: ${error instanceof Error ? error.message : error}`,
      );
      throw error;
    }
  }
}
--------------------------------------------------------------------------------
/src/tools/GetJobInfoTool.test.ts:
--------------------------------------------------------------------------------
import { type Mock, beforeEach, describe, expect, it, vi } from "vitest";
import type { PipelineManager } from "../pipeline/PipelineManager";
import { type PipelineJob, PipelineJobStatus } from "../pipeline/types";
import type { ScraperOptions } from "../scraper/types";
import { GetJobInfoTool } from "./GetJobInfoTool"; // Updated import

// Mock dependencies
vi.mock("../pipeline/PipelineManager");
vi.mock("../utils/logger");

describe("GetJobInfoTool", () => {
  // Updated describe block
  let mockManagerInstance: Partial<PipelineManager>;
  let getJobInfoTool: GetJobInfoTool; // Updated variable name

  const MOCK_JOB_ID_FOUND = "job-found-123";
  const MOCK_JOB_ID_NOT_FOUND = "job-not-found-456";

  // A RUNNING job: startedAt set, finishedAt/error still null.
  const mockJob: PipelineJob = {
    id: MOCK_JOB_ID_FOUND,
    library: "lib-a",
    version: "1.0.0",
    status: PipelineJobStatus.RUNNING,
    createdAt: new Date("2023-01-01T10:00:00Z"),
    startedAt: new Date("2023-01-01T10:05:00Z"),
    options: { library: "lib-a", version: "1.0.0", url: "url1" } as ScraperOptions,
    progress: null,
    error: null,
    finishedAt: null,
    abortController: new AbortController(),
    completionPromise: Promise.resolve(),
    resolveCompletion: () => {},
    rejectCompletion: () => {},
  };

  beforeEach(() => {
    vi.resetAllMocks();

    // Define the mock implementation for the manager instance
    mockManagerInstance = {
      // Mock getJob to return the job if ID matches, otherwise undefined
      getJob: vi.fn().mockImplementation(async (jobId: string) => {
        if (jobId === MOCK_JOB_ID_FOUND) {
          return mockJob;
        }
        return undefined; // Simulate job not
found 47 | }), 48 | }; 49 | 50 | // Instantiate the tool with the correctly typed mock instance 51 | getJobInfoTool = new GetJobInfoTool(mockManagerInstance as PipelineManager); // Updated instantiation 52 | }); 53 | 54 | it("should call manager.getJob with the provided jobId", async () => { 55 | await getJobInfoTool.execute({ jobId: MOCK_JOB_ID_FOUND }); // Updated tool call 56 | expect(mockManagerInstance.getJob).toHaveBeenCalledWith(MOCK_JOB_ID_FOUND); 57 | }); 58 | 59 | it("should return the job details if the job is found", async () => { 60 | const result = await getJobInfoTool.execute({ jobId: MOCK_JOB_ID_FOUND }); // Updated tool call 61 | 62 | expect(result.job).not.toBeNull(); 63 | // Check properties of the simplified JobInfo object 64 | expect(result.job?.id).toBe(mockJob.id); 65 | expect(result.job?.library).toBe(mockJob.library); 66 | expect(result.job?.version).toBe(mockJob.version); 67 | expect(result.job?.status).toBe(mockJob.status); 68 | expect(result.job?.createdAt).toBe(mockJob.createdAt.toISOString()); 69 | expect(result.job?.startedAt).toBe(mockJob.startedAt?.toISOString()); 70 | expect(result.job?.finishedAt).toBeNull(); // Based on mockJob 71 | expect(result.job?.error).toBeNull(); // Based on mockJob 72 | }); 73 | 74 | it("should return null if the job is not found", async () => { 75 | const result = await getJobInfoTool.execute({ jobId: MOCK_JOB_ID_NOT_FOUND }); // Updated tool call 76 | 77 | expect(mockManagerInstance.getJob).toHaveBeenCalledWith(MOCK_JOB_ID_NOT_FOUND); 78 | expect(result.job).toBeNull(); 79 | }); 80 | }); 81 | -------------------------------------------------------------------------------- /src/tools/GetJobInfoTool.ts: -------------------------------------------------------------------------------- 1 | import type { PipelineManager } from "../pipeline/PipelineManager"; 2 | import type { PipelineJob, PipelineJobStatus } from "../pipeline/types"; 3 | 4 | /** 5 | * Input parameters for the GetJobInfoTool. 
6 | */ 7 | export interface GetJobInfoInput { 8 | /** The ID of the job to retrieve info for. */ 9 | jobId: string; 10 | } 11 | 12 | /** 13 | * Simplified information about a pipeline job for external use. 14 | */ 15 | export interface JobInfo { 16 | id: string; 17 | library: string; 18 | version: string; 19 | status: PipelineJobStatus; 20 | createdAt: string; 21 | startedAt: string | null; 22 | finishedAt: string | null; 23 | error: string | null; 24 | } 25 | 26 | /** 27 | * Response structure for the GetJobInfoTool. 28 | */ 29 | export interface GetJobInfoToolResponse { 30 | job: JobInfo | null; 31 | } 32 | 33 | /** 34 | * Tool for retrieving simplified information about a specific pipeline job. 35 | */ 36 | export class GetJobInfoTool { 37 | private manager: PipelineManager; 38 | 39 | /** 40 | * Creates an instance of GetJobInfoTool. 41 | * @param manager The PipelineManager instance. 42 | */ 43 | constructor(manager: PipelineManager) { 44 | this.manager = manager; 45 | } 46 | 47 | /** 48 | * Executes the tool to retrieve simplified info for a specific job. 49 | * @param input - The input parameters, containing the jobId. 50 | * @returns A promise that resolves with the simplified job info or null if not found. 51 | */ 52 | async execute(input: GetJobInfoInput): Promise { 53 | const job = await this.manager.getJob(input.jobId); 54 | 55 | if (!job) { 56 | // Return null in the result if job not found 57 | return { job: null }; 58 | } 59 | 60 | // Transform the job into a simplified object 61 | const jobInfo: JobInfo = { 62 | id: job.id, 63 | library: job.library, 64 | version: job.version, 65 | status: job.status, 66 | createdAt: job.createdAt.toISOString(), 67 | startedAt: job.startedAt?.toISOString() ?? null, 68 | finishedAt: job.finishedAt?.toISOString() ?? null, 69 | error: job.error?.message ?? 
null, 70 | }; 71 | 72 | return { job: jobInfo }; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/tools/ListJobsTool.ts: -------------------------------------------------------------------------------- 1 | import type { PipelineManager } from "../pipeline/PipelineManager"; 2 | import type { PipelineJob, PipelineJobStatus } from "../pipeline/types"; 3 | import type { JobInfo } from "./GetJobInfoTool"; // Import JobInfo 4 | 5 | /** 6 | * Input parameters for the ListJobsTool. 7 | */ 8 | export interface ListJobsInput { 9 | /** Optional status to filter jobs by. */ 10 | status?: PipelineJobStatus; 11 | } 12 | 13 | /** 14 | * Response structure for the ListJobsTool. 15 | */ 16 | export interface ListJobsToolResponse { 17 | jobs: JobInfo[]; 18 | } 19 | 20 | /** 21 | * Tool for listing pipeline jobs managed by the PipelineManager. 22 | * Allows filtering jobs by their status. 23 | */ 24 | export class ListJobsTool { 25 | private manager: PipelineManager; // Change property name and type 26 | 27 | /** 28 | * Creates an instance of ListJobsTool. 29 | * @param manager The PipelineManager instance. 30 | */ 31 | constructor(manager: PipelineManager) { 32 | // Change constructor parameter 33 | this.manager = manager; 34 | } 35 | 36 | /** 37 | * Executes the tool to retrieve a list of pipeline jobs. 38 | * @param input - The input parameters, optionally including a status filter. 39 | * @returns A promise that resolves with the list of simplified job objects. 40 | * @throws {PipelineStateError} If the pipeline manager is somehow unavailable. 
41 | */ 42 | async execute(input: ListJobsInput): Promise { 43 | const jobs = await this.manager.getJobs(input.status); 44 | 45 | // Transform jobs into simplified objects 46 | const simplifiedJobs: JobInfo[] = jobs.map( 47 | (job: PipelineJob): JobInfo => ({ 48 | id: job.id, 49 | library: job.library, 50 | version: job.version, 51 | status: job.status, 52 | createdAt: job.createdAt.toISOString(), 53 | startedAt: job.startedAt?.toISOString() ?? null, 54 | finishedAt: job.finishedAt?.toISOString() ?? null, 55 | error: job.error?.message ?? null, 56 | }), 57 | ); 58 | 59 | return { jobs: simplifiedJobs }; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/tools/ListLibrariesTool.ts: -------------------------------------------------------------------------------- 1 | import type { DocumentManagementService } from "../store/DocumentManagementService"; 2 | import type { LibraryVersionDetails } from "../store/types"; 3 | 4 | // Define the structure for the tool's output, using the detailed version info 5 | export interface LibraryInfo { 6 | name: string; 7 | versions: LibraryVersionDetails[]; // Use the detailed interface 8 | } 9 | 10 | export interface ListLibrariesResult { 11 | libraries: LibraryInfo[]; 12 | } 13 | 14 | /** 15 | * Tool for listing all available libraries and their indexed versions in the store. 
16 | */ 17 | export class ListLibrariesTool { 18 | private docService: DocumentManagementService; 19 | 20 | constructor(docService: DocumentManagementService) { 21 | this.docService = docService; 22 | } 23 | 24 | async execute(options?: Record): Promise { 25 | // docService.listLibraries() now returns the detailed structure directly 26 | const rawLibraries = await this.docService.listLibraries(); 27 | 28 | // The structure returned by listLibraries already matches LibraryInfo[] 29 | // No complex mapping is needed here anymore, just ensure the names match 30 | const libraries: LibraryInfo[] = rawLibraries.map(({ library, versions }) => ({ 31 | name: library, 32 | versions: versions, // Directly assign the detailed versions array 33 | })); 34 | 35 | return { libraries }; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/tools/RemoveTool.ts: -------------------------------------------------------------------------------- 1 | import type { PipelineManager } from "../pipeline/PipelineManager"; 2 | import { PipelineJobStatus } from "../pipeline/types"; 3 | import type { DocumentManagementService } from "../store"; 4 | import { logger } from "../utils/logger"; 5 | import { ToolError } from "./errors"; 6 | 7 | /** 8 | * Represents the arguments for the remove_docs tool. 9 | * The MCP server should validate the input against RemoveToolInputSchema before calling execute. 10 | */ 11 | export interface RemoveToolArgs { 12 | library: string; 13 | version?: string; 14 | } 15 | 16 | /** 17 | * Tool to remove indexed documentation for a specific library version. 18 | * This class provides the core logic, intended to be called by the McpServer. 
19 | */ 20 | export class RemoveTool { 21 | constructor( 22 | private readonly documentManagementService: DocumentManagementService, 23 | private readonly pipelineManager?: PipelineManager, // Optional for backward compatibility 24 | ) {} 25 | 26 | /** 27 | * Executes the tool to remove the specified library version documents. 28 | * Aborts any QUEUED/RUNNING job for the same library+version before deleting. 29 | */ 30 | async execute(args: RemoveToolArgs): Promise<{ message: string }> { 31 | const { library, version } = args; 32 | 33 | logger.info( 34 | `🗑️ Removing library: ${library}${version ? `, version: ${version}` : " (unversioned)"}`, 35 | ); 36 | 37 | try { 38 | // Abort any QUEUED or RUNNING job for this library+version 39 | if (this.pipelineManager) { 40 | const jobs = this.pipelineManager.findJobsByLibraryVersion( 41 | library, 42 | (version ?? "").toLowerCase(), 43 | [PipelineJobStatus.QUEUED, PipelineJobStatus.RUNNING], 44 | ); 45 | for (const job of jobs) { 46 | logger.info( 47 | `🚫 Aborting job for ${library}@${version ?? ""} before deletion: ${job.id}`, 48 | ); 49 | await this.pipelineManager.cancelJob(job.id); 50 | // Wait for job to finish cancelling if running 51 | await this.pipelineManager.waitForJobCompletion(job.id); 52 | } 53 | } 54 | // Core logic: Call the document management service 55 | await this.documentManagementService.removeAllDocuments(library, version); 56 | 57 | const message = `Successfully removed documents for ${library}${version ? `@${version}` : " (unversioned)"}.`; 58 | logger.info(`✅ ${message}`); 59 | // Return a simple success object, the McpServer will format the final response 60 | return { message }; 61 | } catch (error) { 62 | const errorMessage = `Failed to remove documents for ${library}${version ? `@${version}` : " (unversioned)"}: ${error instanceof Error ? 
error.message : String(error)}`; 63 | logger.error(`❌ Error removing library: ${errorMessage}`); 64 | // Re-throw the error for the McpServer to handle and format 65 | throw new ToolError(errorMessage, this.constructor.name); 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/tools/SearchTool.ts: -------------------------------------------------------------------------------- 1 | import type { DocumentManagementService } from "../store"; 2 | import type { LibraryVersionDetails, StoreSearchResult } from "../store/types"; // Import LibraryVersionDetails 3 | import { logger } from "../utils/logger"; 4 | import { VersionNotFoundError } from "./errors"; 5 | 6 | export interface SearchToolOptions { 7 | library: string; 8 | version?: string; 9 | query: string; 10 | limit?: number; 11 | exactMatch?: boolean; 12 | } 13 | 14 | export interface SearchToolResultError { 15 | message: string; 16 | availableVersions?: LibraryVersionDetails[]; // Use LibraryVersionDetails 17 | suggestions?: string[]; // Specific to LibraryNotFoundError 18 | } 19 | 20 | export interface SearchToolResult { 21 | results: StoreSearchResult[]; 22 | } 23 | 24 | /** 25 | * Tool for searching indexed documentation. 26 | * Supports exact version matches and version range patterns. 27 | * Returns available versions when requested version is not found. 
28 | */ 29 | export class SearchTool { 30 | private docService: DocumentManagementService; 31 | 32 | constructor(docService: DocumentManagementService) { 33 | this.docService = docService; 34 | } 35 | 36 | async execute(options: SearchToolOptions): Promise { 37 | const { library, version, query, limit = 5, exactMatch = false } = options; 38 | 39 | // When exactMatch is true, version must be specified and not 'latest' 40 | if (exactMatch && (!version || version === "latest")) { 41 | // Get available *detailed* versions for error message 42 | await this.docService.validateLibraryExists(library); 43 | // Fetch detailed versions using listLibraries and find the specific library 44 | const allLibraries = await this.docService.listLibraries(); 45 | const libraryInfo = allLibraries.find((lib) => lib.library === library); 46 | const detailedVersions = libraryInfo ? libraryInfo.versions : []; 47 | throw new VersionNotFoundError( 48 | library, 49 | "latest", // Or perhaps the original 'version' if it wasn't 'latest'? Check logic. 50 | detailedVersions, 51 | ); 52 | } 53 | 54 | // Default to 'latest' only when exactMatch is false 55 | const resolvedVersion = version || "latest"; 56 | 57 | logger.info( 58 | `🔍 Searching ${library}@${resolvedVersion} for: ${query}${exactMatch ? " (exact match)" : ""}`, 59 | ); 60 | 61 | try { 62 | // 1. Validate library exists first 63 | await this.docService.validateLibraryExists(library); 64 | 65 | // 2. Proceed with version finding and searching 66 | let versionToSearch: string | null | undefined = resolvedVersion; 67 | 68 | if (!exactMatch) { 69 | // If not exact match, find the best version (which might be null) 70 | const versionResult = await this.docService.findBestVersion(library, version); 71 | // Use the bestMatch from the result, which could be null 72 | versionToSearch = versionResult.bestMatch; 73 | 74 | // If findBestVersion returned null (no matching semver) AND unversioned docs exist, 75 | // should we search unversioned? 
The current logic passes null to searchStore, 76 | // which gets normalized to "" (unversioned). This seems reasonable. 77 | // If findBestVersion threw VersionNotFoundError, it's caught below. 78 | } 79 | // If exactMatch is true, versionToSearch remains the originally provided version. 80 | 81 | // Note: versionToSearch can be string | null | undefined here. 82 | // searchStore handles null/undefined by normalizing to "". 83 | const results = await this.docService.searchStore( 84 | library, 85 | versionToSearch, 86 | query, 87 | limit, 88 | ); 89 | logger.info(`✅ Found ${results.length} matching results`); 90 | 91 | return { results }; 92 | } catch (error) { 93 | logger.error( 94 | `❌ Search failed: ${error instanceof Error ? error.message : "Unknown error"}`, 95 | ); 96 | throw error; 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/tools/errors.ts: -------------------------------------------------------------------------------- 1 | import semver from "semver"; 2 | import type { LibraryVersionDetails } from "../store/types"; // Import LibraryVersionDetails 3 | 4 | class ToolError extends Error { 5 | constructor( 6 | message: string, 7 | public readonly toolName: string, 8 | ) { 9 | super(message); 10 | this.name = this.constructor.name; 11 | } 12 | } 13 | 14 | class VersionNotFoundError extends ToolError { 15 | constructor( 16 | public readonly library: string, 17 | public readonly requestedVersion: string, 18 | public readonly availableVersions: LibraryVersionDetails[], // Use LibraryVersionDetails 19 | ) { 20 | super( 21 | `Version ${requestedVersion} not found for ${library}. 
Available versions: ${availableVersions.map((v) => v.version).join(", ")}`, 22 | "SearchTool", 23 | ); 24 | } 25 | 26 | getLatestVersion() { 27 | return this.availableVersions.sort((a, b) => semver.compare(b.version, a.version))[0]; 28 | } 29 | } 30 | 31 | /** 32 | * Error thrown when a requested library cannot be found in the store. 33 | * Includes suggestions for similar library names if available. 34 | */ 35 | class LibraryNotFoundError extends ToolError { 36 | constructor( 37 | public readonly requestedLibrary: string, 38 | public readonly suggestions: string[] = [], 39 | ) { 40 | let message = `Library '${requestedLibrary}' not found.`; 41 | if (suggestions.length > 0) { 42 | message += ` Did you mean one of these: ${suggestions.join(", ")}?`; 43 | } 44 | // Assuming this error might originate from various tools, but SearchTool is a primary candidate. 45 | // We might need to adjust the toolName if it's thrown elsewhere. 46 | super(message, "SearchTool"); 47 | } 48 | } 49 | 50 | export { LibraryNotFoundError, ToolError, VersionNotFoundError }; 51 | -------------------------------------------------------------------------------- /src/tools/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./CancelJobTool"; 2 | export * from "./ClearCompletedJobsTool"; 3 | export * from "./errors"; 4 | export * from "./FetchUrlTool"; 5 | export * from "./FindVersionTool"; 6 | export * from "./GetJobInfoTool"; 7 | export * from "./ListJobsTool"; 8 | export * from "./ListLibrariesTool"; 9 | export * from "./RemoveTool"; 10 | export * from "./ScrapeTool"; 11 | export * from "./SearchTool"; 12 | -------------------------------------------------------------------------------- /src/types/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Common document content type shared across modules 3 | */ 4 | export interface Document { 5 | content: string; 6 | metadata: 
DocumentMetadata; 7 | } 8 | 9 | /** 10 | * Common metadata fields shared across document chunks 11 | */ 12 | export interface DocumentMetadata { 13 | url: string; 14 | title: string; 15 | library: string; 16 | version: string; 17 | level?: number; // Optional during scraping 18 | path?: string[]; // Optional during scraping 19 | } 20 | 21 | /** 22 | * Generic progress callback type 23 | */ 24 | export type ProgressCallback = (progress: T) => void | Promise; 25 | 26 | /** 27 | * Standard progress response format 28 | */ 29 | export interface ProgressResponse { 30 | content: { type: string; text: string }[]; 31 | } 32 | -------------------------------------------------------------------------------- /src/utils/config.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Default configuration values for the scraping pipeline and server 3 | */ 4 | 5 | /** Maximum number of pages to scrape in a single job */ 6 | export const DEFAULT_MAX_PAGES = 1000; 7 | 8 | /** Maximum navigation depth when crawling links */ 9 | export const DEFAULT_MAX_DEPTH = 3; 10 | 11 | /** Maximum number of concurrent page requests */ 12 | export const DEFAULT_MAX_CONCURRENCY = 3; 13 | 14 | /** Default protocol for the MCP server */ 15 | export const DEFAULT_PROTOCOL = "stdio"; 16 | 17 | /** Default port for the HTTP protocol */ 18 | export const DEFAULT_HTTP_PORT = 6280; 19 | 20 | /** Default port for the Web UI */ 21 | export const DEFAULT_WEB_PORT = 6281; 22 | 23 | /** 24 | * Default timeout in milliseconds for page operations (e.g., Playwright waitForSelector). 25 | */ 26 | export const DEFAULT_PAGE_TIMEOUT = 5000; 27 | 28 | /** 29 | * Maximum number of retries for HTTP fetcher requests. 30 | */ 31 | export const FETCHER_MAX_RETRIES = 6; 32 | 33 | /** 34 | * Base delay in milliseconds for HTTP fetcher retry backoff. 
35 | */ 36 | export const FETCHER_BASE_DELAY = 1000; 37 | 38 | /** 39 | * Default chunk size settings for splitters 40 | */ 41 | export const SPLITTER_MIN_CHUNK_SIZE = 500; 42 | export const SPLITTER_PREFERRED_CHUNK_SIZE = 1500; 43 | export const SPLITTER_MAX_CHUNK_SIZE = 5000; 44 | 45 | /** 46 | * Maximum number of documents to process in a single batch for embeddings. 47 | */ 48 | export const EMBEDDING_BATCH_SIZE = 100; 49 | 50 | /** 51 | * Maximum number of retries for database migrations if busy. 52 | */ 53 | export const MIGRATION_MAX_RETRIES = 5; 54 | 55 | /** 56 | * Delay in milliseconds between migration retry attempts. 57 | */ 58 | export const MIGRATION_RETRY_DELAY_MS = 300; 59 | -------------------------------------------------------------------------------- /src/utils/dom.ts: -------------------------------------------------------------------------------- 1 | import { JSDOM, VirtualConsole } from "jsdom"; 2 | import type { ConstructorOptions } from "jsdom"; 3 | 4 | /** 5 | * Creates a JSDOM instance with a pre-configured virtual console to suppress console noise. 6 | * This utility simplifies the setup of JSDOM by providing a standard configuration. 7 | * 8 | * @param html - The HTML content to parse. 9 | * @param options - Optional JSDOM configuration options. These will be merged with the default virtual console setup. 10 | * @returns A JSDOM instance. 
11 | */ 12 | export function createJSDOM(html: string, options?: ConstructorOptions): JSDOM { 13 | const virtualConsole = new VirtualConsole(); 14 | // Suppress console output from JSDOM by default 15 | virtualConsole.on("error", () => {}); 16 | virtualConsole.on("warn", () => {}); 17 | virtualConsole.on("info", () => {}); 18 | virtualConsole.on("debug", () => {}); 19 | virtualConsole.on("log", () => {}); // Also suppress regular logs 20 | 21 | const defaultOptions: ConstructorOptions = { 22 | virtualConsole, 23 | }; 24 | 25 | // Merge provided options with defaults, letting provided options override 26 | const finalOptions: ConstructorOptions = { ...defaultOptions, ...options }; 27 | 28 | return new JSDOM(html, finalOptions); 29 | } 30 | -------------------------------------------------------------------------------- /src/utils/errors.ts: -------------------------------------------------------------------------------- 1 | class ScraperError extends Error { 2 | constructor( 3 | message: string, 4 | public readonly isRetryable: boolean = false, 5 | public readonly cause?: Error, 6 | ) { 7 | super(message); 8 | this.name = this.constructor.name; 9 | if (cause?.stack) { 10 | this.stack = `${this.stack}\nCaused by: ${cause.stack}`; 11 | } 12 | } 13 | } 14 | 15 | class NetworkError extends ScraperError { 16 | constructor( 17 | message: string, 18 | public readonly statusCode?: number, 19 | cause?: Error, 20 | ) { 21 | super(message, true, cause); 22 | } 23 | } 24 | 25 | class RateLimitError extends ScraperError { 26 | constructor( 27 | message: string, 28 | public readonly retryAfter?: number, 29 | ) { 30 | super(message, true); 31 | } 32 | } 33 | 34 | class InvalidUrlError extends ScraperError { 35 | constructor(url: string, cause?: Error) { 36 | super(`Invalid URL: ${url}`, false, cause); 37 | } 38 | } 39 | 40 | class ParsingError extends ScraperError { 41 | constructor(message: string, cause?: Error) { 42 | super(`Failed to parse content: ${message}`, false, cause); 
43 | } 44 | } 45 | 46 | class RedirectError extends ScraperError { 47 | constructor( 48 | public readonly originalUrl: string, 49 | public readonly redirectUrl: string, 50 | public readonly statusCode: number, 51 | ) { 52 | super( 53 | `Redirect detected from ${originalUrl} to ${redirectUrl} (status: ${statusCode})`, 54 | false, 55 | ); 56 | } 57 | } 58 | 59 | export { 60 | ScraperError, 61 | NetworkError, 62 | RateLimitError, 63 | InvalidUrlError, 64 | ParsingError, 65 | RedirectError, 66 | }; 67 | -------------------------------------------------------------------------------- /src/utils/logger.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Defines the available log levels. 3 | */ 4 | export enum LogLevel { 5 | ERROR = 0, 6 | WARN = 1, 7 | INFO = 2, 8 | DEBUG = 3, 9 | } 10 | 11 | let currentLogLevel: LogLevel = LogLevel.INFO; // Default level 12 | 13 | /** 14 | * Sets the current logging level for the application. 15 | * @param level - The desired log level. 16 | */ 17 | export function setLogLevel(level: LogLevel): void { 18 | currentLogLevel = level; 19 | } 20 | 21 | /** 22 | * Provides logging functionalities with level control. 23 | */ 24 | export const logger = { 25 | /** 26 | * Logs a debug message if the current log level is DEBUG or higher. 27 | * @param message - The message to log. 28 | */ 29 | debug: (message: string) => { 30 | if (currentLogLevel >= LogLevel.DEBUG && !process.env.VITEST_WORKER_ID) { 31 | console.debug(message); 32 | } 33 | }, 34 | /** 35 | * Logs an info message if the current log level is INFO or higher. 36 | * @param message - The message to log. 37 | */ 38 | info: (message: string) => { 39 | if (currentLogLevel >= LogLevel.INFO && !process.env.VITEST_WORKER_ID) { 40 | console.log(message); // Using console.log for INFO 41 | } 42 | }, 43 | /** 44 | * Logs a warning message if the current log level is WARN or higher. 45 | * @param message - The message to log. 
46 | */ 47 | warn: (message: string) => { 48 | if (currentLogLevel >= LogLevel.WARN && !process.env.VITEST_WORKER_ID) { 49 | console.warn(message); 50 | } 51 | }, 52 | /** 53 | * Logs an error message if the current log level is ERROR or higher (always logs). 54 | * @param message - The message to log. 55 | */ 56 | error: (message: string) => { 57 | if (currentLogLevel >= LogLevel.ERROR && !process.env.VITEST_WORKER_ID) { 58 | console.error(message); 59 | } 60 | }, 61 | }; 62 | -------------------------------------------------------------------------------- /src/utils/mimeTypeUtils.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Represents a parsed Content-Type header. 3 | */ 4 | export interface ParsedContentType { 5 | mimeType: string; 6 | charset?: string; 7 | } 8 | 9 | /** 10 | * Utility functions for handling MIME types and charsets. 11 | */ 12 | // biome-ignore lint/complexity/noStaticOnlyClass: helpers are static 13 | export class MimeTypeUtils { 14 | /** 15 | * Parses a Content-Type header string into its MIME type and charset. 16 | * @param contentTypeHeader The Content-Type header string (e.g., "text/html; charset=utf-8"). 17 | * @returns A ParsedContentType object, or a default if parsing fails. 18 | */ 19 | public static parseContentType(contentTypeHeader?: string | null): ParsedContentType { 20 | if (!contentTypeHeader) { 21 | return { mimeType: "application/octet-stream" }; 22 | } 23 | const parts = contentTypeHeader.split(";").map((part) => part.trim()); 24 | const mimeType = parts[0].toLowerCase(); 25 | let charset: string | undefined; 26 | 27 | for (let i = 1; i < parts.length; i++) { 28 | const param = parts[i]; 29 | if (param.toLowerCase().startsWith("charset=")) { 30 | charset = param.substring("charset=".length).toLowerCase(); 31 | break; 32 | } 33 | } 34 | return { mimeType, charset }; 35 | } 36 | 37 | /** 38 | * Checks if a MIME type represents HTML content. 
39 | */ 40 | public static isHtml(mimeType: string): boolean { 41 | return mimeType === "text/html" || mimeType === "application/xhtml+xml"; 42 | } 43 | 44 | /** 45 | * Checks if a MIME type represents Markdown content. 46 | */ 47 | public static isMarkdown(mimeType: string): boolean { 48 | return mimeType === "text/markdown" || mimeType === "text/x-markdown"; 49 | } 50 | 51 | /** 52 | * Checks if a MIME type represents plain text content. 53 | */ 54 | public static isText(mimeType: string): boolean { 55 | return mimeType.startsWith("text/"); 56 | } 57 | 58 | // Extend with more helpers as needed (isJson, isXml, isPdf, etc.) 59 | } 60 | -------------------------------------------------------------------------------- /src/utils/paths.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs"; 2 | import path from "node:path"; 3 | import { fileURLToPath } from "node:url"; 4 | 5 | let projectRoot: string | null = null; 6 | 7 | /** 8 | * Finds the project root directory by searching upwards from the current file 9 | * for a directory containing 'package.json'. Caches the result. 10 | * 11 | * @returns {string} The absolute path to the project root. 12 | * @throws {Error} If package.json cannot be found. 
13 | */ 14 | export function getProjectRoot(): string { 15 | // Return cached result if available 16 | if (projectRoot) { 17 | return projectRoot; 18 | } 19 | 20 | // Start from the directory of the current module 21 | const currentFilePath = fileURLToPath(import.meta.url); 22 | let currentDir = path.dirname(currentFilePath); 23 | 24 | // eslint-disable-next-line no-constant-condition 25 | while (true) { 26 | const packageJsonPath = path.join(currentDir, "package.json"); 27 | if (fs.existsSync(packageJsonPath)) { 28 | projectRoot = currentDir; // Cache the result 29 | return projectRoot; 30 | } 31 | 32 | const parentDir = path.dirname(currentDir); 33 | // Check if we have reached the filesystem root 34 | if (parentDir === currentDir) { 35 | throw new Error("Could not find project root containing package.json."); 36 | } 37 | currentDir = parentDir; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/utils/string.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Thoroughly removes all types of whitespace characters from both ends of a string. 3 | * Handles spaces, tabs, line breaks, and carriage returns. 
4 | */ 5 | export const fullTrim = (str: string): string => { 6 | return str.replace(/^[\s\r\n\t]+|[\s\r\n\t]+$/g, ""); 7 | }; 8 | -------------------------------------------------------------------------------- /src/utils/url.ts: -------------------------------------------------------------------------------- 1 | import psl from "psl"; 2 | import { InvalidUrlError } from "./errors"; 3 | 4 | interface UrlNormalizerOptions { 5 | ignoreCase?: boolean; 6 | removeHash?: boolean; 7 | removeTrailingSlash?: boolean; 8 | removeQuery?: boolean; 9 | removeIndex?: boolean; 10 | } 11 | 12 | const defaultNormalizerOptions: UrlNormalizerOptions = { 13 | ignoreCase: true, 14 | removeHash: true, 15 | removeTrailingSlash: true, 16 | removeQuery: false, 17 | removeIndex: true, 18 | }; 19 | 20 | export function normalizeUrl( 21 | url: string, 22 | options: UrlNormalizerOptions = defaultNormalizerOptions, 23 | ): string { 24 | try { 25 | const parsedUrl = new URL(url); 26 | const finalOptions = { ...defaultNormalizerOptions, ...options }; 27 | 28 | // Create a new URL to ensure proper structure 29 | const normalized = new URL(parsedUrl.origin + parsedUrl.pathname); 30 | 31 | // Remove index files first, before handling trailing slashes 32 | if (finalOptions.removeIndex) { 33 | normalized.pathname = normalized.pathname.replace( 34 | /\/index\.(html|htm|asp|php|jsp)$/i, 35 | "/", 36 | ); 37 | } 38 | 39 | // Handle trailing slash 40 | if (finalOptions.removeTrailingSlash && normalized.pathname.length > 1) { 41 | normalized.pathname = normalized.pathname.replace(/\/+$/, ""); 42 | } 43 | 44 | // Keep original parts we want to preserve 45 | const preservedHash = !finalOptions.removeHash ? parsedUrl.hash : ""; 46 | const preservedSearch = !finalOptions.removeQuery ? 
parsedUrl.search : ""; 47 | 48 | // Construct final URL string in correct order (query before hash) 49 | let result = normalized.origin + normalized.pathname; 50 | if (preservedSearch) { 51 | result += preservedSearch; 52 | } 53 | if (preservedHash) { 54 | result += preservedHash; 55 | } 56 | 57 | // Apply case normalization if configured 58 | if (finalOptions.ignoreCase) { 59 | result = result.toLowerCase(); 60 | } 61 | 62 | return result; 63 | } catch { 64 | return url; // Return original URL if parsing fails 65 | } 66 | } 67 | 68 | /** 69 | * Validates if a string is a valid URL 70 | * @throws {InvalidUrlError} If the URL is invalid 71 | */ 72 | export function validateUrl(url: string): void { 73 | try { 74 | new URL(url); 75 | } catch (error) { 76 | throw new InvalidUrlError(url, error instanceof Error ? error : undefined); 77 | } 78 | } 79 | 80 | /** 81 | * Checks if two URLs have the exact same hostname 82 | */ 83 | export function hasSameHostname(urlA: URL, urlB: URL): boolean { 84 | return urlA.hostname.toLowerCase() === urlB.hostname.toLowerCase(); 85 | } 86 | 87 | /** 88 | * Checks if two URLs are on the same domain (including subdomains) 89 | * Using the public suffix list to properly handle domains like .co.uk 90 | */ 91 | export function hasSameDomain(urlA: URL, urlB: URL): boolean { 92 | const domainA = psl.get(urlA.hostname.toLowerCase()); 93 | const domainB = psl.get(urlB.hostname.toLowerCase()); 94 | return domainA !== null && domainA === domainB; 95 | } 96 | 97 | /** 98 | * Checks if a target URL is under the same path as the base URL 99 | * Example: base = https://example.com/docs/ 100 | * target = https://example.com/docs/getting-started 101 | * result = true 102 | */ 103 | export function isSubpath(baseUrl: URL, targetUrl: URL): boolean { 104 | // Normalize paths to ensure consistent comparison 105 | const basePath = baseUrl.pathname.endsWith("/") 106 | ? 
baseUrl.pathname 107 | : `${baseUrl.pathname}/`; 108 | 109 | return targetUrl.pathname.startsWith(basePath); 110 | } 111 | 112 | export type { UrlNormalizerOptions }; 113 | -------------------------------------------------------------------------------- /src/web/components/Alert.tsx: -------------------------------------------------------------------------------- 1 | import type { PropsWithChildren } from "@kitajs/html"; 2 | 3 | /** 4 | * Defines the possible types for the Alert component. 5 | */ 6 | type AlertType = "success" | "error" | "warning" | "info"; 7 | 8 | /** 9 | * Props for the Alert component. 10 | */ 11 | interface AlertProps extends PropsWithChildren { 12 | type: AlertType; 13 | title?: string; 14 | message: string | JSX.Element; // Allow JSX for messages 15 | } 16 | 17 | /** 18 | * Reusable Alert component using Flowbite styling. 19 | * Displays messages with appropriate colors and icons based on the type. 20 | * @param props - Component props including type, title (optional), and message. 
21 | */ 22 | const Alert = ({ type, title, message }: AlertProps) => { 23 | let iconSvg: JSX.Element; 24 | let colorClasses: string; 25 | let defaultTitle: string; 26 | 27 | switch (type) { 28 | case "success": 29 | defaultTitle = "Success:"; 30 | colorClasses = 31 | "text-green-800 border-green-300 bg-green-50 dark:bg-gray-800 dark:text-green-400 dark:border-green-800"; 32 | iconSvg = ( 33 | 42 | ); 43 | break; 44 | case "error": 45 | defaultTitle = "Error:"; 46 | colorClasses = 47 | "text-red-800 border-red-300 bg-red-50 dark:bg-gray-800 dark:text-red-400 dark:border-red-800"; 48 | iconSvg = ( 49 | 58 | ); 59 | break; 60 | case "warning": 61 | defaultTitle = "Warning:"; 62 | colorClasses = 63 | "text-yellow-800 border-yellow-300 bg-yellow-50 dark:bg-gray-800 dark:text-yellow-300 dark:border-yellow-800"; 64 | iconSvg = ( 65 | 74 | ); 75 | break; 76 | case "info": 77 | default: // Default to info style 78 | defaultTitle = "Info:"; 79 | colorClasses = 80 | "text-blue-800 border-blue-300 bg-blue-50 dark:bg-gray-800 dark:text-blue-400 dark:border-blue-800"; 81 | iconSvg = ( 82 | 91 | ); 92 | break; 93 | } 94 | 95 | const displayTitle = title ?? defaultTitle; 96 | 97 | return ( 98 | 113 | ); 114 | }; 115 | 116 | export default Alert; 117 | -------------------------------------------------------------------------------- /src/web/components/JobList.tsx: -------------------------------------------------------------------------------- 1 | import type { JobInfo } from "../../tools/GetJobInfoTool"; 2 | import JobItem from "./JobItem"; // Adjusted import path 3 | 4 | /** 5 | * Props for the JobList component. 6 | */ 7 | interface JobListProps { 8 | jobs: JobInfo[]; 9 | } 10 | 11 | /** 12 | * Renders a list of JobItem components or a message if the list is empty. 13 | * Adds a listener for the 'job-list-refresh' event to trigger a reload of the job list using HTMX. 14 | * @param props - Component props including the array of jobs. 
15 | */ 16 | const JobList = ({ jobs }: JobListProps) => ( 17 |
18 | {jobs.length === 0 ? ( 19 |

20 | No pending jobs. 21 |

22 | ) : ( 23 | jobs.map((job) => ) 24 | )} 25 | {/* NOTE: Live job list refresh after stopping a job is handled by the global 'job-list-refresh' listener already registered in main.client.ts, which reloads the job list container via HTMX, roughly: 26 | document.addEventListener('job-list-refresh', function () { 27 | if (window.htmx) { 28 | window.htmx.ajax('GET', '/api/jobs', '#job-queue'); 29 | } else { 30 | window.location.reload(); 31 | } 32 | }); 33 | */} 34 | 
35 | ); 36 | 37 | export default JobList; 38 | -------------------------------------------------------------------------------- /src/web/components/Layout.tsx: -------------------------------------------------------------------------------- 1 | import type { PropsWithChildren } from "@kitajs/html"; 2 | import { readFileSync } from "node:fs"; 3 | 4 | /** 5 | * Props for the Layout component. 6 | */ 7 | interface LayoutProps extends PropsWithChildren { 8 | title: string; 9 | /** Optional version string to display next to the title. */ 10 | version?: string; 11 | } 12 | 13 | /** 14 | * Base HTML layout component for all pages. 15 | * Includes common head elements, header, and scripts. 16 | * @param props - Component props including title, version, and children. 17 | */ 18 | const Layout = ({ title, version, children }: LayoutProps) => { 19 | let versionString = version; 20 | if (!versionString) { 21 | // If no version is provided, use the version from package.json 22 | // We cannot bake the version into the bundle, as the package.json will 23 | // be updated by the build process, after the bundle is created. 24 | try { 25 | const packageJson = JSON.parse(readFileSync("package.json", "utf-8")) as { 26 | version: string; 27 | }; 28 | versionString = packageJson.version; 29 | } catch (error) { 30 | console.error("Error reading package.json:", error); 31 | } 32 | } 33 | return ( 34 | 35 | 36 | 37 | 38 | {title} 39 | {/* Bundled CSS (includes Tailwind and Flowbite) */} 40 | 41 | {/* Add style for htmx-indicator behavior (needed globally) */} 42 | 67 | 68 | 69 |
70 |
71 |

72 | MCP Docs 73 | {versionString ? ( 74 | 79 | v{versionString} 80 | 81 | ) : null} 82 |

83 |
84 | 85 |
{children}
86 |
87 | 88 | {/* Bundled JS (includes Flowbite, HTMX, AlpineJS, and initialization) */} 89 | 90 | 91 | 92 | ); 93 | }; 94 | 95 | export default Layout; 96 | -------------------------------------------------------------------------------- /src/web/components/LibraryDetailCard.tsx: -------------------------------------------------------------------------------- 1 | import type { LibraryInfo } from "../../tools/ListLibrariesTool"; 2 | import VersionDetailsRow from "./VersionDetailsRow"; // Adjusted import path 3 | 4 | /** 5 | * Props for the LibraryDetailCard component. 6 | */ 7 | interface LibraryDetailCardProps { 8 | library: LibraryInfo; 9 | } 10 | 11 | /** 12 | * Renders a card displaying library details and its versions. 13 | * Uses VersionDetailsRow without the delete button. 14 | * @param props - Component props including the library information. 15 | */ 16 | const LibraryDetailCard = ({ library }: LibraryDetailCardProps) => ( 17 | // Use Flowbite Card structure with updated padding and border, and white background 18 |
19 |

20 | {library.name} 21 |

22 | {/* Container for version rows */} 23 |
24 | {library.versions.length > 0 ? ( 25 | library.versions.map((version) => ( 26 | 31 | )) 32 | ) : ( 33 | // Display message if no versions are indexed 34 |

35 | No versions indexed. 36 |

37 | )} 38 |
39 |
40 | ); 41 | 42 | export default LibraryDetailCard; 43 | -------------------------------------------------------------------------------- /src/web/components/LibraryItem.tsx: -------------------------------------------------------------------------------- 1 | import type { LibraryInfo } from "../../tools/ListLibrariesTool"; 2 | import VersionDetailsRow from "./VersionDetailsRow"; // Adjusted import path 3 | 4 | /** 5 | * Props for the LibraryItem component. 6 | */ 7 | interface LibraryItemProps { 8 | library: LibraryInfo; 9 | } 10 | 11 | /** 12 | * Renders a card for a single library, listing its versions with details. 13 | * Uses VersionDetailsRow to display each version. 14 | * @param props - Component props including the library information. 15 | */ 16 | const LibraryItem = ({ library }: LibraryItemProps) => ( 17 | // Use Flowbite Card structure with updated padding and border, and white background 18 |
19 |

20 | 24 | {library.name} 25 | 26 |

27 | {/* Container for version rows */} 28 |
29 | {library.versions.length > 0 ? ( 30 | library.versions.map((version) => ( 31 | 32 | )) 33 | ) : ( 34 | // Display message if no versions are indexed 35 |

36 | No versions indexed. 37 |

38 | )} 39 |
40 |
41 | ); 42 | 43 | export default LibraryItem; 44 | -------------------------------------------------------------------------------- /src/web/components/LibraryList.tsx: -------------------------------------------------------------------------------- 1 | import type { LibraryInfo } from "../../tools/ListLibrariesTool"; 2 | import LibraryItem from "./LibraryItem"; // Adjusted import path 3 | 4 | /** 5 | * Props for the LibraryList component. 6 | */ 7 | interface LibraryListProps { 8 | libraries: LibraryInfo[]; 9 | } 10 | 11 | /** 12 | * Renders a list of LibraryItem components. 13 | * @param props - Component props including the array of libraries. 14 | */ 15 | const LibraryList = ({ libraries }: LibraryListProps) => { 16 | return ( 17 | <> 18 |
19 | {libraries.map((library) => ( 20 | 21 | ))} 22 |
23 | 24 | ); 25 | }; 26 | 27 | export default LibraryList; 28 | -------------------------------------------------------------------------------- /src/web/components/LibrarySearchCard.tsx: -------------------------------------------------------------------------------- 1 | import type { LibraryInfo } from "../../tools/ListLibrariesTool"; 2 | import LoadingSpinner from "./LoadingSpinner"; // Import spinner 3 | 4 | /** 5 | * Props for the LibrarySearchCard component. 6 | */ 7 | interface LibrarySearchCardProps { 8 | library: LibraryInfo; 9 | } 10 | 11 | /** 12 | * Renders the search form card for a specific library. 13 | * Includes a version dropdown and query input. 14 | * @param props - Component props including the library information. 15 | */ 16 | const LibrarySearchCard = ({ library }: LibrarySearchCardProps) => { 17 | return ( 18 |
19 |

20 | Search {library.name} Documentation 21 |

22 |
29 | 40 | 47 | 57 |
58 | {/* Add style for htmx-indicator behavior on button */} 59 | {/* Styles moved to Layout.tsx */} 60 |
61 | ); 62 | }; 63 | 64 | export default LibrarySearchCard; 65 | -------------------------------------------------------------------------------- /src/web/components/LoadingSpinner.tsx: -------------------------------------------------------------------------------- 1 | /** 2 | * Renders an SVG loading spinner icon. 3 | * Used for indicating loading states in buttons or other elements. 4 | */ 5 | const LoadingSpinner = () => ( 6 | 12 | 20 | 25 | 26 | ); 27 | 28 | export default LoadingSpinner; 29 | -------------------------------------------------------------------------------- /src/web/components/ScrapeForm.tsx: -------------------------------------------------------------------------------- 1 | import ScrapeFormContent from "./ScrapeFormContent"; // Adjusted import path 2 | 3 | /** 4 | * Wrapper component for the ScrapeFormContent. 5 | * Provides a container div, often used as a target for HTMX OOB swaps. 6 | */ 7 | const ScrapeForm = () => ( 8 |
9 | 10 |
11 | ); 12 | 13 | export default ScrapeForm; 14 | -------------------------------------------------------------------------------- /src/web/components/SearchResultItem.tsx: -------------------------------------------------------------------------------- 1 | import { unified } from "unified"; // Import unified 2 | import remarkParse from "remark-parse"; // Import unified plugins 3 | import remarkGfm from "remark-gfm"; 4 | import remarkHtml from "remark-html"; 5 | import DOMPurify from "dompurify"; // Import DOMPurify 6 | import { createJSDOM } from "../../utils/dom"; // Import JSDOM helper 7 | import type { StoreSearchResult } from "../../store/types"; 8 | 9 | /** 10 | * Props for the SearchResultItem component. 11 | */ 12 | interface SearchResultItemProps { 13 | result: StoreSearchResult; 14 | } 15 | 16 | /** 17 | * Renders a single search result item. 18 | * Converts markdown content to HTML using unified. 19 | * @param props - Component props including the search result data. 20 | */ 21 | const SearchResultItem = async ({ result }: SearchResultItemProps) => { 22 | // Use unified pipeline to convert markdown to HTML 23 | const processor = unified().use(remarkParse).use(remarkGfm).use(remarkHtml); 24 | const file = await processor.process(result.content); 25 | const rawHtml = String(file); 26 | 27 | // Create JSDOM instance and initialize DOMPurify 28 | const jsdom = createJSDOM(""); 29 | const purifier = DOMPurify(jsdom.window); 30 | 31 | // Sanitize the HTML content 32 | const safeHtml = purifier.sanitize(rawHtml); 33 | 34 | return ( 35 |
36 | 47 | {/* Render the sanitized HTML content */} 48 |
{safeHtml}
49 |
50 | ); 51 | }; 52 | 53 | export default SearchResultItem; 54 | -------------------------------------------------------------------------------- /src/web/components/SearchResultList.tsx: -------------------------------------------------------------------------------- 1 | import type { StoreSearchResult } from "../../store/types"; 2 | import SearchResultItem from "./SearchResultItem"; // Adjusted import path 3 | 4 | /** 5 | * Props for the SearchResultList component. 6 | */ 7 | interface SearchResultListProps { 8 | results: StoreSearchResult[]; 9 | } 10 | 11 | /** 12 | * Renders the list of search results using SearchResultItem. 13 | * Displays a message if no results are found. 14 | * @param props - Component props including the array of search results. 15 | */ 16 | const SearchResultList = ({ results }: SearchResultListProps) => { 17 | if (results.length === 0) { 18 | return ( 19 |

No results found.

20 | ); 21 | } 22 | return ( 23 |
24 | {results.map((result) => ( 25 | 26 | ))} 27 |
28 | ); 29 | }; 30 | 31 | export default SearchResultList; 32 | -------------------------------------------------------------------------------- /src/web/components/SearchResultSkeletonItem.tsx: -------------------------------------------------------------------------------- 1 | /** 2 | * Renders a skeleton placeholder for a search result item. 3 | * Used to indicate loading state while search results are being fetched. 4 | */ 5 | const SearchResultSkeletonItem = () => ( 6 |
7 |
8 |
9 |
10 |
11 | ); 12 | 13 | export default SearchResultSkeletonItem; 14 | -------------------------------------------------------------------------------- /src/web/components/Tooltip.tsx: -------------------------------------------------------------------------------- 1 | import type { PropsWithChildren } from "@kitajs/html"; 2 | 3 | /** 4 | * Props for the Tooltip component. 5 | */ 6 | interface TooltipProps extends PropsWithChildren { 7 | text: string | Promise | Element; 8 | position?: "top" | "right" | "bottom" | "left"; 9 | } 10 | 11 | /** 12 | * Reusable Tooltip component using Alpine.js for state management. 13 | * Displays a help icon that shows a tooltip on hover/focus. 14 | * 15 | * @param props - Component props including text and optional position. 16 | */ 17 | const Tooltip = ({ text, position = "top" }: TooltipProps) => { 18 | // Map position to Tailwind classes 19 | const positionClasses = { 20 | top: "bottom-full left-1/2 transform -translate-x-1/2 -translate-y-1 mb-1", 21 | right: "left-full top-1/2 transform -translate-y-1/2 translate-x-1 ml-1", 22 | bottom: "top-full left-1/2 transform -translate-x-1/2 translate-y-1 mt-1", 23 | left: "right-full top-1/2 transform -translate-y-1/2 -translate-x-1 mr-1", 24 | }; 25 | 26 | return ( 27 |
31 | 56 |
61 | {text as "safe"} 62 |
63 |
64 | ); 65 | }; 66 | 67 | export default Tooltip; 68 | -------------------------------------------------------------------------------- /src/web/components/VersionBadge.tsx: -------------------------------------------------------------------------------- 1 | interface VersionBadgeProps { 2 | version: string; 3 | } 4 | 5 | const VersionBadge = ({ version }: VersionBadgeProps) => { 6 | if (!version) { 7 | return null; // Don't render if no version is provided 8 | } 9 | 10 | return ( 11 | 12 | {version} 13 | 14 | ); 15 | }; 16 | 17 | export default VersionBadge; 18 | -------------------------------------------------------------------------------- /src/web/components/utils.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arabold/docs-mcp-server/2c6fb88ac09b82baea2068e878855d5b78e2a6e2/src/web/components/utils.ts -------------------------------------------------------------------------------- /src/web/main.client.ts: -------------------------------------------------------------------------------- 1 | // Import the main CSS file which includes Tailwind and Flowbite styles 2 | import "./styles/main.css"; 3 | 4 | import Alpine from "alpinejs"; 5 | import { initFlowbite } from "flowbite"; 6 | import htmx from "htmx.org"; 7 | 8 | // Ensure Alpine global store for confirmation actions is initialized before Alpine components render 9 | Alpine.store("confirmingAction", { 10 | type: null, 11 | id: null, 12 | timeoutId: null, 13 | isDeleting: false, 14 | }); 15 | 16 | Alpine.start(); 17 | 18 | // Initialize Flowbite components 19 | initFlowbite(); 20 | 21 | // Add a global event listener for 'job-list-refresh' that uses HTMX to reload the job list 22 | // This is still useful for manual refresh after actions like clearing jobs 23 | document.addEventListener("job-list-refresh", () => { 24 | htmx.ajax("GET", "/api/jobs", "#job-queue"); 25 | }); 26 | 27 | // Add a global event listener for 'version-list-refresh' 
that reloads the version list container using HTMX 28 | document.addEventListener("version-list-refresh", (event: Event) => { 29 | const customEvent = event as CustomEvent<{ library: string }>; 30 | const library = customEvent.detail?.library; 31 | if (library) { 32 | htmx.ajax( 33 | "GET", 34 | `/api/libraries/${encodeURIComponent(library)}/versions`, 35 | "#version-list", 36 | ); 37 | } 38 | }); 39 | 40 | // Listen for htmx swaps after a version delete and dispatch version-list-refresh with payload 41 | document.body.addEventListener("htmx:afterSwap", (event) => { 42 | // Always re-initialize AlpineJS for swapped-in DOM to fix $store errors 43 | if (event.target instanceof HTMLElement) { 44 | Alpine.initTree(event.target); 45 | } 46 | 47 | // Existing logic for version delete refresh 48 | const detail = (event as CustomEvent).detail; 49 | if ( 50 | detail?.xhr?.status === 204 && 51 | detail?.requestConfig?.verb === "delete" && 52 | (event.target as HTMLElement)?.id?.startsWith("row-") 53 | ) { 54 | // Extract library name from the row id: row-- 55 | const rowId = (event.target as HTMLElement).id; 56 | const match = rowId.match(/^row-([^-]+)-/); 57 | const library = match ? match[1] : null; 58 | if (library) { 59 | document.dispatchEvent( 60 | new CustomEvent("version-list-refresh", { detail: { library } }), 61 | ); 62 | } else { 63 | window.location.reload(); 64 | } 65 | } 66 | }); 67 | -------------------------------------------------------------------------------- /src/web/routes/index.tsx: -------------------------------------------------------------------------------- 1 | import type { FastifyInstance } from "fastify"; 2 | import Layout from "../components/Layout"; // Import the Layout component 3 | 4 | /** 5 | * Registers the root route that serves the main HTML page. 6 | * @param server - The Fastify instance. 
7 | */ 8 | export function registerIndexRoute(server: FastifyInstance) { 9 | server.get("/", async (_, reply) => { 10 | reply.type("text/html"); 11 | // Use the Layout component and define the main content within it 12 | return ( 13 | "" + 14 | ( 15 | 16 | {/* Job Queue Section */} 17 |
18 |
19 |

20 | Job Queue 21 |

22 | 33 |
34 | {/* Container for the job list, loaded via HTMX */} 35 |
36 | {/* Initial loading state */} 37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 | {/* Add New Job Section */} 45 |
46 | {/* Container for the add job form, loaded via HTMX */} 47 |
48 | {/* Initial loading state (optional, could just be empty) */} 49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 | {/* Indexed Documentation Section */} 57 |
58 |

59 | Indexed Documentation 60 |

61 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 | 74 | ) 75 | ); 76 | }); 77 | } 78 | -------------------------------------------------------------------------------- /src/web/routes/jobs/cancel.tsx: -------------------------------------------------------------------------------- 1 | import type { FastifyInstance } from "fastify"; 2 | import type { CancelJobTool } from "../../../tools/CancelJobTool"; 3 | 4 | /** 5 | * Registers the API route for cancelling jobs. 6 | * @param server - The Fastify instance. 7 | * @param cancelJobTool - The tool instance for cancelling jobs. 8 | */ 9 | export function registerCancelJobRoute( 10 | server: FastifyInstance, 11 | cancelJobTool: CancelJobTool 12 | ) { 13 | // POST /api/jobs/:jobId/cancel - Cancel a job by ID 14 | server.post<{ Params: { jobId: string } }>( 15 | "/api/jobs/:jobId/cancel", 16 | async (request, reply) => { 17 | const { jobId } = request.params; 18 | const result = await cancelJobTool.execute({ jobId }); 19 | if (result.success) { 20 | return { success: true, message: result.message }; 21 | } else { 22 | reply.status(400); 23 | return { success: false, message: result.message }; 24 | } 25 | } 26 | ); 27 | } 28 | -------------------------------------------------------------------------------- /src/web/routes/jobs/clear-completed.tsx: -------------------------------------------------------------------------------- 1 | import type { FastifyInstance } from "fastify"; 2 | import type { ClearCompletedJobsTool } from "../../../tools/ClearCompletedJobsTool"; 3 | 4 | /** 5 | * Registers the API route for clearing completed jobs. 6 | * @param server - The Fastify instance. 7 | * @param clearCompletedJobsTool - The tool instance for clearing completed jobs. 
8 | */ 9 | export function registerClearCompletedJobsRoute( 10 | server: FastifyInstance, 11 | clearCompletedJobsTool: ClearCompletedJobsTool 12 | ) { 13 | // POST /api/jobs/clear-completed - Clear all completed jobs 14 | server.post("/api/jobs/clear-completed", async (_, reply) => { 15 | try { 16 | const result = await clearCompletedJobsTool.execute({}); 17 | 18 | reply.type("application/json"); 19 | return { 20 | success: result.success, 21 | message: result.message, 22 | }; 23 | } catch (error) { 24 | reply.code(500); 25 | return { 26 | success: false, 27 | message: `Internal server error: ${error instanceof Error ? error.message : String(error)}`, 28 | }; 29 | } 30 | }); 31 | } 32 | -------------------------------------------------------------------------------- /src/web/routes/jobs/list.tsx: -------------------------------------------------------------------------------- 1 | import type { FastifyInstance } from "fastify"; 2 | import type { ListJobsTool } from "../../../tools/ListJobsTool"; // Adjusted import path 3 | import JobList from "../../components/JobList"; // Import the extracted component 4 | 5 | /** 6 | * Registers the API route for listing jobs. 7 | * @param server - The Fastify instance. 8 | * @param listJobsTool - The tool instance for listing jobs. 
9 | */ 10 | export function registerJobListRoutes( 11 | server: FastifyInstance, 12 | listJobsTool: ListJobsTool 13 | ) { 14 | // GET /api/jobs - List current jobs (only the list) 15 | server.get("/api/jobs", async () => { 16 | const result = await listJobsTool.execute({}); 17 | return ; 18 | }); 19 | } 20 | -------------------------------------------------------------------------------- /src/web/routes/libraries/list.tsx: -------------------------------------------------------------------------------- 1 | import type { FastifyInstance } from "fastify"; 2 | import type { ListLibrariesTool } from "../../../tools/ListLibrariesTool"; 3 | import { RemoveTool } from "../../../tools"; 4 | import LibraryList from "../../components/LibraryList"; 5 | 6 | /** 7 | * Registers the API routes for library management. 8 | * @param server - The Fastify instance. 9 | * @param listLibrariesTool - The tool instance for listing libraries. 10 | * @param removeTool - The tool instance for removing library versions. 11 | */ 12 | export function registerLibrariesRoutes( 13 | server: FastifyInstance, 14 | listLibrariesTool: ListLibrariesTool, 15 | removeTool: RemoveTool // Accept RemoveTool 16 | ) { 17 | server.get("/api/libraries", async (_request, reply) => { 18 | // Add reply 19 | try { 20 | const result = await listLibrariesTool.execute(); 21 | // Set content type to HTML for JSX rendering 22 | reply.type("text/html; charset=utf-8"); 23 | // Render the component directly 24 | return ; 25 | } catch (error) { 26 | server.log.error(error, "Failed to list libraries"); 27 | reply.status(500).send("Internal Server Error"); // Handle errors 28 | } 29 | }); 30 | 31 | // Add DELETE route for removing versions 32 | server.delete<{ Params: { libraryName: string; versionParam: string } }>( 33 | "/api/libraries/:libraryName/versions/:versionParam", 34 | async (request, reply) => { 35 | const { libraryName, versionParam } = request.params; 36 | const version = versionParam === "unversioned" ? 
undefined : versionParam; 37 | try { 38 | await removeTool.execute({ library: libraryName, version }); 39 | reply.status(204).send(); // No Content on success 40 | } catch (error: any) { 41 | server.log.error( 42 | error, 43 | `Failed to remove ${libraryName}@${versionParam}` 44 | ); 45 | // Check for specific errors if needed, e.g., NotFoundError 46 | reply 47 | .status(500) 48 | .send({ message: error.message || "Failed to remove version." }); 49 | } 50 | } 51 | ); 52 | } 53 | -------------------------------------------------------------------------------- /src/web/styles/main.css: -------------------------------------------------------------------------------- 1 | /* Import Tailwind CSS */ 2 | @import "tailwindcss"; 3 | 4 | /* Import Flowbite default theme */ 5 | @import "flowbite/src/themes/default"; 6 | 7 | /* Import Flowbite plugin */ 8 | @plugin "flowbite/plugin"; 9 | @plugin "flowbite-typography"; 10 | 11 | /* Configure Flowbite source files */ 12 | @source "../../../node_modules/flowbite"; 13 | 14 | @layer components { 15 | a { 16 | @apply underline-offset-8; 17 | } 18 | 19 | button { 20 | @apply cursor-pointer; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/web/web.ts: -------------------------------------------------------------------------------- 1 | import path from "node:path"; 2 | import formBody from "@fastify/formbody"; 3 | import fastifyStatic from "@fastify/static"; 4 | import Fastify, { type FastifyInstance } from "fastify"; 5 | import type { PipelineManager } from "../pipeline/PipelineManager"; 6 | import type { DocumentManagementService } from "../store/DocumentManagementService"; 7 | import { SearchTool } from "../tools"; 8 | import { CancelJobTool } from "../tools/CancelJobTool"; 9 | import { ClearCompletedJobsTool } from "../tools/ClearCompletedJobsTool"; 10 | import { ListJobsTool } from "../tools/ListJobsTool"; 11 | import { ListLibrariesTool } from "../tools/ListLibrariesTool"; 12 
| import { RemoveTool } from "../tools/RemoveTool"; 13 | import { ScrapeTool } from "../tools/ScrapeTool"; 14 | import { logger } from "../utils/logger"; 15 | import { getProjectRoot } from "../utils/paths"; 16 | import { registerIndexRoute } from "./routes/index"; 17 | import { registerCancelJobRoute } from "./routes/jobs/cancel"; 18 | import { registerClearCompletedJobsRoute } from "./routes/jobs/clear-completed"; 19 | import { registerJobListRoutes } from "./routes/jobs/list"; 20 | import { registerNewJobRoutes } from "./routes/jobs/new"; 21 | import { registerLibraryDetailRoutes } from "./routes/libraries/detail"; 22 | import { registerLibrariesRoutes } from "./routes/libraries/list"; 23 | 24 | /** 25 | * Initializes the Fastify web server instance. 26 | * 27 | * @param port The port number for the web server. 28 | * @param docService The document management service instance. 29 | * @param pipelineManager The pipeline manager instance. 30 | * @returns The initialized Fastify server instance. 
31 | */ 32 | export async function startWebServer( 33 | port: number, 34 | docService: DocumentManagementService, 35 | pipelineManager: PipelineManager, 36 | ): Promise { 37 | const server = Fastify({ 38 | logger: false, // Use our own logger instead 39 | }); 40 | 41 | // Register plugins 42 | await server.register(formBody); // Register formbody to parse form data 43 | 44 | // Instantiate tools using provided services 45 | const listLibrariesTool = new ListLibrariesTool(docService); 46 | const listJobsTool = new ListJobsTool(pipelineManager); 47 | const scrapeTool = new ScrapeTool(docService, pipelineManager); 48 | const removeTool = new RemoveTool(docService, pipelineManager); 49 | const searchTool = new SearchTool(docService); 50 | const cancelJobTool = new CancelJobTool(pipelineManager); 51 | const clearCompletedJobsTool = new ClearCompletedJobsTool(pipelineManager); 52 | 53 | // Register static file serving 54 | await server.register(fastifyStatic, { 55 | // Use project root to construct absolute path to public directory 56 | root: path.join(getProjectRoot(), "public"), 57 | prefix: "/", 58 | index: false, // Disable automatic index.html serving 59 | }); 60 | 61 | // Register routes 62 | registerIndexRoute(server); // Register the root route first 63 | registerJobListRoutes(server, listJobsTool); 64 | registerNewJobRoutes(server, scrapeTool); 65 | registerCancelJobRoute(server, cancelJobTool); 66 | registerClearCompletedJobsRoute(server, clearCompletedJobsTool); 67 | registerLibrariesRoutes(server, listLibrariesTool, removeTool); 68 | registerLibraryDetailRoutes(server, listLibrariesTool, searchTool); 69 | 70 | // Graceful shutdown of services will be handled by the caller (src/index.ts) 71 | 72 | try { 73 | const address = await server.listen({ port, host: "0.0.0.0" }); 74 | logger.info(`🚀 Web UI available at ${address}`); 75 | return server; // Return the server instance 76 | } catch (error) { 77 | logger.error(`❌ Failed to start web UI: ${error}`); 78 | // 
Ensure server is closed if listen fails but initialization succeeded partially 79 | await server.close(); 80 | throw error; 81 | } 82 | } 83 | 84 | /** 85 | * Stops the provided Fastify web server instance. 86 | * 87 | * @param server - The Fastify server instance to stop. 88 | */ 89 | export async function stopWebServer(server: FastifyInstance): Promise { 90 | try { 91 | await server.close(); 92 | logger.info("🛑 Web UI stopped."); 93 | } catch (error) { 94 | logger.error(`❌ Failed to stop web server gracefully: ${error}`); 95 | // Rethrow or handle as needed, but ensure the process doesn't hang 96 | throw error; 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2022", 4 | "jsx": "react-jsx", 5 | "jsxImportSource": "@kitajs/html", 6 | "plugins": [{ "name": "@kitajs/ts-html-plugin" }], 7 | "module": "ESNext", 8 | "moduleResolution": "bundler", 9 | "noEmit": true, 10 | "esModuleInterop": true, 11 | "forceConsistentCasingInFileNames": true, 12 | "strict": true, 13 | "skipLibCheck": true, 14 | "outDir": "dist", 15 | "sourceMap": true, 16 | "declaration": true, 17 | "allowJs": true, 18 | "resolveJsonModule": true, 19 | "rootDir": "src", 20 | "types": ["@kitajs/html/htmx.d.ts", "vite/client"] 21 | }, 22 | "include": ["src/**/*"] 23 | } 24 | -------------------------------------------------------------------------------- /vite.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vitest/config"; 2 | import path from 'path'; 3 | import packageJson from "./package.json"; 4 | 5 | export default defineConfig({ 6 | plugins: [ 7 | ], 8 | resolve: { 9 | // Keep existing resolve extensions 10 | extensions: [".ts", ".tsx", ".js", ".jsx", ".json"], 11 | }, 12 | optimizeDeps: { 13 | force: true 14 | }, 15 | build: { 16 | 
outDir: 'dist', // Output directory 17 | sourcemap: true, // Generate sourcemaps 18 | emptyOutDir: true, // Clean the output directory before build (replaces tsup clean:true) 19 | lib: { 20 | // Define entry points using path.resolve for robustness 21 | entry: { 22 | index: path.resolve(__dirname, 'src/index.ts'), 23 | }, 24 | formats: ['es'], // Output ESM format only 25 | // Output filename will be based on the entry key (index.js) 26 | // fileName: (format, entryName) => `${entryName}.js`, 27 | }, 28 | rollupOptions: { 29 | // Externalize dependencies and node built-ins 30 | external: [ 31 | /^node:/, // Externalize all node built-ins (e.g., 'node:fs', 'node:path') 32 | ...Object.keys(packageJson.dependencies || {}), 33 | // Explicitly externalize potentially problematic packages if needed 34 | 'fingerprint-generator', 35 | 'header-generator', 36 | 'better-sqlite3', // Often needs to be external due to native bindings 37 | 'playwright', // Playwright should definitely be external 38 | 'sqlite-vec', // Likely involves native bindings 39 | ], 40 | 41 | output: { 42 | // Optional: Configure output further if needed 43 | // preserveModules: true, // Uncomment if you need to preserve source file structure 44 | // entryFileNames: '[name].js', // Adjust naming if needed 45 | }, 46 | }, 47 | // Target Node.js environment based on the version running the build 48 | target: `node${process.versions.node.split('.')[0]}`, 49 | ssr: true, // Explicitly mark this as an SSR/Node build 50 | }, 51 | test: { 52 | globals: true, 53 | environment: "node", 54 | testTimeout: 5000, 55 | include: ["src/**/*.test.ts", "src/**/*.test.tsx"], 56 | }, 57 | }); 58 | -------------------------------------------------------------------------------- /vite.config.web.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vite"; 2 | import path from "node:path"; 3 | import tailwindcss from '@tailwindcss/vite' 4 | import packageJson 
from "./package.json"; 5 | 6 | // Vite configuration specifically for building frontend assets (CSS, JS) 7 | export default defineConfig({ 8 | // No need for dts plugin for frontend assets 9 | plugins: [tailwindcss()], 10 | resolve: { 11 | // Keep existing resolve extensions 12 | extensions: [".ts", ".tsx", ".js", ".jsx", ".json"], 13 | }, 14 | build: { 15 | // Output assets to public/assets, so they can be served statically 16 | outDir: path.resolve(__dirname, "public/assets"), 17 | sourcemap: true, // Generate sourcemaps for easier debugging 18 | emptyOutDir: true, // Clean the output directory before build 19 | // Define the frontend entry point 20 | lib: { 21 | entry: path.resolve(__dirname, "src/web/main.client.ts"), // Updated entry point 22 | // Use 'es' format for modern browsers 23 | formats: ["es"], 24 | // Define a fixed output filename for the JS bundle 25 | fileName: () => "main.js", 26 | }, 27 | rollupOptions: { 28 | // Unlike the backend build, we DO NOT externalize frontend dependencies 29 | // They should be bundled into main.js 30 | external: [], // Ensure no dependencies are externalized 31 | output: { 32 | // Ensure CSS is output as a separate file named main.css 33 | assetFileNames: "main.css", // Directly name the CSS output 34 | }, 35 | }, 36 | // Target modern browsers 37 | target: "esnext", 38 | // This is NOT an SSR build 39 | ssr: false, 40 | }, 41 | }); 42 | --------------------------------------------------------------------------------