├── .clineignore ├── .clinerules ├── .dockerignore ├── .env.example ├── .github └── workflows │ ├── ci.yml │ └── release.yml ├── .gitignore ├── .husky ├── commit-msg └── pre-commit ├── .releaserc.json ├── ARCHITECTURE.md ├── CHANGELOG.md ├── Dockerfile ├── LICENSE ├── README.md ├── biome.json ├── commitlint.config.js ├── db └── migrations │ ├── 000-initial-schema.sql │ └── 001-add-indexed-at-column.sql ├── docker-compose.yml ├── docs └── docs-mcp-server.png ├── package-lock.json ├── package.json ├── postcss.config.cjs ├── src ├── index.ts ├── mcp │ ├── index.ts │ ├── mcpServer.ts │ ├── startHttpServer.ts │ ├── startStdioServer.ts │ ├── tools.ts │ └── utils.ts ├── pipeline │ ├── PipelineManager.test.ts │ ├── PipelineManager.ts │ ├── PipelineWorker.test.ts │ ├── PipelineWorker.ts │ ├── errors.ts │ ├── index.ts │ └── types.ts ├── scraper │ ├── ScraperRegistry.test.ts │ ├── ScraperRegistry.ts │ ├── ScraperService.test.ts │ ├── ScraperService.ts │ ├── fetcher │ │ ├── FileFetcher.test.ts │ │ ├── FileFetcher.ts │ │ ├── FingerprintGenerator.test.ts │ │ ├── FingerprintGenerator.ts │ │ ├── HttpFetcher.test.ts │ │ ├── HttpFetcher.ts │ │ ├── index.ts │ │ └── types.ts │ ├── index.ts │ ├── middleware │ │ ├── HtmlCheerioParserMiddleware.test.ts │ │ ├── HtmlCheerioParserMiddleware.ts │ │ ├── HtmlJsExecutorMiddleware.test.ts │ │ ├── HtmlJsExecutorMiddleware.ts │ │ ├── HtmlLinkExtractorMiddleware.test.ts │ │ ├── HtmlLinkExtractorMiddleware.ts │ │ ├── HtmlMetadataExtractorMiddleware.test.ts │ │ ├── HtmlMetadataExtractorMiddleware.ts │ │ ├── HtmlPlaywrightMiddleware.test.ts │ │ ├── HtmlPlaywrightMiddleware.ts │ │ ├── HtmlSanitizerMiddleware.test.ts │ │ ├── HtmlSanitizerMiddleware.ts │ │ ├── HtmlToMarkdownMiddleware.test.ts │ │ ├── HtmlToMarkdownMiddleware.ts │ │ ├── MarkdownLinkExtractorMiddleware.test.ts │ │ ├── MarkdownLinkExtractorMiddleware.ts │ │ ├── MarkdownMetadataExtractorMiddleware.test.ts │ │ ├── MarkdownMetadataExtractorMiddleware.ts │ │ ├── index.ts │ │ └── types.ts │ ├── 
pipelines │ │ ├── BasePipeline.test.ts │ │ ├── BasePipeline.ts │ │ ├── HtmlPipeline.test.ts │ │ ├── HtmlPipeline.ts │ │ ├── MarkdownPipeline.test.ts │ │ ├── MarkdownPipeline.ts │ │ └── types.ts │ ├── strategies │ │ ├── BaseScraperStrategy.test.ts │ │ ├── BaseScraperStrategy.ts │ │ ├── GitHubScraperStrategy.ts │ │ ├── LocalFileStrategy.test.ts │ │ ├── LocalFileStrategy.ts │ │ ├── NpmScraperStrategy.ts │ │ ├── PyPiScraperStrategy.ts │ │ ├── WebScraperStrategy.test.ts │ │ └── WebScraperStrategy.ts │ ├── types.ts │ └── utils │ │ ├── buffer.test.ts │ │ ├── buffer.ts │ │ ├── patternMatcher.test.ts │ │ ├── patternMatcher.ts │ │ ├── sandbox.test.ts │ │ ├── sandbox.ts │ │ ├── scope.test.ts │ │ └── scope.ts ├── splitter │ ├── GreedySplitter.test.ts │ ├── GreedySplitter.ts │ ├── SemanticMarkdownSplitter.test.ts │ ├── SemanticMarkdownSplitter.ts │ ├── errors.ts │ ├── index.ts │ ├── splitters │ │ ├── CodeContentSplitter.test.ts │ │ ├── CodeContentSplitter.ts │ │ ├── TableContentSplitter.test.ts │ │ ├── TableContentSplitter.ts │ │ ├── TextContentSplitter.test.ts │ │ ├── TextContentSplitter.ts │ │ └── types.ts │ └── types.ts ├── store │ ├── DocumentManagementService.test.ts │ ├── DocumentManagementService.ts │ ├── DocumentRetrieverService.test.ts │ ├── DocumentRetrieverService.ts │ ├── DocumentStore.test.ts │ ├── DocumentStore.ts │ ├── applyMigrations.ts │ ├── embeddings │ │ ├── EmbeddingFactory.test.ts │ │ ├── EmbeddingFactory.ts │ │ ├── FixedDimensionEmbeddings.test.ts │ │ └── FixedDimensionEmbeddings.ts │ ├── errors.ts │ ├── index.ts │ └── types.ts ├── tools │ ├── CancelJobTool.test.ts │ ├── CancelJobTool.ts │ ├── ClearCompletedJobsTool.test.ts │ ├── ClearCompletedJobsTool.ts │ ├── FetchUrlTool.test.ts │ ├── FetchUrlTool.ts │ ├── FindVersionTool.test.ts │ ├── FindVersionTool.ts │ ├── GetJobInfoTool.test.ts │ ├── GetJobInfoTool.ts │ ├── ListJobsTool.test.ts │ ├── ListJobsTool.ts │ ├── ListLibrariesTool.test.ts │ ├── ListLibrariesTool.ts │ ├── RemoveTool.test.ts │ ├── 
RemoveTool.ts │ ├── ScrapeTool.test.ts │ ├── ScrapeTool.ts │ ├── SearchTool.test.ts │ ├── SearchTool.ts │ ├── errors.test.ts │ ├── errors.ts │ └── index.ts ├── types │ └── index.ts ├── utils │ ├── config.ts │ ├── dom.ts │ ├── errors.ts │ ├── logger.ts │ ├── mimeTypeUtils.ts │ ├── paths.ts │ ├── string.ts │ ├── url.test.ts │ └── url.ts └── web │ ├── components │ ├── Alert.tsx │ ├── JobItem.tsx │ ├── JobList.tsx │ ├── Layout.tsx │ ├── LibraryDetailCard.tsx │ ├── LibraryItem.tsx │ ├── LibraryList.tsx │ ├── LibrarySearchCard.tsx │ ├── LoadingSpinner.tsx │ ├── ScrapeForm.tsx │ ├── ScrapeFormContent.tsx │ ├── SearchResultItem.tsx │ ├── SearchResultList.tsx │ ├── SearchResultSkeletonItem.tsx │ ├── Tooltip.tsx │ ├── VersionBadge.tsx │ ├── VersionDetailsRow.tsx │ └── utils.ts │ ├── main.client.ts │ ├── routes │ ├── index.tsx │ ├── jobs │ │ ├── cancel.tsx │ │ ├── clear-completed.tsx │ │ ├── list.tsx │ │ └── new.tsx │ └── libraries │ │ ├── detail.tsx │ │ └── list.tsx │ ├── styles │ └── main.css │ └── web.ts ├── tsconfig.json ├── vite.config.ts └── vite.config.web.ts /.clineignore: -------------------------------------------------------------------------------- 1 | package-lock.json 2 | dist/ 3 | .git/ 4 | *.log 5 | .env.* 6 | !.env.example 7 | !.github/ 8 | -------------------------------------------------------------------------------- /.clinerules: -------------------------------------------------------------------------------- 1 | # Cline Custom Instructions 2 | 3 | - You must read the `README.md` to understand the project structure and setup. 4 | - You must read the `ARCHITECTURE.md` file before making changes across multiple services. 5 | - You must follow DRY, KISS, YAGNI, and SOLID principles. 6 | - You must use the latest version of the programming language and libraries. 7 | - Prefer the simplest solution. 8 | - Never commit secrets, credentials, or sensitive data to the repository. 
9 | - When importing a relative path, avoid using file extensions like ".js" and ".ts". 10 | - Update TSDoc for all classes, methods and functions. Focus on functionality and reasoning. 11 | - NEVER document individual parameters or return values if their use can easily be derived from their name. 12 | - When asked to check the documentation of a library, use the `search_docs` tool. 13 | 14 | ## Architecture 15 | 16 | - Focus on system concepts and component relationships. 17 | - Put implementation details in source code. 18 | - Update `ARCHITECTURE.md` when the architecture changes. 19 | - Do not use special characters like braces in mermaid diagram titles or names. Quote them if necessary. 20 | 21 | ## Git 22 | 23 | - Branches must be created locally before pushing. 24 | - Branch names must be prefixed with type (`feature/`, `bugfix/`, `chore/`) and include the issue number if available (e.g., `feature/1234-description`). 25 | - All commit messages must use Conventional Commits (`feat:`, `fix:`, etc.). 26 | - Commit subject must be imperative mood and ≤72 characters. 27 | - If a commit body is present, add a blank line before it. 28 | - Commit body (for non-trivial changes) must explain what and why, not how. 29 | - Reference related issues in commit messages when relevant (e.g., `Closes #123`). 30 | - Do not include unrelated changes in a single commit. 31 | - Do not use vague or generic commit messages. 32 | - Pull request descriptions must summarize the what and why of all changes in the branch (not just a list of commits or the how). 33 | - Pull requests must target `main` unless specified otherwise. 34 | 35 | ## Typescript 36 | 37 | - Install dependencies using `npm install` 38 | - Prefer a specific type or `unknown` over `any`. 39 | - Do not use non-null assertions (`!`). Use optional chaining (`?.`) or nullish coalescing (`??`). 40 | 41 | ## Logging Guidelines 42 | 43 | - Use `console.*` for CLI user output (results, direct feedback). 
44 | - Use `logger.info/warn/error` for meaningful application events; prefix with a relevant emoji. 45 | - Use `logger.debug` for detailed developer/tracing logs; no emoji prefix. 46 | - Prefer `logger.debug` over `logger.info` for granular internal steps to reduce log verbosity. 47 | 48 | ## Web UI 49 | 50 | - Use AlpineJS for frontend components and TailwindCSS for styling. 51 | - Use TSX with kitajs for AlpineJS components. 52 | - Use HTMX for server-side interactions. 53 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Docker specific 2 | Dockerfile 3 | .dockerignore 4 | 5 | # Version control 6 | .git 7 | .github 8 | .husky 9 | 10 | # Already in gitignore but explicitly listed for Docker context 11 | node_modules 12 | dist 13 | *.log 14 | .env* 15 | 16 | # Other 17 | .store 18 | README.md 19 | ARCHITECTURE.md 20 | CHANGELOG.md 21 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Embedding Model Configuration 2 | # Optional: Format is "provider:model_name" or just "model_name" for OpenAI (default) 3 | # Examples: 4 | # - openai:text-embedding-3-small (default if no provider specified) 5 | # - vertex:text-embedding-004 (Google Cloud Vertex AI) 6 | # - gemini:gemini-embedding-exp-03-07 (Google Generative AI) 7 | # - aws:amazon.titan-embed-text-v1 8 | # - microsoft:text-embedding-ada-002 9 | DOCS_MCP_EMBEDDING_MODEL= 10 | 11 | # OpenAI Provider Configuration (Default) 12 | # Required for OpenAI provider or as fallback 13 | OPENAI_API_KEY=your-key-here 14 | # Optional: Your OpenAI Organization ID 15 | OPENAI_ORG_ID= 16 | # Optional: Custom base URL for OpenAI-compatible APIs (e.g., Ollama, Azure OpenAI) 17 | OPENAI_API_BASE= 18 | 19 | # Google Cloud Vertex AI Configuration 20 | # Required for 
vertex provider: Path to service account JSON key file 21 | GOOGLE_APPLICATION_CREDENTIALS=/path/to/gcp-key.json 22 | 23 | # Google Generative AI (Gemini) Configuration 24 | # Required for gemini provider: Google API key 25 | GOOGLE_API_KEY=your-google-api-key 26 | 27 | # AWS Bedrock Configuration 28 | # Required for aws provider 29 | AWS_ACCESS_KEY_ID=your-aws-key 30 | AWS_SECRET_ACCESS_KEY=your-aws-secret 31 | AWS_REGION=us-east-1 32 | # Optional: Use BEDROCK_AWS_REGION instead of AWS_REGION if needed 33 | # BEDROCK_AWS_REGION=us-east-1 34 | 35 | # Azure OpenAI Configuration 36 | # Required for microsoft provider 37 | AZURE_OPENAI_API_KEY=your-azure-key 38 | AZURE_OPENAI_API_INSTANCE_NAME=your-instance 39 | AZURE_OPENAI_API_DEPLOYMENT_NAME=your-deployment 40 | AZURE_OPENAI_API_VERSION=2024-02-01 41 | 42 | # Optional: Specify a custom directory to store the SQLite database file (documents.db). 43 | # If set, this path takes precedence over the default locations. 44 | # Default behavior (if unset): 45 | # 1. Uses './.store/' in the project root if it exists (legacy). 46 | # 2. Falls back to OS-specific data directory (e.g., ~/Library/Application Support/docs-mcp-server on macOS). 
47 | # DOCS_MCP_STORE_PATH=/path/to/your/desired/storage/directory 48 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | 9 | jobs: 10 | lint: 11 | name: Lint 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout code 15 | uses: actions/checkout@v4 16 | 17 | - name: Set up Node.js 18 | uses: actions/setup-node@v4 19 | with: 20 | node-version: '>=20.0.0' # Match engines requirement in package.json 21 | cache: 'npm' 22 | 23 | - name: Install dependencies 24 | run: npm ci 25 | 26 | - name: Run linter 27 | run: npm run lint 28 | 29 | test: 30 | name: Test 31 | runs-on: ubuntu-latest 32 | needs: lint # Run after linting passes 33 | steps: 34 | - name: Checkout code 35 | uses: actions/checkout@v4 36 | 37 | - name: Set up Node.js 38 | uses: actions/setup-node@v4 39 | with: 40 | node-version: '>=20.0.0' 41 | cache: 'npm' 42 | 43 | - name: Install dependencies 44 | run: npm ci 45 | 46 | - name: Install Playwright browsers 47 | run: npx playwright install --no-shell --with-deps chromium 48 | 49 | - name: Run tests 50 | run: npx vitest run 51 | 52 | build: 53 | name: Build 54 | runs-on: ubuntu-latest 55 | needs: test # Run after tests pass 56 | steps: 57 | - name: Checkout code 58 | uses: actions/checkout@v4 59 | 60 | - name: Set up Node.js 61 | uses: actions/setup-node@v4 62 | with: 63 | node-version: '>=20.0.0' 64 | cache: 'npm' 65 | 66 | - name: Install dependencies 67 | run: npm ci 68 | 69 | - name: Run build 70 | run: npm run build 71 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release 2 | 3 | on: 4 | workflow_dispatch: # Allows manual triggering 5 | 6 | jobs: 7 | 
release: 8 | name: Release to npm and GitHub 9 | runs-on: ubuntu-latest 10 | # Permissions needed for semantic-release to commit/tag/release 11 | permissions: 12 | contents: write 13 | issues: write 14 | pull-requests: write 15 | # id-token: write # Needed for OIDC trusted publishing (if not using NPM_TOKEN) 16 | outputs: 17 | # Output whether a new release was published 18 | new_release_published: ${{ steps.semantic.outputs.new_release_published }} 19 | new_release_version: ${{ steps.semantic.outputs.new_release_version }} 20 | steps: 21 | - name: Checkout code 22 | # Need fetch-depth: 0 for semantic-release to analyze all relevant commits 23 | # and commit package.json/CHANGELOG.md changes 24 | uses: actions/checkout@v4 25 | with: 26 | fetch-depth: 0 27 | 28 | - name: Set up Node.js 29 | uses: actions/setup-node@v4 30 | with: 31 | node-version: '>=20.0.0' # Match engines requirement in package.json 32 | registry-url: 'https://registry.npmjs.org' # Specify npm registry 33 | cache: 'npm' 34 | 35 | - name: Install dependencies 36 | run: npm ci 37 | 38 | - name: Run build 39 | run: npm run build 40 | 41 | - name: Run semantic-release 42 | id: semantic # Give step an ID to reference its outputs 43 | uses: cycjimmy/semantic-release-action@v4 44 | env: 45 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 46 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} 47 | 48 | docker_publish: 49 | name: Build and Push Docker Image to GHCR 50 | # Run only after the release job completes successfully 51 | needs: release 52 | # Run only if semantic-release actually published a new version 53 | if: needs.release.outputs.new_release_published == 'true' 54 | runs-on: ubuntu-latest 55 | permissions: 56 | contents: read # Needed to check out the code 57 | packages: write # Needed to push Docker image to GHCR 58 | attestations: write # Needed for build attestations 59 | id-token: write # Needed for OIDC (good practice) 60 | 61 | steps: 62 | - name: Checkout code 63 | # Checkout the specific commit 
tagged by semantic-release 64 | uses: actions/checkout@v4 65 | with: 66 | # Use the tag name determined by the release job 67 | ref: v${{ needs.release.outputs.new_release_version }} 68 | 69 | - name: Set up Docker Buildx 70 | uses: docker/setup-buildx-action@v3 71 | 72 | - name: Log in to GitHub Container Registry 73 | uses: docker/login-action@v3 74 | with: 75 | registry: ghcr.io 76 | username: ${{ github.actor }} 77 | password: ${{ secrets.GITHUB_TOKEN }} 78 | 79 | - name: Extract Docker metadata 80 | id: meta 81 | uses: docker/metadata-action@v5 82 | with: 83 | images: ghcr.io/${{ github.repository }} 84 | # Use the version from the semantic-release output 85 | tags: | 86 | type=raw,value=${{ needs.release.outputs.new_release_version }} # e.g., v1.4.1 87 | type=semver,pattern={{version}},value=${{ needs.release.outputs.new_release_version }} # e.g., 1.4.1 88 | type=semver,pattern=v{{major}}.{{minor}},value=${{ needs.release.outputs.new_release_version }} # e.g., v1.4 89 | type=semver,pattern=v{{major}},value=${{ needs.release.outputs.new_release_version }} # e.g., v1 90 | type=raw,value=latest,enable=true # Always tag latest on main branch release 91 | 92 | - name: Build and push Docker image 93 | id: push 94 | uses: docker/build-push-action@v6 95 | with: 96 | context: . 
97 | push: true 98 | tags: ${{ steps.meta.outputs.tags }} 99 | labels: ${{ steps.meta.outputs.labels }} 100 | cache-from: type=gha 101 | cache-to: type=gha,mode=max 102 | platforms: linux/amd64,linux/arm64 # Build for both x86_64 and arm64 (Mac Silicon) 103 | 104 | - name: Generate artifact attestation 105 | uses: actions/attest-build-provenance@v1 106 | with: 107 | subject-name: ghcr.io/${{ github.repository }} 108 | subject-digest: ${{ steps.push.outputs.digest }} 109 | push-to-registry: true 110 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | dist/ 3 | .store/ 4 | public/assets/ 5 | *.log 6 | .env* 7 | !*.env.example 8 | *.code-workspace 9 | -------------------------------------------------------------------------------- /.husky/commit-msg: -------------------------------------------------------------------------------- 1 | npx commitlint --edit $1 2 | -------------------------------------------------------------------------------- /.husky/pre-commit: -------------------------------------------------------------------------------- 1 | npx lint-staged 2 | -------------------------------------------------------------------------------- /.releaserc.json: -------------------------------------------------------------------------------- 1 | { 2 | "branches": ["main"], 3 | "plugins": [ 4 | "@semantic-release/commit-analyzer", 5 | "@semantic-release/release-notes-generator", 6 | [ 7 | "@semantic-release/changelog", 8 | { 9 | "changelogFile": "CHANGELOG.md" 10 | } 11 | ], 12 | [ 13 | "@semantic-release/npm", 14 | { 15 | "npmPublish": true, 16 | "pkgRoot": "." 
17 | } 18 | ], 19 | [ 20 | "@semantic-release/git", 21 | { 22 | "assets": ["package.json", "CHANGELOG.md"], 23 | "message": "chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}" 24 | } 25 | ], 26 | [ 27 | "@semantic-release/github", 28 | { 29 | "assets": [] 30 | } 31 | ] 32 | ] 33 | } 34 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # Build stage 2 | FROM node:22-slim AS builder 3 | 4 | WORKDIR /app 5 | 6 | # Copy package files 7 | COPY package*.json ./ 8 | 9 | # Install dependencies 10 | RUN npm ci 11 | 12 | # Copy source code 13 | COPY . . 14 | 15 | # Build application 16 | RUN npm run build 17 | 18 | # Production stage 19 | FROM node:22-slim 20 | 21 | WORKDIR /app 22 | 23 | # Copy package files 24 | COPY package*.json . 25 | COPY db db 26 | 27 | # Install production dependencies only 28 | RUN npm ci --omit=dev 29 | 30 | # Install system Chromium and required dependencies 31 | RUN apt-get update \ 32 | && apt-get install -y --no-install-recommends chromium \ 33 | && apt-get clean \ 34 | && rm -rf /var/lib/apt/lists/* /tmp/* \ 35 | && CHROMIUM_PATH=$(command -v chromium || command -v chromium-browser) \ 36 | && if [ -z "$CHROMIUM_PATH" ]; then echo "Chromium executable not found!" 
&& exit 1; fi \ 37 | && if [ "$CHROMIUM_PATH" != "/usr/bin/chromium" ]; then echo "Unexpected Chromium path: $CHROMIUM_PATH" && exit 1; fi \ 38 | && echo "Chromium installed at $CHROMIUM_PATH" 39 | 40 | # Set Playwright to use system Chromium (hardcoded path, as ENV cannot use shell vars) 41 | ENV PLAYWRIGHT_CHROMIUM_EXECUTABLE_PATH=/usr/bin/chromium 42 | 43 | # Copy built files from builder 44 | COPY --from=builder /app/dist ./dist 45 | COPY --from=builder /app/public ./public 46 | 47 | # Set data directory for the container 48 | ENV DOCS_MCP_STORE_PATH=/data 49 | 50 | # Define volumes 51 | VOLUME /data 52 | 53 | # Expose the ports the applications listen on 54 | EXPOSE 6280 55 | EXPOSE 6281 56 | 57 | # Set the command to run the application 58 | ENTRYPOINT ["node", "dist/index.js"] 59 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Andre Rabold 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /biome.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "https://biomejs.dev/schema.json", 3 | "formatter": { 4 | "enabled": true, 5 | "formatWithErrors": false, 6 | "ignore": [], 7 | "attributePosition": "auto", 8 | "indentStyle": "space", 9 | "indentWidth": 2, 10 | "lineWidth": 90, 11 | "lineEnding": "lf" 12 | }, 13 | "files": { 14 | "include": ["src/**/*.ts"] 15 | }, 16 | "overrides": [ 17 | { 18 | "include": ["src/**/*.test.ts"], 19 | "linter": { 20 | "rules": { 21 | "style": { 22 | "noNonNullAssertion": "off" 23 | } 24 | } 25 | } 26 | } 27 | ] 28 | } 29 | -------------------------------------------------------------------------------- /commitlint.config.js: -------------------------------------------------------------------------------- 1 | // commitlint.config.js 2 | /** @type {import('@commitlint/types').UserConfig} */ 3 | export default { 4 | extends: ["@commitlint/config-conventional"], 5 | rules: { 6 | "body-max-line-length": [0, "always"], 7 | "footer-max-line-length": [0, "always"], 8 | }, 9 | }; 10 | -------------------------------------------------------------------------------- /db/migrations/000-initial-schema.sql: -------------------------------------------------------------------------------- 1 | -- Initial database schema setup 2 | 3 | -- Documents table 4 | CREATE TABLE IF NOT EXISTS documents( 5 | id INTEGER PRIMARY KEY AUTOINCREMENT, 6 | library TEXT NOT NULL, 7 | version TEXT NOT NULL DEFAULT '', 8 | url TEXT NOT NULL, 9 | content TEXT, 10 | metadata JSON, 11 | sort_order INTEGER NOT NULL, 12 | UNIQUE(url, library, version, 
sort_order) 13 | ); 14 | 15 | -- Indexes 16 | CREATE INDEX IF NOT EXISTS idx_documents_library_lower ON documents(lower(library)); 17 | CREATE INDEX IF NOT EXISTS idx_documents_version_lower ON documents(lower(library), lower(version)); 18 | 19 | -- Create Embeddings virtual table 20 | -- Note: Dimension is hardcoded here based on the value in schema.ts at the time of creation. 21 | -- If VECTOR_DIMENSION changes, a separate migration would be needed to update/recreate this table. 22 | CREATE VIRTUAL TABLE IF NOT EXISTS documents_vec USING vec0( 23 | library TEXT NOT NULL, 24 | version TEXT NOT NULL, 25 | embedding FLOAT[1536] 26 | ); 27 | 28 | -- Create FTS5 virtual table 29 | CREATE VIRTUAL TABLE IF NOT EXISTS documents_fts USING fts5( 30 | content, 31 | title, 32 | url, 33 | path, 34 | tokenize='porter unicode61', 35 | content='documents', 36 | content_rowid='id' 37 | ); 38 | 39 | -- Delete trigger to maintain FTS index 40 | CREATE TRIGGER IF NOT EXISTS documents_fts_after_delete AFTER DELETE ON documents BEGIN 41 | INSERT INTO documents_fts(documents_fts, rowid, content, title, url, path) 42 | VALUES('delete', old.id, old.content, json_extract(old.metadata, '$.title'), old.url, json_extract(old.metadata, '$.path')); 43 | END; 44 | 45 | -- Update trigger to maintain FTS index 46 | CREATE TRIGGER IF NOT EXISTS documents_fts_after_update AFTER UPDATE ON documents BEGIN 47 | INSERT INTO documents_fts(documents_fts, rowid, content, title, url, path) 48 | VALUES('delete', old.id, old.content, json_extract(old.metadata, '$.title'), old.url, json_extract(old.metadata, '$.path')); 49 | INSERT INTO documents_fts(rowid, content, title, url, path) 50 | VALUES(new.id, new.content, json_extract(new.metadata, '$.title'), new.url, json_extract(new.metadata, '$.path')); 51 | END; 52 | 53 | -- Insert trigger to maintain FTS index 54 | CREATE TRIGGER IF NOT EXISTS documents_fts_after_insert AFTER INSERT ON documents BEGIN 55 | INSERT INTO documents_fts(rowid, content, title, url, 
path) 56 | VALUES(new.id, new.content, json_extract(new.metadata, '$.title'), new.url, json_extract(new.metadata, '$.path')); 57 | END; 58 | -------------------------------------------------------------------------------- /db/migrations/001-add-indexed-at-column.sql: -------------------------------------------------------------------------------- 1 | -- Add indexed_at column to track when documents were last indexed 2 | -- Step 1: Add the column allowing NULLs (SQLite limitation workaround) 3 | ALTER TABLE documents ADD COLUMN indexed_at DATETIME; 4 | 5 | -- Step 2: Update existing rows to set the timestamp 6 | UPDATE documents SET indexed_at = CURRENT_TIMESTAMP WHERE indexed_at IS NULL; 7 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | docs-mcp-server: 3 | image: ghcr.io/arabold/docs-mcp-server:latest 4 | build: 5 | context: . 6 | dockerfile: Dockerfile 7 | command: ["--protocol", "http", "--port", "6280"] 8 | # platform: linux/amd64 9 | container_name: docs-mcp-server 10 | ports: 11 | - "6280:6280" 12 | env_file: 13 | - .env 14 | environment: 15 | - MCP_PORT=6280 16 | volumes: 17 | - docs-mcp-data:/data 18 | 19 | docs-mcp-web: 20 | image: ghcr.io/arabold/docs-mcp-server:latest 21 | build: 22 | context: . 
23 | dockerfile: Dockerfile 24 | command: ["web", "--port", "6281"] 25 | # platform: linux/amd64 26 | container_name: docs-mcp-web 27 | ports: 28 | - "6281:6281" 29 | env_file: 30 | - .env 31 | environment: 32 | - WEB_PORT=6281 33 | volumes: 34 | - docs-mcp-data:/data 35 | 36 | volumes: 37 | docs-mcp-data: 38 | name: docs-mcp-data 39 | -------------------------------------------------------------------------------- /docs/docs-mcp-server.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arabold/docs-mcp-server/2c6fb88ac09b82baea2068e878855d5b78e2a6e2/docs/docs-mcp-server.png -------------------------------------------------------------------------------- /postcss.config.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | plugins: { 3 | "@tailwindcss/postcss": {}, // Use the dedicated PostCSS plugin 4 | autoprefixer: {}, 5 | }, 6 | }; 7 | -------------------------------------------------------------------------------- /src/mcp/index.ts: -------------------------------------------------------------------------------- 1 | import "dotenv/config"; 2 | import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; 3 | import type { PipelineManager } from "../pipeline/PipelineManager"; 4 | import type { DocumentManagementService } from "../store/DocumentManagementService"; 5 | import { logger } from "../utils/logger"; 6 | import { startHttpServer } from "./startHttpServer"; 7 | import { startStdioServer } from "./startStdioServer"; 8 | import { type McpServerTools, initializeTools } from "./tools"; 9 | 10 | // Variables to hold server instances for cleanup 11 | let runningServer: McpServer | null = null; 12 | 13 | export async function startServer( 14 | protocol: "stdio" | "http", 15 | docService: DocumentManagementService, // NEW PARAM 16 | pipelineManager: PipelineManager, // NEW PARAM 17 | port?: number, // Existing optional param 
18 | ) { 19 | try { 20 | // Initialize and get shared tools 21 | const tools: McpServerTools = await initializeTools(docService, pipelineManager); // Pass instances 22 | 23 | let serverInstance: McpServer; 24 | if (protocol === "stdio") { 25 | serverInstance = await startStdioServer(tools); // startStdioServer needs to return McpServer 26 | } else if (protocol === "http") { 27 | if (port === undefined) { 28 | logger.error("❌ HTTP protocol requires a port."); 29 | process.exit(1); 30 | } 31 | serverInstance = await startHttpServer(tools, port); // startHttpServer needs to return McpServer 32 | } else { 33 | // This case should be caught by src/server.ts, but handle defensively 34 | logger.error(`❌ Unknown protocol: ${protocol}`); 35 | process.exit(1); 36 | } 37 | 38 | // Capture the running server instance 39 | runningServer = serverInstance; 40 | } catch (error) { 41 | logger.error(`❌ Fatal Error during server startup: ${error}`); 42 | // Attempt cleanup even if startup failed partially 43 | await stopServer(); 44 | process.exit(1); 45 | } 46 | } 47 | 48 | /** 49 | * Stops the MCP server instance gracefully. 50 | * Shared services (PipelineManager, DocumentManagementService) are shut down 51 | * separately by the caller (e.g., src/index.ts). 52 | */ 53 | export async function stopServer() { 54 | logger.debug("Attempting to close MCP Server instance..."); 55 | let hadError = false; 56 | try { 57 | if (runningServer) { 58 | logger.debug("Closing MCP Server instance (McpServer/McpHttpServer)..."); 59 | await runningServer.close(); 60 | logger.debug("MCP Server instance closed."); 61 | } else { 62 | logger.debug("MCP Server instance was not running or already null."); 63 | } 64 | } catch (e) { 65 | logger.error(`❌ Error closing MCP Server instance: ${e}`); 66 | hadError = true; 67 | } 68 | 69 | runningServer = null; 70 | // DocumentManagementService and PipelineManager instances are managed and shut down by src/index.ts. 
71 | 72 | if (hadError) { 73 | logger.warn("⚠️ MCP Server instance stopped with errors."); 74 | } else { 75 | logger.info("✅ MCP Server instance stopped."); 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /src/mcp/startStdioServer.ts: -------------------------------------------------------------------------------- 1 | import type { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; 2 | import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; 3 | import { LogLevel, logger, setLogLevel } from "../utils/logger"; 4 | import { createMcpServerInstance } from "./mcpServer"; 5 | import type { McpServerTools } from "./tools"; 6 | 7 | /** 8 | * Starts the MCP server using the Stdio transport. 9 | * @param tools The shared tool instances. 10 | * @returns The created McpServer instance. 11 | */ 12 | export async function startStdioServer(tools: McpServerTools): Promise { 13 | setLogLevel(LogLevel.ERROR); 14 | 15 | // Create a server instance using the factory and shared tools 16 | const server = createMcpServerInstance(tools); 17 | 18 | // Start server with Stdio transport 19 | const transport = new StdioServerTransport(); 20 | await server.connect(transport); 21 | logger.info("🤖 MCP server listening on stdio"); 22 | 23 | // Return the server instance 24 | return server; 25 | } 26 | -------------------------------------------------------------------------------- /src/mcp/tools.ts: -------------------------------------------------------------------------------- 1 | import type { PipelineManager } from "../pipeline/PipelineManager"; 2 | import { FileFetcher, HttpFetcher } from "../scraper/fetcher"; 3 | import type { DocumentManagementService } from "../store/DocumentManagementService"; 4 | import { 5 | CancelJobTool, 6 | FetchUrlTool, 7 | FindVersionTool, 8 | GetJobInfoTool, 9 | ListJobsTool, 10 | ListLibrariesTool, 11 | RemoveTool, 12 | ScrapeTool, 13 | SearchTool, 14 | } from "../tools"; 15 | 
16 | /** 17 | * Interface for the shared tool instances. 18 | */ 19 | export interface McpServerTools { 20 | listLibraries: ListLibrariesTool; 21 | findVersion: FindVersionTool; 22 | scrape: ScrapeTool; 23 | search: SearchTool; 24 | listJobs: ListJobsTool; 25 | getJobInfo: GetJobInfoTool; 26 | cancelJob: CancelJobTool; 27 | remove: RemoveTool; 28 | fetchUrl: FetchUrlTool; 29 | } 30 | 31 | /** 32 | * Initializes and returns the shared tool instances. 33 | * This should be called after initializeServices has completed. 34 | * @param docService The initialized DocumentManagementService instance. 35 | * @param pipelineManager The initialized PipelineManager instance. 36 | * @returns An object containing all instantiated tool instances. 37 | */ 38 | export async function initializeTools( 39 | docService: DocumentManagementService, 40 | pipelineManager: PipelineManager, 41 | ): Promise<McpServerTools> { 42 | const tools: McpServerTools = { 43 | listLibraries: new ListLibrariesTool(docService), 44 | findVersion: new FindVersionTool(docService), 45 | scrape: new ScrapeTool(docService, pipelineManager), 46 | search: new SearchTool(docService), 47 | listJobs: new ListJobsTool(pipelineManager), 48 | getJobInfo: new GetJobInfoTool(pipelineManager), 49 | cancelJob: new CancelJobTool(pipelineManager), 50 | // clearCompletedJobs: new ClearCompletedJobsTool(pipelineManager), 51 | remove: new RemoveTool(docService, pipelineManager), 52 | fetchUrl: new FetchUrlTool(new HttpFetcher(), new FileFetcher()), 53 | }; 54 | 55 | return tools; 56 | } 57 | -------------------------------------------------------------------------------- /src/mcp/utils.ts: -------------------------------------------------------------------------------- 1 | import type { CallToolResult } from "@modelcontextprotocol/sdk/types.js"; 2 | 3 | /** 4 | * Creates a success response object in the format expected by the MCP server. 5 | * @param text The text content of the response. 6 | * @returns The response object. 
7 | */ 8 | export function createResponse(text: string): CallToolResult { 9 | return { 10 | content: [ 11 | { 12 | type: "text", 13 | text, 14 | }, 15 | ], 16 | isError: false, 17 | }; 18 | } 19 | 20 | /** 21 | * Creates an error response object in the format expected by the MCP server. 22 | * @param text The error message. 23 | * @returns The response object. 24 | */ 25 | export function createError(text: string): CallToolResult { 26 | return { 27 | content: [ 28 | { 29 | type: "text", 30 | text, 31 | }, 32 | ], 33 | isError: true, 34 | }; 35 | } 36 | -------------------------------------------------------------------------------- /src/pipeline/PipelineWorker.ts: -------------------------------------------------------------------------------- 1 | import type { ScraperService } from "../scraper"; 2 | import type { ScraperProgress } from "../scraper/types"; 3 | import type { DocumentManagementService } from "../store"; 4 | import { logger } from "../utils/logger"; 5 | import { CancellationError } from "./errors"; 6 | import type { PipelineJob, PipelineManagerCallbacks } from "./types"; 7 | 8 | /** 9 | * Executes a single document processing job. 10 | * Handles scraping, storing documents, and reporting progress/errors via callbacks. 11 | */ 12 | export class PipelineWorker { 13 | // Dependencies are passed in, making the worker stateless regarding specific jobs 14 | private readonly store: DocumentManagementService; 15 | private readonly scraperService: ScraperService; 16 | 17 | // Constructor accepts dependencies needed for execution 18 | constructor(store: DocumentManagementService, scraperService: ScraperService) { 19 | this.store = store; 20 | this.scraperService = scraperService; 21 | } 22 | 23 | /** 24 | * Executes the given pipeline job. 25 | * @param job - The job to execute. 26 | * @param callbacks - Callbacks provided by the manager for reporting. 
27 | */ 28 | async executeJob(job: PipelineJob, callbacks: PipelineManagerCallbacks): Promise<void> { 29 | const { id: jobId, library, version, options, abortController } = job; 30 | const signal = abortController.signal; 31 | 32 | logger.debug(`[${jobId}] Worker starting job for ${library}@${version}`); 33 | 34 | try { 35 | // --- Core Job Logic --- 36 | await this.scraperService.scrape( 37 | options, 38 | async (progress: ScraperProgress) => { 39 | // Check for cancellation signal before processing each document 40 | if (signal.aborted) { 41 | throw new CancellationError("Job cancelled during scraping progress"); 42 | } 43 | 44 | // Update job object directly (manager holds the reference) 45 | job.progress = progress; 46 | // Report progress via manager's callback 47 | await callbacks.onJobProgress?.(job, progress); 48 | 49 | if (progress.document) { 50 | try { 51 | // TODO: Pass signal to store.addDocument if it supports it 52 | await this.store.addDocument(library, version, { 53 | pageContent: progress.document.content, 54 | metadata: progress.document.metadata, 55 | }); 56 | logger.debug( 57 | `[${jobId}] Stored document: ${progress.document.metadata.url}`, 58 | ); 59 | } catch (docError) { 60 | logger.error( 61 | `❌ [${jobId}] Failed to store document ${progress.document.metadata.url}: ${docError}`, 62 | ); 63 | // Report document-specific errors via manager's callback 64 | await callbacks.onJobError?.( 65 | job, 66 | docError instanceof Error ? docError : new Error(String(docError)), 67 | progress.document, 68 | ); 69 | // Decide if a single document error should fail the whole job 70 | // For now, we log and continue. To fail, re-throw here. 
71 | } 72 | } 73 | }, 74 | signal, // Pass signal to scraper service 75 | ); 76 | // --- End Core Job Logic --- 77 | 78 | // Check signal one last time after scrape finishes 79 | if (signal.aborted) { 80 | throw new CancellationError("Job cancelled"); 81 | } 82 | 83 | // If successful and not cancelled, the manager will handle status update 84 | logger.debug(`[${jobId}] Worker finished job successfully.`); 85 | } catch (error) { 86 | // Re-throw error to be caught by the manager in _runJob 87 | logger.warn(`⚠️ [${jobId}] Worker encountered error: ${error}`); 88 | throw error; 89 | } 90 | // Note: The manager (_runJob) is responsible for updating final job status (COMPLETED/FAILED/CANCELLED) 91 | // and resolving/rejecting the completion promise based on the outcome here. 92 | } 93 | 94 | // --- Old methods removed --- 95 | // process() 96 | // stop() 97 | // setCallbacks() 98 | // handleScrapingProgress() 99 | } 100 | -------------------------------------------------------------------------------- /src/pipeline/errors.ts: -------------------------------------------------------------------------------- 1 | export class PipelineError extends Error { 2 | constructor( 3 | message: string, 4 | public readonly cause?: Error, 5 | ) { 6 | super(message); 7 | this.name = this.constructor.name; 8 | if (cause?.stack) { 9 | this.stack = `${this.stack}\nCaused by: ${cause.stack}`; 10 | } 11 | } 12 | } 13 | 14 | export class DocumentProcessingError extends PipelineError { 15 | constructor( 16 | message: string, 17 | public readonly documentId: string, 18 | cause?: Error, 19 | ) { 20 | super(`Failed to process document ${documentId}: ${message}`, cause); 21 | } 22 | } 23 | 24 | export class PipelineStateError extends PipelineError {} 25 | 26 | /** 27 | * Error indicating that an operation was cancelled. 
28 | */ 29 | export class CancellationError extends PipelineError { 30 | constructor(message = "Operation cancelled") { 31 | super(message); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/pipeline/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./PipelineManager"; 2 | export * from "./PipelineWorker"; 3 | export * from "./errors"; 4 | -------------------------------------------------------------------------------- /src/pipeline/types.ts: -------------------------------------------------------------------------------- 1 | import type { ScraperOptions, ScraperProgress } from "../scraper/types"; 2 | import type { Document } from "../types"; // Use local Document type 3 | 4 | /** 5 | * Represents the possible states of a pipeline job. 6 | */ 7 | export enum PipelineJobStatus { 8 | QUEUED = "queued", 9 | RUNNING = "running", 10 | COMPLETED = "completed", 11 | FAILED = "failed", 12 | CANCELLING = "cancelling", 13 | CANCELLED = "cancelled", 14 | } 15 | 16 | /** 17 | * Represents a single document processing job within the pipeline. 18 | */ 19 | export interface PipelineJob { 20 | /** Unique identifier for the job. */ 21 | id: string; 22 | /** The library name associated with the job. */ 23 | library: string; 24 | /** The library version associated with the job. */ 25 | version: string; 26 | /** Options provided for the scraper. */ 27 | options: ScraperOptions; 28 | /** Current status of the job. */ 29 | status: PipelineJobStatus; 30 | /** Detailed progress information. */ 31 | progress: ScraperProgress | null; 32 | /** Error object if the job failed. */ 33 | error: Error | null; 34 | /** Timestamp when the job was created. */ 35 | createdAt: Date; 36 | /** Timestamp when the job started running. */ 37 | startedAt: Date | null; 38 | /** Timestamp when the job finished (completed, failed, or cancelled). 
*/ 39 | finishedAt: Date | null; 40 | /** AbortController to signal cancellation. */ 41 | abortController: AbortController; 42 | /** Promise that resolves/rejects when the job finishes. */ 43 | completionPromise: Promise<void>; 44 | /** Resolver function for the completion promise. */ 45 | resolveCompletion: () => void; 46 | /** Rejector function for the completion promise. */ 47 | rejectCompletion: (reason?: unknown) => void; 48 | } 49 | 50 | /** 51 | * Defines the structure for callback functions used with the PipelineManager. 52 | * Allows external components to hook into job lifecycle events. 53 | */ 54 | export interface PipelineManagerCallbacks { 55 | /** Callback triggered when a job's status changes. */ 56 | onJobStatusChange?: (job: PipelineJob) => Promise<void>; 57 | /** Callback triggered when a job makes progress. */ 58 | onJobProgress?: (job: PipelineJob, progress: ScraperProgress) => Promise<void>; 59 | /** Callback triggered when a job encounters an error during processing (e.g., storing a doc). 
*/ 60 | onJobError?: (job: PipelineJob, error: Error, document?: Document) => Promise<void>; 61 | } 62 | -------------------------------------------------------------------------------- /src/scraper/ScraperRegistry.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it, vi } from "vitest"; 2 | import { ScraperError } from "../utils/errors"; 3 | import { ScraperRegistry } from "./ScraperRegistry"; 4 | import { GitHubScraperStrategy } from "./strategies/GitHubScraperStrategy"; 5 | import { LocalFileStrategy } from "./strategies/LocalFileStrategy"; 6 | import { NpmScraperStrategy } from "./strategies/NpmScraperStrategy"; 7 | import { PyPiScraperStrategy } from "./strategies/PyPiScraperStrategy"; 8 | 9 | vi.mock("../utils/logger"); 10 | 11 | describe("ScraperRegistry", () => { 12 | it("should throw error for unknown URLs", () => { 13 | const registry = new ScraperRegistry(); 14 | expect(() => registry.getStrategy("invalid://example.com")).toThrow(ScraperError); 15 | expect(() => registry.getStrategy("invalid://example.com")).toThrow( 16 | "No strategy found for URL", 17 | ); 18 | }); 19 | 20 | it("should return LocalFileStrategy for file:// URLs", () => { 21 | const registry = new ScraperRegistry(); 22 | const strategy = registry.getStrategy("file:///path/to/file.txt"); 23 | expect(strategy).toBeInstanceOf(LocalFileStrategy); 24 | }); 25 | 26 | it("should return GitHubScraperStrategy for GitHub URLs", () => { 27 | const registry = new ScraperRegistry(); 28 | const strategy = registry.getStrategy("https://github.com/user/repo"); 29 | expect(strategy).toBeInstanceOf(GitHubScraperStrategy); 30 | }); 31 | 32 | it("should return NpmScraperStrategy for NPM URLs", () => { 33 | const registry = new ScraperRegistry(); 34 | const strategy = registry.getStrategy("https://npmjs.com/package/test"); 35 | expect(strategy).toBeInstanceOf(NpmScraperStrategy); 36 | }); 37 | 38 | it("should return PyPiScraperStrategy for PyPI 
URLs", () => { 39 | const registry = new ScraperRegistry(); 40 | const strategy = registry.getStrategy("https://pypi.org/project/test"); 41 | expect(strategy).toBeInstanceOf(PyPiScraperStrategy); 42 | }); 43 | }); 44 | -------------------------------------------------------------------------------- /src/scraper/ScraperRegistry.ts: -------------------------------------------------------------------------------- 1 | import { ScraperError } from "../utils/errors"; 2 | import { validateUrl } from "../utils/url"; 3 | import { GitHubScraperStrategy } from "./strategies/GitHubScraperStrategy"; 4 | import { LocalFileStrategy } from "./strategies/LocalFileStrategy"; 5 | import { NpmScraperStrategy } from "./strategies/NpmScraperStrategy"; 6 | import { PyPiScraperStrategy } from "./strategies/PyPiScraperStrategy"; 7 | import { WebScraperStrategy } from "./strategies/WebScraperStrategy"; 8 | import type { ScraperStrategy } from "./types"; 9 | 10 | export class ScraperRegistry { 11 | private strategies: ScraperStrategy[]; 12 | 13 | constructor() { 14 | this.strategies = [ 15 | new NpmScraperStrategy(), 16 | new PyPiScraperStrategy(), 17 | new GitHubScraperStrategy(), 18 | new WebScraperStrategy(), 19 | new LocalFileStrategy(), 20 | ]; 21 | } 22 | 23 | getStrategy(url: string): ScraperStrategy { 24 | validateUrl(url); 25 | const strategy = this.strategies.find((s) => s.canHandle(url)); 26 | if (!strategy) { 27 | throw new ScraperError(`No strategy found for URL: ${url}`); 28 | } 29 | return strategy; 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/scraper/ScraperService.ts: -------------------------------------------------------------------------------- 1 | import type { ProgressCallback } from "../types"; 2 | import { ScraperError } from "../utils/errors"; 3 | import type { ScraperRegistry } from "./ScraperRegistry"; 4 | import type { ScraperOptions, ScraperProgress } from "./types"; 5 | 6 | /** 7 | * Orchestrates 
document scraping operations using registered scraping strategies. 8 | * Automatically selects appropriate strategy based on URL patterns. 9 | */ 10 | export class ScraperService { 11 | private registry: ScraperRegistry; 12 | 13 | constructor(registry: ScraperRegistry) { 14 | this.registry = registry; 15 | } 16 | 17 | /** 18 | * Scrapes content from the provided URL using the appropriate strategy. 19 | * Reports progress via callback and handles errors. 20 | */ 21 | async scrape( 22 | options: ScraperOptions, 23 | progressCallback: ProgressCallback, 24 | signal?: AbortSignal, // Add optional signal parameter 25 | ): Promise { 26 | // Find strategy for this URL 27 | const strategy = this.registry.getStrategy(options.url); 28 | if (!strategy) { 29 | throw new ScraperError(`No scraper strategy found for URL: ${options.url}`, false); 30 | } 31 | 32 | // Pass the signal down to the strategy 33 | await strategy.scrape(options, progressCallback, signal); 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/scraper/fetcher/FileFetcher.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs/promises"; 2 | import path from "node:path"; 3 | import * as mime from "mime-types"; 4 | import { ScraperError } from "../../utils/errors"; 5 | import { logger } from "../../utils/logger"; 6 | import type { ContentFetcher, FetchOptions, RawContent } from "./types"; 7 | 8 | /** 9 | * Fetches content from local file system. 10 | */ 11 | export class FileFetcher implements ContentFetcher { 12 | canFetch(source: string): boolean { 13 | return source.startsWith("file://"); 14 | } 15 | 16 | /** 17 | * Fetches the content of a file given a file:// URL, decoding percent-encoded paths as needed. 18 | * Only HTML and Markdown files are processed. 
19 | */ 20 | async fetch(source: string, options?: FetchOptions): Promise<RawContent> { 21 | // Always decode the file path from file:// URL 22 | const rawPath = source.replace("file://", ""); 23 | const filePath = decodeURIComponent(rawPath); 24 | 25 | try { 26 | const content = await fs.readFile(filePath); 27 | const ext = path.extname(filePath).toLowerCase(); 28 | const mimeType = mime.lookup(ext) || "application/octet-stream"; 29 | return { 30 | content, 31 | mimeType, 32 | source, 33 | encoding: "utf-8", // Assume UTF-8 for text files 34 | }; 35 | } catch (error: unknown) { 36 | throw new ScraperError( 37 | `Failed to read file ${filePath}: ${ 38 | (error as { message?: string }).message ?? "Unknown error" 39 | }`, 40 | false, 41 | error instanceof Error ? error : undefined, 42 | ); 43 | } 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /src/scraper/fetcher/FingerprintGenerator.test.ts: -------------------------------------------------------------------------------- 1 | import type { HeaderGeneratorOptions } from "header-generator"; 2 | import { describe, expect, it } from "vitest"; 3 | import { FingerprintGenerator } from "./FingerprintGenerator"; 4 | 5 | describe("FingerprintGenerator", () => { 6 | it("should be instantiated without options", () => { 7 | const generator = new FingerprintGenerator(); 8 | expect(generator).toBeInstanceOf(FingerprintGenerator); 9 | }); 10 | 11 | it("should be instantiated with options", () => { 12 | const options: Partial<HeaderGeneratorOptions> = { 13 | browsers: ["firefox"], 14 | }; 15 | const generator = new FingerprintGenerator(options); 16 | expect(generator).toBeInstanceOf(FingerprintGenerator); 17 | }); 18 | 19 | it("should generate headers", () => { 20 | const generator = new FingerprintGenerator(); 21 | const headers = generator.generateHeaders(); 22 | expect(headers).toBeDefined(); 23 | expect(typeof headers).toBe("object"); 24 | expect(Object.keys(headers).length).toBeGreaterThan(0); 25 | 
expect(headers["user-agent"]).toBeDefined(); 26 | expect(headers.accept).toBeDefined(); 27 | expect(headers["accept-language"]).toBeDefined(); 28 | }); 29 | }); 30 | -------------------------------------------------------------------------------- /src/scraper/fetcher/FingerprintGenerator.ts: -------------------------------------------------------------------------------- 1 | import { HeaderGenerator, type HeaderGeneratorOptions } from "header-generator"; 2 | 3 | /** 4 | * Generates realistic browser-like HTTP headers to help avoid bot detection. 5 | * Uses the `header-generator` library for header generation. 6 | */ 7 | export class FingerprintGenerator { 8 | private headerGenerator: HeaderGenerator; 9 | 10 | /** 11 | * Creates an instance of FingerprintGenerator. 12 | * @param options Optional configuration for the header generator. 13 | */ 14 | constructor(options?: Partial<HeaderGeneratorOptions>) { 15 | // Default options for a broad range of realistic headers 16 | const defaultOptions: Partial<HeaderGeneratorOptions> = { 17 | browsers: [{ name: "chrome", minVersion: 100 }, "firefox", "safari"], 18 | devices: ["desktop", "mobile"], 19 | operatingSystems: ["windows", "linux", "macos", "android", "ios"], 20 | locales: ["en-US", "en"], 21 | httpVersion: "2", 22 | }; 23 | 24 | this.headerGenerator = new HeaderGenerator({ 25 | ...defaultOptions, 26 | ...options, 27 | }); 28 | } 29 | 30 | /** 31 | * Generates a set of realistic HTTP headers. 32 | * @returns A set of realistic HTTP headers. 
33 | */ 34 | generateHeaders(): Record<string, string> { 35 | return this.headerGenerator.getHeaders(); 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/scraper/fetcher/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./types"; 2 | export * from "./HttpFetcher"; 3 | export * from "./FileFetcher"; 4 | -------------------------------------------------------------------------------- /src/scraper/fetcher/types.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Raw content fetched from a source before processing. 3 | * Includes metadata about the content for proper processing. 4 | */ 5 | export interface RawContent { 6 | /** Raw content as string or buffer */ 7 | content: string | Buffer; 8 | /** 9 | * MIME type of the content (e.g., "text/html", "application/json"). 10 | * Does not include parameters like charset. 11 | */ 12 | mimeType: string; 13 | /** 14 | * Character set of the content (e.g., "utf-8"), extracted from Content-Type header. 15 | */ 16 | charset?: string; 17 | /** 18 | * Content encoding (e.g., "gzip", "deflate"), from Content-Encoding header. 
19 | */ 20 | encoding?: string; 21 | /** Original source location */ 22 | source: string; 23 | } 24 | 25 | /** 26 | * Options for configuring content fetching behavior 27 | */ 28 | export interface FetchOptions { 29 | /** Maximum retry attempts for failed fetches */ 30 | maxRetries?: number; 31 | /** Base delay between retries in milliseconds */ 32 | retryDelay?: number; 33 | /** Additional headers for HTTP requests */ 34 | headers?: Record<string, string>; 35 | /** Timeout in milliseconds */ 36 | timeout?: number; 37 | /** AbortSignal for cancellation */ 38 | signal?: AbortSignal; 39 | /** Whether to follow HTTP redirects (3xx responses) */ 40 | followRedirects?: boolean; 41 | } 42 | 43 | /** 44 | * Interface for fetching content from different sources 45 | */ 46 | export interface ContentFetcher { 47 | /** 48 | * Check if this fetcher can handle the given source 49 | */ 50 | canFetch(source: string): boolean; 51 | 52 | /** 53 | * Fetch content from the source 54 | */ 55 | fetch(source: string, options?: FetchOptions): Promise<RawContent>; 56 | } 57 | -------------------------------------------------------------------------------- /src/scraper/index.ts: -------------------------------------------------------------------------------- 1 | // Re-export strategies for external use if needed 2 | export { WebScraperStrategy } from "./strategies/WebScraperStrategy"; 3 | export { GitHubScraperStrategy } from "./strategies/GitHubScraperStrategy"; 4 | export { LocalFileStrategy } from "./strategies/LocalFileStrategy"; 5 | export { NpmScraperStrategy } from "./strategies/NpmScraperStrategy"; 6 | export { PyPiScraperStrategy } from "./strategies/PyPiScraperStrategy"; 7 | export { ScraperRegistry } from "./ScraperRegistry"; 8 | export { ScraperService } from "./ScraperService"; 9 | -------------------------------------------------------------------------------- /src/scraper/middleware/HtmlCheerioParserMiddleware.ts: -------------------------------------------------------------------------------- 1 | 
import * as cheerio from "cheerio"; 2 | import { logger } from "../../utils/logger"; 3 | import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; 4 | 5 | /** 6 | * Middleware to parse HTML string/buffer content into a Cheerio object. 7 | * It populates the `context.dom` property. 8 | * Assumes the input HTML in `context.content` is the final version to be parsed 9 | * (e.g., after potential rendering by Playwright or modification by JS execution). 10 | */ 11 | export class HtmlCheerioParserMiddleware implements ContentProcessorMiddleware { 12 | async process(context: MiddlewareContext, next: () => Promise): Promise { 13 | try { 14 | logger.debug(`Parsing HTML content with Cheerio from ${context.source}`); 15 | // Load the HTML string using Cheerio 16 | const $ = cheerio.load(context.content); 17 | 18 | // Add the Cheerio API object to the context 19 | context.dom = $; 20 | 21 | // Proceed to the next middleware 22 | await next(); 23 | } catch (error) { 24 | logger.error( 25 | `❌ Failed to parse HTML with Cheerio for ${context.source}: ${error}`, 26 | ); 27 | context.errors.push( 28 | error instanceof Error 29 | ? error 30 | : new Error(`Cheerio HTML parsing failed: ${String(error)}`), 31 | ); 32 | // Do not proceed further down the pipeline if parsing fails 33 | return; 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/scraper/middleware/HtmlLinkExtractorMiddleware.ts: -------------------------------------------------------------------------------- 1 | import { logger } from "../../utils/logger"; 2 | import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; 3 | 4 | /** 5 | * Middleware to extract links (href attributes from tags) from HTML content using Cheerio. 6 | * It expects the Cheerio API object to be available in `context.dom`. 7 | * This should run *after* parsing but *before* conversion to Markdown. 
8 | */ 9 | export class HtmlLinkExtractorMiddleware implements ContentProcessorMiddleware { 10 | /** 11 | * Processes the context to extract links from the sanitized HTML body. 12 | * @param context The current middleware context. 13 | * @param next Function to call the next middleware. 14 | */ 15 | async process(context: MiddlewareContext, next: () => Promise): Promise { 16 | // Check if we have a Cheerio object from a previous step 17 | const $ = context.dom; 18 | if (!$) { 19 | logger.warn( 20 | `⏭️ Skipping ${this.constructor.name}: context.dom is missing. Ensure HtmlCheerioParserMiddleware runs before this.`, 21 | ); 22 | await next(); 23 | return; 24 | } 25 | 26 | try { 27 | const linkElements = $("a[href]"); 28 | logger.debug(`Found ${linkElements.length} potential links in ${context.source}`); 29 | 30 | const extractedLinks: string[] = []; 31 | linkElements.each((index, element) => { 32 | const href = $(element).attr("href"); 33 | if (href && href.trim() !== "") { 34 | try { 35 | const urlObj = new URL(href, context.source); 36 | if (!["http:", "https:", "file:"].includes(urlObj.protocol)) { 37 | logger.debug(`Ignoring link with invalid protocol: ${href}`); 38 | return; 39 | } 40 | extractedLinks.push(urlObj.href); 41 | } catch (e) { 42 | logger.debug(`Ignoring invalid URL syntax: ${href}`); 43 | } 44 | } 45 | }); 46 | 47 | context.links = [...new Set(extractedLinks)]; 48 | logger.debug( 49 | `Extracted ${context.links.length} unique, valid links from ${context.source}`, 50 | ); 51 | } catch (error) { 52 | logger.error(`❌ Error extracting links from ${context.source}: ${error}`); 53 | context.errors.push( 54 | new Error( 55 | `Failed to extract links from HTML: ${error instanceof Error ? 
error.message : String(error)}`, 56 | ), 57 | ); 58 | } 59 | 60 | await next(); 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/scraper/middleware/HtmlMetadataExtractorMiddleware.ts: -------------------------------------------------------------------------------- 1 | import { logger } from "../../utils/logger"; 2 | import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; 3 | 4 | /** 5 | * Middleware to extract the title from HTML content using Cheerio. 6 | * Assumes context.dom (Cheerio API object) is populated by a preceding middleware 7 | * (e.g., HtmlCheerioParserMiddleware). 8 | */ 9 | export class HtmlMetadataExtractorMiddleware implements ContentProcessorMiddleware { 10 | /** 11 | * Processes the context to extract the HTML title. 12 | * @param context The current processing context. 13 | * @param next Function to call the next middleware. 14 | */ 15 | async process(context: MiddlewareContext, next: () => Promise): Promise { 16 | // Check if Cheerio DOM exists from previous middleware 17 | const $ = context.dom; 18 | if (!$) { 19 | logger.warn( 20 | `⏭️ Skipping ${this.constructor.name}: context.dom is missing. 
Ensure HtmlCheerioParserMiddleware runs before this.`, 21 | ); 22 | await next(); 23 | return; 24 | } 25 | 26 | // Only process if we have a Cheerio object (implicitly means it's HTML) 27 | try { 28 | // Extract title (using title tag, fallback to h1 if title is empty/missing) 29 | let title = $("title").first().text().trim(); 30 | 31 | if (!title) { 32 | // Fallback to the first H1 if title is empty 33 | title = $("h1").first().text().trim(); 34 | } 35 | 36 | // Default to "Untitled" if both are empty 37 | title = title || "Untitled"; 38 | 39 | // Basic cleanup (replace multiple spaces with single space) 40 | title = title.replace(/\s+/g, " ").trim(); 41 | 42 | context.metadata.title = title; 43 | logger.debug(`Extracted title: "${title}" from ${context.source}`); 44 | } catch (error) { 45 | logger.error(`❌ Error extracting metadata from ${context.source}: ${error}`); 46 | context.errors.push( 47 | new Error( 48 | `Failed to extract metadata from HTML: ${error instanceof Error ? error.message : String(error)}`, 49 | ), 50 | ); 51 | // Optionally decide whether to stop the pipeline here 52 | } 53 | 54 | // Call the next middleware in the chain 55 | await next(); 56 | 57 | // No cleanup needed for Cheerio 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /src/scraper/middleware/HtmlSanitizerMiddleware.ts: -------------------------------------------------------------------------------- 1 | import { logger } from "../../utils/logger"; 2 | import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; 3 | 4 | /** 5 | * Options for HtmlSanitizerMiddleware. 6 | */ 7 | export interface HtmlSanitizerOptions { 8 | /** CSS selectors for elements to remove *in addition* to the defaults. */ 9 | excludeSelectors?: string[]; 10 | } 11 | 12 | /** 13 | * Middleware to remove unwanted elements from parsed HTML content using Cheerio. 
14 | * It expects the Cheerio API object (`context.dom`) to be populated by a preceding middleware 15 | * (e.g., HtmlCheerioParserMiddleware). 16 | * It modifies the `context.dom` object in place. 17 | */ 18 | export class HtmlSanitizerMiddleware implements ContentProcessorMiddleware { 19 | // Default selectors to remove 20 | private readonly defaultSelectorsToRemove = [ 21 | "nav", 22 | "footer", 23 | "script", 24 | "style", 25 | "noscript", 26 | "svg", 27 | "link", 28 | "meta", 29 | "iframe", 30 | "header", 31 | "button", 32 | "input", 33 | "textarea", 34 | "select", 35 | // "form", // Keep commented 36 | ".ads", 37 | ".advertisement", 38 | ".banner", 39 | ".cookie-banner", 40 | ".cookie-consent", 41 | ".hidden", 42 | ".hide", 43 | ".modal", 44 | ".nav-bar", 45 | ".overlay", 46 | ".popup", 47 | ".promo", 48 | ".mw-editsection", 49 | ".side-bar", 50 | ".social-share", 51 | ".sticky", 52 | "#ads", 53 | "#banner", 54 | "#cookieBanner", 55 | "#modal", 56 | "#nav", 57 | "#overlay", 58 | "#popup", 59 | "#sidebar", 60 | "#socialMediaBox", 61 | "#stickyHeader", 62 | "#ad-container", 63 | ".ad-container", 64 | ".login-form", 65 | ".signup-form", 66 | ".tooltip", 67 | ".dropdown-menu", 68 | // ".alert", // Keep commented 69 | ".breadcrumb", 70 | ".pagination", 71 | // '[role="alert"]', // Keep commented 72 | '[role="banner"]', 73 | '[role="dialog"]', 74 | '[role="alertdialog"]', 75 | '[role="region"][aria-label*="skip" i]', 76 | '[aria-modal="true"]', 77 | ".noprint", 78 | ]; 79 | 80 | async process(context: MiddlewareContext, next: () => Promise): Promise { 81 | // Check if Cheerio DOM exists 82 | const $ = context.dom; 83 | if (!$) { 84 | logger.warn( 85 | `⏭️ Skipping ${this.constructor.name}: context.dom is missing. 
Ensure HtmlCheerioParserMiddleware runs before this.`, 86 | ); 87 | await next(); 88 | return; 89 | } 90 | 91 | try { 92 | // Remove unwanted elements using Cheerio 93 | const selectorsToRemove = [ 94 | ...(context.options.excludeSelectors || []), // Use options from the context 95 | ...this.defaultSelectorsToRemove, 96 | ]; 97 | logger.debug( 98 | `Removing elements matching ${selectorsToRemove.length} selectors for ${context.source}`, 99 | ); 100 | let removedCount = 0; 101 | for (const selector of selectorsToRemove) { 102 | try { 103 | const elements = $(selector); // Use Cheerio selector 104 | const count = elements.length; 105 | if (count > 0) { 106 | elements.remove(); // Use Cheerio remove 107 | removedCount += count; 108 | } 109 | } catch (selectorError) { 110 | // Log invalid selectors but continue with others 111 | // Cheerio is generally more tolerant of invalid selectors than querySelectorAll 112 | logger.warn( 113 | `⚠️ Potentially invalid selector "${selector}" during element removal: ${selectorError}`, 114 | ); 115 | context.errors.push( 116 | new Error(`Invalid selector "${selector}": ${selectorError}`), 117 | ); 118 | } 119 | } 120 | logger.debug(`Removed ${removedCount} elements for ${context.source}`); 121 | 122 | // The context.dom object ($) has been modified in place. 123 | } catch (error) { 124 | logger.error( 125 | `❌ Error during HTML element removal for ${context.source}: ${error}`, 126 | ); 127 | context.errors.push( 128 | error instanceof Error 129 | ? error 130 | : new Error(`HTML element removal failed: ${String(error)}`), 131 | ); 132 | // Decide if pipeline should stop? For now, continue. 
133 | } 134 | 135 | // Proceed to the next middleware 136 | await next(); 137 | } 138 | } 139 | -------------------------------------------------------------------------------- /src/scraper/middleware/MarkdownLinkExtractorMiddleware.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it, vi } from "vitest"; 2 | import type { ScraperOptions } from "../types"; 3 | import { MarkdownLinkExtractorMiddleware } from "./MarkdownLinkExtractorMiddleware"; 4 | import type { MiddlewareContext } from "./types"; 5 | 6 | // Suppress logger output during tests 7 | vi.mock("../../utils/logger"); 8 | 9 | // Helper to create a minimal valid ScraperOptions object 10 | const createMockScraperOptions = (url = "http://example.com"): ScraperOptions => ({ 11 | url, 12 | library: "test-lib", 13 | version: "1.0.0", 14 | maxDepth: 0, 15 | maxPages: 1, 16 | maxConcurrency: 1, 17 | scope: "subpages", 18 | followRedirects: true, 19 | excludeSelectors: [], 20 | ignoreErrors: false, 21 | }); 22 | 23 | const createMockContext = ( 24 | markdownContent: string, 25 | source = "http://example.com", 26 | initialLinks: string[] = [], 27 | options?: Partial, 28 | ): MiddlewareContext => { 29 | return { 30 | content: markdownContent, 31 | source, 32 | metadata: {}, 33 | links: initialLinks, 34 | errors: [], 35 | options: { ...createMockScraperOptions(source), ...options }, 36 | }; 37 | }; 38 | 39 | describe("MarkdownLinkExtractorMiddleware", () => { 40 | it("should initialize context.links to an empty array if it is undefined", async () => { 41 | const middleware = new MarkdownLinkExtractorMiddleware(); 42 | // Create context with undefined links 43 | const context = createMockContext( 44 | "Some markdown content", 45 | "http://example.com", 46 | undefined, 47 | ); 48 | const next = vi.fn().mockResolvedValue(undefined); 49 | 50 | await middleware.process(context, next); 51 | 52 | expect(next).toHaveBeenCalledOnce(); 53 | 
expect(context.links).toBeDefined(); 54 | expect(Array.isArray(context.links)).toBe(true); 55 | expect(context.links).toHaveLength(0); 56 | }); 57 | 58 | it("should not modify context.links if it is already an array", async () => { 59 | const middleware = new MarkdownLinkExtractorMiddleware(); 60 | const existingLinks = ["https://example.com/link1", "https://example.com/link2"]; 61 | const context = createMockContext( 62 | "Some markdown content", 63 | "http://example.com", 64 | existingLinks, 65 | ); 66 | const next = vi.fn().mockResolvedValue(undefined); 67 | 68 | await middleware.process(context, next); 69 | 70 | expect(next).toHaveBeenCalledOnce(); 71 | expect(context.links).toBe(existingLinks); // Should be the same array instance 72 | expect(context.links).toEqual(existingLinks); // Should have the same content 73 | }); 74 | 75 | it("should always call the next middleware", async () => { 76 | const middleware = new MarkdownLinkExtractorMiddleware(); 77 | // Test with null links to ensure it's handled properly 78 | const context = createMockContext("Some markdown content") as MiddlewareContext; 79 | // @ts-expect-error 80 | context.links = null; // Deliberately set to null to test robustness 81 | const next = vi.fn().mockResolvedValue(undefined); 82 | 83 | await middleware.process(context, next); 84 | 85 | expect(next).toHaveBeenCalledOnce(); 86 | expect(context.links).toBeDefined(); 87 | expect(Array.isArray(context.links)).toBe(true); 88 | }); 89 | 90 | // Note: Since the current implementation is a placeholder and doesn't actually 91 | // extract links, we don't test link extraction functionality yet. 92 | // When the TODO is implemented, additional tests should be added to verify 93 | // that links are correctly extracted from markdown content. 
94 | }); 95 | -------------------------------------------------------------------------------- /src/scraper/middleware/MarkdownLinkExtractorMiddleware.ts: -------------------------------------------------------------------------------- 1 | import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; 2 | 3 | /** 4 | * Placeholder middleware for extracting links from Markdown content. 5 | * Currently, it does not implement link extraction, matching the 6 | * original MarkdownProcessor's TODO status. 7 | */ 8 | export class MarkdownLinkExtractorMiddleware implements ContentProcessorMiddleware { 9 | /** 10 | * Processes the context. Currently a no-op regarding link extraction. 11 | * @param context The current processing context. 12 | * @param next Function to call the next middleware. 13 | */ 14 | async process(context: MiddlewareContext, next: () => Promise): Promise { 15 | // TODO: Implement Markdown link extraction (e.g., using regex or a Markdown parser) 16 | // For now, ensure context.links exists, defaulting to empty array if not set. 17 | if (!Array.isArray(context.links)) { 18 | context.links = []; 19 | } 20 | // No links are added here yet. 21 | 22 | await next(); 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/scraper/middleware/MarkdownMetadataExtractorMiddleware.ts: -------------------------------------------------------------------------------- 1 | import type { ContentProcessorMiddleware, MiddlewareContext } from "./types"; 2 | 3 | /** 4 | * Middleware to extract the title (first H1 heading) from Markdown content. 5 | */ 6 | export class MarkdownMetadataExtractorMiddleware implements ContentProcessorMiddleware { 7 | /** 8 | * Processes the context to extract the title from Markdown. 9 | * @param context The current processing context. 10 | * @param next Function to call the next middleware. 
11 | */ 12 | async process(context: MiddlewareContext, next: () => Promise): Promise { 13 | try { 14 | let title = "Untitled"; 15 | const match = context.content.match(/^#\s+(.*)$/m); 16 | if (match?.[1]) { 17 | title = match[1].trim(); 18 | } 19 | context.metadata.title = title; 20 | } catch (error) { 21 | context.errors.push( 22 | new Error( 23 | `Failed to extract metadata from Markdown: ${error instanceof Error ? error.message : String(error)}`, 24 | ), 25 | ); 26 | } 27 | 28 | await next(); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/scraper/middleware/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./HtmlCheerioParserMiddleware"; 2 | export * from "./HtmlJsExecutorMiddleware"; 3 | export * from "./HtmlLinkExtractorMiddleware"; 4 | export * from "./HtmlMetadataExtractorMiddleware"; 5 | export * from "./HtmlPlaywrightMiddleware"; 6 | export * from "./HtmlSanitizerMiddleware"; 7 | export * from "./HtmlToMarkdownMiddleware"; 8 | export * from "./MarkdownLinkExtractorMiddleware"; 9 | export * from "./MarkdownMetadataExtractorMiddleware"; 10 | -------------------------------------------------------------------------------- /src/scraper/middleware/types.ts: -------------------------------------------------------------------------------- 1 | import type * as cheerio from "cheerio"; 2 | import type { ContentFetcher } from "../fetcher/types"; 3 | import type { ScraperOptions } from "../types"; 4 | 5 | /** 6 | * Represents the context passed through the middleware pipeline. 7 | */ 8 | export interface MiddlewareContext { 9 | /** The content being processed (always a string in middleware). */ 10 | content: string; 11 | /** The original source URL of the content. */ 12 | readonly source: string; 13 | /** Extracted metadata (e.g., title). */ 14 | metadata: Record; 15 | /** Extracted links from the content. 
*/ 16 | links: string[]; 17 | /** Errors encountered during processing. */ 18 | errors: Error[]; 19 | /** Job-specific options influencing processing. */ 20 | readonly options: ScraperOptions; 21 | 22 | /** Optional Cheerio root object for HTML processing. */ 23 | dom?: cheerio.CheerioAPI; 24 | 25 | /** Optional fetcher instance for resolving resources relative to the source. */ 26 | fetcher?: ContentFetcher; 27 | } 28 | 29 | /** 30 | * Defines the interface for a middleware component. 31 | */ 32 | export interface ContentProcessorMiddleware { 33 | /** 34 | * Processes the middleware context asynchronously. 35 | * @param context The current middleware context. 36 | * @param next A function to call to pass control to the next middleware in the pipeline. 37 | */ 38 | process(context: MiddlewareContext, next: () => Promise): Promise; 39 | } 40 | -------------------------------------------------------------------------------- /src/scraper/pipelines/BasePipeline.ts: -------------------------------------------------------------------------------- 1 | import type { ContentFetcher, RawContent } from "../fetcher/types"; 2 | import type { ContentProcessorMiddleware, MiddlewareContext } from "../middleware/types"; 3 | import type { ScraperOptions } from "../types"; 4 | import type { ContentPipeline, ProcessedContent } from "./types"; 5 | 6 | /** 7 | * Base class for content processing pipelines. 8 | * Provides common functionality for executing middleware stacks. 9 | */ 10 | export class BasePipeline implements ContentPipeline { 11 | /** 12 | * Determines if this pipeline can process the given content. 13 | * Must be implemented by derived classes. 14 | */ 15 | public canProcess(_rawContent: RawContent): boolean { 16 | throw new Error("Method not implemented."); 17 | } 18 | 19 | /** 20 | * Processes the raw content through the pipeline. 21 | * Must be implemented by derived classes. 
22 | */ 23 | public async process( 24 | _rawContent: RawContent, 25 | _options: ScraperOptions, 26 | _fetcher?: ContentFetcher, 27 | ): Promise { 28 | throw new Error("Method not implemented."); 29 | } 30 | 31 | /** 32 | * Executes a middleware stack on the given context. 33 | * This is a utility method used by derived pipeline classes. 34 | * 35 | * @param middleware - The middleware stack to execute 36 | * @param context - The context to process 37 | */ 38 | protected async executeMiddlewareStack( 39 | middleware: ContentProcessorMiddleware[], 40 | context: MiddlewareContext, 41 | ): Promise { 42 | let index = -1; 43 | const dispatch = async (i: number): Promise => { 44 | if (i <= index) throw new Error("next() called multiple times"); 45 | index = i; 46 | const mw = middleware[i]; 47 | if (!mw) return; 48 | await mw.process(context, dispatch.bind(null, i + 1)); 49 | }; 50 | 51 | try { 52 | await dispatch(0); 53 | } catch (error) { 54 | context.errors.push(error instanceof Error ? error : new Error(String(error))); 55 | } 56 | } 57 | 58 | /** 59 | * Cleans up resources when the pipeline is no longer needed. 60 | * Default implementation does nothing. 
61 | */ 62 | public async close(): Promise { 63 | // Default implementation does nothing 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /src/scraper/pipelines/HtmlPipeline.ts: -------------------------------------------------------------------------------- 1 | import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; 2 | import type { RawContent } from "../fetcher/types"; 3 | import type { ContentFetcher } from "../fetcher/types"; 4 | import { HtmlSanitizerMiddleware } from "../middleware"; 5 | import { HtmlCheerioParserMiddleware } from "../middleware/HtmlCheerioParserMiddleware"; 6 | import { HtmlLinkExtractorMiddleware } from "../middleware/HtmlLinkExtractorMiddleware"; 7 | import { HtmlMetadataExtractorMiddleware } from "../middleware/HtmlMetadataExtractorMiddleware"; 8 | import { HtmlPlaywrightMiddleware } from "../middleware/HtmlPlaywrightMiddleware"; 9 | import { HtmlToMarkdownMiddleware } from "../middleware/HtmlToMarkdownMiddleware"; 10 | import type { ContentProcessorMiddleware, MiddlewareContext } from "../middleware/types"; 11 | import type { ScraperOptions } from "../types"; 12 | import { convertToString } from "../utils/buffer"; 13 | import { BasePipeline } from "./BasePipeline"; 14 | import type { ProcessedContent } from "./types"; 15 | 16 | /** 17 | * Pipeline for processing HTML content using middleware. 
18 | */ 19 | export class HtmlPipeline extends BasePipeline { 20 | private readonly playwrightMiddleware: HtmlPlaywrightMiddleware; 21 | private readonly standardMiddleware: ContentProcessorMiddleware[]; 22 | 23 | constructor() { 24 | super(); 25 | this.playwrightMiddleware = new HtmlPlaywrightMiddleware(); 26 | this.standardMiddleware = [ 27 | new HtmlCheerioParserMiddleware(), 28 | new HtmlMetadataExtractorMiddleware(), 29 | new HtmlLinkExtractorMiddleware(), 30 | new HtmlSanitizerMiddleware(), 31 | new HtmlToMarkdownMiddleware(), 32 | ]; 33 | } 34 | 35 | canProcess(rawContent: RawContent): boolean { 36 | return MimeTypeUtils.isHtml(rawContent.mimeType); 37 | } 38 | 39 | async process( 40 | rawContent: RawContent, 41 | options: ScraperOptions, 42 | fetcher?: ContentFetcher, 43 | ): Promise { 44 | const contentString = convertToString(rawContent.content, rawContent.charset); 45 | 46 | const context: MiddlewareContext = { 47 | content: contentString, 48 | source: rawContent.source, 49 | metadata: {}, 50 | links: [], 51 | errors: [], 52 | options, 53 | fetcher, 54 | }; 55 | 56 | // Build middleware stack dynamically based on scrapeMode 57 | let middleware: ContentProcessorMiddleware[] = [...this.standardMiddleware]; 58 | if (options.scrapeMode === "playwright" || options.scrapeMode === "auto") { 59 | middleware = [this.playwrightMiddleware, ...middleware]; 60 | } 61 | 62 | // Execute the middleware stack using the base class method 63 | await this.executeMiddlewareStack(middleware, context); 64 | 65 | return { 66 | textContent: typeof context.content === "string" ? 
context.content : "", 67 | metadata: context.metadata, 68 | links: context.links, 69 | errors: context.errors, 70 | }; 71 | } 72 | 73 | async close(): Promise { 74 | await this.playwrightMiddleware.closeBrowser(); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/scraper/pipelines/MarkdownPipeline.ts: -------------------------------------------------------------------------------- 1 | import { MimeTypeUtils } from "../../utils/mimeTypeUtils"; 2 | import type { RawContent } from "../fetcher/types"; 3 | import type { ContentFetcher } from "../fetcher/types"; 4 | import { MarkdownLinkExtractorMiddleware } from "../middleware/MarkdownLinkExtractorMiddleware"; 5 | import { MarkdownMetadataExtractorMiddleware } from "../middleware/MarkdownMetadataExtractorMiddleware"; 6 | import type { ContentProcessorMiddleware, MiddlewareContext } from "../middleware/types"; 7 | import type { ScraperOptions } from "../types"; 8 | import { convertToString } from "../utils/buffer"; 9 | import { BasePipeline } from "./BasePipeline"; 10 | import type { ProcessedContent } from "./types"; 11 | 12 | /** 13 | * Pipeline for processing Markdown content using middleware. 
14 | */ 15 | export class MarkdownPipeline extends BasePipeline { 16 | private readonly middleware: ContentProcessorMiddleware[]; 17 | 18 | constructor() { 19 | super(); 20 | this.middleware = [ 21 | new MarkdownMetadataExtractorMiddleware(), 22 | new MarkdownLinkExtractorMiddleware(), 23 | ]; 24 | } 25 | 26 | canProcess(rawContent: RawContent): boolean { 27 | if (!rawContent.mimeType) return false; 28 | return ( 29 | MimeTypeUtils.isMarkdown(rawContent.mimeType) || 30 | MimeTypeUtils.isText(rawContent.mimeType) 31 | ); 32 | } 33 | 34 | async process( 35 | rawContent: RawContent, 36 | options: ScraperOptions, 37 | fetcher?: ContentFetcher, 38 | ): Promise { 39 | const contentString = convertToString(rawContent.content, rawContent.charset); 40 | 41 | const context: MiddlewareContext = { 42 | content: contentString, 43 | source: rawContent.source, 44 | metadata: {}, 45 | links: [], 46 | errors: [], 47 | options, 48 | fetcher, 49 | }; 50 | 51 | // Execute the middleware stack using the base class method 52 | await this.executeMiddlewareStack(this.middleware, context); 53 | 54 | return { 55 | textContent: typeof context.content === "string" ? context.content : "", 56 | metadata: context.metadata, 57 | links: context.links, 58 | errors: context.errors, 59 | }; 60 | } 61 | 62 | async close(): Promise {} 63 | } 64 | -------------------------------------------------------------------------------- /src/scraper/pipelines/types.ts: -------------------------------------------------------------------------------- 1 | import type { RawContent } from "../fetcher/types"; 2 | import type { ContentFetcher } from "../fetcher/types"; 3 | import type { ScraperOptions } from "../types"; 4 | 5 | /** 6 | * Represents the successfully processed content from a pipeline. 7 | */ 8 | export interface ProcessedContent { 9 | /** The final processed content, typically as a string (e.g., Markdown). */ 10 | textContent: string; 11 | /** Extracted metadata (e.g., title, description). 
*/ 12 | metadata: Record; 13 | /** Extracted links from the content. */ 14 | links: string[]; 15 | /** Any non-critical errors encountered during processing. */ 16 | errors: Error[]; 17 | } 18 | 19 | /** 20 | * Interface for a content processing pipeline. 21 | * Each pipeline is specialized for a certain type of content (e.g., HTML, Markdown). 22 | */ 23 | export interface ContentPipeline { 24 | /** 25 | * Determines if this pipeline can process the given raw content. 26 | * @param rawContent The raw content fetched from a source. 27 | * @returns True if the pipeline can process the content, false otherwise. 28 | */ 29 | canProcess(rawContent: RawContent): boolean; 30 | 31 | /** 32 | * Processes the raw content. 33 | * @param rawContent The raw content to process. 34 | * @param options Scraper options that might influence processing. 35 | * @param fetcher An optional ContentFetcher for resolving relative resources. 36 | * @returns A promise that resolves with the ProcessedContent. 37 | */ 38 | process( 39 | rawContent: RawContent, 40 | options: ScraperOptions, 41 | fetcher?: ContentFetcher, 42 | ): Promise; 43 | 44 | /** 45 | * Closes any resources or connections used by the pipeline. 
46 | */ 47 | close(): Promise; 48 | } 49 | -------------------------------------------------------------------------------- /src/scraper/strategies/GitHubScraperStrategy.ts: -------------------------------------------------------------------------------- 1 | import type { ProgressCallback } from "../../types"; 2 | import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; 3 | import { WebScraperStrategy } from "./WebScraperStrategy"; 4 | 5 | export class GitHubScraperStrategy implements ScraperStrategy { 6 | private defaultStrategy: WebScraperStrategy; 7 | 8 | canHandle(url: string): boolean { 9 | const { hostname } = new URL(url); 10 | return ["github.com", "www.github.com"].includes(hostname); 11 | } 12 | 13 | constructor() { 14 | const shouldFollowLink = (baseUrl: URL, targetUrl: URL) => { 15 | // Must be in same repository 16 | if (this.getRepoPath(baseUrl) !== this.getRepoPath(targetUrl)) { 17 | return false; 18 | } 19 | 20 | const path = targetUrl.pathname; 21 | 22 | // Root README (repository root) 23 | if (path === this.getRepoPath(targetUrl)) { 24 | return true; 25 | } 26 | 27 | // Wiki pages 28 | if (path.startsWith(`${this.getRepoPath(targetUrl)}/wiki`)) { 29 | return true; 30 | } 31 | 32 | // Markdown files under /blob/ 33 | if ( 34 | path.startsWith(`${this.getRepoPath(targetUrl)}/blob/`) && 35 | path.endsWith(".md") 36 | ) { 37 | return true; 38 | } 39 | 40 | return false; 41 | }; 42 | 43 | this.defaultStrategy = new WebScraperStrategy({ 44 | urlNormalizerOptions: { 45 | ignoreCase: true, 46 | removeHash: true, 47 | removeTrailingSlash: true, 48 | removeQuery: true, // Remove query parameters like ?tab=readme-ov-file 49 | }, 50 | shouldFollowLink, 51 | }); 52 | } 53 | 54 | private getRepoPath(url: URL): string { 55 | // Extract // from github.com///... 
56 | const match = url.pathname.match(/^\/[^/]+\/[^/]+/); 57 | return match?.[0] || ""; 58 | } 59 | 60 | async scrape( 61 | options: ScraperOptions, 62 | progressCallback: ProgressCallback, 63 | signal?: AbortSignal, 64 | ): Promise { 65 | // Validate it's a GitHub URL 66 | const url = new URL(options.url); 67 | if (!url.hostname.includes("github.com")) { 68 | throw new Error("URL must be a GitHub URL"); 69 | } 70 | 71 | // Pass signal down to the delegated strategy 72 | await this.defaultStrategy.scrape(options, progressCallback, signal); 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/scraper/strategies/LocalFileStrategy.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs/promises"; 2 | import path from "node:path"; 3 | import type { Document, ProgressCallback } from "../../types"; 4 | import { logger } from "../../utils/logger"; 5 | import { FileFetcher } from "../fetcher"; 6 | import type { RawContent } from "../fetcher/types"; 7 | import { HtmlPipeline } from "../pipelines/HtmlPipeline"; 8 | import { MarkdownPipeline } from "../pipelines/MarkdownPipeline"; 9 | import type { ScraperOptions, ScraperProgress } from "../types"; 10 | import { BaseScraperStrategy, type QueueItem } from "./BaseScraperStrategy"; 11 | 12 | /** 13 | * LocalFileStrategy handles crawling and scraping of local files and folders using file:// URLs. 14 | * 15 | * All files with a MIME type of `text/*` are processed. This includes HTML, Markdown, plain text, and source code files such as `.js`, `.ts`, `.tsx`, `.css`, etc. Binary files, PDFs, images, and other non-text formats are ignored. 16 | * 17 | * Supports include/exclude filters and percent-encoded paths. 
18 | */ 19 | export class LocalFileStrategy extends BaseScraperStrategy { 20 | private readonly fileFetcher = new FileFetcher(); 21 | private readonly htmlPipeline: HtmlPipeline; 22 | private readonly markdownPipeline: MarkdownPipeline; 23 | private readonly pipelines: [HtmlPipeline, MarkdownPipeline]; 24 | 25 | constructor() { 26 | super(); 27 | this.htmlPipeline = new HtmlPipeline(); 28 | this.markdownPipeline = new MarkdownPipeline(); 29 | this.pipelines = [this.htmlPipeline, this.markdownPipeline]; 30 | } 31 | 32 | canHandle(url: string): boolean { 33 | return url.startsWith("file://"); 34 | } 35 | 36 | protected async processItem( 37 | item: QueueItem, 38 | options: ScraperOptions, 39 | _progressCallback?: ProgressCallback, 40 | _signal?: AbortSignal, 41 | ): Promise<{ document?: Document; links?: string[] }> { 42 | // Always decode the file path from file:// URL 43 | const filePath = decodeURIComponent(item.url.replace(/^file:\/\//, "")); 44 | const stats = await fs.stat(filePath); 45 | 46 | if (stats.isDirectory()) { 47 | const contents = await fs.readdir(filePath); 48 | // Only return links that pass shouldProcessUrl 49 | const links = contents 50 | .map((name) => `file://${path.join(filePath, name)}`) 51 | .filter((url) => this.shouldProcessUrl(url, options)); 52 | return { links }; 53 | } 54 | 55 | logger.info(`🗂️ Processing file ${this.pageCount}/${options.maxPages}: ${filePath}`); 56 | 57 | const rawContent: RawContent = await this.fileFetcher.fetch(item.url); 58 | 59 | let processed: Awaited> | undefined; 60 | 61 | for (const pipeline of this.pipelines) { 62 | if (pipeline.canProcess(rawContent)) { 63 | processed = await pipeline.process(rawContent, options, this.fileFetcher); 64 | break; 65 | } 66 | } 67 | 68 | if (!processed) { 69 | logger.warn( 70 | `⚠️ Unsupported content type "${rawContent.mimeType}" for file ${filePath}. 
Skipping processing.`, 71 | ); 72 | return { document: undefined, links: [] }; 73 | } 74 | 75 | for (const err of processed.errors) { 76 | logger.warn(`⚠️ Processing error for ${filePath}: ${err.message}`); 77 | } 78 | 79 | return { 80 | document: { 81 | content: typeof processed.textContent === "string" ? processed.textContent : "", 82 | metadata: { 83 | url: rawContent.source, 84 | title: 85 | typeof processed.metadata.title === "string" 86 | ? processed.metadata.title 87 | : "Untitled", 88 | library: options.library, 89 | version: options.version, 90 | }, 91 | } satisfies Document, 92 | }; 93 | } 94 | 95 | async scrape( 96 | options: ScraperOptions, 97 | progressCallback: ProgressCallback, 98 | signal?: AbortSignal, 99 | ): Promise { 100 | try { 101 | await super.scrape(options, progressCallback, signal); 102 | } finally { 103 | await this.htmlPipeline.close(); 104 | await this.markdownPipeline.close(); 105 | } 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/scraper/strategies/NpmScraperStrategy.ts: -------------------------------------------------------------------------------- 1 | import type { ProgressCallback } from "../../types"; 2 | import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; 3 | import { WebScraperStrategy } from "./WebScraperStrategy"; 4 | 5 | export class NpmScraperStrategy implements ScraperStrategy { 6 | private defaultStrategy: WebScraperStrategy; 7 | 8 | canHandle(url: string): boolean { 9 | const { hostname } = new URL(url); 10 | return ["npmjs.org", "npmjs.com", "www.npmjs.com"].includes(hostname); 11 | } 12 | 13 | constructor() { 14 | this.defaultStrategy = new WebScraperStrategy({ 15 | urlNormalizerOptions: { 16 | ignoreCase: true, 17 | removeHash: true, 18 | removeTrailingSlash: true, 19 | removeQuery: true, // Enable removeQuery for NPM packages 20 | }, 21 | }); 22 | } 23 | 24 | async scrape( 25 | options: ScraperOptions, 26 | progressCallback: 
ProgressCallback, 27 | signal?: AbortSignal, 28 | ): Promise { 29 | // Use default strategy with our configuration, passing the signal 30 | await this.defaultStrategy.scrape(options, progressCallback, signal); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/scraper/strategies/PyPiScraperStrategy.ts: -------------------------------------------------------------------------------- 1 | import type { ProgressCallback } from "../../types"; 2 | import type { ScraperOptions, ScraperProgress, ScraperStrategy } from "../types"; 3 | import { WebScraperStrategy } from "./WebScraperStrategy"; 4 | 5 | export class PyPiScraperStrategy implements ScraperStrategy { 6 | private defaultStrategy: WebScraperStrategy; 7 | 8 | canHandle(url: string): boolean { 9 | const { hostname } = new URL(url); 10 | return ["pypi.org", "www.pypi.org"].includes(hostname); 11 | } 12 | 13 | constructor() { 14 | this.defaultStrategy = new WebScraperStrategy({ 15 | urlNormalizerOptions: { 16 | ignoreCase: true, 17 | removeHash: true, 18 | removeTrailingSlash: true, 19 | removeQuery: true, // Enable removeQuery for PyPI packages 20 | }, 21 | }); 22 | } 23 | 24 | async scrape( 25 | options: ScraperOptions, 26 | progressCallback: ProgressCallback, 27 | signal?: AbortSignal, 28 | ): Promise { 29 | // Use default strategy with our configuration, passing the signal 30 | await this.defaultStrategy.scrape(options, progressCallback, signal); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /src/scraper/types.ts: -------------------------------------------------------------------------------- 1 | import type { Document, ProgressCallback } from "../types"; 2 | 3 | /** 4 | * Enum defining the available HTML processing strategies. 
5 | */ 6 | export enum ScrapeMode { 7 | Fetch = "fetch", 8 | Playwright = "playwright", 9 | Auto = "auto", 10 | } 11 | 12 | /** 13 | * Strategy interface for implementing different scraping behaviors 14 | */ 15 | export interface ScraperStrategy { 16 | canHandle(url: string): boolean; 17 | scrape( 18 | options: ScraperOptions, 19 | progressCallback: ProgressCallback, 20 | signal?: AbortSignal, // Add optional signal 21 | ): Promise; 22 | } 23 | 24 | /** 25 | * Options for configuring the scraping process 26 | */ 27 | export interface ScraperOptions { 28 | url: string; 29 | library: string; 30 | version: string; 31 | maxPages?: number; 32 | maxDepth?: number; 33 | /** 34 | * Defines the allowed crawling boundary relative to the starting URL 35 | * - 'subpages': Only crawl URLs on the same hostname and within the same starting path (default) 36 | * - 'hostname': Crawl any URL on the same exact hostname, regardless of path 37 | * - 'domain': Crawl any URL on the same top-level domain, including subdomains 38 | */ 39 | scope?: "subpages" | "hostname" | "domain"; 40 | /** 41 | * Controls whether HTTP redirects (3xx responses) should be followed 42 | * - When true: Redirects are followed automatically (default) 43 | * - When false: A RedirectError is thrown when a 3xx response is received 44 | */ 45 | followRedirects?: boolean; 46 | maxConcurrency?: number; 47 | ignoreErrors?: boolean; 48 | /** CSS selectors for elements to exclude during HTML processing */ 49 | excludeSelectors?: string[]; 50 | /** 51 | * Determines the HTML processing strategy. 52 | * - 'fetch': Use a simple DOM parser (faster, less JS support). 53 | * - 'playwright': Use a headless browser (slower, full JS support). 54 | * - 'auto': Automatically select the best strategy (currently defaults to 'playwright'). 
55 | * @default ScrapeMode.Auto 56 | */ 57 | scrapeMode?: ScrapeMode; 58 | /** Optional AbortSignal for cancellation */ 59 | signal?: AbortSignal; 60 | /** 61 | * Patterns for including URLs during scraping. If not set, all are included by default. 62 | */ 63 | includePatterns?: string[]; 64 | /** 65 | * Patterns for excluding URLs during scraping. Exclude takes precedence over include. 66 | */ 67 | excludePatterns?: string[]; 68 | /** 69 | * Custom HTTP headers to send with each HTTP request (e.g., for authentication). 70 | * Keys are header names, values are header values. 71 | */ 72 | headers?: Record; 73 | } 74 | 75 | /** 76 | * Result of scraping a single page. Used internally by HtmlScraper. 77 | */ 78 | export interface ScrapedPage { 79 | content: string; 80 | title: string; 81 | url: string; 82 | /** URLs extracted from page links, used for recursive scraping */ 83 | links: string[]; 84 | } 85 | 86 | /** 87 | * Progress information during scraping 88 | */ 89 | export interface ScraperProgress { 90 | pagesScraped: number; 91 | maxPages: number; 92 | currentUrl: string; 93 | depth: number; 94 | maxDepth: number; 95 | document?: Document; 96 | } 97 | -------------------------------------------------------------------------------- /src/scraper/utils/buffer.test.ts: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2025 2 | import { describe, expect, it } from "vitest"; 3 | import { convertToString } from "./buffer"; 4 | 5 | describe("buffer utilities", () => { 6 | describe("convertToString", () => { 7 | it("returns string content unchanged", () => { 8 | const input = "Hello, world!"; 9 | expect(convertToString(input)).toBe(input); 10 | }); 11 | 12 | it("converts Buffer to string with default UTF-8 charset", () => { 13 | const input = Buffer.from("Hello, world!", "utf-8"); 14 | expect(convertToString(input)).toBe("Hello, world!"); 15 | }); 16 | 17 | it("converts Buffer to string with specified UTF-8 charset", 
() => { 18 | const input = Buffer.from("Hello, world!", "utf-8"); 19 | expect(convertToString(input, "utf-8")).toBe("Hello, world!"); 20 | }); 21 | 22 | it("converts Buffer to string with ISO-8859-1 charset", () => { 23 | // Create a buffer with ISO-8859-1 encoding (Latin-1) 24 | // This contains characters that would be encoded differently in UTF-8 25 | const input = Buffer.from("Café", "latin1"); 26 | expect(convertToString(input, "iso-8859-1")).toBe("Café"); 27 | }); 28 | 29 | it("handles special characters correctly with different charsets", () => { 30 | // Test with a string containing various special characters 31 | const specialChars = "äöüßéèêëàáâãåçñ¿¡"; 32 | 33 | // Create buffer with ISO-8859-1 encoding 34 | const latinBuffer = Buffer.from(specialChars, "latin1"); 35 | expect(convertToString(latinBuffer, "iso-8859-1")).toBe(specialChars); 36 | 37 | // Create buffer with UTF-8 encoding 38 | const utf8Buffer = Buffer.from(specialChars, "utf-8"); 39 | expect(convertToString(utf8Buffer, "utf-8")).toBe(specialChars); 40 | }); 41 | 42 | it("defaults to UTF-8 when charset is not specified", () => { 43 | const input = Buffer.from("Hello, world!", "utf-8"); 44 | expect(convertToString(input, undefined)).toBe("Hello, world!"); 45 | }); 46 | 47 | it("handles empty buffer correctly", () => { 48 | const input = Buffer.from([]); 49 | expect(convertToString(input)).toBe(""); 50 | }); 51 | 52 | it("converts Buffer to string with UTF-16LE BOM", () => { 53 | // UTF-16LE BOM: 0xFF 0xFE 54 | const utf16le = Buffer.from([0xff, 0xfe, 0x68, 0x00, 0x69, 0x00]); // 'hi' in UTF-16LE 55 | // Node TextDecoder supports BOM-aware decoding 56 | expect(convertToString(utf16le, "utf-16le")).toBe("hi"); 57 | }); 58 | 59 | it("converts Buffer to string with UTF-16BE BOM", () => { 60 | // UTF-16BE BOM: 0xFE 0xFF 61 | const utf16be = Buffer.from([0xfe, 0xff, 0x00, 0x68, 0x00, 0x69]); // 'hi' in UTF-16BE 62 | // Node TextDecoder does not natively support utf-16be, so skip if not supported 63 
| let decoded: string | undefined; 64 | try { 65 | decoded = convertToString(utf16be, "utf-16be"); 66 | } catch { 67 | decoded = undefined; 68 | } 69 | // Accept either 'hi' or undefined if not supported 70 | expect(["hi", undefined]).toContain(decoded); 71 | }); 72 | 73 | it("converts Buffer to string with UTF-8 BOM", () => { 74 | // UTF-8 BOM: 0xEF 0xBB 0xBF 75 | const utf8bom = Buffer.from([0xef, 0xbb, 0xbf, 0x68, 0x69]); // '\uFEFFhi' in UTF-8 76 | // Node TextDecoder strips BOM by default, so accept both with and without BOM 77 | const result = convertToString(utf8bom, "utf-8"); 78 | expect(["hi", "\uFEFFhi"]).toContain(result); 79 | }); 80 | }); 81 | }); 82 | -------------------------------------------------------------------------------- /src/scraper/utils/buffer.ts: -------------------------------------------------------------------------------- 1 | import iconv from "iconv-lite"; 2 | 3 | /** 4 | * Decodes a Buffer or string to a JavaScript string using the specified charset. 5 | * The charset should be the encoding as reported by the source (e.g., HTTP header). 6 | * The result is always a valid JS string (Unicode/UTF-16). 7 | * 8 | * If the charset is missing or unsupported, falls back to UTF-8. 9 | * 10 | * @param content The content to decode (Buffer or string) 11 | * @param charset The source encoding (e.g., 'utf-8', 'iso-8859-1', 'utf-16le', etc.) 
12 | * @returns The decoded string 13 | */ 14 | export function convertToString(content: string | Buffer, charset?: string): string { 15 | if (typeof content === "string") return content; 16 | try { 17 | return iconv.decode(content, charset || "utf-8"); 18 | } catch { 19 | // Fallback to utf-8 if decoding fails 20 | return iconv.decode(content, "utf-8"); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/scraper/utils/patternMatcher.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from "vitest"; 2 | import { 3 | extractPathAndQuery, 4 | isRegexPattern, 5 | matchesAnyPattern, 6 | patternToRegExp, 7 | shouldIncludeUrl, 8 | } from "./patternMatcher"; 9 | 10 | describe("patternMatcher", () => { 11 | it("isRegexPattern detects regex", () => { 12 | expect(isRegexPattern("/foo.*/")).toBe(true); 13 | expect(isRegexPattern("foo.*/")).toBe(false); 14 | expect(isRegexPattern("/foo.*/")).toBe(true); 15 | expect(isRegexPattern("foo.*")).toBe(false); 16 | }); 17 | 18 | it("patternToRegExp auto-detects regex and glob", () => { 19 | expect(patternToRegExp("/foo.*/").test("foo123")).toBe(true); 20 | expect(patternToRegExp("foo*bar").test("fooxbar")).toBe(true); 21 | expect(patternToRegExp("foo*bar").test("fooyyybar")).toBe(true); 22 | expect(patternToRegExp("foo*bar").test("foo/bar")).toBe(false); 23 | }); 24 | 25 | it("matchesAnyPattern works for globs and regex", () => { 26 | expect(matchesAnyPattern("foo/abc/bar", ["foo/*/bar"])).toBe(true); 27 | expect(matchesAnyPattern("foo/abc/bar", ["/foo/.*/bar/"])).toBe(true); 28 | expect(matchesAnyPattern("foo/abc/bar", ["baz/*"])).toBe(false); 29 | }); 30 | 31 | it("extractPathAndQuery extracts path and query", () => { 32 | expect(extractPathAndQuery("https://example.com/foo/bar?x=1")).toBe("/foo/bar?x=1"); 33 | expect(extractPathAndQuery("/foo/bar?x=1")).toBe("/foo/bar?x=1"); 34 | }); 35 | 36 | 
it("shouldIncludeUrl applies exclude over include", () => { 37 | // Exclude wins 38 | expect(shouldIncludeUrl("https://x.com/foo", ["foo*"], ["/foo/"])).toBe(false); 39 | // Include only 40 | expect(shouldIncludeUrl("https://x.com/foo", ["foo*"], undefined)).toBe(true); 41 | // No include/exclude 42 | expect(shouldIncludeUrl("https://x.com/foo", undefined, undefined)).toBe(true); 43 | // Exclude only 44 | expect(shouldIncludeUrl("https://x.com/foo", undefined, ["foo*"])).toBe(false); 45 | }); 46 | }); 47 | -------------------------------------------------------------------------------- /src/scraper/utils/patternMatcher.ts: -------------------------------------------------------------------------------- 1 | import { minimatch } from "minimatch"; 2 | 3 | /** 4 | * Utility functions for pattern matching (glob and regex) for URL filtering. 5 | * Supports auto-detection and conversion of glob patterns to RegExp. 6 | * 7 | * Patterns starting and ending with '/' are treated as regex, otherwise as glob (minimatch syntax). 8 | * Glob wildcards supported: '*' (any chars except '/'), '**' (any chars, including '/'). 9 | * 10 | * @module patternMatcher 11 | */ 12 | 13 | /** 14 | * Detects if a pattern is a regex (starts and ends with '/') 15 | */ 16 | export function isRegexPattern(pattern: string): boolean { 17 | return pattern.length > 2 && pattern.startsWith("/") && pattern.endsWith("/"); 18 | } 19 | 20 | /** 21 | * Converts a pattern string to a RegExp instance (auto-detects glob/regex). 22 | * For globs, uses minimatch's internal conversion. 
23 | */ 24 | export function patternToRegExp(pattern: string): RegExp { 25 | if (isRegexPattern(pattern)) { 26 | return new RegExp(pattern.slice(1, -1)); 27 | } 28 | // For globs, minimatch.makeRe returns a RegExp 29 | const re = minimatch.makeRe(pattern, { dot: true }); 30 | if (!re) throw new Error(`Invalid glob pattern: ${pattern}`); 31 | return re; 32 | } 33 | 34 | /** 35 | * Checks if a given path matches any pattern in the list. 36 | * For globs, uses minimatch. For regex, uses RegExp. 37 | */ 38 | export function matchesAnyPattern(path: string, patterns?: string[]): boolean { 39 | if (!patterns || patterns.length === 0) return false; 40 | // Always match from a leading slash for path-based globs 41 | const normalizedPath = path.startsWith("/") ? path : `/${path}`; 42 | return patterns.some((pattern) => { 43 | if (isRegexPattern(pattern)) { 44 | return patternToRegExp(pattern).test(normalizedPath); 45 | } 46 | // minimatch expects no leading slash for relative globs, but we keep it for consistency 47 | // so we strip the leading slash for minimatch 48 | return minimatch(normalizedPath.replace(/^\//, ""), pattern, { dot: true }); 49 | }); 50 | } 51 | 52 | /** 53 | * Extracts the path and query from a URL string (no domain). 54 | */ 55 | export function extractPathAndQuery(url: string): string { 56 | try { 57 | const u = new URL(url); 58 | return u.pathname + (u.search || ""); 59 | } catch { 60 | return url; // fallback: return as-is 61 | } 62 | } 63 | 64 | /** 65 | * Determines if a URL should be included based on include/exclude patterns. 66 | * Exclude patterns take precedence. If no include patterns, all are included by default. 67 | */ 68 | export function shouldIncludeUrl( 69 | url: string, 70 | includePatterns?: string[], 71 | excludePatterns?: string[], 72 | ): boolean { 73 | // Always match from a leading slash for path-based globs 74 | const path = extractPathAndQuery(url); 75 | const normalizedPath = path.startsWith("/") ? 
path : `/${path}`; 76 | // For file:// URLs, also match against the basename (strip leading slash from pattern for basename matching) 77 | let basename: string | undefined; 78 | if (url.startsWith("file://")) { 79 | try { 80 | const u = new URL(url); 81 | basename = u.pathname ? u.pathname.split("/").pop() : undefined; 82 | } catch {} 83 | } 84 | // Helper to strip leading slash from patterns for basename matching 85 | const stripSlash = (patterns?: string[]) => 86 | patterns?.map((p) => (p.startsWith("/") ? p.slice(1) : p)); 87 | // Exclude patterns take precedence 88 | if ( 89 | matchesAnyPattern(normalizedPath, excludePatterns) || 90 | (basename && matchesAnyPattern(basename, stripSlash(excludePatterns))) 91 | ) 92 | return false; 93 | if (!includePatterns || includePatterns.length === 0) return true; 94 | return ( 95 | matchesAnyPattern(normalizedPath, includePatterns) || 96 | (basename ? matchesAnyPattern(basename, stripSlash(includePatterns)) : false) 97 | ); 98 | } 99 | -------------------------------------------------------------------------------- /src/scraper/utils/scope.test.ts: -------------------------------------------------------------------------------- 1 | import { describe, expect, it } from "vitest"; 2 | import { isInScope } from "./scope"; 3 | 4 | describe("isInScope", () => { 5 | const base = new URL("https://docs.example.com/docs/start"); 6 | 7 | it("returns true for subpages in subpages scope", () => { 8 | expect( 9 | isInScope(base, new URL("https://docs.example.com/docs/intro"), "subpages"), 10 | ).toBe(true); 11 | expect( 12 | isInScope(base, new URL("https://docs.example.com/docs/start/child"), "subpages"), 13 | ).toBe(true); 14 | expect(isInScope(base, new URL("https://docs.example.com/docs"), "subpages")).toBe( 15 | false, 16 | ); 17 | expect(isInScope(base, new URL("https://docs.example.com/api"), "subpages")).toBe( 18 | false, 19 | ); 20 | expect(isInScope(base, new URL("https://other.com/docs/start"), "subpages")).toBe( 21 | false, 
22 | ); 23 | }); 24 | 25 | it("returns true for same hostname in hostname scope", () => { 26 | expect( 27 | isInScope(base, new URL("https://docs.example.com/docs/intro"), "hostname"), 28 | ).toBe(true); 29 | expect(isInScope(base, new URL("https://docs.example.com/api"), "hostname")).toBe( 30 | true, 31 | ); 32 | expect(isInScope(base, new URL("https://other.com/docs/start"), "hostname")).toBe( 33 | false, 34 | ); 35 | }); 36 | 37 | it("returns true for same domain in domain scope", () => { 38 | expect( 39 | isInScope(base, new URL("https://docs.example.com/docs/intro"), "domain"), 40 | ).toBe(true); 41 | expect(isInScope(base, new URL("https://api.example.com/"), "domain")).toBe(true); 42 | expect(isInScope(base, new URL("https://other.com/docs/start"), "domain")).toBe( 43 | false, 44 | ); 45 | expect(isInScope(base, new URL("https://example.com/"), "domain")).toBe(true); 46 | }); 47 | 48 | it("returns false for different protocol", () => { 49 | expect( 50 | isInScope(base, new URL("http://docs.example.com/docs/intro"), "hostname"), 51 | ).toBe(false); 52 | expect( 53 | isInScope(base, new URL("ftp://docs.example.com/docs/intro"), "hostname"), 54 | ).toBe(false); 55 | }); 56 | }); 57 | -------------------------------------------------------------------------------- /src/scraper/utils/scope.ts: -------------------------------------------------------------------------------- 1 | // Utility for scope filtering, extracted from WebScraperStrategy 2 | import type { URL } from "node:url"; 3 | 4 | /** 5 | * Returns true if the targetUrl is in scope of the baseUrl for the given scope. 6 | * - "subpages": same hostname, and target path starts with the parent directory of the base path 7 | * - "hostname": same hostname 8 | * - "domain": same top-level domain (e.g. 
example.com) 9 | */ 10 | export function isInScope( 11 | baseUrl: URL, 12 | targetUrl: URL, 13 | scope: "subpages" | "hostname" | "domain", 14 | ): boolean { 15 | if (baseUrl.protocol !== targetUrl.protocol) return false; 16 | switch (scope) { 17 | case "subpages": { 18 | if (baseUrl.hostname !== targetUrl.hostname) return false; 19 | // Use the parent directory of the base path 20 | const baseDir = baseUrl.pathname.endsWith("/") 21 | ? baseUrl.pathname 22 | : baseUrl.pathname.replace(/\/[^/]*$/, "/"); 23 | return targetUrl.pathname.startsWith(baseDir); 24 | } 25 | case "hostname": 26 | return baseUrl.hostname === targetUrl.hostname; 27 | case "domain": { 28 | // Compare the last two segments of the hostname (e.g. example.com) 29 | const getDomain = (host: string) => host.split(".").slice(-2).join("."); 30 | return getDomain(baseUrl.hostname) === getDomain(targetUrl.hostname); 31 | } 32 | default: 33 | return false; 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/splitter/errors.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Base error class for all splitter-related errors 3 | */ 4 | export class SplitterError extends Error {} 5 | 6 | /** 7 | * Thrown when content cannot be split further while maintaining its validity 8 | * (e.g., markdown tables require headers, code blocks require language and backticks) 9 | */ 10 | export class MinimumChunkSizeError extends SplitterError { 11 | constructor(size: number, maxSize: number) { 12 | super( 13 | `Cannot split content any further. 
Content requires minimum chunk size of ${size} bytes, but maximum allowed is ${maxSize} bytes.`,
    );
  }
}

/**
 * Generic error for content splitting failures
 */
export class ContentSplitterError extends SplitterError {}
--------------------------------------------------------------------------------
/src/splitter/index.ts:
--------------------------------------------------------------------------------
export * from "./SemanticMarkdownSplitter";
export * from "./GreedySplitter";
export * from "./errors";
--------------------------------------------------------------------------------
/src/splitter/splitters/CodeContentSplitter.test.ts:
--------------------------------------------------------------------------------
import { describe, expect, it, vi } from "vitest";
import { CodeContentSplitter } from "./CodeContentSplitter";
import type { ContentSplitterOptions } from "./types";

vi.mock("../../utils/logger");

describe("CodeContentSplitter", () => {
  const options = {
    chunkSize: 100,
  } satisfies ContentSplitterOptions;
  const splitter = new CodeContentSplitter(options);

  it("should preserve language in code blocks", async () => {
    const code = `function test() {
  console.log("Hello");
}`;
    const markdown = `\`\`\`typescript\n${code}\n\`\`\``;
    const chunks = await splitter.split(markdown);
    expect(chunks.length).toBe(1);
    expect(chunks[0]).toBe(markdown);
  });

  it("should handle code without language", async () => {
    const code = `const x = 1;
const y = 2;`;
    const markdown = `\`\`\`\n${code}\n\`\`\``;
    const chunks = await splitter.split(markdown);
    expect(chunks.length).toBe(1);
    expect(chunks[0]).toBe(markdown);
  });

  it("should split large code blocks by lines", async () => {
    const longLine =
      "console.log('This is a very long line of code that should be split.');";
    const code = Array(10).fill(longLine).join("\n");

    const markdown = `\`\`\`javascript\n${code}\n\`\`\``;
    const chunks = await splitter.split(markdown);
    expect(chunks.length).toBeGreaterThan(1);
    for (const chunk of chunks) {
      expect(chunk.length).toBeLessThanOrEqual(options.chunkSize);
      expect(chunk.startsWith("```javascript\n")).toBe(true);
      expect(chunk.endsWith("\n```")).toBe(true);
    }
  });

  it("should handle empty code blocks", async () => {
    const markdown = "```python\n\n```";
    const chunks = await splitter.split(markdown);
    expect(chunks.length).toBe(1);
    expect(chunks[0]).toBe(markdown);
  });

  it("should preserve indentation", async () => {
    const code = `function test() {
  if (condition) {
    for (let i = 0; i < 10; i++) {
      console.log(i);
    }
  }
}`;
    const markdown = `\`\`\`typescript\n${code}\n\`\`\``;
    const chunks = await splitter.split(markdown);
    for (const chunk of chunks) {
      // Check if indentation is preserved within the chunk
      const lines = chunk.split("\n");
      for (let i = 1; i < lines.length - 1; i++) {
        // Skip the first (```typescript) and last (```) lines
        if (lines[i].includes("if")) {
          expect(lines[i].startsWith("  "));
        } else if (lines[i].includes("for")) {
          expect(lines[i].startsWith("    "));
        } else if (lines[i].includes("console")) {
          expect(lines[i].startsWith("      "));
        }
      }
    }
  });
});
--------------------------------------------------------------------------------
/src/splitter/splitters/CodeContentSplitter.ts:
--------------------------------------------------------------------------------
import { MinimumChunkSizeError } from "../errors";
import type { ContentSplitter, ContentSplitterOptions } from "./types";

/**
 * Splits code content while preserving language information and formatting.
 * Uses line boundaries for splitting and ensures each chunk is properly
 * wrapped with language-specific code block markers.
 */
export class CodeContentSplitter implements ContentSplitter {
  constructor(private options: ContentSplitterOptions) {}

  /**
   * Splits a fenced code block into chunks no larger than chunkSize,
   * re-wrapping each chunk in a fence that repeats the original language.
   *
   * @throws MinimumChunkSizeError when a single wrapped line already exceeds chunkSize
   */
  async split(content: string): Promise<string[]> {
    // Determine language and strip triple backticks from content
    // NOTE(review): `\w+` will not match languages containing '+', '-' or '#'
    // (e.g. "c++", "objective-c") — confirm whether such fences can reach here.
    const language = content.match(/^```(\w+)\n/)?.[1];
    const strippedContent = content.replace(/^```(\w*)\n/, "").replace(/```\s*$/, "");

    const lines = strippedContent.split("\n");
    const chunks: string[] = [];
    let currentChunkLines: string[] = [];

    for (const line of lines) {
      // Check if a single line with code block markers exceeds chunkSize
      const singleLineSize = this.wrap(line, language).length;
      if (singleLineSize > this.options.chunkSize) {
        throw new MinimumChunkSizeError(singleLineSize, this.options.chunkSize);
      }

      currentChunkLines.push(line);
      const newChunkContent = this.wrap(currentChunkLines.join("\n"), language);
      const newChunkSize = newChunkContent.length;

      if (newChunkSize > this.options.chunkSize && currentChunkLines.length > 1) {
        // Overflow: emit everything except the line just added, then start
        // the next chunk with that line.
        const lastLine = currentChunkLines.pop();
        chunks.push(this.wrap(currentChunkLines.join("\n"), language));
        currentChunkLines = [lastLine as string];
      }
    }

    if (currentChunkLines.length > 0) {
      chunks.push(this.wrap(currentChunkLines.join("\n"), language));
    }

    return chunks;
  }

  /** Re-wraps chunk content in a code fence, trimming trailing newlines. */
  protected wrap(content: string, language?: string | null): string {
    return `\`\`\`${language || ""}\n${content.replace(/\n+$/, "")}\n\`\`\``;
  }
}
--------------------------------------------------------------------------------
/src/splitter/splitters/TableContentSplitter.test.ts:
-------------------------------------------------------------------------------- 1 | import { describe, expect, it, vi } from "vitest"; 2 | import { MinimumChunkSizeError } from "../errors"; 3 | import { TableContentSplitter } from "./TableContentSplitter"; 4 | import type { ContentSplitterOptions } from "./types"; 5 | 6 | vi.mock("../../utils/logger"); 7 | 8 | describe("TableContentSplitter", () => { 9 | const options = { 10 | chunkSize: 100, 11 | } satisfies ContentSplitterOptions; 12 | const splitter = new TableContentSplitter(options); 13 | 14 | it("should preserve table headers in each chunk", async () => { 15 | const table = `| Column 1 | Column 2 | Column 3 | 16 | |----------|-----------|-----------| 17 | | Data A1 | Data A2 | Data A3 | 18 | | Data B1 | Data B2 | Data B3 |`; 19 | 20 | const chunks = await splitter.split(table); 21 | 22 | for (const chunk of chunks) { 23 | const lines = chunk.split("\n"); 24 | expect(lines[0]).toBe("| Column 1 | Column 2 | Column 3 |"); 25 | expect(lines[1]).toBe("|---|---|---|"); 26 | } 27 | }); 28 | 29 | it("should split large tables by rows", async () => { 30 | // Create a large table that *might* exceed chunkSize, depending on header length 31 | const rows = Array(20) 32 | .fill(0) 33 | .map((_, i) => `| Data ${i}A | Data ${i}B |`); 34 | const table = `| Header A | Header B | 35 | |----------|-----------| 36 | ${rows.join("\n")}`; 37 | 38 | const chunks = await splitter.split(table); 39 | expect(chunks.length).toBeGreaterThan(0); // It will split, even if not > 1 40 | for (const chunk of chunks) { 41 | const lines = chunk.split("\n"); 42 | expect(lines[0]).toBe("| Header A | Header B |"); 43 | expect(lines[1]).toBe("|---|---|"); 44 | } 45 | }); 46 | 47 | it("should throw MinimumChunkSizeError if single row with headers exceeds chunkSize", async () => { 48 | const splitter = new TableContentSplitter({ 49 | chunkSize: 50, // Small size for testing 50 | }); 51 | const table = `| Header A | Header B | Header C | 52 | 
|----------|-----------|-----------|
| Very long data that exceeds max chunk size with headers | More data | And more |`;

    await expect(splitter.split(table)).rejects.toThrow(MinimumChunkSizeError);

    await expect(splitter.split(table)).rejects.toThrowError(
      "Cannot split content any further",
    );
  });

  it("should handle empty table", async () => {
    const splitter = new TableContentSplitter(options);
    const table = "";
    const chunks = await splitter.split(table);
    expect(chunks.length).toBe(1);
    expect(chunks[0]).toBe("");
  });

  it("should preserve special characters", async () => {
    const splitter = new TableContentSplitter(options);
    const table = `| Symbol | Description |
|---------|-------------|
| → | Arrow |
| 👋 | Wave |
| © | Copyright |
| | HTML Tag |`;

    const chunks = await splitter.split(table);
    const allContent = chunks.join("");
    expect(allContent).toContain("→");
    expect(allContent).toContain("👋");
    expect(allContent).toContain("©");
    expect(allContent).toContain("");
  });
});
--------------------------------------------------------------------------------
/src/splitter/splitters/TableContentSplitter.ts:
--------------------------------------------------------------------------------
import { MinimumChunkSizeError } from "../errors";
import type { ContentSplitter, ContentSplitterOptions } from "./types";

/**
 * Interface representing the structure of a parsed markdown table
 */
interface ParsedTable {
  headers: string[];
  separator: string;
  rows: string[];
}

/**
 * Splits table content while preserving headers and table formatting.
 * Each chunk maintains the table structure with headers and separator row.
 */
export class TableContentSplitter implements ContentSplitter {
  constructor(private options: ContentSplitterOptions) {}

  /**
   * Splits table content into chunks while preserving table structure.
   * Every emitted chunk is a complete markdown table (header + separator + rows).
   *
   * @param content Raw markdown table text.
   * @returns The chunks; content that does not parse as a table is returned
   *   unchanged as a single chunk.
   * @throws {MinimumChunkSizeError} If a single row (with headers) already
   *   exceeds the configured chunkSize — such content cannot be split further.
   */
  async split(content: string): Promise<string[]> {
    const parsedTable = this.parseTable(content);
    if (!parsedTable) {
      // Not a well-formed table; pass through untouched.
      return [content];
    }

    const { headers, rows } = parsedTable;

    const chunks: string[] = [];
    let currentRows: string[] = [];

    for (const row of rows) {
      // A row that does not fit even on its own cannot be split further.
      const singleRowSize = this.wrap(row, headers).length;
      if (singleRowSize > this.options.chunkSize) {
        throw new MinimumChunkSizeError(singleRowSize, this.options.chunkSize);
      }

      const newChunkContent = this.wrap([...currentRows, row].join("\n"), headers);
      const newChunkSize = newChunkContent.length;
      if (newChunkSize > this.options.chunkSize && currentRows.length > 0) {
        // Adding this row would overflow: flush the current chunk, start a new one.
        chunks.push(this.wrap(currentRows.join("\n"), headers));
        currentRows = [row];
      } else {
        currentRows.push(row);
      }
    }

    if (currentRows.length > 0) {
      chunks.push(this.wrap(currentRows.join("\n"), headers));
    }

    // No merging of table chunks
    return chunks;
  }

  /**
   * Prepends the header and separator rows so each chunk is a valid table.
   */
  protected wrap(content: string, headers: string[]): string {
    const headerRow = `| ${headers.join(" | ")} |`;
    const separatorRow = `|${headers.map(() => "---").join("|")}|`;
    return [headerRow, separatorRow, content].join("\n");
  }

  /**
   * Parses the table into headers, separator and raw data-row strings.
   * Returns null when the content is not recognizable as a markdown table.
   */
  private parseTable(content: string): ParsedTable | null {
    const lines = content.trim().split("\n");
    if (lines.length < 3) return null; // Need at least headers, separator, and one data row

    const headers = this.parseRow(lines[0]);
    if (!headers) return null;

    const separator = lines[1];
    if (!this.isValidSeparator(separator)) return null;

    const rows = lines.slice(2).filter((row) => row.trim() !== "");

    return { headers, separator, rows };
  }

  /**
   * Parses a table row into cells.
   *
   * Only the empty entries produced by the leading/trailing `|` delimiters are
   * removed; interior empty cells are preserved. (Previously ALL empty cells
   * were filtered out, so a header like `| A | | B |` lost its middle column
   * and every wrapped chunk ended up with misaligned headers.)
   */
  private parseRow(row: string): string[] | null {
    const trimmed = row.trim();
    if (!trimmed.includes("|")) return null;
    const cells = trimmed.split("|").map((cell) => cell.trim());
    // `| a | b |`.split("|") yields "" at both ends; drop only those.
    if (trimmed.startsWith("|")) cells.shift();
    if (trimmed.endsWith("|")) cells.pop();
    return cells;
  }

  /**
   * Validates the separator row of the table
   */
  private isValidSeparator(separator: string): boolean {
    return separator.includes("|") && /^\|?[\s-|]+\|?$/.test(separator);
  }
}
--------------------------------------------------------------------------------
/src/splitter/splitters/TextContentSplitter.test.ts:
--------------------------------------------------------------------------------
import { describe, expect, it, vi } from "vitest";
import { TextContentSplitter } from "./TextContentSplitter";
import type { ContentSplitterOptions } from "./types";

vi.mock("../../utils/logger");

describe("TextContentSplitter", () => {
  const options = {
    chunkSize: 100,
  } satisfies ContentSplitterOptions;
  const splitter = new TextContentSplitter(options);

  it("should split on paragraph boundaries when possible", async () => {
    const text = `First paragraph with some content.

Second paragraph that continues the text.

Third paragraph to complete the example.`;

    const chunks = await splitter.split(text);

    expect(chunks.length).toBe(3);
    expect(chunks[0]).toBe("First paragraph with some content.");
    expect(chunks[1]).toBe("Second paragraph that continues the text.");
    expect(chunks[2]).toBe("Third paragraph to complete the example.");
  });

  it("should fall back to line breaks when paragraphs too large", async () => {
    // Create a paragraph larger than preferredChunkSize
    const longParagraph = Array(5)
      .fill("This is a very long line of text that should be split.")
      .join(" ");

    const text = `${longParagraph}
Line two of the text.
Line three continues here.
And line four finishes it.`;

    const chunks = await splitter.split(text);

    // Should split into multiple chunks at line boundaries
    expect(chunks.length).toBeGreaterThan(1);
    for (const chunk of chunks) {
      expect(chunk.length).toBeLessThanOrEqual(options.chunkSize);
    }
  });

  it("should merge small chunks when possible", async () => {
    const text =
      "Short line 1.\nShort line 2.\nShort line 3.\n\nAnother short one.\nAnd another.";

    const chunks = await splitter.split(text);

    // Small consecutive lines should be merged
    expect(chunks.length).toBeLessThan(6); // Less than total number of lines
    for (const chunk of chunks) {
      expect(chunk.length).toBeLessThanOrEqual(options.chunkSize);
    }
  });

  it("should handle empty content gracefully", async () => {
    // Both truly empty and whitespace-only input collapse to a single empty chunk.
    const emptyChunks = await splitter.split("");
    expect(emptyChunks.length).toBe(1);
    expect(emptyChunks[0]).toBe("");

    const whitespaceChunks = await splitter.split(" \n \n ");
    expect(whitespaceChunks.length).toBe(1);
    expect(whitespaceChunks[0]).toBe("");
  });

  it("should split words as last resort", async () => {
    // Dedicated splitter instance: chunkSize far below any sentence length.
    const splitter = new TextContentSplitter({
      chunkSize: 20, // Very small for testing word splitting
    });

    const text =
      "This is a very long sentence that needs to be split into smaller chunks";

    const chunks = await splitter.split(text);

    expect(chunks.length).toBeGreaterThan(1);
    for (const chunk of chunks) {
      expect(chunk.length).toBeLessThanOrEqual(20);
    }
  });
});
--------------------------------------------------------------------------------
/src/splitter/splitters/types.ts:
--------------------------------------------------------------------------------
/**
 * Common configuration options for content splitters
 */
export interface ContentSplitterOptions {
  /** Maximum characters per chunk */
  chunkSize: number;
}

/**
 * Core interface for content splitters
 */
export interface ContentSplitter {
  /** Split content into chunks respecting size constraints */
  split(content: string): Promise<string[]>;
}
--------------------------------------------------------------------------------
/src/splitter/types.ts:
--------------------------------------------------------------------------------
/**
 * Types of content within a document section
 */
export type SectionContentType = "text" | "code" | "table" | "heading";

/**
 * Final output chunk after processing and size-based splitting
 */
export interface ContentChunk {
  // All content types present in this chunk (a chunk may mix e.g. text + code).
  types: SectionContentType[];
  content: string;
  // Position of the chunk within the document's heading hierarchy.
  section: {
    level: number;
    path: string[];
  };
}

/**
 * Interface for a splitter that processes markdown content into chunks
 */
export interface DocumentSplitter {
  splitText(markdown: string): Promise<ContentChunk[]>;
}
--------------------------------------------------------------------------------
/src/store/embeddings/FixedDimensionEmbeddings.test.ts:
--------------------------------------------------------------------------------
import { Embeddings } from "@langchain/core/embeddings";
import { describe, expect, test, vi } from "vitest";
import { DimensionError } from "../errors";
import { VECTOR_DIMENSION } from "../types";
import { FixedDimensionEmbeddings } from "./FixedDimensionEmbeddings";

// Suppress logger output during tests
vi.mock("../../utils/logger");

// Mock embedding models that produce vectors of different sizes
class MockBaseEmbeddings extends Embeddings {
  constructor(private dimension: number) {
    super({});
  }

  // Returns a single vector of `dimension` ones.
  async embedQuery(_text: string): Promise<number[]> {
    return Array(this.dimension).fill(1);
  }

  // Always returns exactly ONE vector regardless of input count (kept simple on purpose).
  async embedDocuments(_documents: string[]): Promise<number[][]> {
    return [Array(this.dimension).fill(1)];
  }
}

describe("FixedDimensionEmbeddings", () => {
  const targetDimension = VECTOR_DIMENSION;

  test("should pass through vectors of correct dimension", async () => {
    const base = new MockBaseEmbeddings(targetDimension);
    const wrapper = new FixedDimensionEmbeddings(base, targetDimension, "test:model");

    const vector = await wrapper.embedQuery("test");
    expect(vector.length).toBe(targetDimension);
  });

  test("should pad vectors that are too short", async () => {
    const shortDimension = 1024;
    const base = new MockBaseEmbeddings(shortDimension);
    const wrapper = new FixedDimensionEmbeddings(base, targetDimension, "test:model");

    const vector = await wrapper.embedQuery("test");
    expect(vector.length).toBe(targetDimension);
    // Check that first part contains the original values
    expect(vector.slice(0, shortDimension)).toEqual(Array(shortDimension).fill(1));
    // Check that padding is zeros
    expect(vector.slice(shortDimension)).toEqual(
      Array(targetDimension - shortDimension).fill(0),
    );
  });

  test("should truncate oversized vectors when allowTruncate is true", async () => {
    const largeDimension = 2048;
    const base = new MockBaseEmbeddings(largeDimension);
    const wrapper = new FixedDimensionEmbeddings(
      base,
      targetDimension,
      "test:model",
      true, // allowTruncate
    );

    const vector = await wrapper.embedQuery("test");
    expect(vector.length).toBe(targetDimension);
    expect(vector).toEqual(Array(targetDimension).fill(1));
  });

  test("should throw DimensionError for oversized vectors when allowTruncate is false", async () => {
    const largeDimension = 3072;
    const base = new MockBaseEmbeddings(largeDimension);
    const wrapper = new FixedDimensionEmbeddings(base, targetDimension, "test:model");

    await expect(() => wrapper.embedQuery("test")).rejects.toThrow(DimensionError);
  });

  test("should process multiple documents correctly", async () => {
    const shortDimension = 1024;
    const base = new MockBaseEmbeddings(shortDimension);
    const wrapper = new FixedDimensionEmbeddings(base, targetDimension, "test:model");

    const vectors = await wrapper.embedDocuments(["test1", "test2"]);
    expect(vectors.length).toBe(1); // Our mock returns just one vector
    expect(vectors[0].length).toBe(targetDimension);
    // Check padding
    expect(vectors[0].slice(shortDimension)).toEqual(
      Array(targetDimension - shortDimension).fill(0),
    );
  });
});
--------------------------------------------------------------------------------
/src/store/embeddings/FixedDimensionEmbeddings.ts:
--------------------------------------------------------------------------------
import { Embeddings } from "@langchain/core/embeddings";
import { DimensionError } from "../errors";

/**
 * Wrapper around an Embeddings implementation that ensures vectors have a fixed dimension.
6 | * - If a vector's dimension is greater than the target and truncation is allowed, 7 | * the vector is truncated (e.g., for models that support MRL - Matryoshka 8 | * Representation Learning). 9 | * - If a vector's dimension is greater than the target and truncation is not 10 | * allowed, a DimensionError is thrown. 11 | * - If a vector's dimension is less than the target, it is padded with zeros. 12 | */ 13 | export class FixedDimensionEmbeddings extends Embeddings { 14 | private provider: string; 15 | private model: string; 16 | 17 | constructor( 18 | private readonly embeddings: Embeddings, 19 | private readonly targetDimension: number, 20 | providerAndModel: string, 21 | private readonly allowTruncate: boolean = false, 22 | ) { 23 | super({}); 24 | // Parse provider and model from string (e.g., "gemini:embedding-001" or just "text-embedding-3-small") 25 | const [providerOrModel, modelName] = providerAndModel.split(":"); 26 | this.provider = modelName ? providerOrModel : "openai"; // Default to openai if no provider specified 27 | this.model = modelName || providerOrModel; 28 | } 29 | 30 | /** 31 | * Normalize a vector to the target dimension by truncating (for MRL models) or padding. 
32 | * @throws {DimensionError} If vector is too large and provider doesn't support MRL 33 | */ 34 | private normalizeVector(vector: number[]): number[] { 35 | const dimension = vector.length; 36 | 37 | if (dimension > this.targetDimension) { 38 | // If truncation is allowed (e.g., for MRL models like Gemini), truncate the vector 39 | if (this.allowTruncate) { 40 | return vector.slice(0, this.targetDimension); 41 | } 42 | // Otherwise, throw an error 43 | throw new DimensionError( 44 | `${this.provider}:${this.model}`, 45 | dimension, 46 | this.targetDimension, 47 | ); 48 | } 49 | 50 | if (dimension < this.targetDimension) { 51 | // Pad with zeros to reach target dimension 52 | return [...vector, ...new Array(this.targetDimension - dimension).fill(0)]; 53 | } 54 | 55 | return vector; 56 | } 57 | 58 | async embedQuery(text: string): Promise { 59 | const vector = await this.embeddings.embedQuery(text); 60 | return this.normalizeVector(vector); 61 | } 62 | 63 | async embedDocuments(documents: string[]): Promise { 64 | const vectors = await this.embeddings.embedDocuments(documents); 65 | return vectors.map((vector) => this.normalizeVector(vector)); 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /src/store/errors.ts: -------------------------------------------------------------------------------- 1 | class StoreError extends Error { 2 | constructor( 3 | message: string, 4 | public readonly cause?: unknown, 5 | ) { 6 | super(cause ? `${message} caused by ${cause}` : message); 7 | this.name = this.constructor.name; 8 | 9 | const causeError = 10 | cause instanceof Error ? cause : cause ? 
new Error(String(cause)) : undefined;
    // NOTE(review): adopting the cause's stack replaces this error's own
    // stack trace — presumably intentional to surface the original failure
    // site; confirm before changing.
    if (causeError?.stack) {
      this.stack = causeError.stack;
    }
  }
}

// Raised when an embedding model's output exceeds the database's fixed vector size.
class DimensionError extends StoreError {
  constructor(
    public readonly modelName: string,
    public readonly modelDimension: number,
    public readonly dbDimension: number,
  ) {
    super(
      `Model "${modelName}" produces ${modelDimension}-dimensional vectors, ` +
        `which exceeds the database's fixed dimension of ${dbDimension}. ` +
        `Please use a model with dimension ≤ ${dbDimension}.`,
    );
  }
}

class ConnectionError extends StoreError {}

class DocumentNotFoundError extends StoreError {
  constructor(public readonly id: string) {
    super(`Document ${id} not found`);
  }
}

export { StoreError, ConnectionError, DocumentNotFoundError, DimensionError };
--------------------------------------------------------------------------------
/src/store/index.ts:
--------------------------------------------------------------------------------
export * from "./DocumentStore";
export * from "./DocumentManagementService";
export * from "./errors";
--------------------------------------------------------------------------------
/src/store/types.ts:
--------------------------------------------------------------------------------
import type { DocumentMetadata } from "../types";

/** Default vector dimension used across the application */
export const VECTOR_DIMENSION = 1536;

/**
 * Database document record type matching the documents table schema
 */
export interface DbDocument {
  id: string;
  library: string;
  version: string;
  url: string;
  content: string;
  metadata: string; // JSON string of DocumentMetadata
  embedding: string | null; // JSON string of number[]
  sort_order: number;
  score: number | null;
}

/**
 * Utility type for handling SQLite query results that may be undefined
 */
export type DbQueryResult<T> = T | undefined;

/**
 * Maps raw database document to the Document type used by the application.
 * Note: metadata is parsed from its JSON string form; a malformed string
 * will throw from JSON.parse.
 */
export function mapDbDocumentToDocument(doc: DbDocument) {
  return {
    id: doc.id,
    pageContent: doc.content,
    metadata: JSON.parse(doc.metadata) as DocumentMetadata,
  };
}

/**
 * Search result type returned by the DocumentRetrieverService
 */
export interface StoreSearchResult {
  url: string;
  content: string;
  score: number | null;
}

/**
 * Represents a library and its indexed versions.
 */
export interface LibraryVersion {
  version: string;
}

/**
 * Detailed information about a specific indexed library version.
 */
export interface LibraryVersionDetails {
  version: string;
  documentCount: number;
  uniqueUrlCount: number;
  indexedAt: string | null; // ISO 8601 format from MIN(indexed_at)
}

/**
 * Result type for findBestVersion, indicating the best semver match
 * and whether unversioned documents exist.
 */
export interface FindVersionResult {
  bestMatch: string | null;
  hasUnversioned: boolean;
}
--------------------------------------------------------------------------------
/src/tools/CancelJobTool.ts:
--------------------------------------------------------------------------------
import type { PipelineManager } from "../pipeline/PipelineManager";
import { PipelineJobStatus } from "../pipeline/types";
import { logger } from "../utils/logger";

/**
 * Input parameters for the CancelJobTool.
 */
export interface CancelJobInput {
  /** The ID of the job to cancel. */
  jobId: string;
}

/**
 * Output result for the CancelJobTool.
15 | */ 16 | export interface CancelJobResult { 17 | /** A message indicating the outcome of the cancellation attempt. */ 18 | message: string; 19 | /** Indicates if the cancellation request was successfully initiated or if the job was already finished/cancelled. */ 20 | success: boolean; 21 | } 22 | 23 | /** 24 | * Tool for attempting to cancel a pipeline job. 25 | */ 26 | export class CancelJobTool { 27 | private manager: PipelineManager; 28 | 29 | /** 30 | * Creates an instance of CancelJobTool. 31 | * @param manager The PipelineManager instance. 32 | */ 33 | constructor(manager: PipelineManager) { 34 | this.manager = manager; 35 | } 36 | 37 | /** 38 | * Executes the tool to attempt cancellation of a specific job. 39 | * @param input - The input parameters, containing the jobId. 40 | * @returns A promise that resolves with the outcome message. 41 | */ 42 | async execute(input: CancelJobInput): Promise { 43 | try { 44 | // Retrieve the job first to check its status before attempting cancellation 45 | const job = await this.manager.getJob(input.jobId); 46 | 47 | if (!job) { 48 | logger.warn(`❓ [CancelJobTool] Job not found: ${input.jobId}`); 49 | return { 50 | message: `Job with ID ${input.jobId} not found.`, 51 | success: false, 52 | }; 53 | } 54 | 55 | // Check if the job is already in a final state 56 | if ( 57 | job.status === PipelineJobStatus.COMPLETED || // Use enum member 58 | job.status === PipelineJobStatus.FAILED || // Use enum member 59 | job.status === PipelineJobStatus.CANCELLED // Use enum member 60 | ) { 61 | logger.debug(`Job ${input.jobId} is already in a final state: ${job.status}.`); 62 | return { 63 | message: `Job ${input.jobId} is already ${job.status}. 
No action taken.`, 64 | success: true, // Considered success as no cancellation needed 65 | }; 66 | } 67 | 68 | // Attempt cancellation 69 | await this.manager.cancelJob(input.jobId); 70 | 71 | // Re-fetch the job to confirm status change (or check status directly if cancelJob returned it) 72 | // PipelineManager.cancelJob doesn't return status, so re-fetch is needed for confirmation. 73 | const updatedJob = await this.manager.getJob(input.jobId); 74 | const finalStatus = updatedJob?.status ?? "UNKNOWN (job disappeared?)"; 75 | 76 | logger.debug( 77 | `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}`, 78 | ); 79 | return { 80 | message: `Cancellation requested for job ${input.jobId}. Current status: ${finalStatus}.`, 81 | success: true, 82 | }; 83 | } catch (error) { 84 | logger.error(`❌ Error cancelling job ${input.jobId}: ${error}`); 85 | return { 86 | message: `Failed to cancel job ${input.jobId}: ${ 87 | error instanceof Error ? error.message : String(error) 88 | }`, 89 | success: false, 90 | }; 91 | } 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/tools/ClearCompletedJobsTool.test.ts: -------------------------------------------------------------------------------- 1 | import { type Mock, beforeEach, describe, expect, it, vi } from "vitest"; 2 | import type { PipelineManager } from "../pipeline/PipelineManager"; 3 | import { ClearCompletedJobsTool } from "./ClearCompletedJobsTool"; 4 | 5 | // Mock dependencies 6 | vi.mock("../pipeline/PipelineManager"); 7 | vi.mock("../utils/logger"); 8 | 9 | describe("ClearCompletedJobsTool", () => { 10 | let mockManagerInstance: Partial; 11 | let clearCompletedJobsTool: ClearCompletedJobsTool; 12 | 13 | beforeEach(() => { 14 | vi.resetAllMocks(); 15 | 16 | // Define the mock implementation for the manager instance 17 | mockManagerInstance = { 18 | clearCompletedJobs: vi.fn().mockResolvedValue(0), // Default to no jobs cleared 19 | }; 20 | 
    // Instantiate the tool with the correctly typed mock instance
    clearCompletedJobsTool = new ClearCompletedJobsTool(
      mockManagerInstance as PipelineManager,
    );
  });

  it("should call manager.clearCompletedJobs", async () => {
    await clearCompletedJobsTool.execute({});
    expect(mockManagerInstance.clearCompletedJobs).toHaveBeenCalledOnce();
  });

  it("should return success: true with count when jobs are cleared", async () => {
    const clearedCount = 3;
    (mockManagerInstance.clearCompletedJobs as Mock).mockResolvedValue(clearedCount);

    const result = await clearCompletedJobsTool.execute({});

    expect(mockManagerInstance.clearCompletedJobs).toHaveBeenCalledOnce();
    expect(result.success).toBe(true);
    expect(result.clearedCount).toBe(clearedCount);
    expect(result.message).toContain("Successfully cleared 3 completed jobs");
  });

  it("should return success: true with singular message when 1 job is cleared", async () => {
    const clearedCount = 1;
    (mockManagerInstance.clearCompletedJobs as Mock).mockResolvedValue(clearedCount);

    const result = await clearCompletedJobsTool.execute({});

    expect(result.success).toBe(true);
    expect(result.clearedCount).toBe(clearedCount);
    expect(result.message).toContain("Successfully cleared 1 completed job");
    expect(result.message).not.toContain("jobs"); // Should be singular
  });

  it("should return success: true with appropriate message when no jobs are cleared", async () => {
    const clearedCount = 0;
    (mockManagerInstance.clearCompletedJobs as Mock).mockResolvedValue(clearedCount);

    const result = await clearCompletedJobsTool.execute({});

    expect(result.success).toBe(true);
    expect(result.clearedCount).toBe(clearedCount);
    expect(result.message).toBe("No completed jobs to clear.");
  });

  it("should return success: false if clearCompletedJobs throws an error", async () => {
    const clearError = new Error("Clear operation failed");
    (mockManagerInstance.clearCompletedJobs as Mock).mockRejectedValue(clearError);

    const result = await clearCompletedJobsTool.execute({});

    expect(mockManagerInstance.clearCompletedJobs).toHaveBeenCalledOnce();
    expect(result.success).toBe(false);
    expect(result.clearedCount).toBe(0);
    expect(result.message).toContain("Failed to clear completed jobs");
    expect(result.message).toContain(clearError.message);
  });

  it("should handle non-Error exceptions gracefully", async () => {
    // Rejecting with a plain string exercises the String(error) fallback path.
    const clearError = "String error message";
    (mockManagerInstance.clearCompletedJobs as Mock).mockRejectedValue(clearError);

    const result = await clearCompletedJobsTool.execute({});

    expect(result.success).toBe(false);
    expect(result.clearedCount).toBe(0);
    expect(result.message).toContain("Failed to clear completed jobs");
    expect(result.message).toContain(clearError);
  });
});
--------------------------------------------------------------------------------
/src/tools/ClearCompletedJobsTool.ts:
--------------------------------------------------------------------------------
import type { PipelineManager } from "../pipeline/PipelineManager";
import { logger } from "../utils/logger";

/**
 * Input parameters for the ClearCompletedJobsTool.
 */
// biome-ignore lint/suspicious/noEmptyInterface: No input parameters needed for this tool
export interface ClearCompletedJobsInput {
  // No input parameters needed for this tool
}

/**
 * Output result for the ClearCompletedJobsTool.
 */
export interface ClearCompletedJobsResult {
  /** A message indicating the outcome of the clear operation. */
  message: string;
  /** Indicates if the clear operation was successful. */
  success: boolean;
  /** The number of jobs that were cleared.
*/ 21 | clearedCount: number; 22 | } 23 | 24 | /** 25 | * Tool for clearing all completed, cancelled, and failed jobs from the pipeline. 26 | * This helps keep the job queue clean by removing jobs that are no longer active. 27 | */ 28 | export class ClearCompletedJobsTool { 29 | private manager: PipelineManager; 30 | 31 | /** 32 | * Creates an instance of ClearCompletedJobsTool. 33 | * @param manager The PipelineManager instance. 34 | */ 35 | constructor(manager: PipelineManager) { 36 | this.manager = manager; 37 | } 38 | 39 | /** 40 | * Executes the tool to clear all completed jobs from the pipeline. 41 | * @param input - The input parameters (currently unused). 42 | * @returns A promise that resolves with the outcome of the clear operation. 43 | */ 44 | async execute(input: ClearCompletedJobsInput): Promise { 45 | try { 46 | const clearedCount = await this.manager.clearCompletedJobs(); 47 | 48 | const message = 49 | clearedCount > 0 50 | ? `Successfully cleared ${clearedCount} completed job${clearedCount === 1 ? "" : "s"} from the queue.` 51 | : "No completed jobs to clear."; 52 | 53 | logger.debug(`[ClearCompletedJobsTool] ${message}`); 54 | 55 | return { 56 | message, 57 | success: true, 58 | clearedCount, 59 | }; 60 | } catch (error) { 61 | const errorMessage = `Failed to clear completed jobs: ${ 62 | error instanceof Error ? 
error.message : String(error) 63 | }`; 64 | 65 | logger.error(`❌ [ClearCompletedJobsTool] ${errorMessage}`); 66 | 67 | return { 68 | message: errorMessage, 69 | success: false, 70 | clearedCount: 0, 71 | }; 72 | } 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/tools/FindVersionTool.ts: -------------------------------------------------------------------------------- 1 | import type { DocumentManagementService } from "../store"; 2 | import { logger } from "../utils/logger"; 3 | import { VersionNotFoundError } from "./errors"; 4 | 5 | export interface FindVersionToolOptions { 6 | library: string; 7 | targetVersion?: string; 8 | } 9 | 10 | /** 11 | * Tool for finding the best matching version of a library in the store. 12 | * Supports exact version matches and X-Range patterns (e.g., '5.x', '5.2.x'). 13 | */ 14 | export class FindVersionTool { 15 | private docService: DocumentManagementService; 16 | 17 | constructor(docService: DocumentManagementService) { 18 | this.docService = docService; 19 | } 20 | 21 | /** 22 | * Executes the tool to find the best matching version and checks for unversioned docs. 23 | * @returns A descriptive string indicating the best match and unversioned status, or an error message. 24 | */ 25 | async execute(options: FindVersionToolOptions): Promise { 26 | const { library, targetVersion } = options; 27 | const targetVersionString = targetVersion ? 
`@${targetVersion}` : "";

    try {
      const { bestMatch, hasUnversioned } = await this.docService.findBestVersion(
        library,
        targetVersion,
      );

      // Build a human-readable summary covering every match/unversioned combination.
      let message = "";
      if (bestMatch) {
        message = `Best match: ${bestMatch}.`;
        if (hasUnversioned) {
          message += " Unversioned docs also available.";
        }
      } else if (hasUnversioned) {
        message = `No matching version found for ${library}${targetVersionString}, but unversioned docs exist.`;
      } else {
        // This case should ideally be caught by VersionNotFoundError below,
        // but added for completeness.
        message = `No matching version or unversioned documents found for ${library}${targetVersionString}.`;
      }
      return message;
    } catch (error) {
      if (error instanceof VersionNotFoundError) {
        // This error is thrown when no semver versions AND no unversioned docs exist.
        logger.info(`ℹ️ Version not found: ${error.message}`);
        return `No matching version or unversioned documents found for ${library}${targetVersionString}. Available: ${
          error.availableVersions.length > 0
            ? error.availableVersions.map((v) => v.version).join(", ")
            : "None"
        }.`;
      }
      // Re-throw unexpected errors
      logger.error(
        `❌ Error finding version for ${library}${targetVersionString}: ${error instanceof Error ? error.message : error}`,
      );
      throw error;
    }
  }
}
--------------------------------------------------------------------------------
/src/tools/GetJobInfoTool.test.ts:
--------------------------------------------------------------------------------
import { type Mock, beforeEach, describe, expect, it, vi } from "vitest";
import type { PipelineManager } from "../pipeline/PipelineManager";
import { type PipelineJob, PipelineJobStatus } from "../pipeline/types";
import type { ScraperOptions } from "../scraper/types";
import { GetJobInfoTool } from "./GetJobInfoTool"; // Updated import

// Mock dependencies
vi.mock("../pipeline/PipelineManager");
vi.mock("../utils/logger");

describe("GetJobInfoTool", () => {
  // Updated describe block
  let mockManagerInstance: Partial<PipelineManager>;
  let getJobInfoTool: GetJobInfoTool; // Updated variable name

  const MOCK_JOB_ID_FOUND = "job-found-123";
  const MOCK_JOB_ID_NOT_FOUND = "job-not-found-456";

  // A RUNNING job: startedAt set, finishedAt/error still null.
  const mockJob: PipelineJob = {
    id: MOCK_JOB_ID_FOUND,
    library: "lib-a",
    version: "1.0.0",
    status: PipelineJobStatus.RUNNING,
    createdAt: new Date("2023-01-01T10:00:00Z"),
    startedAt: new Date("2023-01-01T10:05:00Z"),
    options: { library: "lib-a", version: "1.0.0", url: "url1" } as ScraperOptions,
    progress: null,
    error: null,
    finishedAt: null,
    abortController: new AbortController(),
    completionPromise: Promise.resolve(),
    resolveCompletion: () => {},
    rejectCompletion: () => {},
  };

  beforeEach(() => {
    vi.resetAllMocks();

    // Define the mock implementation for the manager instance
    mockManagerInstance = {
      // Mock getJob to return the job if ID matches, otherwise undefined
      getJob: vi.fn().mockImplementation(async (jobId: string) => {
        if (jobId === MOCK_JOB_ID_FOUND) {
          return mockJob;
        }
        return undefined; // Simulate job not
found 47 | }), 48 | }; 49 | 50 | // Instantiate the tool with the correctly typed mock instance 51 | getJobInfoTool = new GetJobInfoTool(mockManagerInstance as PipelineManager); // Updated instantiation 52 | }); 53 | 54 | it("should call manager.getJob with the provided jobId", async () => { 55 | await getJobInfoTool.execute({ jobId: MOCK_JOB_ID_FOUND }); // Updated tool call 56 | expect(mockManagerInstance.getJob).toHaveBeenCalledWith(MOCK_JOB_ID_FOUND); 57 | }); 58 | 59 | it("should return the job details if the job is found", async () => { 60 | const result = await getJobInfoTool.execute({ jobId: MOCK_JOB_ID_FOUND }); // Updated tool call 61 | 62 | expect(result.job).not.toBeNull(); 63 | // Check properties of the simplified JobInfo object 64 | expect(result.job?.id).toBe(mockJob.id); 65 | expect(result.job?.library).toBe(mockJob.library); 66 | expect(result.job?.version).toBe(mockJob.version); 67 | expect(result.job?.status).toBe(mockJob.status); 68 | expect(result.job?.createdAt).toBe(mockJob.createdAt.toISOString()); 69 | expect(result.job?.startedAt).toBe(mockJob.startedAt?.toISOString()); 70 | expect(result.job?.finishedAt).toBeNull(); // Based on mockJob 71 | expect(result.job?.error).toBeNull(); // Based on mockJob 72 | }); 73 | 74 | it("should return null if the job is not found", async () => { 75 | const result = await getJobInfoTool.execute({ jobId: MOCK_JOB_ID_NOT_FOUND }); // Updated tool call 76 | 77 | expect(mockManagerInstance.getJob).toHaveBeenCalledWith(MOCK_JOB_ID_NOT_FOUND); 78 | expect(result.job).toBeNull(); 79 | }); 80 | }); 81 | -------------------------------------------------------------------------------- /src/tools/GetJobInfoTool.ts: -------------------------------------------------------------------------------- 1 | import type { PipelineManager } from "../pipeline/PipelineManager"; 2 | import type { PipelineJob, PipelineJobStatus } from "../pipeline/types"; 3 | 4 | /** 5 | * Input parameters for the GetJobInfoTool. 
6 | */ 7 | export interface GetJobInfoInput { 8 | /** The ID of the job to retrieve info for. */ 9 | jobId: string; 10 | } 11 | 12 | /** 13 | * Simplified information about a pipeline job for external use. 14 | */ 15 | export interface JobInfo { 16 | id: string; 17 | library: string; 18 | version: string; 19 | status: PipelineJobStatus; 20 | createdAt: string; 21 | startedAt: string | null; 22 | finishedAt: string | null; 23 | error: string | null; 24 | } 25 | 26 | /** 27 | * Response structure for the GetJobInfoTool. 28 | */ 29 | export interface GetJobInfoToolResponse { 30 | job: JobInfo | null; 31 | } 32 | 33 | /** 34 | * Tool for retrieving simplified information about a specific pipeline job. 35 | */ 36 | export class GetJobInfoTool { 37 | private manager: PipelineManager; 38 | 39 | /** 40 | * Creates an instance of GetJobInfoTool. 41 | * @param manager The PipelineManager instance. 42 | */ 43 | constructor(manager: PipelineManager) { 44 | this.manager = manager; 45 | } 46 | 47 | /** 48 | * Executes the tool to retrieve simplified info for a specific job. 49 | * @param input - The input parameters, containing the jobId. 50 | * @returns A promise that resolves with the simplified job info or null if not found. 51 | */ 52 | async execute(input: GetJobInfoInput): Promise { 53 | const job = await this.manager.getJob(input.jobId); 54 | 55 | if (!job) { 56 | // Return null in the result if job not found 57 | return { job: null }; 58 | } 59 | 60 | // Transform the job into a simplified object 61 | const jobInfo: JobInfo = { 62 | id: job.id, 63 | library: job.library, 64 | version: job.version, 65 | status: job.status, 66 | createdAt: job.createdAt.toISOString(), 67 | startedAt: job.startedAt?.toISOString() ?? null, 68 | finishedAt: job.finishedAt?.toISOString() ?? null, 69 | error: job.error?.message ?? 
null, 70 | }; 71 | 72 | return { job: jobInfo }; 73 | } 74 | } 75 | -------------------------------------------------------------------------------- /src/tools/ListJobsTool.ts: -------------------------------------------------------------------------------- 1 | import type { PipelineManager } from "../pipeline/PipelineManager"; 2 | import type { PipelineJob, PipelineJobStatus } from "../pipeline/types"; 3 | import type { JobInfo } from "./GetJobInfoTool"; // Import JobInfo 4 | 5 | /** 6 | * Input parameters for the ListJobsTool. 7 | */ 8 | export interface ListJobsInput { 9 | /** Optional status to filter jobs by. */ 10 | status?: PipelineJobStatus; 11 | } 12 | 13 | /** 14 | * Response structure for the ListJobsTool. 15 | */ 16 | export interface ListJobsToolResponse { 17 | jobs: JobInfo[]; 18 | } 19 | 20 | /** 21 | * Tool for listing pipeline jobs managed by the PipelineManager. 22 | * Allows filtering jobs by their status. 23 | */ 24 | export class ListJobsTool { 25 | private manager: PipelineManager; // Change property name and type 26 | 27 | /** 28 | * Creates an instance of ListJobsTool. 29 | * @param manager The PipelineManager instance. 30 | */ 31 | constructor(manager: PipelineManager) { 32 | // Change constructor parameter 33 | this.manager = manager; 34 | } 35 | 36 | /** 37 | * Executes the tool to retrieve a list of pipeline jobs. 38 | * @param input - The input parameters, optionally including a status filter. 39 | * @returns A promise that resolves with the list of simplified job objects. 40 | * @throws {PipelineStateError} If the pipeline manager is somehow unavailable. 
41 | */ 42 | async execute(input: ListJobsInput): Promise { 43 | const jobs = await this.manager.getJobs(input.status); 44 | 45 | // Transform jobs into simplified objects 46 | const simplifiedJobs: JobInfo[] = jobs.map( 47 | (job: PipelineJob): JobInfo => ({ 48 | id: job.id, 49 | library: job.library, 50 | version: job.version, 51 | status: job.status, 52 | createdAt: job.createdAt.toISOString(), 53 | startedAt: job.startedAt?.toISOString() ?? null, 54 | finishedAt: job.finishedAt?.toISOString() ?? null, 55 | error: job.error?.message ?? null, 56 | }), 57 | ); 58 | 59 | return { jobs: simplifiedJobs }; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /src/tools/ListLibrariesTool.ts: -------------------------------------------------------------------------------- 1 | import type { DocumentManagementService } from "../store/DocumentManagementService"; 2 | import type { LibraryVersionDetails } from "../store/types"; 3 | 4 | // Define the structure for the tool's output, using the detailed version info 5 | export interface LibraryInfo { 6 | name: string; 7 | versions: LibraryVersionDetails[]; // Use the detailed interface 8 | } 9 | 10 | export interface ListLibrariesResult { 11 | libraries: LibraryInfo[]; 12 | } 13 | 14 | /** 15 | * Tool for listing all available libraries and their indexed versions in the store. 
16 | */ 17 | export class ListLibrariesTool { 18 | private docService: DocumentManagementService; 19 | 20 | constructor(docService: DocumentManagementService) { 21 | this.docService = docService; 22 | } 23 | 24 | async execute(options?: Record): Promise { 25 | // docService.listLibraries() now returns the detailed structure directly 26 | const rawLibraries = await this.docService.listLibraries(); 27 | 28 | // The structure returned by listLibraries already matches LibraryInfo[] 29 | // No complex mapping is needed here anymore, just ensure the names match 30 | const libraries: LibraryInfo[] = rawLibraries.map(({ library, versions }) => ({ 31 | name: library, 32 | versions: versions, // Directly assign the detailed versions array 33 | })); 34 | 35 | return { libraries }; 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/tools/RemoveTool.ts: -------------------------------------------------------------------------------- 1 | import type { PipelineManager } from "../pipeline/PipelineManager"; 2 | import { PipelineJobStatus } from "../pipeline/types"; 3 | import type { DocumentManagementService } from "../store"; 4 | import { logger } from "../utils/logger"; 5 | import { ToolError } from "./errors"; 6 | 7 | /** 8 | * Represents the arguments for the remove_docs tool. 9 | * The MCP server should validate the input against RemoveToolInputSchema before calling execute. 10 | */ 11 | export interface RemoveToolArgs { 12 | library: string; 13 | version?: string; 14 | } 15 | 16 | /** 17 | * Tool to remove indexed documentation for a specific library version. 18 | * This class provides the core logic, intended to be called by the McpServer. 
19 | */ 20 | export class RemoveTool { 21 | constructor( 22 | private readonly documentManagementService: DocumentManagementService, 23 | private readonly pipelineManager?: PipelineManager, // Optional for backward compatibility 24 | ) {} 25 | 26 | /** 27 | * Executes the tool to remove the specified library version documents. 28 | * Aborts any QUEUED/RUNNING job for the same library+version before deleting. 29 | */ 30 | async execute(args: RemoveToolArgs): Promise<{ message: string }> { 31 | const { library, version } = args; 32 | 33 | logger.info( 34 | `🗑️ Removing library: ${library}${version ? `, version: ${version}` : " (unversioned)"}`, 35 | ); 36 | 37 | try { 38 | // Abort any QUEUED or RUNNING job for this library+version 39 | if (this.pipelineManager) { 40 | const jobs = this.pipelineManager.findJobsByLibraryVersion( 41 | library, 42 | (version ?? "").toLowerCase(), 43 | [PipelineJobStatus.QUEUED, PipelineJobStatus.RUNNING], 44 | ); 45 | for (const job of jobs) { 46 | logger.info( 47 | `🚫 Aborting job for ${library}@${version ?? ""} before deletion: ${job.id}`, 48 | ); 49 | await this.pipelineManager.cancelJob(job.id); 50 | // Wait for job to finish cancelling if running 51 | await this.pipelineManager.waitForJobCompletion(job.id); 52 | } 53 | } 54 | // Core logic: Call the document management service 55 | await this.documentManagementService.removeAllDocuments(library, version); 56 | 57 | const message = `Successfully removed documents for ${library}${version ? `@${version}` : " (unversioned)"}.`; 58 | logger.info(`✅ ${message}`); 59 | // Return a simple success object, the McpServer will format the final response 60 | return { message }; 61 | } catch (error) { 62 | const errorMessage = `Failed to remove documents for ${library}${version ? `@${version}` : " (unversioned)"}: ${error instanceof Error ? 
error.message : String(error)}`; 63 | logger.error(`❌ Error removing library: ${errorMessage}`); 64 | // Re-throw the error for the McpServer to handle and format 65 | throw new ToolError(errorMessage, this.constructor.name); 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/tools/SearchTool.ts: -------------------------------------------------------------------------------- 1 | import type { DocumentManagementService } from "../store"; 2 | import type { LibraryVersionDetails, StoreSearchResult } from "../store/types"; // Import LibraryVersionDetails 3 | import { logger } from "../utils/logger"; 4 | import { VersionNotFoundError } from "./errors"; 5 | 6 | export interface SearchToolOptions { 7 | library: string; 8 | version?: string; 9 | query: string; 10 | limit?: number; 11 | exactMatch?: boolean; 12 | } 13 | 14 | export interface SearchToolResultError { 15 | message: string; 16 | availableVersions?: LibraryVersionDetails[]; // Use LibraryVersionDetails 17 | suggestions?: string[]; // Specific to LibraryNotFoundError 18 | } 19 | 20 | export interface SearchToolResult { 21 | results: StoreSearchResult[]; 22 | } 23 | 24 | /** 25 | * Tool for searching indexed documentation. 26 | * Supports exact version matches and version range patterns. 27 | * Returns available versions when requested version is not found. 
28 | */ 29 | export class SearchTool { 30 | private docService: DocumentManagementService; 31 | 32 | constructor(docService: DocumentManagementService) { 33 | this.docService = docService; 34 | } 35 | 36 | async execute(options: SearchToolOptions): Promise { 37 | const { library, version, query, limit = 5, exactMatch = false } = options; 38 | 39 | // When exactMatch is true, version must be specified and not 'latest' 40 | if (exactMatch && (!version || version === "latest")) { 41 | // Get available *detailed* versions for error message 42 | await this.docService.validateLibraryExists(library); 43 | // Fetch detailed versions using listLibraries and find the specific library 44 | const allLibraries = await this.docService.listLibraries(); 45 | const libraryInfo = allLibraries.find((lib) => lib.library === library); 46 | const detailedVersions = libraryInfo ? libraryInfo.versions : []; 47 | throw new VersionNotFoundError( 48 | library, 49 | "latest", // Or perhaps the original 'version' if it wasn't 'latest'? Check logic. 50 | detailedVersions, 51 | ); 52 | } 53 | 54 | // Default to 'latest' only when exactMatch is false 55 | const resolvedVersion = version || "latest"; 56 | 57 | logger.info( 58 | `🔍 Searching ${library}@${resolvedVersion} for: ${query}${exactMatch ? " (exact match)" : ""}`, 59 | ); 60 | 61 | try { 62 | // 1. Validate library exists first 63 | await this.docService.validateLibraryExists(library); 64 | 65 | // 2. Proceed with version finding and searching 66 | let versionToSearch: string | null | undefined = resolvedVersion; 67 | 68 | if (!exactMatch) { 69 | // If not exact match, find the best version (which might be null) 70 | const versionResult = await this.docService.findBestVersion(library, version); 71 | // Use the bestMatch from the result, which could be null 72 | versionToSearch = versionResult.bestMatch; 73 | 74 | // If findBestVersion returned null (no matching semver) AND unversioned docs exist, 75 | // should we search unversioned? 
The current logic passes null to searchStore, 76 | // which gets normalized to "" (unversioned). This seems reasonable. 77 | // If findBestVersion threw VersionNotFoundError, it's caught below. 78 | } 79 | // If exactMatch is true, versionToSearch remains the originally provided version. 80 | 81 | // Note: versionToSearch can be string | null | undefined here. 82 | // searchStore handles null/undefined by normalizing to "". 83 | const results = await this.docService.searchStore( 84 | library, 85 | versionToSearch, 86 | query, 87 | limit, 88 | ); 89 | logger.info(`✅ Found ${results.length} matching results`); 90 | 91 | return { results }; 92 | } catch (error) { 93 | logger.error( 94 | `❌ Search failed: ${error instanceof Error ? error.message : "Unknown error"}`, 95 | ); 96 | throw error; 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /src/tools/errors.ts: -------------------------------------------------------------------------------- 1 | import semver from "semver"; 2 | import type { LibraryVersionDetails } from "../store/types"; // Import LibraryVersionDetails 3 | 4 | class ToolError extends Error { 5 | constructor( 6 | message: string, 7 | public readonly toolName: string, 8 | ) { 9 | super(message); 10 | this.name = this.constructor.name; 11 | } 12 | } 13 | 14 | class VersionNotFoundError extends ToolError { 15 | constructor( 16 | public readonly library: string, 17 | public readonly requestedVersion: string, 18 | public readonly availableVersions: LibraryVersionDetails[], // Use LibraryVersionDetails 19 | ) { 20 | super( 21 | `Version ${requestedVersion} not found for ${library}. 
Available versions: ${availableVersions.map((v) => v.version).join(", ")}`, 22 | "SearchTool", 23 | ); 24 | } 25 | 26 | getLatestVersion() { 27 | return this.availableVersions.sort((a, b) => semver.compare(b.version, a.version))[0]; 28 | } 29 | } 30 | 31 | /** 32 | * Error thrown when a requested library cannot be found in the store. 33 | * Includes suggestions for similar library names if available. 34 | */ 35 | class LibraryNotFoundError extends ToolError { 36 | constructor( 37 | public readonly requestedLibrary: string, 38 | public readonly suggestions: string[] = [], 39 | ) { 40 | let message = `Library '${requestedLibrary}' not found.`; 41 | if (suggestions.length > 0) { 42 | message += ` Did you mean one of these: ${suggestions.join(", ")}?`; 43 | } 44 | // Assuming this error might originate from various tools, but SearchTool is a primary candidate. 45 | // We might need to adjust the toolName if it's thrown elsewhere. 46 | super(message, "SearchTool"); 47 | } 48 | } 49 | 50 | export { LibraryNotFoundError, ToolError, VersionNotFoundError }; 51 | -------------------------------------------------------------------------------- /src/tools/index.ts: -------------------------------------------------------------------------------- 1 | export * from "./CancelJobTool"; 2 | export * from "./ClearCompletedJobsTool"; 3 | export * from "./errors"; 4 | export * from "./FetchUrlTool"; 5 | export * from "./FindVersionTool"; 6 | export * from "./GetJobInfoTool"; 7 | export * from "./ListJobsTool"; 8 | export * from "./ListLibrariesTool"; 9 | export * from "./RemoveTool"; 10 | export * from "./ScrapeTool"; 11 | export * from "./SearchTool"; 12 | -------------------------------------------------------------------------------- /src/types/index.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Common document content type shared across modules 3 | */ 4 | export interface Document { 5 | content: string; 6 | metadata: 
DocumentMetadata; 7 | } 8 | 9 | /** 10 | * Common metadata fields shared across document chunks 11 | */ 12 | export interface DocumentMetadata { 13 | url: string; 14 | title: string; 15 | library: string; 16 | version: string; 17 | level?: number; // Optional during scraping 18 | path?: string[]; // Optional during scraping 19 | } 20 | 21 | /** 22 | * Generic progress callback type 23 | */ 24 | export type ProgressCallback = (progress: T) => void | Promise; 25 | 26 | /** 27 | * Standard progress response format 28 | */ 29 | export interface ProgressResponse { 30 | content: { type: string; text: string }[]; 31 | } 32 | -------------------------------------------------------------------------------- /src/utils/config.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Default configuration values for the scraping pipeline and server 3 | */ 4 | 5 | /** Maximum number of pages to scrape in a single job */ 6 | export const DEFAULT_MAX_PAGES = 1000; 7 | 8 | /** Maximum navigation depth when crawling links */ 9 | export const DEFAULT_MAX_DEPTH = 3; 10 | 11 | /** Maximum number of concurrent page requests */ 12 | export const DEFAULT_MAX_CONCURRENCY = 3; 13 | 14 | /** Default protocol for the MCP server */ 15 | export const DEFAULT_PROTOCOL = "stdio"; 16 | 17 | /** Default port for the HTTP protocol */ 18 | export const DEFAULT_HTTP_PORT = 6280; 19 | 20 | /** Default port for the Web UI */ 21 | export const DEFAULT_WEB_PORT = 6281; 22 | 23 | /** 24 | * Default timeout in milliseconds for page operations (e.g., Playwright waitForSelector). 25 | */ 26 | export const DEFAULT_PAGE_TIMEOUT = 5000; 27 | 28 | /** 29 | * Maximum number of retries for HTTP fetcher requests. 30 | */ 31 | export const FETCHER_MAX_RETRIES = 6; 32 | 33 | /** 34 | * Base delay in milliseconds for HTTP fetcher retry backoff. 
35 | */ 36 | export const FETCHER_BASE_DELAY = 1000; 37 | 38 | /** 39 | * Default chunk size settings for splitters 40 | */ 41 | export const SPLITTER_MIN_CHUNK_SIZE = 500; 42 | export const SPLITTER_PREFERRED_CHUNK_SIZE = 1500; 43 | export const SPLITTER_MAX_CHUNK_SIZE = 5000; 44 | 45 | /** 46 | * Maximum number of documents to process in a single batch for embeddings. 47 | */ 48 | export const EMBEDDING_BATCH_SIZE = 100; 49 | 50 | /** 51 | * Maximum number of retries for database migrations if busy. 52 | */ 53 | export const MIGRATION_MAX_RETRIES = 5; 54 | 55 | /** 56 | * Delay in milliseconds between migration retry attempts. 57 | */ 58 | export const MIGRATION_RETRY_DELAY_MS = 300; 59 | -------------------------------------------------------------------------------- /src/utils/dom.ts: -------------------------------------------------------------------------------- 1 | import { JSDOM, VirtualConsole } from "jsdom"; 2 | import type { ConstructorOptions } from "jsdom"; 3 | 4 | /** 5 | * Creates a JSDOM instance with a pre-configured virtual console to suppress console noise. 6 | * This utility simplifies the setup of JSDOM by providing a standard configuration. 7 | * 8 | * @param html - The HTML content to parse. 9 | * @param options - Optional JSDOM configuration options. These will be merged with the default virtual console setup. 10 | * @returns A JSDOM instance. 
11 | */ 12 | export function createJSDOM(html: string, options?: ConstructorOptions): JSDOM { 13 | const virtualConsole = new VirtualConsole(); 14 | // Suppress console output from JSDOM by default 15 | virtualConsole.on("error", () => {}); 16 | virtualConsole.on("warn", () => {}); 17 | virtualConsole.on("info", () => {}); 18 | virtualConsole.on("debug", () => {}); 19 | virtualConsole.on("log", () => {}); // Also suppress regular logs 20 | 21 | const defaultOptions: ConstructorOptions = { 22 | virtualConsole, 23 | }; 24 | 25 | // Merge provided options with defaults, letting provided options override 26 | const finalOptions: ConstructorOptions = { ...defaultOptions, ...options }; 27 | 28 | return new JSDOM(html, finalOptions); 29 | } 30 | -------------------------------------------------------------------------------- /src/utils/errors.ts: -------------------------------------------------------------------------------- 1 | class ScraperError extends Error { 2 | constructor( 3 | message: string, 4 | public readonly isRetryable: boolean = false, 5 | public readonly cause?: Error, 6 | ) { 7 | super(message); 8 | this.name = this.constructor.name; 9 | if (cause?.stack) { 10 | this.stack = `${this.stack}\nCaused by: ${cause.stack}`; 11 | } 12 | } 13 | } 14 | 15 | class NetworkError extends ScraperError { 16 | constructor( 17 | message: string, 18 | public readonly statusCode?: number, 19 | cause?: Error, 20 | ) { 21 | super(message, true, cause); 22 | } 23 | } 24 | 25 | class RateLimitError extends ScraperError { 26 | constructor( 27 | message: string, 28 | public readonly retryAfter?: number, 29 | ) { 30 | super(message, true); 31 | } 32 | } 33 | 34 | class InvalidUrlError extends ScraperError { 35 | constructor(url: string, cause?: Error) { 36 | super(`Invalid URL: ${url}`, false, cause); 37 | } 38 | } 39 | 40 | class ParsingError extends ScraperError { 41 | constructor(message: string, cause?: Error) { 42 | super(`Failed to parse content: ${message}`, false, cause); 
43 | } 44 | } 45 | 46 | class RedirectError extends ScraperError { 47 | constructor( 48 | public readonly originalUrl: string, 49 | public readonly redirectUrl: string, 50 | public readonly statusCode: number, 51 | ) { 52 | super( 53 | `Redirect detected from ${originalUrl} to ${redirectUrl} (status: ${statusCode})`, 54 | false, 55 | ); 56 | } 57 | } 58 | 59 | export { 60 | ScraperError, 61 | NetworkError, 62 | RateLimitError, 63 | InvalidUrlError, 64 | ParsingError, 65 | RedirectError, 66 | }; 67 | -------------------------------------------------------------------------------- /src/utils/logger.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Defines the available log levels. 3 | */ 4 | export enum LogLevel { 5 | ERROR = 0, 6 | WARN = 1, 7 | INFO = 2, 8 | DEBUG = 3, 9 | } 10 | 11 | let currentLogLevel: LogLevel = LogLevel.INFO; // Default level 12 | 13 | /** 14 | * Sets the current logging level for the application. 15 | * @param level - The desired log level. 16 | */ 17 | export function setLogLevel(level: LogLevel): void { 18 | currentLogLevel = level; 19 | } 20 | 21 | /** 22 | * Provides logging functionalities with level control. 23 | */ 24 | export const logger = { 25 | /** 26 | * Logs a debug message if the current log level is DEBUG or higher. 27 | * @param message - The message to log. 28 | */ 29 | debug: (message: string) => { 30 | if (currentLogLevel >= LogLevel.DEBUG && !process.env.VITEST_WORKER_ID) { 31 | console.debug(message); 32 | } 33 | }, 34 | /** 35 | * Logs an info message if the current log level is INFO or higher. 36 | * @param message - The message to log. 37 | */ 38 | info: (message: string) => { 39 | if (currentLogLevel >= LogLevel.INFO && !process.env.VITEST_WORKER_ID) { 40 | console.log(message); // Using console.log for INFO 41 | } 42 | }, 43 | /** 44 | * Logs a warning message if the current log level is WARN or higher. 45 | * @param message - The message to log. 
46 | */ 47 | warn: (message: string) => { 48 | if (currentLogLevel >= LogLevel.WARN && !process.env.VITEST_WORKER_ID) { 49 | console.warn(message); 50 | } 51 | }, 52 | /** 53 | * Logs an error message if the current log level is ERROR or higher (always logs). 54 | * @param message - The message to log. 55 | */ 56 | error: (message: string) => { 57 | if (currentLogLevel >= LogLevel.ERROR && !process.env.VITEST_WORKER_ID) { 58 | console.error(message); 59 | } 60 | }, 61 | }; 62 | -------------------------------------------------------------------------------- /src/utils/mimeTypeUtils.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Represents a parsed Content-Type header. 3 | */ 4 | export interface ParsedContentType { 5 | mimeType: string; 6 | charset?: string; 7 | } 8 | 9 | /** 10 | * Utility functions for handling MIME types and charsets. 11 | */ 12 | // biome-ignore lint/complexity/noStaticOnlyClass: helpers are static 13 | export class MimeTypeUtils { 14 | /** 15 | * Parses a Content-Type header string into its MIME type and charset. 16 | * @param contentTypeHeader The Content-Type header string (e.g., "text/html; charset=utf-8"). 17 | * @returns A ParsedContentType object, or a default if parsing fails. 18 | */ 19 | public static parseContentType(contentTypeHeader?: string | null): ParsedContentType { 20 | if (!contentTypeHeader) { 21 | return { mimeType: "application/octet-stream" }; 22 | } 23 | const parts = contentTypeHeader.split(";").map((part) => part.trim()); 24 | const mimeType = parts[0].toLowerCase(); 25 | let charset: string | undefined; 26 | 27 | for (let i = 1; i < parts.length; i++) { 28 | const param = parts[i]; 29 | if (param.toLowerCase().startsWith("charset=")) { 30 | charset = param.substring("charset=".length).toLowerCase(); 31 | break; 32 | } 33 | } 34 | return { mimeType, charset }; 35 | } 36 | 37 | /** 38 | * Checks if a MIME type represents HTML content. 
39 | */ 40 | public static isHtml(mimeType: string): boolean { 41 | return mimeType === "text/html" || mimeType === "application/xhtml+xml"; 42 | } 43 | 44 | /** 45 | * Checks if a MIME type represents Markdown content. 46 | */ 47 | public static isMarkdown(mimeType: string): boolean { 48 | return mimeType === "text/markdown" || mimeType === "text/x-markdown"; 49 | } 50 | 51 | /** 52 | * Checks if a MIME type represents plain text content. 53 | */ 54 | public static isText(mimeType: string): boolean { 55 | return mimeType.startsWith("text/"); 56 | } 57 | 58 | // Extend with more helpers as needed (isJson, isXml, isPdf, etc.) 59 | } 60 | -------------------------------------------------------------------------------- /src/utils/paths.ts: -------------------------------------------------------------------------------- 1 | import fs from "node:fs"; 2 | import path from "node:path"; 3 | import { fileURLToPath } from "node:url"; 4 | 5 | let projectRoot: string | null = null; 6 | 7 | /** 8 | * Finds the project root directory by searching upwards from the current file 9 | * for a directory containing 'package.json'. Caches the result. 10 | * 11 | * @returns {string} The absolute path to the project root. 12 | * @throws {Error} If package.json cannot be found. 
13 | */ 14 | export function getProjectRoot(): string { 15 | // Return cached result if available 16 | if (projectRoot) { 17 | return projectRoot; 18 | } 19 | 20 | // Start from the directory of the current module 21 | const currentFilePath = fileURLToPath(import.meta.url); 22 | let currentDir = path.dirname(currentFilePath); 23 | 24 | // eslint-disable-next-line no-constant-condition 25 | while (true) { 26 | const packageJsonPath = path.join(currentDir, "package.json"); 27 | if (fs.existsSync(packageJsonPath)) { 28 | projectRoot = currentDir; // Cache the result 29 | return projectRoot; 30 | } 31 | 32 | const parentDir = path.dirname(currentDir); 33 | // Check if we have reached the filesystem root 34 | if (parentDir === currentDir) { 35 | throw new Error("Could not find project root containing package.json."); 36 | } 37 | currentDir = parentDir; 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /src/utils/string.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Thoroughly removes all types of whitespace characters from both ends of a string. 3 | * Handles spaces, tabs, line breaks, and carriage returns. 
4 | */ 5 | export const fullTrim = (str: string): string => { 6 | return str.replace(/^[\s\r\n\t]+|[\s\r\n\t]+$/g, ""); 7 | }; 8 | -------------------------------------------------------------------------------- /src/utils/url.ts: -------------------------------------------------------------------------------- 1 | import psl from "psl"; 2 | import { InvalidUrlError } from "./errors"; 3 | 4 | interface UrlNormalizerOptions { 5 | ignoreCase?: boolean; 6 | removeHash?: boolean; 7 | removeTrailingSlash?: boolean; 8 | removeQuery?: boolean; 9 | removeIndex?: boolean; 10 | } 11 | 12 | const defaultNormalizerOptions: UrlNormalizerOptions = { 13 | ignoreCase: true, 14 | removeHash: true, 15 | removeTrailingSlash: true, 16 | removeQuery: false, 17 | removeIndex: true, 18 | }; 19 | 20 | export function normalizeUrl( 21 | url: string, 22 | options: UrlNormalizerOptions = defaultNormalizerOptions, 23 | ): string { 24 | try { 25 | const parsedUrl = new URL(url); 26 | const finalOptions = { ...defaultNormalizerOptions, ...options }; 27 | 28 | // Create a new URL to ensure proper structure 29 | const normalized = new URL(parsedUrl.origin + parsedUrl.pathname); 30 | 31 | // Remove index files first, before handling trailing slashes 32 | if (finalOptions.removeIndex) { 33 | normalized.pathname = normalized.pathname.replace( 34 | /\/index\.(html|htm|asp|php|jsp)$/i, 35 | "/", 36 | ); 37 | } 38 | 39 | // Handle trailing slash 40 | if (finalOptions.removeTrailingSlash && normalized.pathname.length > 1) { 41 | normalized.pathname = normalized.pathname.replace(/\/+$/, ""); 42 | } 43 | 44 | // Keep original parts we want to preserve 45 | const preservedHash = !finalOptions.removeHash ? parsedUrl.hash : ""; 46 | const preservedSearch = !finalOptions.removeQuery ? 
parsedUrl.search : ""; 47 | 48 | // Construct final URL string in correct order (query before hash) 49 | let result = normalized.origin + normalized.pathname; 50 | if (preservedSearch) { 51 | result += preservedSearch; 52 | } 53 | if (preservedHash) { 54 | result += preservedHash; 55 | } 56 | 57 | // Apply case normalization if configured 58 | if (finalOptions.ignoreCase) { 59 | result = result.toLowerCase(); 60 | } 61 | 62 | return result; 63 | } catch { 64 | return url; // Return original URL if parsing fails 65 | } 66 | } 67 | 68 | /** 69 | * Validates if a string is a valid URL 70 | * @throws {InvalidUrlError} If the URL is invalid 71 | */ 72 | export function validateUrl(url: string): void { 73 | try { 74 | new URL(url); 75 | } catch (error) { 76 | throw new InvalidUrlError(url, error instanceof Error ? error : undefined); 77 | } 78 | } 79 | 80 | /** 81 | * Checks if two URLs have the exact same hostname 82 | */ 83 | export function hasSameHostname(urlA: URL, urlB: URL): boolean { 84 | return urlA.hostname.toLowerCase() === urlB.hostname.toLowerCase(); 85 | } 86 | 87 | /** 88 | * Checks if two URLs are on the same domain (including subdomains) 89 | * Using the public suffix list to properly handle domains like .co.uk 90 | */ 91 | export function hasSameDomain(urlA: URL, urlB: URL): boolean { 92 | const domainA = psl.get(urlA.hostname.toLowerCase()); 93 | const domainB = psl.get(urlB.hostname.toLowerCase()); 94 | return domainA !== null && domainA === domainB; 95 | } 96 | 97 | /** 98 | * Checks if a target URL is under the same path as the base URL 99 | * Example: base = https://example.com/docs/ 100 | * target = https://example.com/docs/getting-started 101 | * result = true 102 | */ 103 | export function isSubpath(baseUrl: URL, targetUrl: URL): boolean { 104 | // Normalize paths to ensure consistent comparison 105 | const basePath = baseUrl.pathname.endsWith("/") 106 | ? 
baseUrl.pathname 107 | : `${baseUrl.pathname}/`; 108 | 109 | return targetUrl.pathname.startsWith(basePath); 110 | } 111 | 112 | export type { UrlNormalizerOptions }; 113 | -------------------------------------------------------------------------------- /src/web/components/Alert.tsx: -------------------------------------------------------------------------------- 1 | import type { PropsWithChildren } from "@kitajs/html"; 2 | 3 | /** 4 | * Defines the possible types for the Alert component. 5 | */ 6 | type AlertType = "success" | "error" | "warning" | "info"; 7 | 8 | /** 9 | * Props for the Alert component. 10 | */ 11 | interface AlertProps extends PropsWithChildren { 12 | type: AlertType; 13 | title?: string; 14 | message: string | JSX.Element; // Allow JSX for messages 15 | } 16 | 17 | /** 18 | * Reusable Alert component using Flowbite styling. 19 | * Displays messages with appropriate colors and icons based on the type. 20 | * @param props - Component props including type, title (optional), and message. 
21 | */ 22 | const Alert = ({ type, title, message }: AlertProps) => { 23 | let iconSvg: JSX.Element; 24 | let colorClasses: string; 25 | let defaultTitle: string; 26 | 27 | switch (type) { 28 | case "success": 29 | defaultTitle = "Success:"; 30 | colorClasses = 31 | "text-green-800 border-green-300 bg-green-50 dark:bg-gray-800 dark:text-green-400 dark:border-green-800"; 32 | iconSvg = ( 33 | 42 | ); 43 | break; 44 | case "error": 45 | defaultTitle = "Error:"; 46 | colorClasses = 47 | "text-red-800 border-red-300 bg-red-50 dark:bg-gray-800 dark:text-red-400 dark:border-red-800"; 48 | iconSvg = ( 49 | 58 | ); 59 | break; 60 | case "warning": 61 | defaultTitle = "Warning:"; 62 | colorClasses = 63 | "text-yellow-800 border-yellow-300 bg-yellow-50 dark:bg-gray-800 dark:text-yellow-300 dark:border-yellow-800"; 64 | iconSvg = ( 65 | 74 | ); 75 | break; 76 | case "info": 77 | default: // Default to info style 78 | defaultTitle = "Info:"; 79 | colorClasses = 80 | "text-blue-800 border-blue-300 bg-blue-50 dark:bg-gray-800 dark:text-blue-400 dark:border-blue-800"; 81 | iconSvg = ( 82 | 91 | ); 92 | break; 93 | } 94 | 95 | const displayTitle = title ?? defaultTitle; 96 | 97 | return ( 98 | 113 | ); 114 | }; 115 | 116 | export default Alert; 117 | -------------------------------------------------------------------------------- /src/web/components/JobList.tsx: -------------------------------------------------------------------------------- 1 | import type { JobInfo } from "../../tools/GetJobInfoTool"; 2 | import JobItem from "./JobItem"; // Adjusted import path 3 | 4 | /** 5 | * Props for the JobList component. 6 | */ 7 | interface JobListProps { 8 | jobs: JobInfo[]; 9 | } 10 | 11 | /** 12 | * Renders a list of JobItem components or a message if the list is empty. 13 | * Adds a listener for the 'job-list-refresh' event to trigger a reload of the job list using HTMX. 14 | * @param props - Component props including the array of jobs. 
15 | */ 16 | const JobList = ({ jobs }: JobListProps) => ( 17 |
18 | {jobs.length === 0 ? ( 19 |

20 | No pending jobs. 21 |

22 | ) : ( 23 | jobs.map((job) => ) 24 | )} 25 | {/* NOTE: Live job list refresh after stopping a job is handled by the global 'job-list-refresh' listener already registered in main.client.ts, which reloads the job list container via HTMX, roughly: 26 | document.addEventListener('job-list-refresh', function () { 27 | if (window.htmx) { 28 | window.htmx.ajax('GET', '/api/jobs', '#job-queue'); 29 | } else { 30 | window.location.reload(); 31 | } 32 | }); 33 | */} 34 | 
35 | ); 36 | 37 | export default JobList; 38 | -------------------------------------------------------------------------------- /src/web/components/Layout.tsx: -------------------------------------------------------------------------------- 1 | import type { PropsWithChildren } from "@kitajs/html"; 2 | import { readFileSync } from "node:fs"; 3 | 4 | /** 5 | * Props for the Layout component. 6 | */ 7 | interface LayoutProps extends PropsWithChildren { 8 | title: string; 9 | /** Optional version string to display next to the title. */ 10 | version?: string; 11 | } 12 | 13 | /** 14 | * Base HTML layout component for all pages. 15 | * Includes common head elements, header, and scripts. 16 | * @param props - Component props including title, version, and children. 17 | */ 18 | const Layout = ({ title, version, children }: LayoutProps) => { 19 | let versionString = version; 20 | if (!versionString) { 21 | // If no version is provided, use the version from package.json 22 | // We cannot bake the version into the bundle, as the package.json will 23 | // be updated by the build process, after the bundle is created. 24 | try { 25 | const packageJson = JSON.parse(readFileSync("package.json", "utf-8")) as { 26 | version: string; 27 | }; 28 | versionString = packageJson.version; 29 | } catch (error) { 30 | console.error("Error reading package.json:", error); 31 | } 32 | } 33 | return ( 34 | 35 | 36 | 37 | 38 | {title} 39 | {/* Bundled CSS (includes Tailwind and Flowbite) */} 40 | 41 | {/* Add style for htmx-indicator behavior (needed globally) */} 42 | 67 | 68 | 69 |
70 |
71 |

72 | MCP Docs 73 | {versionString ? ( 74 | 79 | v{versionString} 80 | 81 | ) : null} 82 |

83 |
84 | 85 |
{children}
86 |
87 | 88 | {/* Bundled JS (includes Flowbite, HTMX, AlpineJS, and initialization) */} 89 | 90 | 91 | 92 | ); 93 | }; 94 | 95 | export default Layout; 96 | -------------------------------------------------------------------------------- /src/web/components/LibraryDetailCard.tsx: -------------------------------------------------------------------------------- 1 | import type { LibraryInfo } from "../../tools/ListLibrariesTool"; 2 | import VersionDetailsRow from "./VersionDetailsRow"; // Adjusted import path 3 | 4 | /** 5 | * Props for the LibraryDetailCard component. 6 | */ 7 | interface LibraryDetailCardProps { 8 | library: LibraryInfo; 9 | } 10 | 11 | /** 12 | * Renders a card displaying library details and its versions. 13 | * Uses VersionDetailsRow without the delete button. 14 | * @param props - Component props including the library information. 15 | */ 16 | const LibraryDetailCard = ({ library }: LibraryDetailCardProps) => ( 17 | // Use Flowbite Card structure with updated padding and border, and white background 18 |
19 |

20 | {library.name} 21 |

22 | {/* Container for version rows */} 23 |
24 | {library.versions.length > 0 ? ( 25 | library.versions.map((version) => ( 26 | 31 | )) 32 | ) : ( 33 | // Display message if no versions are indexed 34 |

35 | No versions indexed. 36 |

37 | )} 38 |
39 |
40 | ); 41 | 42 | export default LibraryDetailCard; 43 | -------------------------------------------------------------------------------- /src/web/components/LibraryItem.tsx: -------------------------------------------------------------------------------- 1 | import type { LibraryInfo } from "../../tools/ListLibrariesTool"; 2 | import VersionDetailsRow from "./VersionDetailsRow"; // Adjusted import path 3 | 4 | /** 5 | * Props for the LibraryItem component. 6 | */ 7 | interface LibraryItemProps { 8 | library: LibraryInfo; 9 | } 10 | 11 | /** 12 | * Renders a card for a single library, listing its versions with details. 13 | * Uses VersionDetailsRow to display each version. 14 | * @param props - Component props including the library information. 15 | */ 16 | const LibraryItem = ({ library }: LibraryItemProps) => ( 17 | // Use Flowbite Card structure with updated padding and border, and white background 18 |
19 |

20 | 24 | {library.name} 25 | 26 |

27 | {/* Container for version rows */} 28 |
29 | {library.versions.length > 0 ? ( 30 | library.versions.map((version) => ( 31 | 32 | )) 33 | ) : ( 34 | // Display message if no versions are indexed 35 |

36 | No versions indexed. 37 |

38 | )} 39 |
40 |
41 | ); 42 | 43 | export default LibraryItem; 44 | -------------------------------------------------------------------------------- /src/web/components/LibraryList.tsx: -------------------------------------------------------------------------------- 1 | import type { LibraryInfo } from "../../tools/ListLibrariesTool"; 2 | import LibraryItem from "./LibraryItem"; // Adjusted import path 3 | 4 | /** 5 | * Props for the LibraryList component. 6 | */ 7 | interface LibraryListProps { 8 | libraries: LibraryInfo[]; 9 | } 10 | 11 | /** 12 | * Renders a list of LibraryItem components. 13 | * @param props - Component props including the array of libraries. 14 | */ 15 | const LibraryList = ({ libraries }: LibraryListProps) => { 16 | return ( 17 | <> 18 |
19 | {libraries.map((library) => ( 20 | 21 | ))} 22 |
23 | 24 | ); 25 | }; 26 | 27 | export default LibraryList; 28 | -------------------------------------------------------------------------------- /src/web/components/LibrarySearchCard.tsx: -------------------------------------------------------------------------------- 1 | import type { LibraryInfo } from "../../tools/ListLibrariesTool"; 2 | import LoadingSpinner from "./LoadingSpinner"; // Import spinner 3 | 4 | /** 5 | * Props for the LibrarySearchCard component. 6 | */ 7 | interface LibrarySearchCardProps { 8 | library: LibraryInfo; 9 | } 10 | 11 | /** 12 | * Renders the search form card for a specific library. 13 | * Includes a version dropdown and query input. 14 | * @param props - Component props including the library information. 15 | */ 16 | const LibrarySearchCard = ({ library }: LibrarySearchCardProps) => { 17 | return ( 18 |
19 |

20 | Search {library.name} Documentation 21 |

22 |
29 | 40 | 47 | 57 |
58 | {/* Add style for htmx-indicator behavior on button */} 59 | {/* Styles moved to Layout.tsx */} 60 |
61 | ); 62 | }; 63 | 64 | export default LibrarySearchCard; 65 | -------------------------------------------------------------------------------- /src/web/components/LoadingSpinner.tsx: -------------------------------------------------------------------------------- 1 | /** 2 | * Renders an SVG loading spinner icon. 3 | * Used for indicating loading states in buttons or other elements. 4 | */ 5 | const LoadingSpinner = () => ( 6 | 12 | 20 | 25 | 26 | ); 27 | 28 | export default LoadingSpinner; 29 | -------------------------------------------------------------------------------- /src/web/components/ScrapeForm.tsx: -------------------------------------------------------------------------------- 1 | import ScrapeFormContent from "./ScrapeFormContent"; // Adjusted import path 2 | 3 | /** 4 | * Wrapper component for the ScrapeFormContent. 5 | * Provides a container div, often used as a target for HTMX OOB swaps. 6 | */ 7 | const ScrapeForm = () => ( 8 |
9 | 10 |
11 | ); 12 | 13 | export default ScrapeForm; 14 | -------------------------------------------------------------------------------- /src/web/components/SearchResultItem.tsx: -------------------------------------------------------------------------------- 1 | import { unified } from "unified"; // Import unified 2 | import remarkParse from "remark-parse"; // Import unified plugins 3 | import remarkGfm from "remark-gfm"; 4 | import remarkHtml from "remark-html"; 5 | import DOMPurify from "dompurify"; // Import DOMPurify 6 | import { createJSDOM } from "../../utils/dom"; // Import JSDOM helper 7 | import type { StoreSearchResult } from "../../store/types"; 8 | 9 | /** 10 | * Props for the SearchResultItem component. 11 | */ 12 | interface SearchResultItemProps { 13 | result: StoreSearchResult; 14 | } 15 | 16 | /** 17 | * Renders a single search result item. 18 | * Converts markdown content to HTML using unified. 19 | * @param props - Component props including the search result data. 20 | */ 21 | const SearchResultItem = async ({ result }: SearchResultItemProps) => { 22 | // Use unified pipeline to convert markdown to HTML 23 | const processor = unified().use(remarkParse).use(remarkGfm).use(remarkHtml); 24 | const file = await processor.process(result.content); 25 | const rawHtml = String(file); 26 | 27 | // Create JSDOM instance and initialize DOMPurify 28 | const jsdom = createJSDOM(""); 29 | const purifier = DOMPurify(jsdom.window); 30 | 31 | // Sanitize the HTML content 32 | const safeHtml = purifier.sanitize(rawHtml); 33 | 34 | return ( 35 |
36 | 47 | {/* Render the sanitized HTML content */} 48 |
{safeHtml}
49 |
50 | ); 51 | }; 52 | 53 | export default SearchResultItem; 54 | -------------------------------------------------------------------------------- /src/web/components/SearchResultList.tsx: -------------------------------------------------------------------------------- 1 | import type { StoreSearchResult } from "../../store/types"; 2 | import SearchResultItem from "./SearchResultItem"; // Adjusted import path 3 | 4 | /** 5 | * Props for the SearchResultList component. 6 | */ 7 | interface SearchResultListProps { 8 | results: StoreSearchResult[]; 9 | } 10 | 11 | /** 12 | * Renders the list of search results using SearchResultItem. 13 | * Displays a message if no results are found. 14 | * @param props - Component props including the array of search results. 15 | */ 16 | const SearchResultList = ({ results }: SearchResultListProps) => { 17 | if (results.length === 0) { 18 | return ( 19 |

No results found.

20 | ); 21 | } 22 | return ( 23 |
24 | {results.map((result) => ( 25 | 26 | ))} 27 |
28 | ); 29 | }; 30 | 31 | export default SearchResultList; 32 | -------------------------------------------------------------------------------- /src/web/components/SearchResultSkeletonItem.tsx: -------------------------------------------------------------------------------- 1 | /** 2 | * Renders a skeleton placeholder for a search result item. 3 | * Used to indicate loading state while search results are being fetched. 4 | */ 5 | const SearchResultSkeletonItem = () => ( 6 |
7 |
8 |
9 |
10 |
11 | ); 12 | 13 | export default SearchResultSkeletonItem; 14 | -------------------------------------------------------------------------------- /src/web/components/Tooltip.tsx: -------------------------------------------------------------------------------- 1 | import type { PropsWithChildren } from "@kitajs/html"; 2 | 3 | /** 4 | * Props for the Tooltip component. 5 | */ 6 | interface TooltipProps extends PropsWithChildren { 7 | text: string | Promise | Element; 8 | position?: "top" | "right" | "bottom" | "left"; 9 | } 10 | 11 | /** 12 | * Reusable Tooltip component using Alpine.js for state management. 13 | * Displays a help icon that shows a tooltip on hover/focus. 14 | * 15 | * @param props - Component props including text and optional position. 16 | */ 17 | const Tooltip = ({ text, position = "top" }: TooltipProps) => { 18 | // Map position to Tailwind classes 19 | const positionClasses = { 20 | top: "bottom-full left-1/2 transform -translate-x-1/2 -translate-y-1 mb-1", 21 | right: "left-full top-1/2 transform -translate-y-1/2 translate-x-1 ml-1", 22 | bottom: "top-full left-1/2 transform -translate-x-1/2 translate-y-1 mt-1", 23 | left: "right-full top-1/2 transform -translate-y-1/2 -translate-x-1 mr-1", 24 | }; 25 | 26 | return ( 27 |
31 | 56 |
61 | {text as "safe"} 62 |
63 |
64 | ); 65 | }; 66 | 67 | export default Tooltip; 68 | -------------------------------------------------------------------------------- /src/web/components/VersionBadge.tsx: -------------------------------------------------------------------------------- 1 | interface VersionBadgeProps { 2 | version: string; 3 | } 4 | 5 | const VersionBadge = ({ version }: VersionBadgeProps) => { 6 | if (!version) { 7 | return null; // Don't render if no version is provided 8 | } 9 | 10 | return ( 11 | 12 | {version} 13 | 14 | ); 15 | }; 16 | 17 | export default VersionBadge; 18 | -------------------------------------------------------------------------------- /src/web/components/utils.ts: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/arabold/docs-mcp-server/2c6fb88ac09b82baea2068e878855d5b78e2a6e2/src/web/components/utils.ts -------------------------------------------------------------------------------- /src/web/main.client.ts: -------------------------------------------------------------------------------- 1 | // Import the main CSS file which includes Tailwind and Flowbite styles 2 | import "./styles/main.css"; 3 | 4 | import Alpine from "alpinejs"; 5 | import { initFlowbite } from "flowbite"; 6 | import htmx from "htmx.org"; 7 | 8 | // Ensure Alpine global store for confirmation actions is initialized before Alpine components render 9 | Alpine.store("confirmingAction", { 10 | type: null, 11 | id: null, 12 | timeoutId: null, 13 | isDeleting: false, 14 | }); 15 | 16 | Alpine.start(); 17 | 18 | // Initialize Flowbite components 19 | initFlowbite(); 20 | 21 | // Add a global event listener for 'job-list-refresh' that uses HTMX to reload the job list 22 | // This is still useful for manual refresh after actions like clearing jobs 23 | document.addEventListener("job-list-refresh", () => { 24 | htmx.ajax("GET", "/api/jobs", "#job-queue"); 25 | }); 26 | 27 | // Add a global event listener for 'version-list-refresh' 
that reloads the version list container using HTMX 28 | document.addEventListener("version-list-refresh", (event: Event) => { 29 | const customEvent = event as CustomEvent<{ library: string }>; 30 | const library = customEvent.detail?.library; 31 | if (library) { 32 | htmx.ajax( 33 | "GET", 34 | `/api/libraries/${encodeURIComponent(library)}/versions`, 35 | "#version-list", 36 | ); 37 | } 38 | }); 39 | 40 | // Listen for htmx swaps after a version delete and dispatch version-list-refresh with payload 41 | document.body.addEventListener("htmx:afterSwap", (event) => { 42 | // Always re-initialize AlpineJS for swapped-in DOM to fix $store errors 43 | if (event.target instanceof HTMLElement) { 44 | Alpine.initTree(event.target); 45 | } 46 | 47 | // Existing logic for version delete refresh 48 | const detail = (event as CustomEvent).detail; 49 | if ( 50 | detail?.xhr?.status === 204 && 51 | detail?.requestConfig?.verb === "delete" && 52 | (event.target as HTMLElement)?.id?.startsWith("row-") 53 | ) { 54 | // Extract library name from the row id: row-- 55 | const rowId = (event.target as HTMLElement).id; 56 | const match = rowId.match(/^row-([^-]+)-/); 57 | const library = match ? match[1] : null; 58 | if (library) { 59 | document.dispatchEvent( 60 | new CustomEvent("version-list-refresh", { detail: { library } }), 61 | ); 62 | } else { 63 | window.location.reload(); 64 | } 65 | } 66 | }); 67 | -------------------------------------------------------------------------------- /src/web/routes/index.tsx: -------------------------------------------------------------------------------- 1 | import type { FastifyInstance } from "fastify"; 2 | import Layout from "../components/Layout"; // Import the Layout component 3 | 4 | /** 5 | * Registers the root route that serves the main HTML page. 6 | * @param server - The Fastify instance. 
7 | */ 8 | export function registerIndexRoute(server: FastifyInstance) { 9 | server.get("/", async (_, reply) => { 10 | reply.type("text/html"); 11 | // Use the Layout component and define the main content within it 12 | return ( 13 | "" + 14 | ( 15 | 16 | {/* Job Queue Section */} 17 |
18 |
19 |

20 | Job Queue 21 |

22 | 33 |
34 | {/* Container for the job list, loaded via HTMX */} 35 |
36 | {/* Initial loading state */} 37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 | {/* Add New Job Section */} 45 |
46 | {/* Container for the add job form, loaded via HTMX */} 47 |
48 | {/* Initial loading state (optional, could just be empty) */} 49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 | {/* Indexed Documentation Section */} 57 |
58 |

59 | Indexed Documentation 60 |

61 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 | 74 | ) 75 | ); 76 | }); 77 | } 78 | -------------------------------------------------------------------------------- /src/web/routes/jobs/cancel.tsx: -------------------------------------------------------------------------------- 1 | import type { FastifyInstance } from "fastify"; 2 | import type { CancelJobTool } from "../../../tools/CancelJobTool"; 3 | 4 | /** 5 | * Registers the API route for cancelling jobs. 6 | * @param server - The Fastify instance. 7 | * @param cancelJobTool - The tool instance for cancelling jobs. 8 | */ 9 | export function registerCancelJobRoute( 10 | server: FastifyInstance, 11 | cancelJobTool: CancelJobTool 12 | ) { 13 | // POST /api/jobs/:jobId/cancel - Cancel a job by ID 14 | server.post<{ Params: { jobId: string } }>( 15 | "/api/jobs/:jobId/cancel", 16 | async (request, reply) => { 17 | const { jobId } = request.params; 18 | const result = await cancelJobTool.execute({ jobId }); 19 | if (result.success) { 20 | return { success: true, message: result.message }; 21 | } else { 22 | reply.status(400); 23 | return { success: false, message: result.message }; 24 | } 25 | } 26 | ); 27 | } 28 | -------------------------------------------------------------------------------- /src/web/routes/jobs/clear-completed.tsx: -------------------------------------------------------------------------------- 1 | import type { FastifyInstance } from "fastify"; 2 | import type { ClearCompletedJobsTool } from "../../../tools/ClearCompletedJobsTool"; 3 | 4 | /** 5 | * Registers the API route for clearing completed jobs. 6 | * @param server - The Fastify instance. 7 | * @param clearCompletedJobsTool - The tool instance for clearing completed jobs. 
8 | */ 9 | export function registerClearCompletedJobsRoute( 10 | server: FastifyInstance, 11 | clearCompletedJobsTool: ClearCompletedJobsTool 12 | ) { 13 | // POST /api/jobs/clear-completed - Clear all completed jobs 14 | server.post("/api/jobs/clear-completed", async (_, reply) => { 15 | try { 16 | const result = await clearCompletedJobsTool.execute({}); 17 | 18 | reply.type("application/json"); 19 | return { 20 | success: result.success, 21 | message: result.message, 22 | }; 23 | } catch (error) { 24 | reply.code(500); 25 | return { 26 | success: false, 27 | message: `Internal server error: ${error instanceof Error ? error.message : String(error)}`, 28 | }; 29 | } 30 | }); 31 | } 32 | -------------------------------------------------------------------------------- /src/web/routes/jobs/list.tsx: -------------------------------------------------------------------------------- 1 | import type { FastifyInstance } from "fastify"; 2 | import type { ListJobsTool } from "../../../tools/ListJobsTool"; // Adjusted import path 3 | import JobList from "../../components/JobList"; // Import the extracted component 4 | 5 | /** 6 | * Registers the API route for listing jobs. 7 | * @param server - The Fastify instance. 8 | * @param listJobsTool - The tool instance for listing jobs. 
9 | */ 10 | export function registerJobListRoutes( 11 | server: FastifyInstance, 12 | listJobsTool: ListJobsTool 13 | ) { 14 | // GET /api/jobs - List current jobs (only the list) 15 | server.get("/api/jobs", async () => { 16 | const result = await listJobsTool.execute({}); 17 | return ; 18 | }); 19 | } 20 | -------------------------------------------------------------------------------- /src/web/routes/libraries/list.tsx: -------------------------------------------------------------------------------- 1 | import type { FastifyInstance } from "fastify"; 2 | import type { ListLibrariesTool } from "../../../tools/ListLibrariesTool"; 3 | import { RemoveTool } from "../../../tools"; 4 | import LibraryList from "../../components/LibraryList"; 5 | 6 | /** 7 | * Registers the API routes for library management. 8 | * @param server - The Fastify instance. 9 | * @param listLibrariesTool - The tool instance for listing libraries. 10 | * @param removeTool - The tool instance for removing library versions. 11 | */ 12 | export function registerLibrariesRoutes( 13 | server: FastifyInstance, 14 | listLibrariesTool: ListLibrariesTool, 15 | removeTool: RemoveTool // Accept RemoveTool 16 | ) { 17 | server.get("/api/libraries", async (_request, reply) => { 18 | // Add reply 19 | try { 20 | const result = await listLibrariesTool.execute(); 21 | // Set content type to HTML for JSX rendering 22 | reply.type("text/html; charset=utf-8"); 23 | // Render the component directly 24 | return ; 25 | } catch (error) { 26 | server.log.error(error, "Failed to list libraries"); 27 | reply.status(500).send("Internal Server Error"); // Handle errors 28 | } 29 | }); 30 | 31 | // Add DELETE route for removing versions 32 | server.delete<{ Params: { libraryName: string; versionParam: string } }>( 33 | "/api/libraries/:libraryName/versions/:versionParam", 34 | async (request, reply) => { 35 | const { libraryName, versionParam } = request.params; 36 | const version = versionParam === "unversioned" ? 
undefined : versionParam; 37 | try { 38 | await removeTool.execute({ library: libraryName, version }); 39 | reply.status(204).send(); // No Content on success 40 | } catch (error: any) { 41 | server.log.error( 42 | error, 43 | `Failed to remove ${libraryName}@${versionParam}` 44 | ); 45 | // Check for specific errors if needed, e.g., NotFoundError 46 | reply 47 | .status(500) 48 | .send({ message: error.message || "Failed to remove version." }); 49 | } 50 | } 51 | ); 52 | } 53 | -------------------------------------------------------------------------------- /src/web/styles/main.css: -------------------------------------------------------------------------------- 1 | /* Import Tailwind CSS */ 2 | @import "tailwindcss"; 3 | 4 | /* Import Flowbite default theme */ 5 | @import "flowbite/src/themes/default"; 6 | 7 | /* Import Flowbite plugin */ 8 | @plugin "flowbite/plugin"; 9 | @plugin "flowbite-typography"; 10 | 11 | /* Configure Flowbite source files */ 12 | @source "../../../node_modules/flowbite"; 13 | 14 | @layer components { 15 | a { 16 | @apply underline-offset-8; 17 | } 18 | 19 | button { 20 | @apply cursor-pointer; 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/web/web.ts: -------------------------------------------------------------------------------- 1 | import path from "node:path"; 2 | import formBody from "@fastify/formbody"; 3 | import fastifyStatic from "@fastify/static"; 4 | import Fastify, { type FastifyInstance } from "fastify"; 5 | import type { PipelineManager } from "../pipeline/PipelineManager"; 6 | import type { DocumentManagementService } from "../store/DocumentManagementService"; 7 | import { SearchTool } from "../tools"; 8 | import { CancelJobTool } from "../tools/CancelJobTool"; 9 | import { ClearCompletedJobsTool } from "../tools/ClearCompletedJobsTool"; 10 | import { ListJobsTool } from "../tools/ListJobsTool"; 11 | import { ListLibrariesTool } from "../tools/ListLibrariesTool"; 12 
| import { RemoveTool } from "../tools/RemoveTool"; 13 | import { ScrapeTool } from "../tools/ScrapeTool"; 14 | import { logger } from "../utils/logger"; 15 | import { getProjectRoot } from "../utils/paths"; 16 | import { registerIndexRoute } from "./routes/index"; 17 | import { registerCancelJobRoute } from "./routes/jobs/cancel"; 18 | import { registerClearCompletedJobsRoute } from "./routes/jobs/clear-completed"; 19 | import { registerJobListRoutes } from "./routes/jobs/list"; 20 | import { registerNewJobRoutes } from "./routes/jobs/new"; 21 | import { registerLibraryDetailRoutes } from "./routes/libraries/detail"; 22 | import { registerLibrariesRoutes } from "./routes/libraries/list"; 23 | 24 | /** 25 | * Initializes the Fastify web server instance. 26 | * 27 | * @param port The port number for the web server. 28 | * @param docService The document management service instance. 29 | * @param pipelineManager The pipeline manager instance. 30 | * @returns The initialized Fastify server instance. 
31 | */ 32 | export async function startWebServer( 33 | port: number, 34 | docService: DocumentManagementService, 35 | pipelineManager: PipelineManager, 36 | ): Promise { 37 | const server = Fastify({ 38 | logger: false, // Use our own logger instead 39 | }); 40 | 41 | // Register plugins 42 | await server.register(formBody); // Register formbody to parse form data 43 | 44 | // Instantiate tools using provided services 45 | const listLibrariesTool = new ListLibrariesTool(docService); 46 | const listJobsTool = new ListJobsTool(pipelineManager); 47 | const scrapeTool = new ScrapeTool(docService, pipelineManager); 48 | const removeTool = new RemoveTool(docService, pipelineManager); 49 | const searchTool = new SearchTool(docService); 50 | const cancelJobTool = new CancelJobTool(pipelineManager); 51 | const clearCompletedJobsTool = new ClearCompletedJobsTool(pipelineManager); 52 | 53 | // Register static file serving 54 | await server.register(fastifyStatic, { 55 | // Use project root to construct absolute path to public directory 56 | root: path.join(getProjectRoot(), "public"), 57 | prefix: "/", 58 | index: false, // Disable automatic index.html serving 59 | }); 60 | 61 | // Register routes 62 | registerIndexRoute(server); // Register the root route first 63 | registerJobListRoutes(server, listJobsTool); 64 | registerNewJobRoutes(server, scrapeTool); 65 | registerCancelJobRoute(server, cancelJobTool); 66 | registerClearCompletedJobsRoute(server, clearCompletedJobsTool); 67 | registerLibrariesRoutes(server, listLibrariesTool, removeTool); 68 | registerLibraryDetailRoutes(server, listLibrariesTool, searchTool); 69 | 70 | // Graceful shutdown of services will be handled by the caller (src/index.ts) 71 | 72 | try { 73 | const address = await server.listen({ port, host: "0.0.0.0" }); 74 | logger.info(`🚀 Web UI available at ${address}`); 75 | return server; // Return the server instance 76 | } catch (error) { 77 | logger.error(`❌ Failed to start web UI: ${error}`); 78 | // 
Ensure server is closed if listen fails but initialization succeeded partially 79 | await server.close(); 80 | throw error; 81 | } 82 | } 83 | 84 | /** 85 | * Stops the provided Fastify web server instance. 86 | * 87 | * @param server - The Fastify server instance to stop. 88 | */ 89 | export async function stopWebServer(server: FastifyInstance): Promise { 90 | try { 91 | await server.close(); 92 | logger.info("🛑 Web UI stopped."); 93 | } catch (error) { 94 | logger.error(`❌ Failed to stop web server gracefully: ${error}`); 95 | // Rethrow or handle as needed, but ensure the process doesn't hang 96 | throw error; 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2022", 4 | "jsx": "react-jsx", 5 | "jsxImportSource": "@kitajs/html", 6 | "plugins": [{ "name": "@kitajs/ts-html-plugin" }], 7 | "module": "ESNext", 8 | "moduleResolution": "bundler", 9 | "noEmit": true, 10 | "esModuleInterop": true, 11 | "forceConsistentCasingInFileNames": true, 12 | "strict": true, 13 | "skipLibCheck": true, 14 | "outDir": "dist", 15 | "sourceMap": true, 16 | "declaration": true, 17 | "allowJs": true, 18 | "resolveJsonModule": true, 19 | "rootDir": "src", 20 | "types": ["@kitajs/html/htmx.d.ts", "vite/client"] 21 | }, 22 | "include": ["src/**/*"] 23 | } 24 | -------------------------------------------------------------------------------- /vite.config.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vitest/config"; 2 | import path from 'path'; 3 | import packageJson from "./package.json"; 4 | 5 | export default defineConfig({ 6 | plugins: [ 7 | ], 8 | resolve: { 9 | // Keep existing resolve extensions 10 | extensions: [".ts", ".tsx", ".js", ".jsx", ".json"], 11 | }, 12 | optimizeDeps: { 13 | force: true 14 | }, 15 | build: { 16 | 
outDir: 'dist', // Output directory 17 | sourcemap: true, // Generate sourcemaps 18 | emptyOutDir: true, // Clean the output directory before build (replaces tsup clean:true) 19 | lib: { 20 | // Define entry points using path.resolve for robustness 21 | entry: { 22 | index: path.resolve(__dirname, 'src/index.ts'), 23 | }, 24 | formats: ['es'], // Output ESM format only 25 | // Output filename will be based on the entry key (index.js) 26 | // fileName: (format, entryName) => `${entryName}.js`, 27 | }, 28 | rollupOptions: { 29 | // Externalize dependencies and node built-ins 30 | external: [ 31 | /^node:/, // Externalize all node built-ins (e.g., 'node:fs', 'node:path') 32 | ...Object.keys(packageJson.dependencies || {}), 33 | // Explicitly externalize potentially problematic packages if needed 34 | 'fingerprint-generator', 35 | 'header-generator', 36 | 'better-sqlite3', // Often needs to be external due to native bindings 37 | 'playwright', // Playwright should definitely be external 38 | 'sqlite-vec', // Likely involves native bindings 39 | ], 40 | 41 | output: { 42 | // Optional: Configure output further if needed 43 | // preserveModules: true, // Uncomment if you need to preserve source file structure 44 | // entryFileNames: '[name].js', // Adjust naming if needed 45 | }, 46 | }, 47 | // Target Node.js environment based on the version running the build 48 | target: `node${process.versions.node.split('.')[0]}`, 49 | ssr: true, // Explicitly mark this as an SSR/Node build 50 | }, 51 | test: { 52 | globals: true, 53 | environment: "node", 54 | testTimeout: 5000, 55 | include: ["src/**/*.test.ts", "src/**/*.test.tsx"], 56 | }, 57 | }); 58 | -------------------------------------------------------------------------------- /vite.config.web.ts: -------------------------------------------------------------------------------- 1 | import { defineConfig } from "vite"; 2 | import path from "node:path"; 3 | import tailwindcss from '@tailwindcss/vite' 4 | import packageJson 
from "./package.json"; 5 | 6 | // Vite configuration specifically for building frontend assets (CSS, JS) 7 | export default defineConfig({ 8 | // No need for dts plugin for frontend assets 9 | plugins: [tailwindcss()], 10 | resolve: { 11 | // Keep existing resolve extensions 12 | extensions: [".ts", ".tsx", ".js", ".jsx", ".json"], 13 | }, 14 | build: { 15 | // Output assets to public/assets, so they can be served statically 16 | outDir: path.resolve(__dirname, "public/assets"), 17 | sourcemap: true, // Generate sourcemaps for easier debugging 18 | emptyOutDir: true, // Clean the output directory before build 19 | // Define the frontend entry point 20 | lib: { 21 | entry: path.resolve(__dirname, "src/web/main.client.ts"), // Updated entry point 22 | // Use 'es' format for modern browsers 23 | formats: ["es"], 24 | // Define a fixed output filename for the JS bundle 25 | fileName: () => "main.js", 26 | }, 27 | rollupOptions: { 28 | // Unlike the backend build, we DO NOT externalize frontend dependencies 29 | // They should be bundled into main.js 30 | external: [], // Ensure no dependencies are externalized 31 | output: { 32 | // Ensure CSS is output as a separate file named main.css 33 | assetFileNames: "main.css", // Directly name the CSS output 34 | }, 35 | }, 36 | // Target modern browsers 37 | target: "esnext", 38 | // This is NOT an SSR build 39 | ssr: false, 40 | }, 41 | }); 42 | --------------------------------------------------------------------------------