├── .gitattributes ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── archive │ ├── js-sdk.yml │ ├── publish-js-sdk.yml │ ├── publish-python-sdk.yml │ ├── publish-rust-sdk.yml │ ├── python-sdk.yml │ └── rust-sdk.yml ├── dependabot.yml ├── scripts │ ├── check_version_has_incremented.py │ └── requirements.txt └── workflows │ ├── build-docker-images.yml │ ├── check-queues.yml │ └── clean-before-24h-complete-jobs.yml ├── .gitignore ├── .gitmodules ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── SELF_HOST.md ├── apps ├── api │ ├── .dockerignore │ ├── .env.example │ ├── .env.local │ ├── .gitattributes │ ├── .gitignore │ ├── Dockerfile │ ├── jest.config.js │ ├── jest.setup.js │ ├── package.json │ ├── pnpm-lock.yaml │ ├── requests.http │ ├── src │ │ ├── __tests__ │ │ │ ├── e2e_full_withAuth │ │ │ │ └── index.test.ts │ │ │ ├── e2e_noAuth │ │ │ │ └── index.test.ts │ │ │ ├── e2e_v1_withAuth │ │ │ │ └── index.test.ts │ │ │ └── e2e_withAuth │ │ │ │ └── index.test.ts │ │ ├── controllers │ │ │ ├── __tests__ │ │ │ │ └── crawl.test.ts │ │ │ ├── auth.ts │ │ │ ├── v0 │ │ │ │ ├── admin │ │ │ │ │ ├── queue.ts │ │ │ │ │ └── redis-health.ts │ │ │ │ ├── crawl-cancel.ts │ │ │ │ ├── crawl-status.ts │ │ │ │ ├── crawl.ts │ │ │ │ ├── crawlPreview.ts │ │ │ │ ├── keyAuth.ts │ │ │ │ ├── scrape.ts │ │ │ │ └── status.ts │ │ │ └── v1 │ │ │ │ ├── __tests__ │ │ │ │ └── urlValidation.test.ts │ │ │ │ ├── crawl-cancel.ts │ │ │ │ ├── crawl-status.ts │ │ │ │ ├── crawl.ts │ │ │ │ ├── liveness.ts │ │ │ │ ├── map.ts │ │ │ │ ├── readiness.ts │ │ │ │ ├── scrape-status.ts │ │ │ │ ├── scrape.ts │ │ │ │ └── types.ts │ │ ├── index.ts │ │ ├── lib │ │ │ ├── __tests__ │ │ │ │ ├── html-to-markdown.test.ts │ │ │ │ └── job-priority.test.ts │ │ │ ├── batch-process.ts │ │ │ ├── crawl-redis.ts │ │ │ ├── custom-error.ts │ │ │ ├── default-values.ts │ │ │ ├── entities.ts │ │ │ ├── go-html-to-md │ │ │ │ ├── README.md │ │ │ │ ├── go.mod │ │ │ │ ├── go.sum │ │ │ │ └── html-to-markdown.go │ │ │ ├── html-to-markdown.ts │ │ │ ├── job-priority.ts │ │ │ ├── logger.ts │ │ │ ├── parse-mode.ts │ │ │ ├── parseApi.ts │ │ │ ├── timeout.ts │ │ │ ├── validateUrl.test.ts │ │ │ ├── validateUrl.ts │ │ │ └── withAuth.ts │ │ ├── main │ │ │ └── runWebScraper.ts │ │ ├── openapi │ │ │ └── index.ts │ │ ├── routes │ │ │ ├── admin.ts │ │ │ └── v1.ts │ │ ├── run-req.ts │ │ ├── scraper │ │ │ └── WebScraper │ │ │ │ ├── __tests__ │ │ │ │ ├── crawler.test.ts │ │ │ │ └── dns.test.ts │ │ │ │ ├── crawler.ts │ │ │ │ ├── global.ts │ │ │ │ ├── index.ts │ │ │ │ ├── scrapers │ │ │ │ ├── fetch.ts │ │ │ │ └── playwright.ts │ │ │ │ ├── single_url.ts │ │ │ │ ├── sitemap.ts │ │ │ │ └── utils │ │ │ │ ├── __tests__ │ │ │ │ ├── maxDepthUtils.test.ts │ │ │ │ ├── parseTable.test.ts │ │ │ │ ├── removeUnwantedElements.test.ts │ │ │ │ └── replacePaths.test.ts │ │ │ │ ├── custom │ │ │ │ └── website_params.ts │ │ │ │ ├── excludeTags.ts │ │ │ │ ├── maxDepthUtils.ts │ │ │ │ ├── metadata.ts │ │ │ │ ├── parseTable.ts │ │ │ │ ├── removeUnwantedElements.ts │ │ │ │ ├── replacePaths.ts │ │ │ │ └── utils.ts │ │ ├── scripts │ │ │ └── generate-openapi.ts │ │ ├── services │ │ │ ├── alerts │ │ │ │ └── index.ts │ │ │ ├── idempotency │ │ │ │ └── create.ts │ │ │ ├── logtail.ts │ │ │ ├── queue-jobs.ts │ │ │ ├── queue-service.ts │ │ │ ├── queue-worker.ts │ │ │ ├── rate-limiter.test.ts │ │ │ ├── rate-limiter.ts │ │ │ ├── redis.ts │ │ │ ├── redlock.ts │ │ │ └── system-monitor.ts │ │ ├── strings.ts │ │ ├── supabase_types.ts │ │ └── types.ts │ ├── tsconfig.json │ └── v1-openapi.json ├── 
puppeteer-service-ts │ ├── .dockerignore │ ├── Dockerfile │ ├── README.md │ ├── api.ts │ ├── helpers │ │ └── get_error.ts │ ├── openapi │ │ └── index.ts │ ├── package.json │ ├── pnpm-lock.yaml │ └── tsconfig.json └── test-suite │ ├── .env.example │ ├── README.md │ ├── data │ ├── crawl.json │ └── scrape.json │ ├── jest.config.js │ ├── jest.setup.js │ ├── load-test-results │ ├── tests-1-5 │ │ ├── assets │ │ │ ├── CPU-utilization-report-test-1.png │ │ │ ├── memory-utilization-report-test-1.png │ │ │ ├── metrics-test-2.png │ │ │ ├── metrics-test-3.png │ │ │ ├── metrics-test-4.png │ │ │ └── metrics-test-5.png │ │ ├── load-test-1.md │ │ ├── load-test-2.md │ │ ├── load-test-3.md │ │ ├── load-test-4.md │ │ └── load-test-5.md │ └── tests-6-7 │ │ ├── assets │ │ ├── metrics-fire-engine-test-7-2.png │ │ ├── metrics-fire-engine-test-7.png │ │ ├── metrics-fire-engine-test-8.png │ │ ├── metrics-test-6.png │ │ ├── metrics-test-7.png │ │ └── metrics-test-8.png │ │ └── load-test-6.md │ ├── load-test.yml │ ├── package.json │ ├── pnpm-lock.yaml │ ├── tsconfig.json │ └── utils │ ├── misc.ts │ └── types.ts ├── docker-compose.yaml └── img └── firecrawl_logo.png /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[Bug] " 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the Bug** 11 | Provide a clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the issue: 15 | 1. Configure the environment or settings with '...' 16 | 2. Run the command '...' 17 | 3. Observe the error or unexpected output at '...' 18 | 4. Log output/error message 19 | 20 | **Screenshots** 21 | If applicable, add screenshots or copies of the command line output to help explain the issue. 22 | 23 | **Logs** 24 | If applicable, include detailed logs to help understand the problem. 25 | 26 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[Feat] " 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Proposed Feature** 11 | Provide a clear and concise description of the feature you would like implemented. 12 | 13 | **Use Case** 14 | Explain how this feature would be used and what benefits it would bring. Include specific examples to illustrate how this would improve functionality or user experience. 
15 | -------------------------------------------------------------------------------- /.github/archive/js-sdk.yml: -------------------------------------------------------------------------------- 1 | name: Run JavaScript SDK E2E Tests 2 | 3 | on: [] 4 | 5 | env: 6 | BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} 7 | HOST: ${{ secrets.HOST }} 8 | NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} 9 | PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} 10 | PORT: ${{ secrets.PORT }} 11 | REDIS_URL: ${{ secrets.REDIS_URL }} 12 | TEST_API_KEY: ${{ secrets.TEST_API_KEY }} 13 | HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }} 14 | HDX_NODE_BETA_MODE: 1 15 | 16 | jobs: 17 | build: 18 | runs-on: ubuntu-latest 19 | services: 20 | redis: 21 | image: redis 22 | ports: 23 | - 6379:6379 24 | 25 | steps: 26 | - uses: actions/checkout@v3 27 | - name: Set up Node.js 28 | uses: actions/setup-node@v3 29 | with: 30 | node-version: "20" 31 | - name: Install pnpm 32 | run: npm install -g pnpm 33 | - name: Install dependencies for API 34 | run: pnpm install 35 | working-directory: ./apps/api 36 | - name: Start the application 37 | run: npm start & 38 | working-directory: ./apps/api 39 | - name: Start workers 40 | run: npm run workers & 41 | working-directory: ./apps/api 42 | - name: Install dependencies for JavaScript SDK 43 | run: pnpm install 44 | working-directory: ./apps/js-sdk/firecrawl 45 | - name: Run E2E tests for JavaScript SDK 46 | run: npm run test 47 | working-directory: ./apps/js-sdk/firecrawl -------------------------------------------------------------------------------- /.github/archive/publish-js-sdk.yml: -------------------------------------------------------------------------------- 1 | name: Publish JavaScript SDK 2 | 3 | on: [] 4 | 5 | env: 6 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }} 7 | 8 | jobs: 9 | build-and-publish: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v3 14 | - name: Set up Node.js 15 | uses: actions/setup-node@v3 16 | with: 17 | node-version: '20' 18 | registry-url: 'https://registry.npmjs.org/' 19 | scope: '@mendable' 20 | always-auth: true 21 | 22 | - name: Install pnpm 23 | run: npm install -g pnpm 24 | 25 | - name: Install python for running version check script 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install setuptools wheel requests packaging 29 | 30 | - name: Install dependencies for JavaScript SDK 31 | run: pnpm install 32 | working-directory: ./apps/js-sdk/firecrawl 33 | 34 | - name: Run version check script 35 | id: version_check_script 36 | run: | 37 | VERSION_INCREMENTED=$(python .github/scripts/check_version_has_incremented.py js ./apps/js-sdk/firecrawl @mendable/firecrawl-js) 38 | echo "VERSION_INCREMENTED=$VERSION_INCREMENTED" >> $GITHUB_ENV 39 | 40 | - name: Build and publish to npm 41 | if: ${{ env.VERSION_INCREMENTED == 'true' }} 42 | env: 43 | NODE_AUTH_TOKEN: ${{ secrets.NPM_TOKEN }} 44 | run: | 45 | npm run build-and-publish 46 | working-directory: ./apps/js-sdk/firecrawl 47 | -------------------------------------------------------------------------------- /.github/archive/publish-python-sdk.yml: -------------------------------------------------------------------------------- 1 | name: Publish Python SDK 2 | 3 | on: [] 4 | 5 | env: 6 | PYPI_USERNAME: ${{ secrets.PYPI_USERNAME }} 7 | PYPI_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 8 | 9 | jobs: 10 | build-and-publish: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Checkout repository 15 | uses: actions/checkout@v3 16 | 17 | - 
name: Set up Python 18 | uses: actions/setup-python@v4 19 | with: 20 | python-version: '3.x' 21 | 22 | - name: Install dependencies 23 | run: | 24 | python -m pip install --upgrade pip 25 | pip install setuptools wheel twine build requests packaging 26 | 27 | - name: Run version check script 28 | id: version_check_script 29 | run: | 30 | VERSION_INCREMENTED=$(python .github/scripts/check_version_has_incremented.py python ./apps/python-sdk/firecrawl firecrawl-py) 31 | echo "VERSION_INCREMENTED=$VERSION_INCREMENTED" >> $GITHUB_ENV 32 | 33 | - name: Build the package 34 | if: ${{ env.VERSION_INCREMENTED == 'true' }} 35 | run: | 36 | python -m build 37 | working-directory: ./apps/python-sdk 38 | 39 | - name: Publish to PyPI 40 | if: ${{ env.VERSION_INCREMENTED == 'true' }} 41 | env: 42 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 43 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 44 | run: | 45 | twine upload dist/* 46 | working-directory: ./apps/python-sdk 47 | 48 | -------------------------------------------------------------------------------- /.github/archive/publish-rust-sdk.yml: -------------------------------------------------------------------------------- 1 | name: Publish Rust SDK 2 | 3 | on: [] 4 | 5 | env: 6 | CRATES_IO_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} 7 | 8 | jobs: 9 | build-and-publish: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout repository 14 | uses: actions/checkout@v3 15 | 16 | - name: Set up Rust 17 | uses: actions-rs/toolchain@v1 18 | with: 19 | toolchain: stable 20 | default: true 21 | profile: minimal 22 | 23 | - name: Install dependencies 24 | run: cargo build --release 25 | 26 | - name: Run version check script 27 | id: version_check_script 28 | run: | 29 | VERSION_INCREMENTED=$(cargo search --limit 1 my_crate_name | grep my_crate_name) 30 | echo "VERSION_INCREMENTED=$VERSION_INCREMENTED" >> $GITHUB_ENV 31 | 32 | - name: Build the package 33 | if: ${{ env.VERSION_INCREMENTED == 'true' }} 34 | run: cargo package 35 | working-directory: ./apps/rust-sdk 36 | 37 | - name: Publish to crates.io 38 | if: ${{ env.VERSION_INCREMENTED == 'true' }} 39 | env: 40 | CARGO_REG_TOKEN: ${{ secrets.CRATES_IO_TOKEN }} 41 | run: cargo publish 42 | working-directory: ./apps/rust-sdk -------------------------------------------------------------------------------- /.github/archive/python-sdk.yml: -------------------------------------------------------------------------------- 1 | name: Run Python SDK E2E Tests 2 | 3 | on: [] 4 | 5 | env: 6 | BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} 7 | HOST: ${{ secrets.HOST }} 8 | NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} 9 | PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} 10 | PORT: ${{ secrets.PORT }} 11 | REDIS_URL: ${{ secrets.REDIS_URL }} 12 | TEST_API_KEY: ${{ secrets.TEST_API_KEY }} 13 | HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }} 14 | HDX_NODE_BETA_MODE: 1 15 | 16 | jobs: 17 | build: 18 | runs-on: ubuntu-latest 19 | strategy: 20 | matrix: 21 | python-version: ["3.10"] 22 | services: 23 | redis: 24 | image: redis 25 | ports: 26 | - 6379:6379 27 | 28 | steps: 29 | - uses: actions/checkout@v3 30 | - name: Set up Node.js 31 | uses: actions/setup-node@v3 32 | with: 33 | node-version: "20" 34 | - name: Install pnpm 35 | run: npm install -g pnpm 36 | - name: Install dependencies for API 37 | run: pnpm install 38 | working-directory: ./apps/api 39 | - name: Start the application 40 | run: npm start & 41 | working-directory: ./apps/api 42 | id: start_app 43 | - name: Start workers 44 
| run: npm run workers & 45 | working-directory: ./apps/api 46 | id: start_workers 47 | - name: Set up Python ${{ matrix.python-version }} 48 | uses: actions/setup-python@v4 49 | with: 50 | python-version: ${{ matrix.python-version }} 51 | - name: Install Python dependencies 52 | run: | 53 | python -m pip install --upgrade pip 54 | pip install -r requirements.txt 55 | working-directory: ./apps/python-sdk 56 | - name: Run E2E tests for Python SDK 57 | run: | 58 | pytest firecrawl/__tests__/e2e_withAuth/test.py 59 | working-directory: ./apps/python-sdk 60 | -------------------------------------------------------------------------------- /.github/archive/rust-sdk.yml: -------------------------------------------------------------------------------- 1 | name: Run Rust SDK E2E Tests 2 | 3 | on: [] 4 | 5 | env: 6 | BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} 7 | HOST: ${{ secrets.HOST }} 8 | NUM_WORKERS_PER_QUEUE: ${{ secrets.NUM_WORKERS_PER_QUEUE }} 9 | PLAYWRIGHT_MICROSERVICE_URL: ${{ secrets.PLAYWRIGHT_MICROSERVICE_URL }} 10 | PORT: ${{ secrets.PORT }} 11 | REDIS_URL: ${{ secrets.REDIS_URL }} 12 | TEST_API_KEY: ${{ secrets.TEST_API_KEY }} 13 | HYPERDX_API_KEY: ${{ secrets.HYPERDX_API_KEY }} 14 | HDX_NODE_BETA_MODE: 1 15 | 16 | 17 | jobs: 18 | build: 19 | runs-on: ubuntu-latest 20 | services: 21 | redis: 22 | image: redis 23 | ports: 24 | - 6379:6379 25 | 26 | steps: 27 | - name: Checkout repository 28 | uses: actions/checkout@v3 29 | - name: Install pnpm 30 | run: npm install -g pnpm 31 | - name: Install dependencies for API 32 | run: pnpm install 33 | working-directory: ./apps/api 34 | - name: Start the application 35 | run: npm start & 36 | working-directory: ./apps/api 37 | id: start_app 38 | - name: Start workers 39 | run: npm run workers & 40 | working-directory: ./apps/api 41 | id: start_workers 42 | - name: Set up Rust 43 | uses: actions/setup-rust@v1 44 | with: 45 | rust-version: stable 46 | - name: Try the lib build 47 | working-directory: ./apps/rust-sdk 48 | run: cargo build 49 | - name: Run E2E tests for Rust SDK 50 | run: cargo test --test e2e_with_auth 51 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | # playwright-service 4 | - package-ecosystem: "pip" 5 | directory: "/apps/playwright-service" 6 | schedule: 7 | interval: "weekly" 8 | open-pull-requests-limit: 0 # Disable version updates 9 | security-updates: "all" 10 | commit-message: 11 | prefix: "apps/playwright-service" 12 | include: "scope" 13 | 14 | # python-sdk 15 | - package-ecosystem: "pip" 16 | directory: "/apps/python-sdk" 17 | schedule: 18 | interval: "weekly" 19 | open-pull-requests-limit: 0 # Disable version updates 20 | security-updates: "all" 21 | commit-message: 22 | prefix: "apps/python-sdk" 23 | include: "scope" 24 | 25 | # api 26 | - package-ecosystem: "npm" 27 | directory: "/apps/api" 28 | schedule: 29 | interval: "weekly" 30 | open-pull-requests-limit: 0 # Disable version updates 31 | security-updates: "all" 32 | commit-message: 33 | prefix: "apps/api" 34 | include: "scope" 35 | 36 | # test-suite 37 | - package-ecosystem: "npm" 38 | directory: "/apps/test-suite" 39 | schedule: 40 | interval: "weekly" 41 | open-pull-requests-limit: 0 # Disable version updates 42 | security-updates: "all" 43 | commit-message: 44 | prefix: "apps/test-suite" 45 | include: "scope" -------------------------------------------------------------------------------- 
/.github/scripts/check_version_has_incremented.py: -------------------------------------------------------------------------------- 1 | """ 2 | checks local versions against published versions. 3 | 4 | # Usage: 5 | 6 | python .github/scripts/check_version_has_incremented.py js ./apps/js-sdk/firecrawl @mendable/firecrawl-js 7 | Local version: 0.0.22 8 | Published version: 0.0.21 9 | true 10 | 11 | python .github/scripts/check_version_has_incremented.py python ./apps/python-sdk/firecrawl firecrawl-py 12 | Local version: 0.0.11 13 | Published version: 0.0.11 14 | false 15 | 16 | """ 17 | import json 18 | import toml 19 | import os 20 | import re 21 | import sys 22 | from pathlib import Path 23 | 24 | import requests 25 | from packaging.version import Version 26 | from packaging.version import parse as parse_version 27 | 28 | 29 | def get_python_version(file_path: str) -> str: 30 | """Extract version string from Python file.""" 31 | version_file = Path(file_path).read_text() 32 | version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", version_file, re.M) 33 | if version_match: 34 | return version_match.group(1).strip() 35 | raise RuntimeError("Unable to find version string.") 36 | 37 | def get_pypi_version(package_name: str) -> str: 38 | """Get latest version of Python package from PyPI.""" 39 | response = requests.get(f"https://pypi.org/pypi/{package_name}/json") 40 | version = response.json()['info']['version'] 41 | return version.strip() 42 | 43 | def get_js_version(file_path: str) -> str: 44 | """Extract version string from package.json.""" 45 | with open(file_path, 'r') as file: 46 | package_json = json.load(file) 47 | if 'version' in package_json: 48 | return package_json['version'].strip() 49 | raise RuntimeError("Unable to find version string in package.json.") 50 | 51 | def get_npm_version(package_name: str) -> str: 52 | """Get latest version of JavaScript package from npm.""" 53 | response = requests.get(f"https://registry.npmjs.org/{package_name}/latest") 54 | version = response.json()['version'] 55 | return version.strip() 56 | 57 | def get_rust_version(file_path: str) -> str: 58 | """Extract version string from Cargo.toml.""" 59 | cargo_toml = toml.load(file_path) 60 | if 'package' in cargo_toml and 'version' in cargo_toml['package']: 61 | return cargo_toml['package']['version'].strip() 62 | raise RuntimeError("Unable to find version string in Cargo.toml.") 63 | 64 | def get_crates_version(package_name: str) -> str: 65 | """Get latest version of Rust package from crates.io.""" 66 | response = requests.get(f"https://crates.io/api/v1/crates/{package_name}") 67 | version = response.json()['crate']['newest_version'] 68 | return version.strip() 69 | 70 | def is_version_incremented(local_version: str, published_version: str) -> bool: 71 | """Compare local and published versions.""" 72 | local_version_parsed: Version = parse_version(local_version) 73 | published_version_parsed: Version = parse_version(published_version) 74 | return local_version_parsed > published_version_parsed 75 | 76 | if __name__ == "__main__": 77 | package_type = sys.argv[1] 78 | package_path = sys.argv[2] 79 | package_name = sys.argv[3] 80 | 81 | if package_type == "python": 82 | # Get current version from __init__.py 83 | current_version = get_python_version(os.path.join(package_path, '__init__.py')) 84 | # Get published version from PyPI 85 | published_version = get_pypi_version(package_name) 86 | elif package_type == "js": 87 | # Get current version from package.json 88 | current_version = 
get_js_version(os.path.join(package_path, 'package.json')) 89 | # Get published version from npm 90 | published_version = get_npm_version(package_name) 91 | elif package_type == "rust": 92 | # Get current version from Cargo.toml 93 | current_version = get_rust_version(os.path.join(package_path, 'Cargo.toml')) 94 | # Get published version from crates.io 95 | published_version = get_crates_version(package_name) 96 | 97 | else: 98 | raise ValueError("Invalid package type. Use 'python', 'js' or 'rust'.") 99 | 100 | # Print versions for debugging 101 | # print(f"Local version: {current_version}") 102 | # print(f"Published version: {published_version}") 103 | 104 | # Compare versions and print result 105 | if is_version_incremented(current_version, published_version): 106 | print("true") 107 | else: 108 | print("false") 109 | -------------------------------------------------------------------------------- /.github/scripts/requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | packaging 3 | toml -------------------------------------------------------------------------------- /.github/workflows/build-docker-images.yml: -------------------------------------------------------------------------------- 1 | name: Build Docker Images 2 | 3 | on: 4 | pull_request: 5 | workflow_dispatch: 6 | 7 | jobs: 8 | build-api: 9 | name: Build API Image 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout the repo 13 | uses: actions/checkout@v4 14 | 15 | - name: Setup buildx 16 | uses: docker/setup-buildx-action@v3 17 | 18 | - name: Build Docker image 19 | uses: docker/build-push-action@v5 20 | with: 21 | context: ./apps/api 22 | file: ./apps/api/Dockerfile 23 | push: false 24 | load: true 25 | tags: trieve/firecrawl:pr-${{ github.event.number }} 26 | build-args: | 27 | PORT=8080 28 | 29 | build-puppeteer: 30 | name: Build Puppeteer Service 31 | runs-on: ubuntu-latest 32 | steps: 33 | - name: Checkout the repo 34 | uses: actions/checkout@v4 35 | 36 | - name: Setup buildx 37 | uses: docker/setup-buildx-action@v3 38 | 39 | - name: Build Docker image 40 | uses: docker/build-push-action@v5 41 | with: 42 | context: ./apps/puppeteer-service-ts/ 43 | file: ./apps/puppeteer-service-ts/Dockerfile 44 | push: false 45 | load: true 46 | tags: trieve/puppeteer-service-ts:pr-${{ github.event.number }} 47 | build-args: | 48 | PORT=3000 49 | -------------------------------------------------------------------------------- /.github/workflows/check-queues.yml: -------------------------------------------------------------------------------- 1 | name: Check Queues 2 | on: 3 | schedule: 4 | - cron: '*/5 * * * *' 5 | 6 | env: 7 | BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} 8 | 9 | jobs: 10 | clean-jobs: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Send GET request to check queues 14 | run: | 15 | response=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/check-queues) 16 | if [ "$response" -ne 200 ]; then 17 | echo "Failed to check queues. Response: $response" 18 | exit 1 19 | fi 20 | echo "Successfully checked queues. 
Response: $response" 21 | -------------------------------------------------------------------------------- /.github/workflows/clean-before-24h-complete-jobs.yml: -------------------------------------------------------------------------------- 1 | name: Clean Every 30 Minutes Before 24h Completed Jobs 2 | on: 3 | schedule: 4 | - cron: '30 * * * *' 5 | 6 | env: 7 | BULL_AUTH_KEY: ${{ secrets.BULL_AUTH_KEY }} 8 | 9 | jobs: 10 | clean-jobs: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Send GET request to clean jobs 14 | run: | 15 | response=$(curl --write-out '%{http_code}' --silent --output /dev/null --max-time 180 https://api.firecrawl.dev/admin/${{ secrets.BULL_AUTH_KEY }}/clean-before-24h-complete-jobs) 16 | if [ "$response" -ne 200 ]; then 17 | echo "Failed to clean jobs. Response: $response" 18 | exit 1 19 | fi 20 | echo "Successfully cleaned jobs. Response: $response" 21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | node_modules 3 | dist 4 | .env 5 | *.csv 6 | dump.rdb 7 | apps/js-sdk/node_modules/ 8 | 9 | apps/api/.env.local 10 | 11 | apps/test-suite/node_modules/ 12 | 13 | 14 | apps/test-suite/.env 15 | apps/test-suite/logs 16 | apps/test-suite/load-test-results/test-run-report.json 17 | 18 | apps/playwright-service-ts/node_modules/ 19 | apps/playwright-service-ts/package-lock.json 20 | 21 | 22 | /examples/o1_web_crawler/venv 23 | *.pyc 24 | .rdb 25 | 26 | apps/js-sdk/firecrawl/dist 27 | 28 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "apps/go-sdk/firecrawl-go"] 2 | path = apps/go-sdk/firecrawl-go 3 | url = https://github.com/mendableai/firecrawl-go 4 | [submodule "apps/go-sdk/firecrawl-go-examples"] 5 | path = apps/go-sdk/firecrawl-go-examples 6 | url = https://github.com/mendableai/firecrawl-go-examples 7 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributors guide: 2 | 3 | Welcome to [Firecrawl](https://firecrawl.dev) 🔥! Here are some instructions on how to get the project running locally, so you can run it on your own (and contribute). 4 | 5 | If you're contributing, note that the process is similar to other open-source repos, i.e. fork firecrawl, make changes, run tests, open a PR. If you have any questions or would like help getting on board, reach out to hello@mendable.ai or submit an issue! 6 | 7 | ## Running the project locally 8 | 9 | First, start by installing dependencies: 10 | 11 | 1. node.js [instructions](https://nodejs.org/en/learn/getting-started/how-to-install-nodejs) 12 | 2. pnpm [instructions](https://pnpm.io/installation) 13 | 3. redis [instructions](https://redis.io/docs/latest/operate/oss_and_stack/install/install-redis/) 14 | 15 | Set environment variables in a .env file in the /apps/api/ directory; you can copy over the template in .env.example. 
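For example (a minimal sketch, assuming a Unix-like shell at the repository root; edit the copied file afterwards as described below):

```bash
# Copy the example environment file and adjust the values for your local setup
cp apps/api/.env.example apps/api/.env
```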
16 | 17 | To start, we won't set up authentication or any optional sub-services (PDF parsing, JS blocking support, AI features). 18 | 19 | .env: 20 | 21 | ``` 22 | # ===== Required ENVS ====== 23 | NUM_WORKERS_PER_QUEUE=8 24 | PORT=3002 25 | HOST=0.0.0.0 26 | REDIS_URL=redis://localhost:6379 27 | REDIS_RATE_LIMIT_URL=redis://localhost:6379 28 | 29 | # ===== Optional ENVS ====== 30 | 31 | # Other Optionals 32 | TEST_API_KEY= # use if you've set up authentication and want to test with a real API key 33 | BULL_AUTH_KEY= @ 34 | PLAYWRIGHT_MICROSERVICE_URL= # set if you'd like to run a playwright fallback 35 | ``` 36 | 37 | ### Installing dependencies 38 | 39 | First, install the dependencies using pnpm. 40 | 41 | ```bash 42 | pnpm install 43 | ``` 44 | 45 | ### Running the project 46 | 47 | You're going to need to open 3 terminals. 48 | 49 | ### Terminal 1 - setting up redis 50 | 51 | Run the command anywhere within your project: 52 | 53 | ```bash 54 | redis-server 55 | ``` 56 | 57 | ### Terminal 2 - setting up workers 58 | 59 | Now, navigate to the apps/api/ directory and run: 60 | 61 | ```bash 62 | pnpm run workers 63 | ``` 64 | 65 | This will start the workers that are responsible for processing crawl jobs. 66 | 67 | ### Terminal 3 - setting up the main server 68 | 69 | Navigate to the apps/api/ directory. If you don't have pnpm installed already, install it here: https://pnpm.io/installation 70 | Next, run your server with: 71 | 72 | ```bash 73 | pnpm run start 74 | ``` 75 | 76 | ### Sending our first request 77 | 78 | Alright: now let's send our first request. 79 | 80 | ```bash 81 | curl -X GET http://localhost:3002/test 82 | ``` 83 | 84 | This should return the response Hello, world! 85 | 86 | If you'd like to test the crawl endpoint, you can run this: 87 | 88 | ```bash 89 | curl -X POST http://localhost:3002/v1/crawl \ 90 | -H 'Content-Type: application/json' \ 91 | -d '{ 92 | "url": "https://mendable.ai" 93 | }' 94 | ``` 95 | 96 | ## Tests: 97 | 98 | The best way to do this is to run the tests with `npm run test:local-no-auth` if you'd like to run them without authentication. 99 | 100 | If you'd like to run the tests with authentication, run `npm run test:prod`. 101 | -------------------------------------------------------------------------------- /apps/api/.dockerignore: -------------------------------------------------------------------------------- 1 | /node_modules/ 2 | /dist/ 3 | .env 4 | *.csv 5 | -------------------------------------------------------------------------------- /apps/api/.env.example: -------------------------------------------------------------------------------- 1 | # ===== Required ENVS ====== 2 | NUM_WORKERS_PER_QUEUE=8 3 | PORT=3002 4 | HOST=0.0.0.0 5 | REDIS_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379 6 | REDIS_RATE_LIMIT_URL=redis://redis:6379 #for self-hosting using docker, use redis://redis:6379. For running locally, use redis://localhost:6379 7 | PLAYWRIGHT_MICROSERVICE_URL=http://playwright-service:3000/html 8 | 9 | # ===== Optional ENVS ====== 10 | 11 | # Other Optionals 12 | # use if you've set up authentication and want to test with a real API key 13 | TEST_API_KEY= 14 | # set if you'd like to test the scraping rate limit 15 | RATE_LIMIT_TEST_API_KEY_SCRAPE= 16 | # set if you'd like to test the crawling rate limit 17 | RATE_LIMIT_TEST_API_KEY_CRAWL= 18 | # add for LLM dependent features (image alt generation, etc.) 
19 | BULL_AUTH_KEY=@ 20 | 21 | STRIPE_PRICE_ID_STANDARD= 22 | STRIPE_PRICE_ID_SCALE= 23 | STRIPE_PRICE_ID_STARTER= 24 | STRIPE_PRICE_ID_HOBBY= 25 | STRIPE_PRICE_ID_HOBBY_YEARLY= 26 | STRIPE_PRICE_ID_STANDARD_NEW= 27 | STRIPE_PRICE_ID_STANDARD_NEW_YEARLY= 28 | STRIPE_PRICE_ID_GROWTH= 29 | STRIPE_PRICE_ID_GROWTH_YEARLY= 30 | 31 | HYPERDX_API_KEY= 32 | HDX_NODE_BETA_MODE=1 33 | 34 | # Proxy Settings for Playwright (Alternatively, you can use a proxy service like Oxylabs, which rotates IPs for you on every request) 35 | PROXY_SERVER= 36 | PROXY_USERNAME= 37 | PROXY_PASSWORD= 38 | 39 | # 2captcha token if you want to enable captcha solves with the puppeteer service 40 | TWOCAPTCHA_TOKEN= 41 | 42 | # Maximum number of parallel workers. Defaults to 1. 43 | MAX_CONCURRENCY=20 44 | 45 | # LOGGING_LEVEL determines the verbosity of logs that the system will output. 46 | # Available levels are: 47 | # NONE - No logs will be output. 48 | # ERROR - For logging error messages that indicate a failure in a specific operation. 49 | # WARN - For logging potentially harmful situations that are not necessarily errors. 50 | # INFO - For logging informational messages that highlight the progress of the application. 51 | # DEBUG - For logging detailed information on the flow through the system, primarily used for debugging. 52 | # TRACE - For logging more detailed information than the DEBUG level. 53 | # Set LOGGING_LEVEL to one of the above options to control logging output. 54 | LOGGING_LEVEL=INFO 55 | -------------------------------------------------------------------------------- /apps/api/.env.local: -------------------------------------------------------------------------------- 1 | NUM_WORKERS_PER_QUEUE=8 2 | PORT= 3 | HOST= 4 | SUPABASE_ANON_TOKEN= 5 | SUPABASE_URL= 6 | SUPABASE_SERVICE_TOKEN= 7 | REDIS_URL= 8 | REDIS_RATE_LIMIT_URL= 9 | SCRAPING_BEE_API_KEY= 10 | OPENAI_API_KEY= 11 | ANTHROPIC_API_KEY= 12 | BULL_AUTH_KEY= 13 | LOGTAIL_KEY= 14 | PLAYWRIGHT_MICROSERVICE_URL= 15 | 16 | -------------------------------------------------------------------------------- /apps/api/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /apps/api/.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules/ 2 | /dist/ 3 | .env 4 | *.csv 5 | dump.rdb 6 | 7 | /.next/ 8 | 9 | .rdb 10 | .sentryclirc 11 | -------------------------------------------------------------------------------- /apps/api/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:20-slim AS base 2 | 3 | # Create app directory 4 | WORKDIR /app 5 | 6 | # Install pnpm 7 | RUN npm install -g pnpm corepack@latest 8 | RUN corepack enable 9 | 10 | # Copy package files 11 | COPY ./package.json ./pnpm-lock.yaml ./ 12 | 13 | # Install dependencies 14 | RUN pnpm install --frozen-lockfile 15 | 16 | # Install necessary build dependencies 17 | RUN apt-get update -qq && \ 18 | apt-get install -y \ 19 | ca-certificates \ 20 | git \ 21 | golang-go \ 22 | && update-ca-certificates 23 | 24 | # Copy the rest of the application 25 | COPY . 
./ 26 | 27 | # Build Go module 28 | COPY ./src/lib/go-html-to-md/ ./src/lib/go-html-to-md/ 29 | RUN cd src/lib/go-html-to-md && \ 30 | go mod tidy && \ 31 | go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go && \ 32 | chmod +x html-to-markdown.so 33 | 34 | # Build the application 35 | RUN pnpm run build 36 | 37 | # Install runtime dependencies 38 | RUN apt-get install --no-install-recommends -y \ 39 | chromium \ 40 | chromium-sandbox \ 41 | && rm -rf /var/lib/apt/lists/* /var/cache/apt/archives 42 | 43 | # Environment setup 44 | ENV PUPPETEER_EXECUTABLE_PATH="/usr/bin/chromium" 45 | ARG PORT=8080 46 | ENV PORT=${PORT} 47 | EXPOSE ${PORT} 48 | 49 | CMD ["pnpm", "start"] 50 | -------------------------------------------------------------------------------- /apps/api/jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | preset: "ts-jest", 3 | testEnvironment: "node", 4 | setupFiles: ["./jest.setup.js"], 5 | // ignore dist folder root dir 6 | modulePathIgnorePatterns: ["/dist/"], 7 | 8 | }; 9 | -------------------------------------------------------------------------------- /apps/api/jest.setup.js: -------------------------------------------------------------------------------- 1 | global.fetch = require('jest-fetch-mock'); 2 | -------------------------------------------------------------------------------- /apps/api/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "firecrawl-scraper-js", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "start": "nodemon --exec ts-node src/index.ts", 8 | "start:production": "tsc && node dist/src/index.js", 9 | "format": "prettier --write \"src/**/*.(js|ts)\"", 10 | "start:dev": "nodemon --exec ts-node src/index.ts", 11 | "build": "tsc", 12 | "test": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_noAuth/*'", 13 | "test:local-no-auth": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathIgnorePatterns='src/__tests__/e2e_withAuth/*'", 14 | "workers": "nodemon --exec ts-node src/services/queue-worker.ts", 15 | "worker:production": "node dist/src/services/queue-worker.js" 16 | }, 17 | "author": "", 18 | "license": "ISC", 19 | "devDependencies": { 20 | "@jest/globals": "^29.7.0", 21 | "@tsconfig/recommended": "^1.0.3", 22 | "@types/body-parser": "^1.19.2", 23 | "@types/cors": "^2.8.13", 24 | "@types/express": "^4.17.17", 25 | "@types/jest": "^29.5.12", 26 | "@types/node": "^20.17.6", 27 | "@types/swagger-jsdoc": "^6.0.4", 28 | "@types/uuid": "^10.0.0", 29 | "body-parser": "^1.20.1", 30 | "express": "^4.18.2", 31 | "jest": "^29.6.3", 32 | "jest-fetch-mock": "^3.0.3", 33 | "nodemon": "^3.1.7", 34 | "supertest": "^6.3.3", 35 | "ts-jest": "^29.1.1", 36 | "ts-node": "^10.9.1", 37 | "typescript": "^5.4.2" 38 | }, 39 | "dependencies": { 40 | "@types/express-ws": "^3.0.4", 41 | "@types/ws": "^8.5.12", 42 | "async": "^3.2.5", 43 | "async-mutex": "^0.5.0", 44 | "axios": "^1.3.4", 45 | "bullmq": "^5.11.0", 46 | "cacheable-lookup": "^6.1.0", 47 | "cheerio": "^1.0.0-rc.12", 48 | "cors": "^2.8.5", 49 | "dotenv": "^16.3.1", 50 | "express-ws": "^5.0.2", 51 | "glob": "^10.4.2", 52 | "ioredis": "^5.4.1", 53 | "joplin-turndown-plugin-gfm": "^1.0.12", 54 | "koffi": "^2.9.0", 55 | "puppeteer": "^22.12.1", 56 | "rate-limiter-flexible": "2.4.2", 57 | "redlock": 
"5.0.0-beta.2", 58 | "redoc-express": "^2.1.0", 59 | "robots-parser": "^3.0.1", 60 | "swagger-jsdoc": "^6.2.8", 61 | "systeminformation": "^5.22.11", 62 | "turndown": "^7.1.3", 63 | "turndown-plugin-gfm": "^1.0.2", 64 | "uuid": "^10.0.0", 65 | "ws": "^8.18.0", 66 | "xml2js": "^0.6.2", 67 | "zod": "^3.23.8" 68 | }, 69 | "nodemonConfig": { 70 | "ignore": [ 71 | "*.docx", 72 | "*.json", 73 | "temp" 74 | ] 75 | }, 76 | "packageManager": "pnpm@9.12.3+sha512.cce0f9de9c5a7c95bef944169cc5dfe8741abfb145078c0d508b868056848a87c81e626246cb60967cbd7fd29a6c062ef73ff840d96b3c86c40ac92cf4a813ee" 77 | } 78 | -------------------------------------------------------------------------------- /apps/api/requests.http: -------------------------------------------------------------------------------- 1 | ### Check Job Status 2 | GET http://localhost:3002/v1/crawl/1dd0f924-a36f-4b96-94ea-32ed954dac67 HTTP/1.1 3 | Authorization: Bearer fc- 4 | -------------------------------------------------------------------------------- /apps/api/src/__tests__/e2e_full_withAuth/index.test.ts: -------------------------------------------------------------------------------- 1 | import request from "supertest"; 2 | import dotenv from "dotenv"; 3 | import { v4 as uuidv4 } from "uuid"; 4 | 5 | dotenv.config(); 6 | 7 | // const TEST_URL = 'http://localhost:3002' 8 | const TEST_URL = "http://127.0.0.1:3002"; 9 | 10 | describe("E2E Tests for API Routes", () => { 11 | describe("GET /", () => { 12 | it.concurrent("should return Hello, world! message", async () => { 13 | const response = await request(TEST_URL).get("/"); 14 | 15 | expect(response.statusCode).toBe(200); 16 | expect(response.text).toContain("SCRAPERS-JS: Hello, world! Fly.io"); 17 | }); 18 | }); 19 | 20 | describe("GET /test", () => { 21 | it.concurrent("should return Hello, world! message", async () => { 22 | const response = await request(TEST_URL).get("/test"); 23 | expect(response.statusCode).toBe(200); 24 | expect(response.text).toContain("Hello, world!"); 25 | }); 26 | }); 27 | 28 | describe("GET /is-production", () => { 29 | it.concurrent("should return the production status", async () => { 30 | const response = await request(TEST_URL).get("/is-production"); 31 | expect(response.statusCode).toBe(200); 32 | expect(response.body).toHaveProperty("isProduction"); 33 | }); 34 | }); 35 | }); 36 | -------------------------------------------------------------------------------- /apps/api/src/__tests__/e2e_noAuth/index.test.ts: -------------------------------------------------------------------------------- 1 | import request from "supertest"; 2 | import dotenv from "dotenv"; 3 | const fs = require("fs"); 4 | const path = require("path"); 5 | 6 | dotenv.config(); 7 | 8 | const TEST_URL = "http://127.0.0.1:3002"; 9 | 10 | describe("E2E Tests for API Routes with No Authentication", () => { 11 | let originalEnv: NodeJS.ProcessEnv; 12 | 13 | // save original process.env 14 | beforeAll(() => { 15 | originalEnv = { ...process.env }; 16 | process.env.BULL_AUTH_KEY = ""; 17 | process.env.PLAYWRIGHT_MICROSERVICE_URL = ""; 18 | process.env.TEST_API_KEY = ""; 19 | }); 20 | 21 | // restore original process.env 22 | afterAll(() => { 23 | process.env = originalEnv; 24 | }); 25 | 26 | 27 | describe("GET /", () => { 28 | it("should return Hello, world! message", async () => { 29 | const response = await request(TEST_URL).get("/"); 30 | expect(response.statusCode).toBe(200); 31 | expect(response.text).toContain("SCRAPERS-JS: Hello, world! 
Fly.io"); 32 | }); 33 | }); 34 | 35 | describe("GET /test", () => { 36 | it("should return Hello, world! message", async () => { 37 | const response = await request(TEST_URL).get("/test"); 38 | expect(response.statusCode).toBe(200); 39 | expect(response.text).toContain("Hello, world!"); 40 | }); 41 | }); 42 | 43 | describe("GET /is-production", () => { 44 | it("should return the production status", async () => { 45 | const response = await request(TEST_URL).get("/is-production"); 46 | expect(response.statusCode).toBe(200); 47 | expect(response.body).toHaveProperty("isProduction"); 48 | }); 49 | }); 50 | }); 51 | -------------------------------------------------------------------------------- /apps/api/src/__tests__/e2e_withAuth/index.test.ts: -------------------------------------------------------------------------------- 1 | import request from "supertest"; 2 | import dotenv from "dotenv"; 3 | import { 4 | FirecrawlCrawlResponse, 5 | FirecrawlCrawlStatusResponse, 6 | FirecrawlScrapeResponse, 7 | } from "../../types"; 8 | 9 | dotenv.config(); 10 | const TEST_URL = "http://127.0.0.1:3002"; 11 | 12 | describe("E2E Tests for v0 API Routes", () => { 13 | describe("GET /is-production", () => { 14 | it.concurrent("should return the production status", async () => { 15 | const response = await request(TEST_URL).get("/is-production"); 16 | expect(response.statusCode).toBe(200); 17 | expect(response.body).toHaveProperty("isProduction"); 18 | }); 19 | }); 20 | }); 21 | -------------------------------------------------------------------------------- /apps/api/src/controllers/__tests__/crawl.test.ts: -------------------------------------------------------------------------------- 1 | import { crawlController } from '../v0/crawl' 2 | import { Request, Response } from 'express'; 3 | import { v4 as uuidv4 } from 'uuid'; 4 | 5 | jest.mock('../auth', () => ({ 6 | authenticateUser: jest.fn().mockResolvedValue({ 7 | success: true, 8 | team_id: 'team123', 9 | error: null, 10 | status: 200 11 | }), 12 | reduce: jest.fn() 13 | })); 14 | jest.mock('../../services/idempotency/validate'); 15 | 16 | describe('crawlController', () => { 17 | it('should prevent duplicate requests using the same idempotency key', async () => { 18 | const req = { 19 | headers: { 20 | 'x-idempotency-key': await uuidv4(), 21 | 'Authorization': `Bearer ${process.env.TEST_API_KEY}` 22 | }, 23 | body: { 24 | url: 'https://mendable.ai' 25 | } 26 | } as unknown as Request; 27 | const res = { 28 | status: jest.fn().mockReturnThis(), 29 | json: jest.fn() 30 | } as unknown as Response; 31 | 32 | // First request should succeed 33 | await crawlController(req, res); 34 | expect(res.status).not.toHaveBeenCalledWith(409); 35 | 36 | // Second request with the same key should fail 37 | await crawlController(req, res); 38 | expect(res.status).toHaveBeenCalledWith(409); 39 | expect(res.json).toHaveBeenCalledWith({ error: 'Idempotency key already used' }); 40 | }); 41 | }); -------------------------------------------------------------------------------- /apps/api/src/controllers/v0/admin/queue.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | 3 | import { Job } from "bullmq"; 4 | import { Logger } from "../../../lib/logger"; 5 | import { getScrapeQueue } from "../../../services/queue-service"; 6 | import { checkAlerts } from "../../../services/alerts"; 7 | 8 | export async function cleanBefore24hCompleteJobsController( 9 | req: Request, 10 | res: Response 11 | ) { 12 | 
Logger.info("🐂 Cleaning jobs older than 24h"); 13 | try { 14 | const scrapeQueue = getScrapeQueue(); 15 | const batchSize = 10; 16 | const numberOfBatches = 9; // Adjust based on your needs 17 | const completedJobsPromises: Promise<Job[]>[] = []; 18 | for (let i = 0; i < numberOfBatches; i++) { 19 | completedJobsPromises.push( 20 | scrapeQueue.getJobs( 21 | ["completed"], 22 | i * batchSize, 23 | i * batchSize + batchSize, 24 | true 25 | ) 26 | ); 27 | } 28 | const completedJobs: Job[] = ( 29 | await Promise.all(completedJobsPromises) 30 | ).flat(); 31 | const before24hJobs = 32 | completedJobs.filter( 33 | (job) => job.finishedOn < Date.now() - 24 * 60 * 60 * 1000 34 | ) || []; 35 | 36 | let count = 0; 37 | 38 | if (!before24hJobs) { 39 | return res.status(200).send(`No jobs to remove.`); 40 | } 41 | 42 | for (const job of before24hJobs) { 43 | try { 44 | await job.remove(); 45 | count++; 46 | } catch (jobError) { 47 | Logger.error(`🐂 Failed to remove job with ID ${job.id}: ${jobError}`); 48 | } 49 | } 50 | return res.status(200).send(`Removed ${count} completed jobs.`); 51 | } catch (error) { 52 | Logger.error(`🐂 Failed to clean last 24h complete jobs: ${error}`); 53 | return res.status(500).send("Failed to clean jobs"); 54 | } 55 | } 56 | 57 | export async function checkQueuesController(req: Request, res: Response) { 58 | try { 59 | await checkAlerts(); 60 | return res.status(200).send("Alerts initialized"); 61 | } catch (error) { 62 | Logger.debug(`Failed to initialize alerts: ${error}`); 63 | return res.status(500).send("Failed to initialize alerts"); 64 | } 65 | } 66 | 67 | // Use this as a "health check" so that we don't destroy the server 68 | export async function queuesController(req: Request, res: Response) { 69 | try { 70 | const scrapeQueue = getScrapeQueue(); 71 | 72 | const [webScraperActive] = await Promise.all([ 73 | scrapeQueue.getActiveCount(), 74 | ]); 75 | 76 | const noActiveJobs = webScraperActive === 0; 77 | // 200 if no active jobs, 500 if there are active jobs 78 | return res.status(noActiveJobs ? 200 : 500).json({ 79 | webScraperActive, 80 | noActiveJobs, 81 | }); 82 | } catch (error) { 83 | Logger.error(error); 84 | return res.status(500).json({ error: error.message }); 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v0/admin/redis-health.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | import Redis from "ioredis"; 3 | import { Logger } from "../../../lib/logger"; 4 | import { redisRateLimitClient } from "../../../services/rate-limiter"; 5 | 6 | export async function redisHealthController(req: Request, res: Response) { 7 | const retryOperation = async (operation, retries = 3) => { 8 | for (let attempt = 1; attempt <= retries; attempt++) { 9 | try { 10 | return await operation(); 11 | } catch (error) { 12 | if (attempt === retries) throw error; 13 | Logger.warn(`Attempt ${attempt} failed: ${error.message}. 
Retrying...`); 14 | await new Promise((resolve) => setTimeout(resolve, 2000)); // Wait 2 seconds before retrying 15 | } 16 | } 17 | }; 18 | 19 | try { 20 | const queueRedis = new Redis(process.env.REDIS_URL); 21 | 22 | const testKey = "test"; 23 | const testValue = "test"; 24 | 25 | // Test queueRedis 26 | let queueRedisHealth; 27 | try { 28 | await retryOperation(() => queueRedis.set(testKey, testValue)); 29 | queueRedisHealth = await retryOperation(() => queueRedis.get(testKey)); 30 | await retryOperation(() => queueRedis.del(testKey)); 31 | } catch (error) { 32 | Logger.error(`queueRedis health check failed: ${error}`); 33 | queueRedisHealth = null; 34 | } 35 | 36 | // Test redisRateLimitClient 37 | let redisRateLimitHealth; 38 | try { 39 | await retryOperation(() => redisRateLimitClient.set(testKey, testValue)); 40 | redisRateLimitHealth = await retryOperation(() => 41 | redisRateLimitClient.get(testKey) 42 | ); 43 | await retryOperation(() => redisRateLimitClient.del(testKey)); 44 | } catch (error) { 45 | Logger.error(`redisRateLimitClient health check failed: ${error}`); 46 | redisRateLimitHealth = null; 47 | } 48 | 49 | const healthStatus = { 50 | queueRedis: queueRedisHealth === testValue ? "healthy" : "unhealthy", 51 | redisRateLimitClient: 52 | redisRateLimitHealth === testValue ? "healthy" : "unhealthy", 53 | }; 54 | 55 | if ( 56 | healthStatus.queueRedis === "healthy" && 57 | healthStatus.redisRateLimitClient === "healthy" 58 | ) { 59 | Logger.info("Both Redis instances are healthy"); 60 | return res.status(200).json({ status: "healthy", details: healthStatus }); 61 | } else { 62 | Logger.info( 63 | `Redis instances health check: ${JSON.stringify(healthStatus)}` 64 | ); 65 | return res 66 | .status(500) 67 | .json({ status: "unhealthy", details: healthStatus }); 68 | } 69 | } catch (error) { 70 | Logger.error(`Redis health check failed: ${error}`); 71 | return res 72 | .status(500) 73 | .json({ status: "unhealthy", message: error.message }); 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v0/crawl-cancel.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | import { authenticateUser } from "../auth"; 3 | import { RateLimiterMode } from "../../../src/types"; 4 | import { Logger } from "../../../src/lib/logger"; 5 | import { getCrawl, saveCrawl } from "../../../src/lib/crawl-redis"; 6 | import { configDotenv } from "dotenv"; 7 | configDotenv(); 8 | 9 | export async function crawlCancelController(req: Request, res: Response) { 10 | try { 11 | const { success, team_id, error, status } = await authenticateUser( 12 | req, 13 | res, 14 | RateLimiterMode.CrawlStatus 15 | ); 16 | if (!success) { 17 | return res.status(status).json({ error }); 18 | } 19 | 20 | const sc = await getCrawl(req.params.jobId); 21 | if (!sc) { 22 | return res.status(404).json({ error: "Job not found" }); 23 | } 24 | 25 | try { 26 | sc.cancelled = true; 27 | await saveCrawl(req.params.jobId, sc); 28 | } catch (error) { 29 | Logger.error(error); 30 | } 31 | 32 | res.json({ 33 | status: "cancelled", 34 | }); 35 | } catch (error) { 36 | Logger.error(error); 37 | return res.status(500).json({ error: error.message }); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v0/crawl-status.ts: -------------------------------------------------------------------------------- 1 | import { 
Request, Response } from "express"; 2 | import { authenticateUser } from "../auth"; 3 | import { RateLimiterMode } from "../../../src/types"; 4 | import { getScrapeQueue } from "../../../src/services/queue-service"; 5 | import { Logger } from "../../../src/lib/logger"; 6 | import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; 7 | import { configDotenv } from "dotenv"; 8 | configDotenv(); 9 | 10 | export async function getJobs(crawlId: string, ids: string[]) { 11 | const jobs = (await Promise.all(ids.map(x => getScrapeQueue().getJob(x)))).filter(x => x); 12 | 13 | jobs.forEach(job => { 14 | job.returnvalue = Array.isArray(job.returnvalue) ? job.returnvalue[0] : job.returnvalue; 15 | }); 16 | 17 | return jobs; 18 | } 19 | 20 | export async function crawlStatusController(req: Request, res: Response) { 21 | try { 22 | const { success, team_id, error, status } = await authenticateUser( 23 | req, 24 | res, 25 | RateLimiterMode.CrawlStatus 26 | ); 27 | if (!success) { 28 | return res.status(status).json({ error }); 29 | } 30 | 31 | const sc = await getCrawl(req.params.jobId); 32 | if (!sc) { 33 | return res.status(404).json({ error: "Job not found" }); 34 | } 35 | 36 | if (sc.team_id !== team_id) { 37 | return res.status(403).json({ error: "Forbidden" }); 38 | } 39 | 40 | const jobIDs = await getCrawlJobs(req.params.jobId); 41 | 42 | const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp); 43 | const jobStatuses = await Promise.all(jobs.map(x => x.getState())); 44 | const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active"; 45 | 46 | const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue); 47 | 48 | if ( 49 | jobs.length > 0 && 50 | jobs[0].data && 51 | jobs[0].data.pageOptions && 52 | !jobs[0].data.pageOptions.includeRawHtml 53 | ) { 54 | data.forEach(item => { 55 | if (item) { 56 | delete item.rawHtml; 57 | } 58 | }); 59 | } 60 | 61 | res.json({ 62 | status: jobStatus, 63 | current: jobStatuses.filter(x => x === "completed" || x === "failed").length, 64 | total: jobs.length, 65 | data: jobStatus === "completed" ? data : null, 66 | partial_data: jobStatus === "completed" ? 
[] : data.filter(x => x !== null), 67 | }); 68 | } catch (error) { 69 | Logger.error(error); 70 | return res.status(500).json({ error: error.message }); 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v0/crawlPreview.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | import { authenticateUser } from "../auth"; 3 | import { RateLimiterMode } from "../../../src/types"; 4 | import { v4 as uuidv4 } from "uuid"; 5 | import { Logger } from "../../../src/lib/logger"; 6 | import { 7 | addCrawlJob, 8 | crawlToCrawler, 9 | lockURL, 10 | saveCrawl, 11 | StoredCrawl, 12 | } from "../../../src/lib/crawl-redis"; 13 | import { addScrapeJobRaw } from "../../../src/services/queue-jobs"; 14 | import { checkAndUpdateURL } from "../../../src/lib/validateUrl"; 15 | 16 | export async function crawlPreviewController(req: Request, res: Response) { 17 | try { 18 | const { 19 | success, 20 | error, 21 | status, 22 | team_id: a, 23 | plan, 24 | } = await authenticateUser(req, res, RateLimiterMode.Preview); 25 | 26 | const team_id = "preview"; 27 | 28 | if (!success) { 29 | return res.status(status).json({ error }); 30 | } 31 | 32 | let url = req.body.url; 33 | if (!url) { 34 | return res.status(400).json({ error: "Url is required" }); 35 | } 36 | try { 37 | url = checkAndUpdateURL(url).url; 38 | } catch (e) { 39 | return res 40 | .status(e instanceof Error && e.message === "Invalid URL" ? 400 : 500) 41 | .json({ error: e.message ?? e }); 42 | } 43 | 44 | const crawlerOptions = req.body.crawlerOptions ?? {}; 45 | const pageOptions = req.body.pageOptions ?? { 46 | removeTags: [], 47 | }; 48 | 49 | const id = uuidv4(); 50 | 51 | let robots; 52 | 53 | try { 54 | robots = await this.getRobotsTxt(); 55 | } catch (_) {} 56 | 57 | const sc: StoredCrawl = { 58 | originUrl: url, 59 | crawlerOptions, 60 | pageOptions, 61 | team_id, 62 | plan, 63 | robots, 64 | createdAt: Date.now(), 65 | }; 66 | 67 | await saveCrawl(id, sc); 68 | 69 | const crawler = crawlToCrawler(id, sc); 70 | 71 | const sitemap = 72 | sc.crawlerOptions?.ignoreSitemap ?? true 73 | ? 
null 74 | : await crawler.tryGetSitemap(); 75 | 76 | if (sitemap !== null) { 77 | for (const url of sitemap.map((x) => x.url)) { 78 | await lockURL(id, sc, url); 79 | const job = await addScrapeJobRaw( 80 | { 81 | url, 82 | mode: "single_urls", 83 | crawlerOptions: crawlerOptions, 84 | team_id: team_id, 85 | pageOptions: pageOptions, 86 | origin: "website-preview", 87 | crawl_id: id, 88 | sitemapped: true, 89 | }, 90 | {}, 91 | uuidv4(), 92 | 10 93 | ); 94 | await addCrawlJob(id, job.id); 95 | } 96 | } else { 97 | await lockURL(id, sc, url); 98 | const job = await addScrapeJobRaw( 99 | { 100 | url, 101 | mode: "single_urls", 102 | crawlerOptions: crawlerOptions, 103 | team_id: team_id, 104 | pageOptions: pageOptions, 105 | origin: "website-preview", 106 | crawl_id: id, 107 | }, 108 | {}, 109 | uuidv4(), 110 | 10 111 | ); 112 | await addCrawlJob(id, job.id); 113 | } 114 | 115 | res.json({ jobId: id }); 116 | } catch (error) { 117 | Logger.error(error); 118 | return res.status(500).json({ error: error.message }); 119 | } 120 | } 121 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v0/keyAuth.ts: -------------------------------------------------------------------------------- 1 | 2 | import { AuthResponse, RateLimiterMode } from "../../types"; 3 | 4 | import { Request, Response } from "express"; 5 | import { authenticateUser } from "../auth"; 6 | 7 | 8 | export const keyAuthController = async (req: Request, res: Response) => { 9 | try { 10 | // make sure to authenticate user first, Bearer 11 | const { success, team_id, error, status } = await authenticateUser( 12 | req, 13 | res 14 | ); 15 | if (!success) { 16 | return res.status(status).json({ error }); 17 | } 18 | // if success, return success: true 19 | return res.status(200).json({ success: true }); 20 | } catch (error) { 21 | return res.status(500).json({ error: error.message }); 22 | } 23 | }; 24 | 25 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v0/scrape.ts: -------------------------------------------------------------------------------- 1 | import { PageOptions } from "./../../lib/entities"; 2 | import { Request, Response } from "express"; 3 | import { authenticateUser } from "../auth"; 4 | import { PlanType, RateLimiterMode } from "../../types"; 5 | import { Document } from "../../lib/entities"; 6 | import { 7 | defaultPageOptions, 8 | defaultTimeout, 9 | defaultOrigin, 10 | } from "../../lib/default-values"; 11 | import { addScrapeJobRaw, waitForJob } from "../../services/queue-jobs"; 12 | import { v4 as uuidv4 } from "uuid"; 13 | import { Logger } from "../../lib/logger"; 14 | import { getJobPriority } from "../../lib/job-priority"; 15 | 16 | export async function scrapeHelper( 17 | jobId: string, 18 | req: Request, 19 | team_id: string, 20 | crawlerOptions: any, 21 | pageOptions: PageOptions, 22 | timeout: number, 23 | plan?: PlanType 24 | ): Promise<{ 25 | success: boolean; 26 | error?: string; 27 | data?: Document; 28 | returnCode: number; 29 | }> { 30 | const url = req.body.url; 31 | if (typeof url !== "string") { 32 | return { success: false, error: "Url is required", returnCode: 400 }; 33 | } 34 | 35 | const jobPriority = await getJobPriority({ plan, team_id, basePriority: 10 }); 36 | 37 | const job = await addScrapeJobRaw( 38 | { 39 | url, 40 | mode: "single_urls", 41 | crawlerOptions, 42 | team_id, 43 | pageOptions, 44 | origin: req.body.origin ?? 
defaultOrigin, 45 | is_scrape: true, 46 | }, 47 | {}, 48 | jobId, 49 | jobPriority 50 | ); 51 | 52 | let doc; 53 | 54 | const err = (async () => { 55 | try { 56 | doc = (await waitForJob(job.id, timeout))[0]; 57 | } catch (e) { 58 | if (e instanceof Error && e.message.startsWith("Job wait")) { 59 | return { 60 | success: false, 61 | error: "Request timed out", 62 | returnCode: 408, 63 | }; 64 | } else if ( 65 | typeof e === "string" && 66 | (e.includes("Error generating completions: ") || 67 | e.includes("Invalid schema for function") || 68 | e.includes( 69 | "LLM extraction did not match the extraction schema you provided." 70 | )) 71 | ) { 72 | return { 73 | success: false, 74 | error: e, 75 | returnCode: 500, 76 | }; 77 | } else { 78 | throw e; 79 | } 80 | } 81 | return null; 82 | })(); 83 | 84 | if (err !== null) { 85 | return err; 86 | } 87 | 88 | await job.remove(); 89 | 90 | if (!doc) { 91 | console.error("!!! PANIC DOC IS", doc, job); 92 | return { 93 | success: true, 94 | error: "No page found", 95 | returnCode: 200, 96 | data: doc, 97 | }; 98 | } 99 | 100 | delete doc.index; 101 | delete doc.provider; 102 | 103 | return { 104 | success: true, 105 | data: doc, 106 | returnCode: 200, 107 | }; 108 | } 109 | 110 | export async function scrapeController(req: Request, res: Response) { 111 | try { 112 | let earlyReturn = false; 113 | // make sure to authenticate user first, Bearer 114 | const { success, team_id, error, status, plan } = await authenticateUser( 115 | req, 116 | res, 117 | RateLimiterMode.Scrape 118 | ); 119 | if (!success) { 120 | return res.status(status).json({ error }); 121 | } 122 | 123 | const crawlerOptions = req.body.crawlerOptions ?? {}; 124 | const pageOptions = { ...defaultPageOptions, ...req.body.pageOptions }; 125 | let timeout = req.body.timeout ?? defaultTimeout; 126 | 127 | const jobId = uuidv4(); 128 | 129 | const result = await scrapeHelper( 130 | jobId, 131 | req, 132 | team_id, 133 | crawlerOptions, 134 | pageOptions, 135 | timeout, 136 | plan 137 | ); 138 | let doc = result.data; 139 | if (!pageOptions || !pageOptions.includeRawHtml) { 140 | if (doc && doc.rawHtml) { 141 | delete doc.rawHtml; 142 | } 143 | } 144 | 145 | if (pageOptions && pageOptions.includeExtract) { 146 | if (!pageOptions.includeMarkdown && doc && doc.markdown) { 147 | delete doc.markdown; 148 | } 149 | } 150 | 151 | return res.status(result.returnCode).json(result); 152 | } catch (error) { 153 | Logger.error(error); 154 | return res.status(500).json({ 155 | error: 156 | typeof error === "string" 157 | ? error 158 | : error?.message ?? 
"Internal Server Error", 159 | }); 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v0/status.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | import { Logger } from "../../../src/lib/logger"; 3 | import { getCrawl, getCrawlJobs } from "../../../src/lib/crawl-redis"; 4 | import { getJobs } from "./crawl-status"; 5 | 6 | export async function crawlJobStatusPreviewController(req: Request, res: Response) { 7 | try { 8 | const sc = await getCrawl(req.params.jobId); 9 | if (!sc) { 10 | return res.status(404).json({ error: "Job not found" }); 11 | } 12 | 13 | const jobIDs = await getCrawlJobs(req.params.jobId); 14 | 15 | const jobs = (await getJobs(req.params.jobId, jobIDs)).sort((a, b) => a.timestamp - b.timestamp); 16 | const jobStatuses = await Promise.all(jobs.map(x => x.getState())); 17 | const jobStatus = sc.cancelled ? "failed" : jobStatuses.every(x => x === "completed") ? "completed" : jobStatuses.some(x => x === "failed") ? "failed" : "active"; 18 | 19 | const data = jobs.map(x => Array.isArray(x.returnvalue) ? x.returnvalue[0] : x.returnvalue); 20 | 21 | res.json({ 22 | status: jobStatus, 23 | current: jobStatuses.filter(x => x === "completed" || x === "failed").length, 24 | total: jobs.length, 25 | data: jobStatus === "completed" ? data : null, 26 | partial_data: jobStatus === "completed" ? [] : data.filter(x => x !== null), 27 | }); 28 | } catch (error) { 29 | Logger.error(error); 30 | return res.status(500).json({ error: error.message }); 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v1/__tests__/urlValidation.test.ts: -------------------------------------------------------------------------------- 1 | import { url } from "../types"; 2 | 3 | describe("URL Schema Validation", () => { 4 | beforeEach(() => { 5 | jest.resetAllMocks(); 6 | }); 7 | 8 | it("should prepend http:// to URLs without a protocol", () => { 9 | const result = url.parse("example.com"); 10 | expect(result).toBe("http://example.com"); 11 | }); 12 | 13 | it("should allow valid URLs with http or https", () => { 14 | expect(() => url.parse("http://example.com")).not.toThrow(); 15 | expect(() => url.parse("https://example.com")).not.toThrow(); 16 | }); 17 | 18 | it("should allow valid URLs with http or https", () => { 19 | expect(() => url.parse("example.com")).not.toThrow(); 20 | }); 21 | 22 | it("should reject URLs with unsupported protocols", () => { 23 | expect(() => url.parse("ftp://example.com")).toThrow("Invalid URL"); 24 | }); 25 | 26 | it("should reject URLs without a valid top-level domain", () => { 27 | expect(() => url.parse("http://example")).toThrow("URL must have a valid top-level domain or be a valid path"); 28 | }); 29 | 30 | it("should reject blocked URLs", () => { 31 | expect(() => url.parse("https://facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. 
We're actively working on building support for it."); 32 | }); 33 | 34 | it("should handle URLs with subdomains correctly", () => { 35 | expect(() => url.parse("http://sub.example.com")).not.toThrow(); 36 | expect(() => url.parse("https://blog.example.com")).not.toThrow(); 37 | }); 38 | 39 | it("should handle URLs with paths correctly", () => { 40 | expect(() => url.parse("http://example.com/path")).not.toThrow(); 41 | expect(() => url.parse("https://example.com/another/path")).not.toThrow(); 42 | }); 43 | 44 | it("should handle URLs with subdomains that are blocked", () => { 45 | expect(() => url.parse("https://sub.facebook.com")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); 46 | }); 47 | 48 | it("should handle URLs with paths that are blocked", () => { 49 | expect(() => url.parse("http://facebook.com/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); 50 | expect(() => url.parse("https://facebook.com/another/path")).toThrow("Firecrawl currently does not support social media scraping due to policy restrictions. We're actively working on building support for it."); 51 | }); 52 | 53 | it("should reject malformed URLs starting with 'http://http'", () => { 54 | expect(() => url.parse("http://http://example.com")).toThrow("Invalid URL. Invalid protocol."); 55 | }); 56 | 57 | it("should reject malformed URLs containing multiple 'http://'", () => { 58 | expect(() => url.parse("http://example.com/http://example.com")).not.toThrow(); 59 | }); 60 | 61 | it("should reject malformed URLs containing multiple 'http://'", () => { 62 | expect(() => url.parse("http://ex ample.com/")).toThrow("Invalid URL"); 63 | }); 64 | }) -------------------------------------------------------------------------------- /apps/api/src/controllers/v1/crawl-cancel.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | import { authenticateUser } from "../auth"; 3 | import { RateLimiterMode } from "../../types"; 4 | import { Logger } from "../../lib/logger"; 5 | import { getCrawl, saveCrawl } from "../../lib/crawl-redis"; 6 | import { configDotenv } from "dotenv"; 7 | configDotenv(); 8 | 9 | /** 10 | * @openapi 11 | * /v1/crawl/{jobId}: 12 | * delete: 13 | * tags: 14 | * - Crawling 15 | * summary: Cancel a crawl job 16 | * security: 17 | * - BearerAuth: [] 18 | * parameters: 19 | * - name: jobId 20 | * in: path 21 | * required: true 22 | * schema: 23 | * type: string 24 | * responses: 25 | * 200: 26 | * description: Success 27 | * content: 28 | * application/json: 29 | * schema: 30 | * type: object 31 | * properties: 32 | * success: 33 | * type: boolean 34 | */ 35 | export async function crawlCancelController(req: Request, res: Response) { 36 | try { 37 | const { success, team_id, error, status } = await authenticateUser( 38 | req, 39 | res, 40 | RateLimiterMode.CrawlStatus 41 | ); 42 | if (!success) { 43 | return res.status(status).json({ error }); 44 | } 45 | 46 | const sc = await getCrawl(req.params.jobId); 47 | if (!sc) { 48 | return res.status(404).json({ error: "Job not found" }); 49 | } 50 | 51 | try { 52 | sc.cancelled = true; 53 | await saveCrawl(req.params.jobId, sc); 54 | } catch (error) { 55 | Logger.error(error); 56 | } 57 | 58 | res.json({ 59 | status: "cancelled" 60 | }); 61 | } catch (error) { 62 | Logger.error(error); 63 
| return res.status(500).json({ error: error.message }); 64 | } 65 | } 66 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v1/liveness.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | 3 | /** 4 | * @openapi 5 | * /v1/health/liveness: 6 | * get: 7 | * tags: 8 | * - Health 9 | * summary: Check if service is alive 10 | * responses: 11 | * 200: 12 | * description: Success 13 | * content: 14 | * application/json: 15 | * schema: 16 | * type: object 17 | * properties: 18 | * status: 19 | * type: string 20 | * example: ok 21 | */ 22 | export async function livenessController(req: Request, res: Response) { 23 | //TODO: add checks if the application is live and healthy like checking the redis connection 24 | res.status(200).json({ status: "ok" }); 25 | } 26 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v1/map.ts: -------------------------------------------------------------------------------- 1 | import { Response } from "express"; 2 | import { v4 as uuidv4 } from "uuid"; 3 | import { 4 | legacyCrawlerOptions, 5 | mapRequestSchema, 6 | RequestWithAuth, 7 | } from "./types"; 8 | import { crawlToCrawler, StoredCrawl } from "../../lib/crawl-redis"; 9 | import { MapResponse, MapRequest } from "./types"; 10 | import { configDotenv } from "dotenv"; 11 | import { 12 | checkAndUpdateURLForMap, 13 | isSameDomain, 14 | isSameSubdomain, 15 | removeDuplicateUrls, 16 | } from "../../lib/validateUrl"; 17 | 18 | configDotenv(); 19 | 20 | /** 21 | * @openapi 22 | * /v1/map: 23 | * post: 24 | * tags: 25 | * - Mapping 26 | * summary: Generate sitemap 27 | * security: 28 | * - BearerAuth: [] 29 | * requestBody: 30 | * required: true 31 | * content: 32 | * application/json: 33 | * schema: 34 | * type: object 35 | * required: 36 | * - url 37 | * properties: 38 | * url: 39 | * type: string 40 | * format: uri 41 | * responses: 42 | * 200: 43 | * description: Success 44 | * content: 45 | * application/json: 46 | * schema: 47 | * type: object 48 | * properties: 49 | * success: 50 | * type: boolean 51 | * urls: 52 | * type: array 53 | * items: 54 | * type: string 55 | */ 56 | export async function mapController( 57 | req: RequestWithAuth<{}, MapResponse, MapRequest>, 58 | res: Response 59 | ) { 60 | req.body = mapRequestSchema.parse(req.body); 61 | 62 | const limit: number = req.body.limit ?? 5000; 63 | 64 | const id = uuidv4(); 65 | let links: string[] = [req.body.url]; 66 | 67 | const sc: StoredCrawl = { 68 | originUrl: req.body.url, 69 | crawlerOptions: legacyCrawlerOptions(req.body), 70 | pageOptions: {}, 71 | team_id: req.auth.team_id, 72 | createdAt: Date.now(), 73 | plan: req.auth.plan, 74 | }; 75 | 76 | const crawler = crawlToCrawler(id, sc); 77 | 78 | const sitemap = 79 | req.body.ignoreSitemap ?? true ? 
null : await crawler.tryGetSitemap(); 80 | 81 | if (sitemap !== null) { 82 | sitemap.map((x) => { 83 | links.push(x.url); 84 | }); 85 | } 86 | 87 | links = links 88 | .map((x) => { 89 | try { 90 | return checkAndUpdateURLForMap(x).url.trim(); 91 | } catch (_) { 92 | return null; 93 | } 94 | }) 95 | .filter((x) => x !== null); 96 | 97 | links = links.filter((x) => isSameDomain(x, req.body.url)); 98 | 99 | if (!req.body.includeSubdomains) { 100 | links = links.filter((x) => isSameSubdomain(x, req.body.url)); 101 | } 102 | 103 | links = removeDuplicateUrls(links); 104 | 105 | const linksToReturn = links.slice(0, limit).filter((x) => x !== null); 106 | 107 | return res.status(200).json({ 108 | success: true, 109 | links: linksToReturn, 110 | scrape_id: req.body.origin?.includes("website") ? id : undefined, 111 | }); 112 | } 113 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v1/readiness.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | 3 | /** 4 | * @openapi 5 | * /v1/health/readiness: 6 | * get: 7 | * tags: 8 | * - Health 9 | * summary: Check if service is ready 10 | * responses: 11 | * 200: 12 | * description: Success 13 | * content: 14 | * application/json: 15 | * schema: 16 | * type: object 17 | * properties: 18 | * status: 19 | * type: string 20 | * example: ok 21 | */ 22 | export async function readinessController(req: Request, res: Response) { 23 | // TODO: add checks when the application is ready to serve traffic 24 | res.status(200).json({ status: "ok" }); 25 | } 26 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v1/scrape-status.ts: -------------------------------------------------------------------------------- 1 | import { scrapeStatusRateLimiter } from "../../services/rate-limiter"; 2 | 3 | /** 4 | * @openapi 5 | * /v1/scrape/{jobId}: 6 | * get: 7 | * tags: 8 | * - Scraping 9 | * summary: Get scrape job status 10 | * parameters: 11 | * - name: jobId 12 | * in: path 13 | * required: true 14 | * schema: 15 | * type: string 16 | * responses: 17 | * 200: 18 | * description: Success 19 | * content: 20 | * application/json: 21 | * schema: 22 | * type: object 23 | * properties: 24 | * status: 25 | * type: string 26 | * enum: [completed, failed, in_progress] 27 | * content: 28 | * type: string 29 | */ 30 | export async function scrapeStatusController(req: any, res: any) { 31 | try { 32 | const rateLimiter = scrapeStatusRateLimiter; 33 | const incomingIP = (req.headers["x-forwarded-for"] || 34 | req.socket.remoteAddress) as string; 35 | const iptoken = incomingIP; 36 | await rateLimiter.consume(iptoken); 37 | 38 | return res.status(200).json({ 39 | success: true, 40 | data: null, 41 | }); 42 | } catch (error) { 43 | if (error instanceof Error && error.message == "Too Many Requests") { 44 | return res.status(429).json({ 45 | success: false, 46 | error: "Rate limit exceeded. 
Please try again later.", 47 | }); 48 | } else { 49 | return res.status(500).json({ 50 | success: false, 51 | error: "An unexpected error occurred.", 52 | }); 53 | } 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /apps/api/src/controllers/v1/scrape.ts: -------------------------------------------------------------------------------- 1 | import { Request, Response } from "express"; 2 | import { Logger } from "../../lib/logger"; 3 | import { 4 | legacyDocumentConverter, 5 | legacyScrapeOptions, 6 | RequestWithAuth, 7 | ScrapeRequest, 8 | scrapeRequestSchema, 9 | ScrapeResponse, 10 | } from "./types"; 11 | import { v4 as uuidv4 } from "uuid"; 12 | import { addScrapeJobRaw, waitForJob } from "../../services/queue-jobs"; 13 | import { getJobPriority } from "../../lib/job-priority"; 14 | import { PlanType } from "../../types"; 15 | 16 | /** 17 | * @openapi 18 | * /v1/scrape: 19 | * post: 20 | * tags: 21 | * - Scraping 22 | * summary: Scrape a single webpage 23 | * security: 24 | * - BearerAuth: [] 25 | * requestBody: 26 | * required: true 27 | * content: 28 | * application/json: 29 | * schema: 30 | * type: object 31 | * required: 32 | * - url 33 | * properties: 34 | * url: 35 | * type: string 36 | * format: uri 37 | * waitUntil: 38 | * type: string 39 | * enum: ['load', 'domcontentloaded', 'networkidle0', 'networkidle2'] 40 | * timeout: 41 | * type: integer 42 | * minimum: 1000 43 | * responses: 44 | * 200: 45 | * description: Success 46 | * content: 47 | * application/json: 48 | * schema: 49 | * type: object 50 | * properties: 51 | * success: 52 | * type: boolean 53 | * jobId: 54 | * type: string 55 | */ 56 | export async function scrapeController( 57 | req: RequestWithAuth<{}, ScrapeResponse, ScrapeRequest>, 58 | res: Response 59 | ) { 60 | req.body = scrapeRequestSchema.parse(req.body); 61 | 62 | const origin = req.body.origin; 63 | const timeout = req.body.timeout; 64 | const pageOptions = legacyScrapeOptions(req.body); 65 | const jobId = uuidv4(); 66 | 67 | const jobPriority = await getJobPriority({ 68 | plan: req.auth.plan as PlanType, 69 | team_id: req.auth.team_id, 70 | basePriority: 10, 71 | }); 72 | 73 | const job = await addScrapeJobRaw( 74 | { 75 | url: req.body.url, 76 | mode: "single_urls", 77 | crawlerOptions: {}, 78 | team_id: req.auth.team_id, 79 | pageOptions, 80 | origin: req.body.origin, 81 | is_scrape: true, 82 | }, 83 | {}, 84 | jobId, 85 | jobPriority 86 | ); 87 | 88 | let doc: any | undefined; 89 | try { 90 | doc = (await waitForJob(job.id, timeout))[0]; 91 | } catch (e) { 92 | Logger.error(`Error in scrapeController: ${e}`); 93 | if (e instanceof Error && e.message.startsWith("Job wait")) { 94 | return res.status(408).json({ 95 | success: false, 96 | error: "Request timed out", 97 | }); 98 | } else { 99 | return res.status(500).json({ 100 | success: false, 101 | error: `(Internal server error) - ${e && e?.message ? e.message : e}`, 102 | }); 103 | } 104 | } 105 | 106 | await job.remove(); 107 | 108 | if (!doc) { 109 | console.error("!!! 
PANIC DOC IS", doc, job); 110 | return res.status(200).json({ 111 | success: true, 112 | warning: "No page found", 113 | data: doc, 114 | }); 115 | } 116 | 117 | delete doc.index; 118 | delete doc.provider; 119 | 120 | if (!pageOptions || !pageOptions.includeRawHtml) { 121 | if (doc && doc.rawHtml) { 122 | delete doc.rawHtml; 123 | } 124 | } 125 | 126 | if (pageOptions && pageOptions.includeExtract) { 127 | if (!pageOptions.includeMarkdown && doc && doc.markdown) { 128 | delete doc.markdown; 129 | } 130 | } 131 | 132 | return res.status(200).json({ 133 | success: true, 134 | data: legacyDocumentConverter(doc), 135 | scrape_id: origin?.includes("website") ? jobId : undefined, 136 | }); 137 | } 138 | -------------------------------------------------------------------------------- /apps/api/src/lib/__tests__/html-to-markdown.test.ts: -------------------------------------------------------------------------------- 1 | import { parseMarkdown } from '../html-to-markdown'; 2 | 3 | describe('parseMarkdown', () => { 4 | it('should correctly convert simple HTML to Markdown', async () => { 5 | const html = '

<p>Hello, world!</p>'; 6 | const expectedMarkdown = 'Hello, world!'; 7 | await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); 8 | }); 9 | 10 | it('should convert complex HTML with nested elements to Markdown', async () => { 11 | const html = '<div><p>Hello <strong>bold</strong> world!</p><ul><li>List item</li></ul></div>'; 12 | const expectedMarkdown = 'Hello **bold** world!\n\n- List item'; 13 | await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); 14 | }); 15 | 16 | it('should return empty string when input is empty', async () => { 17 | const html = ''; 18 | const expectedMarkdown = ''; 19 | await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); 20 | }); 21 | 22 | it('should handle null input gracefully', async () => { 23 | const html = null; 24 | const expectedMarkdown = ''; 25 | await expect(parseMarkdown(html)).resolves.toBe(expectedMarkdown); 26 | }); 27 | 28 | it('should handle various types of invalid HTML gracefully', async () => { 29 | const invalidHtmls = [ 30 | { html: '<html><p>Unclosed tag', expected: 'Unclosed tag' }, 31 | { html: '<div><span>Missing closing div', expected: 'Missing closing div' }, 32 | { html: '<p><strong>Wrong nesting</em></p>
', expected: '**Wrong nesting**' }, 33 | { html: 'Link without closing tag', expected: '[Link without closing tag](http://example.com)' } 34 | ]; 35 | 36 | for (const { html, expected } of invalidHtmls) { 37 | await expect(parseMarkdown(html)).resolves.toBe(expected); 38 | } 39 | }); 40 | }); 41 | -------------------------------------------------------------------------------- /apps/api/src/lib/__tests__/job-priority.test.ts: -------------------------------------------------------------------------------- 1 | import { 2 | getJobPriority, 3 | addJobPriority, 4 | deleteJobPriority, 5 | } from "../job-priority"; 6 | import { redisConnection } from "../../services/queue-service"; 7 | import { PlanType } from "../../types"; 8 | 9 | jest.mock("../../services/queue-service", () => ({ 10 | redisConnection: { 11 | sadd: jest.fn(), 12 | srem: jest.fn(), 13 | scard: jest.fn(), 14 | expire: jest.fn(), 15 | }, 16 | })); 17 | 18 | describe("Job Priority Tests", () => { 19 | afterEach(() => { 20 | jest.clearAllMocks(); 21 | }); 22 | 23 | test("addJobPriority should add job_id to the set and set expiration", async () => { 24 | const team_id = "team1"; 25 | const job_id = "job1"; 26 | await addJobPriority(team_id, job_id); 27 | expect(redisConnection.sadd).toHaveBeenCalledWith( 28 | `limit_team_id:${team_id}`, 29 | job_id 30 | ); 31 | expect(redisConnection.expire).toHaveBeenCalledWith( 32 | `limit_team_id:${team_id}`, 33 | 60 34 | ); 35 | }); 36 | 37 | test("deleteJobPriority should remove job_id from the set", async () => { 38 | const team_id = "team1"; 39 | const job_id = "job1"; 40 | await deleteJobPriority(team_id, job_id); 41 | expect(redisConnection.srem).toHaveBeenCalledWith( 42 | `limit_team_id:${team_id}`, 43 | job_id 44 | ); 45 | }); 46 | 47 | test("getJobPriority should return correct priority based on plan and set length", async () => { 48 | const team_id = "team1"; 49 | const plan: PlanType = "standard"; 50 | (redisConnection.scard as jest.Mock).mockResolvedValue(150); 51 | 52 | const priority = await getJobPriority({ plan, team_id }); 53 | expect(priority).toBe(10); 54 | 55 | (redisConnection.scard as jest.Mock).mockResolvedValue(250); 56 | const priorityExceeded = await getJobPriority({ plan, team_id }); 57 | expect(priorityExceeded).toBe(20); // basePriority + Math.ceil((250 - 200) * 0.4) 58 | }); 59 | 60 | test("getJobPriority should handle different plans correctly", async () => { 61 | const team_id = "team1"; 62 | 63 | (redisConnection.scard as jest.Mock).mockResolvedValue(50); 64 | let plan: PlanType = "hobby"; 65 | let priority = await getJobPriority({ plan, team_id }); 66 | expect(priority).toBe(10); 67 | 68 | (redisConnection.scard as jest.Mock).mockResolvedValue(150); 69 | plan = "hobby"; 70 | priority = await getJobPriority({ plan, team_id }); 71 | expect(priority).toBe(25); // basePriority + Math.ceil((150 - 50) * 0.3) 72 | 73 | (redisConnection.scard as jest.Mock).mockResolvedValue(25); 74 | plan = "free"; 75 | priority = await getJobPriority({ plan, team_id }); 76 | expect(priority).toBe(10); 77 | 78 | (redisConnection.scard as jest.Mock).mockResolvedValue(60); 79 | plan = "free"; 80 | priority = await getJobPriority({ plan, team_id }); 81 | expect(priority).toBe(28); // basePriority + Math.ceil((60 - 25) * 0.5) 82 | }); 83 | 84 | test("addJobPriority should reset expiration time when adding new job", async () => { 85 | const team_id = "team1"; 86 | const job_id1 = "job1"; 87 | const job_id2 = "job2"; 88 | 89 | await addJobPriority(team_id, job_id1); 90 | 
expect(redisConnection.expire).toHaveBeenCalledWith( 91 | `limit_team_id:${team_id}`, 92 | 60 93 | ); 94 | 95 | // Clear the mock calls 96 | (redisConnection.expire as jest.Mock).mockClear(); 97 | 98 | // Add another job 99 | await addJobPriority(team_id, job_id2); 100 | expect(redisConnection.expire).toHaveBeenCalledWith( 101 | `limit_team_id:${team_id}`, 102 | 60 103 | ); 104 | }); 105 | 106 | test("Set should expire after 60 seconds", async () => { 107 | const team_id = "team1"; 108 | const job_id = "job1"; 109 | 110 | jest.useFakeTimers(); 111 | 112 | await addJobPriority(team_id, job_id); 113 | expect(redisConnection.expire).toHaveBeenCalledWith( 114 | `limit_team_id:${team_id}`, 115 | 60 116 | ); 117 | 118 | // Fast-forward time by 59 seconds 119 | jest.advanceTimersByTime(59000); 120 | 121 | // The set should still exist 122 | expect(redisConnection.scard).not.toHaveBeenCalled(); 123 | 124 | // Fast-forward time by 2 more seconds (total 61 seconds) 125 | jest.advanceTimersByTime(2000); 126 | 127 | // Check if the set has been removed (scard should return 0) 128 | (redisConnection.scard as jest.Mock).mockResolvedValue(0); 129 | const setSize = await redisConnection.scard(`limit_team_id:${team_id}`); 130 | expect(setSize).toBe(0); 131 | 132 | jest.useRealTimers(); 133 | }); 134 | }); 135 | -------------------------------------------------------------------------------- /apps/api/src/lib/batch-process.ts: -------------------------------------------------------------------------------- 1 | export async function batchProcess( 2 | array: T[], 3 | batchSize: number, 4 | asyncFunction: (item: T, index: number) => Promise 5 | ): Promise { 6 | const batches = []; 7 | for (let i = 0; i < array.length; i += batchSize) { 8 | const batch = array.slice(i, i + batchSize); 9 | batches.push(batch); 10 | } 11 | 12 | for (const batch of batches) { 13 | await Promise.all(batch.map((item, i) => asyncFunction(item, i))); 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /apps/api/src/lib/crawl-redis.ts: -------------------------------------------------------------------------------- 1 | import { WebCrawler } from "../scraper/WebScraper/crawler"; 2 | import { redisConnection } from "../services/queue-service"; 3 | 4 | export type StoredCrawl = { 5 | originUrl: string; 6 | crawlerOptions: any; 7 | pageOptions: any; 8 | team_id: string; 9 | plan: string; 10 | robots?: string; 11 | cancelled?: boolean; 12 | createdAt: number; 13 | }; 14 | 15 | export async function saveCrawl(id: string, crawl: StoredCrawl) { 16 | await redisConnection.set("crawl:" + id, JSON.stringify(crawl)); 17 | await redisConnection.expire("crawl:" + id, 24 * 60 * 60, "NX"); 18 | } 19 | 20 | export async function getCrawl(id: string): Promise { 21 | const x = await redisConnection.get("crawl:" + id); 22 | 23 | if (x === null) { 24 | return null; 25 | } 26 | 27 | return JSON.parse(x); 28 | } 29 | 30 | export async function getCrawlExpiry(id: string): Promise { 31 | const d = new Date(); 32 | const ttl = await redisConnection.pttl("crawl:" + id); 33 | d.setMilliseconds(d.getMilliseconds() + ttl); 34 | d.setMilliseconds(0); 35 | return d; 36 | } 37 | 38 | export async function addCrawlJob(id: string, job_id: string) { 39 | await redisConnection.sadd("crawl:" + id + ":jobs", job_id); 40 | await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); 41 | } 42 | 43 | export async function addCrawlJobs(id: string, job_ids: string[]) { 44 | await redisConnection.sadd("crawl:" + id + 
":jobs", ...job_ids); 45 | await redisConnection.expire("crawl:" + id + ":jobs", 24 * 60 * 60, "NX"); 46 | } 47 | 48 | export async function addCrawlJobDone(id: string, job_id: string) { 49 | await redisConnection.sadd("crawl:" + id + ":jobs_done", job_id); 50 | await redisConnection.lpush("crawl:" + id + ":jobs_done_ordered", job_id); 51 | await redisConnection.expire( 52 | "crawl:" + id + ":jobs_done", 53 | 24 * 60 * 60, 54 | "NX", 55 | ); 56 | await redisConnection.expire( 57 | "crawl:" + id + ":jobs_done_ordered", 58 | 24 * 60 * 60, 59 | "NX", 60 | ); 61 | } 62 | 63 | export async function getDoneJobsOrderedLength(id: string): Promise { 64 | return await redisConnection.llen("crawl:" + id + ":jobs_done_ordered"); 65 | } 66 | 67 | export async function getDoneJobsOrdered( 68 | id: string, 69 | start = 0, 70 | end = -1, 71 | ): Promise { 72 | return await redisConnection.lrange( 73 | "crawl:" + id + ":jobs_done_ordered", 74 | start, 75 | end, 76 | ); 77 | } 78 | 79 | export async function isCrawlFinished(id: string) { 80 | return ( 81 | (await redisConnection.scard("crawl:" + id + ":jobs_done")) === 82 | (await redisConnection.scard("crawl:" + id + ":jobs")) 83 | ); 84 | } 85 | 86 | export async function isCrawlFinishedLocked(id: string) { 87 | return await redisConnection.exists("crawl:" + id + ":finish"); 88 | } 89 | 90 | export async function finishCrawl(id: string) { 91 | if (await isCrawlFinished(id)) { 92 | const set = await redisConnection.setnx("crawl:" + id + ":finish", "yes"); 93 | if (set === 1) { 94 | await redisConnection.expire("crawl:" + id + ":finish", 24 * 60 * 60); 95 | } 96 | return set === 1; 97 | } 98 | } 99 | 100 | export async function getCrawlJobs(id: string): Promise { 101 | return await redisConnection.smembers("crawl:" + id + ":jobs"); 102 | } 103 | 104 | export async function lockURL( 105 | id: string, 106 | sc: StoredCrawl, 107 | url: string, 108 | ): Promise { 109 | if (typeof sc.crawlerOptions?.limit === "number") { 110 | if ( 111 | (await redisConnection.scard("crawl:" + id + ":visited")) >= 112 | sc.crawlerOptions.limit 113 | ) { 114 | return false; 115 | } 116 | } 117 | const res = 118 | (await redisConnection.sadd("crawl:" + id + ":visited", url)) !== 0; 119 | await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); 120 | return res; 121 | } 122 | 123 | /// NOTE: does not check limit. only use if limit is checked beforehand e.g. with sitemap 124 | export async function lockURLs(id: string, urls: string[]): Promise { 125 | const res = 126 | (await redisConnection.sadd("crawl:" + id + ":visited", ...urls)) !== 0; 127 | await redisConnection.expire("crawl:" + id + ":visited", 24 * 60 * 60, "NX"); 128 | return res; 129 | } 130 | 131 | export function crawlToCrawler(id: string, sc: StoredCrawl): WebCrawler { 132 | const crawler = new WebCrawler({ 133 | jobId: id, 134 | initialUrl: sc.originUrl, 135 | includes: sc.crawlerOptions?.includes ?? [], 136 | excludes: sc.crawlerOptions?.excludes ?? [], 137 | maxCrawledLinks: sc.crawlerOptions?.maxCrawledLinks ?? 1000, 138 | maxCrawledDepth: sc.crawlerOptions?.maxDepth ?? 10, 139 | limit: sc.crawlerOptions?.limit ?? 10000, 140 | allowExternalLinks: sc.crawlerOptions?.allowExternalLinks ?? 
false, 141 | crawlId: id, 142 | }); 143 | 144 | if (sc.robots !== undefined) { 145 | try { 146 | crawler.importRobotsTxt(sc.robots); 147 | } catch (_) {} 148 | } 149 | 150 | return crawler; 151 | } 152 | -------------------------------------------------------------------------------- /apps/api/src/lib/custom-error.ts: -------------------------------------------------------------------------------- 1 | export class CustomError extends Error { 2 | statusCode: number; 3 | status: string; 4 | message: string; 5 | dataIngestionJob: any; 6 | 7 | constructor( 8 | statusCode: number, 9 | status: string, 10 | message: string = "", 11 | dataIngestionJob?: any, 12 | ) { 13 | super(message); 14 | this.statusCode = statusCode; 15 | this.status = status; 16 | this.message = message; 17 | this.dataIngestionJob = dataIngestionJob; 18 | 19 | Object.setPrototypeOf(this, CustomError.prototype); 20 | } 21 | } 22 | 23 | -------------------------------------------------------------------------------- /apps/api/src/lib/default-values.ts: -------------------------------------------------------------------------------- 1 | export const defaultOrigin = "api"; 2 | 3 | export const defaultTimeout = 60000; // 60 seconds 4 | 5 | export const defaultPageOptions = { 6 | waitFor: 0, 7 | screenshot: false, 8 | fullPageScreenshot: false, 9 | parsePDF: true 10 | }; 11 | 12 | export const defaultCrawlerOptions = { 13 | limit: 10000 14 | } 15 | 16 | export const defaultCrawlPageOptions = { 17 | removeTags: [], 18 | parsePDF: true 19 | } 20 | -------------------------------------------------------------------------------- /apps/api/src/lib/entities.ts: -------------------------------------------------------------------------------- 1 | export interface Progress { 2 | current: number; 3 | total: number; 4 | status: string; 5 | metadata?: { 6 | sourceURL?: string; 7 | [key: string]: any; 8 | }; 9 | currentDocumentUrl?: string; 10 | currentDocument?: Document; 11 | } 12 | 13 | export type PageOptions = { 14 | includeMarkdown?: boolean; 15 | includeExtract?: boolean; 16 | includeRawHtml?: boolean; 17 | fallback?: boolean; 18 | fetchPageContent?: boolean; 19 | waitFor?: number; 20 | screenshot?: boolean; 21 | fullPageScreenshot?: boolean; 22 | headers?: Record; 23 | replaceAllPathsWithAbsolutePaths?: boolean; 24 | parsePDF?: boolean; 25 | removeTags?: string | string[]; 26 | onlyIncludeTags?: string | string[]; 27 | includeLinks?: boolean; 28 | useFastMode?: boolean; // beta 29 | disableJsDom?: boolean; // beta 30 | atsv?: boolean; // beta 31 | }; 32 | 33 | export type SearchOptions = { 34 | limit?: number; 35 | tbs?: string; 36 | filter?: string; 37 | lang?: string; 38 | country?: string; 39 | location?: string; 40 | }; 41 | 42 | export type CrawlerOptions = { 43 | returnOnlyUrls?: boolean; 44 | includes?: string | string[]; 45 | excludes?: string | string[]; 46 | maxCrawledLinks?: number; 47 | maxDepth?: number; 48 | limit?: number; 49 | replaceAllPathsWithAbsolutePaths?: boolean; 50 | ignoreSitemap?: boolean; 51 | mode?: "default" | "fast"; // have a mode of some sort 52 | allowExternalLinks?: boolean; 53 | }; 54 | 55 | export type WebScraperOptions = { 56 | jobId: string; 57 | urls: string[]; 58 | mode: "single_urls" | "sitemap" | "crawl"; 59 | crawlerOptions?: CrawlerOptions; 60 | pageOptions?: PageOptions; 61 | webhookUrls?: string[]; 62 | webhookMetadata?: any; 63 | concurrentRequests?: number; 64 | bullJobId?: string; 65 | crawlId: string; 66 | priority?: number; 67 | teamId?: string; 68 | }; 69 | 70 | export interface 
DocumentUrl { 71 | url: string; 72 | } 73 | 74 | export class Document { 75 | id?: string; 76 | url?: string; // Used only in /search for now 77 | content: string; 78 | markdown?: string; 79 | html?: string; 80 | rawHtml?: string; 81 | llm_extraction?: Record; 82 | createdAt?: Date; 83 | updatedAt?: Date; 84 | type?: string; 85 | metadata: { 86 | sourceURL?: string; 87 | [key: string]: any; 88 | }; 89 | childrenLinks?: string[]; 90 | provider?: string; 91 | warning?: string; 92 | 93 | index?: number; 94 | linksOnPage?: string[]; // Add this new field as a separate property 95 | 96 | constructor(data: Partial) { 97 | if (!data.content) { 98 | throw new Error("Missing required fields"); 99 | } 100 | this.content = data.content; 101 | this.createdAt = data.createdAt || new Date(); 102 | this.updatedAt = data.updatedAt || new Date(); 103 | this.type = data.type || "unknown"; 104 | this.metadata = data.metadata || { sourceURL: "" }; 105 | this.markdown = data.markdown || ""; 106 | this.childrenLinks = data.childrenLinks || undefined; 107 | this.provider = data.provider || undefined; 108 | this.linksOnPage = data.linksOnPage; // Assign linksOnPage if provided 109 | } 110 | } 111 | 112 | export class SearchResult { 113 | url: string; 114 | title: string; 115 | description: string; 116 | 117 | constructor(url: string, title: string, description: string) { 118 | this.url = url; 119 | this.title = title; 120 | this.description = description; 121 | } 122 | 123 | toString(): string { 124 | return `SearchResult(url=${this.url}, title=${this.title}, description=${this.description})`; 125 | } 126 | } 127 | -------------------------------------------------------------------------------- /apps/api/src/lib/go-html-to-md/README.md: -------------------------------------------------------------------------------- 1 | To build the go-html-to-md library, run the following command: 2 | 3 | ```bash 4 | cd apps/api/src/lib/go-html-to-md 5 | go build -o html-to-markdown.so -buildmode=c-shared html-to-markdown.go 6 | chmod +x html-to-markdown.so 7 | ``` -------------------------------------------------------------------------------- /apps/api/src/lib/go-html-to-md/go.mod: -------------------------------------------------------------------------------- 1 | module html-to-markdown.go 2 | 3 | go 1.19 4 | 5 | require github.com/JohannesKaufmann/html-to-markdown v1.6.0 6 | 7 | require ( 8 | github.com/PuerkitoBio/goquery v1.9.2 // indirect 9 | github.com/andybalholm/cascadia v1.3.2 // indirect 10 | github.com/kr/pretty v0.3.0 // indirect 11 | golang.org/x/net v0.25.0 // indirect 12 | gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect 13 | gopkg.in/yaml.v2 v2.4.0 // indirect 14 | ) 15 | -------------------------------------------------------------------------------- /apps/api/src/lib/go-html-to-md/html-to-markdown.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "C" 5 | "log" 6 | 7 | md "github.com/JohannesKaufmann/html-to-markdown" 8 | "github.com/JohannesKaufmann/html-to-markdown/plugin" 9 | ) 10 | 11 | //export ConvertHTMLToMarkdown 12 | func ConvertHTMLToMarkdown(html *C.char) *C.char { 13 | converter := md.NewConverter("", true, nil) 14 | converter.Use(plugin.GitHubFlavored()) 15 | 16 | markdown, err := converter.ConvertString(C.GoString(html)) 17 | if err != nil { 18 | log.Fatal(err) 19 | } 20 | return C.CString(markdown) 21 | } 22 | 23 | func main() { 24 | // This function is required for the main package 25 | } 26 | 
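Once the shared library has been built as described in the README above, a quick way to sanity-check the exported `ConvertHTMLToMarkdown` symbol from Node is a short koffi script. The snippet below is only an illustrative sketch: the file name and the relative path to `html-to-markdown.so` are assumptions for the example, not files in the repository. The actual production loader is `html-to-markdown.ts`, which follows and wraps the same binding in an async call.

```typescript
// check-go-parser.ts — hypothetical smoke test, not part of the repository
import koffi from "koffi";
import { join } from "path";

// Assumes the .so produced by `go build -buildmode=c-shared` sits in this folder.
const lib = koffi.load(join(__dirname, "html-to-markdown.so"));

// Bind the exported Go function: one C-string argument, C-string result.
const convertHTMLToMarkdown = lib.func("ConvertHTMLToMarkdown", "string", ["string"]);

// A plain synchronous call is enough for a one-off check.
console.log(convertHTMLToMarkdown("<p>Hello <strong>bold</strong> world!</p>"));
// Expected to print something like: Hello **bold** world!
```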
-------------------------------------------------------------------------------- /apps/api/src/lib/html-to-markdown.ts: -------------------------------------------------------------------------------- 1 | import koffi from "koffi"; 2 | import { join } from "path"; 3 | import dotenv from "dotenv"; 4 | import { Logger } from "./logger"; 5 | dotenv.config(); 6 | 7 | class GoMarkdownConverter { 8 | private static instance: GoMarkdownConverter; 9 | private convert: any; 10 | 11 | private constructor() { 12 | const goExecutablePath = join( 13 | __dirname, 14 | "go-html-to-md/html-to-markdown.so" 15 | ); 16 | const lib = koffi.load(goExecutablePath); 17 | this.convert = lib.func("ConvertHTMLToMarkdown", "string", ["string"]); 18 | } 19 | 20 | public static getInstance(): GoMarkdownConverter { 21 | if (!GoMarkdownConverter.instance) { 22 | GoMarkdownConverter.instance = new GoMarkdownConverter(); 23 | } 24 | return GoMarkdownConverter.instance; 25 | } 26 | 27 | public async convertHTMLToMarkdown(html: string): Promise { 28 | return new Promise((resolve, reject) => { 29 | this.convert.async(html, (err: Error, res: string) => { 30 | if (err) { 31 | reject(err); 32 | } else { 33 | resolve(res); 34 | } 35 | }); 36 | }); 37 | } 38 | } 39 | 40 | export async function parseMarkdown(html: string): Promise { 41 | if (!html) { 42 | return ""; 43 | } 44 | 45 | try { 46 | if (process.env.USE_GO_MARKDOWN_PARSER == "true") { 47 | const converter = GoMarkdownConverter.getInstance(); 48 | let markdownContent = await converter.convertHTMLToMarkdown(html); 49 | 50 | markdownContent = processMultiLineLinks(markdownContent); 51 | markdownContent = removeSkipToContentLinks(markdownContent); 52 | Logger.info(`HTML to Markdown conversion using Go parser successful`); 53 | return markdownContent; 54 | } 55 | } catch (error) { 56 | Logger.error(`Error converting HTML to Markdown with Go parser: ${error}`); 57 | } 58 | 59 | // Fallback to TurndownService if Go parser fails or is not enabled 60 | var TurndownService = require("turndown"); 61 | var turndownPluginGfm = require("joplin-turndown-plugin-gfm"); 62 | 63 | const turndownService = new TurndownService(); 64 | turndownService.addRule("inlineLink", { 65 | filter: function (node, options) { 66 | return ( 67 | options.linkStyle === "inlined" && 68 | node.nodeName === "A" && 69 | node.getAttribute("href") 70 | ); 71 | }, 72 | replacement: function (content, node) { 73 | var href = node.getAttribute("href").trim(); 74 | var title = node.title ? 
' "' + node.title + '"' : ""; 75 | return "[" + content.trim() + "](" + href + title + ")\n"; 76 | }, 77 | }); 78 | var gfm = turndownPluginGfm.gfm; 79 | turndownService.use(gfm); 80 | 81 | try { 82 | let markdownContent = await turndownService.turndown(html); 83 | markdownContent = processMultiLineLinks(markdownContent); 84 | markdownContent = removeSkipToContentLinks(markdownContent); 85 | 86 | return markdownContent; 87 | } catch (error) { 88 | console.error("Error converting HTML to Markdown: ", error); 89 | return ""; // Optionally return an empty string or handle the error as needed 90 | } 91 | } 92 | 93 | function processMultiLineLinks(markdownContent: string): string { 94 | let insideLinkContent = false; 95 | let newMarkdownContent = ""; 96 | let linkOpenCount = 0; 97 | for (let i = 0; i < markdownContent.length; i++) { 98 | const char = markdownContent[i]; 99 | 100 | if (char == "[") { 101 | linkOpenCount++; 102 | } else if (char == "]") { 103 | linkOpenCount = Math.max(0, linkOpenCount - 1); 104 | } 105 | insideLinkContent = linkOpenCount > 0; 106 | 107 | if (insideLinkContent && char == "\n") { 108 | newMarkdownContent += "\\" + "\n"; 109 | } else { 110 | newMarkdownContent += char; 111 | } 112 | } 113 | return newMarkdownContent; 114 | } 115 | 116 | function removeSkipToContentLinks(markdownContent: string): string { 117 | // Remove [Skip to Content](#page) and [Skip to content](#skip) 118 | const newMarkdownContent = markdownContent.replace( 119 | /\[Skip to Content\]\(#[^\)]*\)/gi, 120 | "" 121 | ); 122 | return newMarkdownContent; 123 | } 124 | -------------------------------------------------------------------------------- /apps/api/src/lib/job-priority.ts: -------------------------------------------------------------------------------- 1 | import { redisConnection } from "../../src/services/queue-service"; 2 | import { PlanType } from "../../src/types"; 3 | import { Logger } from "./logger"; 4 | 5 | const SET_KEY_PREFIX = "limit_team_id:"; 6 | export async function addJobPriority(team_id, job_id) { 7 | try { 8 | const setKey = SET_KEY_PREFIX + team_id; 9 | 10 | // Add scrape job id to the set 11 | await redisConnection.sadd(setKey, job_id); 12 | 13 | // This approach will reset the expiration time to 60 seconds every time a new job is added to the set. 
14 | await redisConnection.expire(setKey, 60); 15 | } catch (e) { 16 | Logger.error(`Add job priority (sadd) failed: ${team_id}, ${job_id}`); 17 | } 18 | } 19 | 20 | export async function deleteJobPriority(team_id, job_id) { 21 | try { 22 | const setKey = SET_KEY_PREFIX + team_id; 23 | 24 | // remove job_id from the set 25 | await redisConnection.srem(setKey, job_id); 26 | } catch (e) { 27 | Logger.error(`Delete job priority (srem) failed: ${team_id}, ${job_id}`); 28 | } 29 | } 30 | 31 | export async function getJobPriority({ 32 | plan, 33 | team_id, 34 | basePriority = 10, 35 | }: { 36 | plan: PlanType; 37 | team_id: string; 38 | basePriority?: number; 39 | }): Promise { 40 | try { 41 | const setKey = SET_KEY_PREFIX + team_id; 42 | 43 | // Get the length of the set 44 | const setLength = await redisConnection.scard(setKey); 45 | 46 | // Determine the priority based on the plan and set length 47 | let planModifier = 1; 48 | let bucketLimit = 0; 49 | 50 | switch (plan) { 51 | case "free": 52 | bucketLimit = 25; 53 | planModifier = 0.5; 54 | break; 55 | case "hobby": 56 | bucketLimit = 100; 57 | planModifier = 0.3; 58 | break; 59 | case "standard": 60 | case "standardnew": 61 | bucketLimit = 200; 62 | planModifier = 0.2; 63 | break; 64 | case "growth": 65 | case "growthdouble": 66 | bucketLimit = 400; 67 | planModifier = 0.1; 68 | break; 69 | 70 | default: 71 | bucketLimit = 25; 72 | planModifier = 1; 73 | break; 74 | } 75 | 76 | // if length set is smaller than set, just return base priority 77 | if (setLength <= bucketLimit) { 78 | return basePriority; 79 | } else { 80 | // If not, we keep base priority + planModifier 81 | return Math.ceil( 82 | basePriority + Math.ceil((setLength - bucketLimit) * planModifier) 83 | ); 84 | } 85 | } catch (e) { 86 | Logger.error( 87 | `Get job priority failed: ${team_id}, ${plan}, ${basePriority}` 88 | ); 89 | return basePriority; 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /apps/api/src/lib/logger.ts: -------------------------------------------------------------------------------- 1 | import { configDotenv } from "dotenv"; 2 | configDotenv(); 3 | 4 | enum LogLevel { 5 | NONE = "NONE", // No logs will be output. 6 | ERROR = "ERROR", // For logging error messages that indicate a failure in a specific operation. 7 | WARN = "WARN", // For logging potentially harmful situations that are not necessarily errors. 8 | INFO = "INFO", // For logging informational messages that highlight the progress of the application. 9 | DEBUG = "DEBUG", // For logging detailed information on the flow through the system, primarily used for debugging. 10 | TRACE = "TRACE", // For logging more detailed information than the DEBUG level. 
11 | } 12 | export class Logger { 13 | static colors = { 14 | ERROR: "\x1b[31m%s\x1b[0m", // Red 15 | WARN: "\x1b[33m%s\x1b[0m", // Yellow 16 | INFO: "\x1b[34m%s\x1b[0m", // Blue 17 | DEBUG: "\x1b[36m%s\x1b[0m", // Cyan 18 | TRACE: "\x1b[35m%s\x1b[0m", // Magenta 19 | }; 20 | 21 | static log(message: string, level: LogLevel) { 22 | const logLevel: LogLevel = 23 | LogLevel[process.env.LOGGING_LEVEL as keyof typeof LogLevel] || 24 | LogLevel.INFO; 25 | const levels = [ 26 | LogLevel.NONE, 27 | LogLevel.ERROR, 28 | LogLevel.WARN, 29 | LogLevel.INFO, 30 | LogLevel.DEBUG, 31 | LogLevel.TRACE, 32 | ]; 33 | const currentLevelIndex = levels.indexOf(logLevel); 34 | const messageLevelIndex = levels.indexOf(level); 35 | 36 | if (currentLevelIndex >= messageLevelIndex) { 37 | const color = Logger.colors[level]; 38 | console[level.toLowerCase()]( 39 | color, 40 | `[${new Date().toISOString()}]${level} - ${message}` 41 | ); 42 | } 43 | } 44 | static error(message: string | any) { 45 | Logger.log(message, LogLevel.ERROR); 46 | } 47 | 48 | static warn(message: string) { 49 | Logger.log(message, LogLevel.WARN); 50 | } 51 | 52 | static info(message: string) { 53 | Logger.log(message, LogLevel.INFO); 54 | } 55 | 56 | static debug(message: string) { 57 | Logger.log(message, LogLevel.DEBUG); 58 | } 59 | 60 | static trace(message: string) { 61 | Logger.log(message, LogLevel.TRACE); 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /apps/api/src/lib/parse-mode.ts: -------------------------------------------------------------------------------- 1 | export function parseMode(mode: string) { 2 | switch (mode) { 3 | case "single_urls": 4 | return "single_urls"; 5 | case "sitemap": 6 | return "sitemap"; 7 | case "crawl": 8 | return "crawl"; 9 | default: 10 | return "single_urls"; 11 | } 12 | } 13 | -------------------------------------------------------------------------------- /apps/api/src/lib/parseApi.ts: -------------------------------------------------------------------------------- 1 | export function parseApi(api: string) { 2 | // Handle older versions of the API that don't have the fc- prefix 3 | if (!api.startsWith("fc-")) { 4 | return api; 5 | } 6 | 7 | // remove the fc- prefix 8 | // re add all the dashes based on the uuidv4 format 9 | // 3d478a29-6e59-403e-85c7-94aba81ffd2a 10 | const uuid = api 11 | .replace(/^fc-/, "") 12 | .replace(/(.{8})(.{4})(.{4})(.{4})(.{12})/, "$1-$2-$3-$4-$5"); 13 | return uuid; 14 | } 15 | 16 | 17 | export function uuidToFcUuid(uuid: string) { 18 | const uuidWithoutDashes = uuid.replace(/-/g, ""); 19 | return `fc-${uuidWithoutDashes}`; 20 | } 21 | -------------------------------------------------------------------------------- /apps/api/src/lib/timeout.ts: -------------------------------------------------------------------------------- 1 | export const axiosTimeout = 3000; -------------------------------------------------------------------------------- /apps/api/src/lib/withAuth.ts: -------------------------------------------------------------------------------- 1 | import { AuthResponse } from "../../src/types"; 2 | import { Logger } from "./logger"; 3 | import { configDotenv } from "dotenv"; 4 | configDotenv(); 5 | 6 | let warningCount = 0; 7 | 8 | export function withAuth( 9 | originalFunction: (...args: U) => Promise 10 | ) { 11 | return async function (...args: U): Promise { 12 | const useDbAuthentication = process.env.USE_DB_AUTHENTICATION === 'true'; 13 | if (!useDbAuthentication) { 14 | if (warningCount < 5) { 15 | 
Logger.warn("You're bypassing authentication"); 16 | warningCount++; 17 | } 18 | return { success: true } as T; 19 | } else { 20 | try { 21 | return await originalFunction(...args); 22 | } catch (error) { 23 | Logger.error(`Error in withAuth function: ${error}`); 24 | return { success: false, error: error.message } as T; 25 | } 26 | } 27 | }; 28 | } 29 | -------------------------------------------------------------------------------- /apps/api/src/main/runWebScraper.ts: -------------------------------------------------------------------------------- 1 | import { Job } from "bullmq"; 2 | import { 3 | WebScraperOptions, 4 | RunWebScraperParams, 5 | RunWebScraperResult, 6 | } from "../types"; 7 | import { WebScraperDataProvider } from "../scraper/WebScraper"; 8 | import { Progress } from "../lib/entities"; 9 | import { Document } from "../lib/entities"; 10 | import { Logger } from "../lib/logger"; 11 | import { configDotenv } from "dotenv"; 12 | configDotenv(); 13 | 14 | export async function startWebScraperPipeline({ 15 | job, 16 | token, 17 | }: { 18 | job: Job; 19 | token: string; 20 | }) { 21 | let partialDocs: Document[] = []; 22 | return (await runWebScraper({ 23 | url: job.data.url, 24 | mode: job.data.mode, 25 | crawlerOptions: job.data.crawlerOptions, 26 | pageOptions: { 27 | ...job.data.pageOptions, 28 | ...(job.data.crawl_id 29 | ? { 30 | includeRawHtml: true, 31 | } 32 | : {}), 33 | }, 34 | webhookUrls: job.data.webhookUrls, 35 | webhookMetadata: job.data.webhookMetadata, 36 | inProgress: (progress) => { 37 | Logger.debug(`🐂 Job in progress ${job.id}`); 38 | if (progress.currentDocument) { 39 | partialDocs.push(progress.currentDocument); 40 | if (partialDocs.length > 50) { 41 | partialDocs = partialDocs.slice(-50); 42 | } 43 | // job.updateProgress({ ...progress, partialDocs: partialDocs }); 44 | } 45 | }, 46 | onSuccess: (result, mode) => { 47 | Logger.debug(`🐂 Job completed ${job.id}`); 48 | }, 49 | onError: (error) => { 50 | Logger.error(`🐂 Job failed ${job.id}`); 51 | job.moveToFailed(error, token, false); 52 | }, 53 | team_id: job.data.team_id, 54 | bull_job_id: job.id.toString(), 55 | priority: job.opts.priority, 56 | is_scrape: job.data.is_scrape ?? 
false, 57 | crawl_id: job.data.crawl_id, 58 | })) as { success: boolean; message: string; docs: Document[] }; 59 | } 60 | export async function runWebScraper({ 61 | url, 62 | mode, 63 | crawlerOptions, 64 | pageOptions, 65 | webhookUrls, 66 | webhookMetadata, 67 | inProgress, 68 | onSuccess, 69 | onError, 70 | team_id, 71 | bull_job_id, 72 | crawl_id, 73 | priority, 74 | is_scrape = false, 75 | }: RunWebScraperParams): Promise { 76 | try { 77 | const provider = new WebScraperDataProvider(); 78 | if (mode === "crawl") { 79 | provider.setOptions({ 80 | jobId: bull_job_id, 81 | mode: mode, 82 | urls: [url], 83 | crawlerOptions: crawlerOptions, 84 | pageOptions: pageOptions, 85 | webhookUrls: webhookUrls, 86 | webhookMetadata: webhookMetadata, 87 | bullJobId: bull_job_id, 88 | crawlId: crawl_id, 89 | priority, 90 | }); 91 | } else { 92 | provider.setOptions({ 93 | jobId: bull_job_id, 94 | mode: mode, 95 | urls: url.split(","), 96 | crawlerOptions: crawlerOptions, 97 | pageOptions: pageOptions, 98 | webhookUrls: webhookUrls, 99 | webhookMetadata: webhookMetadata, 100 | crawlId: crawl_id, 101 | teamId: team_id, 102 | }); 103 | } 104 | const docs = (await provider.getDocuments(false, (progress: Progress) => { 105 | inProgress(progress); 106 | })) as Document[]; 107 | 108 | if (docs.length === 0) { 109 | return { 110 | success: true, 111 | message: "No pages found", 112 | docs: [], 113 | }; 114 | } 115 | 116 | // remove docs with empty content 117 | const filteredDocs = crawlerOptions.returnOnlyUrls 118 | ? docs.map((doc) => { 119 | if (doc.metadata.sourceURL) { 120 | return { url: doc.metadata.sourceURL }; 121 | } 122 | }) 123 | : docs; 124 | 125 | // This is where the returnvalue from the job is set 126 | onSuccess(filteredDocs, mode); 127 | 128 | // this return doesn't matter too much for the job completion result 129 | return { success: true, message: "", docs: filteredDocs }; 130 | } catch (error) { 131 | onError(error); 132 | return { success: false, message: error.message, docs: [] }; 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /apps/api/src/openapi/index.ts: -------------------------------------------------------------------------------- 1 | import { WithWebsocketMethod } from "express-ws"; 2 | import { Application } from "express"; 3 | import swaggerJsdoc from "swagger-jsdoc"; 4 | import redoc from "redoc-express"; 5 | 6 | const options = { 7 | failOnErrors: true, 8 | definition: { 9 | openapi: "3.0.0", 10 | info: { 11 | title: "Firecrawl API", 12 | description: "API for web scraping and crawling", 13 | version: "1.0.0", 14 | }, 15 | servers: [ 16 | { 17 | url: "/api/v1", 18 | description: "Version 1", 19 | }, 20 | ], 21 | components: { 22 | securitySchemes: { 23 | BearerAuth: { 24 | type: "http", 25 | scheme: "bearer", 26 | }, 27 | }, 28 | }, 29 | }, 30 | apis: ["./src/controllers/v1/*.ts"], 31 | }; 32 | 33 | export function setupOpenAPI(app: Application & WithWebsocketMethod) { 34 | const openapiSpecification = swaggerJsdoc(options); 35 | 36 | app.get("/api-docs/openapi.json", (req, res) => { 37 | res.setHeader("Content-Type", "application/json"); 38 | res.send(openapiSpecification); 39 | }); 40 | 41 | app.get( 42 | "/redoc", 43 | redoc({ 44 | title: "API Docs", 45 | specUrl: "/api-docs/openapi.json", 46 | nonce: "", // <= it is optional,we can omit this key and value 47 | // we are now start supporting the redocOptions object 48 | // you can omit the options object if you don't need it 49 | // 
https://redocly.com/docs/api-reference-docs/configuration/functionality/ 50 | redocOptions: {}, 51 | }) 52 | ); 53 | } 54 | -------------------------------------------------------------------------------- /apps/api/src/routes/admin.ts: -------------------------------------------------------------------------------- 1 | import express from "express"; 2 | import { redisHealthController } from "../controllers/v0/admin/redis-health"; 3 | import { 4 | checkQueuesController, 5 | cleanBefore24hCompleteJobsController, 6 | queuesController, 7 | } from "../controllers/v0/admin/queue"; 8 | 9 | export const adminRouter = express.Router(); 10 | 11 | adminRouter.get( 12 | `/admin/${process.env.BULL_AUTH_KEY}/redis-health`, 13 | redisHealthController 14 | ); 15 | 16 | adminRouter.get( 17 | `/admin/${process.env.BULL_AUTH_KEY}/clean-before-24h-complete-jobs`, 18 | cleanBefore24hCompleteJobsController 19 | ); 20 | 21 | adminRouter.get( 22 | `/admin/${process.env.BULL_AUTH_KEY}/check-queues`, 23 | checkQueuesController 24 | ); 25 | 26 | adminRouter.get(`/admin/${process.env.BULL_AUTH_KEY}/queues`, queuesController); 27 | -------------------------------------------------------------------------------- /apps/api/src/routes/v1.ts: -------------------------------------------------------------------------------- 1 | import express, { NextFunction, Request, Response } from "express"; 2 | import { RateLimiterMode } from "../types"; 3 | import { authenticateUser } from "../controllers/auth"; 4 | import { crawlController } from "../controllers/v1/crawl"; 5 | import { scrapeController } from "../../src/controllers/v1/scrape"; 6 | import { crawlStatusController } from "../controllers/v1/crawl-status"; 7 | import { mapController } from "../controllers/v1/map"; 8 | import { RequestWithMaybeAuth } from "../controllers/v1/types"; 9 | import { createIdempotencyKey } from "../services/idempotency/create"; 10 | import { crawlCancelController } from "../controllers/v1/crawl-cancel"; 11 | import { scrapeStatusController } from "../controllers/v1/scrape-status"; 12 | import { livenessController } from "../controllers/v1/liveness"; 13 | import { readinessController } from "../controllers/v1/readiness"; 14 | import expressWs from "express-ws"; 15 | 16 | export function authMiddleware( 17 | rateLimiterMode: RateLimiterMode 18 | ): (req: RequestWithMaybeAuth, res: Response, next: NextFunction) => void { 19 | return (req, res, next) => { 20 | (async () => { 21 | const { success, team_id, error, status, plan } = await authenticateUser( 22 | req, 23 | res, 24 | rateLimiterMode 25 | ); 26 | 27 | if (!success) { 28 | if (!res.headersSent) { 29 | return res.status(status).json({ success: false, error }); 30 | } 31 | } 32 | 33 | req.auth = { team_id, plan }; 34 | next(); 35 | })().catch((err) => next(err)); 36 | }; 37 | } 38 | 39 | function idempotencyMiddleware( 40 | req: Request, 41 | res: Response, 42 | next: NextFunction 43 | ) { 44 | (async () => { 45 | if (req.headers["x-idempotency-key"]) { 46 | createIdempotencyKey(req); 47 | } 48 | next(); 49 | })().catch((err) => next(err)); 50 | } 51 | 52 | function wrap( 53 | controller: (req: Request, res: Response) => Promise 54 | ): (req: Request, res: Response, next: NextFunction) => any { 55 | return (req, res, next) => { 56 | controller(req, res).catch((err) => next(err)); 57 | }; 58 | } 59 | 60 | expressWs(express()); 61 | 62 | export const v1Router = express.Router(); 63 | 64 | v1Router.post( 65 | "/scrape", 66 | authMiddleware(RateLimiterMode.Scrape), 67 | wrap(scrapeController) 68 | 
); 69 | 70 | v1Router.post( 71 | "/crawl", 72 | authMiddleware(RateLimiterMode.Crawl), 73 | idempotencyMiddleware, 74 | wrap(crawlController) 75 | ); 76 | 77 | v1Router.post("/map", authMiddleware(RateLimiterMode.Map), wrap(mapController)); 78 | 79 | v1Router.get( 80 | "/crawl/:jobId", 81 | authMiddleware(RateLimiterMode.CrawlStatus), 82 | wrap(crawlStatusController) 83 | ); 84 | 85 | v1Router.get("/scrape/:jobId", wrap(scrapeStatusController)); 86 | 87 | v1Router.delete( 88 | "/crawl/:jobId", 89 | authMiddleware(RateLimiterMode.Crawl), 90 | crawlCancelController 91 | ); 92 | 93 | // Health/Probe routes 94 | v1Router.get("/health/liveness", livenessController); 95 | v1Router.get("/health/readiness", readinessController); 96 | -------------------------------------------------------------------------------- /apps/api/src/run-req.ts: -------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | import { promises as fs } from "fs"; 3 | import { v4 as uuidV4 } from "uuid"; 4 | 5 | interface Result { 6 | start_url: string; 7 | job_id?: string; 8 | idempotency_key?: string; 9 | result_data_jsonb?: any; 10 | } 11 | 12 | async function processResults(results: Result[]): Promise { 13 | let processedCount = 0; 14 | let starterCount = 0; 15 | const queue: Result[] = []; 16 | const processedUrls = new Set(); 17 | 18 | // Initialize the queue with the first 1000 results 19 | for (let i = 0; i < Math.min(100, results.length); i++) { 20 | queue.push(results[i]); 21 | processedUrls.add(results[i].start_url); 22 | } 23 | } 24 | 25 | // Example call 26 | 27 | async function getStartUrls(): Promise { 28 | try { 29 | const data = await fs.readFile("starturls.json", "utf-8"); 30 | return JSON.parse(data); 31 | } catch (error) { 32 | console.error("Error reading starturls.json:", error); 33 | return []; 34 | } 35 | } 36 | 37 | async function main() { 38 | const results: Result[] = (await getStartUrls()).slice(3999, 6000); 39 | // console.log(results.map((r) => r.start_url).slice(0, 3)); 40 | 41 | processResults(results) 42 | .then(() => { 43 | console.log("All results processed."); 44 | }) 45 | .catch((error) => { 46 | console.error("Error processing results:", error); 47 | }); 48 | } 49 | 50 | main(); 51 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/__tests__/crawler.test.ts: -------------------------------------------------------------------------------- 1 | // crawler.test.ts 2 | import { WebCrawler } from "../crawler"; 3 | import axios from "axios"; 4 | import robotsParser from "robots-parser"; 5 | 6 | jest.mock("axios"); 7 | jest.mock("robots-parser"); 8 | 9 | describe("WebCrawler", () => { 10 | let crawler: WebCrawler; 11 | const mockAxios = axios as jest.Mocked; 12 | const mockRobotsParser = robotsParser as jest.MockedFunction< 13 | typeof robotsParser 14 | >; 15 | 16 | beforeEach(() => { 17 | // Setup default mocks 18 | mockAxios.get.mockImplementation((url) => { 19 | if (url.includes("robots.txt")) { 20 | return Promise.resolve({ data: "User-agent: *\nAllow: /" }); 21 | } else if (url.includes("sitemap.xml")) { 22 | return Promise.resolve({ data: "sitemap content" }); // You would normally parse this to URLs 23 | } 24 | return Promise.resolve({ data: "" }); 25 | }); 26 | 27 | mockRobotsParser.mockReturnValue({ 28 | isAllowed: jest.fn().mockReturnValue(true), 29 | isDisallowed: jest.fn().mockReturnValue(false), 30 | getMatchingLineNumber: jest.fn().mockReturnValue(0), 31 | 
getCrawlDelay: jest.fn().mockReturnValue(0), 32 | getSitemaps: jest.fn().mockReturnValue([]), 33 | getPreferredHost: jest.fn().mockReturnValue("example.com"), 34 | }); 35 | }); 36 | 37 | it("should ignore social media and email links", async () => { 38 | const urlsWhichShouldGetBlocked = [ 39 | "http://facebook.com", 40 | "http://www.facebook.com", 41 | "https://facebook.com", 42 | "https://test.facebook.com", 43 | "https://en.wikipedia.com/barman", 44 | "https://docs.mux.com/guides/player", 45 | "https://mux.com", 46 | "https://x.com", 47 | ]; 48 | 49 | crawler = new WebCrawler({ 50 | jobId: "TEST", 51 | initialUrl: "http://example.com", 52 | includes: [], 53 | excludes: [], 54 | limit: 100, 55 | maxCrawledDepth: 10, 56 | crawlId: "TEST", 57 | }); 58 | 59 | const filteredLinks = urlsWhichShouldGetBlocked.filter( 60 | (url) => !crawler.isSocialMediaOrEmail(url), 61 | ); 62 | 63 | expect(filteredLinks).toContain("https://docs.mux.com/guides/player"); 64 | expect(filteredLinks.length).toBe(2); 65 | }); 66 | }); 67 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/__tests__/dns.test.ts: -------------------------------------------------------------------------------- 1 | import CacheableLookup from 'cacheable-lookup'; 2 | import https from 'node:https'; 3 | import axios from "axios"; 4 | 5 | describe("DNS", () => { 6 | it("cached dns", async () => { 7 | const cachedDns = new CacheableLookup(); 8 | cachedDns.install(https.globalAgent); 9 | jest.spyOn(cachedDns, "lookupAsync"); 10 | 11 | const res = await axios.get("https://example.com"); 12 | expect(res.status).toBe(200); 13 | expect(cachedDns.lookupAsync).toHaveBeenCalled(); 14 | }); 15 | }); 16 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/global.ts: -------------------------------------------------------------------------------- 1 | export const universalTimeout = 60000; -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/scrapers/fetch.ts: -------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | import { universalTimeout } from "../global"; 3 | import { Logger } from "../../../lib/logger"; 4 | 5 | /** 6 | * Scrapes a URL with Axios 7 | * @param url The URL to scrape 8 | * @returns The scraped content 9 | */ 10 | export async function scrapeWithFetch( 11 | url: string 12 | ): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { 13 | const logParams = { 14 | url, 15 | scraper: "fetch", 16 | success: false, 17 | response_code: null, 18 | time_taken_seconds: null, 19 | error_message: null, 20 | html: "", 21 | startTime: Date.now(), 22 | }; 23 | 24 | try { 25 | const response = await axios.get(url, { 26 | headers: { 27 | "Content-Type": "application/json", 28 | }, 29 | timeout: universalTimeout, 30 | transformResponse: [(data) => data], // Prevent axios from parsing JSON automatically 31 | }); 32 | 33 | if (response.status !== 200) { 34 | Logger.debug( 35 | `⛏️ Axios: Failed to fetch url: ${url} with status: ${response.status}` 36 | ); 37 | logParams.error_message = response.statusText; 38 | logParams.response_code = response.status; 39 | return { 40 | content: "", 41 | pageStatusCode: response.status, 42 | pageError: response.statusText, 43 | }; 44 | } 45 | 46 | const text = response.data; 47 | logParams.success = true; 48 | logParams.html = text; 49 | 
logParams.response_code = response.status; 50 | return { content: text, pageStatusCode: response.status, pageError: null }; 51 | } catch (error) { 52 | if (error.code === "ECONNABORTED") { 53 | logParams.error_message = "Request timed out"; 54 | Logger.debug(`⛏️ Axios: Request timed out for ${url}`); 55 | } else { 56 | logParams.error_message = error.message || error; 57 | Logger.debug(`⛏️ Axios: Failed to fetch url: ${url} | Error: ${error}`); 58 | } 59 | return { 60 | content: "", 61 | pageStatusCode: null, 62 | pageError: logParams.error_message, 63 | }; 64 | } finally { 65 | const endTime = Date.now(); 66 | logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/scrapers/playwright.ts: -------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | import { generateRequestParams } from "../single_url"; 3 | import { universalTimeout } from "../global"; 4 | import { Logger } from "../../../lib/logger"; 5 | 6 | /** 7 | * Scrapes a URL with Playwright 8 | * @param url The URL to scrape 9 | * @param waitFor The time to wait for the page to load 10 | * @param headers The headers to send with the request 11 | * @param pageOptions The options for the page 12 | * @returns The scraped content 13 | */ 14 | export async function scrapeWithPlaywright( 15 | url: string, 16 | waitFor: number = 0, 17 | headers?: Record, 18 | ): Promise<{ content: string; pageStatusCode?: number; pageError?: string }> { 19 | const logParams = { 20 | url, 21 | scraper: "playwright", 22 | success: false, 23 | response_code: null, 24 | time_taken_seconds: null, 25 | error_message: null, 26 | html: "", 27 | startTime: Date.now(), 28 | }; 29 | 30 | try { 31 | const reqParams = await generateRequestParams(url); 32 | const waitParam = reqParams["params"]?.wait ?? waitFor; 33 | 34 | const response = await axios.post( 35 | process.env.PLAYWRIGHT_MICROSERVICE_URL, 36 | { 37 | url: url, 38 | wait_after_load: waitParam, 39 | headers: headers, 40 | }, 41 | { 42 | headers: { 43 | "Content-Type": "application/json", 44 | }, 45 | timeout: universalTimeout + waitParam, 46 | transformResponse: [(data) => data], 47 | } 48 | ); 49 | 50 | if (response.status !== 200) { 51 | Logger.debug( 52 | `⛏️ Playwright: Failed to fetch url: ${url} | status: ${response.status}, error: ${response.data?.pageError}` 53 | ); 54 | logParams.error_message = response.data?.pageError; 55 | logParams.response_code = response.data?.pageStatusCode; 56 | return { 57 | content: "", 58 | pageStatusCode: response.data?.pageStatusCode, 59 | pageError: response.data?.pageError, 60 | }; 61 | } 62 | 63 | const textData = response.data; 64 | try { 65 | const data = JSON.parse(textData); 66 | const html = data.content; 67 | logParams.success = true; 68 | logParams.html = html; 69 | logParams.response_code = data.pageStatusCode; 70 | logParams.error_message = data.pageError; 71 | return { 72 | content: html ?? 
"", 73 | pageStatusCode: data.pageStatusCode, 74 | pageError: data.pageError, 75 | }; 76 | } catch (jsonError) { 77 | logParams.error_message = jsonError.message || jsonError; 78 | Logger.debug( 79 | `⛏️ Playwright: Error parsing JSON response for url: ${url} | Error: ${jsonError}` 80 | ); 81 | return { 82 | content: "", 83 | pageStatusCode: null, 84 | pageError: logParams.error_message, 85 | }; 86 | } 87 | } catch (error) { 88 | if (error.code === "ECONNABORTED") { 89 | logParams.error_message = "Request timed out"; 90 | Logger.debug(`⛏️ Playwright: Request timed out for ${url}`); 91 | } else { 92 | logParams.error_message = error.message || error; 93 | Logger.debug( 94 | `⛏️ Playwright: Failed to fetch url: ${url} | Error: ${error}` 95 | ); 96 | } 97 | return { 98 | content: "", 99 | pageStatusCode: null, 100 | pageError: logParams.error_message, 101 | }; 102 | } finally { 103 | const endTime = Date.now(); 104 | logParams.time_taken_seconds = (endTime - logParams.startTime) / 1000; 105 | } 106 | } 107 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/sitemap.ts: -------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | import { axiosTimeout } from "../../lib/timeout"; 3 | import { parseStringPromise } from "xml2js"; 4 | import { WebCrawler } from "./crawler"; 5 | import { Logger } from "../../lib/logger"; 6 | 7 | export async function getLinksFromSitemap({ 8 | sitemapUrl, 9 | allUrls = [], 10 | }: { 11 | sitemapUrl: string; 12 | allUrls?: string[]; 13 | }): Promise { 14 | try { 15 | let content: string; 16 | try { 17 | const response = await axios.get(sitemapUrl, { timeout: axiosTimeout }); 18 | content = response.data; 19 | } catch (error) { 20 | Logger.error(`Request failed for ${sitemapUrl}: ${error.message}`); 21 | 22 | return allUrls; 23 | } 24 | 25 | const parsed = await parseStringPromise(content); 26 | const root = parsed.urlset || parsed.sitemapindex; 27 | 28 | if (root && root.sitemap) { 29 | for (const sitemap of root.sitemap) { 30 | if (sitemap.loc && sitemap.loc.length > 0) { 31 | await getLinksFromSitemap({ 32 | sitemapUrl: sitemap.loc[0], 33 | allUrls, 34 | }); 35 | } 36 | } 37 | } else if (root && root.url) { 38 | for (const url of root.url) { 39 | if ( 40 | url.loc && 41 | url.loc.length > 0 && 42 | !WebCrawler.prototype.isFile(url.loc[0]) 43 | ) { 44 | allUrls.push(url.loc[0]); 45 | } 46 | } 47 | } 48 | } catch (error) { 49 | Logger.debug( 50 | `Error processing sitemapUrl: ${sitemapUrl} | Error: ${error.message}` 51 | ); 52 | } 53 | 54 | return allUrls; 55 | } 56 | 57 | export const fetchSitemapData = async ( 58 | url: string, 59 | timeout?: number 60 | ): Promise => { 61 | const sitemapUrl = url.endsWith("/sitemap.xml") ? 
url : `${url}/sitemap.xml`; 62 | try { 63 | const response = await axios.get(sitemapUrl, { 64 | timeout: timeout || axiosTimeout, 65 | }); 66 | if (response.status === 200) { 67 | const xml = response.data; 68 | const parsedXml = await parseStringPromise(xml); 69 | 70 | const sitemapData: SitemapEntry[] = []; 71 | if (parsedXml.urlset && parsedXml.urlset.url) { 72 | for (const urlElement of parsedXml.urlset.url) { 73 | const sitemapEntry: SitemapEntry = { loc: urlElement.loc[0] }; 74 | if (urlElement.lastmod) sitemapEntry.lastmod = urlElement.lastmod[0]; 75 | if (urlElement.changefreq) 76 | sitemapEntry.changefreq = urlElement.changefreq[0]; 77 | if (urlElement.priority) 78 | sitemapEntry.priority = Number(urlElement.priority[0]); 79 | sitemapData.push(sitemapEntry); 80 | } 81 | } 82 | 83 | return sitemapData; 84 | } 85 | return null; 86 | } catch (error) { 87 | // Error handling for failed sitemap fetch 88 | } 89 | return []; 90 | }; 91 | 92 | export interface SitemapEntry { 93 | loc: string; 94 | lastmod?: string; 95 | changefreq?: string; 96 | priority?: number; 97 | } 98 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/__tests__/maxDepthUtils.test.ts: -------------------------------------------------------------------------------- 1 | import { getURLDepth, getAdjustedMaxDepth } from '../maxDepthUtils'; 2 | 3 | describe('Testing getURLDepth and getAdjustedMaxDepth', () => { 4 | it('should return 0 for root - mendable.ai', () => { 5 | const enteredURL = "https://www.mendable.ai/" 6 | expect(getURLDepth(enteredURL)).toBe(0); 7 | }); 8 | 9 | it('should return 0 for root - scrapethissite.com', () => { 10 | const enteredURL = "https://scrapethissite.com/" 11 | expect(getURLDepth(enteredURL)).toBe(0); 12 | }); 13 | 14 | it('should return 1 for scrapethissite.com/pages', () => { 15 | const enteredURL = "https://scrapethissite.com/pages" 16 | expect(getURLDepth(enteredURL)).toBe(1); 17 | }); 18 | 19 | it('should return 2 for scrapethissite.com/pages/articles', () => { 20 | const enteredURL = "https://scrapethissite.com/pages/articles" 21 | expect(getURLDepth(enteredURL)).toBe(2); 22 | 23 | }); 24 | 25 | it('Adjusted maxDepth should return 1 for scrapethissite.com and max depth param of 1', () => { 26 | const enteredURL = "https://scrapethissite.com" 27 | expect(getAdjustedMaxDepth(enteredURL, 1)).toBe(1); 28 | 29 | }); 30 | it('Adjusted maxDepth should return 0 for scrapethissite.com and max depth param of 0', () => { 31 | const enteredURL = "https://scrapethissite.com" 32 | expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0); 33 | 34 | }); 35 | 36 | it('Adjusted maxDepth should return 0 for mendable.ai and max depth param of 0', () => { 37 | const enteredURL = "https://mendable.ai" 38 | expect(getAdjustedMaxDepth(enteredURL, 0)).toBe(0); 39 | }); 40 | 41 | it('Adjusted maxDepth should return 4 for scrapethissite.com/pages/articles and max depth param of 2', () => { 42 | const enteredURL = "https://scrapethissite.com/pages/articles" 43 | expect(getAdjustedMaxDepth(enteredURL, 2)).toBe(4); 44 | }); 45 | 46 | 47 | }); 48 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/__tests__/parseTable.test.ts: -------------------------------------------------------------------------------- 1 | import { parseTablesToMarkdown, convertTableElementToMarkdown, convertTableRowElementToMarkdown, createMarkdownDividerRow } from '../parseTable'; 2 | import cheerio from 
'cheerio';
3 | 
4 | describe('parseTablesToMarkdown', () => {
5 |   it('converts a simple HTML table to Markdown', async () => {
6 |     const html = `
7 |     <table>
8 |       <tr><th>Header 1</th><th>Header 2</th></tr>
9 |       <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
10 |       <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
11 |     </table>
12 |     `;
13 |     const expectedMarkdown = `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`;
14 |     const markdown = await parseTablesToMarkdown(html);
15 |     expect(markdown).toBe(expectedMarkdown);
16 |   });
17 | 
18 |   it('converts a table with a single row to Markdown', async () => {
19 |     const html = `
20 |     <table>
21 |       <tr><th>Header 1</th><th>Header 2</th></tr>
22 |       <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
23 |     </table>
24 |     `;
25 |     const expectedMarkdown = `<div>| Header 1 | Header 2 |\n| --- | --- |\n| Row 1 Col 1 | Row 1 Col 2 |</div>`;
26 |     const markdown = await parseTablesToMarkdown(html);
27 |     expect(markdown).toBe(expectedMarkdown);
28 |   });
29 | 
30 |   it('converts a table with a single column to Markdown', async () => {
31 |     const html = `
32 |     <table>
33 |       <tr><th>Header 1</th></tr>
34 |       <tr><td>Row 1 Col 1</td></tr>
35 |       <tr><td>Row 2 Col 1</td></tr>
36 |     </table>
37 |     `;
38 |     const expectedMarkdown = `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |\n| Row 2 Col 1 |</div>`;
39 |     const markdown = await parseTablesToMarkdown(html);
40 |     expect(markdown).toBe(expectedMarkdown);
41 |   });
42 | 
43 |   it('converts a table with a single cell to Markdown', async () => {
44 |     const html = `
45 |     <table>
46 |       <tr><th>Header 1</th></tr>
47 |       <tr><td>Row 1 Col 1</td></tr>
48 |     </table>
49 |     `;
50 |     const expectedMarkdown = `<div>| Header 1 |\n| --- |\n| Row 1 Col 1 |</div>`;
51 |     const markdown = await parseTablesToMarkdown(html);
52 |     expect(markdown).toBe(expectedMarkdown);
53 |   });
54 | 
55 |   it('converts a table with no header to Markdown', async () => {
56 |     const html = `
57 |     <table>
58 |       <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
59 |       <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
60 |     </table>
61 |     `;
62 |     const expectedMarkdown = `<div>| Row 1 Col 1 | Row 1 Col 2 |\n| Row 2 Col 1 | Row 2 Col 2 |</div>`;
63 |     const markdown = await parseTablesToMarkdown(html);
64 |     expect(markdown).toBe(expectedMarkdown);
65 |   });
66 | 
67 |   it('converts a table with no rows to Markdown', async () => {
68 |     const html = `
69 |     <table>
70 |     </table>
71 |     `;
72 |     const expectedMarkdown = `<div></div>`;
73 |     const markdown = await parseTablesToMarkdown(html);
74 |     expect(markdown).toBe(expectedMarkdown);
75 |   });
76 | 
77 |   it('converts a table with no cells to Markdown', async () => {
78 |     const html = `
79 |     <table>
80 |       <tr></tr>
81 |     </table>
82 |     `;
83 |     const expectedMarkdown = `<div></div>`;
84 |     const markdown = await parseTablesToMarkdown(html);
85 |     expect(markdown).toBe(expectedMarkdown);
86 |   });
87 | 
88 |   it('converts a table with no columns to Markdown', async () => {
89 |     const html = `
90 |     <table>
91 |       <tr></tr>
92 |     </table>
93 |     `;
94 |     const expectedMarkdown = `<div></div>`;
95 |     const markdown = await parseTablesToMarkdown(html);
96 |     expect(markdown).toBe(expectedMarkdown);
97 |   });
98 | 
99 |   it('converts a table with no table to Markdown', async () => {
100 |     const html = ``;
101 |     const expectedMarkdown = ``;
102 |     const markdown = await parseTablesToMarkdown(html);
103 |     expect(markdown).toBe(expectedMarkdown);
104 |   });
105 | 
106 |   it('converts a table inside of a bunch of html noise', async () => {
107 |     const html = `
108 | <div>
109 |   <p>Some text before</p>
110 |   <table>
111 |     <tr><td>Row 1 Col 1</td><td>Row 1 Col 2</td></tr>
112 |     <tr><td>Row 2 Col 1</td><td>Row 2 Col 2</td></tr>
113 |   </table>
114 |   <p>Some text after</p>
115 | </div>
116 |     `;
117 |     const expectedMarkdown = `<div>
118 |   <p>Some text before</p>
119 |   <div>| Row 1 Col 1 | Row 1 Col 2 |
120 | | Row 2 Col 1 | Row 2 Col 2 |</div>
121 |   <p>Some text after</p></div>
`; 123 | 124 | const markdown = await parseTablesToMarkdown(html); 125 | expect(markdown).toBe(expectedMarkdown); 126 | }); 127 | 128 | }); 129 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/excludeTags.ts: -------------------------------------------------------------------------------- 1 | export const excludeNonMainTags = [ 2 | "header", 3 | "footer", 4 | "nav", 5 | "aside", 6 | ".header", 7 | ".top", 8 | ".navbar", 9 | "#header", 10 | ".footer", 11 | ".bottom", 12 | "#footer", 13 | ".sidebar", 14 | ".side", 15 | ".aside", 16 | "#sidebar", 17 | ".modal", 18 | ".popup", 19 | "#modal", 20 | ".overlay", 21 | ".ad", 22 | ".ads", 23 | ".advert", 24 | "#ad", 25 | ".lang-selector", 26 | ".language", 27 | "#language-selector", 28 | ".social", 29 | ".social-media", 30 | ".social-links", 31 | "#social", 32 | ".menu", 33 | ".navigation", 34 | "#nav", 35 | ".breadcrumbs", 36 | "#breadcrumbs", 37 | "#search-form", 38 | ".search", 39 | "#search", 40 | ".share", 41 | "#share", 42 | ".widget", 43 | "#widget", 44 | ".cookie", 45 | "#cookie" 46 | ]; 47 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/maxDepthUtils.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | export function getAdjustedMaxDepth(url: string, maxCrawlDepth: number): number { 4 | const baseURLDepth = getURLDepth(url); 5 | const adjustedMaxDepth = maxCrawlDepth + baseURLDepth; 6 | return adjustedMaxDepth; 7 | } 8 | 9 | export function getURLDepth(url: string): number { 10 | const pathSplits = new URL(url).pathname.split('/'); 11 | return pathSplits.length - (pathSplits[0].length === 0 && pathSplits[pathSplits.length - 1].length === 0 ? 1 : 0) - 1; 12 | } 13 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/parseTable.ts: -------------------------------------------------------------------------------- 1 | import cheerio, { CheerioAPI } from "cheerio"; 2 | 3 | interface Replacement { 4 | start: number; 5 | end: number; 6 | markdownTable: string; 7 | } 8 | 9 | export const parseTablesToMarkdown = async (html: string): Promise => { 10 | const soup: CheerioAPI = cheerio.load(html, { 11 | xmlMode: true, 12 | }); 13 | let tables = soup("table"); 14 | let replacements: Replacement[] = []; 15 | 16 | if (tables.length) { 17 | tables.each((_, tableElement) => { 18 | const start: number = tableElement.startIndex; 19 | const end: number = tableElement.endIndex + 1; // Include the closing tag properly 20 | let markdownTable: string = convertTableElementToMarkdown( 21 | cheerio.load(tableElement) 22 | ); 23 | const isTableEmpty: boolean = 24 | markdownTable.replace(/[|\- \n]/g, "").length === 0; 25 | if (isTableEmpty) { 26 | markdownTable = ""; 27 | } 28 | replacements.push({ start, end, markdownTable }); 29 | }); 30 | } 31 | 32 | replacements.sort((a, b) => b.start - a.start); 33 | 34 | let modifiedHtml: string = html; 35 | replacements.forEach(({ start, end, markdownTable }) => { 36 | modifiedHtml = 37 | modifiedHtml.slice(0, start) + 38 | `
<div>${markdownTable}</div>
` + 39 | modifiedHtml.slice(end); 40 | }); 41 | 42 | return modifiedHtml.trim(); 43 | }; 44 | 45 | export const convertTableElementToMarkdown = ( 46 | tableSoup: CheerioAPI 47 | ): string => { 48 | let rows: string[] = []; 49 | let headerRowFound: boolean = false; 50 | tableSoup("tr").each((i, tr) => { 51 | const cells: string = tableSoup(tr) 52 | .find("th, td") 53 | .map((_, cell) => { 54 | let cellText: string = tableSoup(cell).text().trim(); 55 | if (tableSoup(cell).is("th") && !headerRowFound) { 56 | headerRowFound = true; 57 | } 58 | return ` ${cellText} |`; 59 | }) 60 | .get() 61 | .join(""); 62 | if (cells) { 63 | rows.push(`|${cells}`); 64 | } 65 | if (headerRowFound && i === 0) { 66 | // Header row 67 | rows.push(createMarkdownDividerRow(tableSoup(tr).find("th, td").length)); 68 | } 69 | }); 70 | 71 | return rows.join("\n").trim(); 72 | }; 73 | 74 | export function convertTableRowElementToMarkdown( 75 | rowSoup: CheerioAPI, 76 | rowNumber: number 77 | ): string { 78 | const cells: string = rowSoup("td, th") 79 | .map((_, cell) => { 80 | let cellText: string = rowSoup(cell).text().trim(); 81 | return ` ${cellText} |`; 82 | }) 83 | .get() 84 | .join(""); 85 | 86 | return `|${cells}`; 87 | } 88 | 89 | export function createMarkdownDividerRow(cellCount: number): string { 90 | return "| " + Array(cellCount).fill("---").join(" | ") + " |"; 91 | } 92 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/removeUnwantedElements.ts: -------------------------------------------------------------------------------- 1 | import { load } from "cheerio"; 2 | import { PageOptions } from "../../../lib/entities"; 3 | 4 | export const removeUnwantedElements = ( 5 | html: string, 6 | pageOptions: PageOptions 7 | ) => { 8 | let soup = load(html); 9 | 10 | soup("script, style, iframe, noscript, meta, head").remove(); 11 | 12 | if ( 13 | pageOptions.onlyIncludeTags && 14 | pageOptions.onlyIncludeTags.length > 0 && 15 | pageOptions.onlyIncludeTags[0] !== "" 16 | ) { 17 | if (typeof pageOptions.onlyIncludeTags === "string") { 18 | pageOptions.onlyIncludeTags = [pageOptions.onlyIncludeTags]; 19 | } 20 | if (pageOptions.onlyIncludeTags.length !== 0) { 21 | // Create a new root element to hold the tags to keep 22 | const newRoot = load("
")("div"); 23 | pageOptions.onlyIncludeTags.forEach((tag) => { 24 | soup(tag).each((index, element) => { 25 | newRoot.append(soup(element).clone()); 26 | }); 27 | }); 28 | 29 | soup = load(newRoot.html()); 30 | } 31 | } 32 | 33 | if ( 34 | pageOptions.removeTags && 35 | pageOptions.removeTags.length > 0 && 36 | pageOptions.removeTags[0] !== "" 37 | ) { 38 | if (typeof pageOptions.removeTags === "string") { 39 | pageOptions.removeTags = [pageOptions.removeTags]; 40 | } 41 | 42 | if (Array.isArray(pageOptions.removeTags)) { 43 | pageOptions.removeTags.forEach((tag) => { 44 | let elementsToRemove: any; 45 | if (tag.startsWith("*") && tag.endsWith("*")) { 46 | let classMatch = false; 47 | 48 | const regexPattern = new RegExp(tag.slice(1, -1), "i"); 49 | elementsToRemove = soup("*").filter((i, element) => { 50 | if (element.type === "tag") { 51 | const attributes = element.attribs; 52 | const tagNameMatches = regexPattern.test(element.name); 53 | const attributesMatch = Object.keys(attributes).some((attr) => 54 | regexPattern.test(`${attr}="${attributes[attr]}"`) 55 | ); 56 | if (tag.startsWith("*.")) { 57 | classMatch = Object.keys(attributes).some((attr) => 58 | regexPattern.test(`class="${attributes[attr]}"`) 59 | ); 60 | } 61 | return tagNameMatches || attributesMatch || classMatch; 62 | } 63 | return false; 64 | }); 65 | } else { 66 | elementsToRemove = soup(tag); 67 | } 68 | elementsToRemove.remove(); 69 | }); 70 | } 71 | } 72 | 73 | const cleanedHtml = soup.html(); 74 | return cleanedHtml; 75 | }; 76 | -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/replacePaths.ts: -------------------------------------------------------------------------------- 1 | import { Logger } from "../../../lib/logger"; 2 | import { Document } from "../../../lib/entities"; 3 | 4 | export const replacePathsWithAbsolutePaths = (documents: Document[]): Document[] => { 5 | try { 6 | documents.forEach((document) => { 7 | const baseUrl = new URL(document.metadata.sourceURL).origin; 8 | const paths = 9 | document.content.match( 10 | /!?\[.*?\]\(.*?\)|href=".+?"/g 11 | ) || []; 12 | 13 | paths.forEach((path: string) => { 14 | try { 15 | const isImage = path.startsWith("!"); 16 | let matchedUrl = path.match(/\((.*?)\)/) || path.match(/href="([^"]+)"/); 17 | let url = matchedUrl[1]; 18 | 19 | if (!url.startsWith("data:") && !url.startsWith("http")) { 20 | if (url.startsWith("/")) { 21 | url = url.substring(1); 22 | } 23 | url = new URL(url, baseUrl).toString(); 24 | } 25 | 26 | const markdownLinkOrImageText = path.match(/(!?\[.*?\])/)[0]; 27 | // Image is handled afterwards 28 | if (!isImage) { 29 | document.content = document.content.replace( 30 | path, 31 | `${markdownLinkOrImageText}(${url})` 32 | ); 33 | } 34 | } catch (error) { 35 | 36 | } 37 | }); 38 | document.markdown = document.content; 39 | }); 40 | 41 | return documents; 42 | } catch (error) { 43 | Logger.debug(`Error replacing paths with absolute paths: ${error}`); 44 | return documents; 45 | } 46 | }; 47 | 48 | export const replaceImgPathsWithAbsolutePaths = (documents: Document[]): Document[] => { 49 | try { 50 | documents.forEach((document) => { 51 | const baseUrl = new URL(document.metadata.sourceURL).origin; 52 | const images = 53 | document.content.match( 54 | /!\[.*?\]\(.*?\)/g 55 | ) || []; 56 | 57 | images.forEach((image: string) => { 58 | let imageUrl = image.match(/\((.*?)\)/)[1]; 59 | let altText = image.match(/\[(.*?)\]/)[1]; 60 | 61 | if (!imageUrl.startsWith("data:image")) { 
62 | if (!imageUrl.startsWith("http")) { 63 | if (imageUrl.startsWith("/")) { 64 | imageUrl = imageUrl.substring(1); 65 | imageUrl = new URL(imageUrl, baseUrl).toString(); 66 | } else { 67 | imageUrl = new URL(imageUrl, document.metadata.sourceURL).toString(); 68 | } 69 | } 70 | } 71 | 72 | document.content = document.content.replace( 73 | image, 74 | `![${altText}](${imageUrl})` 75 | ); 76 | }); 77 | document.markdown = document.content; 78 | }); 79 | 80 | return documents; 81 | } catch (error) { 82 | Logger.error(`Error replacing img paths with absolute paths: ${error}`); 83 | return documents; 84 | } 85 | }; -------------------------------------------------------------------------------- /apps/api/src/scraper/WebScraper/utils/utils.ts: -------------------------------------------------------------------------------- 1 | import axios from "axios"; 2 | import * as cheerio from "cheerio"; 3 | import { Logger } from "../../../lib/logger"; 4 | 5 | export async function attemptScrapWithRequests( 6 | urlToScrap: string 7 | ): Promise { 8 | try { 9 | const response = await axios.get(urlToScrap, { timeout: 60000 }); 10 | 11 | if (!response.data) { 12 | Logger.debug("Failed normal requests as well"); 13 | return null; 14 | } 15 | 16 | return response.data; 17 | } catch (error) { 18 | Logger.debug(`Error in attemptScrapWithRequests: ${error}`); 19 | return null; 20 | } 21 | } 22 | 23 | export function sanitizeText(text: string): string { 24 | return text.replace("\u0000", ""); 25 | } 26 | 27 | export function extractLinks(html: string, baseUrl: string): string[] { 28 | const $ = cheerio.load(html); 29 | const links: string[] = []; 30 | 31 | $("a").each((_, element) => { 32 | let href = $(element).attr("href"); 33 | if (href) { 34 | if (href.startsWith("/")) { 35 | // Relative URL starting with '/', append to origin 36 | href = new URL(href, baseUrl).href; 37 | } else if (!href.startsWith("#") && !href.startsWith("mailto:")) { 38 | // Relative URL not starting with '/', append to base URL 39 | href = new URL(href, baseUrl).href; 40 | } 41 | } 42 | 43 | links.push(href); 44 | }); 45 | 46 | const dedupedLinks = [...new Set(links)]; 47 | 48 | Logger.debug( 49 | `extractLinks extracted ${dedupedLinks.length} links from ${baseUrl}` 50 | ); 51 | 52 | return dedupedLinks; 53 | } 54 | -------------------------------------------------------------------------------- /apps/api/src/scripts/generate-openapi.ts: -------------------------------------------------------------------------------- 1 | import { writeFileSync } from 'fs'; 2 | import { resolve } from 'path'; 3 | import swaggerJsdoc from 'swagger-jsdoc'; 4 | 5 | const options = { 6 | failOnErrors: true, 7 | definition: { 8 | openapi: '3.0.0', 9 | info: { 10 | title: 'Firecrawl API', 11 | description: 'API for web scraping and crawling', 12 | version: '1.0.0', 13 | }, 14 | servers: [ 15 | { 16 | url: '/api/v1', 17 | description: 'Version 1' 18 | } 19 | ], 20 | components: { 21 | securitySchemes: { 22 | BearerAuth: { 23 | type: 'http', 24 | scheme: 'bearer' 25 | } 26 | } 27 | } 28 | }, 29 | apis: ['./src/controllers/v1/*.ts'] 30 | }; 31 | 32 | async function generateOpenAPI() { 33 | try { 34 | const openapiSpecification = swaggerJsdoc(options); 35 | 36 | writeFileSync( 37 | resolve(__dirname, '../v1-openapi.json'), 38 | JSON.stringify(openapiSpecification, null, 2) 39 | ); 40 | 41 | console.log('OpenAPI spec generated successfully!'); 42 | } catch (error) { 43 | console.error('Error generating OpenAPI spec:', error); 44 | process.exit(1); 45 | } 46 | } 47 | 
48 | generateOpenAPI(); -------------------------------------------------------------------------------- /apps/api/src/services/alerts/index.ts: -------------------------------------------------------------------------------- 1 | import { Logger } from "../../../src/lib/logger"; 2 | import { getScrapeQueue } from "../queue-service"; 3 | 4 | export async function checkAlerts() { 5 | try { 6 | if ( 7 | process.env.ENV === "production" && 8 | process.env.ALERT_NUM_ACTIVE_JOBS && 9 | process.env.ALERT_NUM_WAITING_JOBS 10 | ) { 11 | Logger.info("Initializing alerts"); 12 | const checkActiveJobs = async () => { 13 | try { 14 | const scrapeQueue = getScrapeQueue(); 15 | const activeJobs = await scrapeQueue.getActiveCount(); 16 | if (activeJobs > Number(process.env.ALERT_NUM_ACTIVE_JOBS)) { 17 | Logger.warn( 18 | `Alert: Number of active jobs is over ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}.` 19 | ); 20 | } else { 21 | Logger.info( 22 | `Number of active jobs is under ${process.env.ALERT_NUM_ACTIVE_JOBS}. Current active jobs: ${activeJobs}` 23 | ); 24 | } 25 | } catch (error) { 26 | Logger.error(`Failed to check active jobs: ${error}`); 27 | } 28 | }; 29 | 30 | const checkWaitingQueue = async () => { 31 | const scrapeQueue = getScrapeQueue(); 32 | const waitingJobs = await scrapeQueue.getWaitingCount(); 33 | 34 | if (waitingJobs > Number(process.env.ALERT_NUM_WAITING_JOBS)) { 35 | Logger.warn( 36 | `Alert: Number of waiting jobs is over ${process.env.ALERT_NUM_WAITING_JOBS}. Current waiting jobs: ${waitingJobs}.` 37 | ); 38 | } 39 | }; 40 | 41 | const checkAll = async () => { 42 | await checkActiveJobs(); 43 | await checkWaitingQueue(); 44 | }; 45 | 46 | await checkAll(); 47 | // setInterval(checkAll, 10000); // Run every 48 | } 49 | } catch (error) { 50 | Logger.error(`Failed to initialize alerts: ${error}`); 51 | } 52 | } 53 | -------------------------------------------------------------------------------- /apps/api/src/services/idempotency/create.ts: -------------------------------------------------------------------------------- 1 | import { Request } from "express"; 2 | export async function createIdempotencyKey( 3 | req: Request, 4 | ): Promise { 5 | const idempotencyKey = req.headers['x-idempotency-key'] as string; 6 | if (!idempotencyKey) { 7 | throw new Error("No idempotency key provided in the request headers."); 8 | } 9 | 10 | return idempotencyKey; 11 | } 12 | -------------------------------------------------------------------------------- /apps/api/src/services/logtail.ts: -------------------------------------------------------------------------------- 1 | import "dotenv/config"; 2 | import { Logger } from "../lib/logger"; 3 | 4 | class MockLogtail { 5 | info(message: string, context?: Record): void { 6 | Logger.debug(`${message} - ${context}`); 7 | } 8 | error(message: string, context: Record = {}): void { 9 | Logger.error(`${message} - ${context}`); 10 | } 11 | } 12 | 13 | export const logtail = new MockLogtail(); 14 | -------------------------------------------------------------------------------- /apps/api/src/services/queue-jobs.ts: -------------------------------------------------------------------------------- 1 | import { Job } from "bullmq"; 2 | import { getScrapeQueue } from "./queue-service"; 3 | 4 | export async function addScrapeJobRaw( 5 | webScraperOptions: any, 6 | options: any, 7 | jobId: string, 8 | jobPriority: number = 10 9 | ): Promise { 10 | return await getScrapeQueue().add(jobId, webScraperOptions, { 11 | ...options, 12 | 
priority: jobPriority, 13 | jobId, 14 | }); 15 | } 16 | 17 | export function waitForJob(jobId: string, timeout: number) { 18 | return new Promise((resolve, reject) => { 19 | const start = Date.now(); 20 | const int = setInterval(async () => { 21 | if (Date.now() >= start + timeout) { 22 | clearInterval(int); 23 | reject(new Error("Job wait ")); 24 | } else { 25 | const state = await getScrapeQueue().getJobState(jobId); 26 | if (state === "completed") { 27 | clearInterval(int); 28 | resolve((await getScrapeQueue().getJob(jobId)).returnvalue); 29 | } else if (state === "failed") { 30 | // console.log("failed", (await getScrapeQueue().getJob(jobId)).failedReason); 31 | clearInterval(int); 32 | reject((await getScrapeQueue().getJob(jobId)).failedReason); 33 | } 34 | } 35 | }, 500); 36 | }); 37 | } 38 | -------------------------------------------------------------------------------- /apps/api/src/services/queue-service.ts: -------------------------------------------------------------------------------- 1 | import { Queue } from "bullmq"; 2 | import { Logger } from "../lib/logger"; 3 | import IORedis from "ioredis"; 4 | 5 | let scrapeQueue: Queue; 6 | 7 | export const redisConnection = new IORedis(process.env.REDIS_URL, { 8 | maxRetriesPerRequest: null, 9 | }); 10 | 11 | export const scrapeQueueName = "{scrapeQueue}"; 12 | 13 | export function getScrapeQueue(): Queue { 14 | if (!scrapeQueue) { 15 | scrapeQueue = new Queue(scrapeQueueName, { 16 | connection: redisConnection, 17 | defaultJobOptions: { 18 | removeOnComplete: { 19 | age: 90000, // 25 hours 20 | }, 21 | removeOnFail: { 22 | age: 90000, // 25 hours 23 | }, 24 | }, 25 | }); 26 | Logger.info("Web scraper queue created"); 27 | } 28 | return scrapeQueue; 29 | } 30 | -------------------------------------------------------------------------------- /apps/api/src/services/rate-limiter.ts: -------------------------------------------------------------------------------- 1 | import { RateLimiterRedis } from "rate-limiter-flexible"; 2 | import { RateLimiterMode } from "../../src/types"; 3 | import Redis from "ioredis"; 4 | 5 | const RATE_LIMITS = { 6 | crawl: { 7 | default: 3, 8 | free: 2, 9 | starter: 10, 10 | standard: 5, 11 | standardOld: 40, 12 | scale: 50, 13 | hobby: 3, 14 | standardNew: 10, 15 | standardnew: 10, 16 | growth: 50, 17 | growthdouble: 50, 18 | }, 19 | scrape: { 20 | default: 20, 21 | free: 10, 22 | starter: 100, 23 | standard: 100, 24 | standardOld: 100, 25 | scale: 500, 26 | hobby: 20, 27 | standardNew: 100, 28 | standardnew: 100, 29 | growth: 1000, 30 | growthdouble: 1000, 31 | }, 32 | search: { 33 | default: 20, 34 | free: 5, 35 | starter: 50, 36 | standard: 50, 37 | standardOld: 40, 38 | scale: 500, 39 | hobby: 10, 40 | standardNew: 50, 41 | standardnew: 50, 42 | growth: 500, 43 | growthdouble: 500, 44 | }, 45 | map:{ 46 | default: 20, 47 | free: 5, 48 | starter: 50, 49 | standard: 50, 50 | standardOld: 50, 51 | scale: 500, 52 | hobby: 10, 53 | standardNew: 50, 54 | standardnew: 50, 55 | growth: 500, 56 | growthdouble: 500, 57 | }, 58 | preview: { 59 | free: 5, 60 | default: 5, 61 | }, 62 | account: { 63 | free: 100, 64 | default: 100, 65 | }, 66 | crawlStatus: { 67 | free: 150, 68 | default: 250, 69 | }, 70 | testSuite: { 71 | free: 10000, 72 | default: 10000, 73 | }, 74 | }; 75 | 76 | export const redisRateLimitClient = new Redis( 77 | process.env.REDIS_RATE_LIMIT_URL 78 | ) 79 | 80 | const createRateLimiter = (keyPrefix, points) => 81 | new RateLimiterRedis({ 82 | storeClient: redisRateLimitClient, 83 | keyPrefix, 84 | 
points, 85 | duration: 60, // Duration in seconds 86 | }); 87 | 88 | export const serverRateLimiter = createRateLimiter( 89 | "server", 90 | RATE_LIMITS.account.default 91 | ); 92 | 93 | export const testSuiteRateLimiter = new RateLimiterRedis({ 94 | storeClient: redisRateLimitClient, 95 | keyPrefix: "test-suite", 96 | points: 10000, 97 | duration: 60, // Duration in seconds 98 | }); 99 | 100 | export const devBRateLimiter = new RateLimiterRedis({ 101 | storeClient: redisRateLimitClient, 102 | keyPrefix: "dev-b", 103 | points: 1200, 104 | duration: 60, // Duration in seconds 105 | }); 106 | 107 | export const manualRateLimiter = new RateLimiterRedis({ 108 | storeClient: redisRateLimitClient, 109 | keyPrefix: "manual", 110 | points: 2000, 111 | duration: 60, // Duration in seconds 112 | }); 113 | 114 | 115 | export const scrapeStatusRateLimiter = new RateLimiterRedis({ 116 | storeClient: redisRateLimitClient, 117 | keyPrefix: "scrape-status", 118 | points: 400, 119 | duration: 60, // Duration in seconds 120 | }); 121 | 122 | const testSuiteTokens = ["a01ccae", "6254cf9", "0f96e673", "23befa1b", "69141c4"]; 123 | 124 | const manual = ["69be9e74-7624-4990-b20d-08e0acc70cf6"]; 125 | 126 | export function getRateLimiter( 127 | mode: RateLimiterMode, 128 | token: string, 129 | plan?: string, 130 | teamId?: string 131 | ) { 132 | 133 | if (testSuiteTokens.some(testToken => token.includes(testToken))) { 134 | return testSuiteRateLimiter; 135 | } 136 | 137 | if(teamId && teamId === process.env.DEV_B_TEAM_ID) { 138 | return devBRateLimiter; 139 | } 140 | 141 | if(teamId && manual.includes(teamId)) { 142 | return manualRateLimiter; 143 | } 144 | 145 | const rateLimitConfig = RATE_LIMITS[mode]; // {default : 5} 146 | 147 | if (!rateLimitConfig) return serverRateLimiter; 148 | 149 | const planKey = plan ? plan.replace("-", "") : "default"; // "default" 150 | const points = 151 | rateLimitConfig[planKey] || rateLimitConfig.default || rateLimitConfig; // 5 152 | 153 | return createRateLimiter(`${mode}-${planKey}`, points); 154 | } 155 | -------------------------------------------------------------------------------- /apps/api/src/services/redis.ts: -------------------------------------------------------------------------------- 1 | import Redis from "ioredis"; 2 | import { redisRateLimitClient } from "./rate-limiter"; 3 | import { Logger } from "../lib/logger"; 4 | 5 | // Listen to 'error' events to the Redis connection 6 | redisRateLimitClient.on("error", (error) => { 7 | try { 8 | if (error.message === "ECONNRESET") { 9 | Logger.error("Connection to Redis Session Rate Limit Store timed out."); 10 | } else if (error.message === "ECONNREFUSED") { 11 | Logger.error("Connection to Redis Session Rate Limit Store refused!"); 12 | } else Logger.error(error); 13 | } catch (error) {} 14 | }); 15 | 16 | // Listen to 'reconnecting' event to Redis 17 | redisRateLimitClient.on("reconnecting", (err) => { 18 | try { 19 | if (redisRateLimitClient.status === "reconnecting") 20 | Logger.info("Reconnecting to Redis Session Rate Limit Store..."); 21 | else Logger.error("Error reconnecting to Redis Session Rate Limit Store."); 22 | } catch (error) {} 23 | }); 24 | 25 | // Listen to the 'connect' event to Redis 26 | redisRateLimitClient.on("connect", (err) => { 27 | try { 28 | if (!err) Logger.info("Connected to Redis Session Rate Limit Store!"); 29 | } catch (error) {} 30 | }); 31 | 32 | /** 33 | * Set a value in Redis with an optional expiration time. 34 | * @param {string} key The key under which to store the value. 
35 | * @param {string} value The value to store. 36 | * @param {number} [expire] Optional expiration time in seconds. 37 | */ 38 | const setValue = async (key: string, value: string, expire?: number) => { 39 | if (expire) { 40 | await redisRateLimitClient.set(key, value, "EX", expire); 41 | } else { 42 | await redisRateLimitClient.set(key, value); 43 | } 44 | }; 45 | 46 | /** 47 | * Get a value from Redis. 48 | * @param {string} key The key of the value to retrieve. 49 | * @returns {Promise} The value, if found, otherwise null. 50 | */ 51 | const getValue = async (key: string): Promise => { 52 | const value = await redisRateLimitClient.get(key); 53 | return value; 54 | }; 55 | 56 | /** 57 | * Delete a key from Redis. 58 | * @param {string} key The key to delete. 59 | */ 60 | const deleteKey = async (key: string) => { 61 | await redisRateLimitClient.del(key); 62 | }; 63 | 64 | export { setValue, getValue, deleteKey }; 65 | -------------------------------------------------------------------------------- /apps/api/src/services/redlock.ts: -------------------------------------------------------------------------------- 1 | import Redlock from "redlock"; 2 | import Client from "ioredis"; 3 | 4 | export const redlock = new Redlock( 5 | // You should have one client for each independent redis node 6 | // or cluster. 7 | [new Client(process.env.REDIS_RATE_LIMIT_URL)], 8 | { 9 | // The expected clock drift; for more details see: 10 | // http://redis.io/topics/distlock 11 | driftFactor: 0.01, // multiplied by lock ttl to determine drift time 12 | 13 | // The max number of times Redlock will attempt to lock a resource 14 | // before erroring. 15 | retryCount: 5, 16 | 17 | // the time in ms between attempts 18 | retryDelay: 100, // time in ms 19 | 20 | // the max time in ms randomly added to retries 21 | // to improve performance under high contention 22 | // see https://www.awsarchitectureblog.com/2015/03/backoff.html 23 | retryJitter: 200, // time in ms 24 | 25 | // The minimum remaining time on a lock before an extension is automatically 26 | // attempted with the `using` API. 27 | automaticExtensionThreshold: 500, // time in ms 28 | } 29 | ); 30 | -------------------------------------------------------------------------------- /apps/api/src/services/system-monitor.ts: -------------------------------------------------------------------------------- 1 | import si from 'systeminformation'; 2 | import { Mutex } from "async-mutex"; 3 | 4 | const MAX_CPU = process.env.MAX_CPU ? parseFloat(process.env.MAX_CPU) : 0.8; 5 | const MAX_RAM = process.env.MAX_RAM ? parseFloat(process.env.MAX_RAM) : 0.8; 6 | const CACHE_DURATION = process.env.SYS_INFO_MAX_CACHE_DURATION ? 
parseFloat(process.env.SYS_INFO_MAX_CACHE_DURATION) : 150; 7 | 8 | class SystemMonitor { 9 | private static instance: SystemMonitor; 10 | private static instanceMutex = new Mutex(); 11 | 12 | private cpuUsageCache: number | null = null; 13 | private memoryUsageCache: number | null = null; 14 | private lastCpuCheck: number = 0; 15 | private lastMemoryCheck: number = 0; 16 | 17 | private constructor() {} 18 | 19 | public static async getInstance(): Promise { 20 | if (SystemMonitor.instance) { 21 | return SystemMonitor.instance; 22 | } 23 | 24 | await this.instanceMutex.runExclusive(async () => { 25 | if (!SystemMonitor.instance) { 26 | SystemMonitor.instance = new SystemMonitor(); 27 | } 28 | }); 29 | 30 | return SystemMonitor.instance; 31 | } 32 | 33 | private async checkMemoryUsage() { 34 | const now = Date.now(); 35 | if (this.memoryUsageCache !== null && (now - this.lastMemoryCheck) < CACHE_DURATION) { 36 | return this.memoryUsageCache; 37 | } 38 | 39 | const memoryData = await si.mem(); 40 | const totalMemory = memoryData.total; 41 | const availableMemory = memoryData.available; 42 | const usedMemory = totalMemory - availableMemory; 43 | const usedMemoryPercentage = (usedMemory / totalMemory); 44 | 45 | this.memoryUsageCache = usedMemoryPercentage; 46 | this.lastMemoryCheck = now; 47 | 48 | return usedMemoryPercentage; 49 | } 50 | 51 | private async checkCpuUsage() { 52 | const now = Date.now(); 53 | if (this.cpuUsageCache !== null && (now - this.lastCpuCheck) < CACHE_DURATION) { 54 | return this.cpuUsageCache; 55 | } 56 | 57 | const cpuData = await si.currentLoad(); 58 | const cpuLoad = cpuData.currentLoad / 100; 59 | 60 | this.cpuUsageCache = cpuLoad; 61 | this.lastCpuCheck = now; 62 | 63 | return cpuLoad; 64 | } 65 | 66 | public async acceptConnection() { 67 | const cpuUsage = await this.checkCpuUsage(); 68 | const memoryUsage = await this.checkMemoryUsage(); 69 | 70 | return cpuUsage < MAX_CPU && memoryUsage < MAX_RAM; 71 | } 72 | 73 | public clearCache() { 74 | this.cpuUsageCache = null; 75 | this.memoryUsageCache = null; 76 | this.lastCpuCheck = 0; 77 | this.lastMemoryCheck = 0; 78 | } 79 | } 80 | 81 | export default SystemMonitor.getInstance(); -------------------------------------------------------------------------------- /apps/api/src/strings.ts: -------------------------------------------------------------------------------- 1 | export const errorNoResults = 2 | "No results found, please check the URL or contact us at help@mendable.ai to file a ticket."; 3 | 4 | export const clientSideError = "client-side exception has occurred" -------------------------------------------------------------------------------- /apps/api/src/types.ts: -------------------------------------------------------------------------------- 1 | import { Document, DocumentUrl } from "./lib/entities"; 2 | 3 | type Mode = "crawl" | "single_urls" | "sitemap"; 4 | 5 | export interface CrawlResult { 6 | source: string; 7 | content: string; 8 | options?: { 9 | summarize?: boolean; 10 | summarize_max_chars?: number; 11 | }; 12 | metadata?: any; 13 | raw_context_id?: number | string; 14 | permissions?: any[]; 15 | } 16 | 17 | export interface IngestResult { 18 | success: boolean; 19 | error: string; 20 | data: CrawlResult[]; 21 | } 22 | 23 | export interface WebScraperOptions { 24 | url: string; 25 | mode: Mode; 26 | crawlerOptions: any; 27 | pageOptions: any; 28 | team_id: string; 29 | origin?: string; 30 | crawl_id: string; 31 | sitemapped?: boolean; 32 | webhookUrls?: string[]; 33 | webhookMetadata?: any; 34 | 
v1?: boolean; 35 | is_scrape?: boolean; 36 | } 37 | 38 | export interface RunWebScraperParams { 39 | url: string; 40 | mode: Mode; 41 | crawlerOptions: any; 42 | pageOptions?: any; 43 | webhookUrls?: string[]; 44 | webhookMetadata?: any; 45 | inProgress: (progress: any) => void; 46 | onSuccess: (result: any, mode: string) => void; 47 | onError: (error: Error) => void; 48 | team_id: string; 49 | bull_job_id: string; 50 | crawl_id: string; 51 | priority?: number; 52 | is_scrape?: boolean; 53 | } 54 | 55 | export interface RunWebScraperResult { 56 | success: boolean; 57 | message: string; 58 | docs: Document[] | DocumentUrl[]; 59 | } 60 | 61 | export interface FirecrawlJob { 62 | job_id?: string; 63 | success: boolean; 64 | message: string; 65 | num_docs: number; 66 | docs: any[]; 67 | time_taken: number; 68 | team_id: string; 69 | mode: string; 70 | url: string; 71 | crawlerOptions?: any; 72 | pageOptions?: any; 73 | origin: string; 74 | num_tokens?: number; 75 | retry?: boolean; 76 | crawl_id?: string; 77 | } 78 | 79 | export interface FirecrawlScrapeResponse { 80 | statusCode: number; 81 | body: { 82 | status: string; 83 | data: Document; 84 | }; 85 | error?: string; 86 | } 87 | 88 | export interface FirecrawlCrawlResponse { 89 | statusCode: number; 90 | body: { 91 | status: string; 92 | jobId: string; 93 | }; 94 | error?: string; 95 | } 96 | 97 | export interface FirecrawlCrawlStatusResponse { 98 | statusCode: number; 99 | body: { 100 | status: string; 101 | data: Document[]; 102 | }; 103 | error?: string; 104 | } 105 | 106 | export enum RateLimiterMode { 107 | Crawl = "crawl", 108 | CrawlStatus = "crawlStatus", 109 | Scrape = "scrape", 110 | Preview = "preview", 111 | Search = "search", 112 | Map = "map", 113 | } 114 | 115 | export interface AuthResponse { 116 | success: boolean; 117 | team_id?: string; 118 | error?: string; 119 | status?: number; 120 | api_key?: string; 121 | plan?: PlanType; 122 | } 123 | 124 | export enum NotificationType { 125 | APPROACHING_LIMIT = "approachingLimit", 126 | LIMIT_REACHED = "limitReached", 127 | RATE_LIMIT_REACHED = "rateLimitReached", 128 | } 129 | 130 | export type ScrapeLog = { 131 | url: string; 132 | scraper: string; 133 | success?: boolean; 134 | response_code?: number; 135 | time_taken_seconds?: number; 136 | proxy?: string; 137 | retried?: boolean; 138 | error_message?: string; 139 | date_added?: string; // ISO 8601 format 140 | html?: string; 141 | ipv4_support?: boolean | null; 142 | ipv6_support?: boolean | null; 143 | }; 144 | 145 | export type PlanType = 146 | | "starter" 147 | | "standard" 148 | | "scale" 149 | | "hobby" 150 | | "standardnew" 151 | | "growth" 152 | | "growthdouble" 153 | | "free" 154 | | ""; 155 | -------------------------------------------------------------------------------- /apps/api/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "rootDir": "./src", 4 | "lib": ["es6","DOM"], 5 | 6 | // or higher 7 | "target": "ES2020", 8 | 9 | "module": "commonjs", 10 | "esModuleInterop": true, 11 | "sourceMap": true, 12 | "outDir": "./dist/src", 13 | "moduleResolution": "node", 14 | "baseUrl": ".", 15 | 16 | "paths": { 17 | "*": ["node_modules/*", "src/types/*"], 18 | }, 19 | 20 | "inlineSources": true 21 | }, 22 | "include": ["src/","src/**/*", "utils/utils.ts", "utils/EventEmmitter.ts", "src/services/queue-service.ts"] 23 | } 24 | -------------------------------------------------------------------------------- 
/apps/puppeteer-service-ts/.dockerignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | node_modules/ 3 | -------------------------------------------------------------------------------- /apps/puppeteer-service-ts/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM node:18-slim 2 | 3 | WORKDIR /usr/src/app 4 | 5 | RUN npm install -g pnpm 6 | 7 | COPY . . 8 | 9 | RUN pnpm install 10 | 11 | # Install Playwright dependencies 12 | RUN npx playwright install --with-deps 13 | 14 | RUN pnpm run build 15 | 16 | ARG PORT 17 | ENV PORT=${PORT} 18 | 19 | EXPOSE ${PORT} 20 | 21 | CMD [ "pnpm", "start" ] 22 | FROM node:18-slim 23 | 24 | # Install system dependencies for Playwright 25 | RUN apt-get update && \ 26 | apt-get install -y wget gnupg && \ 27 | wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ 28 | echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \ 29 | apt-get update && \ 30 | apt-get install -y \ 31 | google-chrome-stable \ 32 | fonts-ipafont-gothic \ 33 | fonts-wqy-zenhei \ 34 | fonts-thai-tlwg \ 35 | fonts-kacst \ 36 | fonts-symbola \ 37 | fonts-noto \ 38 | fonts-freefont-ttf \ 39 | --no-install-recommends && \ 40 | rm -rf /var/lib/apt/lists/* 41 | 42 | WORKDIR /app 43 | 44 | # Install pnpm globally 45 | RUN npm install -g pnpm 46 | 47 | # Copy package files 48 | COPY ./package.json ./pnpm-lock.yaml ./ 49 | 50 | # Install dependencies 51 | RUN pnpm install --frozen-lockfile 52 | 53 | # Copy the rest of the application 54 | COPY . . 55 | 56 | # Install Playwright with dependencies 57 | RUN npx playwright install --with-deps chromium 58 | 59 | # Build the application 60 | RUN pnpm run build 61 | 62 | # Set up environment 63 | ARG PORT=3000 64 | ENV PORT=${PORT} 65 | ENV PLAYWRIGHT_BROWSERS_PATH=/app/ms-playwright 66 | ENV PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1 67 | 68 | EXPOSE ${PORT} 69 | 70 | CMD ["pnpm", "start"] 71 | -------------------------------------------------------------------------------- /apps/puppeteer-service-ts/README.md: -------------------------------------------------------------------------------- 1 | # Playwright Scrape API 2 | 3 | This is a simple web scraping service built with Express and Playwright. 4 | 5 | ## Features 6 | 7 | - Scrapes HTML content from specified URLs. 8 | - Blocks requests to known ad-serving domains. 9 | - Blocks media files to reduce bandwidth usage. 10 | - Uses random user-agent strings to avoid detection. 11 | - Strategy to ensure the page is fully rendered. 12 | 13 | ## Install 14 | 15 | ```bash 16 | pnpm install 17 | npx playwright install 18 | ``` 19 | 20 | ## RUN 21 | 22 | ```bash 23 | pnpm run build 24 | pnpm start 25 | ``` 26 | 27 | OR 28 | 29 | ```bash 30 | pnpm run dev 31 | ``` 32 | 33 | ## USE 34 | 35 | ```bash 36 | curl -X POST http://localhost:3000/scrape \ 37 | -H "Content-Type: application/json" \ 38 | -d '{ 39 | "url": "https://example.com", 40 | "wait_after_load": 1000, 41 | "timeout": 60000, 42 | "headers": { 43 | "Custom-Header": "value" 44 | }, 45 | "check_selector": "#content" 46 | }' 47 | ``` 48 | 49 | ## USING WITH FIRECRAWL 50 | 51 | Add `PLAYWRIGHT_MICROSERVICE_URL=http://localhost:3003/scrape` to `/apps/api/.env` to configure the API to use this Playwright microservice for scraping operations. 
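
For example, assuming the service is exposed locally on port 3003 (the port mapping is an assumption; adjust it to however you run this service), the API-side configuration is a single line:

```bash
# apps/api/.env — assumed local setup; points Firecrawl's API at this scrape service
PLAYWRIGHT_MICROSERVICE_URL=http://localhost:3003/scrape
```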
52 | -------------------------------------------------------------------------------- /apps/puppeteer-service-ts/helpers/get_error.ts: -------------------------------------------------------------------------------- 1 | //impired by firecrawl repo @rafaelsideguide 2 | export const getError = (statusCode: number | null): string | null => { 3 | if (statusCode === null) { 4 | return 'No response received'; 5 | } 6 | 7 | const errorMessages: { [key: number]: string } = { 8 | 300: "Multiple Choices", 9 | 301: "Moved Permanently", 10 | 302: "Found", 11 | 303: "See Other", 12 | 304: "Not Modified", 13 | 305: "Use Proxy", 14 | 307: "Temporary Redirect", 15 | 308: "Permanent Redirect", 16 | 309: "Resume Incomplete", 17 | 310: "Too Many Redirects", 18 | 311: "Unavailable For Legal Reasons", 19 | 312: "Previously Used", 20 | 313: "I'm Used", 21 | 314: "Switch Proxy", 22 | 315: "Temporary Redirect", 23 | 316: "Resume Incomplete", 24 | 317: "Too Many Redirects", 25 | 400: "Bad Request", 26 | 401: "Unauthorized", 27 | 403: "Forbidden", 28 | 404: "Not Found", 29 | 405: "Method Not Allowed", 30 | 406: "Not Acceptable", 31 | 407: "Proxy Authentication Required", 32 | 408: "Request Timeout", 33 | 409: "Conflict", 34 | 410: "Gone", 35 | 411: "Length Required", 36 | 412: "Precondition Failed", 37 | 413: "Payload Too Large", 38 | 414: "URI Too Long", 39 | 415: "Unsupported Media Type", 40 | 416: "Range Not Satisfiable", 41 | 417: "Expectation Failed", 42 | 418: "I'm a teapot", 43 | 421: "Misdirected Request", 44 | 422: "Unprocessable Entity", 45 | 423: "Locked", 46 | 424: "Failed Dependency", 47 | 425: "Too Early", 48 | 426: "Upgrade Required", 49 | 428: "Precondition Required", 50 | 429: "Too Many Requests", 51 | 431: "Request Header Fields Too Large", 52 | 451: "Unavailable For Legal Reasons", 53 | 500: "Internal Server Error", 54 | 501: "Not Implemented", 55 | 502: "Bad Gateway", 56 | 503: "Service Unavailable", 57 | 504: "Gateway Timeout", 58 | 505: "HTTP Version Not Supported", 59 | 506: "Variant Also Negotiates", 60 | 507: "Insufficient Storage", 61 | 508: "Loop Detected", 62 | 510: "Not Extended", 63 | 511: "Network Authentication Required", 64 | 599: "Network Connect Timeout Error" 65 | }; 66 | 67 | if (statusCode < 300) { 68 | return null; 69 | } 70 | 71 | return errorMessages[statusCode] || "Unknown Error"; 72 | }; 73 | -------------------------------------------------------------------------------- /apps/puppeteer-service-ts/openapi/index.ts: -------------------------------------------------------------------------------- 1 | import { Application } from "express"; 2 | import swaggerJsdoc from "swagger-jsdoc"; 3 | import redoc from "redoc-express"; 4 | 5 | const options = { 6 | failOnErrors: true, 7 | definition: { 8 | openapi: "3.0.0", 9 | info: { 10 | title: "Puppeteer Service API", 11 | description: "API for browser automation and scraping", 12 | version: "1.0.0", 13 | }, 14 | servers: [ 15 | { 16 | url: "/", 17 | description: "Puppeteer Service", 18 | }, 19 | ], 20 | }, 21 | apis: ["./api.ts"], 22 | }; 23 | 24 | export function setupOpenAPI(app: Application) { 25 | const openapiSpecification = swaggerJsdoc(options); 26 | 27 | app.get("/api-docs/openapi.json", (req, res) => { 28 | res.setHeader("Content-Type", "application/json"); 29 | res.send(openapiSpecification); 30 | }); 31 | 32 | app.get( 33 | "/redoc", 34 | redoc({ 35 | title: "API Docs", 36 | specUrl: "/api-docs/openapi.json", 37 | nonce: "", // <= it is optional,we can omit this key and value 38 | // we are now start supporting the 
redocOptions object 39 | // you can omit the options object if you don't need it 40 | // https://redocly.com/docs/api-reference-docs/configuration/functionality/ 41 | redocOptions: {}, 42 | }) 43 | ); 44 | } 45 | -------------------------------------------------------------------------------- /apps/puppeteer-service-ts/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "puppeteerscraper-api", 3 | "version": "1.0.0", 4 | "description": "scraper api with ulixee hero", 5 | "main": "api.ts", 6 | "scripts": { 7 | "start": "node dist/api.js", 8 | "build": "tsc", 9 | "dev": "ts-node api.ts" 10 | }, 11 | "keywords": [], 12 | "author": "Jeff Pereira", 13 | "license": "ISC", 14 | "dependencies": { 15 | "@ulixee/hero": "2.0.0-alpha.31", 16 | "@ulixee/hero-core": "2.0.0-alpha.31", 17 | "@ulixee/hero-plugin-utils": "2.0.0-alpha.31", 18 | "@ulixee/net": "2.0.0-alpha.31", 19 | "@ulixee/commons": "2.0.0-alpha.31", 20 | "@ulixee/hero-interfaces": "2.0.0-alpha.31", 21 | "body-parser": "^1.20.2", 22 | "dotenv": "^16.4.5", 23 | "express": "^4.19.2", 24 | "playwright-core": "^1.48.2", 25 | "puppeteer": "^23.6.1", 26 | "puppeteer-cluster": "^0.24.0", 27 | "puppeteer-extra": "^3.3.6", 28 | "puppeteer-extra-plugin-adblocker": "^2.13.6", 29 | "puppeteer-extra-plugin-recaptcha": "^3.6.8", 30 | "puppeteer-extra-plugin-stealth": "^2.11.2", 31 | "redoc-express": "^2.1.0", 32 | "swagger-jsdoc": "^6.2.8" 33 | }, 34 | "devDependencies": { 35 | "@types/body-parser": "^1.19.5", 36 | "@types/express": "^4.17.21", 37 | "@types/node": "^20.14.9", 38 | "@types/random-useragent": "^0.3.3", 39 | "@types/swagger-jsdoc": "^6.0.4", 40 | "ts-node": "^10.9.2", 41 | "typescript": "^5.5.2" 42 | }, 43 | "packageManager": "pnpm@9.12.3+sha512.cce0f9de9c5a7c95bef944169cc5dfe8741abfb145078c0d508b868056848a87c81e626246cb60967cbd7fd29a6c062ef73ff840d96b3c86c40ac92cf4a813ee" 44 | } 45 | -------------------------------------------------------------------------------- /apps/puppeteer-service-ts/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "es2022", 4 | "module": "CommonJS", 5 | "moduleResolution": "node", 6 | "outDir": "./dist", 7 | "rootDir": "./", 8 | "esModuleInterop": true, 9 | "forceConsistentCasingInFileNames": true, 10 | "strict": true, 11 | "skipLibCheck": true, 12 | "resolveJsonModule": true, 13 | "allowJs": true, 14 | "types": ["node"], 15 | "baseUrl": ".", 16 | "paths": { 17 | "@ulixee/*": ["node_modules/@ulixee/*"] 18 | } 19 | }, 20 | "include": [ 21 | "**/*" 22 | ], 23 | "exclude": [ 24 | "node_modules", 25 | "dist" 26 | ] 27 | } -------------------------------------------------------------------------------- /apps/test-suite/.env.example: -------------------------------------------------------------------------------- 1 | TEST_API_KEY= 2 | TEST_URL=http://localhost:3002 3 | ENV= -------------------------------------------------------------------------------- /apps/test-suite/README.md: -------------------------------------------------------------------------------- 1 | # Test Suite for Firecrawl 2 | 3 | This document provides an overview of the test suite for the Firecrawl project. It includes instructions on how to run the tests and interpret the results. 4 | 5 | ## Overview 6 | 7 | The test suite is designed to ensure the reliability and performance of the Firecrawl system. It includes a series of automated tests that check various functionalities and performance metrics. 
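The individual Jest test files are not included in this listing, but as a rough sketch, a scrape check against a running instance might look like the following, using `supertest` and the `TEST_URL`/`TEST_API_KEY` values from `.env.example`. The `/v0/scrape` path, auth header, and assertion are assumptions for illustration, not taken from this document.

```typescript
// Illustrative sketch only: the real test files are not part of this listing.
// Assumes a /v0/scrape endpoint on TEST_URL and bearer-token auth with
// TEST_API_KEY, both loaded from the .env file described above.
import "dotenv/config";
import request from "supertest";

const TEST_URL = process.env.TEST_URL ?? "http://localhost:3002";
const TEST_API_KEY = process.env.TEST_API_KEY ?? "";

describe("scrape endpoint", () => {
  it(
    "returns 200 for a known-good URL from data/scrape.json",
    async () => {
      const res = await request(TEST_URL)
        .post("/v0/scrape") // assumed path; adjust to the API version under test
        .set("Authorization", `Bearer ${TEST_API_KEY}`)
        .send({ url: "https://firecrawl.dev" });

      expect(res.status).toBe(200);
    },
    120000 // generous per-test timeout for slow sites
  );
});
```

The accuracy checks described below pair requests like this with fuzzy matching of the returned content (see `fuzzyContains` in `utils/misc.ts`).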
8 | 9 | ## Running the Tests 10 | 11 | To run the tests, navigate to the `test-suite` directory and execute the following command: 12 | 13 | ```bash 14 | npm install 15 | npx playwright install 16 | npm run test 17 | ``` 18 | 19 | ## Running Load Tests with Artillery 20 | 21 | To run load tests using Artillery, follow these steps: 22 | 23 | 1. Install Artillery globally if you haven't already: 24 | 25 | ```bash 26 | npm install -g artillery 27 | ``` 28 | 29 | 2. Run the load test: 30 | 31 | ```bash 32 | artillery run load-test.yml 33 | ``` 34 | 35 | ## Test Results 36 | 37 | The tests are designed to cover various aspects of the system, including: 38 | 39 | - Crawling accuracy 40 | - Response time 41 | - Error handling 42 | 43 | ### Example Test Case 44 | 45 | - **Test Name**: Accuracy Test 46 | - **Description**: This test checks the accuracy of the scraping mechanism with 100 pages and a fuzzy threshold of 0.8. 47 | - **Expected Result**: Accuracy >= 0.9 48 | - **Received Result**: Accuracy between 0.2 and 0.3 49 | 50 | ## Troubleshooting 51 | 52 | If you encounter any failures or unexpected results, please check the following: 53 | - Ensure your network connection is stable. 54 | - Verify that all dependencies are correctly installed. 55 | - Review the error logs for any specific error messages. 56 | 57 | ## Contributing 58 | 59 | Contributions to the test suite are welcome. Please refer to the project's main [CONTRIBUTING.md](../CONTRIBUTING.md) file for guidelines on how to contribute. -------------------------------------------------------------------------------- /apps/test-suite/data/scrape.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "website": "https://www.anthropic.com/claude", 4 | "prompt": "Does this website contain pricing information?", 5 | "expected_output": "yes" 6 | }, 7 | { 8 | "website": "https://mendable.ai/pricing", 9 | "prompt": "Does this website contain pricing information?", 10 | "expected_output": "yes" 11 | }, 12 | { 13 | "website": "https://openai.com/news", 14 | "prompt": "Does this website contain a list of research news?", 15 | "expected_output": "yes" 16 | }, 17 | { 18 | "website": "https://agentops.ai", 19 | "prompt": "Does this website contain a code snippets?", 20 | "expected_output": "yes" 21 | }, 22 | { 23 | "website": "https://ycombinator.com/companies", 24 | "prompt": "Does this website contain a list bigger than 5 of ycombinator companies?", 25 | "expected_output": "yes" 26 | }, 27 | { 28 | "website": "https://firecrawl.dev", 29 | "prompt": "Does this website contain a list bigger than 5 of ycombinator companies?", 30 | "expected_output": "no" 31 | }, 32 | { 33 | "website": "https://en.wikipedia.org/wiki/T._N._Seshan", 34 | "prompt": "Does this website talk about Seshan's career?", 35 | "expected_output": "yes" 36 | }, 37 | { 38 | "website": "https://mendable.ai/blog", 39 | "prompt": "Does this website contain multiple blog articles?", 40 | "expected_output": "yes" 41 | }, 42 | { 43 | "website": "https://www.framer.com/pricing", 44 | "prompt": "Is there an enterprise pricing option?", 45 | "expected_output": "yes" 46 | }, 47 | { 48 | "website": "https://fly.io/docs/gpus/gpu-quickstart", 49 | "prompt": "Is there a fly deploy command on this page?", 50 | "expected_output": "yes" 51 | }, 52 | { 53 | "website": "https://news.ycombinator.com/", 54 | "prompt": "Does this website contain a list of articles in a table markdown format?", 55 | "expected_output": "yes" 56 | }, 57 | { 58 | "website": 
"https://www.vellum.ai/llm-leaderboard", 59 | "prompt": "Does this website contain a model comparison table?", 60 | "expected_output": "yes" 61 | }, 62 | { 63 | "website": "https://www.bigbadtoystore.com", 64 | "prompt": "are there more than 3 toys in the new arrivals section?", 65 | "expected_output": "yes" 66 | }, 67 | { 68 | "website": "https://www.instructables.com", 69 | "prompt": "Does the site offer more than 5 links about circuits?", 70 | "expected_output": "yes" 71 | }, 72 | { 73 | "website": "https://www.powells.com", 74 | "prompt": "is there at least 10 books webpage links?", 75 | "expected_output": "yes" 76 | }, 77 | { 78 | "website": "https://www.royalacademy.org.uk", 79 | "prompt": "is there information on upcoming art exhibitions?", 80 | "expected_output": "yes" 81 | }, 82 | { 83 | "website": "https://www.eastbaytimes.com", 84 | "prompt": "Is there a Trending Nationally section that lists articles?", 85 | "expected_output": "yes" 86 | }, 87 | { 88 | "website": "https://www.manchestereveningnews.co.uk", 89 | "prompt": "is the content focused on Manchester sports news?", 90 | "expected_output": "no" 91 | }, 92 | { 93 | "website": "https://physicsworld.com", 94 | "prompt": "does the site provide at least 15 updates on the latest physics research?", 95 | "expected_output": "yes" 96 | }, 97 | { 98 | "website": "https://richmondconfidential.org", 99 | "prompt": "does the page contains more than 4 articles?", 100 | "expected_output": "yes" 101 | }, 102 | { 103 | "website": "https://www.techinasia.com", 104 | "prompt": "is there at least 10 articles of the startup scene in Asia?", 105 | "expected_output": "yes", 106 | "notes": "The website has a paywall and bot detectors." 107 | }, 108 | { 109 | "website": "https://www.boardgamegeek.com", 110 | "prompt": "are there more than 5 board game news?", 111 | "expected_output": "yes" 112 | }, 113 | { 114 | "website": "https://www.mountainproject.com", 115 | "prompt": "Are there more than 3 climbing guides for Arizona?", 116 | "expected_output": "yes" 117 | } 118 | ] 119 | -------------------------------------------------------------------------------- /apps/test-suite/jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | preset: "ts-jest", 3 | testEnvironment: "node", 4 | setupFiles: ["./jest.setup.js"], 5 | }; 6 | -------------------------------------------------------------------------------- /apps/test-suite/jest.setup.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/jest.setup.js -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-1-5/assets/CPU-utilization-report-test-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/load-test-results/tests-1-5/assets/CPU-utilization-report-test-1.png -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-1-5/assets/memory-utilization-report-test-1.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/load-test-results/tests-1-5/assets/memory-utilization-report-test-1.png -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-1-5/assets/metrics-test-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/load-test-results/tests-1-5/assets/metrics-test-2.png -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-1-5/assets/metrics-test-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/load-test-results/tests-1-5/assets/metrics-test-3.png -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-1-5/assets/metrics-test-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/load-test-results/tests-1-5/assets/metrics-test-4.png -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-1-5/assets/metrics-test-5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/load-test-results/tests-1-5/assets/metrics-test-5.png -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-1-5/load-test-1.md: -------------------------------------------------------------------------------- 1 | # Scraping Load Testing - Test #1 2 | 3 | ## Summary 4 | 5 | The load test successfully processed 600 requests in 60 seconds with all requests returning HTTP 200 status codes. The average response time was 1380.1 ms, with CPU utilization peaking at around 50% on both machines, indicating sufficient CPU resources. However, there was a significant increase in memory usage post-test, which did not return to pre-test levels, suggesting a potential memory leak. Further investigation and additional load tests are recommended to address this issue and optimize the system's performance. 
6 | 7 | ## Table of Contents 8 | 9 | - [Scraping Load Testing - Test #1](#scraping-load-testing---test-1) 10 | - [Summary](#summary) 11 | - [Table of Contents](#table-of-contents) 12 | - [Test environment](#test-environment) 13 | - [Machines](#machines) 14 | - [Load #1 - 600 reqs 60 secs (initial load only)](#load-1---600-reqs-60-secs-initial-load-only) 15 | - [Archillery Report](#archillery-report) 16 | - [CPU Utilization](#cpu-utilization) 17 | - [Memory Utilization](#memory-utilization) 18 | - [Conclusions and Next Steps](#conclusions-and-next-steps) 19 | - [Conclusions](#conclusions) 20 | - [Next Steps](#next-steps) 21 | 22 | ## Test environment 23 | ### Machines 24 | 25 | | Machine | Size/CPU | 26 | |---|---| 27 | | e286de4f711e86 mia (app) | performance-cpu-1x@2048MB | 28 | | 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB | 29 | 30 | --- 31 | 32 | ## Load #1 - 600 reqs 60 secs (initial load only) 33 | 34 | ```yml 35 | # load-test.yml 36 | - duration: 60 37 | arrivalRate: 10 # Initial load 38 | ``` 39 | 40 | ### Archillery Report 41 | Date: 10:49:39(-0300) 42 | 43 | | Metric | Value | 44 | |---------------------------------------------|---------| 45 | | http.codes.200 | 600 | 46 | | http.downloaded_bytes | 0 | 47 | | http.request_rate | 10/sec | 48 | | http.requests | 600 | 49 | | http.response_time.min | 984 | 50 | | http.response_time.max | 2267 | 51 | | http.response_time.mean | 1380.1 | 52 | | http.response_time.median | 1353.1 | 53 | | http.response_time.p95 | 1755 | 54 | | http.response_time.p99 | 2059.5 | 55 | | http.responses | 600 | 56 | | vusers.completed | 600 | 57 | | vusers.created | 600 | 58 | | vusers.created_by_name.Scrape a URL | 600 | 59 | | vusers.failed | 0 | 60 | | vusers.session_length.min | 1053.7 | 61 | | vusers.session_length.max | 2332.6 | 62 | | vusers.session_length.mean | 1447.4 | 63 | | vusers.session_length.median | 1436.8 | 64 | | vusers.session_length.p95 | 1863.5 | 65 | | vusers.session_length.p99 | 2143.5 | 66 | 67 | ### CPU Utilization 68 | ![](./assets/CPU-utilization-report-test-1.png) 69 | 70 | Both machines peaked at around 50% CPU utilization. 71 | 72 | ### Memory Utilization 73 | ![](./assets/memory-utilization-report-test-1.png) 74 | 75 | | Machine | Before | After Load Test | 76 | |---|---|---| 77 | | e286de4f711e86 | 295 MiB | 358 MiB | 78 | | 73d8dd909c1189 | 296 MiB | 355 MiB | 79 | 80 | Notice that the memory utilization has not re-stabilished to the pre-test values during the check window, which may indicate a memory leak problem. 81 | 82 | --- 83 | 84 | ## Conclusions and Next Steps 85 | 86 | ### Conclusions 87 | 1. **Performance:** The system handled 600 requests in 60 seconds with a mean response time of 1380.1 ms. All requests were successful (HTTP 200). 88 | 2. **CPU Utilization:** Both machines peaked at around 50% CPU utilization, indicating that the CPU resources were sufficient for the load. 89 | 3. **Memory Utilization:** There was a noticeable increase in memory usage on both machines post-test, and the memory did not re-stabilize to pre-test levels, suggesting a potential memory leak. 90 | 91 | ### Next Steps 92 | 1. **Investigate Memory Leak:** Conduct a detailed analysis to identify and fix the potential memory leak. This may involve profiling the application and reviewing the code for memory management issues. 93 | 2. **Additional Load Tests:** Perform additional load tests with varying request rates and durations to further assess the system's performance and stability. 94 | 3. 
**Optimize Performance:** Based on the findings, optimize the application to improve response times and resource utilization. 95 | 4. **Monitor in Production:** Implement monitoring in the production environment to ensure that similar issues do not occur under real-world conditions. 96 | 5. **Documentation:** Update the documentation with the findings and any changes made to the system as a result of this test. 97 | 98 | By following these steps, we can ensure that the system is robust, efficient, and ready to handle production workloads. -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-1-5/load-test-2.md: -------------------------------------------------------------------------------- 1 | # Scraping Load Testing - Test #2 2 | 3 | ## Summary 4 | 5 | The load test encountered significant issues, processing 9000 requests with 5473 timeouts and a 61.6% failure rate. The average response time was 3682.1 ms, with a peak response time of 9919 ms. Both machines reached 100% CPU utilization, leading to severe performance bottlenecks and high failure rates. This indicates the need for substantial optimizations, autoscaling, and further investigation. 6 | 7 | ## Table of Contents 8 | 9 | - [Scraping Load Testing - Test #2](#scraping-load-testing---test-2) 10 | - [Summary](#summary) 11 | - [Table of Contents](#table-of-contents) 12 | - [Test environment](#test-environment) 13 | - [Machines](#machines) 14 | - [Load #2 - 9000 reqs 7 mins 11 secs (4 phases)](#load-2---9000-reqs-7-mins-11-secs-4-phases) 15 | - [Archillery Report](#archillery-report) 16 | - [Metrics](#metrics) 17 | - [Conclusions and Next Steps](#conclusions-and-next-steps) 18 | - [Conclusions](#conclusions) 19 | - [Next Steps](#next-steps) 20 | 21 | ## Test environment 22 | ### Machines 23 | 24 | | Machine | Size/CPU | 25 | |---|---| 26 | | e286de4f711e86 mia (app) | performance-cpu-1x@2048MB | 27 | | 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB | 28 | 29 | --- 30 | 31 | ## Load #2 - 9000 reqs 7 mins 11 secs (4 phases) 32 | 33 | ```yml 34 | 35 | # load-test.yml 36 | - duration: 60 37 | arrivalRate: 10 # Initial load 38 | - duration: 120 39 | arrivalRate: 20 # Increased load 40 | - duration: 180 41 | arrivalRate: 30 # Peak load 42 | - duration: 60 43 | arrivalRate: 10 # Cool down 44 | ``` 45 | 46 | 47 | ### Archillery Report 48 | Date: 13:50:08(-0300) 49 | 50 | | Metric | Value | 51 | |---------------------------------------------|---------| 52 | | errors.ETIMEDOUT | 5473 | 53 | | errors.Failed capture or match | 73 | 54 | | http.codes.200 | 3454 | 55 | | http.codes.401 | 64 | 56 | | http.codes.402 | 9 | 57 | | http.downloaded_bytes | 0 | 58 | | http.request_rate | 21/sec | 59 | | http.requests | 9000 | 60 | | http.response_time.min | 929 | 61 | | http.response_time.max | 9919 | 62 | | http.response_time.mean | 3682.1 | 63 | | http.response_time.median | 3395.5 | 64 | | http.response_time.p95 | 8024.5 | 65 | | http.response_time.p99 | 9607.1 | 66 | | http.responses | 3527 | 67 | | vusers.completed | 3454 | 68 | | vusers.created | 9000 | 69 | | vusers.created_by_name.Scrape a URL | 9000 | 70 | | vusers.failed | 5546 | 71 | | vusers.session_length.min | 1127.6 | 72 | | vusers.session_length.max | 9982.2 | 73 | | vusers.session_length.mean | 3730.6 | 74 | | vusers.session_length.median | 3464.1 | 75 | | vusers.session_length.p95 | 7865.6 | 76 | | vusers.session_length.p99 | 9607.1 | 77 | 78 | ### Metrics 79 | 80 | ![](./assets/metrics-test-2.png) 81 | 82 | Both 
machines reached 100% CPU utilization, which led to a significant number of request failures (61.6% failure rate). 83 | 84 | --- 85 | 86 | ## Conclusions and Next Steps 87 | 88 | ### Conclusions 89 | 1. **Performance:** The system struggled with 9000 requests, resulting in 5473 timeouts and a mean response time of 3682.1 ms. 90 | 2. **CPU Utilization:** Both machines experienced 100% CPU utilization, causing severe performance degradation and high failure rates. 91 | 92 | ### Next Steps 93 | Implement an autoscaling solution on Fly.io and conduct tests using the same configurations. 94 | -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-1-5/load-test-3.md: -------------------------------------------------------------------------------- 1 | # Scraping Load Testing - Test #3 2 | 3 | ## Summary 4 | 5 | The load test involved setting up an autoscaling option and adjusting the hard and soft limits for the Fly.io configuration. The test environment consisted of 5 machines, with 3 machines automatically scaling up during the test. Despite the scaling, there were 653 timeouts (7.3%) and 2 HTTP 502 responses (0.02%). The average response time was 3037.2 ms, with a peak response time of 9941 ms. Further adjustments to the soft limit are recommended to improve performance and reduce errors. 6 | 7 | ## Table of Contents 8 | 9 | - [Scraping Load Testing - Test #3](#scraping-load-testing---test-3) 10 | - [Summary](#summary) 11 | - [Table of Contents](#table-of-contents) 12 | - [Test environment](#test-environment) 13 | - [Machines](#machines) 14 | - [Load Test Phases](#load-test-phases) 15 | - [Configuration](#configuration) 16 | - [Results](#results) 17 | - [Metrics](#metrics) 18 | - [Conclusions and Next Steps](#conclusions-and-next-steps) 19 | - [Conclusions](#conclusions) 20 | - [Next Steps](#next-steps) 21 | 22 | ## Test environment 23 | ### Machines 24 | 25 | | Machine | Size/CPU | Status | 26 | |---|---|---| 27 | | e286de4f711e86 mia (app) | performance-cpu-1x@2048MB | always on | 28 | | 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB | always on | 29 | | 6e82050c726358 mia (app) | performance-cpu-1x@2048MB | paused | 30 | | 4d89505a6e5038 mia (app) | performance-cpu-1x@2048MB | paused | 31 | | 48ed6e6b74e378 mia (app) | performance-cpu-1x@2048MB | paused | 32 | 33 | --- 34 | 35 | ## Load Test Phases 36 | 37 | ### Configuration 38 | 39 | ```toml 40 | # fly.staging.toml 41 | [http_service.concurrency] 42 | type = "requests" 43 | hard_limit = 100 44 | soft_limit = 75 45 | ``` 46 | ```yml 47 | # load-test.yml 48 | - duration: 60 49 | arrivalRate: 10 # Initial load 50 | - duration: 120 51 | arrivalRate: 20 # Increased load 52 | - duration: 180 53 | arrivalRate: 30 # Peak load 54 | - duration: 60 55 | arrivalRate: 10 # Cool down 56 | ``` 57 | 58 | 59 | ### Results 60 | Date: 14:53:32(-0300) 61 | 62 | | Metric | Value | 63 | |---------------------------------------------|---------| 64 | | errors.ETIMEDOUT | 653 | 65 | | errors.Failed capture or match | 2 | 66 | | http.codes.200 | 8345 | 67 | | http.codes.502 | 2 | 68 | | http.downloaded_bytes | 0 | 69 | | http.request_rate | 11/sec | 70 | | http.requests | 9000 | 71 | | http.response_time.min | 979 | 72 | | http.response_time.max | 9941 | 73 | | http.response_time.mean | 3037.2 | 74 | | http.response_time.median | 2059.5 | 75 | | http.response_time.p95 | 7709.8 | 76 | | http.response_time.p99 | 9416.8 | 77 | | http.responses | 8347 | 78 | | vusers.completed | 8345 | 79 | | 
vusers.created | 9000 | 80 | | vusers.created_by_name.Scrape a URL | 9000 | 81 | | vusers.failed | 655 | 82 | | vusers.session_length.min | 1044.5 | 83 | | vusers.session_length.max | 9998.8 | 84 | | vusers.session_length.mean | 3109.7 | 85 | | vusers.session_length.median | 2143.5 | 86 | | vusers.session_length.p95 | 7709.8 | 87 | | vusers.session_length.p99 | 9416.8 | 88 | 89 | ### Metrics 90 | 91 | ![](./assets/metrics-test-3.png) 92 | 93 | --- 94 | 95 | ## Conclusions and Next Steps 96 | 97 | ### Conclusions 98 | 1. **Performance:** The system handled 9000 requests with a mean response time of 3037.2 ms. There were 653 timeouts and 2 HTTP 502 responses. 99 | 2. **Autoscaling:** Three machines automatically scaled up during the test, but the scaling was not sufficient to prevent all errors. 100 | 3. **Response Times:** The peak response time was 9941 ms, indicating that the system struggled under peak load conditions. 101 | 102 | ### Next Steps 103 | 104 | 1. **Adjust Soft Limit:** Change the soft limit to 100 and the hard limit to 50 to test if machines will start faster and reduce the number of 502 errors. 105 | 2. **Further Load Tests:** Conduct additional load tests with the new configuration to assess improvements. 106 | 107 | By following these steps, we can enhance the system's performance and reliability under varying load conditions. 108 | -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-1-5/load-test-4.md: -------------------------------------------------------------------------------- 1 | # Scraping Load Testing - Test #4 2 | 3 | ## Summary 4 | 5 | The load test was conducted with the Fly.io configuration set to a hard limit of 100 and a soft limit of 50. The test involved four phases with varying arrival rates. Despite the adjustments, there were 1329 timeouts (14.8%) but no HTTP 502 responses. The average response time was 3547.9 ms, with a peak response time of 9935 ms. Further adjustments to the artillery timeout configuration are recommended to improve performance. 
6 | 7 | ## Table of Contents 8 | 9 | - [Scraping Load Testing - Test #4](#scraping-load-testing---test-4) 10 | - [Summary](#summary) 11 | - [Table of Contents](#table-of-contents) 12 | - [Test environment](#test-environment) 13 | - [Machines](#machines) 14 | - [Load Test Phases](#load-test-phases) 15 | - [Configuration](#configuration) 16 | - [Results](#results) 17 | - [Metrics](#metrics) 18 | - [Conclusions and Next Steps](#conclusions-and-next-steps) 19 | - [Conclusions](#conclusions) 20 | - [Next Steps](#next-steps) 21 | 22 | ## Test environment 23 | ### Machines 24 | 25 | | Machine | Size/CPU | Status | 26 | |---|---|---| 27 | | e286de4f711e86 mia (app) | performance-cpu-1x@2048MB | always on | 28 | | 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB | always on | 29 | | 6e82050c726358 mia (app) | performance-cpu-1x@2048MB | paused | 30 | | 4d89505a6e5038 mia (app) | performance-cpu-1x@2048MB | paused | 31 | | 48ed6e6b74e378 mia (app) | performance-cpu-1x@2048MB | paused | 32 | 33 | --- 34 | 35 | ## Load Test Phases 36 | 37 | ### Configuration 38 | 39 | ```toml 40 | # fly.staging.toml 41 | [http_service.concurrency] 42 | type = "requests" 43 | hard_limit = 100 44 | soft_limit = 50 45 | ``` 46 | ```yml 47 | # load-test.yml 48 | - duration: 60 49 | arrivalRate: 10 # Initial load 50 | - duration: 120 51 | arrivalRate: 20 # Increased load 52 | - duration: 180 53 | arrivalRate: 30 # Peak load 54 | - duration: 60 55 | arrivalRate: 10 # Cool down 56 | ``` 57 | 58 | 59 | ### Results 60 | Date: 15:43:26(-0300) 61 | 62 | | Metric | Value | 63 | |---------------------------------------------|---------| 64 | | errors.ETIMEDOUT | 1329 | 65 | | http.codes.200 | 7671 | 66 | | http.downloaded_bytes | 0 | 67 | | http.request_rate | 23/sec | 68 | | http.requests | 9000 | 69 | | http.response_time.min | 999 | 70 | | http.response_time.max | 9935 | 71 | | http.response_time.mean | 3547.9 | 72 | | http.response_time.median | 2836.2 | 73 | | http.response_time.p95 | 8352 | 74 | | http.response_time.p99 | 9607.1 | 75 | | http.responses | 7671 | 76 | | vusers.completed | 7671 | 77 | | vusers.created | 9000 | 78 | | vusers.created_by_name.Scrape a URL | 9000 | 79 | | vusers.failed | 1329 | 80 | | vusers.session_length.min | 1063.4 | 81 | | vusers.session_length.max | 10006.8 | 82 | | vusers.session_length.mean | 3616 | 83 | | vusers.session_length.median | 2893.5 | 84 | | vusers.session_length.p95 | 8352 | 85 | | vusers.session_length.p99 | 9607.1 | 86 | 87 | ## Metrics 88 | 89 | ![](./assets/metrics-test-4.png) 90 | 91 | --- 92 | 93 | ## Conclusions and Next Steps 94 | 95 | ### Conclusions 96 | 1. **Performance:** The system handled 9000 requests with a mean response time of 3547.9 ms. There were 1329 timeouts but no HTTP 502 responses. 97 | 2. **Response Times:** The peak response time was 9935 ms, indicating that the system struggled under peak load conditions. 98 | 99 | ### Next Steps 100 | 1. **Adjust Timeout Configuration:** Change the artillery timeout configuration to reduce the number of timeouts. 101 | 2. **Further Load Tests:** Conduct additional load tests with the new timeout configuration to assess improvements. 102 | 103 | By following these steps, we can enhance the system's performance and reliability under varying load conditions. 
104 | -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-1-5/load-test-5.md: -------------------------------------------------------------------------------- 1 | # Scraping Load Testing - Test #5 2 | 3 | ## Summary 4 | 5 | The load test was conducted with a higher timeout configuration to address previous timeout issues. The test involved 9000 requests with a timeout set to 30 seconds. The system handled the load well, with only 4 HTTP 502 responses (0.04%). The average response time was 5661.8 ms, with a peak response time of 18924 ms. Further analysis is recommended to optimize response times. 6 | 7 | ## Table of Contents 8 | 9 | - [Scraping Load Testing - Test #5](#scraping-load-testing---test-5) 10 | - [Summary](#summary) 11 | - [Table of Contents](#table-of-contents) 12 | - [Test environment](#test-environment) 13 | - [Machines](#machines) 14 | - [Load Test Configuration](#load-test-configuration) 15 | - [Configuration](#configuration) 16 | - [Results](#results) 17 | - [Metrics](#metrics) 18 | - [Conclusions and Next Steps](#conclusions-and-next-steps) 19 | - [Conclusions](#conclusions) 20 | - [Next Steps](#next-steps) 21 | 22 | ## Test environment 23 | ### Machines 24 | 25 | | Machine | Size/CPU | Status | 26 | |---|---|---| 27 | | e286de4f711e86 mia (app) | performance-cpu-1x@2048MB | always on | 28 | | 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB | always on | 29 | | 6e82050c726358 mia (app) | performance-cpu-1x@2048MB | paused | 30 | | 4d89505a6e5038 mia (app) | performance-cpu-1x@2048MB | paused | 31 | | 48ed6e6b74e378 mia (app) | performance-cpu-1x@2048MB | paused | 32 | 33 | --- 34 | 35 | ## Load Test Configuration 36 | 37 | ### Configuration 38 | 39 | ```yml 40 | http: 41 | timeout: 30 42 | ``` 43 | 44 | 45 | ### Results 46 | Date: 15:59:50(-0300) 47 | 48 | | Metric | Value | 49 | |---------------------------------------------|---------| 50 | | errors.Failed capture or match | 4 | 51 | | http.codes.200 | 8996 | 52 | | http.codes.502 | 4 | 53 | | http.downloaded_bytes | 0 | 54 | | http.request_rate | 23/sec | 55 | | http.requests | 9000 | 56 | | http.response_time.min | 62 | 57 | | http.response_time.max | 18924 | 58 | | http.response_time.mean | 5661.8 | 59 | | http.response_time.median | 5378.9 | 60 | | http.response_time.p95 | 11050.8 | 61 | | http.response_time.p99 | 12968.3 | 62 | | http.responses | 9000 | 63 | | vusers.completed | 8996 | 64 | | vusers.created | 9000 | 65 | | vusers.created_by_name.Scrape a URL | 9000 | 66 | | vusers.failed | 4 | 67 | | vusers.session_length.min | 1079.2 | 68 | | vusers.session_length.max | 18980.3 | 69 | | vusers.session_length.mean | 5734.4 | 70 | | vusers.session_length.median | 5487.5 | 71 | | vusers.session_length.p95 | 11050.8 | 72 | | vusers.session_length.p99 | 12968.3 | 73 | 74 | ### Metrics 75 | 76 | ![](./assets/metrics-test-5.png) 77 | 78 | --- 79 | 80 | ## Conclusions and Next Steps 81 | 82 | ### Conclusions 83 | 1. **Performance:** The system handled 9000 requests with a mean response time of 5661.8 ms. There were only 4 HTTP 502 responses which represent a 0.04% failure rate. 84 | 2. **Response Times:** The peak response time was 18924 ms, indicating that while the system handled the load, there is room for optimization. 85 | 86 | ### Next Steps 87 | 88 | 2. **Testing Scraping Strategies:** Conduct further testing on the Playwright instance to ensure it can handle increased load and identify any potential bottlenecks. 89 | 3. 
**Load Testing Other Functionalities:** Evaluate the performance of other critical routes, such as the crawl route, through additional load tests to ensure comprehensive system reliability. 90 | 4. **Optimize Response Times:** Investigate and implement strategies to reduce the peak response time from 18924 ms. This could involve optimizing database queries, improving server configurations, or enhancing caching mechanisms. 91 | 5. **Error Handling Improvements:** Analyze the causes of the 4 HTTP 502 responses and implement robust error handling and recovery mechanisms to minimize such occurrences in future tests. 92 | 6. **Scalability Assessment:** Assess the system's scalability by gradually increasing the load beyond 9000 requests to determine its breaking point and plan for necessary infrastructure upgrades. 93 | 94 | By following these steps, we can further enhance the system's performance and reliability under varying load conditions. 95 | -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-6-7/assets/metrics-fire-engine-test-7-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/load-test-results/tests-6-7/assets/metrics-fire-engine-test-7-2.png -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-6-7/assets/metrics-fire-engine-test-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/load-test-results/tests-6-7/assets/metrics-fire-engine-test-7.png -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-6-7/assets/metrics-fire-engine-test-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/load-test-results/tests-6-7/assets/metrics-fire-engine-test-8.png -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-6-7/assets/metrics-test-6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/load-test-results/tests-6-7/assets/metrics-test-6.png -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-6-7/assets/metrics-test-7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/load-test-results/tests-6-7/assets/metrics-test-7.png -------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-6-7/assets/metrics-test-8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/apps/test-suite/load-test-results/tests-6-7/assets/metrics-test-8.png 
-------------------------------------------------------------------------------- /apps/test-suite/load-test-results/tests-6-7/load-test-6.md: -------------------------------------------------------------------------------- 1 | # Load Testing Crawl Routes - Test #6 2 | 3 | ## Summary 4 | 5 | The load test was conducted with a duration of 10 minutes and an arrival rate of 10 requests per second. The system handled the load well, with no failed requests. The average response time was 838.1 ms, with a peak response time of 1416 ms. Further analysis is recommended to optimize response times and assess the impact of higher loads. 6 | 7 | ## Table of Contents 8 | 9 | - [Load Testing Crawl Routes - Test #6](#load-testing-crawl-routes---test-6) 10 | - [Summary](#summary) 11 | - [Table of Contents](#table-of-contents) 12 | - [Test environment](#test-environment) 13 | - [Machines](#machines) 14 | - [Load Test Configuration](#load-test-configuration) 15 | - [Configuration](#configuration) 16 | - [Results](#results) 17 | - [Metrics](#metrics) 18 | - [Conclusions and Next Steps](#conclusions-and-next-steps) 19 | - [Conclusions](#conclusions) 20 | - [Next Steps](#next-steps) 21 | 22 | ## Test environment 23 | ### Machines 24 | 25 | | Machine | Size/CPU | Status | 26 | |---|---|---| 27 | | 06e825d0da2387 mia (worker) | performance-cpu-1x@2048MB | always on | 28 | | 178134db566489 mia (worker) | performance-cpu-1x@2048MB | always on | 29 | | 73d8dd909c1189 mia (app) | performance-cpu-1x@2048MB | always on | 30 | | e286de4f711e86 mia (app) | performance-cpu-1x@2048MB | always on | 31 | 32 | Other app machines with autoscaling shouldn't start during crawl tests. 33 | 34 | --- 35 | 36 | ## Load Test Configuration 37 | 38 | ### Configuration 39 | 40 | ```yml 41 | # load-test.yml 42 | - duration: 10 43 | arrivalRate: 10 44 | ``` 45 | 46 | 47 | ### Results 48 | Date: 16:00:06(-0300) 49 | 50 | | Metric | Value | 51 | |---------------------------------------------|---------| 52 | | http.codes.200 | 200 | 53 | | http.downloaded_bytes | 0 | 54 | | http.request_rate | 10/sec | 55 | | http.requests | 200 | 56 | | http.response_time.min | 687 | 57 | | http.response_time.max | 1416 | 58 | | http.response_time.mean | 838.1 | 59 | | http.response_time.median | 788.5 | 60 | | http.response_time.p95 | 1085.9 | 61 | | http.response_time.p99 | 1274.3 | 62 | | http.responses | 200 | 63 | | vusers.completed | 100 | 64 | | vusers.created | 100 | 65 | | vusers.created_by_name.Crawl a URL | 100 | 66 | | vusers.failed | 0 | 67 | | vusers.session_length.min | 11647.5 | 68 | | vusers.session_length.max | 12310 | 69 | | vusers.session_length.mean | 11812.7 | 70 | | vusers.session_length.median | 11734.2 | 71 | | vusers.session_length.p95 | 11971.2 | 72 | | vusers.session_length.p99 | 12213.1 | 73 | 74 | ### Metrics 75 | 76 | ![](./assets/metrics-test-6.png) 77 | 78 | 79 | **CPU Utilization:** 80 | - **App machines:** Less than 2.3% CPU utilization with no changes in memory utilization. 81 | - **Worker machines:** High CPU utilization for over 4 minutes and 45 seconds, with 56% (peaking at 75.8%) on 178134db566489 and 40% (peaking at 62.7%) on 06e825d0da2387. 82 | 83 | **Memory Utilization:** 84 | - **App machines:** No relevant changes during the tests. 85 | - **Worker machines:** 86 | - 06e825d0da2387: From 359MiB to over 388MiB during 4 minutes and 45 seconds (peaking at 461MiB). 87 | - 178134db566489: From 366MiB to over 449MiB during 4 minutes and 45 seconds (peaking at 523MiB). 
88 | 89 | 90 | --- 91 | 92 | ## Conclusions and Next Steps 93 | 94 | ### Conclusions 95 | 1. **Performance:** The system handled 200 requests with a mean response time of 838.1 ms. There were no failed requests. 96 | 2. **Response Times:** The peak response time was 1416 ms, indicating that while the system handled the load, there is room for optimization. 97 | 98 | ### Next Steps 99 | 100 | 1. **Higher Load Testing:** Conduct further testing with higher loads to assess the system's performance under increased stress. 101 | 2. **Optimize Response Times:** Investigate and implement strategies to reduce the peak response time from 1416 ms. This could involve optimizing database queries, improving server configurations, or enhancing caching mechanisms. 102 | 3. **Scalability Assessment:** Assess the system's scalability by gradually increasing the load beyond the current configuration to determine its breaking point and plan for necessary infrastructure upgrades. 103 | 104 | By following these steps, we can further enhance the system's performance and reliability under varying load conditions. -------------------------------------------------------------------------------- /apps/test-suite/load-test.yml: -------------------------------------------------------------------------------- 1 | config: 2 | target: "https://staging-firecrawl-scraper-js.fly.dev/v0" 3 | http: 4 | timeout: 30 5 | phases: 6 | - duration: 60 7 | arrivalRate: 1 # Initial load 8 | - duration: 120 9 | arrivalRate: 2 # Increased load 10 | - duration: 180 11 | arrivalRate: 3 # Peak load 12 | - duration: 60 13 | arrivalRate: 1 # Cool down 14 | defaults: 15 | headers: 16 | Authorization: "Bearer YOUR_API_KEY" 17 | scenarios: 18 | - name: Crawl a URL 19 | flow: 20 | - post: 21 | url: "/crawl" 22 | json: 23 | url: "https://rsseau.fr" 24 | crawlerOptions: 25 | limit: 100 26 | pageOptions: 27 | capture: 28 | - json: "$.jobId" 29 | as: job_id 30 | - think: 10 31 | - get: 32 | url: "/crawl/status/{{ job_id }}" 33 | capture: 34 | - json: "$.status" 35 | as: crawl_status 36 | until: 37 | - condition: "equals" 38 | value: "completed" 39 | variable: "crawl_status" 40 | retry: 41 | count: 20 42 | wait: 10 43 | -------------------------------------------------------------------------------- /apps/test-suite/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "test-suite", 3 | "version": "1.0.0", 4 | "description": "", 5 | "scripts": { 6 | "test:suite": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false", 7 | "test:load": "artillery run --output ./load-test-results/test-run-report.json load-test.yml", 8 | "test:scrape": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/scrape.test.ts", 9 | "test:crawl": "npx jest --detectOpenHandles --forceExit --openHandlesTimeout=120000 --watchAll=false --testPathPattern=tests/crawl.test.ts" 10 | }, 11 | "author": "", 12 | "license": "ISC", 13 | "dependencies": { 14 | "@dqbd/tiktoken": "^1.0.14", 15 | "dotenv": "^16.4.5", 16 | "jest": "^29.7.0", 17 | "playwright": "^1.43.1", 18 | "supertest": "^7.0.0", 19 | "ts-jest": "^29.1.2" 20 | }, 21 | "devDependencies": { 22 | "@types/jest": "^29.5.12", 23 | "@types/supertest": "^6.0.2", 24 | "artillery": "^2.0.19", 25 | "typescript": "^5.4.5" 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /apps/test-suite/utils/misc.ts: 
-------------------------------------------------------------------------------- 1 | const getRandomLinksFromContent = async (options: { 2 | content: string; 3 | excludes: string[]; 4 | limit: number; 5 | }): Promise<string[]> => { 6 | const regex = /(?<=\()https:\/\/(.*?)(?=\))/g; 7 | const links = options.content.match(regex); 8 | const filteredLinks = links 9 | ? links.filter( 10 | (link) => !options.excludes.some((exclude) => link.includes(exclude)) 11 | ) 12 | : []; 13 | const uniqueLinks = [...new Set(filteredLinks)]; // Ensure all links are unique 14 | const randomLinks = []; 15 | while (randomLinks.length < options.limit && uniqueLinks.length > 0) { 16 | const randomIndex = Math.floor(Math.random() * uniqueLinks.length); 17 | randomLinks.push(uniqueLinks.splice(randomIndex, 1)[0]); 18 | } 19 | return randomLinks; 20 | }; 21 | 22 | function fuzzyContains(options: { 23 | largeText: string; 24 | queryText: string; 25 | threshold?: number; 26 | }): boolean { 27 | // Normalize texts: lowercasing and removing non-alphanumeric characters 28 | const normalize = (text: string) => 29 | text.toLowerCase().replace(/[^a-z0-9]+/g, " "); 30 | 31 | const normalizedLargeText = normalize(options.largeText); 32 | const normalizedQueryText = normalize(options.queryText); 33 | 34 | // Split the query into words 35 | const queryWords = normalizedQueryText.split(/\s+/); 36 | 37 | // Count how many query words are in the large text 38 | const matchCount = queryWords.reduce((count, word) => { 39 | return count + (normalizedLargeText.includes(word) ? 1 : 0); 40 | }, 0); 41 | 42 | // Calculate the percentage of words matched 43 | const matchPercentage = matchCount / queryWords.length; 44 | 45 | // Check if the match percentage meets or exceeds the threshold 46 | return matchPercentage >= (options.threshold || 0.8); 47 | } -------------------------------------------------------------------------------- /apps/test-suite/utils/types.ts: -------------------------------------------------------------------------------- 1 | export interface WebsiteScrapeError { 2 | website: string; 3 | prompt: string; 4 | expected_output: string; 5 | actual_output: string; 6 | error: string; 7 | } 8 | -------------------------------------------------------------------------------- /docker-compose.yaml: -------------------------------------------------------------------------------- 1 | name: firecrawl 2 | 3 | x-common-service: &common-service 4 | build: apps/api 5 | networks: 6 | - backend 7 | environment: 8 | - REDIS_URL=${REDIS_URL:-redis://redis:6379} 9 | - REDIS_RATE_LIMIT_URL=${REDIS_URL:-redis://redis:6379} 10 | - PLAYWRIGHT_MICROSERVICE_URL=${PLAYWRIGHT_MICROSERVICE_URL:-http://playwright-service:3000} 11 | - PORT=${PORT:-3002} 12 | - NUM_WORKERS_PER_QUEUE=${NUM_WORKERS_PER_QUEUE} 13 | - BULL_AUTH_KEY=${BULL_AUTH_KEY} 14 | - TEST_API_KEY=${TEST_API_KEY} 15 | - HOST=${HOST:-0.0.0.0} 16 | - LOGGING_LEVEL=${LOGGING_LEVEL} 17 | - MAX_RAM=${MAX_RAM:-0.95} 18 | - MAX_CPU=${MAX_CPU:-0.95} 19 | extra_hosts: 20 | - "host.docker.internal:host-gateway" 21 | 22 | services: 23 | puppeteer-service: 24 | image: trieve/puppeteer-service-ts 25 | build: 26 | context: ./apps/puppeteer-service-ts/ 27 | dockerfile: Dockerfile 28 | environment: 29 | - PORT=3000 30 | - PROXY_SERVER=${PROXY_SERVER} 31 | - PROXY_USERNAME=${PROXY_USERNAME} 32 | - PROXY_PASSWORD=${PROXY_PASSWORD} 33 | - TWOCAPTCHA_TOKEN=${TWOCAPTCHA_TOKEN} 34 | - MAX_CONCURRENCY=${MAX_CONCURRENCY} 35 | networks: 36 | - backend 37 | 38 | api: 39 | image: trieve/firecrawl 40 | <<: *common-service
41 | depends_on: 42 | - redis 43 | - puppeteer-service 44 | ports: 45 | - "3002:3002" 46 | command: ["pnpm", "run", "start:production"] 47 | 48 | worker: 49 | <<: *common-service 50 | depends_on: 51 | - redis 52 | - puppeteer-service 53 | - api 54 | command: ["pnpm", "run", "workers"] 55 | 56 | redis: 57 | image: redis:alpine 58 | networks: 59 | - backend 60 | command: redis-server --bind 0.0.0.0 61 | 62 | networks: 63 | backend: 64 | driver: bridge 65 | -------------------------------------------------------------------------------- /img/firecrawl_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/devflowinc/firecrawl-simple/6dfc712a1929e3a6c01aff27d4f796d90c6ced5c/img/firecrawl_logo.png --------------------------------------------------------------------------------