├── .dockerignore ├── .eslintignore ├── .eslintrc.cjs ├── .github └── workflows │ ├── ci.yaml │ ├── deploy-dev-channel.yaml │ ├── docs-publish.yaml │ ├── make-draft-release.yaml │ └── release.yaml ├── .gitignore ├── .husky └── pre-commit ├── .pre-commit-config.yaml ├── .prettierignore ├── .prettierrc ├── CHANGES.md ├── Dockerfile ├── LICENSE ├── NOTICE ├── README.md ├── config └── policies │ ├── brave.json │ └── chromium.json ├── docker-compose.yml ├── docker-entrypoint.sh ├── docs ├── docs │ ├── CNAME │ ├── assets │ │ ├── brand │ │ │ ├── browsertrix-crawler-icon-color-dynamic.svg │ │ │ └── browsertrix-crawler-white.svg │ │ └── fonts │ │ │ ├── Inter-Italic.var.woff2 │ │ │ ├── Inter.var.woff2 │ │ │ └── Recursive_VF_1.084.woff2 │ ├── develop │ │ ├── docs.md │ │ └── index.md │ ├── index.md │ ├── overrides │ │ ├── .icons │ │ │ └── bootstrap │ │ │ │ ├── bug-fill.svg │ │ │ │ ├── chat-left-text-fill.svg │ │ │ │ ├── check-circle-fill.svg │ │ │ │ ├── check-circle.svg │ │ │ │ ├── dash-circle.svg │ │ │ │ ├── exclamation-circle-fill.svg │ │ │ │ ├── exclamation-diamond-fill.svg │ │ │ │ ├── exclamation-triangle-fill.svg │ │ │ │ ├── exclamation-triangle.svg │ │ │ │ ├── eye.svg │ │ │ │ ├── file-earmark-text-fill.svg │ │ │ │ ├── github.svg │ │ │ │ ├── globe.svg │ │ │ │ ├── info-circle-fill.svg │ │ │ │ ├── mastodon.svg │ │ │ │ ├── mortarboard-fill.svg │ │ │ │ ├── pencil-fill.svg │ │ │ │ ├── pencil.svg │ │ │ │ ├── question-circle-fill.svg │ │ │ │ ├── quote.svg │ │ │ │ ├── x-octagon-fill.svg │ │ │ │ ├── x-octagon.svg │ │ │ │ └── youtube.svg │ │ └── main.html │ ├── stylesheets │ │ └── extra.css │ └── user-guide │ │ ├── behaviors.md │ │ ├── browser-profiles.md │ │ ├── cli-options.md │ │ ├── common-options.md │ │ ├── crawl-scope.md │ │ ├── exit-codes.md │ │ ├── index.md │ │ ├── outputs.md │ │ ├── proxies.md │ │ ├── qa.md │ │ └── yaml-config.md ├── gen-cli.sh └── mkdocs.yml ├── html ├── createProfile.html ├── replay.html ├── screencast.html └── vnc_lite.html ├── package.json ├── requirements.txt ├── src ├── crawler.ts ├── create-login-profile.ts ├── main.ts ├── replaycrawler.ts └── util │ ├── argParser.ts │ ├── blockrules.ts │ ├── browser.ts │ ├── constants.ts │ ├── file_reader.ts │ ├── flowbehavior.ts │ ├── healthcheck.ts │ ├── logger.ts │ ├── originoverride.ts │ ├── proxy.ts │ ├── recorder.ts │ ├── redis.ts │ ├── replayserver.ts │ ├── reqresp.ts │ ├── screencaster.ts │ ├── screenshots.ts │ ├── seeds.ts │ ├── sitemapper.ts │ ├── state.ts │ ├── storage.ts │ ├── textextract.ts │ ├── timing.ts │ ├── wacz.ts │ ├── warcwriter.ts │ └── worker.ts ├── test-setup.js ├── tests ├── .DS_Store ├── adblockrules.test.js ├── add-exclusion.test.js ├── basic_crawl.test.js ├── blockrules.test.js ├── brave-query-redir.test.js ├── collection_name.test.js ├── config_file.test.js ├── config_stdin.test.js ├── crawl_overwrite.js ├── custom-behavior-flow.test.js ├── custom-behavior.test.js ├── custom-behaviors │ ├── custom-2.js │ ├── custom-flow.json │ └── custom.js ├── custom_driver.test.js ├── custom_selector.test.js ├── dryrun.test.js ├── exclude-redirected.test.js ├── extra_hops_depth.test.js ├── file_stats.test.js ├── fixtures │ ├── crawl-1.yaml │ ├── crawl-2.yaml │ ├── driver-1.mjs │ ├── pages.jsonl │ ├── proxy-key │ ├── proxy-key.pub │ └── urlSeedFile.txt ├── http-auth.test.js ├── invalid-behaviors │ └── invalid-export.js ├── lang-code.test.js ├── limit_reached.test.js ├── log_filtering.test.js ├── mult_url_crawl_with_favicon.test.js ├── multi-instance-crawl.test.js ├── non-html-crawl.test.js ├── pageinfo-records.test.js ├── 
proxy.test.js ├── qa_compare.test.js ├── retry-failed.test.js ├── rollover-writer.test.js ├── saved-state.test.js ├── scopes.test.js ├── screenshot.test.js ├── seeds.test.js ├── sitemap-parse.test.js ├── storage.test.js ├── text-extract.test.js ├── upload-wacz.test.js ├── url_file_list.test.js └── warcinfo.test.js ├── tsconfig.eslint.json ├── tsconfig.json └── yarn.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | output/ 2 | node_modules/ 3 | crawls/ 4 | test-crawls/ 5 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | .* 2 | behaviors.js 3 | behaviors/ 4 | scratch/ 5 | -------------------------------------------------------------------------------- /.eslintrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | env: { 3 | browser: true, 4 | es2021: true, 5 | node: true, 6 | jest: true, 7 | }, 8 | extends: [ 9 | "eslint:recommended", 10 | "plugin:@typescript-eslint/recommended", 11 | "prettier", 12 | ], 13 | parser: "@typescript-eslint/parser", 14 | plugins: ["@typescript-eslint"], 15 | parserOptions: { 16 | ecmaVersion: 12, 17 | sourceType: "module", 18 | project: ["./tsconfig.eslint.json"], 19 | tsconfigRootDir: __dirname, 20 | }, 21 | rules: { 22 | "no-constant-condition": ["error", { checkLoops: false }], 23 | "no-use-before-define": [ 24 | "error", 25 | { 26 | variables: true, 27 | functions: false, 28 | classes: false, 29 | allowNamedExports: true, 30 | }, 31 | ], 32 | "@typescript-eslint/no-floating-promises": "error", 33 | "@typescript-eslint/await-thenable": "error" 34 | }, 35 | reportUnusedDisableDirectives: true, 36 | }; 37 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Node.js CI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | # Cancel in progress workflows on pull_requests. 
8 | # https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-a-fallback-value 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | lint: 15 | runs-on: ubuntu-latest 16 | 17 | strategy: 18 | matrix: 19 | node-version: [20.x] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Use Node.js ${{ matrix.node-version }} 24 | uses: actions/setup-node@v3 25 | with: 26 | node-version: ${{ matrix.node-version }} 27 | - name: install requirements 28 | run: yarn install 29 | - name: run linter 30 | run: yarn lint && yarn format 31 | 32 | build: 33 | runs-on: ubuntu-latest 34 | 35 | strategy: 36 | matrix: 37 | node-version: [20.x] 38 | 39 | steps: 40 | - uses: actions/checkout@v3 41 | 42 | - name: Use Node.js ${{ matrix.node-version }} 43 | uses: actions/setup-node@v3 44 | with: 45 | node-version: ${{ matrix.node-version }} 46 | 47 | - uses: actions/setup-python@v4 48 | with: 49 | python-version: 3.x 50 | 51 | - name: install requirements 52 | run: yarn install 53 | 54 | - name: build js 55 | run: yarn run tsc 56 | 57 | - name: Cache Docker Images 58 | uses: ScribeMD/docker-cache@0.5.0 59 | with: 60 | key: docker-${{ runner.os }}-${{ hashFiles('Dockerfile') }} 61 | 62 | - name: Login to DockerHub 63 | uses: docker/login-action@v3 64 | with: 65 | username: ${{ secrets.DOCKER_USERNAME }} 66 | password: ${{ secrets.DOCKER_PASSWORD }} 67 | 68 | - name: build docker 69 | run: docker compose build 70 | 71 | - name: install python deps for docs 72 | run: pip install mkdocs-material 73 | 74 | - name: build docs for crawl test 75 | run: cd docs/ && mkdocs build 76 | 77 | - name: add http-server for tests 78 | run: yarn add -D http-server 79 | 80 | - name: install py-wacz as root for tests 81 | run: sudo pip install wacz --ignore-installed 82 | 83 | - name: run all tests as root 84 | run: sudo DOCKER_HOST_NAME=172.17.0.1 CI=true yarn test -validate 85 | 86 | - name: run saved state + qa compare test as non-root - with volume owned by current user 87 | run: | 88 | sudo rm -rf ./test-crawls 89 | mkdir test-crawls 90 | sudo CI=true yarn test ./tests/saved-state.test.js ./tests/qa_compare.test.js 91 | -------------------------------------------------------------------------------- /.github/workflows/deploy-dev-channel.yaml: -------------------------------------------------------------------------------- 1 | name: "*** Deploy Crawler to Dev Channel ***" 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | channel: 7 | description: Crawler Channel 8 | type: choice 9 | required: true 10 | default: dev 11 | options: 12 | - dev 13 | - dev-2 14 | 15 | jobs: 16 | build_and_deploy_crawler: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v3 21 | 22 | - name: Set up Docker Buildx 23 | uses: docker/setup-buildx-action@v2 24 | with: 25 | driver-opts: network=host 26 | 27 | - name: Login to Registry 28 | uses: docker/login-action@v2 29 | with: 30 | registry: ${{ secrets.DEPLOY_REGISTRY }} 31 | username: ${{ secrets.DEPLOY_REGISTRY_API_TOKEN }} 32 | password: ${{ secrets.DEPLOY_REGISTRY_API_TOKEN }} 33 | 34 | - name: Build Image 35 | uses: docker/build-push-action@v3 36 | with: 37 | context: . 
38 | push: true 39 | tags: ${{ secrets.DEPLOY_REGISTRY_PATH }}/webrecorder/browsertrix-crawler:${{ github.event.inputs.channel }} 40 | cache-from: type=gha,scope=backend 41 | cache-to: type=gha,scope=backend,mode=max 42 | -------------------------------------------------------------------------------- /.github/workflows/docs-publish.yaml: -------------------------------------------------------------------------------- 1 | name: docs-publish 2 | on: 3 | push: 4 | branches: 5 | - main 6 | paths: 7 | - 'docs/**' 8 | 9 | permissions: 10 | contents: write 11 | 12 | jobs: 13 | deploy_docs: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | - uses: actions/setup-python@v4 18 | with: 19 | python-version: 3.x 20 | 21 | - name: build docker image (for getting cli) 22 | run: docker compose build 23 | 24 | - name: generate cli 25 | run: docs/gen-cli.sh 26 | 27 | - run: pip install mkdocs-material 28 | - run: cd docs/ && mkdocs gh-deploy --force 29 | -------------------------------------------------------------------------------- /.github/workflows/make-draft-release.yaml: -------------------------------------------------------------------------------- 1 | name: Generate Draft Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - "*-release" 8 | 9 | jobs: 10 | package_chart: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Check out Git repository 15 | uses: actions/checkout@v3 16 | 17 | - name: Get Version 18 | run: | 19 | echo "version=$(jq -r .version package.json)" >> "$GITHUB_ENV" 20 | 21 | - name: Make Draft Release 22 | uses: softprops/action-gh-release@v1 23 | with: 24 | name: "Browsertrix Crawler v${{ env.version }}" 25 | tag_name: v${{ env.version }} 26 | draft: true 27 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Docker image 2 | on: 3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | push_to_registries: 8 | name: Build x86 and ARM Images and push to Dockerhub 9 | runs-on: ubuntu-22.04 10 | steps: 11 | - name: Check out the repo 12 | uses: actions/checkout@v4 13 | 14 | - name: Docker image metadata 15 | id: meta 16 | uses: docker/metadata-action@v5 17 | with: 18 | images: webrecorder/browsertrix-crawler 19 | tags: | 20 | type=semver,pattern={{version}} 21 | 22 | - name: Set up QEMU 23 | uses: docker/setup-qemu-action@v3 24 | with: 25 | platforms: arm64 26 | 27 | - name: Set up Docker Buildx 28 | uses: docker/setup-buildx-action@v1 29 | - name: Login to DockerHub 30 | uses: docker/login-action@v3 31 | with: 32 | username: ${{ secrets.DOCKER_USERNAME }} 33 | password: ${{ secrets.DOCKER_PASSWORD }} 34 | - name: Build and push 35 | id: docker_build 36 | uses: docker/build-push-action@v3 37 | with: 38 | context: . 
39 | push: true 40 | tags: ${{ steps.meta.outputs.tags }} 41 | platforms: "linux/amd64,linux/arm64" 42 | - name: Image digest 43 | run: echo ${{ steps.docker_build.outputs.digest }} 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | *.egg-info/ 4 | collections/ 5 | node_modules/ 6 | crawls/ 7 | test-crawls/ 8 | .DS_Store 9 | dist 10 | scratch/ 11 | venv/ 12 | docs/venv/ 13 | -------------------------------------------------------------------------------- /.husky/pre-commit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | . "$(dirname -- "$0")/_/husky.sh" 3 | yarn lint:fix 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: husky-run-pre-commit 5 | name: husky 6 | language: system 7 | entry: .husky/pre-commit 8 | pass_filenames: false 9 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | dist 2 | scratch 3 | crawls 4 | test-crawls 5 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BROWSER_VERSION=1.79.118 2 | ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base:brave-${BROWSER_VERSION} 3 | 4 | FROM ${BROWSER_IMAGE_BASE} 5 | 6 | # needed to add args to main build stage 7 | ARG BROWSER_VERSION 8 | 9 | ENV GEOMETRY=1360x1020x16 \ 10 | BROWSER_VERSION=${BROWSER_VERSION} \ 11 | BROWSER_BIN=google-chrome \ 12 | OPENSSL_CONF=/app/openssl.conf \ 13 | VNC_PASS=vncpassw0rd! 
\ 14 | DETACHED_CHILD_PROC=1 15 | 16 | EXPOSE 9222 9223 6080 17 | 18 | WORKDIR /app 19 | 20 | ADD package.json yarn.lock /app/ 21 | 22 | # to allow forcing rebuilds from this stage 23 | ARG REBUILD 24 | 25 | # Download and format ad host blocklist as JSON 26 | RUN mkdir -p /tmp/ads && cd /tmp/ads && \ 27 | curl -vs -o ad-hosts.txt https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts && \ 28 | cat ad-hosts.txt | grep '^0.0.0.0 '| awk '{ print $2; }' | grep -v '0.0.0.0' | jq --raw-input --slurp 'split("\n")' > /app/ad-hosts.json && \ 29 | rm /tmp/ads/ad-hosts.txt 30 | 31 | RUN yarn install --network-timeout 1000000 32 | 33 | ADD tsconfig.json /app/ 34 | ADD src /app/src 35 | 36 | RUN yarn run tsc 37 | 38 | ADD config/ /app/ 39 | 40 | ADD html/ /app/html/ 41 | 42 | ARG RWP_VERSION=2.3.7 43 | ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/ 44 | ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/ 45 | ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz 46 | 47 | RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js && chmod a+r /app/html/rwp/* 48 | 49 | RUN ln -s /app/dist/main.js /usr/bin/crawl; \ 50 | ln -s /app/dist/main.js /usr/bin/qa; \ 51 | ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile 52 | 53 | RUN mkdir -p /app/behaviors 54 | 55 | WORKDIR /crawls 56 | 57 | # enable to test custom behaviors build (from browsertrix-behaviors) 58 | # COPY behaviors.js /app/node_modules/browsertrix-behaviors/dist/behaviors.js 59 | 60 | # add brave/chromium group policies 61 | RUN mkdir -p /etc/brave/policies/managed/ 62 | ADD config/policies /etc/brave/policies/managed/ 63 | 64 | ADD docker-entrypoint.sh /docker-entrypoint.sh 65 | ENTRYPOINT ["/docker-entrypoint.sh"] 66 | 67 | CMD ["crawl"] 68 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | browsertrix-mini 2 | 3 | Copyright (C) 2020 Webrecorder Software 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU Affero General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU Affero General Public License for more details. 14 | 15 | You should have received a copy of the GNU Affero General Public License 16 | along with this program. If not, see . 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Browsertrix Crawler 1.x 2 | 3 | Browsertrix Crawler is a standalone browser-based high-fidelity crawling system, designed to run a complex, customizable browser-based crawl in a single Docker container. Browsertrix Crawler uses [Puppeteer](https://github.com/puppeteer/puppeteer) to control one or more [Brave Browser](https://brave.com/) browser windows in parallel. Data is captured through the [Chrome Devtools Protocol (CDP)](https://chromedevtools.github.io/devtools-protocol/) in the browser. 
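For a quick first crawl, a single `docker run` invocation along the lines of the User Guide examples is enough. The command below is illustrative only; the URL and collection name are placeholders:

```sh
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --collection example
```

The resulting WACZ file is written under `./crawls/collections/example/` and can be opened in ReplayWeb.page.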
4 | 5 | For information on how to use and develop Browsertrix Crawler, see the hosted [Browsertrix Crawler documentation](https://crawler.docs.browsertrix.com). 6 | 7 | For information on how to build the docs locally, see the [docs page](docs/docs/develop/docs.md). 8 | 9 | 10 | ## Support 11 | Initial support for 0.x version of Browsertrix Crawler, was provided by [Kiwix](https://kiwix.org/). The initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder. 12 | 13 | Additional support for Browsertrix Crawler, including for the development of the 0.4.x version has been provided by [Portico](https://www.portico.org/). 14 | 15 | ## License 16 | 17 | [AGPLv3](https://www.gnu.org/licenses/agpl-3.0) or later, see [LICENSE](LICENSE) for more details. 18 | -------------------------------------------------------------------------------- /config/policies/brave.json: -------------------------------------------------------------------------------- 1 | { 2 | "BraveRewardsDisabled": true, 3 | "BraveWalletDisabled": true, 4 | "BraveVPNDisabled": 1, 5 | "BraveAIChatEnabled": false, 6 | "TorDisabled": true 7 | } 8 | -------------------------------------------------------------------------------- /config/policies/chromium.json: -------------------------------------------------------------------------------- 1 | { 2 | "AlwaysOpenPdfExternally": true, 3 | "NewTabPageLocation": "about:blank", 4 | "RestoreOnStartup": 5, 5 | "IncognitoModeAvailability": 1, 6 | "AllowFileSelectionDialogs": false, 7 | "AutoLaunchProtocolsFromOrigins": [{ 8 | "allowed_origins":["https://t.me"], 9 | "protocol": "tg" 10 | }], 11 | "URLBlocklist": [ 12 | "file://*" 13 | ], 14 | "DownloadDirectory": "/dev/null", 15 | "SpellcheckEnabled": false, 16 | "HttpsUpgradesEnabled": false 17 | } 18 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | services: 4 | crawler: 5 | image: ${REGISTRY}webrecorder/browsertrix-crawler:latest 6 | build: 7 | context: ./ 8 | 9 | volumes: 10 | - ./crawls:/crawls 11 | 12 | cap_add: 13 | - NET_ADMIN 14 | - SYS_ADMIN 15 | 16 | shm_size: 1gb 17 | -------------------------------------------------------------------------------- /docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # disable core dumps 4 | ulimit -c 0 5 | 6 | # Get UID/GID from volume dir 7 | 8 | VOLUME_UID=$(stat -c '%u' /crawls) 9 | VOLUME_GID=$(stat -c '%g' /crawls) 10 | 11 | # Get the UID/GID we are running as 12 | 13 | MY_UID=$(id -u) 14 | MY_GID=$(id -g) 15 | 16 | # If we aren't running as the owner of the /crawls/ dir then add a new user 17 | # btrix with the same UID/GID of the /crawls dir and run as that user instead. 
18 | 19 | if [ "$MY_GID" != "$VOLUME_GID" ] || [ "$MY_UID" != "$VOLUME_UID" ]; then 20 | groupadd btrix 21 | groupmod -o --gid $VOLUME_GID btrix 22 | 23 | useradd -ms /bin/bash -g $VOLUME_GID btrix 24 | usermod -o -u $VOLUME_UID btrix > /dev/null 25 | 26 | exec gosu btrix:btrix "$@" 27 | else 28 | exec "$@" 29 | fi 30 | 31 | -------------------------------------------------------------------------------- /docs/docs/CNAME: -------------------------------------------------------------------------------- 1 | crawler.docs.browsertrix.com 2 | -------------------------------------------------------------------------------- /docs/docs/assets/brand/browsertrix-crawler-icon-color-dynamic.svg: -------------------------------------------------------------------------------- 1 | 3 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /docs/docs/assets/brand/browsertrix-crawler-white.svg: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /docs/docs/assets/fonts/Inter-Italic.var.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/d2a6aa9805fa08c9a51b43005e0a562a032fd78a/docs/docs/assets/fonts/Inter-Italic.var.woff2 -------------------------------------------------------------------------------- /docs/docs/assets/fonts/Inter.var.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/d2a6aa9805fa08c9a51b43005e0a562a032fd78a/docs/docs/assets/fonts/Inter.var.woff2 -------------------------------------------------------------------------------- /docs/docs/assets/fonts/Recursive_VF_1.084.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/d2a6aa9805fa08c9a51b43005e0a562a032fd78a/docs/docs/assets/fonts/Recursive_VF_1.084.woff2 -------------------------------------------------------------------------------- /docs/docs/develop/docs.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | This documentation is built with the [Mkdocs](https://www.mkdocs.org/) static site generator. 4 | 5 | ## Docs Setup 6 | 7 | Python is required to build the docs, then run: 8 | 9 | pip install mkdocs-material 10 | 11 | 12 | ## Docs Server 13 | 14 | To start the docs server, simply run: 15 | 16 | mkdocs serve 17 | 18 | The documentation will then be available on `http://localhost:8000/` 19 | 20 | The command-line options are rebuilt using the `docs/gen-cli.sh` script. 21 | 22 | Refer to the [Mkdocs](https://www.mkdocs.org/) and [Material for MkDocs](https://squidfunk.github.io/mkdocs-material/) pages 23 | for more info about the documentation. 24 | -------------------------------------------------------------------------------- /docs/docs/develop/index.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | ## Usage with Docker Compose 4 | 5 | Many examples in User Guide demonstrate running Browsertrix Crawler with `docker run`. 6 | 7 | Docker Compose is recommended for building the image and for simple configurations. A simple Docker Compose configuration file is included in the Git repository. 
8 | 9 | To build the latest image, run: 10 | 11 | ```sh 12 | docker-compose build 13 | ``` 14 | 15 | Docker Compose also simplifies some config options, such as mounting the volume for the crawls. 16 | 17 | The following command starts a crawl with 2 workers and generates the CDX: 18 | 19 | ```sh 20 | docker-compose run crawler crawl --url https://webrecorder.net/ --generateCDX --collection wr-net --workers 2 21 | ``` 22 | 23 | In this example, the crawl data is written to `./crawls/collections/wr-net` by default. 24 | 25 | While the crawl is running, the status of the crawl prints the progress to the JSON-L log output. This can be disabled by using the `--logging` option and not including `stats`. 26 | 27 | ## Multi-Platform Build / Support for Apple Silicon 28 | 29 | Browsertrix Crawler uses a browser image which supports amd64 and arm64. 30 | 31 | This means Browsertrix Crawler can be built natively on Apple Silicon systems using the default settings. Running `docker-compose build` on an Apple Silicon should build a native version that should work for development. 32 | 33 | ## Modifying Browser Image 34 | 35 | It is also possible to build Browsertrix Crawler with a different browser image. Currently, browser images using Brave Browser and Chrome/Chromium (depending on host system chip architecture) are supported via [browsertrix-browser-base](https://github.com/webrecorder/browsertrix-browser-base), however, only Brave Browser receives regular version updates from us. 36 | 37 | The browser base image used is specified and can be changed at the top of the Dockerfile in the Browsertrix Crawler repo. 38 | 39 | Custom browser images can be used by forking [browsertrix-browser-base](https://github.com/webrecorder/browsertrix-browser-base), locally building or publishing an image, and then modifying the Dockerfile in this repo to build from that image. 40 | -------------------------------------------------------------------------------- /docs/docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | - toc 5 | --- 6 | 7 | # Home 8 | 9 | Welcome to the Browsertrix Crawler official documentation. 10 | 11 | Browsertrix Crawler is a simplified browser-based high-fidelity crawling system, designed to run a complex, customizable browser-based crawl in a single Docker container. Browsertrix Crawler uses [Puppeteer](https://github.com/puppeteer/puppeteer) to control one or more [Brave Browser](https://brave.com/) browser windows in parallel. Data is captured through the [Chrome Devtools Protocol (CDP)](https://chromedevtools.github.io/devtools-protocol/) in the browser. 12 | 13 | Browsertrix Crawler is a command line application responsible for the core features of [Browsertrix](https://browsertrix.com), Webrecorder's cloud-based web archiving service. See the [Browsertrix documentation](https://docs.browsertrix.cloud/) for more information about Browsertrix, the cloud platform. 14 | 15 | !!! note 16 | 17 | This documentation applies to Browsertrix Crawler versions 1.0.0 and above. Documentation for earlier versions of the crawler is available in the [Browsertrix Crawler Github repository](https://github.com/webrecorder/browsertrix-crawler)'s README file in older commits. 18 | 19 | ## Features 20 | 21 | - Single-container, browser based crawling with a headless/headful browser running pages in multiple windows. 
22 | - Support for custom browser behaviors, using [Browsertrix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) including autoscroll, video autoplay, and site-specific behaviors. 23 | - YAML-based configuration, passed via file or via stdin. 24 | - Seed lists and per-seed scoping rules. 25 | - URL blocking rules to block capture of specific URLs (including by iframe URL and/or by iframe contents). 26 | - Screencasting: Ability to watch crawling in real-time. 27 | - Screenshotting: Ability to take thumbnails, full page screenshots, and/or screenshots of the initial page view. 28 | - Optimized (non-browser) capture of non-HTML resources. 29 | - Extensible Puppeteer driver script for customizing behavior per crawl or page. 30 | - Ability to create and reuse browser profiles interactively or via automated user/password login using an embedded browser. 31 | - Multi-platform support — prebuilt Docker images available for Intel/AMD and Apple Silicon (M1/M2) CPUs. 32 | - Quality Assurance (QA) crawling — analyze the replay of existing crawls (via WACZ) and produce stats comparing what the browser encountered on a website during crawling against the replay of the crawl WACZ. 33 | 34 | ## Documentation 35 | 36 | If something is missing, unclear, or seems incorrect, please open an [issue](https://github.com/webrecorder/browsertrix-crawler/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc) and we'll try to make sure that your questions get answered here in the future! 37 | 38 | ## Code 39 | 40 | Browsertrix Crawler is free and open source software, with all code available in the [main repository on Github](https://github.com/webrecorder/browsertrix-crawler). 41 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/bug-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/chat-left-text-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/check-circle-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/check-circle.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/dash-circle.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/exclamation-circle-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/exclamation-diamond-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/exclamation-triangle-fill.svg: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/exclamation-triangle.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/eye.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/file-earmark-text-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/github.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/globe.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/info-circle-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/mastodon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/mortarboard-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/pencil-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/pencil.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/question-circle-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/quote.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/x-octagon-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/x-octagon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/youtube.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 
-------------------------------------------------------------------------------- /docs/docs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} {% block icons %} {% set icon_path = 2 | "overrides/.icons/bootstrap/" %} {{ super() }} {% endblock %} 3 | -------------------------------------------------------------------------------- /docs/docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | /* Font style definitions */ 2 | 3 | @font-face { 4 | font-family: "Recursive"; 5 | font-style: oblique 0deg 15deg; 6 | font-weight: 300 1000; 7 | src: url("../assets/fonts/Recursive_VF_1.084.woff2") format("woff2"); 8 | font-feature-settings: "ss12"; 9 | } 10 | 11 | @font-face { 12 | font-family: "Inter"; 13 | font-weight: 100 900; 14 | font-display: swap; 15 | font-style: normal; 16 | src: url("../assets/fonts/Inter.var.woff2") format("woff2"); 17 | font-feature-settings: "ss03"; 18 | } 19 | 20 | @font-face { 21 | font-family: "Inter"; 22 | font-weight: 100 900; 23 | font-display: swap; 24 | font-style: italic; 25 | src: url("../assets/fonts/Inter-Italic.var.woff2") format("woff2"); 26 | font-feature-settings: "ss03"; 27 | } 28 | 29 | @font-face { 30 | font-family: "Konsole"; 31 | font-weight: 100 900; 32 | font-display: swap; 33 | font-style: normal; 34 | src: url("https://wr-static.sfo3.cdn.digitaloceanspaces.com/fonts/konsole/Konsolev1.1-VF.woff2") 35 | format("woff2"); 36 | } 37 | 38 | :root { 39 | --md-display-font: "Konsole", "Helvetica", sans-serif; 40 | --md-code-font: "Recursive", monospace; 41 | --md-text-font: "Inter", "Helvetica", "Arial", sans-serif; 42 | --wr-blue-primary: #088eaf; 43 | --wr-orange-primary: #bb4a00; 44 | } 45 | 46 | [data-md-color-scheme="webrecorder"] { 47 | --md-primary-fg-color: #4D7C0F; 48 | --md-primary-fg-color--light: #0782A1; 49 | --md-primary-fg-color--dark: #066B84; 50 | --md-typeset-color: black; 51 | --md-accent-fg-color: #0782A1; 52 | --md-typeset-a-color: #066B84; 53 | --md-code-bg-color: #F9FAFB; 54 | } 55 | 56 | /* Nav changes */ 57 | 58 | .md-header__title, 59 | .md-nav__title { 60 | font-family: var(--md-display-font); 61 | text-transform: uppercase; 62 | font-variation-settings: 63 | "wght" 750, 64 | "wdth" 87; 65 | margin-left: 0 !important; 66 | } 67 | 68 | .md-header__title--active { 69 | font-family: var(--md-display-font); 70 | text-transform: none; 71 | font-variation-settings: 72 | "wght" 550, 73 | "wdth" 90; 74 | } 75 | 76 | .md-header__button { 77 | margin-right: 0 !important; 78 | } 79 | 80 | /* Custom menu item hover */ 81 | 82 | .md-tabs__link { 83 | font-family: var(--md-code-font); 84 | font-weight: 400; 85 | opacity: 0.9; 86 | transition: 87 | 0.4s cubic-bezier(0.1, 0.7, 0.1, 1), 88 | opacity 0.25s; 89 | } 90 | 91 | .md-tabs__link:hover { 92 | font-weight: 600; 93 | } 94 | 95 | /* Custom body typography rules */ 96 | 97 | .md-typeset a { 98 | text-decoration: underline; 99 | } 100 | 101 | .headerlink { 102 | text-decoration: none !important; 103 | } 104 | 105 | code, 106 | pre, 107 | kbd { 108 | font-variation-settings: "MONO" 1; 109 | font-feature-settings: "ss01", "ss02", "ss08"; 110 | } 111 | 112 | code { 113 | border-width: 1px; 114 | border-color: #d1d5db; 115 | border-style: solid; 116 | 117 | white-space : pre-wrap !important; 118 | } 119 | 120 | .md-typeset h1, 121 | h2, 122 | h3, 123 | h4, 124 | h5 { 125 | color: black; 126 | } 127 | 128 | .md-typeset h1, 129 | h2, 130 | h3 { 131 | font-weight: 
650 !important; 132 | font-variation-settings: "OPSZ" 35; 133 | } 134 | 135 | /* Custom badge classes, applies custom overrides to inline-code blocks */ 136 | 137 | .badge-blue { 138 | background-color: var(--wr-blue-primary) !important; 139 | border-color: var(--wr-blue-primary) !important; 140 | color: white !important; 141 | font-family: var(--md-text-font); 142 | font-weight: 600; 143 | } 144 | 145 | .badge-green { 146 | background-color: hsl(142 76% 36%) !important; 147 | border-color: hsl(142 76% 36%) !important; 148 | color: white !important; 149 | font-family: var(--md-text-font); 150 | font-weight: 600; 151 | } 152 | 153 | .badge-orange { 154 | background-color: var(--wr-orange-primary) !important; 155 | border-color: var(--wr-orange-primary) !important; 156 | color: white !important; 157 | font-family: var(--md-text-font); 158 | font-weight: 600; 159 | } 160 | 161 | /* Status Styling */ 162 | 163 | .status-success { 164 | font-family: var(--md-code-font); 165 | font-weight: 500; 166 | white-space: nowrap; 167 | & svg { 168 | color: hsl(142.1 76.2% 36.3%); 169 | } 170 | } 171 | 172 | .status-warning { 173 | font-family: var(--md-code-font); 174 | font-weight: 500; 175 | white-space: nowrap; 176 | & svg { 177 | color: hsl(32.1 94.6% 43.7%); 178 | } 179 | } 180 | 181 | .status-danger { 182 | font-family: var(--md-code-font); 183 | font-weight: 500; 184 | white-space: nowrap; 185 | & svg { 186 | color: hsl(0 72.2% 50.6%); 187 | } 188 | } 189 | 190 | .status-waiting { 191 | font-family: var(--md-code-font); 192 | font-weight: 500; 193 | white-space: nowrap; 194 | & svg { 195 | color: hsl(271.5 81.3% 55.9%); 196 | } 197 | } 198 | -------------------------------------------------------------------------------- /docs/docs/user-guide/browser-profiles.md: -------------------------------------------------------------------------------- 1 | # Creating and Using Browser Profiles 2 | 3 | Browsertrix Crawler can use existing browser profiles when running a crawl. This allows the browser to be pre-configured by logging in to certain sites or changing other settings, before running a crawl. By creating a logged in profile, the actual login credentials are not included in the crawl, only (temporary) session cookies. 4 | 5 | ## Interactive Profile Creation 6 | 7 | Interactive profile creation is used for creating profiles of more complex sites, or logging in to multiple sites at once. 8 | 9 | To use this mode, don't specify `--username` or `--password` flags and expose two ports on the Docker container to allow DevTools to connect to the browser and to serve a status page. 10 | 11 | In profile creation mode, Browsertrix Crawler launches a browser which uses a VNC server (via [noVNC](https://novnc.com/)) running on port 6080 to provide a 'remote desktop' for interacting with the browser. 12 | 13 | After interactively logging into desired sites or configuring other settings, _Create Profile_ should be clicked to initiate profile creation. Browsertrix Crawler will then stop the browser, and save the browser profile. 14 | 15 | To start in interactive profile creation mode, run: 16 | 17 | ```sh 18 | docker run -p 6080:6080 -p 9223:9223 -v $PWD/crawls/profiles:/crawls/profiles/ -it webrecorder/browsertrix-crawler create-login-profile --url "https://example.com/" 19 | ``` 20 | 21 | Then, open a browser pointing to `http://localhost:9223/` and use the embedded browser to log in to any sites or configure any settings as needed. 22 | 23 | Click _Create Profile_ at the top when done. 
The profile will then be created in `./crawls/profiles/profile.tar.gz`, containing the settings of this browsing session. 24 | 25 | It is also possible to use an existing profile via the `--profile` flag. This allows previous browsing sessions to be extended as needed. 26 | 27 | ```sh 28 | docker run -p 6080:6080 -p 9223:9223 -v $PWD/crawls/profiles:/crawls/profiles -it webrecorder/browsertrix-crawler create-login-profile --url "https://example.com/" --filename "/crawls/profiles/newProfile.tar.gz" --profile "/crawls/profiles/oldProfile.tar.gz" 29 | ``` 30 | 31 | ## Headless vs Headful Profiles 32 | 33 | Browsertrix Crawler supports both headful and headless crawling. We have historically recommended headful crawling as most accurate to the user experience; however, headless crawling may be faster and, in recent versions of Chromium-based browsers, should be much closer in fidelity to headful crawling. 34 | 35 | To use profiles in headless mode, profiles should also be created with the `--headless` flag. 36 | 37 | When creating a browser profile in headless mode, Browsertrix will use the devtools protocol on port 9222 to stream the browser interface. 38 | 39 | To create a profile in headless mode, run: 40 | 41 | ```sh 42 | docker run -p 9222:9222 -p 9223:9223 -v $PWD/crawls/profiles:/crawls/profiles/ -it webrecorder/browsertrix-crawler create-login-profile --headless --url "https://example.com/" 43 | ``` 44 | 45 | ## Automated Profile Creation for User Login 46 | 47 | If the `--automated` flag is provided, Browsertrix Crawler will attempt to create a profile automatically after logging in to sites with a username and password. The username and password can be provided via the `--username` and `--password` flags or, if omitted, from a command-line prompt. 48 | 49 | When using `--automated` or `--username` / `--password`, Browsertrix Crawler will not launch an interactive browser and instead will attempt to finish automatically. 50 | 51 | The automated profile creation system will log in to a single website with supplied credentials and then save the profile. 52 | 53 | The automated profile creation system also takes a screenshot so you can check whether the login succeeded. 54 | 55 | !!! example "Example: Launch a browser and log in to the digipres.club Mastodon instance" 56 | 57 | To automatically create a logged-in browser profile, run: 58 | 59 | ```bash 60 | docker run -v $PWD/crawls/profiles:/crawls/profiles -it webrecorder/browsertrix-crawler create-login-profile --url "https://digipres.club/" 61 | ``` 62 | 63 | The script will then prompt you for login credentials, attempt to log in, and create a tar.gz file in `./crawls/profiles/profile.tar.gz`. 64 | 65 | - The `--url` parameter should specify the URL of a login page. 66 | 67 | - To specify a custom filename, pass the `--filename` parameter. 68 | 69 | - To specify the username and password on the command line (for automated profile creation), pass the `--username` and `--password` flags. 70 | 71 | - To specify headless mode, add the `--headless` flag. Note that for crawls run with the `--headless` flag, it is recommended to also create the profile with `--headless` to ensure the profile is compatible. 72 | 73 | - To specify the window size for the profile creation embedded browser, specify `--windowSize WIDTH,HEIGHT`. (The default is 1600x900) 74 | 75 | The profile creation script attempts to detect the username and password fields on a site as generically as possible, but may not work for all sites. 
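For example, a fully automated profile creation run with credentials supplied on the command line might look like the following sketch, where the URL, username, password, and output filename are placeholders to replace with your own values:

```sh
docker run -v $PWD/crawls/profiles:/crawls/profiles -it webrecorder/browsertrix-crawler create-login-profile \
  --url "https://example.com/login" \
  --automated \
  --username "myusername" \
  --password "mypassword" \
  --filename "/crawls/profiles/example-profile.tar.gz"
```

Note that credentials passed on the command line may be visible in shell history or process listings, so the interactive prompt may be preferable on shared machines.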
76 | 77 | ## Using Browser Profile with a Crawl 78 | 79 | To use a previously created profile with a crawl, use the `--profile` flag or `profile` option. The `--profile` flag can then be used to specify any Brave Browser profile stored as a tarball. The browser profile can either be stored locally and provided as a path, or made available online at an HTTP(S) URL, in which case it will be downloaded before starting the crawl. Using profiles created with the same or an older version of Browsertrix Crawler is recommended to ensure compatibility. This option allows running a crawl with the browser already pre-configured, logged in to certain sites, language settings configured, etc. 80 | 81 | After running the above command, you can now run a crawl with the profile, as follows: 82 | 83 | ```bash 84 | docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --profile /crawls/profiles/profile.tar.gz --url https://digipres.club/ --generateWACZ --collection test-with-profile 85 | ``` 86 | 87 | Profiles can also be loaded from an http/https URL, e.g. `--profile https://example.com/path/to/profile.tar.gz`. 88 | -------------------------------------------------------------------------------- /docs/docs/user-guide/exit-codes.md: -------------------------------------------------------------------------------- 1 | # Exit codes 2 | 3 | The crawler uses the following exit codes to indicate the crawl result. 4 | 5 | | Code | Name | Description | 6 | |--|--|--| 7 | | 0 | Success | Crawl completed normally | 8 | | 1 | GenericError | Unspecified error, check logs for more details | 9 | | 3 | OutOfSpace | Disk is already full | 10 | | 9 | Failed | Crawl failed unexpectedly, might be worth retrying | 11 | | 10 | BrowserCrashed | Browser used to fetch pages has crashed | 12 | | 11 | SignalInterrupted | Crawl stopped gracefully in response to SIGINT signal | 13 | | 12 | FailedLimit | Limit on the number of failed pages, configured with `--failOnFailedLimit`, has been reached | 14 | | 13 | SignalInterruptedForce | Crawl stopped forcefully in response to SIGTERM or repeated SIGINT signal | 15 | | 14 | SizeLimit | Limit on maximum WARC size, configured with `--sizeLimit`, has been reached | 16 | | 15 | TimeLimit | Limit on maximum crawl duration, configured with `--timeLimit`, has been reached | 17 | | 16 | DiskUtilization | Limit on maximum disk usage, configured with `--diskUtilization`, has been reached | 18 | | 17 | Fatal | A fatal (non-retryable) error occurred | 19 | | 21 | ProxyError | Unable to establish connection with proxy | -------------------------------------------------------------------------------- /docs/docs/user-guide/index.md: -------------------------------------------------------------------------------- 1 | # Browsertrix Crawler User Guide 2 | 3 | Welcome to the Browsertrix Crawler User Guide. This page covers the basics of using Browsertrix Crawler, Webrecorder's browser-based high-fidelity crawling system, designed to run a complex, customizable, browser-based crawl in a single Docker container. 4 | 5 | ## Getting Started 6 | 7 | Browsertrix Crawler requires [Docker](https://docs.docker.com/get-docker/) to be installed on the machine running the crawl. 8 | 9 | Assuming Docker is installed, you can run a crawl and test your archive with the following steps. 10 | 11 | You don't even need to clone the Browsertrix Crawler repo; just choose a directory where you'd like the crawl data to be placed, and then run 12 | the following commands. Replace `[URL]` with the website you'd like to crawl. 13 | 14 | 1. 
Run `docker pull webrecorder/browsertrix-crawler` 15 | 2. `docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --url [URL] --generateWACZ --text --collection test` 16 | 3. The crawl will now run, and logs in [JSON Lines](https://jsonlines.org/) format will be output to the console. Depending on the size of the site, this may take a bit! 17 | 4. Once the crawl is finished, a WACZ file will be created in `crawls/collections/test/test.wacz`, relative to the directory where you ran the crawl! 18 | 5. You can go to [ReplayWeb.page](https://replayweb.page), open the generated WACZ file, and browse your newly crawled archive! 19 | 20 | ## Getting Started with Command-Line Options 21 | 22 | Here's how you can use some of the more common command-line options to configure the crawl: 23 | 24 | - To include automated text extraction for full-text search to pages.jsonl, add the `--text` flag. To write extracted text to WARCs instead of or in addition to pages.jsonl, see [Text Extraction](common-options.md#text-extraction). 25 | 26 | - To limit the crawl to a maximum number of pages, add `--limit P` where P is the number of pages that will be crawled. 27 | 28 | - To limit the crawl to a maximum size, set `--sizeLimit` (size in bytes). 29 | 30 | - To limit the crawl time, set `--timeLimit` (in seconds). 31 | 32 | - To run more than one browser worker and crawl in parallel, add `--workers N`, where N is the number of browsers to run in parallel. More browsers will require more CPU and network bandwidth, and do not guarantee faster crawling. 33 | 34 | - To crawl into a new directory, specify a different name for the `--collection` param. If omitted, a new collection directory based on the current time will be created. Adding the `--overwrite` flag will delete the collection directory at the start of the crawl, if it exists. 35 | 36 | Browsertrix Crawler includes a number of additional command-line options, explained in detail throughout this User Guide. 37 | 38 | ## Published Releases / Production Use 39 | 40 | When using Browsertrix Crawler in production, it is recommended to use a specific, published version of the image, e.g. `webrecorder/browsertrix-crawler:[VERSION]` instead of `webrecorder/browsertrix-crawler`, where `[VERSION]` corresponds to one of the published release tags. 41 | 42 | All released Docker Images are available from [Docker Hub, listed by release tag here](https://hub.docker.com/r/webrecorder/browsertrix-crawler/tags?page=1&ordering=last_updated). 43 | 44 | Details for each corresponding release tag are also available on GitHub under [Releases](https://github.com/webrecorder/browsertrix-crawler/releases). 45 | -------------------------------------------------------------------------------- /docs/docs/user-guide/outputs.md: -------------------------------------------------------------------------------- 1 | # Outputs 2 | 3 | This page covers the outputs created by Browsertrix Crawler for both crawls and browser profiles. 4 | 5 | ## Crawl Outputs 6 | 7 | Browsertrix Crawler crawl outputs are organized into collections, which can be found in the `/crawls/collections` directory. Each crawl creates a new collection by default, which can be named with the `-c` or `--collection` argument. If a collection name is not provided, Browsertrix Crawler will generate a unique collection name which includes the `crawl-` prefix followed by a timestamp of when the collection was created. Collections can be overwritten by specifying an existing collection name. 
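For example, combining the `--collection` and `--overwrite` options described above, the following illustrative command (the URL and collection name are placeholders) writes its output to `/crawls/collections/my-crawl` inside the container, deleting any existing collection directory of that name at the start of the crawl:

```sh
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl \
  --url https://example.com/ \
  --collection my-crawl \
  --overwrite
```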
8 | 9 | Each collection is a directory which contains at minimum: 10 | 11 | - `archive/`: A directory containing gzipped [WARC](https://www.iso.org/standard/68004.html) files containing the web traffic recorded during crawling. 12 | - `logs/`: A directory containing one or more crawler log files in [JSON-Lines](https://jsonlines.org/) format. 13 | - `pages/`: A directory containing one or more "Page" files in [JSON-Lines](https://jsonlines.org/) format. At minimum, this directory will contain a `pages.jsonl` file with information about the seed URLs provided to the crawler. If additional pages were discovered and in scope during crawling, information about those non-seed pages is written to `extraPages.jsonl`. For more information about the contents of Page files, see the [WACZ specification](https://specs.webrecorder.net/wacz/1.1.1/#pages-jsonl). 14 | - `warc-cdx/`: A directory containing one or more [CDXJ](https://specs.webrecorder.net/cdxj/0.1.0/) index files created while recording traffic to WARC files. These index files are merged into the final index for the crawl (see the `indexes/` directory below) when the `--generateCDX` or `--generateWACZ` arguments are provided. 15 | 16 | Additionally, the collection may include: 17 | 18 | - A WACZ file named after the collection, if the `--generateWACZ` argument is provided. 19 | - An `indexes/` directory containing merged [CDXJ](https://specs.webrecorder.net/cdxj/0.1.0/) index files for the crawl, if the `--generateCDX` or `--generateWACZ` arguments are provided. If the combined size of the CDXJ files in the `warc-cdx/` directory is over 50 KB, the resulting final CDXJ file will be gzipped. 20 | - A single combined gzipped [WARC](https://www.iso.org/standard/68004.html) file for the crawl, if the `--combineWARC` argument is provided. 21 | - A `crawls/` directory including YAML files describing the crawl state, if the `--saveState` argument is provided with a value of "always", or if the crawl is interrupted and `--saveState` is not set to "never". These files can be used to restart a crawl from its saved state. 22 | 23 | ## Profile Outputs 24 | 25 | Browser profiles that are saved by Browsertrix Crawler are written into the `crawls/profiles` directory. 26 | -------------------------------------------------------------------------------- /docs/docs/user-guide/proxies.md: -------------------------------------------------------------------------------- 1 | # Crawling with Proxies 2 | Browsertrix Crawler supports crawling through HTTP and SOCKS5 proxies, including through a SOCKS5 proxy over an SSH tunnel. 3 | 4 | To specify a proxy, the `PROXY_SERVER` environment variable or `--proxyServer` CLI flag can be passed in. 5 | If both are provided, the `--proxyServer` CLI flag will take precedence. 6 | 7 | The proxy server can be specified as an `http://`, `socks5://`, or `ssh://` URL. 8 | 9 | ### HTTP Proxies 10 | 11 | To crawl through an HTTP proxy running at `http://path-to-proxy-host.example.com:9000`, run the crawler with: 12 | 13 | ```sh 14 | docker run -v $PWD/crawls/:/crawls/ -e PROXY_SERVER=http://path-to-proxy-host.example.com:9000 webrecorder/browsertrix-crawler crawl --url https://example.com/ 15 | ``` 16 | 17 | or 18 | 19 | ```sh 20 | docker run -v $PWD/crawls/:/crawls/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --proxyServer http://path-to-proxy-host.example.com:9000 21 | ``` 22 | 23 | The crawler *does not* support authentication for HTTP proxies, as that is not supported by the browser. 
24 | 25 | (For backwards compatibility with crawler 0.x, the `PROXY_HOST` and `PROXY_PORT` environment variables can be used to specify an HTTP proxy instead of `PROXY_SERVER`, 26 | which takes precedence if provided.) 27 | 28 | 29 | ### SOCKS5 Proxies 30 | 31 | To use a SOCKS5 proxy running at `path-to-proxy-host.example.com:9001`, run the crawler with: 32 | 33 | ```sh 34 | docker run -v $PWD/crawls/:/crawls/ -e PROXY_SERVER=socks5://path-to-proxy-host.example.com:9001 webrecorder/browsertrix-crawler crawl --url https://example.com/ 35 | ``` 36 | 37 | The crawler *does* support password authentication for SOCKS5 proxies, which can be provided as `user:password` in the proxy URL: 38 | 39 | ```sh 40 | docker run -v $PWD/crawls/:/crawls/ -e PROXY_SERVER=socks5://user:password@path-to-proxy-host.example.com:9001 webrecorder/browsertrix-crawler crawl --url https://example.com/ 41 | ``` 42 | 43 | ### SSH Proxies 44 | 45 | Starting with 1.3.0, the crawler also supports crawling through a SOCKS5 proxy that is established over an SSH tunnel, via `ssh -D`. 46 | With this option, the crawler can SSH into a remote machine that has SSH and port forwarding enabled and crawl through that machine's network. 47 | 48 | To use this proxy, the private SSH key file must be provided via the `--sshProxyPrivateKeyFile` CLI flag. 49 | 50 | The private key and public host key should be mounted as volumes into a path in the container, as shown below. 51 | 52 | For example, to connect via SSH to host `path-to-ssh-host.example.com` as user `user` with the private key stored in `./my-proxy-private-key`, run: 53 | 54 | ```sh 55 | docker run -v $PWD/crawls/:/crawls/ -v $PWD/my-proxy-private-key:/tmp/private-key webrecorder/browsertrix-crawler crawl --url https://httpbin.org/ip --proxyServer ssh://user@path-to-ssh-host.example.com --sshProxyPrivateKeyFile /tmp/private-key 56 | ``` 57 | 58 | To also provide the host public key (e.g. a `./known_hosts` file) for additional verification, run: 59 | 60 | ```sh 61 | docker run -v $PWD/crawls/:/crawls/ -v $PWD/my-proxy-private-key:/tmp/private-key -v $PWD/known_hosts:/tmp/known_hosts webrecorder/browsertrix-crawler crawl --url https://httpbin.org/ip --proxyServer ssh://user@path-to-ssh-host.example.com --sshProxyPrivateKeyFile /tmp/private-key --sshProxyKnownHostsFile /tmp/known_hosts 62 | ``` 63 | 64 | The host key will only be checked if provided in a file via `--sshProxyKnownHostsFile`. 65 | 66 | A custom SSH port can be provided with `--proxyServer ssh://user@path-to-ssh-host.example.com:2222`; otherwise, the 67 | connection will be attempted via the default SSH port (port 22). 68 | 69 | The SSH connection establishes a tunnel on a local port in the container (9722) which will forward inbound/outbound traffic through the remote proxy. 70 | The `autossh` utility is used to automatically restart the SSH connection, if needed. 71 | 72 | Only key-based authentication is supported for SSH proxies for now. 
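If you do not already have a dedicated key pair and host key file matching the paths used in the examples above, one possible way to produce them on the host machine (assuming standard OpenSSH tooling; the file names and hostname are just the placeholders used in these examples) is:

```sh
# Generate a dedicated key pair for the proxy connection (no passphrase, for unattended use)
ssh-keygen -t ed25519 -N "" -f ./my-proxy-private-key

# Record the proxy host's public key for use with --sshProxyKnownHostsFile
ssh-keyscan path-to-ssh-host.example.com > ./known_hosts
```

The generated public key (`./my-proxy-private-key.pub`) must then be added to the `user` account's `authorized_keys` on the proxy host before the crawler can connect.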
73 | 74 | 75 | ## Browser Profiles 76 | 77 | The above proxy settings also apply to [Browser Profile Creation](browser-profiles.md), and browser profiles can also be created using proxies, for example: 78 | 79 | ```sh 80 | docker run -p 6080:6080 -p 9223:9223 -v $PWD/crawls/profiles:/crawls/profiles -v $PWD/my-proxy-private-key:/tmp/private-key -v $PWD/known_hosts:/tmp/known_hosts webrecorder/browsertrix-crawler create-login-profile --url https://example.com/ --proxyServer ssh://user@path-to-ssh-host.example.com --sshProxyPrivateKeyFile /tmp/private-key --sshProxyKnownHostsFile /tmp/known_hosts 81 | ``` 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /docs/docs/user-guide/qa.md: -------------------------------------------------------------------------------- 1 | # Quality Assurance 2 | 3 | ## Overview 4 | 5 | Browsertrix Crawler can analyze an existing crawl to compare what the browser encountered on a website during crawling against the replay of the crawl WACZ. The WACZ produced by this analysis run includes additional comparison data (stored as WARC `resource` records) for the pages found during crawling against their replay in ReplayWeb.page. This works along several dimensions, including screenshot, extracted text, and page resource comparisons. 6 | 7 | !!! note 8 | 9 | QA features described on this page are available in Browsertrix Crawler releases 1.1.0 and later. 10 | 11 | ## Getting started 12 | 13 | To be able to run QA on a crawl, you must first have an existing crawl, for example: 14 | 15 | ```sh 16 | docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --collection example-crawl --text to-warc --screenshot view --generateWACZ 17 | ``` 18 | 19 | Note that this crawl must be run with `--generateWACZ` flag as QA requires a WACZ to work with, and also ideally the `--text to-warc` and `--screenshot view` flags as well (see below for more details on comparison dimensions). 20 | 21 | To analyze this crawl, call Browsertrix Crawler with the `qa` entrypoint, passing the original crawl WACZ as the `qaSource`: 22 | 23 | ```sh 24 | docker run -v $PWD/crawls/:/crawls/ -it webrecorder/browsertrix-crawler qa --qaSource /crawls/collections/example-crawl/example-crawl.wacz --collection example-qa --generateWACZ 25 | ``` 26 | 27 | The `qaSource` can be: 28 | - A local WACZ file path or a URL 29 | - A single WACZ or a JSON file containing a list of WACZ files in the `resources` json (Multi-WACZ) 30 | 31 | This assumes an existing crawl that was created in the `example-crawl` collection. 32 | 33 | A new WACZ for the analysis run will be created in the resulting `example-qa` collection. 34 | 35 | By default, the analysis crawl will visit all of the pages (as read from the source WACZ file(s)), however pages can further be limited by adding `--include` and `--exclude` regexes. The `--limit` flag will also limit how many pages are tested. 36 | 37 | The analysis crawl will skip over any non-HTML pages such as PDFs which can be relied upon to be bit-for-bit identical as long as the resource was fully fetched. 38 | 39 | ## Comparison Dimensions 40 | 41 | ### Screenshot Match 42 | 43 | One way to compare crawl and replay is to compare the screenshots of a page while it is being crawled with when it is being replayed. The initial viewport screenshots of each page from the crawl and replay are compared on the basis of pixel value similarity. 
This results in a score between 0 and 1.0 representing the percentage match between the crawl and replay screenshots for each page. The screenshots are stored in `urn:view:` WARC resource records. 44 | 45 | To enable comparison on this dimension, the crawl must be run with at least the `--screenshot view` option. (Additional screenshot options can be added as well.) 46 | 47 | ### Text Match 48 | 49 | Another way to compare the crawl and replay results is to use the text extracted from the HTML. This is done by comparing the extracted text from crawl and replay on the basis of [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance). This results in a score between 0 and 1.0 representing the percentage match between the crawl and replay text for each page. The extracted text is stored in `urn:text:` WARC resource records. 50 | 51 | To enable comparison on this dimension, the original crawl must be run with at least the `--text to-warc` option. (Additional text options can be added as well.) 52 | 53 | ### Resources and Page Info 54 | 55 | The `pageinfo` records produced by the crawl and analysis runs include a JSON document containing information about the resources loaded on each page, such as CSS stylesheets, JavaScript scripts, fonts, images, and videos. The URL, status code, MIME type, and resource type of each resource are saved in the `pageinfo` record for each page. 56 | 57 | Since `pageinfo` records are produced for all crawls, this data is always available. 58 | 59 | ### Comparison Data 60 | 61 | Comparison data is also added to the QA crawl's `pageinfo` records. The comparison data may look as follows: 62 | 63 | ```json 64 | "comparison": { 65 | "screenshotMatch": 0.95, 66 | "textMatch": 0.9, 67 | "resourceCounts": { 68 | "crawlGood": 10, 69 | "crawlBad": 0, 70 | "replayGood": 9, 71 | "replayBad": 1 72 | } 73 | } 74 | ``` 75 | 76 | This data indicates that: 77 | 78 | - When comparing `urn:view:` records for crawl and replay, the screenshots are 95% similar. 79 | - When comparing `urn:text:` records from crawl and replay WACZs, the text is 90% similar. 80 | - When comparing `urn:pageinfo:` resource entries from crawl and replay, the crawl record had 10 good responses (2xx/3xx status code) and 0 bad responses (4xx/5xx status code), while replay had 9 good and 1 bad. 81 | -------------------------------------------------------------------------------- /docs/docs/user-guide/yaml-config.md: -------------------------------------------------------------------------------- 1 | # YAML Crawl Config 2 | 3 | Browsertrix Crawler supports the use of a YAML file to set parameters for a crawl. This can be used by passing a valid YAML file to the `--config` option. 4 | 5 | The YAML file can contain the same parameters as the command-line arguments. If a parameter is set on the command-line and in the YAML file, the value from the command-line will be used. For example, the following starts a crawl with the config in `crawl-config.yaml`: 6 | 7 | ```sh 8 | docker run -v $PWD/crawl-config.yaml:/app/crawl-config.yaml -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --config /app/crawl-config.yaml 9 | ``` 10 | 11 | The config can also be passed via stdin, which can simplify the command. Note that this requires running `docker run` with the `-i` flag.
To read config from stdin, pass `--config stdin`: 12 | 13 | ```sh 14 | cat ./crawl-config.yaml | docker run -i -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --config stdin 15 | ``` 16 | 17 | An example config file (e.g. `crawl-config.yaml`) might contain: 18 | 19 | ```yaml 20 | seeds: 21 | - https://example.com/ 22 | - https://www.iana.org/ 23 | 24 | combineWARC: true 25 | ``` 26 | 27 | The list of seeds can be loaded via an external file by specifying the filename via the `seedFile` config or command-line option. 28 | 29 | ## Seed File 30 | 31 | The URL seed file should be a text file formatted so that each line of the file is a URL string. An example file is available in the GitHub repository's fixtures folder as [urlSeedFile.txt](https://github.com/webrecorder/browsertrix-crawler/blob/main/tests/fixtures/urlSeedFile.txt). 32 | 33 | The seed file must be passed as a volume to the Docker container. Your Docker command should be formatted similar to the following: 34 | 35 | ```sh 36 | docker run -v $PWD/seedFile.txt:/app/seedFile.txt -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --seedFile /app/seedFile.txt 37 | ``` 38 | 39 | ## Per-Seed Settings 40 | 41 | Certain settings such as scope type, scope includes and excludes, and depth can also be configured per-seed directly in the YAML file, for example: 42 | 43 | ```yaml 44 | seeds: 45 | - url: https://webrecorder.net/ 46 | depth: 1 47 | scopeType: "prefix" 48 | ``` 49 | 50 | ## HTTP Auth 51 | 52 | !!! warning "HTTP basic auth credentials are written to the archive" 53 | We recommend exercising caution and only archiving with dedicated archival accounts, changing your password or deleting the account when finished. 54 | 55 | Browsertrix Crawler supports [HTTP Basic Auth](https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication), which can be provided on a per-seed basis as part of the URL, for example: 56 | `--url https://username:password@example.com/`.
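A full invocation passing credentials in the seed URL might look like the sketch below (the credentials and collection name are placeholders):

```sh
docker run -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --url "https://username:password@example.com/" --collection auth-crawl --generateWACZ
```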
57 | 58 | Alternatively, credentials can be added to the `auth` field for each seed: 59 | 60 | ```yaml 61 | seeds: 62 | - url: https://example.com/ 63 | auth: username:password 64 | ``` 65 | -------------------------------------------------------------------------------- /docs/gen-cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CURR=$(dirname "${BASH_SOURCE[0]}") 3 | 4 | out=$CURR/docs/user-guide/cli-options.md 5 | echo "# All Command-Line Options" > $out 6 | echo "" >> $out 7 | echo "The Browsertrix Crawler Docker image currently accepts the following parameters, broken down by entrypoint:" >> $out 8 | echo "" >> $out 9 | echo "## crawler" >> $out 10 | echo "" >> $out 11 | echo '```' >> $out 12 | #node $CURR/../dist/main.js --help >> $out 13 | docker run webrecorder/browsertrix-crawler crawl --help | tail -n +3 >> $out 14 | echo '```' >> $out 15 | echo "" >> $out 16 | echo "## create-login-profile" >> $out 17 | echo "" >> $out 18 | echo '```' >> $out 19 | docker run webrecorder/browsertrix-crawler create-login-profile --help | tail -n +3 >> $out 20 | echo '```' >> $out 21 | -------------------------------------------------------------------------------- /docs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Browsertrix Crawler Docs 2 | repo_url: https://github.com/webrecorder/browsertrix-crawler/ 3 | repo_name: Browsertrix Crawler 4 | edit_uri: edit/main/docs/docs/ 5 | extra_css: 6 | - stylesheets/extra.css 7 | theme: 8 | name: material 9 | custom_dir: docs/overrides 10 | features: 11 | - navigation.sections 12 | - navigation.tabs 13 | - navigation.tabs.sticky 14 | - navigation.instant 15 | - navigation.tracking 16 | - navigation.indexes 17 | - navigation.footer 18 | - content.code.copy 19 | - content.action.edit 20 | - content.tooltips 21 | - search.suggest 22 | palette: 23 | scheme: webrecorder 24 | logo: assets/brand/browsertrix-crawler-white.svg 25 | favicon: assets/brand/browsertrix-crawler-icon-color-dynamic.svg 26 | 27 | icon: 28 | admonition: 29 | note: bootstrap/pencil-fill 30 | abstract: bootstrap/file-earmark-text-fill 31 | info: bootstrap/info-circle-fill 32 | tip: bootstrap/exclamation-circle-fill 33 | success: bootstrap/check-circle-fill 34 | question: bootstrap/question-circle-fill 35 | warning: bootstrap/exclamation-triangle-fill 36 | failure: bootstrap/x-octagon-fill 37 | danger: bootstrap/exclamation-diamond-fill 38 | bug: bootstrap/bug-fill 39 | example: bootstrap/mortarboard-fill 40 | quote: bootstrap/quote 41 | 42 | repo: bootstrap/github 43 | edit: bootstrap/pencil 44 | view: bootstrap/eye 45 | 46 | nav: 47 | - index.md 48 | - Develop: 49 | - develop/index.md 50 | - develop/docs.md 51 | - User Guide: 52 | - user-guide/index.md 53 | - user-guide/outputs.md 54 | - user-guide/exit-codes.md 55 | - user-guide/common-options.md 56 | - user-guide/crawl-scope.md 57 | - user-guide/yaml-config.md 58 | - user-guide/browser-profiles.md 59 | - user-guide/proxies.md 60 | - user-guide/behaviors.md 61 | - user-guide/qa.md 62 | - user-guide/cli-options.md 63 | 64 | markdown_extensions: 65 | - toc: 66 | toc_depth: 4 67 | permalink: true 68 | - pymdownx.highlight: 69 | anchor_linenums: true 70 | - pymdownx.emoji: 71 | emoji_index: !!python/name:material.extensions.emoji.twemoji 72 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 73 | options: 74 | custom_icons: 75 | - docs/overrides/.icons 76 | - admonition 77 | - pymdownx.inlinehilite 78 | - 
pymdownx.details 79 | - pymdownx.superfences 80 | - pymdownx.keys 81 | - def_list 82 | - attr_list 83 | 84 | extra: 85 | generator: false 86 | social: 87 | - icon: bootstrap/globe 88 | link: https://webrecorder.net 89 | - icon: bootstrap/chat-left-text-fill 90 | link: https://forum.webrecorder.net/ 91 | - icon: bootstrap/mastodon 92 | link: https://digipres.club/@webrecorder 93 | - icon: bootstrap/youtube 94 | link: https://www.youtube.com/@webrecorder 95 | copyright: "Creative Commons Attribution 4.0 International (CC BY 4.0)" 96 | 97 | plugins: 98 | - search 99 | -------------------------------------------------------------------------------- /html/createProfile.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 34 | 35 | 36 |
37 | Log in to any site(s) that you want to be part of the crawl profile using 38 | the embedded browser below. When done, click 39 |
40 | 41 |
42 |
43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /html/replay.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 27 | 28 | 29 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /html/screencast.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 16 | 84 | 85 | 86 |
87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "browsertrix-crawler", 3 | "version": "1.6.3", 4 | "main": "browsertrix-crawler", 5 | "type": "module", 6 | "repository": "https://github.com/webrecorder/browsertrix-crawler", 7 | "author": "Ilya Kreymer , Webrecorder Software", 8 | "license": "AGPL-3.0-or-later", 9 | "scripts": { 10 | "tsc": "tsc", 11 | "format": "prettier src/ --check", 12 | "format:fix": "prettier src/ --write", 13 | "lint": "eslint src/", 14 | "lint:fix": "yarn format:fix && eslint src/ --fix", 15 | "test": "yarn node --experimental-vm-modules $(yarn bin jest --bail 1)", 16 | "prepare": "husky install" 17 | }, 18 | "dependencies": { 19 | "@novnc/novnc": "1.4.0", 20 | "@puppeteer/replay": "^3.1.1", 21 | "@webrecorder/wabac": "^2.23.3", 22 | "browsertrix-behaviors": "^0.8.5", 23 | "client-zip": "^2.4.5", 24 | "css-selector-parser": "^3.0.5", 25 | "fetch-socks": "^1.3.0", 26 | "get-folder-size": "^4.0.0", 27 | "husky": "^8.0.3", 28 | "ioredis": "^5.3.2", 29 | "iso-639-1": "^3.1.5", 30 | "js-levenshtein": "^1.1.6", 31 | "js-yaml": "^4.1.0", 32 | "minio": "^7.1.3", 33 | "p-queue": "^7.3.4", 34 | "pixelmatch": "^5.3.0", 35 | "pngjs": "^7.0.0", 36 | "puppeteer-core": "^24.7.2", 37 | "sax": "^1.3.0", 38 | "sharp": "^0.32.6", 39 | "tsc": "^2.0.4", 40 | "undici": "^6.18.2", 41 | "uuid": "8.3.2", 42 | "warcio": "^2.4.4", 43 | "ws": "^7.4.4", 44 | "yargs": "^17.7.2" 45 | }, 46 | "devDependencies": { 47 | "@types/js-levenshtein": "^1.1.3", 48 | "@types/js-yaml": "^4.0.8", 49 | "@types/node": "^20.8.7", 50 | "@types/pixelmatch": "^5.2.6", 51 | "@types/pngjs": "^6.0.4", 52 | "@types/sax": "^1.2.7", 53 | "@types/uuid": "^9.0.6", 54 | "@types/ws": "^8.5.8", 55 | "@typescript-eslint/eslint-plugin": "^6.10.0", 56 | "@typescript-eslint/parser": "^6.10.0", 57 | "eslint": "^8.53.0", 58 | "eslint-config-prettier": "^9.0.0", 59 | "eslint-plugin-react": "^7.22.0", 60 | "http-server": "^14.1.1", 61 | "jest": "^29.7.0", 62 | "lighthouse": "^12.5.1", 63 | "md5": "^2.3.0", 64 | "prettier": "3.0.3", 65 | "puppeteer": "^24.4.0", 66 | "typescript": "^5.5.4" 67 | }, 68 | "jest": { 69 | "transform": {}, 70 | "testTimeout": 90000 71 | }, 72 | "resolutions": { 73 | "wrap-ansi": "7.0.0", 74 | "warcio": "^2.4.4", 75 | "@novnc/novnc": "1.4.0" 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wacz>=0.5.0 2 | -------------------------------------------------------------------------------- /src/main.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S node --experimental-global-webcrypto 2 | 3 | import { logger } from "./util/logger.js"; 4 | import { setExitOnRedisError } from "./util/redis.js"; 5 | import { Crawler } from "./crawler.js"; 6 | import { ReplayCrawler } from "./replaycrawler.js"; 7 | import fs from "node:fs"; 8 | import { ExitCodes, InterruptReason } from "./util/constants.js"; 9 | 10 | let crawler: Crawler | null = null; 11 | 12 | let lastSigInt = 0; 13 | let forceTerm = false; 14 | 15 | async function handleTerminate(signame: string) { 16 | logger.info(`${signame} received...`); 17 | if (!crawler || !crawler.crawlState) { 18 | logger.error("error: no crawler running, exiting"); 19 | process.exit(ExitCodes.GenericError); 20 | 
} 21 | 22 | if (crawler.done) { 23 | logger.info("success: crawler done, exiting"); 24 | process.exit(ExitCodes.Success); 25 | } 26 | 27 | setExitOnRedisError(); 28 | 29 | try { 30 | await crawler.checkCanceled(); 31 | 32 | if (!crawler.interruptReason) { 33 | logger.info("SIGNAL: interrupt request received..."); 34 | crawler.gracefulFinishOnInterrupt(InterruptReason.SignalInterrupted); 35 | } else if (forceTerm || Date.now() - lastSigInt > 200) { 36 | logger.info("SIGNAL: stopping crawl now..."); 37 | await crawler.serializeAndExit(); 38 | } 39 | lastSigInt = Date.now(); 40 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 41 | } catch (e: any) { 42 | logger.error("Error stopping crawl after receiving termination signal", e); 43 | } 44 | } 45 | 46 | process.on("SIGINT", () => handleTerminate("SIGINT")); 47 | 48 | process.on("SIGTERM", () => handleTerminate("SIGTERM")); 49 | 50 | process.on("SIGABRT", async () => { 51 | logger.info("SIGABRT received, will force immediate exit on SIGTERM/SIGINT"); 52 | forceTerm = true; 53 | }); 54 | 55 | if (process.argv[1].endsWith("qa")) { 56 | crawler = new ReplayCrawler(); 57 | } else { 58 | crawler = new Crawler(); 59 | } 60 | 61 | // remove any core dumps which could be taking up space in the working dir 62 | try { 63 | fs.unlinkSync("./core"); 64 | } catch (e) { 65 | //ignore 66 | } 67 | 68 | await crawler.run(); 69 | -------------------------------------------------------------------------------- /src/util/constants.ts: -------------------------------------------------------------------------------- 1 | export const HTML_TYPES = [ 2 | "text/html", 3 | "application/xhtml", 4 | "application/xhtml+xml", 5 | ]; 6 | export const WAIT_UNTIL_OPTS = [ 7 | "load", 8 | "domcontentloaded", 9 | "networkidle0", 10 | "networkidle2", 11 | ]; 12 | 13 | export const SERVICE_WORKER_OPTS = [ 14 | "disabled", 15 | "disabled-if-profile", 16 | "enabled", 17 | ] as const; 18 | 19 | export type ServiceWorkerOpt = (typeof SERVICE_WORKER_OPTS)[number]; 20 | 21 | export const DETECT_SITEMAP = ""; 22 | 23 | export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"]; 24 | 25 | export enum BxFunctionBindings { 26 | BehaviorLogFunc = "__bx_log", 27 | AddLinkFunc = "__bx_addLink", 28 | FetchFunc = "__bx_fetch", 29 | AddToSeenSet = "__bx_addSet", 30 | 31 | InitFlow = "__bx_initFlow", 32 | NextFlowStep = "__bx_nextFlowStep", 33 | } 34 | 35 | export const MAX_DEPTH = 1000000; 36 | export const DEFAULT_MAX_RETRIES = 2; 37 | 38 | export const FETCH_HEADERS_TIMEOUT_SECS = 30; 39 | export const PAGE_OP_TIMEOUT_SECS = 5; 40 | export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30; 41 | 42 | export type ExtractSelector = { 43 | selector: string; 44 | extract: string; 45 | attrOnly: boolean; 46 | }; 47 | 48 | export const DEFAULT_SELECTORS: ExtractSelector[] = [ 49 | { 50 | selector: "a[href]", 51 | extract: "href", 52 | attrOnly: false, 53 | }, 54 | ]; 55 | 56 | export const DEFAULT_CRAWL_ID_TEMPLATE = "@hostname-@id"; 57 | 58 | export const BEHAVIOR_TYPES = [ 59 | "autoplay", 60 | "autofetch", 61 | "autoscroll", 62 | "autoclick", 63 | "siteSpecific", 64 | ]; 65 | 66 | export const DISPLAY = ":99"; 67 | 68 | export enum ExitCodes { 69 | Success = 0, 70 | GenericError = 1, 71 | Failed = 9, 72 | OutOfSpace = 3, 73 | BrowserCrashed = 10, 74 | SignalInterrupted = 11, 75 | FailedLimit = 12, 76 | SignalInterruptedForce = 13, 77 | SizeLimit = 14, 78 | TimeLimit = 15, 79 | DiskUtilization = 16, 80 | Fatal = 17, 81 | ProxyError = 21, 82 | } 83 | 84 | export enum 
InterruptReason { 85 | SizeLimit = 1, 86 | TimeLimit = 2, 87 | FailedLimit = 3, 88 | DiskUtilization = 4, 89 | BrowserCrashed = 5, 90 | SignalInterrupted = 6, 91 | CrawlPaused = 7, 92 | } 93 | -------------------------------------------------------------------------------- /src/util/healthcheck.ts: -------------------------------------------------------------------------------- 1 | import http from "http"; 2 | import url from "url"; 3 | import { logger } from "./logger.js"; 4 | import { Browser } from "./browser.js"; 5 | 6 | // =========================================================================== 7 | export class HealthChecker { 8 | port: number; 9 | errorThreshold: number; 10 | healthServer: http.Server; 11 | browser: Browser; 12 | 13 | updater: (() => Promise) | null; 14 | 15 | errorCount = 0; 16 | 17 | constructor( 18 | port: number, 19 | errorThreshold: number, 20 | browser: Browser, 21 | updater: (() => Promise) | null = null, 22 | ) { 23 | this.port = port; 24 | this.browser = browser; 25 | this.errorThreshold = errorThreshold; 26 | 27 | this.healthServer = http.createServer((...args) => 28 | this.healthCheck(...args), 29 | ); 30 | logger.info(`Healthcheck server started on ${port}`, {}, "healthcheck"); 31 | this.healthServer.listen(port); 32 | 33 | this.updater = updater; 34 | } 35 | 36 | async healthCheck(req: http.IncomingMessage, res: http.ServerResponse) { 37 | const pathname = req.url ? url.parse(req.url).pathname : ""; 38 | switch (pathname) { 39 | case "/healthz": 40 | if (this.errorCount < this.errorThreshold && !this.browser.crashed) { 41 | logger.debug( 42 | `health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`, 43 | {}, 44 | "healthcheck", 45 | ); 46 | res.writeHead(200); 47 | res.end(); 48 | } 49 | if (this.updater) { 50 | this.updater().catch((e) => 51 | logger.warn("Healthcheck Updater failed", e, "healthcheck"), 52 | ); 53 | } 54 | return; 55 | } 56 | 57 | logger.error( 58 | `health check failed: ${this.errorCount} >= ${this.errorThreshold}`, 59 | {}, 60 | "healthcheck", 61 | ); 62 | res.writeHead(503); 63 | res.end(); 64 | } 65 | 66 | resetErrors() { 67 | if (this.errorCount > 0) { 68 | logger.info( 69 | `Page loaded, resetting error count ${this.errorCount} to 0`, 70 | {}, 71 | "healthcheck", 72 | ); 73 | this.errorCount = 0; 74 | } 75 | } 76 | 77 | incError() { 78 | this.errorCount++; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/util/logger.ts: -------------------------------------------------------------------------------- 1 | // =========================================================================== 2 | // to fix serialization of regexes for logging purposes 3 | 4 | import { Writable } from "node:stream"; 5 | import { RedisCrawlState } from "./state.js"; 6 | import { ExitCodes } from "./constants.js"; 7 | 8 | // RegExp.prototype.toJSON = RegExp.prototype.toString; 9 | Object.defineProperty(RegExp.prototype, "toJSON", { 10 | value: RegExp.prototype.toString, 11 | }); 12 | 13 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 14 | export type LogDetails = Record; 15 | 16 | // =========================================================================== 17 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 18 | export function formatErr(e: unknown): Record { 19 | if (e instanceof Error) { 20 | return { type: "exception", message: e.message, stack: e.stack || "" }; 21 | } else if (typeof e === "object") { 22 | return e || {}; 23 | } else { 24 | return { 
message: (e as object) + "" }; 25 | } 26 | } 27 | 28 | // =========================================================================== 29 | export const LOG_CONTEXT_TYPES = [ 30 | "general", 31 | "worker", 32 | "recorder", 33 | "recorderNetwork", 34 | "writer", 35 | "state", 36 | "redis", 37 | "storage", 38 | "text", 39 | "exclusion", 40 | "screenshots", 41 | "screencast", 42 | "originOverride", 43 | "healthcheck", 44 | "browser", 45 | "blocking", 46 | "behavior", 47 | "behaviorScript", 48 | "behaviorScriptCustom", 49 | "jsError", 50 | "fetch", 51 | "pageStatus", 52 | "memoryStatus", 53 | "crawlStatus", 54 | "links", 55 | "sitemap", 56 | "wacz", 57 | "replay", 58 | "proxy", 59 | ] as const; 60 | 61 | export type LogContext = (typeof LOG_CONTEXT_TYPES)[number]; 62 | 63 | export const DEFAULT_EXCLUDE_LOG_CONTEXTS: LogContext[] = [ 64 | "recorderNetwork", 65 | "jsError", 66 | "screencast", 67 | ]; 68 | 69 | // =========================================================================== 70 | class Logger { 71 | logStream: Writable | null = null; 72 | debugLogging = false; 73 | logErrorsToRedis = false; 74 | logBehaviorsToRedis = false; 75 | logLevels: string[] = []; 76 | contexts: LogContext[] = []; 77 | excludeContexts: LogContext[] = []; 78 | crawlState?: RedisCrawlState | null = null; 79 | fatalExitCode: ExitCodes = ExitCodes.Fatal; 80 | 81 | setDefaultFatalExitCode(exitCode: number) { 82 | this.fatalExitCode = exitCode; 83 | } 84 | 85 | setExternalLogStream(logFH: Writable | null) { 86 | this.logStream = logFH; 87 | } 88 | 89 | setDebugLogging(debugLog: boolean) { 90 | this.debugLogging = debugLog; 91 | } 92 | 93 | setLogErrorsToRedis(logErrorsToRedis: boolean) { 94 | this.logErrorsToRedis = logErrorsToRedis; 95 | } 96 | 97 | setLogBehaviorsToRedis(logBehaviorsToRedis: boolean) { 98 | this.logBehaviorsToRedis = logBehaviorsToRedis; 99 | } 100 | 101 | setLogLevel(logLevels: string[]) { 102 | this.logLevels = logLevels; 103 | } 104 | 105 | setContext(contexts: LogContext[]) { 106 | this.contexts = contexts; 107 | } 108 | 109 | setExcludeContext(contexts: LogContext[]) { 110 | this.excludeContexts = contexts; 111 | } 112 | 113 | setCrawlState(crawlState: RedisCrawlState) { 114 | this.crawlState = crawlState; 115 | } 116 | 117 | logAsJSON( 118 | message: string, 119 | dataUnknown: unknown, 120 | context: LogContext, 121 | logLevel = "info", 122 | ) { 123 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 124 | const data: Record = formatErr(dataUnknown); 125 | 126 | if (this.logLevels.length) { 127 | if (this.logLevels.indexOf(logLevel) < 0) { 128 | return; 129 | } 130 | } 131 | 132 | if (this.contexts.length) { 133 | if (this.contexts.indexOf(context) < 0) { 134 | return; 135 | } 136 | } 137 | 138 | if (this.excludeContexts.length) { 139 | if (this.excludeContexts.indexOf(context) >= 0) { 140 | return; 141 | } 142 | } 143 | 144 | const dataToLog = { 145 | timestamp: new Date().toISOString(), 146 | logLevel: logLevel, 147 | context: context, 148 | message: message, 149 | details: data, 150 | }; 151 | const string = JSON.stringify(dataToLog); 152 | console.log(string); 153 | try { 154 | if (this.logStream) { 155 | this.logStream.write(string + "\n"); 156 | } 157 | } catch (e) { 158 | // 159 | } 160 | 161 | const redisErrorLogLevels = ["error", "fatal"]; 162 | if ( 163 | this.logErrorsToRedis && 164 | this.crawlState && 165 | redisErrorLogLevels.includes(logLevel) 166 | ) { 167 | this.crawlState.logError(string).catch(() => {}); 168 | } 169 | 170 | const redisBehaviorLogLevels = 
["info", "warn", "error"]; 171 | const behaviorContexts = ["behavior", "behaviorScript"]; 172 | if ( 173 | this.logBehaviorsToRedis && 174 | this.crawlState && 175 | ((behaviorContexts.includes(context) && 176 | redisBehaviorLogLevels.includes(logLevel)) || 177 | //always include behaviorScriptCustom 178 | context === "behaviorScriptCustom") 179 | ) { 180 | this.crawlState.logBehavior(string).catch(() => {}); 181 | } 182 | } 183 | 184 | info(message: string, data: unknown = {}, context: LogContext = "general") { 185 | this.logAsJSON(message, data, context); 186 | } 187 | 188 | error(message: string, data: unknown = {}, context: LogContext = "general") { 189 | this.logAsJSON(message, data, context, "error"); 190 | } 191 | 192 | warn(message: string, data: unknown = {}, context: LogContext = "general") { 193 | this.logAsJSON(message, data, context, "warn"); 194 | } 195 | 196 | debug(message: string, data: unknown = {}, context: LogContext = "general") { 197 | if (this.debugLogging) { 198 | this.logAsJSON(message, data, context, "debug"); 199 | } 200 | } 201 | 202 | fatal( 203 | message: string, 204 | data = {}, 205 | context: LogContext = "general", 206 | exitCode = ExitCodes.Success, 207 | ) { 208 | exitCode = exitCode || this.fatalExitCode; 209 | this.logAsJSON(`${message}. Quitting`, data, context, "fatal"); 210 | 211 | if (this.crawlState) { 212 | this.crawlState 213 | .setStatus("failed") 214 | .catch(() => {}) 215 | .finally(process.exit(exitCode)); 216 | } else { 217 | process.exit(exitCode); 218 | } 219 | } 220 | } 221 | 222 | export const logger = new Logger(); 223 | -------------------------------------------------------------------------------- /src/util/originoverride.ts: -------------------------------------------------------------------------------- 1 | import { HTTPRequest, Page } from "puppeteer-core"; 2 | import { formatErr, logger } from "./logger.js"; 3 | import { Browser } from "./browser.js"; 4 | 5 | import { fetch } from "undici"; 6 | import { getProxyDispatcher } from "./proxy.js"; 7 | 8 | export class OriginOverride { 9 | originOverride: { origUrl: URL; destUrl: URL }[]; 10 | 11 | constructor(originOverride: string[]) { 12 | this.originOverride = originOverride.map((override) => { 13 | const [orig, dest] = override.split("="); 14 | const origUrl = new URL(orig); 15 | const destUrl = new URL(dest); 16 | 17 | return { origUrl, destUrl }; 18 | }); 19 | } 20 | 21 | async initPage(browser: Browser, page: Page) { 22 | const onRequest = async (request: HTTPRequest) => { 23 | try { 24 | const url = request.url(); 25 | 26 | let newUrl = null; 27 | let orig = null; 28 | 29 | for (const { origUrl, destUrl } of this.originOverride) { 30 | if (url.startsWith(origUrl.origin)) { 31 | newUrl = destUrl.origin + url.slice(origUrl.origin.length); 32 | orig = origUrl; 33 | break; 34 | } 35 | } 36 | 37 | if (!newUrl || !orig) { 38 | await request.continue({}, -1); 39 | return; 40 | } 41 | 42 | const headers = new Headers(request.headers()); 43 | 44 | headers.set("host", orig.host); 45 | if (headers.get("origin")) { 46 | headers.set("origin", orig.origin); 47 | } 48 | 49 | const resp = await fetch(newUrl, { 50 | headers, 51 | dispatcher: getProxyDispatcher(), 52 | }); 53 | 54 | const body = Buffer.from(await resp.arrayBuffer()); 55 | const respHeaders = Object.fromEntries(resp.headers); 56 | const status = resp.status; 57 | 58 | logger.debug( 59 | "Origin overridden", 60 | { orig: url, dest: newUrl, status, body: body.length }, 61 | "originOverride", 62 | ); 63 | 64 | await 
request.respond({ body, headers: respHeaders, status }, -1); 65 | } catch (e) { 66 | logger.warn( 67 | "Error overriding origin", 68 | { ...formatErr(e), url: page.url() }, 69 | "originOverride", 70 | ); 71 | await request.continue({}, -1); 72 | } 73 | }; 74 | browser.interceptRequest(page, onRequest); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/util/redis.ts: -------------------------------------------------------------------------------- 1 | import { Redis } from "ioredis"; 2 | import { logger } from "./logger.js"; 3 | 4 | const error = console.error; 5 | 6 | let lastLogTime = 0; 7 | let exitOnError = false; 8 | 9 | // log only once every 10 seconds 10 | const REDIS_ERROR_LOG_INTERVAL_SECS = 10000; 11 | 12 | console.error = function (...args) { 13 | if ( 14 | typeof args[0] === "string" && 15 | args[0].indexOf("[ioredis] Unhandled error event") === 0 16 | ) { 17 | const now = Date.now(); 18 | 19 | if (now - lastLogTime > REDIS_ERROR_LOG_INTERVAL_SECS) { 20 | if (lastLogTime && exitOnError) { 21 | logger.fatal("Crawl interrupted, redis gone, exiting", {}, "redis"); 22 | } 23 | logger.warn("ioredis error", { error: args[0] }, "redis"); 24 | lastLogTime = now; 25 | } 26 | return; 27 | } 28 | error.call(console, ...args); 29 | }; 30 | 31 | export async function initRedis(url: string) { 32 | const redis = new Redis(url, { lazyConnect: true }); 33 | await redis.connect(); 34 | return redis; 35 | } 36 | 37 | export function setExitOnRedisError() { 38 | exitOnError = true; 39 | } 40 | -------------------------------------------------------------------------------- /src/util/replayserver.ts: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import fsp from "fs/promises"; 3 | import http, { IncomingMessage, ServerResponse } from "http"; 4 | import path from "path"; 5 | 6 | const replayHTML = fs.readFileSync( 7 | new URL("../../html/replay.html", import.meta.url), 8 | { encoding: "utf8" }, 9 | ); 10 | 11 | const swJS = fs.readFileSync(new URL("../../html/rwp/sw.js", import.meta.url), { 12 | encoding: "utf8", 13 | }); 14 | 15 | const uiJS = fs.readFileSync(new URL("../../html/rwp/ui.js", import.meta.url), { 16 | encoding: "utf8", 17 | }); 18 | 19 | const adblockGZ = fs.readFileSync( 20 | new URL("../../html/rwp/adblock.gz", import.meta.url), 21 | {}, 22 | ); 23 | 24 | // ============================================================================ 25 | const PORT = 9990; 26 | 27 | // ============================================================================ 28 | export class ReplayServer { 29 | sourceUrl: string; 30 | origFileSource: string | null; 31 | sourceContentType: string | null; 32 | sourceSize?: number; 33 | 34 | constructor(sourceUrlOrFile: string) { 35 | if ( 36 | sourceUrlOrFile.startsWith("http://") || 37 | sourceUrlOrFile.startsWith("https://") 38 | ) { 39 | this.sourceUrl = sourceUrlOrFile; 40 | this.origFileSource = null; 41 | this.sourceContentType = null; 42 | } else { 43 | this.origFileSource = sourceUrlOrFile; 44 | const ext = path.extname(sourceUrlOrFile); 45 | this.sourceUrl = `/source${ext}`; 46 | 47 | switch (ext) { 48 | case ".wacz": 49 | this.sourceContentType = "application/wacz+zip"; 50 | break; 51 | 52 | case ".json": 53 | this.sourceContentType = "application/json"; 54 | break; 55 | 56 | default: 57 | this.sourceContentType = "application/octet-stream"; 58 | } 59 | } 60 | const httpServer = http.createServer((req, res) => 61 | this.handleRequest(req, 
res), 62 | ); 63 | httpServer.listen(PORT); 64 | } 65 | 66 | get homePage() { 67 | return `http://localhost:${PORT}/`; 68 | } 69 | 70 | async handleRequest(request: IncomingMessage, response: ServerResponse) { 71 | const parsedUrl = new URL( 72 | request.url || "", 73 | `http://${request.headers.host}`, 74 | ); 75 | const pathname = parsedUrl.pathname; 76 | 77 | switch (pathname) { 78 | case "/": 79 | response.writeHead(200, { "Content-Type": "text/html" }); 80 | response.end(replayHTML.replace("$SOURCE", this.sourceUrl)); 81 | return; 82 | 83 | case "/sw.js": 84 | case "/replay/sw.js": 85 | response.writeHead(200, { "Content-Type": "application/javascript" }); 86 | response.end(swJS); 87 | return; 88 | 89 | case "/ui.js": 90 | response.writeHead(200, { "Content-Type": "application/javascript" }); 91 | response.end(uiJS); 92 | return; 93 | 94 | case "/replay/adblock/adblock.gz": 95 | response.writeHead(200, { "Content-Type": "application/gzip" }); 96 | response.end(adblockGZ); 97 | return; 98 | 99 | case this.sourceUrl: 100 | if (this.sourceContentType && this.origFileSource) { 101 | if (!this.sourceSize) { 102 | const { size } = await fsp.stat(this.origFileSource); 103 | this.sourceSize = size; 104 | } 105 | const { opts, status, contentRange, contentLength } = 106 | this.getRespOptsForRequest(request, this.sourceSize); 107 | response.writeHead(status, { 108 | "Accept-Ranges": "bytes", 109 | "Content-Type": this.sourceContentType, 110 | "Content-Length": contentLength, 111 | "Content-Range": contentRange, 112 | }); 113 | //console.log(request.method, contentRange, opts); 114 | if (request.method === "GET") { 115 | fs.createReadStream(this.origFileSource, opts).pipe(response); 116 | } else { 117 | response.end(); 118 | } 119 | break; 120 | } 121 | // falls through 122 | 123 | default: 124 | response.writeHead(404, { "Content-Type": "application/json" }); 125 | response.end(JSON.stringify({ error: "not_found" })); 126 | return; 127 | } 128 | } 129 | 130 | getRespOptsForRequest(request: IncomingMessage, total: number) { 131 | const range = request.headers["range"] || ""; 132 | const array = range.match(/bytes=(\d+)?-(\d*)/); 133 | let contentRange = undefined; 134 | 135 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 136 | const opts: Record = {}; 137 | if (array) { 138 | opts.start = parseInt(array[1]); 139 | opts.end = parseInt(array[2]); 140 | // negative value, subtract from end 141 | if (isNaN(opts.start) && !isNaN(opts.end)) { 142 | opts.start = total - opts.end; 143 | opts.end = total - 1; 144 | } else if (isNaN(opts.end)) { 145 | opts.end = total - 1; 146 | } 147 | contentRange = `bytes ${opts.start}-${opts.end}/${total}`; 148 | return { 149 | status: 206, 150 | opts, 151 | contentRange, 152 | contentLength: opts.end - opts.start + 1, 153 | }; 154 | } 155 | return { status: 200, opts, contentRange, contentLength: total }; 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/util/screenshots.ts: -------------------------------------------------------------------------------- 1 | import sharp from "sharp"; 2 | 3 | import { logger, formatErr } from "./logger.js"; 4 | import { Browser } from "./browser.js"; 5 | import { Page } from "puppeteer-core"; 6 | import { PageState } from "./state.js"; 7 | import { WARCWriter } from "./warcwriter.js"; 8 | 9 | // ============================================================================ 10 | 11 | type ScreenShotDesc = { 12 | type: "png" | "jpeg"; 13 | omitBackground: boolean; 
14 | fullPage: boolean; 15 | encoding: "binary"; 16 | }; 17 | 18 | type ScreeshotType = "view" | "thumbnail" | "fullPage" | "fullPageFinal"; 19 | 20 | export const screenshotTypes: Record = { 21 | view: { 22 | type: "png", 23 | omitBackground: true, 24 | fullPage: false, 25 | encoding: "binary", 26 | }, 27 | thumbnail: { 28 | type: "jpeg", 29 | omitBackground: true, 30 | fullPage: false, 31 | encoding: "binary", 32 | }, 33 | fullPage: { 34 | type: "png", 35 | omitBackground: true, 36 | fullPage: true, 37 | encoding: "binary", 38 | }, 39 | fullPageFinal: { 40 | type: "png", 41 | omitBackground: true, 42 | fullPage: true, 43 | encoding: "binary", 44 | }, 45 | }; 46 | 47 | export type ScreenshotOpts = { 48 | browser: Browser; 49 | page: Page; 50 | url: string; 51 | writer: WARCWriter; 52 | }; 53 | 54 | export class Screenshots { 55 | browser: Browser; 56 | page: Page; 57 | url: string; 58 | writer: WARCWriter; 59 | 60 | constructor({ browser, page, writer, url }: ScreenshotOpts) { 61 | this.browser = browser; 62 | this.page = page; 63 | this.url = url; 64 | this.writer = writer; 65 | } 66 | 67 | async take( 68 | screenshotType: ScreeshotType = "view", 69 | state: PageState | null = null, 70 | ) { 71 | try { 72 | if (screenshotType !== "fullPage" && screenshotType !== "fullPageFinal") { 73 | await this.browser.setViewport(this.page, { 74 | width: 1920, 75 | height: 1080, 76 | }); 77 | } 78 | const options = screenshotTypes[screenshotType]; 79 | const screenshotBuffer = Buffer.from(await this.page.screenshot(options)); 80 | if (state && screenshotType === "view") { 81 | state.screenshotView = screenshotBuffer; 82 | } 83 | this.writer.writeNewResourceRecord( 84 | { 85 | buffer: screenshotBuffer, 86 | resourceType: screenshotType, 87 | contentType: "image/" + options.type, 88 | url: this.url, 89 | }, 90 | { 91 | resource: "screenshot", 92 | type: screenshotType, 93 | url: this.url, 94 | filename: this.writer.filename, 95 | }, 96 | "screenshots", 97 | ); 98 | // logger.info( 99 | // `Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.writer.filename}`, 100 | // ); 101 | } catch (e) { 102 | logger.error( 103 | "Taking screenshot failed", 104 | { page: this.url, type: screenshotType, ...formatErr(e) }, 105 | "screenshots", 106 | ); 107 | } 108 | } 109 | 110 | async takeFullPage() { 111 | await this.take("fullPage"); 112 | } 113 | 114 | async takeFullPageFinal() { 115 | await this.take("fullPageFinal"); 116 | } 117 | 118 | async takeThumbnail() { 119 | const screenshotType = "thumbnail"; 120 | try { 121 | await this.browser.setViewport(this.page, { width: 1920, height: 1080 }); 122 | const options = screenshotTypes[screenshotType]; 123 | const screenshotBuffer = await this.page.screenshot(options); 124 | const thumbnailBuffer = await sharp(screenshotBuffer) 125 | // 16:9 thumbnail 126 | .resize(640, 360) 127 | .toBuffer(); 128 | this.writer.writeNewResourceRecord( 129 | { 130 | buffer: thumbnailBuffer, 131 | resourceType: screenshotType, 132 | contentType: "image/" + options.type, 133 | url: this.url, 134 | }, 135 | { 136 | resource: "screenshot", 137 | type: screenshotType, 138 | url: this.url, 139 | filename: this.writer.filename, 140 | }, 141 | "screenshots", 142 | ); 143 | } catch (e) { 144 | logger.error( 145 | "Taking screenshot failed", 146 | { page: this.url, type: screenshotType, ...formatErr(e) }, 147 | "screenshots", 148 | ); 149 | } 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /src/util/textextract.ts: 
-------------------------------------------------------------------------------- 1 | import { logger } from "./logger.js"; 2 | import { CDPSession, Protocol } from "puppeteer-core"; 3 | import { WARCWriter } from "./warcwriter.js"; 4 | 5 | // ============================================================================ 6 | type TextExtractOpts = { 7 | url: string; 8 | writer: WARCWriter; 9 | skipDocs: number; 10 | }; 11 | 12 | // ============================================================================ 13 | export abstract class BaseTextExtract { 14 | cdp: CDPSession; 15 | lastText: string | null = null; 16 | text: string | null = null; 17 | skipDocs: number = 0; 18 | writer: WARCWriter; 19 | url: string; 20 | 21 | constructor(cdp: CDPSession, { writer, skipDocs, url }: TextExtractOpts) { 22 | this.writer = writer; 23 | this.cdp = cdp; 24 | this.url = url; 25 | this.skipDocs = skipDocs || 0; 26 | } 27 | 28 | async extractAndStoreText( 29 | resourceType: string, 30 | ignoreIfMatchesLast = false, 31 | saveToWarc = false, 32 | ) { 33 | try { 34 | const text = await this.doGetText(); 35 | 36 | if (ignoreIfMatchesLast && text === this.lastText) { 37 | this.lastText = this.text; 38 | logger.debug( 39 | "Skipping, extracted text unchanged from last extraction", 40 | { url: this.url }, 41 | "text", 42 | ); 43 | return { changed: false, text }; 44 | } 45 | if (saveToWarc) { 46 | this.writer.writeNewResourceRecord( 47 | { 48 | buffer: new TextEncoder().encode(text), 49 | resourceType, 50 | contentType: "text/plain", 51 | url: this.url, 52 | }, 53 | { 54 | resource: "text", 55 | type: resourceType, 56 | url: this.url, 57 | filename: this.writer.filename, 58 | }, 59 | "text", 60 | ); 61 | } 62 | 63 | this.lastText = text; 64 | return { changed: true, text }; 65 | } catch (e) { 66 | logger.debug("Error extracting text", e, "text"); 67 | return { changed: false, text: null }; 68 | } 69 | } 70 | 71 | abstract doGetText(): Promise; 72 | } 73 | 74 | // ============================================================================ 75 | export class TextExtractViaSnapshot extends BaseTextExtract { 76 | async doGetText(): Promise { 77 | const result = await this.cdp.send("DOMSnapshot.captureSnapshot", { 78 | computedStyles: [], 79 | }); 80 | return this.parseTextFromDOMSnapshot(result); 81 | } 82 | 83 | parseTextFromDOMSnapshot( 84 | result: Protocol.DOMSnapshot.CaptureSnapshotResponse, 85 | ): string { 86 | const TEXT_NODE = 3; 87 | const ELEMENT_NODE = 1; 88 | 89 | const SKIPPED_NODES = [ 90 | "SCRIPT", 91 | "STYLE", 92 | "HEADER", 93 | "FOOTER", 94 | "BANNER-DIV", 95 | "NOSCRIPT", 96 | "TITLE", 97 | ]; 98 | 99 | const { strings, documents } = result; 100 | 101 | const accum: string[] = []; 102 | 103 | for (const doc of documents.slice(this.skipDocs)) { 104 | const nodeValues = doc.nodes.nodeValue || []; 105 | const nodeNames = doc.nodes.nodeName || []; 106 | const nodeTypes = doc.nodes.nodeType || []; 107 | const parentIndex = doc.nodes.parentIndex || []; 108 | 109 | for (let i = 0; i < nodeValues.length; i++) { 110 | if (nodeValues[i] === -1) { 111 | continue; 112 | } 113 | 114 | if (nodeTypes[i] === TEXT_NODE) { 115 | const pi = parentIndex[i]; 116 | if (pi >= 0 && nodeTypes[pi] === ELEMENT_NODE) { 117 | const name = strings[nodeNames[pi]]; 118 | 119 | if (!SKIPPED_NODES.includes(name)) { 120 | const value = strings[nodeValues[i]].trim(); 121 | if (value) { 122 | accum.push(value as string); 123 | } 124 | } 125 | } 126 | } 127 | } 128 | } 129 | 130 | return accum.join("\n"); 131 | } 132 | } 133 | 134 | 
// ============================================================================ 135 | export class TextExtractViaDocument extends BaseTextExtract { 136 | async doGetText(): Promise { 137 | const result = await this.cdp.send("DOM.getDocument", { 138 | depth: -1, 139 | pierce: true, 140 | }); 141 | return this.parseTextFromDOM(result); 142 | } 143 | 144 | parseTextFromDOM(dom: Protocol.DOM.GetDocumentResponse): string { 145 | const accum: string[] = []; 146 | const metadata = {}; 147 | 148 | this.parseText(dom.root, metadata, accum); 149 | 150 | return accum.join("\n"); 151 | } 152 | 153 | parseText( 154 | node: Protocol.DOM.Node, 155 | metadata: Record | null, 156 | accum: string[], 157 | ) { 158 | const SKIPPED_NODES = [ 159 | "head", 160 | "script", 161 | "style", 162 | "header", 163 | "footer", 164 | "banner-div", 165 | "noscript", 166 | ]; 167 | const EMPTY_LIST: Protocol.DOM.Node[] = []; 168 | const TEXT = "#text"; 169 | const TITLE = "title"; 170 | 171 | const name = node.nodeName.toLowerCase(); 172 | 173 | if (SKIPPED_NODES.includes(name)) { 174 | return; 175 | } 176 | 177 | const children = node.children || EMPTY_LIST; 178 | 179 | if (name === TEXT) { 180 | const value = node.nodeValue ? node.nodeValue.trim() : ""; 181 | if (value) { 182 | accum.push(value); 183 | } 184 | } else if (name === TITLE) { 185 | const title: string[] = []; 186 | 187 | for (const child of children) { 188 | this.parseText(child, null, title); 189 | } 190 | 191 | if (metadata) { 192 | metadata.title = title.join(" "); 193 | } else { 194 | accum.push(title.join(" ")); 195 | } 196 | } else { 197 | for (const child of children) { 198 | this.parseText(child, metadata, accum); 199 | } 200 | 201 | if (node.contentDocument) { 202 | this.parseText(node.contentDocument, null, accum); 203 | } 204 | } 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /src/util/timing.ts: -------------------------------------------------------------------------------- 1 | import { LogContext, logger } from "./logger.js"; 2 | 3 | export function sleep(seconds: number) { 4 | return new Promise((resolve) => setTimeout(resolve, seconds * 1000)); 5 | } 6 | 7 | // TODO: Fix this the next time the file is edited. 8 | 9 | export function timedRun( 10 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 11 | promise: Promise, 12 | seconds: number, 13 | message = "Promise timed out", 14 | logDetails = {}, 15 | context: LogContext = "general", 16 | isWarn = false, 17 | ) { 18 | // return Promise return value or log error if timeout is reached first 19 | const timeout = seconds * 1000; 20 | 21 | let tm: NodeJS.Timeout; 22 | 23 | const rejectPromiseOnTimeout = (timeout: number) => { 24 | return new Promise((resolve, reject) => { 25 | tm = setTimeout(() => reject("timeout reached"), timeout); 26 | }); 27 | }; 28 | 29 | return Promise.race([promise, rejectPromiseOnTimeout(timeout)]) 30 | .catch((err) => { 31 | if (err === "timeout reached") { 32 | const logFunc = isWarn ? 
logger.warn : logger.error; 33 | logFunc.call( 34 | logger, 35 | message, 36 | { seconds: seconds, ...logDetails }, 37 | context, 38 | ); 39 | } else { 40 | //logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context); 41 | throw err; 42 | } 43 | }) 44 | .finally(() => clearTimeout(tm)); 45 | } 46 | 47 | export function secondsElapsed(startTime: number, nowDate: Date | null = null) { 48 | nowDate = nowDate || new Date(); 49 | 50 | return (nowDate.getTime() - startTime) / 1000; 51 | } 52 | 53 | export function timestampNow() { 54 | return new Date().toISOString().replace(/[^\d]/g, ""); 55 | } 56 | -------------------------------------------------------------------------------- /test-setup.js: -------------------------------------------------------------------------------- 1 | import { jest } from "@jest/globals"; 2 | 3 | global.jest = jest; 4 | -------------------------------------------------------------------------------- /tests/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/d2a6aa9805fa08c9a51b43005e0a562a032fd78a/tests/.DS_Store -------------------------------------------------------------------------------- /tests/adblockrules.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import yaml from "js-yaml"; 4 | 5 | function runCrawl(name, config, commandExtra = "") { 6 | config.generateCDX = true; 7 | config.depth = 0; 8 | config.collection = name; 9 | 10 | const configYaml = yaml.dump(config); 11 | 12 | try { 13 | const proc = child_process.execSync( 14 | `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, 15 | { input: configYaml, stdin: "inherit", encoding: "utf8" }, 16 | ); 17 | 18 | //console.log(proc); 19 | } catch (error) { 20 | console.log(error); 21 | } 22 | } 23 | 24 | function doesCDXContain(coll, value) { 25 | const data = fs.readFileSync( 26 | `test-crawls/collections/${coll}/indexes/index.cdxj`, 27 | ); 28 | return data.indexOf(value) >= 0; 29 | } 30 | 31 | // Test Disabled for Brave -- should always be blocked, but seeing inconsistent ci behavior 32 | /* 33 | test("test crawl without ad block for specific URL", () => { 34 | const config = { 35 | "url": "https://www.mozilla.org/en-US/firefox/", 36 | "pageExtraDelay": 10 37 | }; 38 | 39 | runCrawl("adblock-no-block", config); 40 | 41 | // without ad blocking, URL with googletagmanager is included 42 | expect(doesCDXContain("adblock-no-block", "www.googletagmanager.com")).toBe(true); 43 | }); 44 | */ 45 | 46 | test("testcrawl with ad block for specific URL", () => { 47 | const config = { 48 | url: "https://www.mozilla.org/en-US/firefox/", 49 | blockAds: true, 50 | }; 51 | 52 | runCrawl("adblock-block", config); 53 | 54 | expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe( 55 | false, 56 | ); 57 | }); 58 | -------------------------------------------------------------------------------- /tests/add-exclusion.test.js: -------------------------------------------------------------------------------- 1 | import { exec } from "child_process"; 2 | import Redis from "ioredis"; 3 | 4 | function sleep(ms) { 5 | return new Promise((resolve) => setTimeout(resolve, ms)); 6 | } 7 | 8 | test("dynamically add exclusion while crawl is running", async () => { 9 | let callback = null; 10 | 11 | const p = new Promise((resolve) 
=> { 12 | callback = (error, stdout, stderr) => { 13 | resolve({ error, stdout, stderr }); 14 | }; 15 | }); 16 | 17 | try { 18 | exec( 19 | "docker run -p 36382:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://old.webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis", 20 | { shell: "/bin/bash" }, 21 | callback, 22 | ); 23 | } catch (error) { 24 | console.log(error); 25 | } 26 | 27 | await sleep(3000); 28 | 29 | const redis = new Redis("redis://127.0.0.1:36382/0", { lazyConnect: true, retryStrategy: () => null }) 30 | 31 | await redis.connect(); 32 | 33 | while (true) { 34 | if (Number(await redis.zcard("test:q")) > 1) { 35 | break; 36 | } 37 | 38 | await sleep(500); 39 | } 40 | 41 | const uids = await redis.hkeys("test:status"); 42 | 43 | // exclude all pages containing 'webrecorder', should clear out the queue and end the crawl 44 | await redis.rpush( 45 | `${uids[0]}:msg`, 46 | JSON.stringify({ type: "addExclusion", regex: "webrecorder" }), 47 | ); 48 | 49 | // ensure 'Add Exclusion is contained in the debug logs 50 | const { stdout } = await p; 51 | 52 | expect(stdout.indexOf("Add Exclusion") > 0).toBe(true); 53 | 54 | expect(stdout.indexOf("Removing excluded URL") > 0).toBe(true); 55 | }); 56 | 57 | -------------------------------------------------------------------------------- /tests/basic_crawl.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import path from "path"; 4 | import md5 from "md5"; 5 | 6 | const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0]; 7 | const testIf = (condition, ...args) => condition ? 
test(...args) : test.skip(...args); 8 | 9 | test("ensure basic crawl run with docker run passes", async () => { 10 | child_process.execSync( 11 | 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --warcPrefix custom-prefix', 12 | ); 13 | 14 | child_process.execSync( 15 | "unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz", 16 | ); 17 | }); 18 | 19 | testIf(doValidate, "validate wacz", () => { 20 | child_process.execSync( 21 | "wacz validate --file ./test-crawls/collections/wr-net/wr-net.wacz", 22 | ); 23 | }); 24 | 25 | test("check that individual WARCs have correct prefix and are under rollover size", () => { 26 | const archiveWarcLists = fs.readdirSync( 27 | "test-crawls/collections/wr-net/archive", 28 | ); 29 | 30 | archiveWarcLists.forEach((filename) => { 31 | expect(filename.startsWith("custom-prefix-")).toEqual(true); 32 | const size = fs.statSync( 33 | path.join("test-crawls/collections/wr-net/archive", filename), 34 | ).size; 35 | expect(size < 10000).toEqual(true); 36 | }); 37 | }); 38 | 39 | test("check that a combined warc file exists in the archive folder", () => { 40 | const warcLists = fs.readdirSync("test-crawls/collections/wr-net"); 41 | var captureFound = 0; 42 | 43 | for (var i = 0; i < warcLists.length; i++) { 44 | if (warcLists[i].endsWith("_0.warc.gz")) { 45 | captureFound = 1; 46 | } 47 | } 48 | expect(captureFound).toEqual(1); 49 | }); 50 | 51 | test("check that a combined warc file is under the rolloverSize", () => { 52 | const warcLists = fs.readdirSync( 53 | path.join("test-crawls/collections/wr-net/wacz", "archive"), 54 | ); 55 | let rolloverSize = 0; 56 | 57 | function getFileSize(filename) { 58 | return fs.statSync(filename).size; 59 | } 60 | 61 | for (let i = 0; i < warcLists.length; i++) { 62 | const size = getFileSize( 63 | path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]), 64 | ); 65 | if (size < 10000) { 66 | rolloverSize = 1; 67 | } 68 | } 69 | expect(rolloverSize).toEqual(1); 70 | }); 71 | 72 | test("check that the pages.jsonl file exists in the collection under the pages folder", () => { 73 | expect( 74 | fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl"), 75 | ).toBe(true); 76 | }); 77 | 78 | test("check that the pages.jsonl file exists in the wacz under the pages folder", () => { 79 | expect( 80 | fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl"), 81 | ).toBe(true); 82 | }); 83 | 84 | test("check that the hash in the pages folder and in the unzipped wacz folders match", () => { 85 | const crawl_hash = md5( 86 | JSON.parse( 87 | fs 88 | .readFileSync( 89 | "test-crawls/collections/wr-net/wacz/pages/pages.jsonl", 90 | "utf8", 91 | ) 92 | .split("\n")[1], 93 | )["text"], 94 | ); 95 | const wacz_hash = md5( 96 | JSON.parse( 97 | fs 98 | .readFileSync( 99 | "test-crawls/collections/wr-net/pages/pages.jsonl", 100 | "utf8", 101 | ) 102 | .split("\n")[1], 103 | )["text"], 104 | ); 105 | const fixture_hash = md5( 106 | JSON.parse( 107 | fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1], 108 | )["text"], 109 | ); 110 | 111 | expect(wacz_hash).toEqual(fixture_hash); 112 | expect(wacz_hash).toEqual(crawl_hash); 113 | }); 114 | 115 | test("check that the supplied title and description made it into datapackage.json", () => { 116 | expect( 117 | 
fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json"), 118 | ).toBe(true); 119 | 120 | const data = fs.readFileSync( 121 | "test-crawls/collections/wr-net/wacz/datapackage.json", 122 | "utf8", 123 | ); 124 | const dataPackageJSON = JSON.parse(data); 125 | expect(dataPackageJSON.title).toEqual("test title"); 126 | expect(dataPackageJSON.description).toEqual("test description"); 127 | }); 128 | -------------------------------------------------------------------------------- /tests/blockrules.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import yaml from "js-yaml"; 4 | 5 | const isCI = !!process.env.CI; 6 | const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...args); 7 | 8 | function runCrawl(name, config, commandExtra = "") { 9 | config.generateCDX = true; 10 | config.depth = 0; 11 | config.collection = name; 12 | 13 | const configYaml = yaml.dump(config); 14 | 15 | try { 16 | const proc = child_process.execSync( 17 | `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, 18 | { input: configYaml, stdin: "inherit", encoding: "utf8" }, 19 | ); 20 | 21 | //console.log(proc); 22 | } catch (error) { 23 | console.log(error); 24 | } 25 | } 26 | 27 | function doesCDXContain(coll, value) { 28 | const data = fs.readFileSync( 29 | `test-crawls/collections/${coll}/indexes/index.cdxj`, 30 | ); 31 | return data.indexOf(value) >= 0; 32 | } 33 | 34 | function checkVideo(coll) { 35 | return doesCDXContain(coll, '"video/mp4"'); 36 | } 37 | 38 | // Test Disabled for Brave -- should always be blocked, but seeing inconsistent ci behavior 39 | /* 40 | test("test crawl without block for specific URL", () => { 41 | const config = { 42 | "url": "https://www.iana.org/", 43 | "pageExtraDelay": 10 44 | }; 45 | 46 | runCrawl("block-1-no-block", config); 47 | 48 | // without blocks, URL with add sense is included 49 | expect(doesCDXContain("block-1-no-block", "https://cse.google.com/adsense/search/async-ads.js")).toBe(true); 50 | }); 51 | */ 52 | 53 | test("test block rule on specific URL", () => { 54 | const config = { 55 | url: "https://www.iana.org/", 56 | blockRules: [{ url: "adsense" }], 57 | }; 58 | 59 | runCrawl("block-1", config); 60 | 61 | expect( 62 | doesCDXContain( 63 | "block-1", 64 | "https://cse.google.com/adsense/search/async-ads.js", 65 | ), 66 | ).toBe(false); 67 | }); 68 | 69 | testIf(!isCI, "test block rule based on iframe text, content included due to match", () => { 70 | const config = { 71 | url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", 72 | blockRules: [ 73 | { 74 | url: "https://www.youtube.com/embed/", 75 | frameTextMatch: 76 | '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"', 77 | type: "allowOnly", 78 | }, 79 | ], 80 | }; 81 | 82 | runCrawl("block-2", config); 83 | 84 | expect(checkVideo("block-2")).toBe(true); 85 | }); 86 | 87 | test("test block rule based on iframe text, wrong text, content should be excluded", () => { 88 | const config = { 89 | url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", 90 | blockRules: [ 91 | { 92 | url: "https://www.youtube.com/embed/", 93 | frameTextMatch: 94 | '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\"', 95 | type: "allowOnly", 96 | }, 97 | ], 98 | }; 99 | 100 | runCrawl("block-3", config); 101 | 102 | expect(checkVideo("block-3")).toBe(false); 103 | }); 104 | 105 | test("test 
block rule based on iframe text, block matched", () => { 106 | const config = { 107 | url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", 108 | blockRules: [ 109 | { 110 | url: "https://www.youtube.com/embed/", 111 | frameTextMatch: 112 | '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"', 113 | }, 114 | ], 115 | }; 116 | 117 | runCrawl("block-4", config); 118 | 119 | expect(checkVideo("block-4")).toBe(false); 120 | }); 121 | 122 | testIf(!isCI, "test rule based on iframe text not matching, plus allowOnly iframe", () => { 123 | const config = { 124 | url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", 125 | blockRules: [ 126 | { 127 | url: "example.com/embed/", 128 | frameTextMatch: 129 | '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"', 130 | type: "block", 131 | }, 132 | { 133 | url: "(youtube.com|example.com)/embed/", 134 | type: "allowOnly", 135 | inFrameUrl: "oembed.link/", 136 | }, 137 | ], 138 | }; 139 | 140 | runCrawl("non-block-5", config); 141 | 142 | expect(checkVideo("non-block-5")).toBe(true); 143 | }); 144 | 145 | test("test block url in frame url", () => { 146 | const config = { 147 | url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", 148 | blockRules: [ 149 | { 150 | url: "maxresdefault.jpg", 151 | type: "block", 152 | inFrameUrl: "youtube.com/embed", 153 | }, 154 | ], 155 | }; 156 | 157 | runCrawl("block-6", config); 158 | 159 | expect( 160 | doesCDXContain( 161 | "block-6", 162 | '"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg"', 163 | ), 164 | ).toBe(false); 165 | }); 166 | 167 | testIf(!isCI, "test block rules complex example, block external urls on main frame, but not on youtube", () => { 168 | const config = { 169 | seeds: ["https://archiveweb.page/en/troubleshooting/errors/"], 170 | depth: "0", 171 | blockRules: [ 172 | { 173 | url: "(archiveweb.page|www.youtube.com)", 174 | type: "allowOnly", 175 | inFrameUrl: "archiveweb.page", 176 | }, 177 | { 178 | url: "https://archiveweb.page/assets/js/vendor/lunr.min.js", 179 | inFrameUrl: "archiveweb.page", 180 | }, 181 | { 182 | url: "https://www.youtube.com/embed/", 183 | type: "allowOnly", 184 | frameTextMatch: 185 | '(\\\\"channelId\\\\":\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\")', 186 | }, 187 | ], 188 | 189 | combineWARC: true, 190 | 191 | logging: "stats,debug", 192 | }; 193 | 194 | runCrawl("block-7", config); 195 | 196 | expect( 197 | doesCDXContain( 198 | "block-7", 199 | '"https://archiveweb.page/assets/js/vendor/lunr.min.js"', 200 | ), 201 | ).toBe(false); 202 | expect(checkVideo("block-7")).toBe(true); 203 | }); 204 | -------------------------------------------------------------------------------- /tests/brave-query-redir.test.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import { execSync } from "child_process"; 3 | 4 | test("check that gclid query URL is automatically redirected to remove it", async () => { 5 | try { 6 | execSync( 7 | "docker run --rm -v $PWD/test-crawls:/crawls -i webrecorder/browsertrix-crawler crawl --url 'https://old.webrecorder.net/about?gclid=abc' --collection test-brave-redir --behaviors \"\" --limit 1 --generateCDX"); 8 | 9 | } catch (error) { 10 | console.log(error.stderr); 11 | } 12 | 13 | const filedata = fs.readFileSync( 14 | "test-crawls/collections/test-brave-redir/indexes/index.cdxj", 15 | { encoding: "utf-8" }, 16 | ); 17 | 18 | let responseFound = false; 19 | let redirectFound = false; 20 | 21 | const lines = filedata.trim().split("\n"); 
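// each CDXJ line is "<urlkey> <timestamp> <json>", so the JSON payload starts at the third space-separated field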
22 | 23 | for (const line of lines) { 24 | const json = line.split(" ").slice(2).join(" "); 25 | const data = JSON.parse(json); 26 | if (data.url === "https://old.webrecorder.net/about?gclid=abc" && data.status === "307") { 27 | redirectFound = true; 28 | } else if (data.url === "https://old.webrecorder.net/about" && data.status === "200") { 29 | responseFound = true; 30 | } 31 | if (responseFound && redirectFound) { 32 | break; 33 | } 34 | } 35 | 36 | expect(redirectFound && responseFound).toBe(true); 37 | }); 38 | -------------------------------------------------------------------------------- /tests/collection_name.test.js: -------------------------------------------------------------------------------- 1 | import util from "util"; 2 | import { exec as execCallback } from "child_process"; 3 | 4 | const exec = util.promisify(execCallback); 5 | 6 | test("check that the collection name is properly validated", async () => { 7 | let passed = false; 8 | 9 | try { 10 | await exec( 11 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid", 12 | ); 13 | passed = true; 14 | } catch (error) { 15 | passed = false; 16 | } 17 | expect(passed).toBe(true); 18 | }); 19 | 20 | test("check that the collection name is not accepted if it doesn't meet our standards", async () => { 21 | let passed = false; 22 | 23 | try { 24 | await exec( 25 | "docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid", 26 | ); 27 | passed = true; 28 | } catch (e) { 29 | passed = false; 30 | } 31 | expect(passed).toBe(false); 32 | }); 33 | -------------------------------------------------------------------------------- /tests/config_file.test.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import yaml from "js-yaml"; 3 | 4 | import util from "util"; 5 | import { exec as execCallback } from "child_process"; 6 | 7 | const exec = util.promisify(execCallback); 8 | 9 | test("check yaml config file with seed list is used", async () => { 10 | try { 11 | await exec( 12 | "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0", 13 | ); 14 | } catch (error) { 15 | console.log(error); 16 | } 17 | 18 | const crawledPages = fs.readFileSync( 19 | "test-crawls/collections/configtest/pages/pages.jsonl", 20 | "utf8", 21 | ); 22 | const pages = new Set(); 23 | 24 | for (const line of crawledPages.trim().split("\n")) { 25 | const url = JSON.parse(line).url; 26 | if (url) { 27 | pages.add(url); 28 | } 29 | } 30 | 31 | const config = yaml.load( 32 | fs.readFileSync("tests/fixtures/crawl-1.yaml", "utf8"), 33 | ); 34 | 35 | let foundAllSeeds = true; 36 | 37 | for (const seed of config.seeds) { 38 | const url = new URL(seed).href; 39 | if (!pages.has(url)) { 40 | foundAllSeeds = false; 41 | } 42 | } 43 | expect(foundAllSeeds).toBe(true); 44 | 45 | expect( 46 | fs.existsSync("test-crawls/collections/configtest/configtest.wacz"), 47 | ).toBe(true); 48 | }); 49 | 50 | test("check that yaml config file options are overridden by the command line", async () => { 51 | try { 52 | await exec( 53 | "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout
20000", 54 | ); 55 | } catch (error) { 56 | console.log(error); 57 | } 58 | 59 | const crawledPages = fs.readFileSync( 60 | "test-crawls/collections/configtest-2/pages/pages.jsonl", 61 | "utf8", 62 | ); 63 | const pages = new Set(); 64 | 65 | for (const line of crawledPages.trim().split("\n")) { 66 | const url = JSON.parse(line).url; 67 | if (url) { 68 | pages.add(url); 69 | } 70 | } 71 | 72 | expect(pages.has("https://specs.webrecorder.net/")).toBe(true); 73 | expect(pages.size).toBe(1); 74 | }); 75 | -------------------------------------------------------------------------------- /tests/config_stdin.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import yaml from "js-yaml"; 4 | 5 | test("pass config file via stdin", async () => { 6 | const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8"); 7 | const config = yaml.load(configYaml); 8 | 9 | try { 10 | const proc = child_process.execSync( 11 | "docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202", 12 | { input: configYaml, stdin: "inherit", encoding: "utf8" }, 13 | ); 14 | 15 | //console.log(proc); 16 | } catch (error) { 17 | console.log(error); 18 | } 19 | 20 | const crawledPages = fs.readFileSync( 21 | "test-crawls/collections/config-stdin/pages/pages.jsonl", 22 | "utf8", 23 | ); 24 | const pages = new Set(); 25 | 26 | for (const line of crawledPages.trim().split("\n")) { 27 | const url = JSON.parse(line).url; 28 | if (!url) { 29 | continue; 30 | } 31 | pages.add(url); 32 | expect(url.indexOf("webrecorder.net/202")).toEqual(-1); 33 | } 34 | 35 | let foundAllSeeds = true; 36 | 37 | for (const seed of config.seeds) { 38 | const url = new URL(seed).href; 39 | if (!pages.has(url)) { 40 | foundAllSeeds = false; 41 | } 42 | } 43 | expect(foundAllSeeds).toBe(true); 44 | 45 | expect( 46 | fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz"), 47 | ).toBe(true); 48 | }); 49 | -------------------------------------------------------------------------------- /tests/crawl_overwrite.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | test("ensure --overwrite with existing collection results in a successful crawl", async () => { 5 | child_process.execSync( 6 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite", 7 | ); 8 | 9 | child_process.execSync( 10 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite --overwrite", 11 | ); 12 | }); 13 | 14 | test("check that the pages.jsonl file exists in the collection under the pages folder", () => { 15 | expect( 16 | fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl"), 17 | ).toBe(true); 18 | }); 19 | 20 | test("check that the WACZ file exists in the collection", () => { 21 | expect( 22 | fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl"), 23 | ).toBe(true); 24 | }); 25 | 26 | //----------- 27 | 28 | test("ensure --overwrite results in a successful crawl even if collection didn't exist", async () => { 29 | child_process.execSync( 30 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ 
--generateWACZ --collection overwrite-nothing --overwrite", 31 | ); 32 | }); 33 | 34 | test("check that the pages.jsonl file exists in the collection under the pages folder", () => { 35 | expect( 36 | fs.existsSync( 37 | "test-crawls/collections/overwrite-nothing/pages/pages.jsonl", 38 | ), 39 | ).toBe(true); 40 | }); 41 | 42 | test("check that the WACZ file exists in the collection", () => { 43 | expect( 44 | fs.existsSync( 45 | "test-crawls/collections/overwrite-nothing/pages/pages.jsonl", 46 | ), 47 | ).toBe(true); 48 | }); 49 | -------------------------------------------------------------------------------- /tests/custom-behavior-flow.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import Redis from "ioredis"; 3 | 4 | 5 | async function sleep(time) { 6 | await new Promise((resolve) => setTimeout(resolve, time)); 7 | } 8 | 9 | test("test pushing behavior logs to redis", async () => { 10 | const child = child_process.exec("docker run -p 36398:6379 -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ -e CRAWL_ID=behavior-logs-flow-test --rm webrecorder/browsertrix-crawler crawl --debugAccessRedis --url https://webrecorder.net/ --customBehaviors /custom-behaviors/custom-flow.json --scopeType page --logBehaviorsToRedis --pageExtraDelay 20"); 11 | 12 | let crawlFinished = false; 13 | 14 | child.on("exit", function () { 15 | crawlFinished = true; 16 | }); 17 | 18 | const redis = new Redis("redis://127.0.0.1:36398/0", { lazyConnect: true, retryStrategy: () => null }); 19 | 20 | await sleep(3000); 21 | 22 | await redis.connect({ maxRetriesPerRequest: 50 }); 23 | 24 | let customLogLineCount = 0; 25 | let done = false; 26 | 27 | while (!crawlFinished) { 28 | let res = null; 29 | try { 30 | res = await redis.rpop("behavior-logs-flow-test:b"); 31 | } catch (e) { 32 | break; 33 | } 34 | if (!res) { 35 | await sleep(500); 36 | continue; 37 | } 38 | const json = JSON.parse(res); 39 | if (json.context === "behaviorScriptCustom") { 40 | customLogLineCount++; 41 | } 42 | if (json.message === "All Steps Done!") { 43 | done = true; 44 | } 45 | } 46 | 47 | expect(customLogLineCount).toEqual(4); 48 | expect(done).toBe(true); 49 | }); 50 | -------------------------------------------------------------------------------- /tests/custom-behaviors/custom-2.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-unused-vars */ 2 | class TestBehavior2 { 3 | static init() { 4 | return { 5 | state: {}, 6 | }; 7 | } 8 | 9 | static get id() { 10 | return "TestBehavior2"; 11 | } 12 | 13 | static isMatch() { 14 | return window.location.origin === "https://old.webrecorder.net"; 15 | } 16 | 17 | async *run(ctx) { 18 | ctx.log("In Test Behavior 2!"); 19 | yield ctx.Lib.getState(ctx, "test-stat-2"); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tests/custom-behaviors/custom-flow.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "test WR create click + enter URL", 3 | "steps": [ 4 | { 5 | "type": "navigate", 6 | "url": "https://webrecorder.net/" 7 | }, 8 | { 9 | "type": "click", 10 | "target": "main", 11 | "selectors": [ 12 | [ 13 | "aria/[role=\"main\"]", 14 | "aria/[role=\"textbox\"]" 15 | ], 16 | [ 17 | "#archive-url" 18 | ], 19 | [ 20 | "xpath///*[@id=\"archive-url\"]" 21 | ], 22 | [ 23 | "pierce/#archive-url" 24 | ] 25 | ], 
26 | "offsetY": 19.0078125, 27 | "offsetX": 310.5 28 | }, 29 | { 30 | "type": "change", 31 | "value": "https://example.com/", 32 | "selectors": [ 33 | [ 34 | "aria/[role=\"main\"]", 35 | "aria/[role=\"textbox\"]" 36 | ], 37 | [ 38 | "#archive-url" 39 | ], 40 | [ 41 | "xpath///*[@id=\"archive-url\"]" 42 | ], 43 | [ 44 | "pierce/#archive-url" 45 | ] 46 | ], 47 | "target": "main" 48 | }, 49 | { 50 | "type": "keyDown", 51 | "target": "main", 52 | "key": "Enter" 53 | } 54 | ] 55 | } 56 | -------------------------------------------------------------------------------- /tests/custom-behaviors/custom.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-unused-vars */ 2 | class TestBehavior { 3 | static init() { 4 | return { 5 | state: {}, 6 | }; 7 | } 8 | 9 | static get id() { 10 | return "TestBehavior"; 11 | } 12 | 13 | static isMatch() { 14 | return window.location.origin === "https://specs.webrecorder.net"; 15 | } 16 | 17 | async *run(ctx) { 18 | ctx.log("In Test Behavior!"); 19 | yield ctx.Lib.getState(ctx, "test-stat"); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tests/custom_driver.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | test("ensure custom driver creates PDF", async () => { 5 | try { 6 | child_process.execSync( 7 | "docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs --limit 1", 8 | ); 9 | } catch (error) { 10 | console.log(error); 11 | } 12 | 13 | const pdfs = fs.readdirSync("test-crawls/collections/custom-driver-1").filter(x => x.endsWith(".pdf")); 14 | expect(pdfs.length).toBe(1); 15 | }); 16 | -------------------------------------------------------------------------------- /tests/custom_selector.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | test("test custom selector crawls JS files as pages", async () => { 5 | try { 6 | child_process.execSync( 7 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-1 --selectLinks \"script[src]->src\"", 8 | ); 9 | } catch (error) { 10 | console.log(error); 11 | } 12 | 13 | const crawledPages = fs.readFileSync( 14 | "test-crawls/collections/custom-sel-1/pages/pages.jsonl", 15 | "utf8", 16 | ); 17 | const pages = new Set(); 18 | 19 | for (const line of crawledPages.trim().split("\n")) { 20 | const url = JSON.parse(line).url; 21 | if (!url) { 22 | continue; 23 | } 24 | pages.add(url); 25 | } 26 | 27 | const crawledExtraPages = fs.readFileSync( 28 | "test-crawls/collections/custom-sel-1/pages/extraPages.jsonl", 29 | "utf8", 30 | ); 31 | const extraPages = new Set(); 32 | 33 | for (const line of crawledExtraPages.trim().split("\n")) { 34 | const url = JSON.parse(line).url; 35 | if (!url) { 36 | continue; 37 | } 38 | extraPages.add(url); 39 | } 40 | 41 | const expectedPages = new Set([ 42 | "https://www.iana.org/", 43 | ]); 44 | 45 | const expectedExtraPages = new Set([ 46 | "https://www.iana.org/_js/jquery.js", 47 | "https://www.iana.org/_js/iana.js", 48 | ]); 49 | 50 | expect(pages).toEqual(expectedPages); 51 | 
expect(extraPages).toEqual(expectedExtraPages); 52 | }); 53 | 54 | 55 | test("test invalid selector, crawl fails", async () => { 56 | let status = 0; 57 | try { 58 | child_process.execSync( 59 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-invalid --selectLinks \"script[\"", 60 | ); 61 | } catch (e) { 62 | status = e.status; 63 | } 64 | 65 | // logger fatal exit code 66 | expect(status).toBe(17); 67 | }); 68 | 69 | test("test valid autoclick selector passes validation", async () => { 70 | let failed = false; 71 | 72 | try { 73 | child_process.execSync( 74 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector button --scopeType page", 75 | ); 76 | } catch (e) { 77 | failed = true; 78 | } 79 | 80 | // valid clickSelector 81 | expect(failed).toBe(false); 82 | }); 83 | 84 | 85 | test("test invalid autoclick selector fails validation, crawl fails", async () => { 86 | let status = 0; 87 | 88 | try { 89 | child_process.execSync( 90 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector \",\" --scopeType page", 91 | ); 92 | } catch (e) { 93 | status = e.status; 94 | } 95 | 96 | // logger fatal exit code 97 | expect(status).toBe(17); 98 | }); 99 | 100 | -------------------------------------------------------------------------------- /tests/dryrun.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | test("ensure dryRun crawl only writes pages and logs", async () => { 5 | child_process.execSync( 6 | 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun --exclude community', 7 | ); 8 | 9 | const files = fs.readdirSync("test-crawls/collections/dry-run-wr-net").sort(); 10 | expect(files.length).toBe(2); 11 | expect(files[0]).toBe("logs"); 12 | expect(files[1]).toBe("pages"); 13 | }); 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /tests/exclude-redirected.test.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import { execSync } from "child_process"; 3 | 4 | // example.com includes a link to 'https://www.iana.org/domains/example' which redirects to 'https://www.iana.org/help/example-domains' 5 | // page loading should be blocked on the redirected URL due to the exclusion of 'help', though the initial link is loaded 6 | 7 | test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => { 8 | execSync( 9 | "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --exclude help --collection redir-exclude-test --extraHops 1"); 10 | 11 | // no entries besides header 12 | expect( 13 | fs 14 | .readFileSync( 15 | "test-crawls/collections/redir-exclude-test/pages/extraPages.jsonl", 16 | "utf8", 17 | ).trim().split("\n").length 18 | ).toBe(1); 19 | 20 | }); 21 | 22 | -------------------------------------------------------------------------------- /tests/extra_hops_depth.test.js:
-------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | 3 | import util from "util"; 4 | import { exec as execCallback, execSync } from "child_process"; 5 | 6 | const exec = util.promisify(execCallback); 7 | 8 | const extraHopsTimeout = 180000; 9 | 10 | test( 11 | "check that URLs are crawled 2 extra hops beyond depth", 12 | async () => { 13 | try { 14 | await exec( 15 | "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://old.webrecorder.net/ --limit 5 --timeout 10 --exclude community --exclude tools", 16 | ); 17 | } catch (error) { 18 | console.log(error); 19 | } 20 | 21 | const crawledPages = fs.readFileSync( 22 | "test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", 23 | "utf8", 24 | ); 25 | const crawledPagesArray = crawledPages.trim().split("\n"); 26 | 27 | const crawledExtraPages = fs.readFileSync( 28 | "test-crawls/collections/extra-hops-beyond/pages/extraPages.jsonl", 29 | "utf8", 30 | ); 31 | const crawledExtraPagesArray = crawledExtraPages.trim().split("\n"); 32 | 33 | const expectedPages = [ 34 | "https://old.webrecorder.net/", 35 | ]; 36 | 37 | const expectedExtraPages = [ 38 | "https://old.webrecorder.net/blog", 39 | "https://old.webrecorder.net/about", 40 | "https://old.webrecorder.net/contact", 41 | "https://old.webrecorder.net/faq", 42 | ]; 43 | 44 | // first line is the header, not page, so adding -1 45 | expect(crawledPagesArray.length - 1).toEqual(expectedPages.length); 46 | expect(crawledExtraPagesArray.length - 1).toEqual(expectedExtraPages.length); 47 | 48 | for (const page of crawledPagesArray) { 49 | const parsedPage = JSON.parse(page); 50 | const url = parsedPage.url; 51 | if (!url) { 52 | continue; 53 | } 54 | expect(expectedPages.indexOf(url) >= 0).toBe(true); 55 | 56 | expect(parsedPage.seed).toEqual(true); 57 | expect(parsedPage.depth).toEqual(0); 58 | } 59 | 60 | for (const page of crawledExtraPagesArray) { 61 | const parsedPage = JSON.parse(page); 62 | const url = parsedPage.url; 63 | if (!url) { 64 | continue; 65 | } 66 | expect(expectedExtraPages.indexOf(url) >= 0).toBe(true); 67 | expect(parsedPage.depth >= 1).toBe(true); 68 | } 69 | }, 70 | extraHopsTimeout, 71 | ); 72 | 73 | 74 | test("extra hops applies beyond depth limit", () => { 75 | try { 76 | execSync( 77 | "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-depth-0 --extraHops 1 --url https://old.webrecorder.net/ --limit 2 --depth 0 --timeout 10 --exclude community --exclude tools", 78 | ); 79 | } catch (error) { 80 | console.log(error); 81 | } 82 | 83 | const crawledExtraPages = fs.readFileSync( 84 | "test-crawls/collections/extra-hops-depth-0/pages/extraPages.jsonl", 85 | "utf8", 86 | ); 87 | const crawledExtraPagesArray = crawledExtraPages.trim().split("\n"); 88 | 89 | expect(crawledExtraPagesArray.length - 1).toEqual(1); 90 | }); 91 | 92 | -------------------------------------------------------------------------------- /tests/file_stats.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | test("ensure that stats file is modified", async () => { 5 | const child = child_process.exec( 6 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ 
--generateWACZ --text --limit 3 --exclude community --collection file-stats --statsFilename progress.json", 7 | ); 8 | 9 | // detect crawler exit 10 | let crawler_exited = false; 11 | child.on("exit", function () { 12 | crawler_exited = true; 13 | }); 14 | 15 | // helper function to sleep 16 | const sleep = (ms) => new Promise((res) => setTimeout(res, ms)); 17 | 18 | // wait for stats file creation up to 30 secs (to not wait indefinitely) 19 | let counter = 0; 20 | while (!fs.existsSync("test-crawls/progress.json")) { 21 | await sleep(100); 22 | counter++; 23 | expect(counter < 300).toBe(true); 24 | } 25 | 26 | // get initial modification time 27 | const initial_mtime = fs.fstatSync( 28 | fs.openSync("test-crawls/progress.json", "r"), 29 | ).mtime; 30 | 31 | // wait for crawler exit 32 | while (!crawler_exited) { 33 | await sleep(100); 34 | } 35 | 36 | // get final modification time 37 | const final_mtime = fs.fstatSync( 38 | fs.openSync("test-crawls/progress.json", "r"), 39 | ).mtime; 40 | 41 | // compare initial and final modification time 42 | const diff = Math.abs(final_mtime - initial_mtime); 43 | expect(diff > 0).toBe(true); 44 | }); 45 | 46 | test("check that stats file format is correct", () => { 47 | const data = fs.readFileSync("test-crawls/progress.json", "utf8"); 48 | const dataJSON = JSON.parse(data); 49 | expect(dataJSON.crawled).toEqual(3); 50 | expect(dataJSON.total).toEqual(3); 51 | expect(dataJSON.pending).toEqual(0); 52 | expect(dataJSON.failed).toEqual(0); 53 | expect(dataJSON.limit.max).toEqual(3); 54 | expect(dataJSON.limit.hit).toBe(true); 55 | expect(dataJSON.pendingPages.length).toEqual(0); 56 | }); 57 | -------------------------------------------------------------------------------- /tests/fixtures/crawl-1.yaml: -------------------------------------------------------------------------------- 1 | name: crawl-test-1 2 | collection: configtest 3 | seeds: 4 | - https://webrecorder.net/ 5 | - https://specs.webrecorder.net/ 6 | 7 | generateWACZ: true 8 | -------------------------------------------------------------------------------- /tests/fixtures/crawl-2.yaml: -------------------------------------------------------------------------------- 1 | name: crawl-test-2 2 | 3 | seeds: 4 | - https://webrecorder.net/ 5 | 6 | collection: config-stdin 7 | depth: 1 8 | behaviors: "" 9 | 10 | generateWACZ: true 11 | 12 | warcinfo: 13 | operator: test 14 | host: hostname 15 | -------------------------------------------------------------------------------- /tests/fixtures/driver-1.mjs: -------------------------------------------------------------------------------- 1 | export default async ({ data, page, crawler }) => { 2 | await crawler.loadPage(page, data); 3 | 4 | await page.pdf({"path": `${crawler.collDir}/${data.pageid}.pdf`}); 5 | }; 6 | -------------------------------------------------------------------------------- /tests/fixtures/pages.jsonl: -------------------------------------------------------------------------------- 1 | {"format":"json-pages-1.0","id":"pages","title":"All Pages","hasText":true} 2 | {"title":"Example Domain","url":"http://www.example.com/","id":"2qok7uessksqo91vt90x8q","size":1256,"ts":"2021-02-24T02:31:27.538Z","text":"Example Domain\nThis domain is for use in illustrative examples in documents. 
You may use this\n domain in literature without prior coordination or asking for permission.\nMore information..."} 3 | -------------------------------------------------------------------------------- /tests/fixtures/proxy-key: -------------------------------------------------------------------------------- 1 | -----BEGIN OPENSSH PRIVATE KEY----- 2 | b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW 3 | QyNTUxOQAAACBlI7zERGb3+ugvSkqMQytJp/XEQhsAw5c2We9HccnV0gAAAJi1AenmtQHp 4 | 5gAAAAtzc2gtZWQyNTUxOQAAACBlI7zERGb3+ugvSkqMQytJp/XEQhsAw5c2We9HccnV0g 5 | AAAEB76AYPsL0SvcLL7AUKUwF9jY077ylBHaIea3sWs3b9s2UjvMREZvf66C9KSoxDK0mn 6 | 9cRCGwDDlzZZ70dxydXSAAAADnRlc3RAbG9jYWxob3N0AQIDBAUGBw== 7 | -----END OPENSSH PRIVATE KEY----- 8 | -------------------------------------------------------------------------------- /tests/fixtures/proxy-key.pub: -------------------------------------------------------------------------------- 1 | ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGUjvMREZvf66C9KSoxDK0mn9cRCGwDDlzZZ70dxydXS test@localhost 2 | -------------------------------------------------------------------------------- /tests/fixtures/urlSeedFile.txt: -------------------------------------------------------------------------------- 1 | https://webrecorder.net/about/ 2 | https://specs.webrecorder.net/wacz/1.1.1/ 3 | -------------------------------------------------------------------------------- /tests/http-auth.test.js: -------------------------------------------------------------------------------- 1 | import { execSync, spawn } from "child_process"; 2 | import fs from "fs"; 3 | import yaml from "js-yaml"; 4 | 5 | let proc = null; 6 | 7 | const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal"; 8 | 9 | beforeAll(() => { 10 | proc = spawn("../../node_modules/.bin/http-server", ["-p", "31501", "--username", "user", "--password", "pass"], {cwd: "./docs/site"}); 11 | }); 12 | 13 | afterAll(() => { 14 | if (proc) { 15 | proc.kill(); 16 | } 17 | }); 18 | 19 | test("run crawl without auth", () => { 20 | let status = 0; 21 | try { 22 | execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --limit 2 --failOnFailedSeed`); 23 | } catch (e) { 24 | status = e.status; 25 | } 26 | expect(status).toBe(1); 27 | }); 28 | 29 | test("run crawl with auth", () => { 30 | let status = 0; 31 | try { 32 | execSync(`docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://user:pass@${DOCKER_HOST_NAME}:31501 --limit 2 --failOnFailedSeed --collection http-auth-test`); 33 | } catch (e) { 34 | status = e.status; 35 | } 36 | 37 | expect(status).toBe(0); 38 | 39 | expect(fs 40 | .readFileSync( 41 | "test-crawls/collections/http-auth-test/pages/pages.jsonl", 42 | "utf8", 43 | ) 44 | .trim() 45 | .split("\n") 46 | .length).toBe(2); 47 | 48 | expect(fs 49 | .readFileSync( 50 | "test-crawls/collections/http-auth-test/pages/extraPages.jsonl", 51 | "utf8", 52 | ) 53 | .trim() 54 | .split("\n") 55 | .length).toBe(2); 56 | 57 | }); 58 | 59 | test("run crawl with auth config.yaml", () => { 60 | const config = { 61 | seeds: [{ 62 | url: `http://${DOCKER_HOST_NAME}:31501`, 63 | auth: "user:pass" 64 | }], 65 | limit: "2", 66 | collection: "http-auth-test-2", 67 | failOnFailedSeed: "true" 68 | } 69 | 70 | const configYaml = yaml.dump(config); 71 | 72 | let status = 0; 73 | try { 74 | execSync("docker run -i --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin", 75 | { input: configYaml, stdin: 
"inherit", encoding: "utf8" }); 76 | 77 | } catch (e) { 78 | console.log(e); 79 | status = e.status; 80 | } 81 | 82 | expect(status).toBe(0); 83 | 84 | expect(fs 85 | .readFileSync( 86 | "test-crawls/collections/http-auth-test-2/pages/pages.jsonl", 87 | "utf8", 88 | ) 89 | .trim() 90 | .split("\n") 91 | .length).toBe(2); 92 | 93 | expect(fs 94 | .readFileSync( 95 | "test-crawls/collections/http-auth-test-2/pages/extraPages.jsonl", 96 | "utf8", 97 | ) 98 | .trim() 99 | .split("\n") 100 | .length).toBe(2); 101 | }); 102 | -------------------------------------------------------------------------------- /tests/invalid-behaviors/invalid-export.js: -------------------------------------------------------------------------------- 1 | export class TestBehavior { 2 | static init() { 3 | return { 4 | state: {}, 5 | }; 6 | } 7 | 8 | static get id() { 9 | return "TestBehavior"; 10 | } 11 | 12 | static isMatch() { 13 | return window.location.origin === "https://example.com"; 14 | } 15 | 16 | async *run(ctx) { 17 | ctx.log("In Test Behavior!"); 18 | yield ctx.Lib.getState(ctx, "test-stat"); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tests/lang-code.test.js: -------------------------------------------------------------------------------- 1 | import { execSync } from "child_process"; 2 | 3 | test("run crawl with invalid lang", () => { 4 | let status = 0; 5 | try { 6 | execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/feed.xml --lang e --limit 1`); 7 | } catch (e) { 8 | status = e.status; 9 | } 10 | expect(status).toBe(17); 11 | }); 12 | 13 | test("run crawl with valid lang", () => { 14 | let status = 0; 15 | try { 16 | execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/feed.xml --lang en --limit 1`); 17 | } catch (e) { 18 | status = e.status; 19 | } 20 | expect(status).toBe(0); 21 | }); 22 | 23 | 24 | -------------------------------------------------------------------------------- /tests/limit_reached.test.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import util from "util"; 3 | import { exec as execCallback, execSync } from "child_process"; 4 | 5 | const exec = util.promisify(execCallback); 6 | 7 | test("ensure page limit reached", async () => { 8 | execSync( 9 | 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors "" --url https://old.webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json --exclude community', 10 | ); 11 | }); 12 | 13 | test("check limit written to stats file is as expected", () => { 14 | const data = fs.readFileSync("test-crawls/stats.json", "utf8"); 15 | const dataJSON = JSON.parse(data); 16 | expect(dataJSON.crawled).toEqual(12); 17 | expect(dataJSON.total).toEqual(12); 18 | expect(dataJSON.limit.hit).toBe(true); 19 | }); 20 | 21 | test("ensure crawl fails if failOnFailedLimit is reached", async () => { 22 | let passed = true; 23 | try { 24 | await exec( 25 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/will404 --url https://specs.webrecorder.net --failOnInvalidStatus --failOnFailedLimit 1 --limit 10 --collection faillimitreached", 26 | ); 27 | } catch (error) { 28 | expect(error.code).toEqual(12); 29 | passed = false; 30 | } 31 | expect(passed).toBe(false); 32 | }); 33 | 34 | test("ensure crawl fails if timeLimit is 
reached", async () => { 35 | let passed = true; 36 | try { 37 | await exec( 38 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --failOnInvalidStatus --timeLimit 1 --limit 2 --collection failontimelimitreached", 39 | ); 40 | } catch (error) { 41 | expect(error.code).toEqual(15); 42 | passed = false; 43 | } 44 | expect(passed).toBe(false); 45 | }); 46 | 47 | test("ensure crawl fails if sizeLimit is reached", async () => { 48 | let passed = true; 49 | try { 50 | await exec( 51 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --failOnInvalidStatus --sizeLimit 1 --limit 2 --collection failonsizelimitreached", 52 | ); 53 | } catch (error) { 54 | expect(error.code).toEqual(14); 55 | passed = false; 56 | } 57 | expect(passed).toBe(false); 58 | }); 59 | 60 | test("ensure crawl fails if diskUtilizationLimit is reached", async () => { 61 | let passed = true; 62 | try { 63 | await exec( 64 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --failOnInvalidStatus --diskUtilization 1 --limit 2 --collection failonsizelimitreached", 65 | ); 66 | } catch (error) { 67 | expect(error.code).toEqual(16); 68 | passed = false; 69 | } 70 | expect(passed).toBe(false); 71 | }); 72 | -------------------------------------------------------------------------------- /tests/log_filtering.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import path from "path"; 4 | 5 | function jsonLinesToArray(string) { 6 | return string 7 | .split("\n") 8 | .filter((line) => { 9 | try { 10 | JSON.parse(line); 11 | return true; 12 | } catch (error) { 13 | return false; 14 | } 15 | }) 16 | .map((line) => JSON.parse(line)); 17 | } 18 | 19 | test("ensure crawl run with log options passes", async () => { 20 | child_process.execSync( 21 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://specs.webrecorder.net --generateWACZ --collection wr-specs-logs --logging debug,stats --logLevel debug,warn --context general", 22 | ); 23 | }); 24 | 25 | test("check that log files exist and were filtered according to options", () => { 26 | const logDir = "test-crawls/collections/wr-specs-logs/logs/"; 27 | const logFiles = []; 28 | fs.readdirSync(logDir).forEach((file) => { 29 | if (file.endsWith(".log")) { 30 | logFiles.push(path.join(logDir, file)); 31 | } 32 | }); 33 | 34 | expect(logFiles.length).toBeGreaterThan(0); 35 | 36 | for (let i = 0; i < logFiles.length; i++) { 37 | const logFile = logFiles[i]; 38 | const parsedJSONLines = jsonLinesToArray(fs.readFileSync(logFile, "utf8")); 39 | 40 | expect(parsedJSONLines.length).toBeGreaterThan(0); 41 | 42 | parsedJSONLines.forEach((jsonLine) => { 43 | expect( 44 | jsonLine.logLevel === "debug" || jsonLine.logLevel === "warn", 45 | ).toBe(true); 46 | expect(jsonLine.context).toBe("general"); 47 | }); 48 | } 49 | }); 50 | -------------------------------------------------------------------------------- /tests/mult_url_crawl_with_favicon.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0]; 5 | const testIf = (condition, ...args) => condition ? 
test(...args) : test.skip(...args); 6 | 7 | test("ensure multi url crawl run with docker run passes", async () => { 8 | child_process.execSync( 9 | 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://old.webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2 --exclude community', 10 | ); 11 | }); 12 | 13 | testIf(doValidate, "validate multi url crawl wacz", () => { 14 | child_process.execSync( 15 | "wacz validate --file ./test-crawls/collections/advanced/advanced.wacz", 16 | ); 17 | }); 18 | 19 | test("check that the favicon made it into the pages jsonl file", () => { 20 | expect( 21 | fs.existsSync("test-crawls/collections/advanced/pages/pages.jsonl"), 22 | ).toBe(true); 23 | 24 | const data1 = JSON.parse( 25 | fs 26 | .readFileSync( 27 | "test-crawls/collections/advanced/pages/pages.jsonl", 28 | "utf8", 29 | ) 30 | .split("\n")[1], 31 | ); 32 | const data2 = JSON.parse( 33 | fs 34 | .readFileSync( 35 | "test-crawls/collections/advanced/pages/pages.jsonl", 36 | "utf8", 37 | ) 38 | .split("\n")[2], 39 | ); 40 | const data = [data1, data2]; 41 | for (const d of data) { 42 | if (d.url === "https://old.webrecorder.net/") { 43 | expect(d.favIconUrl).toEqual( 44 | "https://old.webrecorder.net/assets/favicon.ico", 45 | ); 46 | } 47 | if (d.url === "https://iana.org/") { 48 | expect(d.favIconUrl).toEqual( 49 | "https://www.iana.org/_img/bookmark_icon.ico", 50 | ); 51 | } 52 | } 53 | }); 54 | -------------------------------------------------------------------------------- /tests/multi-instance-crawl.test.js: -------------------------------------------------------------------------------- 1 | import {exec, execSync} from "child_process"; 2 | import fs from "fs"; 3 | import { Redis } from "ioredis"; 4 | 5 | function sleep(ms) { 6 | return new Promise((resolve) => setTimeout(resolve, ms)); 7 | } 8 | 9 | 10 | let redisId; 11 | let crawler1, crawler2; 12 | 13 | beforeAll(() => { 14 | fs.rmSync("./test-crawls/collections/shared-crawler-1", { recursive: true, force: true }); 15 | fs.rmSync("./test-crawls/collections/shared-crawler-2", { recursive: true, force: true }); 16 | 17 | execSync("docker network create crawl"); 18 | 19 | redisId = execSync("docker run --rm --network=crawl -p 37379:6379 --name redis -d redis"); 20 | 21 | crawler1 = runCrawl("crawler-1"); 22 | crawler2 = runCrawl("crawler-2"); 23 | }); 24 | 25 | afterAll(async () => { 26 | execSync(`docker kill ${redisId}`); 27 | 28 | await sleep(3000); 29 | 30 | await Promise.allSettled([crawler1, crawler2]); 31 | 32 | execSync("docker network rm crawl"); 33 | }); 34 | 35 | function runCrawl(name) { 36 | const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=crawl --hostname=${name} webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection shared-${name} --crawlId testcrawl --redisStoreUrl redis://redis:6379`); 37 | 38 | return new Promise((resolve) => { 39 | crawler.on("exit", (code) => { 40 | resolve(code); 41 | }); 42 | }); 43 | } 44 | 45 | test("run crawlers with external redis", async () => { 46 | const redis = new Redis("redis://127.0.0.1:37379/0", { lazyConnect: true, retryStrategy: () => null }); 47 | 48 | await sleep(3000); 49 | 50 | await redis.connect({ maxRetriesPerRequest: 50 }); 51 | 52 | let count = 0; 53 | 54 | while (true) { 55 | try { 56 | const values = await 
redis.hgetall("testcrawl:status"); 57 | expect(values["crawler-1"]).toBe("running"); 58 | expect(values["crawler-2"]).toBe("running"); 59 | break; 60 | } catch (e) { 61 | if (count++ < 5) { 62 | await sleep(1000); 63 | continue; 64 | } 65 | 66 | throw e; 67 | } 68 | } 69 | 70 | }); 71 | 72 | 73 | test("finish crawls successfully", async () => { 74 | const res = await Promise.allSettled([crawler1, crawler2]); 75 | expect(res[0].value).toBe(0); 76 | expect(res[1].value).toBe(0); 77 | }, 180000); 78 | 79 | test("ensure correct number of pages", () => { 80 | 81 | expect( 82 | fs.existsSync("test-crawls/collections/shared-crawler-1/pages/pages.jsonl"), 83 | ).toBe(true); 84 | 85 | expect( 86 | fs.existsSync("test-crawls/collections/shared-crawler-2/pages/pages.jsonl"), 87 | ).toBe(true); 88 | 89 | const pages_1 = fs 90 | .readFileSync( 91 | "test-crawls/collections/shared-crawler-1/pages/pages.jsonl", 92 | "utf8", 93 | ) 94 | .trim() 95 | .split("\n"); 96 | 97 | const pages_2 = fs 98 | .readFileSync( 99 | "test-crawls/collections/shared-crawler-2/pages/pages.jsonl", 100 | "utf8", 101 | ) 102 | .trim() 103 | .split("\n"); 104 | 105 | // add 2 for heading in each file 106 | expect(pages_1.length + pages_2.length).toBe(1 + 2); 107 | }); 108 | 109 | test("ensure correct number of extraPages", () => { 110 | 111 | expect( 112 | fs.existsSync("test-crawls/collections/shared-crawler-1/pages/extraPages.jsonl"), 113 | ).toBe(true); 114 | 115 | expect( 116 | fs.existsSync("test-crawls/collections/shared-crawler-2/pages/extraPages.jsonl"), 117 | ).toBe(true); 118 | 119 | const pages_1 = fs 120 | .readFileSync( 121 | "test-crawls/collections/shared-crawler-1/pages/extraPages.jsonl", 122 | "utf8", 123 | ) 124 | .trim() 125 | .split("\n"); 126 | 127 | const pages_2 = fs 128 | .readFileSync( 129 | "test-crawls/collections/shared-crawler-2/pages/extraPages.jsonl", 130 | "utf8", 131 | ) 132 | .trim() 133 | .split("\n"); 134 | 135 | // add 2 for heading in each file 136 | expect(pages_1.length + pages_2.length).toBe(3 + 2); 137 | }); 138 | -------------------------------------------------------------------------------- /tests/non-html-crawl.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import path from "path"; 4 | import { WARCParser } from "warcio"; 5 | 6 | const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; 7 | const PDF_HTTP = PDF.replace("https", "http"); 8 | 9 | const XML = "https://webrecorder.net/feed.xml"; 10 | const XML_REDIR = "https://www.webrecorder.net/feed.xml"; 11 | 12 | test("PDF: ensure pdf is crawled", () => { 13 | child_process.execSync( 14 | `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${PDF}" --collection crawl-pdf` 15 | ); 16 | }); 17 | 18 | test("PDF: check that individual WARCs have PDF written as 200 response", async () => { 19 | const archiveWarcLists = fs.readdirSync( 20 | "test-crawls/collections/crawl-pdf/archive", 21 | ); 22 | 23 | const warcName = path.join("test-crawls/collections/crawl-pdf/archive", archiveWarcLists[0]); 24 | 25 | const nodeStream = fs.createReadStream(warcName); 26 | 27 | const parser = new WARCParser(nodeStream); 28 | 29 | let statusCode = -1; 30 | 31 | for await (const record of parser) { 32 | if (record.warcType !== "response") { 33 | continue; 34 | } 35 | 36 | if (record.warcTargetURI === PDF) { 37 | statusCode = record.httpHeaders.statusCode; 38 | } 39 | } 40 | 41 | 
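// the directly fetched PDF should have been captured as an ordinary 200 response record in the WARC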
expect(statusCode).toBe(200); 42 | }); 43 | 44 | test("PDF: ensure pdf with redirect is crawled", () => { 45 | child_process.execSync( 46 | `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${PDF_HTTP}" --collection crawl-pdf --generateCDX` 47 | ); 48 | }); 49 | 50 | test("PDF: check that the pages.jsonl file entry contains status code and mime type", () => { 51 | expect( 52 | fs.existsSync("test-crawls/collections/crawl-pdf/pages/pages.jsonl"), 53 | ).toBe(true); 54 | 55 | 56 | const pages = fs 57 | .readFileSync( 58 | "test-crawls/collections/crawl-pdf/pages/pages.jsonl", 59 | "utf8", 60 | ) 61 | .trim() 62 | .split("\n"); 63 | 64 | expect(pages.length).toBe(3); 65 | 66 | const page = JSON.parse(pages[1]); 67 | expect(page.url).toBe(PDF); 68 | expect(page.status).toBe(200); 69 | expect(page.mime).toBe("application/pdf"); 70 | expect(page.loadState).toBe(2); 71 | 72 | const pageH = JSON.parse(pages[2]); 73 | expect(pageH.url).toBe(PDF_HTTP); 74 | expect(pageH.status).toBe(200); 75 | expect(pageH.mime).toBe("application/pdf"); 76 | expect(pageH.loadState).toBe(2); 77 | }); 78 | 79 | test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinfo entries", () => { 80 | const filedata = fs.readFileSync( 81 | "test-crawls/collections/crawl-pdf/indexes/index.cdxj", 82 | { encoding: "utf-8" }, 83 | ); 84 | 85 | const lines = filedata.trim().split("\n"); 86 | const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? -1 : 1); 87 | 88 | expect(cdxj.length).toBe(5); 89 | 90 | expect(cdxj[0].url).toBe(PDF_HTTP); 91 | expect(cdxj[0].status).toBe("301"); 92 | 93 | expect(cdxj[1].url).toBe(PDF); 94 | expect(cdxj[1].status).toBe("200"); 95 | expect(cdxj[1].mime).toBe("application/pdf"); 96 | 97 | expect(cdxj[2].url).toBe(PDF); 98 | expect(cdxj[2].status).toBe("200"); 99 | expect(cdxj[2].mime).toBe("application/pdf"); 100 | 101 | expect(cdxj[3].url).toBe("urn:pageinfo:" + PDF_HTTP); 102 | expect(cdxj[3].mime).toBe("application/json"); 103 | 104 | expect(cdxj[4].url).toBe("urn:pageinfo:" + PDF); 105 | expect(cdxj[4].mime).toBe("application/json"); 106 | }); 107 | 108 | test("XML: ensure with and without redirect is crawled", () => { 109 | child_process.execSync( 110 | `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${XML}" --url "${XML_REDIR}" --collection crawl-xml --generateCDX` 111 | ); 112 | }); 113 | 114 | test("XML: check pages.jsonl file entry contains status code and mime type", () => { 115 | expect( 116 | fs.existsSync("test-crawls/collections/crawl-xml/pages/pages.jsonl"), 117 | ).toBe(true); 118 | 119 | 120 | const pages = fs 121 | .readFileSync( 122 | "test-crawls/collections/crawl-xml/pages/pages.jsonl", 123 | "utf8", 124 | ) 125 | .trim() 126 | .split("\n"); 127 | 128 | expect(pages.length).toBe(3); 129 | 130 | const page = JSON.parse(pages[1]); 131 | expect(page.url).toBe(XML); 132 | expect(page.status).toBe(200); 133 | expect(page.mime).toBe("application/xml"); 134 | expect(page.loadState).toBe(2); 135 | 136 | const pageH = JSON.parse(pages[2]); 137 | expect(pageH.url).toBe(XML_REDIR); 138 | expect(pageH.status).toBe(200); 139 | expect(pageH.mime).toBe("application/xml"); 140 | expect(pageH.loadState).toBe(2); 141 | }); 142 | 143 | test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinfo entries", () => { 144 | const filedata = fs.readFileSync( 145 | "test-crawls/collections/crawl-xml/indexes/index.cdxj", 146 | { encoding: 
"utf-8" }, 147 | ); 148 | 149 | const lines = filedata.trim().split("\n"); 150 | const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? -1 : 1); 151 | 152 | expect(cdxj.length).toBe(6); 153 | 154 | expect(cdxj[0].url).toBe("https://webrecorder.net/favicon.ico"); 155 | 156 | expect(cdxj[1].url).toBe(XML); 157 | expect(cdxj[1].status).toBe("200"); 158 | expect(cdxj[1].mime).toBe("application/xml"); 159 | 160 | expect(cdxj[2].url).toBe(XML); 161 | expect(cdxj[2].status).toBe("200"); 162 | expect(cdxj[2].mime).toBe("application/xml"); 163 | 164 | expect(cdxj[3].url).toBe(XML_REDIR); 165 | expect(cdxj[3].status).toBe("301"); 166 | 167 | expect(cdxj[4].url).toBe("urn:pageinfo:" + XML); 168 | expect(cdxj[4].mime).toBe("application/json"); 169 | 170 | expect(cdxj[5].url).toBe("urn:pageinfo:" + XML_REDIR); 171 | expect(cdxj[5].mime).toBe("application/json"); 172 | }); 173 | 174 | 175 | -------------------------------------------------------------------------------- /tests/pageinfo-records.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import path from "path"; 4 | import { WARCParser } from "warcio"; 5 | 6 | test("run warc and ensure pageinfo records contain the correct resources", async () => { 7 | child_process.execSync( 8 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --url https://old.webrecorder.net/about --url https://invalid.invalid/ --scopeType page --collection page-info-test --combineWARC", 9 | ); 10 | 11 | const filename = path.join( 12 | "test-crawls", 13 | "collections", 14 | "page-info-test", 15 | "page-info-test_0.warc.gz", 16 | ); 17 | 18 | const nodeStream = fs.createReadStream(filename); 19 | 20 | const parser = new WARCParser(nodeStream); 21 | 22 | let foundIndex = false; 23 | let foundAbout = false; 24 | let foundInvalid = false; 25 | 26 | for await (const record of parser) { 27 | if (record.warcType === "response" && 28 | (record.warcTargetURI === "https://old.webrecorder.net/" || record.warcTargetURI === "https://old.webrecorder.net/about")) { 29 | expect(record.warcHeaders.headers.get("WARC-Protocol")).toBe("h2, tls/1.3"); 30 | } 31 | 32 | if ( 33 | !foundIndex && 34 | record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/" 35 | ) { 36 | foundIndex = true; 37 | const text = await record.contentText(); 38 | validateResourcesIndex(JSON.parse(text)); 39 | } 40 | 41 | if ( 42 | !foundAbout && 43 | record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/about" 44 | ) { 45 | foundAbout = true; 46 | const text = await record.contentText(); 47 | validateResourcesAbout(JSON.parse(text)); 48 | } 49 | 50 | if ( 51 | !foundInvalid && 52 | record.warcTargetURI === "urn:pageinfo:https://invalid.invalid/" 53 | ) { 54 | foundInvalid = true; 55 | const text = await record.contentText(); 56 | validateResourcesInvalid(JSON.parse(text)); 57 | } 58 | } 59 | 60 | expect(foundIndex).toBe(true); 61 | expect(foundAbout).toBe(true); 62 | expect(foundInvalid).toBe(true); 63 | }); 64 | 65 | function validateResourcesIndex(json) { 66 | expect(json).toHaveProperty("pageid"); 67 | expect(json).toHaveProperty("url"); 68 | expect(json).toHaveProperty("ts"); 69 | expect(json).toHaveProperty("urls"); 70 | expect(json.counts).toEqual({ jsErrors: 0 }); 71 | expect(json.urls).toEqual({ 72 | "https://old.webrecorder.net/": { 73 | status: 200, 74 | mime: 
"text/html", 75 | type: "document", 76 | }, 77 | "https://old.webrecorder.net/assets/tools/logo-pywb.png": { 78 | mime: "image/png", 79 | status: 200, 80 | type: "image", 81 | }, 82 | "https://old.webrecorder.net/assets/brand/archivewebpage-icon-color.svg": { 83 | mime: "image/svg+xml", 84 | status: 200, 85 | type: "image", 86 | }, 87 | "https://old.webrecorder.net/assets/brand/browsertrix-icon-color.svg": { 88 | mime: "image/svg+xml", 89 | status: 200, 90 | type: "image", 91 | }, 92 | "https://old.webrecorder.net/assets/brand/browsertrixcrawler-icon-color.svg": { 93 | mime: "image/svg+xml", 94 | status: 200, 95 | type: "image", 96 | }, 97 | "https://old.webrecorder.net/assets/brand/replaywebpage-icon-color.svg": { 98 | mime: "image/svg+xml", 99 | status: 200, 100 | type: "image", 101 | }, 102 | "https://old.webrecorder.net/assets/fontawesome/all.css": { 103 | status: 200, 104 | mime: "text/css", 105 | type: "stylesheet", 106 | }, 107 | "https://old.webrecorder.net/assets/wr-logo.svg": { 108 | status: 200, 109 | mime: "image/svg+xml", 110 | type: "image", 111 | }, 112 | "https://old.webrecorder.net/assets/main.css": { 113 | status: 200, 114 | mime: "text/css", 115 | type: "stylesheet", 116 | }, 117 | "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap": 118 | { status: 200, mime: "text/css", type: "stylesheet" }, 119 | "https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap": 120 | { status: 200, mime: "text/css", type: "stylesheet" }, 121 | "https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": 122 | { status: 200, mime: "font/woff2", type: "font" }, 123 | "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": 124 | { status: 200, mime: "font/woff2", type: "font" }, 125 | "https://old.webrecorder.net/assets/favicon.ico": { 126 | status: 200, 127 | mime: "image/vnd.microsoft.icon", 128 | type: "other", 129 | }, 130 | }); 131 | } 132 | 133 | function validateResourcesAbout(json) { 134 | expect(json).toHaveProperty("pageid"); 135 | expect(json).toHaveProperty("url"); 136 | expect(json).toHaveProperty("ts"); 137 | expect(json).toHaveProperty("urls"); 138 | expect(json.counts).toEqual({ jsErrors: 0 }); 139 | expect(json.urls).toEqual({ 140 | "https://old.webrecorder.net/about": { 141 | status: 200, 142 | mime: "text/html", 143 | type: "document", 144 | }, 145 | "https://old.webrecorder.net/assets/main.css": { 146 | status: 200, 147 | mime: "text/css", 148 | type: "stylesheet", 149 | }, 150 | "https://old.webrecorder.net/assets/fontawesome/all.css": { 151 | status: 200, 152 | mime: "text/css", 153 | type: "stylesheet", 154 | }, 155 | "https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap": 156 | { status: 200, mime: "text/css", type: "stylesheet" }, 157 | "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap": 158 | { status: 200, mime: "text/css", type: "stylesheet" }, 159 | "https://old.webrecorder.net/assets/wr-logo.svg": { 160 | status: 200, 161 | mime: "image/svg+xml", 162 | type: "image", 163 | }, 164 | "https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": 165 | { status: 200, mime: "font/woff2", type: "font" }, 166 | "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": 167 | { status: 200, mime: "font/woff2", type: "font" }, 168 | }); 169 | } 170 | 171 | function validateResourcesInvalid(json) { 172 | 
expect(json).toHaveProperty("pageid"); 173 | expect(json).toHaveProperty("url"); 174 | expect(json).toHaveProperty("urls"); 175 | expect(json.counts).toEqual({ jsErrors: 0 }); 176 | expect(json.urls).toEqual({ 177 | "https://invalid.invalid/": { 178 | status: 0, 179 | type: "document", 180 | error: "net::ERR_NAME_NOT_RESOLVED", 181 | }, 182 | }); 183 | } 184 | -------------------------------------------------------------------------------- /tests/qa_compare.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import { Redis } from "ioredis"; 4 | 5 | const sleep = (ms) => new Promise((res) => setTimeout(res, ms)); 6 | 7 | test("run initial crawl with text and screenshots to prepare for QA", async () => { 8 | fs.rmSync("./test-crawls/qa-wr-net", { recursive: true, force: true }); 9 | 10 | child_process.execSync( 11 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --url https://old.webrecorder.net/about --url https://archiveweb.page/ --url https://old.webrecorder.net/contact --scopeType page --collection qa-wr-net --text to-warc --screenshot view --generateWACZ", 12 | ); 13 | 14 | expect( 15 | fs.existsSync("test-crawls/collections/qa-wr-net/qa-wr-net.wacz"), 16 | ).toBe(true); 17 | }); 18 | 19 | test("run QA comparison, with write pages to redis", async () => { 20 | fs.rmSync("./test-crawls/qa-wr-net-replay", { recursive: true, force: true }); 21 | 22 | const child = child_process.exec( 23 | "docker run -p 36380:6379 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler qa --qaSource /crawls/collections/qa-wr-net/qa-wr-net.wacz --collection qa-wr-net-replay --crawlId test --qaDebugImageDiff --writePagesToRedis --debugAccessRedis --exclude contact", 24 | ); 25 | 26 | // detect crawler exit 27 | let crawler_exited = false; 28 | child.on("exit", function () { 29 | crawler_exited = true; 30 | }); 31 | 32 | const redis = new Redis("redis://127.0.0.1:36380/0", { lazyConnect: true, retryStrategy: () => null }); 33 | 34 | await sleep(3000); 35 | 36 | await redis.connect({ maxRetriesPerRequest: 50 }); 37 | 38 | let count = 0; 39 | 40 | while (count < 3) { 41 | const res = await redis.lpop("test:pages"); 42 | if (!res) { 43 | if (crawler_exited) { 44 | break; 45 | } 46 | await sleep(100); 47 | continue; 48 | } 49 | const json = JSON.parse(res); 50 | expect(json).toHaveProperty("id"); 51 | expect(json).toHaveProperty("url"); 52 | expect(json).toHaveProperty("ts"); 53 | expect(json).toHaveProperty("title"); 54 | expect(json).toHaveProperty("loadState"); 55 | expect(json).toHaveProperty("comparison"); 56 | 57 | expect(json.title.indexOf("contact") < 0).toBe(true); 58 | 59 | expect(json.comparison).toHaveProperty("screenshotMatch"); 60 | expect(json.comparison).toHaveProperty("textMatch"); 61 | expect(json.comparison).toHaveProperty("resourceCounts"); 62 | expect(json.comparison.screenshotMatch).toBe(1); 63 | expect(json.comparison.textMatch).toBe(1); 64 | 65 | expect(json.comparison.resourceCounts).toHaveProperty("crawlGood"); 66 | expect(json.comparison.resourceCounts).toHaveProperty("crawlBad"); 67 | expect(json.comparison.resourceCounts).toHaveProperty("replayGood"); 68 | expect(json.comparison.resourceCounts).toHaveProperty("replayBad"); 69 | 70 | count++; 71 | } 72 | 73 | expect(count).toBe(3); 74 | 75 | // wait for crawler exit 76 | while (!crawler_exited) { 77 | await sleep(100); 78 | } 79 | }); 80 | 
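
The QA comparison test above follows a pattern that recurs throughout this suite: publish the container's Redis on a host port with -p, then poll a "<crawlId>:pages" list while watching for the crawler process to exit. A minimal standalone sketch of that polling loop follows; the helper name drainPages, the default port, and the crawl id are illustrative assumptions, not anything fixed by the crawler itself.

import { exec } from "child_process";
import { Redis } from "ioredis";

const sleep = (ms) => new Promise((res) => setTimeout(res, ms));

// Sketch: start a crawl container and drain JSON page entries from its
// "<crawlId>:pages" Redis list until `max` entries are read or the
// crawler exits. Port, crawl id, and function name are illustrative.
async function drainPages(dockerCmd, { port = 36380, crawlId = "test", max = 3 } = {}) {
  const child = exec(dockerCmd);
  let exited = false;
  child.on("exit", () => { exited = true; });

  const redis = new Redis(`redis://127.0.0.1:${port}/0`, {
    lazyConnect: true,
    retryStrategy: () => null,
  });
  await sleep(3000);
  await redis.connect();

  const pages = [];
  while (pages.length < max && !exited) {
    const res = await redis.lpop(`${crawlId}:pages`);
    if (!res) {
      await sleep(100);
      continue;
    }
    pages.push(JSON.parse(res));
  }
  return pages;
}

In practice each test inlines this loop rather than sharing a helper, which keeps the docker invocation and the per-test assertions visible in one place.
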
--------------------------------------------------------------------------------
/tests/retry-failed.test.js:
--------------------------------------------------------------------------------
1 | import { exec, execSync } from "child_process";
2 | import fs from "fs";
3 | import http from "http";
4 | import Redis from "ioredis";
5 | 
6 | const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
7 | 
8 | async function sleep(time) {
9 |   await new Promise((resolve) => setTimeout(resolve, time));
10 | }
11 | 
12 | let requests = 0;
13 | let success = false;
14 | let server = null;
15 | 
16 | beforeAll(() => {
17 |   server = http.createServer((req, res) => {
18 |     // 3 requests: 2 from browser, 1 direct fetch per attempt
19 |     // succeed starting with the 7th request == after 2 retries
20 |     if (requests >= 6) {
21 |       res.writeHead(200, {"Content-Type": "text/html"});
22 |       res.end("Test Data");
23 |       success = true;
24 |     } else {
25 |       res.writeHead(503, {"Content-Type": "text/html"});
26 |       res.end("Test Data");
27 |     }
28 |     requests++;
29 |   });
30 | 
31 |   server.listen(31501, "0.0.0.0");
32 | });
33 | 
34 | afterAll(() => {
35 |   server.close();
36 | });
37 | 
38 | 
39 | 
40 | test("run crawl with retries for no response", async () => {
41 |   execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://invalid-host-x:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail --retries 5`);
42 | 
43 |   const redis = new Redis("redis://127.0.0.1:36387/0", { lazyConnect: true, retryStrategy: () => null });
44 | 
45 |   await sleep(3000);
46 | 
47 |   let numRetries = 0;
48 | 
49 |   try {
50 |     await redis.connect({
51 |       maxRetriesPerRequest: 100,
52 |     });
53 | 
54 |     while (true) {
55 |       const res = await redis.lrange("test:f", 0, -1);
56 |       if (res.length) {
57 |         const data = JSON.parse(res[0]);
58 |         if (data.retry) {
59 |           numRetries = data.retry;
60 |           break;
61 |         }
62 |       }
63 |       await sleep(20);
64 |     }
65 | 
66 |   } catch (e) {
67 |     console.error(e);
68 |   } finally {
69 |     expect(numRetries).toBe(5);
70 |   }
71 | });
72 | 
73 | 
74 | test("check only one failed page entry is made", () => {
75 |   expect(
76 |     fs.existsSync("test-crawls/collections/retry-fail/pages/pages.jsonl"),
77 |   ).toBe(true);
78 | 
79 |   expect(
80 |     fs
81 |       .readFileSync(
82 |         "test-crawls/collections/retry-fail/pages/pages.jsonl",
83 |         "utf8",
84 |       ).trim().split("\n").length
85 |   ).toBe(3);
86 | });
87 | 
88 | 
89 | test("run crawl with retries for 503, enough retries to succeed", async () => {
90 |   requests = 0;
91 |   success = false;
92 | 
93 |   const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --collection retry-fail-2 --retries 2 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
94 | 
95 |   let status = 0;
96 |   let resolve = null;
97 |   const crawlFinished = new Promise(r => resolve = r);
98 | 
99 |   // detect crawler exit
100 |   let crawler_exited = false;
101 |   child.on("exit", function (code) {
102 |     status = code;
103 |     resolve();
104 |   });
105 | 
106 |   await crawlFinished;
107 | 
108 |   expect(status).toBe(0);
109 | 
110 |   // (1 + 2) * 3 == 9 requests
111 |   expect(requests).toBe(9);
112 |   expect(success).toBe(true);
113 | });
114 | 
115 | 
116 | test("run crawl with retries for 503, not enough retries, fail", async () => {
117 |   requests = 0;
118 |   success = false;
119 | 
120 |   const child = exec(`docker run -v 
$PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --collection retry-fail-3 --retries 1 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
121 | 
122 |   let status = 0;
123 |   let resolve = null;
124 |   const crawlFinished = new Promise(r => resolve = r);
125 | 
126 |   // detect crawler exit
127 |   let crawler_exited = false;
128 |   child.on("exit", function (code) {
129 |     status = code;
130 |     resolve();
131 |   });
132 | 
133 |   await crawlFinished;
134 | 
135 |   expect(status).toBe(1);
136 |   // (1 + 1) * 3 requests == 6 requests
137 |   expect(requests).toBe(6);
138 |   expect(success).toBe(false);
139 | });
140 | 
141 | 
142 | test("run crawl with retries for 503, no retries, fail", async () => {
143 |   requests = 0;
144 |   success = false;
145 | 
146 |   const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --collection retry-fail-4 --retries 0 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
147 | 
148 |   let status = 0;
149 |   let resolve = null;
150 |   const crawlFinished = new Promise(r => resolve = r);
151 | 
152 |   // detect crawler exit
153 |   let crawler_exited = false;
154 |   child.on("exit", function (code) {
155 |     status = code;
156 |     resolve();
157 |   });
158 | 
159 |   await crawlFinished;
160 | 
161 |   expect(status).toBe(1);
162 |   // (1) * 3 requests == 3 requests
163 |   expect(requests).toBe(3);
164 |   expect(success).toBe(false);
165 | });
166 | 
167 | 
168 | 
--------------------------------------------------------------------------------
/tests/rollover-writer.test.js:
--------------------------------------------------------------------------------
1 | import child_process from "child_process";
2 | import fs from "fs";
3 | 
4 | test("set rollover to 500K and ensure individual WARCs rollover, including screenshots", async () => {
5 |   child_process.execSync(
6 |     "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 5 --exclude community --collection rollover-500K --rolloverSize 500000 --screenshot view --logging debug"
7 |   );
8 | 
9 |   const warcLists = fs.readdirSync("test-crawls/collections/rollover-500K/archive");
10 | 
11 |   let main = 0;
12 |   let screenshots = 0;
13 | 
14 |   for (const name of warcLists) {
15 |     if (name.startsWith("rec-")) {
16 |       main++;
17 |     } else if (name.startsWith("screenshots-")) {
18 |       screenshots++;
19 |     }
20 |   }
21 | 
22 |   // expect at least 6 main WARCs
23 |   expect(main).toBeGreaterThan(5);
24 | 
25 |   // expect at least 2 screenshot WARCs
26 |   expect(screenshots).toBeGreaterThan(1);
27 | 
28 | });
29 | 
--------------------------------------------------------------------------------
/tests/saved-state.test.js:
--------------------------------------------------------------------------------
1 | import { execSync } from "child_process";
2 | import fs from "fs";
3 | import path from "path";
4 | import yaml from "js-yaml";
5 | import Redis from "ioredis";
6 | 
7 | 
8 | const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
9 | const extraPagesFile = "test-crawls/collections/int-state-test/pages/extraPages.jsonl";
10 | 
11 | 
12 | function sleep(ms) {
13 |   return new Promise((resolve) => setTimeout(resolve, ms));
14 | }
15 | 
16 | async function waitContainerDone(containerId) {
17 |   // containerId is initially the full id, but docker ps
18 |   // only prints the short id (first 12 characters)
19 |   containerId 
= containerId.slice(0, 12); 20 | 21 | while (true) { 22 | try { 23 | const res = execSync("docker ps -q", { encoding: "utf-8" }); 24 | if (res.indexOf(containerId) < 0) { 25 | return; 26 | } 27 | } catch (e) { 28 | console.error(e); 29 | } 30 | await sleep(500); 31 | } 32 | } 33 | 34 | async function killContainer(containerId) { 35 | try { 36 | execSync(`docker kill -s SIGINT ${containerId}`); 37 | } catch (e) { 38 | return; 39 | } 40 | 41 | await waitContainerDone(containerId); 42 | } 43 | 44 | 45 | let savedStateFile; 46 | let state; 47 | let numDone; 48 | let numQueued; 49 | let finished; 50 | 51 | test("check crawl interrupted + saved state written", async () => { 52 | let containerId = null; 53 | 54 | try { 55 | containerId = execSync( 56 | "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url http://old.webrecorder.net/ --limit 10 --behaviors \"\" --exclude community", 57 | { encoding: "utf-8" }, 58 | //wait.callback, 59 | ); 60 | } catch (error) { 61 | console.log(error); 62 | } 63 | 64 | // remove existing pagesFile to support reentrancy 65 | try { 66 | fs.unlinkSync(pagesFile); 67 | } catch (e) { 68 | // ignore 69 | } 70 | 71 | while (true) { 72 | try { 73 | const pages = fs 74 | .readFileSync(pagesFile, { encoding: "utf-8" }) 75 | .trim() 76 | .split("\n"); 77 | 78 | if (pages.length >= 2) { 79 | break; 80 | } 81 | } catch (e) { 82 | // ignore 83 | } 84 | 85 | await sleep(500); 86 | } 87 | 88 | await killContainer(containerId); 89 | 90 | const savedStates = fs.readdirSync( 91 | "test-crawls/collections/int-state-test/crawls", 92 | ); 93 | expect(savedStates.length > 0).toEqual(true); 94 | 95 | savedStateFile = savedStates[savedStates.length - 1]; 96 | }); 97 | 98 | test("check parsing saved state + page done + queue present", () => { 99 | expect(savedStateFile).toBeTruthy(); 100 | 101 | const savedState = fs.readFileSync( 102 | path.join("test-crawls/collections/int-state-test/crawls", savedStateFile), 103 | "utf-8", 104 | ); 105 | 106 | const saved = yaml.load(savedState); 107 | 108 | state = saved.state; 109 | finished = state.finished; 110 | 111 | numDone = finished.length; 112 | numQueued = state.queued.length; 113 | 114 | expect(!!state).toBe(true); 115 | expect(numDone > 0).toEqual(true); 116 | expect(numQueued > 0).toEqual(true); 117 | expect(numDone + numQueued).toEqual(10); 118 | 119 | // ensure extra seeds also set 120 | expect(state.extraSeeds).toEqual([ 121 | `{"origSeedId":0,"newUrl":"https://old.webrecorder.net/"}`, 122 | ]); 123 | }); 124 | 125 | test("check crawl restarted with saved state", async () => { 126 | let containerId = null; 127 | 128 | const port = 36379; 129 | 130 | try { 131 | containerId = execSync( 132 | `docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://old.webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors "" --exclude community`, 133 | { encoding: "utf-8" }, 134 | ); 135 | } catch (error) { 136 | console.log(error); 137 | } 138 | 139 | await sleep(2000); 140 | 141 | const redis = new Redis(`redis://127.0.0.1:${port}/0`, { lazyConnect: true, retryStrategy: () => null }); 142 | 143 | try { 144 | await redis.connect({ 145 | maxRetriesPerRequest: 100, 146 | }); 147 | 148 | await sleep(2000); 149 | 150 | expect(await 
redis.get("test:d")).toBe(numDone + ""); 151 | 152 | for (const url of finished) { 153 | const res = await redis.sismember("test:s", url); 154 | expect(res).toBe(1); 155 | } 156 | } catch (e) { 157 | console.log(e); 158 | } finally { 159 | await waitContainerDone(containerId); 160 | } 161 | }); 162 | 163 | test("ensure correct number of pages was written to pages + extraPages", () => { 164 | const pages = fs 165 | .readFileSync(pagesFile, { encoding: "utf-8" }) 166 | .trim() 167 | .split("\n"); 168 | 169 | // first line is the header 170 | expect(pages.length).toBe(2); 171 | 172 | const extraPages = fs 173 | .readFileSync(extraPagesFile, { encoding: "utf-8" }) 174 | .trim() 175 | .split("\n"); 176 | 177 | // first line is the header 178 | expect(extraPages.length).toBe(10); 179 | }); 180 | -------------------------------------------------------------------------------- /tests/screenshot.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | // screenshot 5 | 6 | function screenshotWarcExists(name) { 7 | const warcList = fs.readdirSync(`test-crawls/collections/${name}/archive/`); 8 | 9 | for (const warc of warcList) { 10 | if (warc.startsWith("screenshots-")) { 11 | return true; 12 | } 13 | } 14 | 15 | return false; 16 | } 17 | 18 | 19 | test("ensure basic crawl run with --screenshot passes", async () => { 20 | child_process.execSync( 21 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection test-with-screenshots --url http://www.example.com/ --screenshot view --workers 2", 22 | ); 23 | }); 24 | 25 | test("check that a screenshots warc file exists in the test collection", () => { 26 | expect(screenshotWarcExists("test-with-screenshots")).toBe(true); 27 | }); 28 | 29 | // fullPageScreenshot 30 | 31 | test("ensure basic crawl run with --fullPageScreenshot passes", async () => { 32 | child_process.execSync( 33 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection fullpage --url http://www.example.com/ --screenshot fullPage --workers 2", 34 | ); 35 | }); 36 | 37 | test("check that a screenshots warc file exists in the fullpage collection", () => { 38 | expect(screenshotWarcExists("fullpage")).toBe(true); 39 | }); 40 | 41 | // thumbnail 42 | 43 | test("ensure basic crawl run with --thumbnail passes", async () => { 44 | child_process.execSync( 45 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection thumbnail --url http://www.example.com/ --screenshot thumbnail --workers 2", 46 | ); 47 | }); 48 | 49 | test("check that a screenshots warc file exists in the thumbnail collection", () => { 50 | expect(screenshotWarcExists("thumbnail")).toBe(true); 51 | }); 52 | 53 | // combination 54 | 55 | test("ensure basic crawl run with multiple screenshot types and --generateWACZ passes", async () => { 56 | child_process.execSync( 57 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection combined --url http://www.example.com/ --screenshot thumbnail,view,fullPage --generateWACZ --workers 2", 58 | ); 59 | }); 60 | 61 | test("check that a screenshots warc file exists in the combined collection", () => { 62 | expect(screenshotWarcExists("combined")).toBe(true); 63 | }); 64 | 65 | test("check that a wacz file exists in the combined collection", () => { 66 | const waczExists = fs.existsSync( 67 | "test-crawls/collections/combined/combined.wacz", 68 | ); 
69 | expect(waczExists).toBe(true); 70 | }); 71 | -------------------------------------------------------------------------------- /tests/seeds.test.js: -------------------------------------------------------------------------------- 1 | import util from "util"; 2 | import { exec as execCallback } from "child_process"; 3 | 4 | const exec = util.promisify(execCallback); 5 | 6 | test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set", async () => { 7 | let passed = true; 8 | try { 9 | await exec( 10 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://example.invalid --generateWACZ --limit 2 --collection invalidseed", 11 | ); 12 | } catch (error) { 13 | console.log(error); 14 | passed = false; 15 | } 16 | expect(passed).toBe(true); 17 | }); 18 | 19 | test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async () => { 20 | let passed = true; 21 | try { 22 | await exec( 23 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.invalid --generateWACZ --limit 2 --failOnFailedSeed --collection failseed", 24 | ); 25 | } catch (error) { 26 | expect(error.code).toEqual(1); 27 | passed = false; 28 | } 29 | expect(passed).toBe(false); 30 | }); 31 | 32 | test("ensure seed with network error fails crawl if failOnFailedSeed and failOnInvalidStatus is set", async () => { 33 | let passed = true; 34 | try { 35 | await exec( 36 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://example.invalid --generateWACZ --limit 2 --failOnFailedSeed --failOnInvalidStatus --collection failseedstatus", 37 | ); 38 | } catch (error) { 39 | expect(error.code).toEqual(1); 40 | passed = false; 41 | } 42 | expect(passed).toBe(false); 43 | }); 44 | 45 | test("ensure seed with 4xx/5xx response fails crawl if failOnFailedSeed and failOnInvalidStatus are set", async () => { 46 | let passed = true; 47 | try { 48 | await exec( 49 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://old.webrecorder.net/doesntexist --generateWACZ --limit 2 --failOnFailedSeed --failOnInvalidStatus --collection failseed404status", 50 | ); 51 | } catch (error) { 52 | expect(error.code).toEqual(1); 53 | passed = false; 54 | } 55 | expect(passed).toBe(false); 56 | }); 57 | 58 | test("ensure seed with 4xx/5xx response succeeds if failOnInvalidStatus is not set", async () => { 59 | let passed = true; 60 | try { 61 | await exec( 62 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://old.webrecorder.net/doesntexist --generateWACZ --limit 2 --failOnFailedSeed --collection failseedwithoutinvalidstatus", 63 | ); 64 | } catch (error) { 65 | console.log(error); 66 | passed = false; 67 | } 68 | expect(passed).toBe(true); 69 | }); 70 | 71 | test("ensure crawl fails if no valid seeds are passed", async () => { 72 | let passed = true; 73 | try { 74 | await exec( 75 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.invalid --generateWACZ --limit 2 --collection allinvalidseeds", 76 | ); 77 | } catch (error) { 78 | expect(error.code).toEqual(17); 79 | passed = false; 80 | } 81 | expect(passed).toBe(false); 82 | }); 83 | -------------------------------------------------------------------------------- /tests/sitemap-parse.test.js: 
-------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import Redis from "ioredis"; 3 | 4 | function sleep(ms) { 5 | return new Promise((resolve) => setTimeout(resolve, ms)); 6 | } 7 | 8 | async function waitContainer(containerId) { 9 | try { 10 | child_process.execSync(`docker kill -s SIGINT ${containerId}`); 11 | } catch (e) { 12 | return; 13 | } 14 | 15 | // containerId is initially the full id, but docker ps 16 | // only prints the short id (first 12 characters) 17 | containerId = containerId.slice(0, 12); 18 | 19 | while (true) { 20 | try { 21 | const res = child_process.execSync("docker ps -q", { encoding: "utf-8" }); 22 | if (res.indexOf(containerId) < 0) { 23 | return; 24 | } 25 | } catch (e) { 26 | console.error(e); 27 | } 28 | await sleep(500); 29 | } 30 | } 31 | 32 | async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") { 33 | const command = `docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`; 34 | const containerId = child_process.execSync(command, {encoding: "utf-8"}); 35 | 36 | await sleep(3000); 37 | 38 | const redis = new Redis("redis://127.0.0.1:36381/0", { lazyConnect: true, retryStrategy: () => null }); 39 | 40 | let finished = 0; 41 | 42 | try { 43 | await redis.connect({ 44 | maxRetriesPerRequest: 100, 45 | }); 46 | 47 | while (true) { 48 | finished = await redis.zcard("test:q"); 49 | 50 | if (await redis.get("test:sitemapDone")) { 51 | break; 52 | } 53 | if (finished >= numExpected) { 54 | break; 55 | } 56 | } 57 | } catch (e) { 58 | console.error(e); 59 | } finally { 60 | await waitContainer(containerId); 61 | } 62 | 63 | expect(finished).toBeGreaterThanOrEqual(numExpected); 64 | 65 | if (numExpectedLessThan) { 66 | expect(finished).toBeLessThanOrEqual(numExpectedLessThan); 67 | } 68 | } 69 | 70 | test("test sitemap fully finish", async () => { 71 | await runCrawl(3500, "https://developer.mozilla.org/", "", 0); 72 | }); 73 | 74 | test("test sitemap with limit", async () => { 75 | await runCrawl(1900, "https://developer.mozilla.org/", "", 2000); 76 | }); 77 | 78 | test("test sitemap with limit, specific URL", async () => { 79 | await runCrawl(1900, "https://developer.mozilla.org/", "https://developer.mozilla.org/sitemap.xml", 2000); 80 | }); 81 | 82 | test("test sitemap with application/xml content-type", async () => { 83 | await runCrawl(10, "https://bitarchivist.net/", "", 0); 84 | }); 85 | 86 | test("test sitemap with narrow scope, extraHops, to ensure out-of-scope sitemap URLs do not count as extraHops", async () => { 87 | await runCrawl(0, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page"); 88 | }); 89 | -------------------------------------------------------------------------------- /tests/storage.test.js: -------------------------------------------------------------------------------- 1 | import { 2 | calculatePercentageUsed, 3 | checkDiskUtilization, 4 | } from "../dist/util/storage.js"; 5 | 6 | test("ensure calculatePercentageUsed returns expected values", () => { 7 | expect(calculatePercentageUsed(30, 100)).toEqual(30); 8 | 9 | expect(calculatePercentageUsed(1507, 35750)).toEqual(4); 10 | 11 | expect(calculatePercentageUsed(33819, 35750)).toEqual(95); 12 | 13 | expect(calculatePercentageUsed(140, 70)).toEqual(200); 14 | 15 | expect(calculatePercentageUsed(0, 
5)).toEqual(0); 16 | }); 17 | 18 | test("verify end-to-end disk utilization not exceeded threshold", async () => { 19 | const params = { 20 | diskUtilization: 90, 21 | combineWARC: true, 22 | generateWACZ: true, 23 | }; 24 | 25 | const mockDfOutput = `\ 26 | Filesystem 1K-blocks Used Available Use% Mounted on 27 | grpcfuse 1000000 285000 715000 28% /crawls`; 28 | 29 | // with combineWARC + generateWACZ, projected is 285k + 4 * 5k = 310k = 31% 30 | // does not exceed 90% threshold 31 | const returnValue = await checkDiskUtilization( 32 | '/crawls', 33 | params, 34 | 5000 * 1024, 35 | mockDfOutput, 36 | false 37 | ); 38 | expect(returnValue).toEqual({ 39 | stop: false, 40 | used: 28, 41 | projected: 31, 42 | threshold: 90, 43 | }); 44 | }); 45 | 46 | test("verify end-to-end disk utilization exceeds threshold", async () => { 47 | const params = { 48 | diskUtilization: 90, 49 | combineWARC: false, 50 | generateWACZ: true, 51 | }; 52 | 53 | const mockDfOutput = `\ 54 | Filesystem 1K-blocks Used Available Use% Mounted on 55 | grpcfuse 100000 85000 15000 85% /crawls`; 56 | 57 | // with generateWACZ, projected is 85k + 3k x 2 = 91k = 91% 58 | // exceeds 90% threshold 59 | const returnValue = await checkDiskUtilization( 60 | '/crawls', 61 | params, 62 | 3000 * 1024, 63 | mockDfOutput, 64 | false 65 | ); 66 | expect(returnValue).toEqual({ 67 | stop: true, 68 | used: 85, 69 | projected: 91, 70 | threshold: 90, 71 | }); 72 | }); 73 | -------------------------------------------------------------------------------- /tests/text-extract.test.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import child_process from "child_process"; 3 | 4 | test("check that urn:text and urn:textfinal records are written to WARC", async () => { 5 | try { 6 | child_process.execSync( 7 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc", 8 | ); 9 | } catch (error) { 10 | //console.log(new TextDecoder().decode(error)); 11 | console.log(error.stderr); 12 | } 13 | 14 | const data = fs.readFileSync( 15 | "test-crawls/collections/text-extract/indexes/index.cdxj", 16 | { encoding: "utf-8" }, 17 | ); 18 | 19 | expect(data.indexOf("urn:text:https://www.nytimes.com/") > 0).toBe(true); 20 | 21 | expect(data.indexOf("urn:textFinal:https://www.nytimes.com/") > 0).toBe(true); 22 | }); 23 | -------------------------------------------------------------------------------- /tests/upload-wacz.test.js: -------------------------------------------------------------------------------- 1 | import { execSync, exec } from "child_process"; 2 | import fs from "fs"; 3 | import { Redis } from "ioredis"; 4 | 5 | 6 | const sleep = (ms) => new Promise((res) => setTimeout(res, ms)); 7 | 8 | let minioId; 9 | 10 | beforeAll(() => { 11 | execSync("docker network create upload-test-net"); 12 | minioId = execSync("docker run --rm -d -p 9000:9000 -p 9001:9001 --name minio --network=upload-test-net minio/minio server /data --console-address ':9001'", {encoding: "utf-8"}); 13 | }); 14 | 15 | 16 | afterAll(async () => { 17 | execSync(`docker kill -s SIGINT ${minioId}`); 18 | await sleep(5000); 19 | execSync("docker network rm upload-test-net"); 20 | }); 21 | 22 | test("run crawl with upload", async () => { 23 | 24 | execSync(`docker exec ${minioId.trim()} mc mb /data/test-bucket`); 25 | 26 | const child = exec( 27 | "docker run --rm " + 28 | "-e 
STORE_ENDPOINT_URL=http://minio:9000/test-bucket/ " + 29 | "-e STORE_ACCESS_KEY=minioadmin " + 30 | "-e STORE_SECRET_KEY=minioadmin " + 31 | "-e STORE_PATH=prefix/ " + 32 | "--network=upload-test-net " + 33 | "-p 36390:6379 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 2 --collection upload-test --crawlId upload-test --writePagesToRedis --debugAccessRedis --generateWACZ", 34 | ); 35 | 36 | let resolve = null; 37 | const crawlFinished = new Promise(r => resolve = r); 38 | 39 | // detect crawler exit 40 | let crawler_exited = false; 41 | child.on("exit", function () { 42 | crawler_exited = true; 43 | resolve(); 44 | }); 45 | 46 | const redis = new Redis("redis://127.0.0.1:36390/0", { lazyConnect: true, retryStrategy: () => null }); 47 | 48 | await sleep(3000); 49 | 50 | await redis.connect({ maxRetriesPerRequest: 50 }); 51 | 52 | let filename; 53 | 54 | while (!crawler_exited) { 55 | const res = await redis.lpop("upload-test:pages"); 56 | if (!res) { 57 | await sleep(100); 58 | continue; 59 | } 60 | const json = JSON.parse(res); 61 | expect(json).toHaveProperty("id"); 62 | expect(json).toHaveProperty("url"); 63 | expect(json).toHaveProperty("ts"); 64 | expect(json).toHaveProperty("title"); 65 | expect(json).toHaveProperty("loadState"); 66 | expect(json).toHaveProperty("filename"); 67 | expect(json).toHaveProperty("depth"); 68 | expect(json).toHaveProperty("seed"); 69 | expect(json).toHaveProperty("favIconUrl"); 70 | filename = json.filename; 71 | break; 72 | } 73 | 74 | // ensure bucket is public 75 | execSync(`docker exec ${minioId.trim()} mc config host add local http://127.0.0.1:9000 minioadmin minioadmin`); 76 | execSync(`docker exec ${minioId.trim()} mc anonymous set download local/test-bucket`); 77 | 78 | // wait for crawler to finish 79 | await crawlFinished; 80 | 81 | // ensure WACZ exists at the specified filename 82 | const resp = await fetch(`http://127.0.0.1:9000/test-bucket/prefix/${filename}`); 83 | expect(resp.status).toBe(200); 84 | }); 85 | -------------------------------------------------------------------------------- /tests/url_file_list.test.js: -------------------------------------------------------------------------------- 1 | import util from "util"; 2 | import { exec as execCallback } from "child_process"; 3 | import fs from "fs"; 4 | 5 | const exec = util.promisify(execCallback); 6 | 7 | test("check that URLs in seed-list are crawled", async () => { 8 | try { 9 | await exec( 10 | "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000", 11 | ); 12 | } catch (error) { 13 | console.log(error); 14 | } 15 | 16 | let crawled_pages = fs.readFileSync( 17 | "test-crawls/collections/filelisttest/pages/pages.jsonl", 18 | "utf8", 19 | ); 20 | let seed_file = fs 21 | .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8") 22 | .split("\n") 23 | .sort(); 24 | 25 | let seed_file_list = []; 26 | for (var j = 0; j < seed_file.length; j++) { 27 | if (seed_file[j] != undefined) { 28 | seed_file_list.push(seed_file[j]); 29 | } 30 | } 31 | 32 | let foundSeedUrl = true; 33 | 34 | for (var i = 1; i < seed_file_list.length; i++) { 35 | if (crawled_pages.indexOf(seed_file_list[i]) == -1) { 36 | foundSeedUrl = false; 37 | } 38 | } 39 | expect(foundSeedUrl).toBe(true); 40 | }); 41 | -------------------------------------------------------------------------------- /tests/warcinfo.test.js: 
-------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import zlib from "zlib"; 3 | import path from "path"; 4 | import child_process from "child_process"; 5 | 6 | test("run crawl", async() => { 7 | let success = false; 8 | 9 | try { 10 | const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8"); 11 | const proc = child_process.execSync( 12 | "docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", 13 | { input: configYaml, stdin: "inherit", encoding: "utf8" }, 14 | ); 15 | 16 | //console.log(proc); 17 | success = true; 18 | } catch (error) { 19 | console.log(error); 20 | } 21 | 22 | expect(success).toBe(true); 23 | }); 24 | 25 | test("check that the warcinfo for individual WARC is as expected", async () => { 26 | 27 | const warcs = fs.readdirSync("test-crawls/collections/warcinfo/archive/"); 28 | 29 | let filename = ""; 30 | 31 | for (const name of warcs) { 32 | if (name.startsWith("rec-")) { 33 | filename = path.join("test-crawls/collections/warcinfo/archive/", name); 34 | break; 35 | } 36 | } 37 | 38 | const warcData = fs.readFileSync(filename); 39 | 40 | const data = zlib.gunzipSync(warcData); 41 | 42 | const string = data.toString("utf8"); 43 | 44 | expect(string.indexOf("operator: test")).toBeGreaterThan(-1); 45 | expect(string.indexOf("host: hostname")).toBeGreaterThan(-1); 46 | expect( 47 | string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/), 48 | ).not.toEqual(null); 49 | expect(string.indexOf("format: WARC File Format 1.1")).toBeGreaterThan(-1); 50 | }); 51 | 52 | test("check that the warcinfo for combined WARC file is as expected", async () => { 53 | const warcData = fs.readFileSync( 54 | "test-crawls/collections/warcinfo/warcinfo_0.warc.gz", 55 | ); 56 | 57 | const data = zlib.gunzipSync(warcData); 58 | 59 | const string = data.toString("utf8"); 60 | 61 | expect(string.indexOf("operator: test")).toBeGreaterThan(-1); 62 | expect(string.indexOf("host: hostname")).toBeGreaterThan(-1); 63 | expect( 64 | string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/), 65 | ).not.toEqual(null); 66 | expect(string.indexOf("format: WARC File Format 1.1")).toBeGreaterThan(-1); 67 | }); 68 | -------------------------------------------------------------------------------- /tsconfig.eslint.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "noEmit": true 4 | }, 5 | "extends": "./tsconfig.json", 6 | "include": ["**/*.ts", "**/*.js", ".*.js"], 7 | "exclude": ["dist", "configs", "crawls"] 8 | } 9 | --------------------------------------------------------------------------------
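
A closing note on the warcinfo checks above: the same warcio WARCParser API already used in pageinfo-records.test.js can read the gzipped WARC directly, as an alternative to the gunzip-plus-indexOf string matching. A hedged sketch, where the helper name and the key/value parsing of the warcinfo payload are assumptions for illustration:

import fs from "fs";
import { WARCParser } from "warcio";

// Sketch: collect the fields of the first warcinfo record in a (gzipped)
// WARC file as a plain object. The warcinfo payload is a block of
// "key: value" lines, e.g. "operator: test" or "format: WARC File Format 1.1".
async function readWarcinfoFields(warcPath) {
  const parser = new WARCParser(fs.createReadStream(warcPath));
  for await (const record of parser) {
    if (record.warcType === "warcinfo") {
      const text = await record.contentText();
      return Object.fromEntries(
        text
          .trim()
          .split("\n")
          .map((line) => line.split(": "))
          .map(([key, ...rest]) => [key, rest.join(": ")]),
      );
    }
  }
  return null;
}

// illustrative usage against the combined WARC checked above:
// const fields = await readWarcinfoFields("test-crawls/collections/warcinfo/warcinfo_0.warc.gz");
// expect(fields.operator).toBe("test");
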