├── .dockerignore ├── .eslintignore ├── .eslintrc.cjs ├── .github └── workflows │ ├── ci.yaml │ ├── deploy-dev-channel.yaml │ ├── docs-publish.yaml │ ├── make-draft-release.yaml │ └── release.yaml ├── .gitignore ├── .husky └── pre-commit ├── .pre-commit-config.yaml ├── .prettierignore ├── .prettierrc ├── CHANGES.md ├── Dockerfile ├── LICENSE ├── NOTICE ├── README.md ├── config └── policies │ ├── brave.json │ └── chromium.json ├── docker-compose.yml ├── docker-entrypoint.sh ├── docs ├── docs │ ├── CNAME │ ├── assets │ │ ├── brand │ │ │ ├── browsertrix-crawler-icon-color-dynamic.svg │ │ │ └── browsertrix-crawler-white.svg │ │ └── fonts │ │ │ ├── Inter-Italic.var.woff2 │ │ │ ├── Inter.var.woff2 │ │ │ └── Recursive_VF_1.084.woff2 │ ├── develop │ │ ├── docs.md │ │ └── index.md │ ├── index.md │ ├── overrides │ │ ├── .icons │ │ │ └── bootstrap │ │ │ │ ├── bug-fill.svg │ │ │ │ ├── chat-left-text-fill.svg │ │ │ │ ├── check-circle-fill.svg │ │ │ │ ├── check-circle.svg │ │ │ │ ├── dash-circle.svg │ │ │ │ ├── exclamation-circle-fill.svg │ │ │ │ ├── exclamation-diamond-fill.svg │ │ │ │ ├── exclamation-triangle-fill.svg │ │ │ │ ├── exclamation-triangle.svg │ │ │ │ ├── eye.svg │ │ │ │ ├── file-earmark-text-fill.svg │ │ │ │ ├── github.svg │ │ │ │ ├── globe.svg │ │ │ │ ├── info-circle-fill.svg │ │ │ │ ├── mastodon.svg │ │ │ │ ├── mortarboard-fill.svg │ │ │ │ ├── pencil-fill.svg │ │ │ │ ├── pencil.svg │ │ │ │ ├── question-circle-fill.svg │ │ │ │ ├── quote.svg │ │ │ │ ├── x-octagon-fill.svg │ │ │ │ ├── x-octagon.svg │ │ │ │ └── youtube.svg │ │ └── main.html │ ├── stylesheets │ │ └── extra.css │ └── user-guide │ │ ├── behaviors.md │ │ ├── browser-profiles.md │ │ ├── cli-options.md │ │ ├── common-options.md │ │ ├── crawl-scope.md │ │ ├── exit-codes.md │ │ ├── index.md │ │ ├── outputs.md │ │ ├── proxies.md │ │ ├── qa.md │ │ └── yaml-config.md ├── gen-cli.sh └── mkdocs.yml ├── html ├── createProfile.html ├── replay.html ├── screencast.html └── vnc_lite.html ├── package.json ├── requirements.txt ├── src ├── crawler.ts ├── create-login-profile.ts ├── main.ts ├── replaycrawler.ts └── util │ ├── argParser.ts │ ├── blockrules.ts │ ├── browser.ts │ ├── constants.ts │ ├── file_reader.ts │ ├── flowbehavior.ts │ ├── healthcheck.ts │ ├── logger.ts │ ├── originoverride.ts │ ├── proxy.ts │ ├── recorder.ts │ ├── redis.ts │ ├── replayserver.ts │ ├── reqresp.ts │ ├── screencaster.ts │ ├── screenshots.ts │ ├── seeds.ts │ ├── sitemapper.ts │ ├── state.ts │ ├── storage.ts │ ├── textextract.ts │ ├── timing.ts │ ├── wacz.ts │ ├── warcwriter.ts │ └── worker.ts ├── test-setup.js ├── tests ├── .DS_Store ├── adblockrules.test.js ├── add-exclusion.test.js ├── basic_crawl.test.js ├── blockrules.test.js ├── brave-query-redir.test.js ├── collection_name.test.js ├── config_file.test.js ├── config_stdin.test.js ├── crawl_overwrite.js ├── custom-behavior-flow.test.js ├── custom-behavior.test.js ├── custom-behaviors │ ├── custom-2.js │ ├── custom-flow.json │ └── custom.js ├── custom_driver.test.js ├── custom_selector.test.js ├── dryrun.test.js ├── exclude-redirected.test.js ├── extra_hops_depth.test.js ├── file_stats.test.js ├── fixtures │ ├── crawl-1.yaml │ ├── crawl-2.yaml │ ├── driver-1.mjs │ ├── pages.jsonl │ ├── proxy-key │ ├── proxy-key.pub │ └── urlSeedFile.txt ├── http-auth.test.js ├── invalid-behaviors │ └── invalid-export.js ├── lang-code.test.js ├── limit_reached.test.js ├── log_filtering.test.js ├── mult_url_crawl_with_favicon.test.js ├── multi-instance-crawl.test.js ├── non-html-crawl.test.js ├── pageinfo-records.test.js ├── 
proxy.test.js ├── qa_compare.test.js ├── retry-failed.test.js ├── rollover-writer.test.js ├── saved-state.test.js ├── scopes.test.js ├── screenshot.test.js ├── seeds.test.js ├── sitemap-parse.test.js ├── storage.test.js ├── text-extract.test.js ├── upload-wacz.test.js ├── url_file_list.test.js └── warcinfo.test.js ├── tsconfig.eslint.json ├── tsconfig.json └── yarn.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | output/ 2 | node_modules/ 3 | crawls/ 4 | test-crawls/ 5 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | .* 2 | behaviors.js 3 | behaviors/ 4 | scratch/ 5 | -------------------------------------------------------------------------------- /.eslintrc.cjs: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | env: { 3 | browser: true, 4 | es2021: true, 5 | node: true, 6 | jest: true, 7 | }, 8 | extends: [ 9 | "eslint:recommended", 10 | "plugin:@typescript-eslint/recommended", 11 | "prettier", 12 | ], 13 | parser: "@typescript-eslint/parser", 14 | plugins: ["@typescript-eslint"], 15 | parserOptions: { 16 | ecmaVersion: 12, 17 | sourceType: "module", 18 | project: ["./tsconfig.eslint.json"], 19 | tsconfigRootDir: __dirname, 20 | }, 21 | rules: { 22 | "no-constant-condition": ["error", { checkLoops: false }], 23 | "no-use-before-define": [ 24 | "error", 25 | { 26 | variables: true, 27 | functions: false, 28 | classes: false, 29 | allowNamedExports: true, 30 | }, 31 | ], 32 | "@typescript-eslint/no-floating-promises": "error", 33 | "@typescript-eslint/await-thenable": "error" 34 | }, 35 | reportUnusedDisableDirectives: true, 36 | }; 37 | -------------------------------------------------------------------------------- /.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Node.js CI 2 | 3 | on: 4 | push: 5 | pull_request: 6 | 7 | # Cancel in progress workflows on pull_requests. 
8 | # https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-a-fallback-value 9 | concurrency: 10 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 11 | cancel-in-progress: true 12 | 13 | jobs: 14 | lint: 15 | runs-on: ubuntu-latest 16 | 17 | strategy: 18 | matrix: 19 | node-version: [20.x] 20 | 21 | steps: 22 | - uses: actions/checkout@v3 23 | - name: Use Node.js ${{ matrix.node-version }} 24 | uses: actions/setup-node@v3 25 | with: 26 | node-version: ${{ matrix.node-version }} 27 | - name: install requirements 28 | run: yarn install 29 | - name: run linter 30 | run: yarn lint && yarn format 31 | 32 | build: 33 | runs-on: ubuntu-latest 34 | 35 | strategy: 36 | matrix: 37 | node-version: [20.x] 38 | 39 | steps: 40 | - uses: actions/checkout@v3 41 | 42 | - name: Use Node.js ${{ matrix.node-version }} 43 | uses: actions/setup-node@v3 44 | with: 45 | node-version: ${{ matrix.node-version }} 46 | 47 | - uses: actions/setup-python@v4 48 | with: 49 | python-version: 3.x 50 | 51 | - name: install requirements 52 | run: yarn install 53 | 54 | - name: build js 55 | run: yarn run tsc 56 | 57 | - name: Cache Docker Images 58 | uses: ScribeMD/docker-cache@0.5.0 59 | with: 60 | key: docker-${{ runner.os }}-${{ hashFiles('Dockerfile') }} 61 | 62 | - name: Login to DockerHub 63 | uses: docker/login-action@v3 64 | with: 65 | username: ${{ secrets.DOCKER_USERNAME }} 66 | password: ${{ secrets.DOCKER_PASSWORD }} 67 | 68 | - name: build docker 69 | run: docker compose build 70 | 71 | - name: install python deps for docs 72 | run: pip install mkdocs-material 73 | 74 | - name: build docs for crawl test 75 | run: cd docs/ && mkdocs build 76 | 77 | - name: add http-server for tests 78 | run: yarn add -D http-server 79 | 80 | - name: install py-wacz as root for tests 81 | run: sudo pip install wacz --ignore-installed 82 | 83 | - name: run all tests as root 84 | run: sudo DOCKER_HOST_NAME=172.17.0.1 CI=true yarn test -validate 85 | 86 | - name: run saved state + qa compare test as non-root - with volume owned by current user 87 | run: | 88 | sudo rm -rf ./test-crawls 89 | mkdir test-crawls 90 | sudo CI=true yarn test ./tests/saved-state.test.js ./tests/qa_compare.test.js 91 | -------------------------------------------------------------------------------- /.github/workflows/deploy-dev-channel.yaml: -------------------------------------------------------------------------------- 1 | name: "*** Deploy Crawler to Dev Channel ***" 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | channel: 7 | description: Crawler Channel 8 | type: choice 9 | required: true 10 | default: dev 11 | options: 12 | - dev 13 | - dev-2 14 | 15 | jobs: 16 | build_and_deploy_crawler: 17 | runs-on: ubuntu-latest 18 | steps: 19 | - name: Checkout 20 | uses: actions/checkout@v3 21 | 22 | - name: Set up Docker Buildx 23 | uses: docker/setup-buildx-action@v2 24 | with: 25 | driver-opts: network=host 26 | 27 | - name: Login to Registry 28 | uses: docker/login-action@v2 29 | with: 30 | registry: ${{ secrets.DEPLOY_REGISTRY }} 31 | username: ${{ secrets.DEPLOY_REGISTRY_API_TOKEN }} 32 | password: ${{ secrets.DEPLOY_REGISTRY_API_TOKEN }} 33 | 34 | - name: Build Image 35 | uses: docker/build-push-action@v3 36 | with: 37 | context: . 
38 | push: true 39 | tags: ${{ secrets.DEPLOY_REGISTRY_PATH }}/webrecorder/browsertrix-crawler:${{ github.event.inputs.channel }} 40 | cache-from: type=gha,scope=backend 41 | cache-to: type=gha,scope=backend,mode=max 42 | -------------------------------------------------------------------------------- /.github/workflows/docs-publish.yaml: -------------------------------------------------------------------------------- 1 | name: docs-publish 2 | on: 3 | push: 4 | branches: 5 | - main 6 | paths: 7 | - 'docs/**' 8 | 9 | permissions: 10 | contents: write 11 | 12 | jobs: 13 | deploy_docs: 14 | runs-on: ubuntu-latest 15 | steps: 16 | - uses: actions/checkout@v3 17 | - uses: actions/setup-python@v4 18 | with: 19 | python-version: 3.x 20 | 21 | - name: build docker image (for getting cli) 22 | run: docker compose build 23 | 24 | - name: generate cli 25 | run: docs/gen-cli.sh 26 | 27 | - run: pip install mkdocs-material 28 | - run: cd docs/ && mkdocs gh-deploy --force 29 | -------------------------------------------------------------------------------- /.github/workflows/make-draft-release.yaml: -------------------------------------------------------------------------------- 1 | name: Generate Draft Release 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - "*-release" 8 | 9 | jobs: 10 | package_chart: 11 | runs-on: ubuntu-latest 12 | 13 | steps: 14 | - name: Check out Git repository 15 | uses: actions/checkout@v3 16 | 17 | - name: Get Version 18 | run: | 19 | echo "version=$(jq -r .version package.json)" >> "$GITHUB_ENV" 20 | 21 | - name: Make Draft Release 22 | uses: softprops/action-gh-release@v1 23 | with: 24 | name: "Browsertrix Crawler v${{ env.version }}" 25 | tag_name: v${{ env.version }} 26 | draft: true 27 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Docker image 2 | on: 3 | release: 4 | types: [published] 5 | 6 | jobs: 7 | push_to_registries: 8 | name: Build x86 and ARM Images and push to Dockerhub 9 | runs-on: ubuntu-22.04 10 | steps: 11 | - name: Check out the repo 12 | uses: actions/checkout@v4 13 | 14 | - name: Docker image metadata 15 | id: meta 16 | uses: docker/metadata-action@v5 17 | with: 18 | images: webrecorder/browsertrix-crawler 19 | tags: | 20 | type=semver,pattern={{version}} 21 | 22 | - name: Set up QEMU 23 | uses: docker/setup-qemu-action@v3 24 | with: 25 | platforms: arm64 26 | 27 | - name: Set up Docker Buildx 28 | uses: docker/setup-buildx-action@v1 29 | - name: Login to DockerHub 30 | uses: docker/login-action@v3 31 | with: 32 | username: ${{ secrets.DOCKER_USERNAME }} 33 | password: ${{ secrets.DOCKER_PASSWORD }} 34 | - name: Build and push 35 | id: docker_build 36 | uses: docker/build-push-action@v3 37 | with: 38 | context: . 
39 | push: true 40 | tags: ${{ steps.meta.outputs.tags }} 41 | platforms: "linux/amd64,linux/arm64" 42 | - name: Image digest 43 | run: echo ${{ steps.docker_build.outputs.digest }} 44 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | __pycache__ 3 | *.egg-info/ 4 | collections/ 5 | node_modules/ 6 | crawls/ 7 | test-crawls/ 8 | .DS_Store 9 | dist 10 | scratch/ 11 | venv/ 12 | docs/venv/ 13 | -------------------------------------------------------------------------------- /.husky/pre-commit: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env sh 2 | . "$(dirname -- "$0")/_/husky.sh" 3 | yarn lint:fix 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: local 3 | hooks: 4 | - id: husky-run-pre-commit 5 | name: husky 6 | language: system 7 | entry: .husky/pre-commit 8 | pass_filenames: false 9 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | dist 2 | scratch 3 | crawls 4 | test-crawls 5 | -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | {} 2 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BROWSER_VERSION=1.79.118 2 | ARG BROWSER_IMAGE_BASE=webrecorder/browsertrix-browser-base:brave-${BROWSER_VERSION} 3 | 4 | FROM ${BROWSER_IMAGE_BASE} 5 | 6 | # needed to add args to main build stage 7 | ARG BROWSER_VERSION 8 | 9 | ENV GEOMETRY=1360x1020x16 \ 10 | BROWSER_VERSION=${BROWSER_VERSION} \ 11 | BROWSER_BIN=google-chrome \ 12 | OPENSSL_CONF=/app/openssl.conf \ 13 | VNC_PASS=vncpassw0rd! 
\ 14 | DETACHED_CHILD_PROC=1 15 | 16 | EXPOSE 9222 9223 6080 17 | 18 | WORKDIR /app 19 | 20 | ADD package.json yarn.lock /app/ 21 | 22 | # to allow forcing rebuilds from this stage 23 | ARG REBUILD 24 | 25 | # Download and format ad host blocklist as JSON 26 | RUN mkdir -p /tmp/ads && cd /tmp/ads && \ 27 | curl -vs -o ad-hosts.txt https://raw.githubusercontent.com/StevenBlack/hosts/master/hosts && \ 28 | cat ad-hosts.txt | grep '^0.0.0.0 '| awk '{ print $2; }' | grep -v '0.0.0.0' | jq --raw-input --slurp 'split("\n")' > /app/ad-hosts.json && \ 29 | rm /tmp/ads/ad-hosts.txt 30 | 31 | RUN yarn install --network-timeout 1000000 32 | 33 | ADD tsconfig.json /app/ 34 | ADD src /app/src 35 | 36 | RUN yarn run tsc 37 | 38 | ADD config/ /app/ 39 | 40 | ADD html/ /app/html/ 41 | 42 | ARG RWP_VERSION=2.3.7 43 | ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/ui.js /app/html/rwp/ 44 | ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/sw.js /app/html/rwp/ 45 | ADD https://cdn.jsdelivr.net/npm/replaywebpage@${RWP_VERSION}/adblock/adblock.gz /app/html/rwp/adblock.gz 46 | 47 | RUN chmod a+x /app/dist/main.js /app/dist/create-login-profile.js && chmod a+r /app/html/rwp/* 48 | 49 | RUN ln -s /app/dist/main.js /usr/bin/crawl; \ 50 | ln -s /app/dist/main.js /usr/bin/qa; \ 51 | ln -s /app/dist/create-login-profile.js /usr/bin/create-login-profile 52 | 53 | RUN mkdir -p /app/behaviors 54 | 55 | WORKDIR /crawls 56 | 57 | # enable to test custom behaviors build (from browsertrix-behaviors) 58 | # COPY behaviors.js /app/node_modules/browsertrix-behaviors/dist/behaviors.js 59 | 60 | # add brave/chromium group policies 61 | RUN mkdir -p /etc/brave/policies/managed/ 62 | ADD config/policies /etc/brave/policies/managed/ 63 | 64 | ADD docker-entrypoint.sh /docker-entrypoint.sh 65 | ENTRYPOINT ["/docker-entrypoint.sh"] 66 | 67 | CMD ["crawl"] 68 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | browsertrix-mini 2 | 3 | Copyright (C) 2020 Webrecorder Software 4 | 5 | This program is free software: you can redistribute it and/or modify 6 | it under the terms of the GNU Affero General Public License as published by 7 | the Free Software Foundation, either version 3 of the License, or 8 | (at your option) any later version. 9 | 10 | This program is distributed in the hope that it will be useful, 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 | GNU Affero General Public License for more details. 14 | 15 | You should have received a copy of the GNU Affero General Public License 16 | along with this program. If not, see . 17 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Browsertrix Crawler 1.x 2 | 3 | Browsertrix Crawler is a standalone browser-based high-fidelity crawling system, designed to run a complex, customizable browser-based crawl in a single Docker container. Browsertrix Crawler uses [Puppeteer](https://github.com/puppeteer/puppeteer) to control one or more [Brave Browser](https://brave.com/) browser windows in parallel. Data is captured through the [Chrome Devtools Protocol (CDP)](https://chromedevtools.github.io/devtools-protocol/) in the browser. 
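For a quick first crawl, a single `docker run` invocation along the lines of the User Guide examples is enough. The command below is illustrative only; the URL and collection name are placeholders:

```sh
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --collection example
```

The resulting WACZ file is written under `./crawls/collections/example/` and can be opened in ReplayWeb.page.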
4 | 5 | For information on how to use and develop Browsertrix Crawler, see the hosted [Browsertrix Crawler documentation](https://crawler.docs.browsertrix.com). 6 | 7 | For information on how to build the docs locally, see the [docs page](docs/docs/develop/docs.md). 8 | 9 | 10 | ## Support 11 | Initial support for 0.x version of Browsertrix Crawler, was provided by [Kiwix](https://kiwix.org/). The initial functionality for Browsertrix Crawler was developed to support the [zimit](https://github.com/openzim/zimit) project in a collaboration between Webrecorder and Kiwix, and this project has been split off from Zimit into a core component of Webrecorder. 12 | 13 | Additional support for Browsertrix Crawler, including for the development of the 0.4.x version has been provided by [Portico](https://www.portico.org/). 14 | 15 | ## License 16 | 17 | [AGPLv3](https://www.gnu.org/licenses/agpl-3.0) or later, see [LICENSE](LICENSE) for more details. 18 | -------------------------------------------------------------------------------- /config/policies/brave.json: -------------------------------------------------------------------------------- 1 | { 2 | "BraveRewardsDisabled": true, 3 | "BraveWalletDisabled": true, 4 | "BraveVPNDisabled": 1, 5 | "BraveAIChatEnabled": false, 6 | "TorDisabled": true 7 | } 8 | -------------------------------------------------------------------------------- /config/policies/chromium.json: -------------------------------------------------------------------------------- 1 | { 2 | "AlwaysOpenPdfExternally": true, 3 | "NewTabPageLocation": "about:blank", 4 | "RestoreOnStartup": 5, 5 | "IncognitoModeAvailability": 1, 6 | "AllowFileSelectionDialogs": false, 7 | "AutoLaunchProtocolsFromOrigins": [{ 8 | "allowed_origins":["https://t.me"], 9 | "protocol": "tg" 10 | }], 11 | "URLBlocklist": [ 12 | "file://*" 13 | ], 14 | "DownloadDirectory": "/dev/null", 15 | "SpellcheckEnabled": false, 16 | "HttpsUpgradesEnabled": false 17 | } 18 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.5" 2 | 3 | services: 4 | crawler: 5 | image: ${REGISTRY}webrecorder/browsertrix-crawler:latest 6 | build: 7 | context: ./ 8 | 9 | volumes: 10 | - ./crawls:/crawls 11 | 12 | cap_add: 13 | - NET_ADMIN 14 | - SYS_ADMIN 15 | 16 | shm_size: 1gb 17 | -------------------------------------------------------------------------------- /docker-entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # disable core dumps 4 | ulimit -c 0 5 | 6 | # Get UID/GID from volume dir 7 | 8 | VOLUME_UID=$(stat -c '%u' /crawls) 9 | VOLUME_GID=$(stat -c '%g' /crawls) 10 | 11 | # Get the UID/GID we are running as 12 | 13 | MY_UID=$(id -u) 14 | MY_GID=$(id -g) 15 | 16 | # If we aren't running as the owner of the /crawls/ dir then add a new user 17 | # btrix with the same UID/GID of the /crawls dir and run as that user instead. 
18 | 19 | if [ "$MY_GID" != "$VOLUME_GID" ] || [ "$MY_UID" != "$VOLUME_UID" ]; then 20 | groupadd btrix 21 | groupmod -o --gid $VOLUME_GID btrix 22 | 23 | useradd -ms /bin/bash -g $VOLUME_GID btrix 24 | usermod -o -u $VOLUME_UID btrix > /dev/null 25 | 26 | exec gosu btrix:btrix "$@" 27 | else 28 | exec "$@" 29 | fi 30 | 31 | -------------------------------------------------------------------------------- /docs/docs/CNAME: -------------------------------------------------------------------------------- 1 | crawler.docs.browsertrix.com 2 | -------------------------------------------------------------------------------- /docs/docs/assets/brand/browsertrix-crawler-icon-color-dynamic.svg: -------------------------------------------------------------------------------- 1 | 3 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /docs/docs/assets/brand/browsertrix-crawler-white.svg: -------------------------------------------------------------------------------- 1 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | -------------------------------------------------------------------------------- /docs/docs/assets/fonts/Inter-Italic.var.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/d2a6aa9805fa08c9a51b43005e0a562a032fd78a/docs/docs/assets/fonts/Inter-Italic.var.woff2 -------------------------------------------------------------------------------- /docs/docs/assets/fonts/Inter.var.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/d2a6aa9805fa08c9a51b43005e0a562a032fd78a/docs/docs/assets/fonts/Inter.var.woff2 -------------------------------------------------------------------------------- /docs/docs/assets/fonts/Recursive_VF_1.084.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/d2a6aa9805fa08c9a51b43005e0a562a032fd78a/docs/docs/assets/fonts/Recursive_VF_1.084.woff2 -------------------------------------------------------------------------------- /docs/docs/develop/docs.md: -------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | This documentation is built with the [Mkdocs](https://www.mkdocs.org/) static site generator. 4 | 5 | ## Docs Setup 6 | 7 | Python is required to build the docs, then run: 8 | 9 | pip install mkdocs-material 10 | 11 | 12 | ## Docs Server 13 | 14 | To start the docs server, simply run: 15 | 16 | mkdocs serve 17 | 18 | The documentation will then be available on `http://localhost:8000/` 19 | 20 | The command-line options are rebuilt using the `docs/gen-cli.sh` script. 21 | 22 | Refer to the [Mkdocs](https://www.mkdocs.org/) and [Material for MkDocs](https://squidfunk.github.io/mkdocs-material/) pages 23 | for more info about the documentation. 24 | -------------------------------------------------------------------------------- /docs/docs/develop/index.md: -------------------------------------------------------------------------------- 1 | # Development 2 | 3 | ## Usage with Docker Compose 4 | 5 | Many examples in User Guide demonstrate running Browsertrix Crawler with `docker run`. 6 | 7 | Docker Compose is recommended for building the image and for simple configurations. A simple Docker Compose configuration file is included in the Git repository. 
8 | 9 | To build the latest image, run: 10 | 11 | ```sh 12 | docker-compose build 13 | ``` 14 | 15 | Docker Compose also simplifies some config options, such as mounting the volume for the crawls. 16 | 17 | The following command starts a crawl with 2 workers and generates the CDX: 18 | 19 | ```sh 20 | docker-compose run crawler crawl --url https://webrecorder.net/ --generateCDX --collection wr-net --workers 2 21 | ``` 22 | 23 | In this example, the crawl data is written to `./crawls/collections/wr-net` by default. 24 | 25 | While the crawl is running, the status of the crawl prints the progress to the JSON-L log output. This can be disabled by using the `--logging` option and not including `stats`. 26 | 27 | ## Multi-Platform Build / Support for Apple Silicon 28 | 29 | Browsertrix Crawler uses a browser image which supports amd64 and arm64. 30 | 31 | This means Browsertrix Crawler can be built natively on Apple Silicon systems using the default settings. Running `docker-compose build` on an Apple Silicon should build a native version that should work for development. 32 | 33 | ## Modifying Browser Image 34 | 35 | It is also possible to build Browsertrix Crawler with a different browser image. Currently, browser images using Brave Browser and Chrome/Chromium (depending on host system chip architecture) are supported via [browsertrix-browser-base](https://github.com/webrecorder/browsertrix-browser-base), however, only Brave Browser receives regular version updates from us. 36 | 37 | The browser base image used is specified and can be changed at the top of the Dockerfile in the Browsertrix Crawler repo. 38 | 39 | Custom browser images can be used by forking [browsertrix-browser-base](https://github.com/webrecorder/browsertrix-browser-base), locally building or publishing an image, and then modifying the Dockerfile in this repo to build from that image. 40 | -------------------------------------------------------------------------------- /docs/docs/index.md: -------------------------------------------------------------------------------- 1 | --- 2 | hide: 3 | - navigation 4 | - toc 5 | --- 6 | 7 | # Home 8 | 9 | Welcome to the Browsertrix Crawler official documentation. 10 | 11 | Browsertrix Crawler is a simplified browser-based high-fidelity crawling system, designed to run a complex, customizable browser-based crawl in a single Docker container. Browsertrix Crawler uses [Puppeteer](https://github.com/puppeteer/puppeteer) to control one or more [Brave Browser](https://brave.com/) browser windows in parallel. Data is captured through the [Chrome Devtools Protocol (CDP)](https://chromedevtools.github.io/devtools-protocol/) in the browser. 12 | 13 | Browsertrix Crawler is a command line application responsible for the core features of [Browsertrix](https://browsertrix.com), Webrecorder's cloud-based web archiving service. See the [Browsertrix documentation](https://docs.browsertrix.cloud/) for more information about Browsertrix, the cloud platform. 14 | 15 | !!! note 16 | 17 | This documentation applies to Browsertrix Crawler versions 1.0.0 and above. Documentation for earlier versions of the crawler is available in the [Browsertrix Crawler Github repository](https://github.com/webrecorder/browsertrix-crawler)'s README file in older commits. 18 | 19 | ## Features 20 | 21 | - Single-container, browser based crawling with a headless/headful browser running pages in multiple windows. 
22 | - Support for custom browser behaviors, using [Browsertrix Behaviors](https://github.com/webrecorder/browsertrix-behaviors) including autoscroll, video autoplay, and site-specific behaviors. 23 | - YAML-based configuration, passed via file or via stdin. 24 | - Seed lists and per-seed scoping rules. 25 | - URL blocking rules to block capture of specific URLs (including by iframe URL and/or by iframe contents). 26 | - Screencasting: Ability to watch crawling in real-time. 27 | - Screenshotting: Ability to take thumbnails, full page screenshots, and/or screenshots of the initial page view. 28 | - Optimized (non-browser) capture of non-HTML resources. 29 | - Extensible Puppeteer driver script for customizing behavior per crawl or page. 30 | - Ability to create and reuse browser profiles interactively or via automated user/password login using an embedded browser. 31 | - Multi-platform support — prebuilt Docker images available for Intel/AMD and Apple Silicon (M1/M2) CPUs. 32 | - Quality Assurance (QA) crawling — analyze the replay of existing crawls (via WACZ) and produce stats comparing what the browser encountered on a website during crawling against the replay of the crawl WACZ. 33 | 34 | ## Documentation 35 | 36 | If something is missing, unclear, or seems incorrect, please open an [issue](https://github.com/webrecorder/browsertrix-crawler/issues?q=is%3Aissue+is%3Aopen+sort%3Aupdated-desc) and we'll try to make sure that your questions get answered here in the future! 37 | 38 | ## Code 39 | 40 | Browsertrix Crawler is free and open source software, with all code available in the [main repository on Github](https://github.com/webrecorder/browsertrix-crawler). 41 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/bug-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/chat-left-text-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/check-circle-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/check-circle.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/dash-circle.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/exclamation-circle-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/exclamation-diamond-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/exclamation-triangle-fill.svg: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/exclamation-triangle.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/eye.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/file-earmark-text-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/github.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/globe.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/info-circle-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/mastodon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/mortarboard-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/pencil-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/pencil.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/question-circle-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/quote.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/x-octagon-fill.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/x-octagon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /docs/docs/overrides/.icons/bootstrap/youtube.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 
-------------------------------------------------------------------------------- /docs/docs/overrides/main.html: -------------------------------------------------------------------------------- 1 | {% extends "base.html" %} {% block icons %} {% set icon_path = 2 | "overrides/.icons/bootstrap/" %} {{ super() }} {% endblock %} 3 | -------------------------------------------------------------------------------- /docs/docs/stylesheets/extra.css: -------------------------------------------------------------------------------- 1 | /* Font style definitions */ 2 | 3 | @font-face { 4 | font-family: "Recursive"; 5 | font-style: oblique 0deg 15deg; 6 | font-weight: 300 1000; 7 | src: url("../assets/fonts/Recursive_VF_1.084.woff2") format("woff2"); 8 | font-feature-settings: "ss12"; 9 | } 10 | 11 | @font-face { 12 | font-family: "Inter"; 13 | font-weight: 100 900; 14 | font-display: swap; 15 | font-style: normal; 16 | src: url("../assets/fonts/Inter.var.woff2") format("woff2"); 17 | font-feature-settings: "ss03"; 18 | } 19 | 20 | @font-face { 21 | font-family: "Inter"; 22 | font-weight: 100 900; 23 | font-display: swap; 24 | font-style: italic; 25 | src: url("../assets/fonts/Inter-Italic.var.woff2") format("woff2"); 26 | font-feature-settings: "ss03"; 27 | } 28 | 29 | @font-face { 30 | font-family: "Konsole"; 31 | font-weight: 100 900; 32 | font-display: swap; 33 | font-style: normal; 34 | src: url("https://wr-static.sfo3.cdn.digitaloceanspaces.com/fonts/konsole/Konsolev1.1-VF.woff2") 35 | format("woff2"); 36 | } 37 | 38 | :root { 39 | --md-display-font: "Konsole", "Helvetica", sans-serif; 40 | --md-code-font: "Recursive", monospace; 41 | --md-text-font: "Inter", "Helvetica", "Arial", sans-serif; 42 | --wr-blue-primary: #088eaf; 43 | --wr-orange-primary: #bb4a00; 44 | } 45 | 46 | [data-md-color-scheme="webrecorder"] { 47 | --md-primary-fg-color: #4D7C0F; 48 | --md-primary-fg-color--light: #0782A1; 49 | --md-primary-fg-color--dark: #066B84; 50 | --md-typeset-color: black; 51 | --md-accent-fg-color: #0782A1; 52 | --md-typeset-a-color: #066B84; 53 | --md-code-bg-color: #F9FAFB; 54 | } 55 | 56 | /* Nav changes */ 57 | 58 | .md-header__title, 59 | .md-nav__title { 60 | font-family: var(--md-display-font); 61 | text-transform: uppercase; 62 | font-variation-settings: 63 | "wght" 750, 64 | "wdth" 87; 65 | margin-left: 0 !important; 66 | } 67 | 68 | .md-header__title--active { 69 | font-family: var(--md-display-font); 70 | text-transform: none; 71 | font-variation-settings: 72 | "wght" 550, 73 | "wdth" 90; 74 | } 75 | 76 | .md-header__button { 77 | margin-right: 0 !important; 78 | } 79 | 80 | /* Custom menu item hover */ 81 | 82 | .md-tabs__link { 83 | font-family: var(--md-code-font); 84 | font-weight: 400; 85 | opacity: 0.9; 86 | transition: 87 | 0.4s cubic-bezier(0.1, 0.7, 0.1, 1), 88 | opacity 0.25s; 89 | } 90 | 91 | .md-tabs__link:hover { 92 | font-weight: 600; 93 | } 94 | 95 | /* Custom body typography rules */ 96 | 97 | .md-typeset a { 98 | text-decoration: underline; 99 | } 100 | 101 | .headerlink { 102 | text-decoration: none !important; 103 | } 104 | 105 | code, 106 | pre, 107 | kbd { 108 | font-variation-settings: "MONO" 1; 109 | font-feature-settings: "ss01", "ss02", "ss08"; 110 | } 111 | 112 | code { 113 | border-width: 1px; 114 | border-color: #d1d5db; 115 | border-style: solid; 116 | 117 | white-space : pre-wrap !important; 118 | } 119 | 120 | .md-typeset h1, 121 | h2, 122 | h3, 123 | h4, 124 | h5 { 125 | color: black; 126 | } 127 | 128 | .md-typeset h1, 129 | h2, 130 | h3 { 131 | font-weight: 
650 !important; 132 | font-variation-settings: "OPSZ" 35; 133 | } 134 | 135 | /* Custom badge classes, applies custom overrides to inline-code blocks */ 136 | 137 | .badge-blue { 138 | background-color: var(--wr-blue-primary) !important; 139 | border-color: var(--wr-blue-primary) !important; 140 | color: white !important; 141 | font-family: var(--md-text-font); 142 | font-weight: 600; 143 | } 144 | 145 | .badge-green { 146 | background-color: hsl(142 76% 36%) !important; 147 | border-color: hsl(142 76% 36%) !important; 148 | color: white !important; 149 | font-family: var(--md-text-font); 150 | font-weight: 600; 151 | } 152 | 153 | .badge-orange { 154 | background-color: var(--wr-orange-primary) !important; 155 | border-color: var(--wr-orange-primary) !important; 156 | color: white !important; 157 | font-family: var(--md-text-font); 158 | font-weight: 600; 159 | } 160 | 161 | /* Status Styling */ 162 | 163 | .status-success { 164 | font-family: var(--md-code-font); 165 | font-weight: 500; 166 | white-space: nowrap; 167 | & svg { 168 | color: hsl(142.1 76.2% 36.3%); 169 | } 170 | } 171 | 172 | .status-warning { 173 | font-family: var(--md-code-font); 174 | font-weight: 500; 175 | white-space: nowrap; 176 | & svg { 177 | color: hsl(32.1 94.6% 43.7%); 178 | } 179 | } 180 | 181 | .status-danger { 182 | font-family: var(--md-code-font); 183 | font-weight: 500; 184 | white-space: nowrap; 185 | & svg { 186 | color: hsl(0 72.2% 50.6%); 187 | } 188 | } 189 | 190 | .status-waiting { 191 | font-family: var(--md-code-font); 192 | font-weight: 500; 193 | white-space: nowrap; 194 | & svg { 195 | color: hsl(271.5 81.3% 55.9%); 196 | } 197 | } 198 | -------------------------------------------------------------------------------- /docs/docs/user-guide/browser-profiles.md: -------------------------------------------------------------------------------- 1 | # Creating and Using Browser Profiles 2 | 3 | Browsertrix Crawler can use existing browser profiles when running a crawl. This allows the browser to be pre-configured by logging in to certain sites or changing other settings, before running a crawl. By creating a logged in profile, the actual login credentials are not included in the crawl, only (temporary) session cookies. 4 | 5 | ## Interactive Profile Creation 6 | 7 | Interactive profile creation is used for creating profiles of more complex sites, or logging in to multiple sites at once. 8 | 9 | To use this mode, don't specify `--username` or `--password` flags and expose two ports on the Docker container to allow DevTools to connect to the browser and to serve a status page. 10 | 11 | In profile creation mode, Browsertrix Crawler launches a browser which uses a VNC server (via [noVNC](https://novnc.com/)) running on port 6080 to provide a 'remote desktop' for interacting with the browser. 12 | 13 | After interactively logging into desired sites or configuring other settings, _Create Profile_ should be clicked to initiate profile creation. Browsertrix Crawler will then stop the browser, and save the browser profile. 14 | 15 | To start in interactive profile creation mode, run: 16 | 17 | ```sh 18 | docker run -p 6080:6080 -p 9223:9223 -v $PWD/crawls/profiles:/crawls/profiles/ -it webrecorder/browsertrix-crawler create-login-profile --url "https://example.com/" 19 | ``` 20 | 21 | Then, open a browser pointing to `http://localhost:9223/` and use the embedded browser to log in to any sites or configure any settings as needed. 22 | 23 | Click _Create Profile_ at the top when done. 
The profile will then be created in `./crawls/profiles/profile.tar.gz`, containing the settings of this browsing session. 24 | 25 | It is also possible to use an existing profile via the `--profile` flag. This allows previous browsing sessions to be extended as needed. 26 | 27 | ```sh 28 | docker run -p 6080:6080 -p 9223:9223 -v $PWD/crawls/profiles:/crawls/profiles -it webrecorder/browsertrix-crawler create-login-profile --url "https://example.com/" --filename "/crawls/profiles/newProfile.tar.gz" --profile "/crawls/profiles/oldProfile.tar.gz" 29 | ``` 30 | 31 | ## Headless vs Headful Profiles 32 | 33 | Browsertrix Crawler supports both headful and headless crawling. We have historically recommended headful crawling as most accurate to the user experience; however, headless crawling may be faster and, in recent versions of Chromium-based browsers, should be much closer in fidelity to headful crawling. 34 | 35 | To use profiles in headless mode, profiles should also be created with the `--headless` flag. 36 | 37 | When creating a browser profile in headless mode, Browsertrix will use the devtools protocol on port 9222 to stream the browser interface. 38 | 39 | To create a profile in headless mode, run: 40 | 41 | ```sh 42 | docker run -p 9222:9222 -p 9223:9223 -v $PWD/crawls/profiles:/crawls/profiles/ -it webrecorder/browsertrix-crawler create-login-profile --headless --url "https://example.com/" 43 | ``` 44 | 45 | ## Automated Profile Creation for User Login 46 | 47 | If the `--automated` flag is provided, Browsertrix Crawler will attempt to create a profile automatically after logging in to sites with a username and password. The username and password can be provided via the `--username` and `--password` flags or, if omitted, from a command-line prompt. 48 | 49 | When using `--automated` or `--username` / `--password`, Browsertrix Crawler will not launch an interactive browser and instead will attempt to finish automatically. 50 | 51 | The automated profile creation system will log in to a single website with supplied credentials and then save the profile. 52 | 53 | The automated profile creation system also takes a screenshot so you can check whether the login succeeded. 54 | 55 | !!! example "Example: Launch a browser and log in to the digipres.club Mastodon instance" 56 | 57 | To automatically create a logged-in browser profile, run: 58 | 59 | ```bash 60 | docker run -v $PWD/crawls/profiles:/crawls/profiles -it webrecorder/browsertrix-crawler create-login-profile --url "https://digipres.club/" 61 | ``` 62 | 63 | The script will then prompt you for login credentials, attempt to log in, and create a tar.gz file in `./crawls/profiles/profile.tar.gz`. 64 | 65 | - The `--url` parameter should specify the URL of a login page. 66 | 67 | - To specify a custom filename, pass the `--filename` parameter. 68 | 69 | - To specify the username and password on the command line (for automated profile creation), pass the `--username` and `--password` flags. 70 | 71 | - To specify headless mode, add the `--headless` flag. Note that for crawls run with the `--headless` flag, it is recommended to also create the profile with `--headless` to ensure the profile is compatible. 72 | 73 | - To specify the window size for the profile creation embedded browser, specify `--windowSize WIDTH,HEIGHT`. (The default is 1600x900) 74 | 75 | The profile creation script attempts to detect the username and password fields on a site as generically as possible, but may not work for all sites. 
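For example, a fully automated profile creation run with credentials supplied on the command line might look like the following sketch, where the URL, username, password, and output filename are placeholders to replace with your own values:

```sh
docker run -v $PWD/crawls/profiles:/crawls/profiles -it webrecorder/browsertrix-crawler create-login-profile \
  --url "https://example.com/login" \
  --automated \
  --username "myusername" \
  --password "mypassword" \
  --filename "/crawls/profiles/example-profile.tar.gz"
```

Note that credentials passed on the command line may be visible in shell history or process listings, so the interactive prompt may be preferable on shared machines.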
76 | 77 | ## Using Browser Profile with a Crawl 78 | 79 | To use a previously created profile with a crawl, use the `--profile` flag or `profile` option. The `--profile` flag can then be used to specify any Brave Browser profile stored as a tarball. The browser profile can either be stored locally and provided as a path, or made available online at an HTTP(S) URL, in which case it will be downloaded before starting the crawl. Using profiles created with the same or an older version of Browsertrix Crawler is recommended to ensure compatibility. This option allows running a crawl with the browser already pre-configured, logged in to certain sites, language settings configured, etc. 80 | 81 | After running the above command, you can now run a crawl with the profile, as follows: 82 | 83 | ```bash 84 | docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --profile /crawls/profiles/profile.tar.gz --url https://digipres.club/ --generateWACZ --collection test-with-profile 85 | ``` 86 | 87 | Profiles can also be loaded from an http/https URL, e.g. `--profile https://example.com/path/to/profile.tar.gz`. 88 | -------------------------------------------------------------------------------- /docs/docs/user-guide/exit-codes.md: -------------------------------------------------------------------------------- 1 | # Exit codes 2 | 3 | The crawler uses the following exit codes to indicate the crawl result. 4 | 5 | | Code | Name | Description | 6 | |--|--|--| 7 | | 0 | Success | Crawl completed normally | 8 | | 1 | GenericError | Unspecified error, check logs for more details | 9 | | 3 | OutOfSpace | Disk is already full | 10 | | 9 | Failed | Crawl failed unexpectedly, might be worth retrying | 11 | | 10 | BrowserCrashed | Browser used to fetch pages has crashed | 12 | | 11 | SignalInterrupted | Crawl stopped gracefully in response to SIGINT signal | 13 | | 12 | FailedLimit | Limit on the number of failed pages, configured with `--failOnFailedLimit`, has been reached | 14 | | 13 | SignalInterruptedForce | Crawl stopped forcefully in response to SIGTERM or repeated SIGINT signal | 15 | | 14 | SizeLimit | Limit on maximum WARC size, configured with `--sizeLimit`, has been reached | 16 | | 15 | TimeLimit | Limit on maximum crawl duration, configured with `--timeLimit`, has been reached | 17 | | 16 | DiskUtilization | Limit on maximum disk usage, configured with `--diskUtilization`, has been reached | 18 | | 17 | Fatal | A fatal (non-retryable) error occurred | 19 | | 21 | ProxyError | Unable to establish connection with proxy | -------------------------------------------------------------------------------- /docs/docs/user-guide/index.md: -------------------------------------------------------------------------------- 1 | # Browsertrix Crawler User Guide 2 | 3 | Welcome to the Browsertrix Crawler User Guide. This page covers the basics of using Browsertrix Crawler, Webrecorder's browser-based high-fidelity crawling system, designed to run a complex, customizable, browser-based crawl in a single Docker container. 4 | 5 | ## Getting Started 6 | 7 | Browsertrix Crawler requires [Docker](https://docs.docker.com/get-docker/) to be installed on the machine running the crawl. 8 | 9 | Assuming Docker is installed, you can run a crawl and test your archive with the following steps. 10 | 11 | You don't even need to clone the Browsertrix Crawler repo; just choose a directory where you'd like the crawl data to be placed, and then run 12 | the following commands. Replace `[URL]` with the website you'd like to crawl. 13 | 14 | 1. 
Run `docker pull webrecorder/browsertrix-crawler` 15 | 2. `docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --url [URL] --generateWACZ --text --collection test` 16 | 3. The crawl will now run, and logs in [JSON Lines](https://jsonlines.org/) format will be output to the console. Depending on the size of the site, this may take a bit! 17 | 4. Once the crawl is finished, a WACZ file will be created in `crawls/collections/test/test.wacz`, relative to the directory where you ran the crawl! 18 | 5. You can go to [ReplayWeb.page](https://replayweb.page), open the generated WACZ file, and browse your newly crawled archive! 19 | 20 | ## Getting Started with Command-Line Options 21 | 22 | Here's how you can use some of the more common command-line options to configure the crawl: 23 | 24 | - To include automated text extraction for full-text search to pages.jsonl, add the `--text` flag. To write extracted text to WARCs instead of or in addition to pages.jsonl, see [Text Extraction](common-options.md#text-extraction). 25 | 26 | - To limit the crawl to a maximum number of pages, add `--limit P` where P is the number of pages that will be crawled. 27 | 28 | - To limit the crawl to a maximum size, set `--sizeLimit` (size in bytes). 29 | 30 | - To limit the crawl time, set `--timeLimit` (in seconds). 31 | 32 | - To run more than one browser worker and crawl in parallel, add `--workers N`, where N is the number of browsers to run in parallel. More browsers will require more CPU and network bandwidth, and do not guarantee faster crawling. 33 | 34 | - To crawl into a new directory, specify a different name for the `--collection` param. If omitted, a new collection directory based on the current time will be created. Adding the `--overwrite` flag will delete the collection directory at the start of the crawl, if it exists. 35 | 36 | Browsertrix Crawler includes a number of additional command-line options, explained in detail throughout this User Guide. 37 | 38 | ## Published Releases / Production Use 39 | 40 | When using Browsertrix Crawler in production, it is recommended to use a specific, published version of the image, e.g. `webrecorder/browsertrix-crawler:[VERSION]` instead of `webrecorder/browsertrix-crawler`, where `[VERSION]` corresponds to one of the published release tags. 41 | 42 | All released Docker Images are available from [Docker Hub, listed by release tag here](https://hub.docker.com/r/webrecorder/browsertrix-crawler/tags?page=1&ordering=last_updated). 43 | 44 | Details for each corresponding release tag are also available on GitHub under [Releases](https://github.com/webrecorder/browsertrix-crawler/releases). 45 | -------------------------------------------------------------------------------- /docs/docs/user-guide/outputs.md: -------------------------------------------------------------------------------- 1 | # Outputs 2 | 3 | This page covers the outputs created by Browsertrix Crawler for both crawls and browser profiles. 4 | 5 | ## Crawl Outputs 6 | 7 | Browsertrix Crawler crawl outputs are organized into collections, which can be found in the `/crawls/collections` directory. Each crawl creates a new collection by default, which can be named with the `-c` or `--collection` argument. If a collection name is not provided, Browsertrix Crawler will generate a unique collection name which includes the `crawl-` prefix followed by a timestamp of when the collection was created. Collections can be overwritten by specifying an existing collection name. 
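For example, combining the `--collection` and `--overwrite` options described above, the following illustrative command (the URL and collection name are placeholders) writes its output to `/crawls/collections/my-crawl` inside the container, deleting any existing collection directory of that name at the start of the crawl:

```sh
docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl \
  --url https://example.com/ \
  --collection my-crawl \
  --overwrite
```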
8 | 9 | Each collection is a directory which contains at minimum: 10 | 11 | - `archive/`: A directory containing gzipped [WARC](https://www.iso.org/standard/68004.html) files containing the web traffic recorded during crawling. 12 | - `logs/`: A directory containing one or more crawler log files in [JSON-Lines](https://jsonlines.org/) format. 13 | - `pages/`: A directory containing one or more "Page" files in [JSON-Lines](https://jsonlines.org/) format. At minimum, this directory will contain a `pages.jsonl` file with information about the seed URLs provided to the crawler. If additional pages were discovered and in scope during crawling, information about those non-seed pages is written to `extraPages.jsonl`. For more information about the contents of Page files, see the [WACZ specification](https://specs.webrecorder.net/wacz/1.1.1/#pages-jsonl). 14 | - `warc-cdx/`: A directory containing one or more [CDXJ](https://specs.webrecorder.net/cdxj/0.1.0/) index files created while recording traffic to WARC files. These index files are merged into the final index for the crawl (see the `indexes/` directory below) when the `--generateCDX` or `--generateWACZ` arguments are provided. 15 | 16 | Additionally, the collection may include: 17 | 18 | - A WACZ file named after the collection, if the `--generateWACZ` argument is provided. 19 | - An `indexes/` directory containing merged [CDXJ](https://specs.webrecorder.net/cdxj/0.1.0/) index files for the crawl, if the `--generateCDX` or `--generateWACZ` arguments are provided. If the combined size of the CDXJ files in the `warc-cdx/` directory is over 50 KB, the resulting final CDXJ file will be gzipped. 20 | - A single combined gzipped [WARC](https://www.iso.org/standard/68004.html) file for the crawl, if the `--combineWARC` argument is provided. 21 | - A `crawls/` directory including YAML files describing the crawl state, if the `--saveState` argument is provided with a value of "always", or if the crawl is interrupted and `--saveState` is not set to "never". These files can be used to restart a crawl from its saved state. 22 | 23 | ## Profile Outputs 24 | 25 | Browser profiles that are saved by Browsertrix Crawler are written into the `crawls/profiles` directory. 26 | -------------------------------------------------------------------------------- /docs/docs/user-guide/proxies.md: -------------------------------------------------------------------------------- 1 | # Crawling with Proxies 2 | Browsertrix Crawler supports crawling through HTTP and SOCKS5 proxies, including through a SOCKS5 proxy over an SSH tunnel. 3 | 4 | To specify a proxy, the `PROXY_SERVER` environment variable or `--proxyServer` CLI flag can be passed in. 5 | If both are provided, the `--proxyServer` CLI flag will take precedence. 6 | 7 | The proxy server can be specified as an `http://`, `socks5://`, or `ssh://` URL. 8 | 9 | ### HTTP Proxies 10 | 11 | To crawl through an HTTP proxy running at `http://path-to-proxy-host.example.com:9000`, run the crawler with: 12 | 13 | ```sh 14 | docker run -v $PWD/crawls/:/crawls/ -e PROXY_SERVER=http://path-to-proxy-host.example.com:9000 webrecorder/browsertrix-crawler crawl --url https://example.com/ 15 | ``` 16 | 17 | or 18 | 19 | ```sh 20 | docker run -v $PWD/crawls/:/crawls/ webrecorder/browsertrix-crawler crawl --url https://example.com/ --proxyServer http://path-to-proxy-host.example.com:9000 21 | ``` 22 | 23 | The crawler *does not* support authentication for HTTP proxies, as that is not supported by the browser. 
24 | 25 | (For backwards compatibility with crawler 0.x, the `PROXY_HOST` and `PROXY_PORT` environment variables can be used to specify an HTTP proxy instead of `PROXY_SERVER`, 26 | which takes precedence if provided.) 27 | 28 | 29 | ### SOCKS5 Proxies 30 | 31 | To use a SOCKS5 proxy running at `path-to-proxy-host.example.com:9001`, run the crawler with: 32 | 33 | ```sh 34 | docker run -v $PWD/crawls/:/crawls/ -e PROXY_SERVER=socks5://path-to-proxy-host.example.com:9001 webrecorder/browsertrix-crawler crawl --url https://example.com/ 35 | ``` 36 | 37 | The crawler *does* support password authentication for SOCKS5 proxies, which can be provided as `user:password` in the proxy URL: 38 | 39 | ```sh 40 | docker run -v $PWD/crawls/:/crawls/ -e PROXY_SERVER=socks5://user:password@path-to-proxy-host.example.com:9001 webrecorder/browsertrix-crawler crawl --url https://example.com/ 41 | ``` 42 | 43 | ### SSH Proxies 44 | 45 | Starting with 1.3.0, the crawler also supports crawling through a SOCKS5 proxy that is established over an SSH tunnel, via `ssh -D`. 46 | With this option, the crawler can SSH into a remote machine that has SSH and port forwarding enabled and crawl through that machine's network. 47 | 48 | To use this proxy, the private SSH key file must be provided via the `--sshProxyPrivateKeyFile` CLI flag. 49 | 50 | The private key and public host key should be mounted as volumes into a path in the container, as shown below. 51 | 52 | For example, to connect via SSH to host `path-to-ssh-host.example.com` as user `user` with the private key stored in `./my-proxy-private-key`, run: 53 | 54 | ```sh 55 | docker run -v $PWD/crawls/:/crawls/ -v $PWD/my-proxy-private-key:/tmp/private-key webrecorder/browsertrix-crawler crawl --url https://httpbin.org/ip --proxyServer ssh://user@path-to-ssh-host.example.com --sshProxyPrivateKeyFile /tmp/private-key 56 | ``` 57 | 58 | To also provide the host public key (e.g. a `./known_hosts` file) for additional verification, run: 59 | 60 | ```sh 61 | docker run -v $PWD/crawls/:/crawls/ -v $PWD/my-proxy-private-key:/tmp/private-key -v $PWD/known_hosts:/tmp/known_hosts webrecorder/browsertrix-crawler crawl --url https://httpbin.org/ip --proxyServer ssh://user@path-to-ssh-host.example.com --sshProxyPrivateKeyFile /tmp/private-key --sshProxyKnownHostsFile /tmp/known_hosts 62 | ``` 63 | 64 | The host key will only be checked if provided in a file via `--sshProxyKnownHostsFile`. 65 | 66 | A custom SSH port can be provided with `--proxyServer ssh://user@path-to-ssh-host.example.com:2222`; otherwise, the 67 | connection will be attempted via the default SSH port (port 22). 68 | 69 | The SSH connection establishes a tunnel on a local port in the container (9722) which will forward inbound/outbound traffic through the remote proxy. 70 | The `autossh` utility is used to automatically restart the SSH connection, if needed. 71 | 72 | Only key-based authentication is supported for SSH proxies for now. 
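If you do not already have a dedicated key pair and host key file matching the paths used in the examples above, one possible way to produce them on the host machine (assuming standard OpenSSH tooling; the file names and hostname are just the placeholders used in these examples) is:

```sh
# Generate a dedicated key pair for the proxy connection (no passphrase, for unattended use)
ssh-keygen -t ed25519 -N "" -f ./my-proxy-private-key

# Record the proxy host's public key for use with --sshProxyKnownHostsFile
ssh-keyscan path-to-ssh-host.example.com > ./known_hosts
```

The generated public key (`./my-proxy-private-key.pub`) must then be added to the `user` account's `authorized_keys` on the proxy host before the crawler can connect.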
73 | 74 | 75 | ## Browser Profiles 76 | 77 | The above proxy settings also apply to [Browser Profile Creation](browser-profiles.md), and browser profiles can also be created using proxies, for example: 78 | 79 | ```sh 80 | docker run -p 6080:6080 -p 9223:9223 -v $PWD/crawls/profiles:/crawls/profiles -v $PWD/my-proxy-private-key:/tmp/private-key -v $PWD/known_hosts:/tmp/known_hosts webrecorder/browsertrix-crawler create-login-profile --url https://example.com/ --proxyServer ssh://user@path-to-ssh-host.example.com --sshProxyPrivateKeyFile /tmp/private-key --sshProxyKnownHostsFile /tmp/known_hosts 81 | ``` 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /docs/docs/user-guide/qa.md: -------------------------------------------------------------------------------- 1 | # Quality Assurance 2 | 3 | ## Overview 4 | 5 | Browsertrix Crawler can analyze an existing crawl to compare what the browser encountered on a website during crawling against the replay of the crawl WACZ. The WACZ produced by this analysis run includes additional comparison data (stored as WARC `resource` records) for the pages found during crawling against their replay in ReplayWeb.page. This works along several dimensions, including screenshot, extracted text, and page resource comparisons. 6 | 7 | !!! note 8 | 9 | QA features described on this page are available in Browsertrix Crawler releases 1.1.0 and later. 10 | 11 | ## Getting started 12 | 13 | To be able to run QA on a crawl, you must first have an existing crawl, for example: 14 | 15 | ```sh 16 | docker run -v $PWD/crawls:/crawls/ -it webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/ --collection example-crawl --text to-warc --screenshot view --generateWACZ 17 | ``` 18 | 19 | Note that this crawl must be run with `--generateWACZ` flag as QA requires a WACZ to work with, and also ideally the `--text to-warc` and `--screenshot view` flags as well (see below for more details on comparison dimensions). 20 | 21 | To analyze this crawl, call Browsertrix Crawler with the `qa` entrypoint, passing the original crawl WACZ as the `qaSource`: 22 | 23 | ```sh 24 | docker run -v $PWD/crawls/:/crawls/ -it webrecorder/browsertrix-crawler qa --qaSource /crawls/collections/example-crawl/example-crawl.wacz --collection example-qa --generateWACZ 25 | ``` 26 | 27 | The `qaSource` can be: 28 | - A local WACZ file path or a URL 29 | - A single WACZ or a JSON file containing a list of WACZ files in the `resources` json (Multi-WACZ) 30 | 31 | This assumes an existing crawl that was created in the `example-crawl` collection. 32 | 33 | A new WACZ for the analysis run will be created in the resulting `example-qa` collection. 34 | 35 | By default, the analysis crawl will visit all of the pages (as read from the source WACZ file(s)), however pages can further be limited by adding `--include` and `--exclude` regexes. The `--limit` flag will also limit how many pages are tested. 36 | 37 | The analysis crawl will skip over any non-HTML pages such as PDFs which can be relied upon to be bit-for-bit identical as long as the resource was fully fetched. 38 | 39 | ## Comparison Dimensions 40 | 41 | ### Screenshot Match 42 | 43 | One way to compare crawl and replay is to compare the screenshots of a page while it is being crawled with when it is being replayed. The initial viewport screenshots of each page from the crawl and replay are compared on the basis of pixel value similarity. 
This results in a score between 0 and 1.0 representing the percentage match between the crawl and replay screenshots for each page. The screenshots are stored in `urn:view:` WARC resource records. 44 | 45 | To enable comparison on this dimension, the crawl must be run with at least the `--screenshot view` option. (Additional screenshot options can be added as well.) 46 | 47 | ### Text Match 48 | 49 | Another way to compare the crawl and replay results is to use the text extracted from the HTML. This is done by comparing the extracted text from crawl and replay on the basis of [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance). This results in a score between 0 and 1.0 representing the percentage match between the crawl and replay text for each page. The extracted text is stored in `urn:text:` WARC resource records. 50 | 51 | To enable comparison on this dimension, the original crawl must be run with at least the `--text to-warc` option. (Additional text options can be added as well.) 52 | 53 | ### Resources and Page Info 54 | 55 | The `pageinfo` records produced by the crawl and analysis runs include a JSON document containing information about the resources loaded on each page, such as CSS stylesheets, JavaScript scripts, fonts, images, and videos. The URL, status code, MIME type, and resource type of each resource are saved in the `pageinfo` record for each page. 56 | 57 | Since `pageinfo` records are produced for all crawls, this data is always available. 58 | 59 | ### Comparison Data 60 | 61 | Comparison data is also added to the QA crawl's `pageinfo` records. The comparison data may look as follows: 62 | 63 | ```json 64 | "comparison": { 65 | "screenshotMatch": 0.95, 66 | "textMatch": 0.9, 67 | "resourceCounts": { 68 | "crawlGood": 10, 69 | "crawlBad": 0, 70 | "replayGood": 9, 71 | "replayBad": 1 72 | } 73 | } 74 | ``` 75 | 76 | This data indicates that: 77 | 78 | - When comparing `urn:view:` records for crawl and replay, the screenshots are 95% similar. 79 | - When comparing `urn:text:` records from crawl and replay WACZs, the text is 90% similar. 80 | - When comparing `urn:pageinfo:` resource entries from crawl and replay, the crawl record had 10 good responses (2xx/3xx status code) and 0 bad responses (4xx/5xx status code), while replay had 9 good and 1 bad. 81 | -------------------------------------------------------------------------------- /docs/docs/user-guide/yaml-config.md: -------------------------------------------------------------------------------- 1 | # YAML Crawl Config 2 | 3 | Browsertrix Crawler supports the use of a YAML file to set parameters for a crawl. This can be used by passing a valid YAML file to the `--config` option. 4 | 5 | The YAML file can contain the same parameters as the command-line arguments. If a parameter is set on the command-line and in the YAML file, the value from the command-line will be used. For example, the following starts a crawl with the config in `crawl-config.yaml`: 6 | 7 | ```sh 8 | docker run -v $PWD/crawl-config.yaml:/app/crawl-config.yaml -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --config /app/crawl-config.yaml 9 | ``` 10 | 11 | The config can also be passed via stdin, which can simplify the command. Note that this requires running `docker run` with the `-i` flag.
To read config from stdin, pass `--config stdin`: 12 | 13 | ```sh 14 | cat ./crawl-config.yaml | docker run -i -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --config stdin 15 | ``` 16 | 17 | An example config file (e.g. `crawl-config.yaml`) might contain: 18 | 19 | ```yaml 20 | seeds: 21 | - https://example.com/ 22 | - https://www.iana.org/ 23 | 24 | combineWARC: true 25 | ``` 26 | 27 | The list of seeds can be loaded via an external file by specifying the filename via the `seedFile` config or command-line option. 28 | 29 | ## Seed File 30 | 31 | The URL seed file should be a text file formatted so that each line of the file is a URL string. An example file is available in the GitHub repository's fixtures folder as [urlSeedFile.txt](https://github.com/webrecorder/browsertrix-crawler/blob/main/tests/fixtures/urlSeedFile.txt). 32 | 33 | The seed file must be passed as a volume to the Docker container. Your Docker command should be formatted similar to the following: 34 | 35 | ```sh 36 | docker run -v $PWD/seedFile.txt:/app/seedFile.txt -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --seedFile /app/seedFile.txt 37 | ``` 38 | 39 | ## Per-Seed Settings 40 | 41 | Certain settings such as scope type, scope includes and excludes, and depth can also be configured per-seed directly in the YAML file, for example: 42 | 43 | ```yaml 44 | seeds: 45 | - url: https://webrecorder.net/ 46 | depth: 1 47 | scopeType: "prefix" 48 | ``` 49 | 50 | ## HTTP Auth 51 | 52 | !!! warning "HTTP basic auth credentials are written to the archive" 53 | We recommend exercising caution and only archiving with dedicated archival accounts, changing your password or deleting the account when finished. 54 | 55 | Browsertrix Crawler supports [HTTP Basic Auth](https://developer.mozilla.org/en-US/docs/Web/HTTP/Authentication), which can be provided on a per-seed basis as part of the URL, for example: 56 | `--url https://username:password@example.com/`.
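A full invocation passing credentials in the seed URL might look like the sketch below (the credentials and collection name are placeholders):

```sh
docker run -v $PWD/crawls:/crawls/ webrecorder/browsertrix-crawler crawl --url "https://username:password@example.com/" --collection auth-crawl --generateWACZ
```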
57 | 58 | Alternatively, credentials can be added to the `auth` field for each seed: 59 | 60 | ```yaml 61 | seeds: 62 | - url: https://example.com/ 63 | auth: username:password 64 | ``` 65 | -------------------------------------------------------------------------------- /docs/gen-cli.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | CURR=$(dirname "${BASH_SOURCE[0]}") 3 | 4 | out=$CURR/docs/user-guide/cli-options.md 5 | echo "# All Command-Line Options" > $out 6 | echo "" >> $out 7 | echo "The Browsertrix Crawler Docker image currently accepts the following parameters, broken down by entrypoint:" >> $out 8 | echo "" >> $out 9 | echo "## crawler" >> $out 10 | echo "" >> $out 11 | echo '```' >> $out 12 | #node $CURR/../dist/main.js --help >> $out 13 | docker run webrecorder/browsertrix-crawler crawl --help | tail -n +3 >> $out 14 | echo '```' >> $out 15 | echo "" >> $out 16 | echo "## create-login-profile" >> $out 17 | echo "" >> $out 18 | echo '```' >> $out 19 | docker run webrecorder/browsertrix-crawler create-login-profile --help | tail -n +3 >> $out 20 | echo '```' >> $out 21 | -------------------------------------------------------------------------------- /docs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: Browsertrix Crawler Docs 2 | repo_url: https://github.com/webrecorder/browsertrix-crawler/ 3 | repo_name: Browsertrix Crawler 4 | edit_uri: edit/main/docs/docs/ 5 | extra_css: 6 | - stylesheets/extra.css 7 | theme: 8 | name: material 9 | custom_dir: docs/overrides 10 | features: 11 | - navigation.sections 12 | - navigation.tabs 13 | - navigation.tabs.sticky 14 | - navigation.instant 15 | - navigation.tracking 16 | - navigation.indexes 17 | - navigation.footer 18 | - content.code.copy 19 | - content.action.edit 20 | - content.tooltips 21 | - search.suggest 22 | palette: 23 | scheme: webrecorder 24 | logo: assets/brand/browsertrix-crawler-white.svg 25 | favicon: assets/brand/browsertrix-crawler-icon-color-dynamic.svg 26 | 27 | icon: 28 | admonition: 29 | note: bootstrap/pencil-fill 30 | abstract: bootstrap/file-earmark-text-fill 31 | info: bootstrap/info-circle-fill 32 | tip: bootstrap/exclamation-circle-fill 33 | success: bootstrap/check-circle-fill 34 | question: bootstrap/question-circle-fill 35 | warning: bootstrap/exclamation-triangle-fill 36 | failure: bootstrap/x-octagon-fill 37 | danger: bootstrap/exclamation-diamond-fill 38 | bug: bootstrap/bug-fill 39 | example: bootstrap/mortarboard-fill 40 | quote: bootstrap/quote 41 | 42 | repo: bootstrap/github 43 | edit: bootstrap/pencil 44 | view: bootstrap/eye 45 | 46 | nav: 47 | - index.md 48 | - Develop: 49 | - develop/index.md 50 | - develop/docs.md 51 | - User Guide: 52 | - user-guide/index.md 53 | - user-guide/outputs.md 54 | - user-guide/exit-codes.md 55 | - user-guide/common-options.md 56 | - user-guide/crawl-scope.md 57 | - user-guide/yaml-config.md 58 | - user-guide/browser-profiles.md 59 | - user-guide/proxies.md 60 | - user-guide/behaviors.md 61 | - user-guide/qa.md 62 | - user-guide/cli-options.md 63 | 64 | markdown_extensions: 65 | - toc: 66 | toc_depth: 4 67 | permalink: true 68 | - pymdownx.highlight: 69 | anchor_linenums: true 70 | - pymdownx.emoji: 71 | emoji_index: !!python/name:material.extensions.emoji.twemoji 72 | emoji_generator: !!python/name:material.extensions.emoji.to_svg 73 | options: 74 | custom_icons: 75 | - docs/overrides/.icons 76 | - admonition 77 | - pymdownx.inlinehilite 78 | - 
pymdownx.details 79 | - pymdownx.superfences 80 | - pymdownx.keys 81 | - def_list 82 | - attr_list 83 | 84 | extra: 85 | generator: false 86 | social: 87 | - icon: bootstrap/globe 88 | link: https://webrecorder.net 89 | - icon: bootstrap/chat-left-text-fill 90 | link: https://forum.webrecorder.net/ 91 | - icon: bootstrap/mastodon 92 | link: https://digipres.club/@webrecorder 93 | - icon: bootstrap/youtube 94 | link: https://www.youtube.com/@webrecorder 95 | copyright: "Creative Commons Attribution 4.0 International (CC BY 4.0)" 96 | 97 | plugins: 98 | - search 99 | -------------------------------------------------------------------------------- /html/createProfile.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 34 | 35 | 36 |
37 | Log in to any site(s) that you want to be part of the crawl profile using 38 | the embedded browser below. When done, click 39 |
40 | 41 |
42 |
43 | 44 | 45 | 46 | -------------------------------------------------------------------------------- /html/replay.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 27 | 28 | 29 | 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /html/screencast.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 16 | 84 | 85 | 86 |
87 | 88 | 89 | 90 | 91 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "browsertrix-crawler", 3 | "version": "1.6.3", 4 | "main": "browsertrix-crawler", 5 | "type": "module", 6 | "repository": "https://github.com/webrecorder/browsertrix-crawler", 7 | "author": "Ilya Kreymer , Webrecorder Software", 8 | "license": "AGPL-3.0-or-later", 9 | "scripts": { 10 | "tsc": "tsc", 11 | "format": "prettier src/ --check", 12 | "format:fix": "prettier src/ --write", 13 | "lint": "eslint src/", 14 | "lint:fix": "yarn format:fix && eslint src/ --fix", 15 | "test": "yarn node --experimental-vm-modules $(yarn bin jest --bail 1)", 16 | "prepare": "husky install" 17 | }, 18 | "dependencies": { 19 | "@novnc/novnc": "1.4.0", 20 | "@puppeteer/replay": "^3.1.1", 21 | "@webrecorder/wabac": "^2.23.3", 22 | "browsertrix-behaviors": "^0.8.5", 23 | "client-zip": "^2.4.5", 24 | "css-selector-parser": "^3.0.5", 25 | "fetch-socks": "^1.3.0", 26 | "get-folder-size": "^4.0.0", 27 | "husky": "^8.0.3", 28 | "ioredis": "^5.3.2", 29 | "iso-639-1": "^3.1.5", 30 | "js-levenshtein": "^1.1.6", 31 | "js-yaml": "^4.1.0", 32 | "minio": "^7.1.3", 33 | "p-queue": "^7.3.4", 34 | "pixelmatch": "^5.3.0", 35 | "pngjs": "^7.0.0", 36 | "puppeteer-core": "^24.7.2", 37 | "sax": "^1.3.0", 38 | "sharp": "^0.32.6", 39 | "tsc": "^2.0.4", 40 | "undici": "^6.18.2", 41 | "uuid": "8.3.2", 42 | "warcio": "^2.4.4", 43 | "ws": "^7.4.4", 44 | "yargs": "^17.7.2" 45 | }, 46 | "devDependencies": { 47 | "@types/js-levenshtein": "^1.1.3", 48 | "@types/js-yaml": "^4.0.8", 49 | "@types/node": "^20.8.7", 50 | "@types/pixelmatch": "^5.2.6", 51 | "@types/pngjs": "^6.0.4", 52 | "@types/sax": "^1.2.7", 53 | "@types/uuid": "^9.0.6", 54 | "@types/ws": "^8.5.8", 55 | "@typescript-eslint/eslint-plugin": "^6.10.0", 56 | "@typescript-eslint/parser": "^6.10.0", 57 | "eslint": "^8.53.0", 58 | "eslint-config-prettier": "^9.0.0", 59 | "eslint-plugin-react": "^7.22.0", 60 | "http-server": "^14.1.1", 61 | "jest": "^29.7.0", 62 | "lighthouse": "^12.5.1", 63 | "md5": "^2.3.0", 64 | "prettier": "3.0.3", 65 | "puppeteer": "^24.4.0", 66 | "typescript": "^5.5.4" 67 | }, 68 | "jest": { 69 | "transform": {}, 70 | "testTimeout": 90000 71 | }, 72 | "resolutions": { 73 | "wrap-ansi": "7.0.0", 74 | "warcio": "^2.4.4", 75 | "@novnc/novnc": "1.4.0" 76 | } 77 | } 78 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wacz>=0.5.0 2 | -------------------------------------------------------------------------------- /src/main.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env -S node --experimental-global-webcrypto 2 | 3 | import { logger } from "./util/logger.js"; 4 | import { setExitOnRedisError } from "./util/redis.js"; 5 | import { Crawler } from "./crawler.js"; 6 | import { ReplayCrawler } from "./replaycrawler.js"; 7 | import fs from "node:fs"; 8 | import { ExitCodes, InterruptReason } from "./util/constants.js"; 9 | 10 | let crawler: Crawler | null = null; 11 | 12 | let lastSigInt = 0; 13 | let forceTerm = false; 14 | 15 | async function handleTerminate(signame: string) { 16 | logger.info(`${signame} received...`); 17 | if (!crawler || !crawler.crawlState) { 18 | logger.error("error: no crawler running, exiting"); 19 | process.exit(ExitCodes.GenericError); 20 | 
} 21 | 22 | if (crawler.done) { 23 | logger.info("success: crawler done, exiting"); 24 | process.exit(ExitCodes.Success); 25 | } 26 | 27 | setExitOnRedisError(); 28 | 29 | try { 30 | await crawler.checkCanceled(); 31 | 32 | if (!crawler.interruptReason) { 33 | logger.info("SIGNAL: interrupt request received..."); 34 | crawler.gracefulFinishOnInterrupt(InterruptReason.SignalInterrupted); 35 | } else if (forceTerm || Date.now() - lastSigInt > 200) { 36 | logger.info("SIGNAL: stopping crawl now..."); 37 | await crawler.serializeAndExit(); 38 | } 39 | lastSigInt = Date.now(); 40 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 41 | } catch (e: any) { 42 | logger.error("Error stopping crawl after receiving termination signal", e); 43 | } 44 | } 45 | 46 | process.on("SIGINT", () => handleTerminate("SIGINT")); 47 | 48 | process.on("SIGTERM", () => handleTerminate("SIGTERM")); 49 | 50 | process.on("SIGABRT", async () => { 51 | logger.info("SIGABRT received, will force immediate exit on SIGTERM/SIGINT"); 52 | forceTerm = true; 53 | }); 54 | 55 | if (process.argv[1].endsWith("qa")) { 56 | crawler = new ReplayCrawler(); 57 | } else { 58 | crawler = new Crawler(); 59 | } 60 | 61 | // remove any core dumps which could be taking up space in the working dir 62 | try { 63 | fs.unlinkSync("./core"); 64 | } catch (e) { 65 | //ignore 66 | } 67 | 68 | await crawler.run(); 69 | -------------------------------------------------------------------------------- /src/util/constants.ts: -------------------------------------------------------------------------------- 1 | export const HTML_TYPES = [ 2 | "text/html", 3 | "application/xhtml", 4 | "application/xhtml+xml", 5 | ]; 6 | export const WAIT_UNTIL_OPTS = [ 7 | "load", 8 | "domcontentloaded", 9 | "networkidle0", 10 | "networkidle2", 11 | ]; 12 | 13 | export const SERVICE_WORKER_OPTS = [ 14 | "disabled", 15 | "disabled-if-profile", 16 | "enabled", 17 | ] as const; 18 | 19 | export type ServiceWorkerOpt = (typeof SERVICE_WORKER_OPTS)[number]; 20 | 21 | export const DETECT_SITEMAP = ""; 22 | 23 | export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"]; 24 | 25 | export enum BxFunctionBindings { 26 | BehaviorLogFunc = "__bx_log", 27 | AddLinkFunc = "__bx_addLink", 28 | FetchFunc = "__bx_fetch", 29 | AddToSeenSet = "__bx_addSet", 30 | 31 | InitFlow = "__bx_initFlow", 32 | NextFlowStep = "__bx_nextFlowStep", 33 | } 34 | 35 | export const MAX_DEPTH = 1000000; 36 | export const DEFAULT_MAX_RETRIES = 2; 37 | 38 | export const FETCH_HEADERS_TIMEOUT_SECS = 30; 39 | export const PAGE_OP_TIMEOUT_SECS = 5; 40 | export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30; 41 | 42 | export type ExtractSelector = { 43 | selector: string; 44 | extract: string; 45 | attrOnly: boolean; 46 | }; 47 | 48 | export const DEFAULT_SELECTORS: ExtractSelector[] = [ 49 | { 50 | selector: "a[href]", 51 | extract: "href", 52 | attrOnly: false, 53 | }, 54 | ]; 55 | 56 | export const DEFAULT_CRAWL_ID_TEMPLATE = "@hostname-@id"; 57 | 58 | export const BEHAVIOR_TYPES = [ 59 | "autoplay", 60 | "autofetch", 61 | "autoscroll", 62 | "autoclick", 63 | "siteSpecific", 64 | ]; 65 | 66 | export const DISPLAY = ":99"; 67 | 68 | export enum ExitCodes { 69 | Success = 0, 70 | GenericError = 1, 71 | Failed = 9, 72 | OutOfSpace = 3, 73 | BrowserCrashed = 10, 74 | SignalInterrupted = 11, 75 | FailedLimit = 12, 76 | SignalInterruptedForce = 13, 77 | SizeLimit = 14, 78 | TimeLimit = 15, 79 | DiskUtilization = 16, 80 | Fatal = 17, 81 | ProxyError = 21, 82 | } 83 | 84 | export enum 
InterruptReason { 85 | SizeLimit = 1, 86 | TimeLimit = 2, 87 | FailedLimit = 3, 88 | DiskUtilization = 4, 89 | BrowserCrashed = 5, 90 | SignalInterrupted = 6, 91 | CrawlPaused = 7, 92 | } 93 | -------------------------------------------------------------------------------- /src/util/healthcheck.ts: -------------------------------------------------------------------------------- 1 | import http from "http"; 2 | import url from "url"; 3 | import { logger } from "./logger.js"; 4 | import { Browser } from "./browser.js"; 5 | 6 | // =========================================================================== 7 | export class HealthChecker { 8 | port: number; 9 | errorThreshold: number; 10 | healthServer: http.Server; 11 | browser: Browser; 12 | 13 | updater: (() => Promise) | null; 14 | 15 | errorCount = 0; 16 | 17 | constructor( 18 | port: number, 19 | errorThreshold: number, 20 | browser: Browser, 21 | updater: (() => Promise) | null = null, 22 | ) { 23 | this.port = port; 24 | this.browser = browser; 25 | this.errorThreshold = errorThreshold; 26 | 27 | this.healthServer = http.createServer((...args) => 28 | this.healthCheck(...args), 29 | ); 30 | logger.info(`Healthcheck server started on ${port}`, {}, "healthcheck"); 31 | this.healthServer.listen(port); 32 | 33 | this.updater = updater; 34 | } 35 | 36 | async healthCheck(req: http.IncomingMessage, res: http.ServerResponse) { 37 | const pathname = req.url ? url.parse(req.url).pathname : ""; 38 | switch (pathname) { 39 | case "/healthz": 40 | if (this.errorCount < this.errorThreshold && !this.browser.crashed) { 41 | logger.debug( 42 | `health check ok, num errors ${this.errorCount} < ${this.errorThreshold}`, 43 | {}, 44 | "healthcheck", 45 | ); 46 | res.writeHead(200); 47 | res.end(); 48 | } 49 | if (this.updater) { 50 | this.updater().catch((e) => 51 | logger.warn("Healthcheck Updater failed", e, "healthcheck"), 52 | ); 53 | } 54 | return; 55 | } 56 | 57 | logger.error( 58 | `health check failed: ${this.errorCount} >= ${this.errorThreshold}`, 59 | {}, 60 | "healthcheck", 61 | ); 62 | res.writeHead(503); 63 | res.end(); 64 | } 65 | 66 | resetErrors() { 67 | if (this.errorCount > 0) { 68 | logger.info( 69 | `Page loaded, resetting error count ${this.errorCount} to 0`, 70 | {}, 71 | "healthcheck", 72 | ); 73 | this.errorCount = 0; 74 | } 75 | } 76 | 77 | incError() { 78 | this.errorCount++; 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/util/logger.ts: -------------------------------------------------------------------------------- 1 | // =========================================================================== 2 | // to fix serialization of regexes for logging purposes 3 | 4 | import { Writable } from "node:stream"; 5 | import { RedisCrawlState } from "./state.js"; 6 | import { ExitCodes } from "./constants.js"; 7 | 8 | // RegExp.prototype.toJSON = RegExp.prototype.toString; 9 | Object.defineProperty(RegExp.prototype, "toJSON", { 10 | value: RegExp.prototype.toString, 11 | }); 12 | 13 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 14 | export type LogDetails = Record; 15 | 16 | // =========================================================================== 17 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 18 | export function formatErr(e: unknown): Record { 19 | if (e instanceof Error) { 20 | return { type: "exception", message: e.message, stack: e.stack || "" }; 21 | } else if (typeof e === "object") { 22 | return e || {}; 23 | } else { 24 | return { 
message: (e as object) + "" }; 25 | } 26 | } 27 | 28 | // =========================================================================== 29 | export const LOG_CONTEXT_TYPES = [ 30 | "general", 31 | "worker", 32 | "recorder", 33 | "recorderNetwork", 34 | "writer", 35 | "state", 36 | "redis", 37 | "storage", 38 | "text", 39 | "exclusion", 40 | "screenshots", 41 | "screencast", 42 | "originOverride", 43 | "healthcheck", 44 | "browser", 45 | "blocking", 46 | "behavior", 47 | "behaviorScript", 48 | "behaviorScriptCustom", 49 | "jsError", 50 | "fetch", 51 | "pageStatus", 52 | "memoryStatus", 53 | "crawlStatus", 54 | "links", 55 | "sitemap", 56 | "wacz", 57 | "replay", 58 | "proxy", 59 | ] as const; 60 | 61 | export type LogContext = (typeof LOG_CONTEXT_TYPES)[number]; 62 | 63 | export const DEFAULT_EXCLUDE_LOG_CONTEXTS: LogContext[] = [ 64 | "recorderNetwork", 65 | "jsError", 66 | "screencast", 67 | ]; 68 | 69 | // =========================================================================== 70 | class Logger { 71 | logStream: Writable | null = null; 72 | debugLogging = false; 73 | logErrorsToRedis = false; 74 | logBehaviorsToRedis = false; 75 | logLevels: string[] = []; 76 | contexts: LogContext[] = []; 77 | excludeContexts: LogContext[] = []; 78 | crawlState?: RedisCrawlState | null = null; 79 | fatalExitCode: ExitCodes = ExitCodes.Fatal; 80 | 81 | setDefaultFatalExitCode(exitCode: number) { 82 | this.fatalExitCode = exitCode; 83 | } 84 | 85 | setExternalLogStream(logFH: Writable | null) { 86 | this.logStream = logFH; 87 | } 88 | 89 | setDebugLogging(debugLog: boolean) { 90 | this.debugLogging = debugLog; 91 | } 92 | 93 | setLogErrorsToRedis(logErrorsToRedis: boolean) { 94 | this.logErrorsToRedis = logErrorsToRedis; 95 | } 96 | 97 | setLogBehaviorsToRedis(logBehaviorsToRedis: boolean) { 98 | this.logBehaviorsToRedis = logBehaviorsToRedis; 99 | } 100 | 101 | setLogLevel(logLevels: string[]) { 102 | this.logLevels = logLevels; 103 | } 104 | 105 | setContext(contexts: LogContext[]) { 106 | this.contexts = contexts; 107 | } 108 | 109 | setExcludeContext(contexts: LogContext[]) { 110 | this.excludeContexts = contexts; 111 | } 112 | 113 | setCrawlState(crawlState: RedisCrawlState) { 114 | this.crawlState = crawlState; 115 | } 116 | 117 | logAsJSON( 118 | message: string, 119 | dataUnknown: unknown, 120 | context: LogContext, 121 | logLevel = "info", 122 | ) { 123 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 124 | const data: Record = formatErr(dataUnknown); 125 | 126 | if (this.logLevels.length) { 127 | if (this.logLevels.indexOf(logLevel) < 0) { 128 | return; 129 | } 130 | } 131 | 132 | if (this.contexts.length) { 133 | if (this.contexts.indexOf(context) < 0) { 134 | return; 135 | } 136 | } 137 | 138 | if (this.excludeContexts.length) { 139 | if (this.excludeContexts.indexOf(context) >= 0) { 140 | return; 141 | } 142 | } 143 | 144 | const dataToLog = { 145 | timestamp: new Date().toISOString(), 146 | logLevel: logLevel, 147 | context: context, 148 | message: message, 149 | details: data, 150 | }; 151 | const string = JSON.stringify(dataToLog); 152 | console.log(string); 153 | try { 154 | if (this.logStream) { 155 | this.logStream.write(string + "\n"); 156 | } 157 | } catch (e) { 158 | // 159 | } 160 | 161 | const redisErrorLogLevels = ["error", "fatal"]; 162 | if ( 163 | this.logErrorsToRedis && 164 | this.crawlState && 165 | redisErrorLogLevels.includes(logLevel) 166 | ) { 167 | this.crawlState.logError(string).catch(() => {}); 168 | } 169 | 170 | const redisBehaviorLogLevels = 
["info", "warn", "error"]; 171 | const behaviorContexts = ["behavior", "behaviorScript"]; 172 | if ( 173 | this.logBehaviorsToRedis && 174 | this.crawlState && 175 | ((behaviorContexts.includes(context) && 176 | redisBehaviorLogLevels.includes(logLevel)) || 177 | //always include behaviorScriptCustom 178 | context === "behaviorScriptCustom") 179 | ) { 180 | this.crawlState.logBehavior(string).catch(() => {}); 181 | } 182 | } 183 | 184 | info(message: string, data: unknown = {}, context: LogContext = "general") { 185 | this.logAsJSON(message, data, context); 186 | } 187 | 188 | error(message: string, data: unknown = {}, context: LogContext = "general") { 189 | this.logAsJSON(message, data, context, "error"); 190 | } 191 | 192 | warn(message: string, data: unknown = {}, context: LogContext = "general") { 193 | this.logAsJSON(message, data, context, "warn"); 194 | } 195 | 196 | debug(message: string, data: unknown = {}, context: LogContext = "general") { 197 | if (this.debugLogging) { 198 | this.logAsJSON(message, data, context, "debug"); 199 | } 200 | } 201 | 202 | fatal( 203 | message: string, 204 | data = {}, 205 | context: LogContext = "general", 206 | exitCode = ExitCodes.Success, 207 | ) { 208 | exitCode = exitCode || this.fatalExitCode; 209 | this.logAsJSON(`${message}. Quitting`, data, context, "fatal"); 210 | 211 | if (this.crawlState) { 212 | this.crawlState 213 | .setStatus("failed") 214 | .catch(() => {}) 215 | .finally(process.exit(exitCode)); 216 | } else { 217 | process.exit(exitCode); 218 | } 219 | } 220 | } 221 | 222 | export const logger = new Logger(); 223 | -------------------------------------------------------------------------------- /src/util/originoverride.ts: -------------------------------------------------------------------------------- 1 | import { HTTPRequest, Page } from "puppeteer-core"; 2 | import { formatErr, logger } from "./logger.js"; 3 | import { Browser } from "./browser.js"; 4 | 5 | import { fetch } from "undici"; 6 | import { getProxyDispatcher } from "./proxy.js"; 7 | 8 | export class OriginOverride { 9 | originOverride: { origUrl: URL; destUrl: URL }[]; 10 | 11 | constructor(originOverride: string[]) { 12 | this.originOverride = originOverride.map((override) => { 13 | const [orig, dest] = override.split("="); 14 | const origUrl = new URL(orig); 15 | const destUrl = new URL(dest); 16 | 17 | return { origUrl, destUrl }; 18 | }); 19 | } 20 | 21 | async initPage(browser: Browser, page: Page) { 22 | const onRequest = async (request: HTTPRequest) => { 23 | try { 24 | const url = request.url(); 25 | 26 | let newUrl = null; 27 | let orig = null; 28 | 29 | for (const { origUrl, destUrl } of this.originOverride) { 30 | if (url.startsWith(origUrl.origin)) { 31 | newUrl = destUrl.origin + url.slice(origUrl.origin.length); 32 | orig = origUrl; 33 | break; 34 | } 35 | } 36 | 37 | if (!newUrl || !orig) { 38 | await request.continue({}, -1); 39 | return; 40 | } 41 | 42 | const headers = new Headers(request.headers()); 43 | 44 | headers.set("host", orig.host); 45 | if (headers.get("origin")) { 46 | headers.set("origin", orig.origin); 47 | } 48 | 49 | const resp = await fetch(newUrl, { 50 | headers, 51 | dispatcher: getProxyDispatcher(), 52 | }); 53 | 54 | const body = Buffer.from(await resp.arrayBuffer()); 55 | const respHeaders = Object.fromEntries(resp.headers); 56 | const status = resp.status; 57 | 58 | logger.debug( 59 | "Origin overridden", 60 | { orig: url, dest: newUrl, status, body: body.length }, 61 | "originOverride", 62 | ); 63 | 64 | await 
request.respond({ body, headers: respHeaders, status }, -1); 65 | } catch (e) { 66 | logger.warn( 67 | "Error overriding origin", 68 | { ...formatErr(e), url: page.url() }, 69 | "originOverride", 70 | ); 71 | await request.continue({}, -1); 72 | } 73 | }; 74 | browser.interceptRequest(page, onRequest); 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /src/util/redis.ts: -------------------------------------------------------------------------------- 1 | import { Redis } from "ioredis"; 2 | import { logger } from "./logger.js"; 3 | 4 | const error = console.error; 5 | 6 | let lastLogTime = 0; 7 | let exitOnError = false; 8 | 9 | // log only once every 10 seconds 10 | const REDIS_ERROR_LOG_INTERVAL_SECS = 10000; 11 | 12 | console.error = function (...args) { 13 | if ( 14 | typeof args[0] === "string" && 15 | args[0].indexOf("[ioredis] Unhandled error event") === 0 16 | ) { 17 | const now = Date.now(); 18 | 19 | if (now - lastLogTime > REDIS_ERROR_LOG_INTERVAL_SECS) { 20 | if (lastLogTime && exitOnError) { 21 | logger.fatal("Crawl interrupted, redis gone, exiting", {}, "redis"); 22 | } 23 | logger.warn("ioredis error", { error: args[0] }, "redis"); 24 | lastLogTime = now; 25 | } 26 | return; 27 | } 28 | error.call(console, ...args); 29 | }; 30 | 31 | export async function initRedis(url: string) { 32 | const redis = new Redis(url, { lazyConnect: true }); 33 | await redis.connect(); 34 | return redis; 35 | } 36 | 37 | export function setExitOnRedisError() { 38 | exitOnError = true; 39 | } 40 | -------------------------------------------------------------------------------- /src/util/replayserver.ts: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import fsp from "fs/promises"; 3 | import http, { IncomingMessage, ServerResponse } from "http"; 4 | import path from "path"; 5 | 6 | const replayHTML = fs.readFileSync( 7 | new URL("../../html/replay.html", import.meta.url), 8 | { encoding: "utf8" }, 9 | ); 10 | 11 | const swJS = fs.readFileSync(new URL("../../html/rwp/sw.js", import.meta.url), { 12 | encoding: "utf8", 13 | }); 14 | 15 | const uiJS = fs.readFileSync(new URL("../../html/rwp/ui.js", import.meta.url), { 16 | encoding: "utf8", 17 | }); 18 | 19 | const adblockGZ = fs.readFileSync( 20 | new URL("../../html/rwp/adblock.gz", import.meta.url), 21 | {}, 22 | ); 23 | 24 | // ============================================================================ 25 | const PORT = 9990; 26 | 27 | // ============================================================================ 28 | export class ReplayServer { 29 | sourceUrl: string; 30 | origFileSource: string | null; 31 | sourceContentType: string | null; 32 | sourceSize?: number; 33 | 34 | constructor(sourceUrlOrFile: string) { 35 | if ( 36 | sourceUrlOrFile.startsWith("http://") || 37 | sourceUrlOrFile.startsWith("https://") 38 | ) { 39 | this.sourceUrl = sourceUrlOrFile; 40 | this.origFileSource = null; 41 | this.sourceContentType = null; 42 | } else { 43 | this.origFileSource = sourceUrlOrFile; 44 | const ext = path.extname(sourceUrlOrFile); 45 | this.sourceUrl = `/source${ext}`; 46 | 47 | switch (ext) { 48 | case ".wacz": 49 | this.sourceContentType = "application/wacz+zip"; 50 | break; 51 | 52 | case ".json": 53 | this.sourceContentType = "application/json"; 54 | break; 55 | 56 | default: 57 | this.sourceContentType = "application/octet-stream"; 58 | } 59 | } 60 | const httpServer = http.createServer((req, res) => 61 | this.handleRequest(req, 
res), 62 | ); 63 | httpServer.listen(PORT); 64 | } 65 | 66 | get homePage() { 67 | return `http://localhost:${PORT}/`; 68 | } 69 | 70 | async handleRequest(request: IncomingMessage, response: ServerResponse) { 71 | const parsedUrl = new URL( 72 | request.url || "", 73 | `http://${request.headers.host}`, 74 | ); 75 | const pathname = parsedUrl.pathname; 76 | 77 | switch (pathname) { 78 | case "/": 79 | response.writeHead(200, { "Content-Type": "text/html" }); 80 | response.end(replayHTML.replace("$SOURCE", this.sourceUrl)); 81 | return; 82 | 83 | case "/sw.js": 84 | case "/replay/sw.js": 85 | response.writeHead(200, { "Content-Type": "application/javascript" }); 86 | response.end(swJS); 87 | return; 88 | 89 | case "/ui.js": 90 | response.writeHead(200, { "Content-Type": "application/javascript" }); 91 | response.end(uiJS); 92 | return; 93 | 94 | case "/replay/adblock/adblock.gz": 95 | response.writeHead(200, { "Content-Type": "application/gzip" }); 96 | response.end(adblockGZ); 97 | return; 98 | 99 | case this.sourceUrl: 100 | if (this.sourceContentType && this.origFileSource) { 101 | if (!this.sourceSize) { 102 | const { size } = await fsp.stat(this.origFileSource); 103 | this.sourceSize = size; 104 | } 105 | const { opts, status, contentRange, contentLength } = 106 | this.getRespOptsForRequest(request, this.sourceSize); 107 | response.writeHead(status, { 108 | "Accept-Ranges": "bytes", 109 | "Content-Type": this.sourceContentType, 110 | "Content-Length": contentLength, 111 | "Content-Range": contentRange, 112 | }); 113 | //console.log(request.method, contentRange, opts); 114 | if (request.method === "GET") { 115 | fs.createReadStream(this.origFileSource, opts).pipe(response); 116 | } else { 117 | response.end(); 118 | } 119 | break; 120 | } 121 | // falls through 122 | 123 | default: 124 | response.writeHead(404, { "Content-Type": "application/json" }); 125 | response.end(JSON.stringify({ error: "not_found" })); 126 | return; 127 | } 128 | } 129 | 130 | getRespOptsForRequest(request: IncomingMessage, total: number) { 131 | const range = request.headers["range"] || ""; 132 | const array = range.match(/bytes=(\d+)?-(\d*)/); 133 | let contentRange = undefined; 134 | 135 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 136 | const opts: Record = {}; 137 | if (array) { 138 | opts.start = parseInt(array[1]); 139 | opts.end = parseInt(array[2]); 140 | // negative value, subtract from end 141 | if (isNaN(opts.start) && !isNaN(opts.end)) { 142 | opts.start = total - opts.end; 143 | opts.end = total - 1; 144 | } else if (isNaN(opts.end)) { 145 | opts.end = total - 1; 146 | } 147 | contentRange = `bytes ${opts.start}-${opts.end}/${total}`; 148 | return { 149 | status: 206, 150 | opts, 151 | contentRange, 152 | contentLength: opts.end - opts.start + 1, 153 | }; 154 | } 155 | return { status: 200, opts, contentRange, contentLength: total }; 156 | } 157 | } 158 | -------------------------------------------------------------------------------- /src/util/screenshots.ts: -------------------------------------------------------------------------------- 1 | import sharp from "sharp"; 2 | 3 | import { logger, formatErr } from "./logger.js"; 4 | import { Browser } from "./browser.js"; 5 | import { Page } from "puppeteer-core"; 6 | import { PageState } from "./state.js"; 7 | import { WARCWriter } from "./warcwriter.js"; 8 | 9 | // ============================================================================ 10 | 11 | type ScreenShotDesc = { 12 | type: "png" | "jpeg"; 13 | omitBackground: boolean; 
14 | fullPage: boolean; 15 | encoding: "binary"; 16 | }; 17 | 18 | type ScreeshotType = "view" | "thumbnail" | "fullPage" | "fullPageFinal"; 19 | 20 | export const screenshotTypes: Record = { 21 | view: { 22 | type: "png", 23 | omitBackground: true, 24 | fullPage: false, 25 | encoding: "binary", 26 | }, 27 | thumbnail: { 28 | type: "jpeg", 29 | omitBackground: true, 30 | fullPage: false, 31 | encoding: "binary", 32 | }, 33 | fullPage: { 34 | type: "png", 35 | omitBackground: true, 36 | fullPage: true, 37 | encoding: "binary", 38 | }, 39 | fullPageFinal: { 40 | type: "png", 41 | omitBackground: true, 42 | fullPage: true, 43 | encoding: "binary", 44 | }, 45 | }; 46 | 47 | export type ScreenshotOpts = { 48 | browser: Browser; 49 | page: Page; 50 | url: string; 51 | writer: WARCWriter; 52 | }; 53 | 54 | export class Screenshots { 55 | browser: Browser; 56 | page: Page; 57 | url: string; 58 | writer: WARCWriter; 59 | 60 | constructor({ browser, page, writer, url }: ScreenshotOpts) { 61 | this.browser = browser; 62 | this.page = page; 63 | this.url = url; 64 | this.writer = writer; 65 | } 66 | 67 | async take( 68 | screenshotType: ScreeshotType = "view", 69 | state: PageState | null = null, 70 | ) { 71 | try { 72 | if (screenshotType !== "fullPage" && screenshotType !== "fullPageFinal") { 73 | await this.browser.setViewport(this.page, { 74 | width: 1920, 75 | height: 1080, 76 | }); 77 | } 78 | const options = screenshotTypes[screenshotType]; 79 | const screenshotBuffer = Buffer.from(await this.page.screenshot(options)); 80 | if (state && screenshotType === "view") { 81 | state.screenshotView = screenshotBuffer; 82 | } 83 | this.writer.writeNewResourceRecord( 84 | { 85 | buffer: screenshotBuffer, 86 | resourceType: screenshotType, 87 | contentType: "image/" + options.type, 88 | url: this.url, 89 | }, 90 | { 91 | resource: "screenshot", 92 | type: screenshotType, 93 | url: this.url, 94 | filename: this.writer.filename, 95 | }, 96 | "screenshots", 97 | ); 98 | // logger.info( 99 | // `Screenshot (type: ${screenshotType}) for ${this.url} written to ${this.writer.filename}`, 100 | // ); 101 | } catch (e) { 102 | logger.error( 103 | "Taking screenshot failed", 104 | { page: this.url, type: screenshotType, ...formatErr(e) }, 105 | "screenshots", 106 | ); 107 | } 108 | } 109 | 110 | async takeFullPage() { 111 | await this.take("fullPage"); 112 | } 113 | 114 | async takeFullPageFinal() { 115 | await this.take("fullPageFinal"); 116 | } 117 | 118 | async takeThumbnail() { 119 | const screenshotType = "thumbnail"; 120 | try { 121 | await this.browser.setViewport(this.page, { width: 1920, height: 1080 }); 122 | const options = screenshotTypes[screenshotType]; 123 | const screenshotBuffer = await this.page.screenshot(options); 124 | const thumbnailBuffer = await sharp(screenshotBuffer) 125 | // 16:9 thumbnail 126 | .resize(640, 360) 127 | .toBuffer(); 128 | this.writer.writeNewResourceRecord( 129 | { 130 | buffer: thumbnailBuffer, 131 | resourceType: screenshotType, 132 | contentType: "image/" + options.type, 133 | url: this.url, 134 | }, 135 | { 136 | resource: "screenshot", 137 | type: screenshotType, 138 | url: this.url, 139 | filename: this.writer.filename, 140 | }, 141 | "screenshots", 142 | ); 143 | } catch (e) { 144 | logger.error( 145 | "Taking screenshot failed", 146 | { page: this.url, type: screenshotType, ...formatErr(e) }, 147 | "screenshots", 148 | ); 149 | } 150 | } 151 | } 152 | -------------------------------------------------------------------------------- /src/util/textextract.ts: 
-------------------------------------------------------------------------------- 1 | import { logger } from "./logger.js"; 2 | import { CDPSession, Protocol } from "puppeteer-core"; 3 | import { WARCWriter } from "./warcwriter.js"; 4 | 5 | // ============================================================================ 6 | type TextExtractOpts = { 7 | url: string; 8 | writer: WARCWriter; 9 | skipDocs: number; 10 | }; 11 | 12 | // ============================================================================ 13 | export abstract class BaseTextExtract { 14 | cdp: CDPSession; 15 | lastText: string | null = null; 16 | text: string | null = null; 17 | skipDocs: number = 0; 18 | writer: WARCWriter; 19 | url: string; 20 | 21 | constructor(cdp: CDPSession, { writer, skipDocs, url }: TextExtractOpts) { 22 | this.writer = writer; 23 | this.cdp = cdp; 24 | this.url = url; 25 | this.skipDocs = skipDocs || 0; 26 | } 27 | 28 | async extractAndStoreText( 29 | resourceType: string, 30 | ignoreIfMatchesLast = false, 31 | saveToWarc = false, 32 | ) { 33 | try { 34 | const text = await this.doGetText(); 35 | 36 | if (ignoreIfMatchesLast && text === this.lastText) { 37 | this.lastText = this.text; 38 | logger.debug( 39 | "Skipping, extracted text unchanged from last extraction", 40 | { url: this.url }, 41 | "text", 42 | ); 43 | return { changed: false, text }; 44 | } 45 | if (saveToWarc) { 46 | this.writer.writeNewResourceRecord( 47 | { 48 | buffer: new TextEncoder().encode(text), 49 | resourceType, 50 | contentType: "text/plain", 51 | url: this.url, 52 | }, 53 | { 54 | resource: "text", 55 | type: resourceType, 56 | url: this.url, 57 | filename: this.writer.filename, 58 | }, 59 | "text", 60 | ); 61 | } 62 | 63 | this.lastText = text; 64 | return { changed: true, text }; 65 | } catch (e) { 66 | logger.debug("Error extracting text", e, "text"); 67 | return { changed: false, text: null }; 68 | } 69 | } 70 | 71 | abstract doGetText(): Promise; 72 | } 73 | 74 | // ============================================================================ 75 | export class TextExtractViaSnapshot extends BaseTextExtract { 76 | async doGetText(): Promise { 77 | const result = await this.cdp.send("DOMSnapshot.captureSnapshot", { 78 | computedStyles: [], 79 | }); 80 | return this.parseTextFromDOMSnapshot(result); 81 | } 82 | 83 | parseTextFromDOMSnapshot( 84 | result: Protocol.DOMSnapshot.CaptureSnapshotResponse, 85 | ): string { 86 | const TEXT_NODE = 3; 87 | const ELEMENT_NODE = 1; 88 | 89 | const SKIPPED_NODES = [ 90 | "SCRIPT", 91 | "STYLE", 92 | "HEADER", 93 | "FOOTER", 94 | "BANNER-DIV", 95 | "NOSCRIPT", 96 | "TITLE", 97 | ]; 98 | 99 | const { strings, documents } = result; 100 | 101 | const accum: string[] = []; 102 | 103 | for (const doc of documents.slice(this.skipDocs)) { 104 | const nodeValues = doc.nodes.nodeValue || []; 105 | const nodeNames = doc.nodes.nodeName || []; 106 | const nodeTypes = doc.nodes.nodeType || []; 107 | const parentIndex = doc.nodes.parentIndex || []; 108 | 109 | for (let i = 0; i < nodeValues.length; i++) { 110 | if (nodeValues[i] === -1) { 111 | continue; 112 | } 113 | 114 | if (nodeTypes[i] === TEXT_NODE) { 115 | const pi = parentIndex[i]; 116 | if (pi >= 0 && nodeTypes[pi] === ELEMENT_NODE) { 117 | const name = strings[nodeNames[pi]]; 118 | 119 | if (!SKIPPED_NODES.includes(name)) { 120 | const value = strings[nodeValues[i]].trim(); 121 | if (value) { 122 | accum.push(value as string); 123 | } 124 | } 125 | } 126 | } 127 | } 128 | } 129 | 130 | return accum.join("\n"); 131 | } 132 | } 133 | 134 | 
// ============================================================================ 135 | export class TextExtractViaDocument extends BaseTextExtract { 136 | async doGetText(): Promise { 137 | const result = await this.cdp.send("DOM.getDocument", { 138 | depth: -1, 139 | pierce: true, 140 | }); 141 | return this.parseTextFromDOM(result); 142 | } 143 | 144 | parseTextFromDOM(dom: Protocol.DOM.GetDocumentResponse): string { 145 | const accum: string[] = []; 146 | const metadata = {}; 147 | 148 | this.parseText(dom.root, metadata, accum); 149 | 150 | return accum.join("\n"); 151 | } 152 | 153 | parseText( 154 | node: Protocol.DOM.Node, 155 | metadata: Record | null, 156 | accum: string[], 157 | ) { 158 | const SKIPPED_NODES = [ 159 | "head", 160 | "script", 161 | "style", 162 | "header", 163 | "footer", 164 | "banner-div", 165 | "noscript", 166 | ]; 167 | const EMPTY_LIST: Protocol.DOM.Node[] = []; 168 | const TEXT = "#text"; 169 | const TITLE = "title"; 170 | 171 | const name = node.nodeName.toLowerCase(); 172 | 173 | if (SKIPPED_NODES.includes(name)) { 174 | return; 175 | } 176 | 177 | const children = node.children || EMPTY_LIST; 178 | 179 | if (name === TEXT) { 180 | const value = node.nodeValue ? node.nodeValue.trim() : ""; 181 | if (value) { 182 | accum.push(value); 183 | } 184 | } else if (name === TITLE) { 185 | const title: string[] = []; 186 | 187 | for (const child of children) { 188 | this.parseText(child, null, title); 189 | } 190 | 191 | if (metadata) { 192 | metadata.title = title.join(" "); 193 | } else { 194 | accum.push(title.join(" ")); 195 | } 196 | } else { 197 | for (const child of children) { 198 | this.parseText(child, metadata, accum); 199 | } 200 | 201 | if (node.contentDocument) { 202 | this.parseText(node.contentDocument, null, accum); 203 | } 204 | } 205 | } 206 | } 207 | -------------------------------------------------------------------------------- /src/util/timing.ts: -------------------------------------------------------------------------------- 1 | import { LogContext, logger } from "./logger.js"; 2 | 3 | export function sleep(seconds: number) { 4 | return new Promise((resolve) => setTimeout(resolve, seconds * 1000)); 5 | } 6 | 7 | // TODO: Fix this the next time the file is edited. 8 | 9 | export function timedRun( 10 | // eslint-disable-next-line @typescript-eslint/no-explicit-any 11 | promise: Promise, 12 | seconds: number, 13 | message = "Promise timed out", 14 | logDetails = {}, 15 | context: LogContext = "general", 16 | isWarn = false, 17 | ) { 18 | // return Promise return value or log error if timeout is reached first 19 | const timeout = seconds * 1000; 20 | 21 | let tm: NodeJS.Timeout; 22 | 23 | const rejectPromiseOnTimeout = (timeout: number) => { 24 | return new Promise((resolve, reject) => { 25 | tm = setTimeout(() => reject("timeout reached"), timeout); 26 | }); 27 | }; 28 | 29 | return Promise.race([promise, rejectPromiseOnTimeout(timeout)]) 30 | .catch((err) => { 31 | if (err === "timeout reached") { 32 | const logFunc = isWarn ? 
logger.warn : logger.error; 33 | logFunc.call( 34 | logger, 35 | message, 36 | { seconds: seconds, ...logDetails }, 37 | context, 38 | ); 39 | } else { 40 | //logger.error("Unknown exception", {...errJSON(err), ...logDetails}, context); 41 | throw err; 42 | } 43 | }) 44 | .finally(() => clearTimeout(tm)); 45 | } 46 | 47 | export function secondsElapsed(startTime: number, nowDate: Date | null = null) { 48 | nowDate = nowDate || new Date(); 49 | 50 | return (nowDate.getTime() - startTime) / 1000; 51 | } 52 | 53 | export function timestampNow() { 54 | return new Date().toISOString().replace(/[^\d]/g, ""); 55 | } 56 | -------------------------------------------------------------------------------- /test-setup.js: -------------------------------------------------------------------------------- 1 | import { jest } from "@jest/globals"; 2 | 3 | global.jest = jest; 4 | -------------------------------------------------------------------------------- /tests/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/webrecorder/browsertrix-crawler/d2a6aa9805fa08c9a51b43005e0a562a032fd78a/tests/.DS_Store -------------------------------------------------------------------------------- /tests/adblockrules.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import yaml from "js-yaml"; 4 | 5 | function runCrawl(name, config, commandExtra = "") { 6 | config.generateCDX = true; 7 | config.depth = 0; 8 | config.collection = name; 9 | 10 | const configYaml = yaml.dump(config); 11 | 12 | try { 13 | const proc = child_process.execSync( 14 | `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, 15 | { input: configYaml, stdin: "inherit", encoding: "utf8" }, 16 | ); 17 | 18 | //console.log(proc); 19 | } catch (error) { 20 | console.log(error); 21 | } 22 | } 23 | 24 | function doesCDXContain(coll, value) { 25 | const data = fs.readFileSync( 26 | `test-crawls/collections/${coll}/indexes/index.cdxj`, 27 | ); 28 | return data.indexOf(value) >= 0; 29 | } 30 | 31 | // Test Disabled for Brave -- should always be blocked, but seeing inconsistent ci behavior 32 | /* 33 | test("test crawl without ad block for specific URL", () => { 34 | const config = { 35 | "url": "https://www.mozilla.org/en-US/firefox/", 36 | "pageExtraDelay": 10 37 | }; 38 | 39 | runCrawl("adblock-no-block", config); 40 | 41 | // without ad blocking, URL with googletagmanager is included 42 | expect(doesCDXContain("adblock-no-block", "www.googletagmanager.com")).toBe(true); 43 | }); 44 | */ 45 | 46 | test("testcrawl with ad block for specific URL", () => { 47 | const config = { 48 | url: "https://www.mozilla.org/en-US/firefox/", 49 | blockAds: true, 50 | }; 51 | 52 | runCrawl("adblock-block", config); 53 | 54 | expect(doesCDXContain("adblock-block", "www.googletagmanager.com")).toBe( 55 | false, 56 | ); 57 | }); 58 | -------------------------------------------------------------------------------- /tests/add-exclusion.test.js: -------------------------------------------------------------------------------- 1 | import { exec } from "child_process"; 2 | import Redis from "ioredis"; 3 | 4 | function sleep(ms) { 5 | return new Promise((resolve) => setTimeout(resolve, ms)); 6 | } 7 | 8 | test("dynamically add exclusion while crawl is running", async () => { 9 | let callback = null; 10 | 11 | const p = new Promise((resolve) 
=> { 12 | callback = (error, stdout, stderr) => { 13 | resolve({ error, stdout, stderr }); 14 | }; 15 | }); 16 | 17 | try { 18 | exec( 19 | "docker run -p 36382:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection add-exclusion --url https://old.webrecorder.net/ --scopeType prefix --limit 20 --logging debug --debugAccessRedis", 20 | { shell: "/bin/bash" }, 21 | callback, 22 | ); 23 | } catch (error) { 24 | console.log(error); 25 | } 26 | 27 | await sleep(3000); 28 | 29 | const redis = new Redis("redis://127.0.0.1:36382/0", { lazyConnect: true, retryStrategy: () => null }) 30 | 31 | await redis.connect(); 32 | 33 | while (true) { 34 | if (Number(await redis.zcard("test:q")) > 1) { 35 | break; 36 | } 37 | 38 | await sleep(500); 39 | } 40 | 41 | const uids = await redis.hkeys("test:status"); 42 | 43 | // exclude all pages containing 'webrecorder', should clear out the queue and end the crawl 44 | await redis.rpush( 45 | `${uids[0]}:msg`, 46 | JSON.stringify({ type: "addExclusion", regex: "webrecorder" }), 47 | ); 48 | 49 | // ensure 'Add Exclusion is contained in the debug logs 50 | const { stdout } = await p; 51 | 52 | expect(stdout.indexOf("Add Exclusion") > 0).toBe(true); 53 | 54 | expect(stdout.indexOf("Removing excluded URL") > 0).toBe(true); 55 | }); 56 | 57 | -------------------------------------------------------------------------------- /tests/basic_crawl.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import path from "path"; 4 | import md5 from "md5"; 5 | 6 | const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0]; 7 | const testIf = (condition, ...args) => condition ? 
test(...args) : test.skip(...args); 8 | 9 | test("ensure basic crawl run with docker run passes", async () => { 10 | child_process.execSync( 11 | 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --generateWACZ --text --collection wr-net --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --warcPrefix custom-prefix', 12 | ); 13 | 14 | child_process.execSync( 15 | "unzip test-crawls/collections/wr-net/wr-net.wacz -d test-crawls/collections/wr-net/wacz", 16 | ); 17 | }); 18 | 19 | testIf(doValidate, "validate wacz", () => { 20 | child_process.execSync( 21 | "wacz validate --file ./test-crawls/collections/wr-net/wr-net.wacz", 22 | ); 23 | }); 24 | 25 | test("check that individual WARCs have correct prefix and are under rollover size", () => { 26 | const archiveWarcLists = fs.readdirSync( 27 | "test-crawls/collections/wr-net/archive", 28 | ); 29 | 30 | archiveWarcLists.forEach((filename) => { 31 | expect(filename.startsWith("custom-prefix-")).toEqual(true); 32 | const size = fs.statSync( 33 | path.join("test-crawls/collections/wr-net/archive", filename), 34 | ).size; 35 | expect(size < 10000).toEqual(true); 36 | }); 37 | }); 38 | 39 | test("check that a combined warc file exists in the archive folder", () => { 40 | const warcLists = fs.readdirSync("test-crawls/collections/wr-net"); 41 | var captureFound = 0; 42 | 43 | for (var i = 0; i < warcLists.length; i++) { 44 | if (warcLists[i].endsWith("_0.warc.gz")) { 45 | captureFound = 1; 46 | } 47 | } 48 | expect(captureFound).toEqual(1); 49 | }); 50 | 51 | test("check that a combined warc file is under the rolloverSize", () => { 52 | const warcLists = fs.readdirSync( 53 | path.join("test-crawls/collections/wr-net/wacz", "archive"), 54 | ); 55 | let rolloverSize = 0; 56 | 57 | function getFileSize(filename) { 58 | return fs.statSync(filename).size; 59 | } 60 | 61 | for (let i = 0; i < warcLists.length; i++) { 62 | const size = getFileSize( 63 | path.join("test-crawls/collections/wr-net/wacz/archive/", warcLists[i]), 64 | ); 65 | if (size < 10000) { 66 | rolloverSize = 1; 67 | } 68 | } 69 | expect(rolloverSize).toEqual(1); 70 | }); 71 | 72 | test("check that the pages.jsonl file exists in the collection under the pages folder", () => { 73 | expect( 74 | fs.existsSync("test-crawls/collections/wr-net/pages/pages.jsonl"), 75 | ).toBe(true); 76 | }); 77 | 78 | test("check that the pages.jsonl file exists in the wacz under the pages folder", () => { 79 | expect( 80 | fs.existsSync("test-crawls/collections/wr-net/wacz/pages/pages.jsonl"), 81 | ).toBe(true); 82 | }); 83 | 84 | test("check that the hash in the pages folder and in the unzipped wacz folders match", () => { 85 | const crawl_hash = md5( 86 | JSON.parse( 87 | fs 88 | .readFileSync( 89 | "test-crawls/collections/wr-net/wacz/pages/pages.jsonl", 90 | "utf8", 91 | ) 92 | .split("\n")[1], 93 | )["text"], 94 | ); 95 | const wacz_hash = md5( 96 | JSON.parse( 97 | fs 98 | .readFileSync( 99 | "test-crawls/collections/wr-net/pages/pages.jsonl", 100 | "utf8", 101 | ) 102 | .split("\n")[1], 103 | )["text"], 104 | ); 105 | const fixture_hash = md5( 106 | JSON.parse( 107 | fs.readFileSync("tests/fixtures/pages.jsonl", "utf8").split("\n")[1], 108 | )["text"], 109 | ); 110 | 111 | expect(wacz_hash).toEqual(fixture_hash); 112 | expect(wacz_hash).toEqual(crawl_hash); 113 | }); 114 | 115 | test("check that the supplied title and description made it into datapackage.json", () => { 116 | expect( 117 | 
fs.existsSync("test-crawls/collections/wr-net/wacz/datapackage.json"), 118 | ).toBe(true); 119 | 120 | const data = fs.readFileSync( 121 | "test-crawls/collections/wr-net/wacz/datapackage.json", 122 | "utf8", 123 | ); 124 | const dataPackageJSON = JSON.parse(data); 125 | expect(dataPackageJSON.title).toEqual("test title"); 126 | expect(dataPackageJSON.description).toEqual("test description"); 127 | }); 128 | -------------------------------------------------------------------------------- /tests/blockrules.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import yaml from "js-yaml"; 4 | 5 | const isCI = !!process.env.CI; 6 | const testIf = (condition, ...args) => condition ? test(...args) : test.skip(...args); 7 | 8 | function runCrawl(name, config, commandExtra = "") { 9 | config.generateCDX = true; 10 | config.depth = 0; 11 | config.collection = name; 12 | 13 | const configYaml = yaml.dump(config); 14 | 15 | try { 16 | const proc = child_process.execSync( 17 | `docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin ${commandExtra}`, 18 | { input: configYaml, stdin: "inherit", encoding: "utf8" }, 19 | ); 20 | 21 | //console.log(proc); 22 | } catch (error) { 23 | console.log(error); 24 | } 25 | } 26 | 27 | function doesCDXContain(coll, value) { 28 | const data = fs.readFileSync( 29 | `test-crawls/collections/${coll}/indexes/index.cdxj`, 30 | ); 31 | return data.indexOf(value) >= 0; 32 | } 33 | 34 | function checkVideo(coll) { 35 | return doesCDXContain(coll, '"video/mp4"'); 36 | } 37 | 38 | // Test Disabled for Brave -- should always be blocked, but seeing inconsistent ci behavior 39 | /* 40 | test("test crawl without block for specific URL", () => { 41 | const config = { 42 | "url": "https://www.iana.org/", 43 | "pageExtraDelay": 10 44 | }; 45 | 46 | runCrawl("block-1-no-block", config); 47 | 48 | // without blocks, URL with add sense is included 49 | expect(doesCDXContain("block-1-no-block", "https://cse.google.com/adsense/search/async-ads.js")).toBe(true); 50 | }); 51 | */ 52 | 53 | test("test block rule on specific URL", () => { 54 | const config = { 55 | url: "https://www.iana.org/", 56 | blockRules: [{ url: "adsense" }], 57 | }; 58 | 59 | runCrawl("block-1", config); 60 | 61 | expect( 62 | doesCDXContain( 63 | "block-1", 64 | "https://cse.google.com/adsense/search/async-ads.js", 65 | ), 66 | ).toBe(false); 67 | }); 68 | 69 | testIf(!isCI, "test block rule based on iframe text, content included due to match", () => { 70 | const config = { 71 | url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", 72 | blockRules: [ 73 | { 74 | url: "https://www.youtube.com/embed/", 75 | frameTextMatch: 76 | '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"', 77 | type: "allowOnly", 78 | }, 79 | ], 80 | }; 81 | 82 | runCrawl("block-2", config); 83 | 84 | expect(checkVideo("block-2")).toBe(true); 85 | }); 86 | 87 | test("test block rule based on iframe text, wrong text, content should be excluded", () => { 88 | const config = { 89 | url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", 90 | blockRules: [ 91 | { 92 | url: "https://www.youtube.com/embed/", 93 | frameTextMatch: 94 | '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_R\\\\"', 95 | type: "allowOnly", 96 | }, 97 | ], 98 | }; 99 | 100 | runCrawl("block-3", config); 101 | 102 | expect(checkVideo("block-3")).toBe(false); 103 | }); 104 | 105 | test("test 
block rule based on iframe text, block matched", () => { 106 | const config = { 107 | url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", 108 | blockRules: [ 109 | { 110 | url: "https://www.youtube.com/embed/", 111 | frameTextMatch: 112 | '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"', 113 | }, 114 | ], 115 | }; 116 | 117 | runCrawl("block-4", config); 118 | 119 | expect(checkVideo("block-4")).toBe(false); 120 | }); 121 | 122 | testIf(!isCI, "test rule based on iframe text not matching, plus allowOnly iframe", () => { 123 | const config = { 124 | url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", 125 | blockRules: [ 126 | { 127 | url: "example.com/embed/", 128 | frameTextMatch: 129 | '\\\\"channelId\\\\":\\\\"UCrQElMF25VP-1JjhBuFsW_Q\\\\"', 130 | type: "block", 131 | }, 132 | { 133 | url: "(youtube.com|example.com)/embed/", 134 | type: "allowOnly", 135 | inFrameUrl: "oembed.link/", 136 | }, 137 | ], 138 | }; 139 | 140 | runCrawl("non-block-5", config); 141 | 142 | expect(checkVideo("non-block-5")).toBe(true); 143 | }); 144 | 145 | test("test block url in frame url", () => { 146 | const config = { 147 | url: "https://oembed.link/https://www.youtube.com/watch?v=aT-Up5Y4uRI", 148 | blockRules: [ 149 | { 150 | url: "maxresdefault.jpg", 151 | type: "block", 152 | inFrameUrl: "youtube.com/embed", 153 | }, 154 | ], 155 | }; 156 | 157 | runCrawl("block-6", config); 158 | 159 | expect( 160 | doesCDXContain( 161 | "block-6", 162 | '"https://i.ytimg.com/vi/aT-Up5Y4uRI/maxresdefault.jpg"', 163 | ), 164 | ).toBe(false); 165 | }); 166 | 167 | testIf(!isCI, "test block rules complex example, block external urls on main frame, but not on youtube", () => { 168 | const config = { 169 | seeds: ["https://archiveweb.page/en/troubleshooting/errors/"], 170 | depth: "0", 171 | blockRules: [ 172 | { 173 | url: "(archiveweb.page|www.youtube.com)", 174 | type: "allowOnly", 175 | inFrameUrl: "archiveweb.page", 176 | }, 177 | { 178 | url: "https://archiveweb.page/assets/js/vendor/lunr.min.js", 179 | inFrameUrl: "archiveweb.page", 180 | }, 181 | { 182 | url: "https://www.youtube.com/embed/", 183 | type: "allowOnly", 184 | frameTextMatch: 185 | '(\\\\"channelId\\\\":\\\\"UCOHO8gYUWpDYFWHXmIwE02g\\\\")', 186 | }, 187 | ], 188 | 189 | combineWARC: true, 190 | 191 | logging: "stats,debug", 192 | }; 193 | 194 | runCrawl("block-7", config); 195 | 196 | expect( 197 | doesCDXContain( 198 | "block-7", 199 | '"https://archiveweb.page/assets/js/vendor/lunr.min.js"', 200 | ), 201 | ).toBe(false); 202 | expect(checkVideo("block-7")).toBe(true); 203 | }); 204 | -------------------------------------------------------------------------------- /tests/brave-query-redir.test.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import { execSync } from "child_process"; 3 | 4 | test("check that gclid query URL is automatically redirected to remove it", async () => { 5 | try { 6 | execSync( 7 | "docker run --rm -v $PWD/test-crawls:/crawls -i webrecorder/browsertrix-crawler crawl --url 'https://old.webrecorder.net/about?gclid=abc' --collection test-brave-redir --behaviors \"\" --limit 1 --generateCDX"); 8 | 9 | } catch (error) { 10 | console.log(error.stderr); 11 | } 12 | 13 | const filedata = fs.readFileSync( 14 | "test-crawls/collections/test-brave-redir/indexes/index.cdxj", 15 | { encoding: "utf-8" }, 16 | ); 17 | 18 | let responseFound = false; 19 | let redirectFound = false; 20 | 21 | const lines = filedata.trim().split("\n"); 
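// each CDXJ line is "<urlkey> <timestamp> <json>", so the JSON payload starts at the third space-separated field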
22 | 23 | for (const line of lines) { 24 | const json = line.split(" ").slice(2).join(" "); 25 | const data = JSON.parse(json); 26 | if (data.url === "https://old.webrecorder.net/about?gclid=abc" && data.status === "307") { 27 | redirectFound = true; 28 | } else if (data.url === "https://old.webrecorder.net/about" && data.status === "200") { 29 | responseFound = true; 30 | } 31 | if (responseFound && redirectFound) { 32 | break; 33 | } 34 | } 35 | 36 | expect(redirectFound && responseFound).toBe(true); 37 | }); 38 | -------------------------------------------------------------------------------- /tests/collection_name.test.js: -------------------------------------------------------------------------------- 1 | import util from "util"; 2 | import { exec as execCallback } from "child_process"; 3 | 4 | const exec = util.promisify(execCallback); 5 | 6 | test("check that the collection name is properly validated", async () => { 7 | let passed = false; 8 | 9 | try { 10 | await exec( 11 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection valid_collection-nameisvalid", 12 | ); 13 | passed = true; 14 | } catch (error) { 15 | passed = false; 16 | } 17 | expect(passed).toBe(true); 18 | }); 19 | 20 | test("check that the collection name is not accepted if it doesn't meet our standards", async () => { 21 | let passed = false; 22 | 23 | try { 24 | await exec( 25 | "docker run webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --collection invalid_c!!ollection-nameisvalid", 26 | ); 27 | passed = true; 28 | } catch (e) { 29 | passed = false; 30 | } 31 | expect(passed).toBe(false); 32 | }); 33 | -------------------------------------------------------------------------------- /tests/config_file.test.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import yaml from "js-yaml"; 3 | 4 | import util from "util"; 5 | import { exec as execCallback } from "child_process"; 6 | 7 | const exec = util.promisify(execCallback); 8 | 9 | test("check yaml config file with seed list is used", async () => { 10 | try { 11 | await exec( 12 | "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --config /tests/fixtures/crawl-1.yaml --depth 0", 13 | ); 14 | } catch (error) { 15 | console.log(error); 16 | } 17 | 18 | const crawledPages = fs.readFileSync( 19 | "test-crawls/collections/configtest/pages/pages.jsonl", 20 | "utf8", 21 | ); 22 | const pages = new Set(); 23 | 24 | for (const line of crawledPages.trim().split("\n")) { 25 | const url = JSON.parse(line).url; 26 | if (url) { 27 | pages.add(url); 28 | } 29 | } 30 | 31 | const config = yaml.load( 32 | fs.readFileSync("tests/fixtures/crawl-1.yaml", "utf8"), 33 | ); 34 | 35 | let foundAllSeeds = true; 36 | 37 | for (const seed of config.seeds) { 38 | const url = new URL(seed).href; 39 | if (!pages.has(url)) { 40 | foundAllSeeds = false; 41 | } 42 | } 43 | expect(foundAllSeeds).toBe(true); 44 | 45 | expect( 46 | fs.existsSync("test-crawls/collections/configtest/configtest.wacz"), 47 | ).toBe(true); 48 | }); 49 | 50 | test("check that yaml config file options are overridden by the command line", async () => { 51 | try { 52 | await exec( 53 | "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection configtest-2 --config /tests/fixtures/crawl-1.yaml --url https://specs.webrecorder.net/ --scopeType page --timeout
20000", 54 | ); 55 | } catch (error) { 56 | console.log(error); 57 | } 58 | 59 | const crawledPages = fs.readFileSync( 60 | "test-crawls/collections/configtest-2/pages/pages.jsonl", 61 | "utf8", 62 | ); 63 | const pages = new Set(); 64 | 65 | for (const line of crawledPages.trim().split("\n")) { 66 | const url = JSON.parse(line).url; 67 | if (url) { 68 | pages.add(url); 69 | } 70 | } 71 | 72 | expect(pages.has("https://specs.webrecorder.net/")).toBe(true); 73 | expect(pages.size).toBe(1); 74 | }); 75 | -------------------------------------------------------------------------------- /tests/config_stdin.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import yaml from "js-yaml"; 4 | 5 | test("pass config file via stdin", async () => { 6 | const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8"); 7 | const config = yaml.load(configYaml); 8 | 9 | try { 10 | const proc = child_process.execSync( 11 | "docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --scopeExcludeRx webrecorder.net/202", 12 | { input: configYaml, stdin: "inherit", encoding: "utf8" }, 13 | ); 14 | 15 | //console.log(proc); 16 | } catch (error) { 17 | console.log(error); 18 | } 19 | 20 | const crawledPages = fs.readFileSync( 21 | "test-crawls/collections/config-stdin/pages/pages.jsonl", 22 | "utf8", 23 | ); 24 | const pages = new Set(); 25 | 26 | for (const line of crawledPages.trim().split("\n")) { 27 | const url = JSON.parse(line).url; 28 | if (!url) { 29 | continue; 30 | } 31 | pages.add(url); 32 | expect(url.indexOf("webrecorder.net/202")).toEqual(-1); 33 | } 34 | 35 | let foundAllSeeds = true; 36 | 37 | for (const seed of config.seeds) { 38 | const url = new URL(seed).href; 39 | if (!pages.has(url)) { 40 | foundAllSeeds = false; 41 | } 42 | } 43 | expect(foundAllSeeds).toBe(true); 44 | 45 | expect( 46 | fs.existsSync("test-crawls/collections/config-stdin/config-stdin.wacz"), 47 | ).toBe(true); 48 | }); 49 | -------------------------------------------------------------------------------- /tests/crawl_overwrite.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | test("ensure --overwrite with existing collection results in a successful crawl", async () => { 5 | child_process.execSync( 6 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite", 7 | ); 8 | 9 | child_process.execSync( 10 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ --generateWACZ --collection overwrite --overwrite", 11 | ); 12 | }); 13 | 14 | test("check that the pages.jsonl file exists in the collection under the pages folder", () => { 15 | expect( 16 | fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl"), 17 | ).toBe(true); 18 | }); 19 | 20 | test("check that the WACZ file exists in the collection", () => { 21 | expect( 22 | fs.existsSync("test-crawls/collections/overwrite/pages/pages.jsonl"), 23 | ).toBe(true); 24 | }); 25 | 26 | //----------- 27 | 28 | test("ensure --overwrite results in a successful crawl even if collection didn't exist", async () => { 29 | child_process.execSync( 30 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://www.example.com/ 
--generateWACZ --collection overwrite-nothing --overwrite", 31 | ); 32 | }); 33 | 34 | test("check that the pages.jsonl file exists in the collection under the pages folder", () => { 35 | expect( 36 | fs.existsSync( 37 | "test-crawls/collections/overwrite-nothing/pages/pages.jsonl", 38 | ), 39 | ).toBe(true); 40 | }); 41 | 42 | test("check that the WACZ file exists in the collection", () => { 43 | expect( 44 | fs.existsSync( 45 | "test-crawls/collections/overwrite-nothing/pages/pages.jsonl", 46 | ), 47 | ).toBe(true); 48 | }); 49 | -------------------------------------------------------------------------------- /tests/custom-behavior-flow.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import Redis from "ioredis"; 3 | 4 | 5 | async function sleep(time) { 6 | await new Promise((resolve) => setTimeout(resolve, time)); 7 | } 8 | 9 | test("test pushing behavior logs to redis", async () => { 10 | const child = child_process.exec("docker run -p 36398:6379 -v $PWD/test-crawls:/crawls -v $PWD/tests/custom-behaviors/:/custom-behaviors/ -e CRAWL_ID=behavior-logs-flow-test --rm webrecorder/browsertrix-crawler crawl --debugAccessRedis --url https://webrecorder.net/ --customBehaviors /custom-behaviors/custom-flow.json --scopeType page --logBehaviorsToRedis --pageExtraDelay 20"); 11 | 12 | let crawlFinished = false; 13 | 14 | child.on("exit", function () { 15 | crawlFinished = true; 16 | }); 17 | 18 | const redis = new Redis("redis://127.0.0.1:36398/0", { lazyConnect: true, retryStrategy: () => null }); 19 | 20 | await sleep(3000); 21 | 22 | await redis.connect({ maxRetriesPerRequest: 50 }); 23 | 24 | let customLogLineCount = 0; 25 | let done = false; 26 | 27 | while (!crawlFinished) { 28 | let res = null; 29 | try { 30 | res = await redis.rpop("behavior-logs-flow-test:b"); 31 | } catch (e) { 32 | break; 33 | } 34 | if (!res) { 35 | await sleep(500); 36 | continue; 37 | } 38 | const json = JSON.parse(res); 39 | if (json.context === "behaviorScriptCustom") { 40 | customLogLineCount++; 41 | } 42 | if (json.message === "All Steps Done!") { 43 | done = true; 44 | } 45 | } 46 | 47 | expect(customLogLineCount).toEqual(4); 48 | expect(done).toBe(true); 49 | }); 50 | -------------------------------------------------------------------------------- /tests/custom-behaviors/custom-2.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-unused-vars */ 2 | class TestBehavior2 { 3 | static init() { 4 | return { 5 | state: {}, 6 | }; 7 | } 8 | 9 | static get id() { 10 | return "TestBehavior2"; 11 | } 12 | 13 | static isMatch() { 14 | return window.location.origin === "https://old.webrecorder.net"; 15 | } 16 | 17 | async *run(ctx) { 18 | ctx.log("In Test Behavior 2!"); 19 | yield ctx.Lib.getState(ctx, "test-stat-2"); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tests/custom-behaviors/custom-flow.json: -------------------------------------------------------------------------------- 1 | { 2 | "title": "test WR create click + enter URL", 3 | "steps": [ 4 | { 5 | "type": "navigate", 6 | "url": "https://webrecorder.net/" 7 | }, 8 | { 9 | "type": "click", 10 | "target": "main", 11 | "selectors": [ 12 | [ 13 | "aria/[role=\"main\"]", 14 | "aria/[role=\"textbox\"]" 15 | ], 16 | [ 17 | "#archive-url" 18 | ], 19 | [ 20 | "xpath///*[@id=\"archive-url\"]" 21 | ], 22 | [ 23 | "pierce/#archive-url" 24 | ] 25 | ], 
26 | "offsetY": 19.0078125, 27 | "offsetX": 310.5 28 | }, 29 | { 30 | "type": "change", 31 | "value": "https://example.com/", 32 | "selectors": [ 33 | [ 34 | "aria/[role=\"main\"]", 35 | "aria/[role=\"textbox\"]" 36 | ], 37 | [ 38 | "#archive-url" 39 | ], 40 | [ 41 | "xpath///*[@id=\"archive-url\"]" 42 | ], 43 | [ 44 | "pierce/#archive-url" 45 | ] 46 | ], 47 | "target": "main" 48 | }, 49 | { 50 | "type": "keyDown", 51 | "target": "main", 52 | "key": "Enter" 53 | } 54 | ] 55 | } 56 | -------------------------------------------------------------------------------- /tests/custom-behaviors/custom.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable @typescript-eslint/no-unused-vars */ 2 | class TestBehavior { 3 | static init() { 4 | return { 5 | state: {}, 6 | }; 7 | } 8 | 9 | static get id() { 10 | return "TestBehavior"; 11 | } 12 | 13 | static isMatch() { 14 | return window.location.origin === "https://specs.webrecorder.net"; 15 | } 16 | 17 | async *run(ctx) { 18 | ctx.log("In Test Behavior!"); 19 | yield ctx.Lib.getState(ctx, "test-stat"); 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /tests/custom_driver.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | test("ensure custom driver creates PDF", async () => { 5 | try { 6 | child_process.execSync( 7 | "docker run -v $PWD/tests/fixtures:/tests/fixtures -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --collection custom-driver-1 --driver /tests/fixtures/driver-1.mjs --limit 1", 8 | ); 9 | } catch (error) { 10 | console.log(error); 11 | } 12 | 13 | const pdfs = fs.readdirSync("test-crawls/collections/custom-driver-1").filter(x => x.endsWith(".pdf")); 14 | expect(pdfs.length).toBe(1); 15 | }); 16 | -------------------------------------------------------------------------------- /tests/custom_selector.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | test("test custom selector crawls JS files as pages", async () => { 5 | try { 6 | child_process.execSync( 7 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-1 --selectLinks \"script[src]->src\"", 8 | ); 9 | } catch (error) { 10 | console.log(error); 11 | } 12 | 13 | const crawledPages = fs.readFileSync( 14 | "test-crawls/collections/custom-sel-1/pages/pages.jsonl", 15 | "utf8", 16 | ); 17 | const pages = new Set(); 18 | 19 | for (const line of crawledPages.trim().split("\n")) { 20 | const url = JSON.parse(line).url; 21 | if (!url) { 22 | continue; 23 | } 24 | pages.add(url); 25 | } 26 | 27 | const crawledExtraPages = fs.readFileSync( 28 | "test-crawls/collections/custom-sel-1/pages/extraPages.jsonl", 29 | "utf8", 30 | ); 31 | const extraPages = new Set(); 32 | 33 | for (const line of crawledExtraPages.trim().split("\n")) { 34 | const url = JSON.parse(line).url; 35 | if (!url) { 36 | continue; 37 | } 38 | extraPages.add(url); 39 | } 40 | 41 | const expectedPages = new Set([ 42 | "https://www.iana.org/", 43 | ]); 44 | 45 | const expectedExtraPages = new Set([ 46 | "https://www.iana.org/_js/jquery.js", 47 | "https://www.iana.org/_js/iana.js", 48 | ]); 49 | 50 | expect(pages).toEqual(expectedPages); 51 | 
expect(extraPages).toEqual(expectedExtraPages); 52 | }); 53 | 54 | 55 | test("test invalid selector, crawl fails", async () => { 56 | let status = 0; 57 | try { 58 | child_process.execSync( 59 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --collection custom-sel-invalid --selectLinks \"script[\"", 60 | ); 61 | } catch (e) { 62 | status = e.status; 63 | } 64 | 65 | // logger fatal exit code 66 | expect(status).toBe(17); 67 | }); 68 | 69 | test("test valid autoclick selector passes validation", async () => { 70 | let failed = false; 71 | 72 | try { 73 | child_process.execSync( 74 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector button --scopeType page", 75 | ); 76 | } catch (e) { 77 | failed = true; 78 | } 79 | 80 | // valid clickSelector 81 | expect(failed).toBe(false); 82 | }); 83 | 84 | 85 | test("test invalid autoclick selector fails validation, crawl fails", async () => { 86 | let status = 0; 87 | 88 | try { 89 | child_process.execSync( 90 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --clickSelector \",\" --scopeType page", 91 | ); 92 | } catch (e) { 93 | status = e.status; 94 | } 95 | 96 | // logger fatal exit code 97 | expect(status).toBe(17); 98 | }); 99 | 100 | -------------------------------------------------------------------------------- /tests/dryrun.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | test("ensure dryRun crawl only writes pages and logs", async () => { 5 | child_process.execSync( 6 | 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --generateWACZ --text --collection dry-run-wr-net --combineWARC --rolloverSize 10000 --limit 2 --title "test title" --description "test description" --warcPrefix custom-prefix --dryRun --exclude community', 7 | ); 8 | 9 | const files = fs.readdirSync("test-crawls/collections/dry-run-wr-net").sort(); 10 | expect(files.length).toBe(2); 11 | expect(files[0]).toBe("logs"); 12 | expect(files[1]).toBe("pages"); 13 | }); 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /tests/exclude-redirected.test.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import { execSync } from "child_process"; 3 | 4 | // example.com includes a link to 'https://www.iana.org/domains/example' which redirects to 'https://www.iana.org/help/example-domains' 5 | // page loading should be blocked on the redirected URL due to the exclusion of 'help', though the initial link is loaded 6 | 7 | test("ensure exclusion is applied on redirected URL, which contains 'help', so it is not crawled", () => { 8 | execSync( 9 | "docker run -p 9037:9037 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://example.com/ --exclude help --collection redir-exclude-test --extraHops 1"); 10 | 11 | // no entries besides header 12 | expect( 13 | fs 14 | .readFileSync( 15 | "test-crawls/collections/redir-exclude-test/pages/extraPages.jsonl", 16 | "utf8", 17 | ).trim().split("\n").length 18 | ).toBe(1); 19 | 20 | }); 21 | 22 | -------------------------------------------------------------------------------- /tests/extra_hops_depth.test.js:
-------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | 3 | import util from "util"; 4 | import { exec as execCallback, execSync } from "child_process"; 5 | 6 | const exec = util.promisify(execCallback); 7 | 8 | const extraHopsTimeout = 180000; 9 | 10 | test( 11 | "check that URLs are crawled 2 extra hops beyond depth", 12 | async () => { 13 | try { 14 | await exec( 15 | "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-beyond --extraHops 2 --url https://old.webrecorder.net/ --limit 5 --timeout 10 --exclude community --exclude tools", 16 | ); 17 | } catch (error) { 18 | console.log(error); 19 | } 20 | 21 | const crawledPages = fs.readFileSync( 22 | "test-crawls/collections/extra-hops-beyond/pages/pages.jsonl", 23 | "utf8", 24 | ); 25 | const crawledPagesArray = crawledPages.trim().split("\n"); 26 | 27 | const crawledExtraPages = fs.readFileSync( 28 | "test-crawls/collections/extra-hops-beyond/pages/extraPages.jsonl", 29 | "utf8", 30 | ); 31 | const crawledExtraPagesArray = crawledExtraPages.trim().split("\n"); 32 | 33 | const expectedPages = [ 34 | "https://old.webrecorder.net/", 35 | ]; 36 | 37 | const expectedExtraPages = [ 38 | "https://old.webrecorder.net/blog", 39 | "https://old.webrecorder.net/about", 40 | "https://old.webrecorder.net/contact", 41 | "https://old.webrecorder.net/faq", 42 | ]; 43 | 44 | // first line is the header, not page, so adding -1 45 | expect(crawledPagesArray.length - 1).toEqual(expectedPages.length); 46 | expect(crawledExtraPagesArray.length - 1).toEqual(expectedExtraPages.length); 47 | 48 | for (const page of crawledPagesArray) { 49 | const parsedPage = JSON.parse(page); 50 | const url = parsedPage.url; 51 | if (!url) { 52 | continue; 53 | } 54 | expect(expectedPages.indexOf(url) >= 0).toBe(true); 55 | 56 | expect(parsedPage.seed).toEqual(true); 57 | expect(parsedPage.depth).toEqual(0); 58 | } 59 | 60 | for (const page of crawledExtraPagesArray) { 61 | const parsedPage = JSON.parse(page); 62 | const url = parsedPage.url; 63 | if (!url) { 64 | continue; 65 | } 66 | expect(expectedExtraPages.indexOf(url) >= 0).toBe(true); 67 | expect(parsedPage.depth >= 1).toBe(true); 68 | } 69 | }, 70 | extraHopsTimeout, 71 | ); 72 | 73 | 74 | test("extra hops applies beyond depth limit", () => { 75 | try { 76 | execSync( 77 | "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection extra-hops-depth-0 --extraHops 1 --url https://old.webrecorder.net/ --limit 2 --depth 0 --timeout 10 --exclude community --exclude tools", 78 | ); 79 | } catch (error) { 80 | console.log(error); 81 | } 82 | 83 | const crawledExtraPages = fs.readFileSync( 84 | "test-crawls/collections/extra-hops-depth-0/pages/extraPages.jsonl", 85 | "utf8", 86 | ); 87 | const crawledExtraPagesArray = crawledExtraPages.trim().split("\n"); 88 | 89 | expect(crawledExtraPagesArray.length - 1).toEqual(1); 90 | }); 91 | 92 | -------------------------------------------------------------------------------- /tests/file_stats.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | test("ensure that stats file is modified", async () => { 5 | const child = child_process.exec( 6 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ 
--generateWACZ --text --limit 3 --exclude community --collection file-stats --statsFilename progress.json", 7 | ); 8 | 9 | // detect crawler exit 10 | let crawler_exited = false; 11 | child.on("exit", function () { 12 | crawler_exited = true; 13 | }); 14 | 15 | // helper function to sleep 16 | const sleep = (ms) => new Promise((res) => setTimeout(res, ms)); 17 | 18 | // wait for stats file creation up to 30 secs (to not wait indefinitely) 19 | let counter = 0; 20 | while (!fs.existsSync("test-crawls/progress.json")) { 21 | await sleep(100); 22 | counter++; 23 | expect(counter < 300).toBe(true); 24 | } 25 | 26 | // get initial modification time 27 | const initial_mtime = fs.fstatSync( 28 | fs.openSync("test-crawls/progress.json", "r"), 29 | ).mtime; 30 | 31 | // wait for crawler exit 32 | while (!crawler_exited) { 33 | await sleep(100); 34 | } 35 | 36 | // get final modification time 37 | const final_mtime = fs.fstatSync( 38 | fs.openSync("test-crawls/progress.json", "r"), 39 | ).mtime; 40 | 41 | // compare initial and final modification time 42 | const diff = Math.abs(final_mtime - initial_mtime); 43 | expect(diff > 0).toBe(true); 44 | }); 45 | 46 | test("check that stats file format is correct", () => { 47 | const data = fs.readFileSync("test-crawls/progress.json", "utf8"); 48 | const dataJSON = JSON.parse(data); 49 | expect(dataJSON.crawled).toEqual(3); 50 | expect(dataJSON.total).toEqual(3); 51 | expect(dataJSON.pending).toEqual(0); 52 | expect(dataJSON.failed).toEqual(0); 53 | expect(dataJSON.limit.max).toEqual(3); 54 | expect(dataJSON.limit.hit).toBe(true); 55 | expect(dataJSON.pendingPages.length).toEqual(0); 56 | }); 57 | -------------------------------------------------------------------------------- /tests/fixtures/crawl-1.yaml: -------------------------------------------------------------------------------- 1 | name: crawl-test-1 2 | collection: configtest 3 | seeds: 4 | - https://webrecorder.net/ 5 | - https://specs.webrecorder.net/ 6 | 7 | generateWACZ: true 8 | -------------------------------------------------------------------------------- /tests/fixtures/crawl-2.yaml: -------------------------------------------------------------------------------- 1 | name: crawl-test-2 2 | 3 | seeds: 4 | - https://webrecorder.net/ 5 | 6 | collection: config-stdin 7 | depth: 1 8 | behaviors: "" 9 | 10 | generateWACZ: true 11 | 12 | warcinfo: 13 | operator: test 14 | host: hostname 15 | -------------------------------------------------------------------------------- /tests/fixtures/driver-1.mjs: -------------------------------------------------------------------------------- 1 | export default async ({ data, page, crawler }) => { 2 | await crawler.loadPage(page, data); 3 | 4 | await page.pdf({"path": `${crawler.collDir}/${data.pageid}.pdf`}); 5 | }; 6 | -------------------------------------------------------------------------------- /tests/fixtures/pages.jsonl: -------------------------------------------------------------------------------- 1 | {"format":"json-pages-1.0","id":"pages","title":"All Pages","hasText":true} 2 | {"title":"Example Domain","url":"http://www.example.com/","id":"2qok7uessksqo91vt90x8q","size":1256,"ts":"2021-02-24T02:31:27.538Z","text":"Example Domain\nThis domain is for use in illustrative examples in documents. 
You may use this\n domain in literature without prior coordination or asking for permission.\nMore information..."} 3 | -------------------------------------------------------------------------------- /tests/fixtures/proxy-key: -------------------------------------------------------------------------------- 1 | -----BEGIN OPENSSH PRIVATE KEY----- 2 | b3BlbnNzaC1rZXktdjEAAAAABG5vbmUAAAAEbm9uZQAAAAAAAAABAAAAMwAAAAtzc2gtZW 3 | QyNTUxOQAAACBlI7zERGb3+ugvSkqMQytJp/XEQhsAw5c2We9HccnV0gAAAJi1AenmtQHp 4 | 5gAAAAtzc2gtZWQyNTUxOQAAACBlI7zERGb3+ugvSkqMQytJp/XEQhsAw5c2We9HccnV0g 5 | AAAEB76AYPsL0SvcLL7AUKUwF9jY077ylBHaIea3sWs3b9s2UjvMREZvf66C9KSoxDK0mn 6 | 9cRCGwDDlzZZ70dxydXSAAAADnRlc3RAbG9jYWxob3N0AQIDBAUGBw== 7 | -----END OPENSSH PRIVATE KEY----- 8 | -------------------------------------------------------------------------------- /tests/fixtures/proxy-key.pub: -------------------------------------------------------------------------------- 1 | ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGUjvMREZvf66C9KSoxDK0mn9cRCGwDDlzZZ70dxydXS test@localhost 2 | -------------------------------------------------------------------------------- /tests/fixtures/urlSeedFile.txt: -------------------------------------------------------------------------------- 1 | https://webrecorder.net/about/ 2 | https://specs.webrecorder.net/wacz/1.1.1/ 3 | -------------------------------------------------------------------------------- /tests/http-auth.test.js: -------------------------------------------------------------------------------- 1 | import { execSync, spawn } from "child_process"; 2 | import fs from "fs"; 3 | import yaml from "js-yaml"; 4 | 5 | let proc = null; 6 | 7 | const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal"; 8 | 9 | beforeAll(() => { 10 | proc = spawn("../../node_modules/.bin/http-server", ["-p", "31501", "--username", "user", "--password", "pass"], {cwd: "./docs/site"}); 11 | }); 12 | 13 | afterAll(() => { 14 | if (proc) { 15 | proc.kill(); 16 | } 17 | }); 18 | 19 | test("run crawl without auth", () => { 20 | let status = 0; 21 | try { 22 | execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --limit 2 --failOnFailedSeed`); 23 | } catch (e) { 24 | status = e.status; 25 | } 26 | expect(status).toBe(1); 27 | }); 28 | 29 | test("run crawl with auth", () => { 30 | let status = 0; 31 | try { 32 | execSync(`docker run --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://user:pass@${DOCKER_HOST_NAME}:31501 --limit 2 --failOnFailedSeed --collection http-auth-test`); 33 | } catch (e) { 34 | status = e.status; 35 | } 36 | 37 | expect(status).toBe(0); 38 | 39 | expect(fs 40 | .readFileSync( 41 | "test-crawls/collections/http-auth-test/pages/pages.jsonl", 42 | "utf8", 43 | ) 44 | .trim() 45 | .split("\n") 46 | .length).toBe(2); 47 | 48 | expect(fs 49 | .readFileSync( 50 | "test-crawls/collections/http-auth-test/pages/extraPages.jsonl", 51 | "utf8", 52 | ) 53 | .trim() 54 | .split("\n") 55 | .length).toBe(2); 56 | 57 | }); 58 | 59 | test("run crawl with auth config.yaml", () => { 60 | const config = { 61 | seeds: [{ 62 | url: `http://${DOCKER_HOST_NAME}:31501`, 63 | auth: "user:pass" 64 | }], 65 | limit: "2", 66 | collection: "http-auth-test-2", 67 | failOnFailedSeed: "true" 68 | } 69 | 70 | const configYaml = yaml.dump(config); 71 | 72 | let status = 0; 73 | try { 74 | execSync("docker run -i --rm -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin", 75 | { input: configYaml, stdin: 
"inherit", encoding: "utf8" }); 76 | 77 | } catch (e) { 78 | console.log(e); 79 | status = e.status; 80 | } 81 | 82 | expect(status).toBe(0); 83 | 84 | expect(fs 85 | .readFileSync( 86 | "test-crawls/collections/http-auth-test-2/pages/pages.jsonl", 87 | "utf8", 88 | ) 89 | .trim() 90 | .split("\n") 91 | .length).toBe(2); 92 | 93 | expect(fs 94 | .readFileSync( 95 | "test-crawls/collections/http-auth-test-2/pages/extraPages.jsonl", 96 | "utf8", 97 | ) 98 | .trim() 99 | .split("\n") 100 | .length).toBe(2); 101 | }); 102 | -------------------------------------------------------------------------------- /tests/invalid-behaviors/invalid-export.js: -------------------------------------------------------------------------------- 1 | export class TestBehavior { 2 | static init() { 3 | return { 4 | state: {}, 5 | }; 6 | } 7 | 8 | static get id() { 9 | return "TestBehavior"; 10 | } 11 | 12 | static isMatch() { 13 | return window.location.origin === "https://example.com"; 14 | } 15 | 16 | async *run(ctx) { 17 | ctx.log("In Test Behavior!"); 18 | yield ctx.Lib.getState(ctx, "test-stat"); 19 | } 20 | } 21 | -------------------------------------------------------------------------------- /tests/lang-code.test.js: -------------------------------------------------------------------------------- 1 | import { execSync } from "child_process"; 2 | 3 | test("run crawl with invalid lang", () => { 4 | let status = 0; 5 | try { 6 | execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/feed.xml --lang e --limit 1`); 7 | } catch (e) { 8 | status = e.status; 9 | } 10 | expect(status).toBe(17); 11 | }); 12 | 13 | test("run crawl with valid lang", () => { 14 | let status = 0; 15 | try { 16 | execSync(`docker run --rm webrecorder/browsertrix-crawler crawl --url https://webrecorder.net/feed.xml --lang en --limit 1`); 17 | } catch (e) { 18 | status = e.status; 19 | } 20 | expect(status).toBe(0); 21 | }); 22 | 23 | 24 | -------------------------------------------------------------------------------- /tests/limit_reached.test.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import util from "util"; 3 | import { exec as execCallback, execSync } from "child_process"; 4 | 5 | const exec = util.promisify(execCallback); 6 | 7 | test("ensure page limit reached", async () => { 8 | execSync( 9 | 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --scopeType prefix --behaviors "" --url https://old.webrecorder.net/ --limit 12 --workers 2 --collection limit-test --statsFilename stats.json --exclude community', 10 | ); 11 | }); 12 | 13 | test("check limit written to stats file is as expected", () => { 14 | const data = fs.readFileSync("test-crawls/stats.json", "utf8"); 15 | const dataJSON = JSON.parse(data); 16 | expect(dataJSON.crawled).toEqual(12); 17 | expect(dataJSON.total).toEqual(12); 18 | expect(dataJSON.limit.hit).toBe(true); 19 | }); 20 | 21 | test("ensure crawl fails if failOnFailedLimit is reached", async () => { 22 | let passed = true; 23 | try { 24 | await exec( 25 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/will404 --url https://specs.webrecorder.net --failOnInvalidStatus --failOnFailedLimit 1 --limit 10 --collection faillimitreached", 26 | ); 27 | } catch (error) { 28 | expect(error.code).toEqual(12); 29 | passed = false; 30 | } 31 | expect(passed).toBe(false); 32 | }); 33 | 34 | test("ensure crawl fails if timeLimit is 
reached", async () => { 35 | let passed = true; 36 | try { 37 | await exec( 38 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --failOnInvalidStatus --timeLimit 1 --limit 2 --collection failontimelimitreached", 39 | ); 40 | } catch (error) { 41 | expect(error.code).toEqual(15); 42 | passed = false; 43 | } 44 | expect(passed).toBe(false); 45 | }); 46 | 47 | test("ensure crawl fails if sizeLimit is reached", async () => { 48 | let passed = true; 49 | try { 50 | await exec( 51 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --failOnInvalidStatus --sizeLimit 1 --limit 2 --collection failonsizelimitreached", 52 | ); 53 | } catch (error) { 54 | expect(error.code).toEqual(14); 55 | passed = false; 56 | } 57 | expect(passed).toBe(false); 58 | }); 59 | 60 | test("ensure crawl fails if diskUtilizationLimit is reached", async () => { 61 | let passed = true; 62 | try { 63 | await exec( 64 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://specs.webrecorder.net --failOnInvalidStatus --diskUtilization 1 --limit 2 --collection failonsizelimitreached", 65 | ); 66 | } catch (error) { 67 | expect(error.code).toEqual(16); 68 | passed = false; 69 | } 70 | expect(passed).toBe(false); 71 | }); 72 | -------------------------------------------------------------------------------- /tests/log_filtering.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import path from "path"; 4 | 5 | function jsonLinesToArray(string) { 6 | return string 7 | .split("\n") 8 | .filter((line) => { 9 | try { 10 | JSON.parse(line); 11 | return true; 12 | } catch (error) { 13 | return false; 14 | } 15 | }) 16 | .map((line) => JSON.parse(line)); 17 | } 18 | 19 | test("ensure crawl run with log options passes", async () => { 20 | child_process.execSync( 21 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url http://specs.webrecorder.net --generateWACZ --collection wr-specs-logs --logging debug,stats --logLevel debug,warn --context general", 22 | ); 23 | }); 24 | 25 | test("check that log files exist and were filtered according to options", () => { 26 | const logDir = "test-crawls/collections/wr-specs-logs/logs/"; 27 | const logFiles = []; 28 | fs.readdirSync(logDir).forEach((file) => { 29 | if (file.endsWith(".log")) { 30 | logFiles.push(path.join(logDir, file)); 31 | } 32 | }); 33 | 34 | expect(logFiles.length).toBeGreaterThan(0); 35 | 36 | for (let i = 0; i < logFiles.length; i++) { 37 | const logFile = logFiles[i]; 38 | const parsedJSONLines = jsonLinesToArray(fs.readFileSync(logFile, "utf8")); 39 | 40 | expect(parsedJSONLines.length).toBeGreaterThan(0); 41 | 42 | parsedJSONLines.forEach((jsonLine) => { 43 | expect( 44 | jsonLine.logLevel === "debug" || jsonLine.logLevel === "warn", 45 | ).toBe(true); 46 | expect(jsonLine.context).toBe("general"); 47 | }); 48 | } 49 | }); 50 | -------------------------------------------------------------------------------- /tests/mult_url_crawl_with_favicon.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | const doValidate = process.argv.filter((x) => x.startsWith('-validate'))[0]; 5 | const testIf = (condition, ...args) => condition ? 
test(...args) : test.skip(...args); 6 | 7 | test("ensure multi url crawl run with docker run passes", async () => { 8 | child_process.execSync( 9 | 'docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://old.webrecorder.net/ --generateWACZ --text --collection advanced --combineWARC --rolloverSize 10000 --workers 2 --title "test title" --description "test description" --pages 2 --limit 2 --exclude community', 10 | ); 11 | }); 12 | 13 | testIf(doValidate, "validate multi url crawl wacz", () => { 14 | child_process.execSync( 15 | "wacz validate --file ./test-crawls/collections/advanced/advanced.wacz", 16 | ); 17 | }); 18 | 19 | test("check that the favicon made it into the pages jsonl file", () => { 20 | expect( 21 | fs.existsSync("test-crawls/collections/advanced/pages/pages.jsonl"), 22 | ).toBe(true); 23 | 24 | const data1 = JSON.parse( 25 | fs 26 | .readFileSync( 27 | "test-crawls/collections/advanced/pages/pages.jsonl", 28 | "utf8", 29 | ) 30 | .split("\n")[1], 31 | ); 32 | const data2 = JSON.parse( 33 | fs 34 | .readFileSync( 35 | "test-crawls/collections/advanced/pages/pages.jsonl", 36 | "utf8", 37 | ) 38 | .split("\n")[2], 39 | ); 40 | const data = [data1, data2]; 41 | for (const d of data) { 42 | if (d.url === "https://old.webrecorder.net/") { 43 | expect(d.favIconUrl).toEqual( 44 | "https://old.webrecorder.net/assets/favicon.ico", 45 | ); 46 | } 47 | if (d.url === "https://iana.org/") { 48 | expect(d.favIconUrl).toEqual( 49 | "https://www.iana.org/_img/bookmark_icon.ico", 50 | ); 51 | } 52 | } 53 | }); 54 | -------------------------------------------------------------------------------- /tests/multi-instance-crawl.test.js: -------------------------------------------------------------------------------- 1 | import {exec, execSync} from "child_process"; 2 | import fs from "fs"; 3 | import { Redis } from "ioredis"; 4 | 5 | function sleep(ms) { 6 | return new Promise((resolve) => setTimeout(resolve, ms)); 7 | } 8 | 9 | 10 | let redisId; 11 | let crawler1, crawler2; 12 | 13 | beforeAll(() => { 14 | fs.rmSync("./test-crawls/collections/shared-crawler-1", { recursive: true, force: true }); 15 | fs.rmSync("./test-crawls/collections/shared-crawler-2", { recursive: true, force: true }); 16 | 17 | execSync("docker network create crawl"); 18 | 19 | redisId = execSync("docker run --rm --network=crawl -p 37379:6379 --name redis -d redis"); 20 | 21 | crawler1 = runCrawl("crawler-1"); 22 | crawler2 = runCrawl("crawler-2"); 23 | }); 24 | 25 | afterAll(async () => { 26 | execSync(`docker kill ${redisId}`); 27 | 28 | await sleep(3000); 29 | 30 | await Promise.allSettled([crawler1, crawler2]); 31 | 32 | execSync("docker network rm crawl"); 33 | }); 34 | 35 | function runCrawl(name) { 36 | const crawler = exec(`docker run --rm -v $PWD/test-crawls:/crawls --network=crawl --hostname=${name} webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 4 --exclude community --collection shared-${name} --crawlId testcrawl --redisStoreUrl redis://redis:6379`); 37 | 38 | return new Promise((resolve) => { 39 | crawler.on("exit", (code) => { 40 | resolve(code); 41 | }); 42 | }); 43 | } 44 | 45 | test("run crawlers with external redis", async () => { 46 | const redis = new Redis("redis://127.0.0.1:37379/0", { lazyConnect: true, retryStrategy: () => null }); 47 | 48 | await sleep(3000); 49 | 50 | await redis.connect({ maxRetriesPerRequest: 50 }); 51 | 52 | let count = 0; 53 | 54 | while (true) { 55 | try { 56 | const values = await 
redis.hgetall("testcrawl:status"); 57 | expect(values["crawler-1"]).toBe("running"); 58 | expect(values["crawler-2"]).toBe("running"); 59 | break; 60 | } catch (e) { 61 | if (count++ < 5) { 62 | await sleep(1000); 63 | continue; 64 | } 65 | 66 | throw e; 67 | } 68 | } 69 | 70 | }); 71 | 72 | 73 | test("finish crawls successfully", async () => { 74 | const res = await Promise.allSettled([crawler1, crawler2]); 75 | expect(res[0].value).toBe(0); 76 | expect(res[1].value).toBe(0); 77 | }, 180000); 78 | 79 | test("ensure correct number of pages", () => { 80 | 81 | expect( 82 | fs.existsSync("test-crawls/collections/shared-crawler-1/pages/pages.jsonl"), 83 | ).toBe(true); 84 | 85 | expect( 86 | fs.existsSync("test-crawls/collections/shared-crawler-2/pages/pages.jsonl"), 87 | ).toBe(true); 88 | 89 | const pages_1 = fs 90 | .readFileSync( 91 | "test-crawls/collections/shared-crawler-1/pages/pages.jsonl", 92 | "utf8", 93 | ) 94 | .trim() 95 | .split("\n"); 96 | 97 | const pages_2 = fs 98 | .readFileSync( 99 | "test-crawls/collections/shared-crawler-2/pages/pages.jsonl", 100 | "utf8", 101 | ) 102 | .trim() 103 | .split("\n"); 104 | 105 | // add 2 for heading in each file 106 | expect(pages_1.length + pages_2.length).toBe(1 + 2); 107 | }); 108 | 109 | test("ensure correct number of extraPages", () => { 110 | 111 | expect( 112 | fs.existsSync("test-crawls/collections/shared-crawler-1/pages/extraPages.jsonl"), 113 | ).toBe(true); 114 | 115 | expect( 116 | fs.existsSync("test-crawls/collections/shared-crawler-2/pages/extraPages.jsonl"), 117 | ).toBe(true); 118 | 119 | const pages_1 = fs 120 | .readFileSync( 121 | "test-crawls/collections/shared-crawler-1/pages/extraPages.jsonl", 122 | "utf8", 123 | ) 124 | .trim() 125 | .split("\n"); 126 | 127 | const pages_2 = fs 128 | .readFileSync( 129 | "test-crawls/collections/shared-crawler-2/pages/extraPages.jsonl", 130 | "utf8", 131 | ) 132 | .trim() 133 | .split("\n"); 134 | 135 | // add 2 for heading in each file 136 | expect(pages_1.length + pages_2.length).toBe(3 + 2); 137 | }); 138 | -------------------------------------------------------------------------------- /tests/non-html-crawl.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import path from "path"; 4 | import { WARCParser } from "warcio"; 5 | 6 | const PDF = "https://specs.webrecorder.net/wacz/1.1.1/wacz-2021.pdf"; 7 | const PDF_HTTP = PDF.replace("https", "http"); 8 | 9 | const XML = "https://webrecorder.net/feed.xml"; 10 | const XML_REDIR = "https://www.webrecorder.net/feed.xml"; 11 | 12 | test("PDF: ensure pdf is crawled", () => { 13 | child_process.execSync( 14 | `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${PDF}" --collection crawl-pdf` 15 | ); 16 | }); 17 | 18 | test("PDF: check that individual WARCs have PDF written as 200 response", async () => { 19 | const archiveWarcLists = fs.readdirSync( 20 | "test-crawls/collections/crawl-pdf/archive", 21 | ); 22 | 23 | const warcName = path.join("test-crawls/collections/crawl-pdf/archive", archiveWarcLists[0]); 24 | 25 | const nodeStream = fs.createReadStream(warcName); 26 | 27 | const parser = new WARCParser(nodeStream); 28 | 29 | let statusCode = -1; 30 | 31 | for await (const record of parser) { 32 | if (record.warcType !== "response") { 33 | continue; 34 | } 35 | 36 | if (record.warcTargetURI === PDF) { 37 | statusCode = record.httpHeaders.statusCode; 38 | } 39 | } 40 | 41 | 
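// the directly fetched PDF should have been captured as an ordinary 200 response record in the WARC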
expect(statusCode).toBe(200); 42 | }); 43 | 44 | test("PDF: ensure pdf with redirect is crawled", () => { 45 | child_process.execSync( 46 | `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${PDF_HTTP}" --collection crawl-pdf --generateCDX` 47 | ); 48 | }); 49 | 50 | test("PDF: check that the pages.jsonl file entry contains status code and mime type", () => { 51 | expect( 52 | fs.existsSync("test-crawls/collections/crawl-pdf/pages/pages.jsonl"), 53 | ).toBe(true); 54 | 55 | 56 | const pages = fs 57 | .readFileSync( 58 | "test-crawls/collections/crawl-pdf/pages/pages.jsonl", 59 | "utf8", 60 | ) 61 | .trim() 62 | .split("\n"); 63 | 64 | expect(pages.length).toBe(3); 65 | 66 | const page = JSON.parse(pages[1]); 67 | expect(page.url).toBe(PDF); 68 | expect(page.status).toBe(200); 69 | expect(page.mime).toBe("application/pdf"); 70 | expect(page.loadState).toBe(2); 71 | 72 | const pageH = JSON.parse(pages[2]); 73 | expect(pageH.url).toBe(PDF_HTTP); 74 | expect(pageH.status).toBe(200); 75 | expect(pageH.mime).toBe("application/pdf"); 76 | expect(pageH.loadState).toBe(2); 77 | }); 78 | 79 | test("PDF: check that CDX contains one pdf 200, one 301 and one 200, two pageinfo entries", () => { 80 | const filedata = fs.readFileSync( 81 | "test-crawls/collections/crawl-pdf/indexes/index.cdxj", 82 | { encoding: "utf-8" }, 83 | ); 84 | 85 | const lines = filedata.trim().split("\n"); 86 | const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? -1 : 1); 87 | 88 | expect(cdxj.length).toBe(5); 89 | 90 | expect(cdxj[0].url).toBe(PDF_HTTP); 91 | expect(cdxj[0].status).toBe("301"); 92 | 93 | expect(cdxj[1].url).toBe(PDF); 94 | expect(cdxj[1].status).toBe("200"); 95 | expect(cdxj[1].mime).toBe("application/pdf"); 96 | 97 | expect(cdxj[2].url).toBe(PDF); 98 | expect(cdxj[2].status).toBe("200"); 99 | expect(cdxj[2].mime).toBe("application/pdf"); 100 | 101 | expect(cdxj[3].url).toBe("urn:pageinfo:" + PDF_HTTP); 102 | expect(cdxj[3].mime).toBe("application/json"); 103 | 104 | expect(cdxj[4].url).toBe("urn:pageinfo:" + PDF); 105 | expect(cdxj[4].mime).toBe("application/json"); 106 | }); 107 | 108 | test("XML: ensure with and without redirect is crawled", () => { 109 | child_process.execSync( 110 | `docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url "${XML}" --url "${XML_REDIR}" --collection crawl-xml --generateCDX` 111 | ); 112 | }); 113 | 114 | test("XML: check pages.jsonl file entry contains status code and mime type", () => { 115 | expect( 116 | fs.existsSync("test-crawls/collections/crawl-xml/pages/pages.jsonl"), 117 | ).toBe(true); 118 | 119 | 120 | const pages = fs 121 | .readFileSync( 122 | "test-crawls/collections/crawl-xml/pages/pages.jsonl", 123 | "utf8", 124 | ) 125 | .trim() 126 | .split("\n"); 127 | 128 | expect(pages.length).toBe(3); 129 | 130 | const page = JSON.parse(pages[1]); 131 | expect(page.url).toBe(XML); 132 | expect(page.status).toBe(200); 133 | expect(page.mime).toBe("application/xml"); 134 | expect(page.loadState).toBe(2); 135 | 136 | const pageH = JSON.parse(pages[2]); 137 | expect(pageH.url).toBe(XML_REDIR); 138 | expect(pageH.status).toBe(200); 139 | expect(pageH.mime).toBe("application/xml"); 140 | expect(pageH.loadState).toBe(2); 141 | }); 142 | 143 | test("XML: check that CDX contains one xml 200, one 301 and one 200, two pageinfo entries", () => { 144 | const filedata = fs.readFileSync( 145 | "test-crawls/collections/crawl-xml/indexes/index.cdxj", 146 | { encoding: 
"utf-8" }, 147 | ); 148 | 149 | const lines = filedata.trim().split("\n"); 150 | const cdxj = lines.map(line => JSON.parse(line.split(" ").slice(2).join(" "))).sort((a, b) => a.url < b.url ? -1 : 1); 151 | 152 | expect(cdxj.length).toBe(6); 153 | 154 | expect(cdxj[0].url).toBe("https://webrecorder.net/favicon.ico"); 155 | 156 | expect(cdxj[1].url).toBe(XML); 157 | expect(cdxj[1].status).toBe("200"); 158 | expect(cdxj[1].mime).toBe("application/xml"); 159 | 160 | expect(cdxj[2].url).toBe(XML); 161 | expect(cdxj[2].status).toBe("200"); 162 | expect(cdxj[2].mime).toBe("application/xml"); 163 | 164 | expect(cdxj[3].url).toBe(XML_REDIR); 165 | expect(cdxj[3].status).toBe("301"); 166 | 167 | expect(cdxj[4].url).toBe("urn:pageinfo:" + XML); 168 | expect(cdxj[4].mime).toBe("application/json"); 169 | 170 | expect(cdxj[5].url).toBe("urn:pageinfo:" + XML_REDIR); 171 | expect(cdxj[5].mime).toBe("application/json"); 172 | }); 173 | 174 | 175 | -------------------------------------------------------------------------------- /tests/pageinfo-records.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import path from "path"; 4 | import { WARCParser } from "warcio"; 5 | 6 | test("run warc and ensure pageinfo records contain the correct resources", async () => { 7 | child_process.execSync( 8 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --url https://old.webrecorder.net/about --url https://invalid.invalid/ --scopeType page --collection page-info-test --combineWARC", 9 | ); 10 | 11 | const filename = path.join( 12 | "test-crawls", 13 | "collections", 14 | "page-info-test", 15 | "page-info-test_0.warc.gz", 16 | ); 17 | 18 | const nodeStream = fs.createReadStream(filename); 19 | 20 | const parser = new WARCParser(nodeStream); 21 | 22 | let foundIndex = false; 23 | let foundAbout = false; 24 | let foundInvalid = false; 25 | 26 | for await (const record of parser) { 27 | if (record.warcType === "response" && 28 | (record.warcTargetURI === "https://old.webrecorder.net/" || record.warcTargetURI === "https://old.webrecorder.net/about")) { 29 | expect(record.warcHeaders.headers.get("WARC-Protocol")).toBe("h2, tls/1.3"); 30 | } 31 | 32 | if ( 33 | !foundIndex && 34 | record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/" 35 | ) { 36 | foundIndex = true; 37 | const text = await record.contentText(); 38 | validateResourcesIndex(JSON.parse(text)); 39 | } 40 | 41 | if ( 42 | !foundAbout && 43 | record.warcTargetURI === "urn:pageinfo:https://old.webrecorder.net/about" 44 | ) { 45 | foundAbout = true; 46 | const text = await record.contentText(); 47 | validateResourcesAbout(JSON.parse(text)); 48 | } 49 | 50 | if ( 51 | !foundInvalid && 52 | record.warcTargetURI === "urn:pageinfo:https://invalid.invalid/" 53 | ) { 54 | foundInvalid = true; 55 | const text = await record.contentText(); 56 | validateResourcesInvalid(JSON.parse(text)); 57 | } 58 | } 59 | 60 | expect(foundIndex).toBe(true); 61 | expect(foundAbout).toBe(true); 62 | expect(foundInvalid).toBe(true); 63 | }); 64 | 65 | function validateResourcesIndex(json) { 66 | expect(json).toHaveProperty("pageid"); 67 | expect(json).toHaveProperty("url"); 68 | expect(json).toHaveProperty("ts"); 69 | expect(json).toHaveProperty("urls"); 70 | expect(json.counts).toEqual({ jsErrors: 0 }); 71 | expect(json.urls).toEqual({ 72 | "https://old.webrecorder.net/": { 73 | status: 200, 74 | mime: 
"text/html", 75 | type: "document", 76 | }, 77 | "https://old.webrecorder.net/assets/tools/logo-pywb.png": { 78 | mime: "image/png", 79 | status: 200, 80 | type: "image", 81 | }, 82 | "https://old.webrecorder.net/assets/brand/archivewebpage-icon-color.svg": { 83 | mime: "image/svg+xml", 84 | status: 200, 85 | type: "image", 86 | }, 87 | "https://old.webrecorder.net/assets/brand/browsertrix-icon-color.svg": { 88 | mime: "image/svg+xml", 89 | status: 200, 90 | type: "image", 91 | }, 92 | "https://old.webrecorder.net/assets/brand/browsertrixcrawler-icon-color.svg": { 93 | mime: "image/svg+xml", 94 | status: 200, 95 | type: "image", 96 | }, 97 | "https://old.webrecorder.net/assets/brand/replaywebpage-icon-color.svg": { 98 | mime: "image/svg+xml", 99 | status: 200, 100 | type: "image", 101 | }, 102 | "https://old.webrecorder.net/assets/fontawesome/all.css": { 103 | status: 200, 104 | mime: "text/css", 105 | type: "stylesheet", 106 | }, 107 | "https://old.webrecorder.net/assets/wr-logo.svg": { 108 | status: 200, 109 | mime: "image/svg+xml", 110 | type: "image", 111 | }, 112 | "https://old.webrecorder.net/assets/main.css": { 113 | status: 200, 114 | mime: "text/css", 115 | type: "stylesheet", 116 | }, 117 | "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap": 118 | { status: 200, mime: "text/css", type: "stylesheet" }, 119 | "https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap": 120 | { status: 200, mime: "text/css", type: "stylesheet" }, 121 | "https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": 122 | { status: 200, mime: "font/woff2", type: "font" }, 123 | "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": 124 | { status: 200, mime: "font/woff2", type: "font" }, 125 | "https://old.webrecorder.net/assets/favicon.ico": { 126 | status: 200, 127 | mime: "image/vnd.microsoft.icon", 128 | type: "other", 129 | }, 130 | }); 131 | } 132 | 133 | function validateResourcesAbout(json) { 134 | expect(json).toHaveProperty("pageid"); 135 | expect(json).toHaveProperty("url"); 136 | expect(json).toHaveProperty("ts"); 137 | expect(json).toHaveProperty("urls"); 138 | expect(json.counts).toEqual({ jsErrors: 0 }); 139 | expect(json.urls).toEqual({ 140 | "https://old.webrecorder.net/about": { 141 | status: 200, 142 | mime: "text/html", 143 | type: "document", 144 | }, 145 | "https://old.webrecorder.net/assets/main.css": { 146 | status: 200, 147 | mime: "text/css", 148 | type: "stylesheet", 149 | }, 150 | "https://old.webrecorder.net/assets/fontawesome/all.css": { 151 | status: 200, 152 | mime: "text/css", 153 | type: "stylesheet", 154 | }, 155 | "https://fonts.googleapis.com/css?family=Source+Code+Pro|Source+Sans+Pro&display=swap": 156 | { status: 200, mime: "text/css", type: "stylesheet" }, 157 | "https://fonts.googleapis.com/css2?family=Source+Sans+Pro:wght@700;900&display=swap": 158 | { status: 200, mime: "text/css", type: "stylesheet" }, 159 | "https://old.webrecorder.net/assets/wr-logo.svg": { 160 | status: 200, 161 | mime: "image/svg+xml", 162 | type: "image", 163 | }, 164 | "https://fonts.gstatic.com/s/sourcesanspro/v22/6xK3dSBYKcSV-LCoeQqfX1RYOo3qOK7l.woff2": 165 | { status: 200, mime: "font/woff2", type: "font" }, 166 | "https://fonts.gstatic.com/s/sourcesanspro/v22/6xKydSBYKcSV-LCoeQqfX1RYOo3ig4vwlxdu.woff2": 167 | { status: 200, mime: "font/woff2", type: "font" }, 168 | }); 169 | } 170 | 171 | function validateResourcesInvalid(json) { 172 | 
expect(json).toHaveProperty("pageid"); 173 | expect(json).toHaveProperty("url"); 174 | expect(json).toHaveProperty("urls"); 175 | expect(json.counts).toEqual({ jsErrors: 0 }); 176 | expect(json.urls).toEqual({ 177 | "https://invalid.invalid/": { 178 | status: 0, 179 | type: "document", 180 | error: "net::ERR_NAME_NOT_RESOLVED", 181 | }, 182 | }); 183 | } 184 | -------------------------------------------------------------------------------- /tests/qa_compare.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | import { Redis } from "ioredis"; 4 | 5 | const sleep = (ms) => new Promise((res) => setTimeout(res, ms)); 6 | 7 | test("run initial crawl with text and screenshots to prepare for QA", async () => { 8 | fs.rmSync("./test-crawls/qa-wr-net", { recursive: true, force: true }); 9 | 10 | child_process.execSync( 11 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --url https://old.webrecorder.net/about --url https://archiveweb.page/ --url https://old.webrecorder.net/contact --scopeType page --collection qa-wr-net --text to-warc --screenshot view --generateWACZ", 12 | ); 13 | 14 | expect( 15 | fs.existsSync("test-crawls/collections/qa-wr-net/qa-wr-net.wacz"), 16 | ).toBe(true); 17 | }); 18 | 19 | test("run QA comparison, with write pages to redis", async () => { 20 | fs.rmSync("./test-crawls/qa-wr-net-replay", { recursive: true, force: true }); 21 | 22 | const child = child_process.exec( 23 | "docker run -p 36380:6379 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler qa --qaSource /crawls/collections/qa-wr-net/qa-wr-net.wacz --collection qa-wr-net-replay --crawlId test --qaDebugImageDiff --writePagesToRedis --debugAccessRedis --exclude contact", 24 | ); 25 | 26 | // detect crawler exit 27 | let crawler_exited = false; 28 | child.on("exit", function () { 29 | crawler_exited = true; 30 | }); 31 | 32 | const redis = new Redis("redis://127.0.0.1:36380/0", { lazyConnect: true, retryStrategy: () => null }); 33 | 34 | await sleep(3000); 35 | 36 | await redis.connect({ maxRetriesPerRequest: 50 }); 37 | 38 | let count = 0; 39 | 40 | while (count < 3) { 41 | const res = await redis.lpop("test:pages"); 42 | if (!res) { 43 | if (crawler_exited) { 44 | break; 45 | } 46 | await sleep(100); 47 | continue; 48 | } 49 | const json = JSON.parse(res); 50 | expect(json).toHaveProperty("id"); 51 | expect(json).toHaveProperty("url"); 52 | expect(json).toHaveProperty("ts"); 53 | expect(json).toHaveProperty("title"); 54 | expect(json).toHaveProperty("loadState"); 55 | expect(json).toHaveProperty("comparison"); 56 | 57 | expect(json.title.indexOf("contact") < 0).toBe(true); 58 | 59 | expect(json.comparison).toHaveProperty("screenshotMatch"); 60 | expect(json.comparison).toHaveProperty("textMatch"); 61 | expect(json.comparison).toHaveProperty("resourceCounts"); 62 | expect(json.comparison.screenshotMatch).toBe(1); 63 | expect(json.comparison.textMatch).toBe(1); 64 | 65 | expect(json.comparison.resourceCounts).toHaveProperty("crawlGood"); 66 | expect(json.comparison.resourceCounts).toHaveProperty("crawlBad"); 67 | expect(json.comparison.resourceCounts).toHaveProperty("replayGood"); 68 | expect(json.comparison.resourceCounts).toHaveProperty("replayBad"); 69 | 70 | count++; 71 | } 72 | 73 | expect(count).toBe(3); 74 | 75 | // wait for crawler exit 76 | while (!crawler_exited) { 77 | await sleep(100); 78 | } 79 | }); 80 | 
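
The QA comparison test above follows a pattern that recurs throughout this suite: publish the container's Redis on a host port with -p, then poll a "<crawlId>:pages" list while watching for the crawler process to exit. A minimal standalone sketch of that polling loop follows; the helper name drainPages, the default port, and the crawl id are illustrative assumptions, not anything fixed by the crawler itself.

import { exec } from "child_process";
import { Redis } from "ioredis";

const sleep = (ms) => new Promise((res) => setTimeout(res, ms));

// Sketch: start a crawl container and drain JSON page entries from its
// "<crawlId>:pages" Redis list until `max` entries are read or the
// crawler exits. Port, crawl id, and function name are illustrative.
async function drainPages(dockerCmd, { port = 36380, crawlId = "test", max = 3 } = {}) {
  const child = exec(dockerCmd);
  let exited = false;
  child.on("exit", () => { exited = true; });

  const redis = new Redis(`redis://127.0.0.1:${port}/0`, {
    lazyConnect: true,
    retryStrategy: () => null,
  });
  await sleep(3000);
  await redis.connect();

  const pages = [];
  while (pages.length < max && !exited) {
    const res = await redis.lpop(`${crawlId}:pages`);
    if (!res) {
      await sleep(100);
      continue;
    }
    pages.push(JSON.parse(res));
  }
  return pages;
}

In practice each test inlines this loop rather than sharing a helper, which keeps the docker invocation and the per-test assertions visible in one place.
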
--------------------------------------------------------------------------------
/tests/retry-failed.test.js:
--------------------------------------------------------------------------------
1 | import { exec, execSync } from "child_process";
2 | import fs from "fs";
3 | import http from "http";
4 | import Redis from "ioredis";
5 | 
6 | const DOCKER_HOST_NAME = process.env.DOCKER_HOST_NAME || "host.docker.internal";
7 | 
8 | async function sleep(time) {
9 |   await new Promise((resolve) => setTimeout(resolve, time));
10 | }
11 | 
12 | let requests = 0;
13 | let success = false;
14 | let server = null;
15 | 
16 | beforeAll(() => {
17 |   server = http.createServer((req, res) => {
18 |     // 3 requests: 2 from browser, 1 direct fetch per attempt
19 |     // succeed starting with the 7th request == after 2 retries
20 |     if (requests >= 6) {
21 |       res.writeHead(200, {"Content-Type": "text/html"});
22 |       res.end("Test Data");
23 |       success = true;
24 |     } else {
25 |       res.writeHead(503, {"Content-Type": "text/html"});
26 |       res.end("Test Data");
27 |     }
28 |     requests++;
29 |   });
30 | 
31 |   server.listen(31501, "0.0.0.0");
32 | });
33 | 
34 | afterAll(() => {
35 |   server.close();
36 | });
37 | 
38 | 
39 | 
40 | test("run crawl with retries for no response", async () => {
41 |   execSync(`docker run -d -v $PWD/test-crawls:/crawls -e CRAWL_ID=test -p 36387:6379 --rm webrecorder/browsertrix-crawler crawl --url http://invalid-host-x:31501 --url https://example.com/ --limit 2 --pageExtraDelay 10 --debugAccessRedis --collection retry-fail --retries 5`);
42 | 
43 |   const redis = new Redis("redis://127.0.0.1:36387/0", { lazyConnect: true, retryStrategy: () => null });
44 | 
45 |   await sleep(3000);
46 | 
47 |   let numRetries = 0;
48 | 
49 |   try {
50 |     await redis.connect({
51 |       maxRetriesPerRequest: 100,
52 |     });
53 | 
54 |     while (true) {
55 |       const res = await redis.lrange("test:f", 0, -1);
56 |       if (res.length) {
57 |         const data = JSON.parse(res[0]);
58 |         if (data.retry) {
59 |           numRetries = data.retry;
60 |           break;
61 |         }
62 |       }
63 |       await sleep(20);
64 |     }
65 | 
66 |   } catch (e) {
67 |     console.error(e);
68 |   } finally {
69 |     expect(numRetries).toBe(5);
70 |   }
71 | });
72 | 
73 | 
74 | test("check only one failed page entry is made", () => {
75 |   expect(
76 |     fs.existsSync("test-crawls/collections/retry-fail/pages/pages.jsonl"),
77 |   ).toBe(true);
78 | 
79 |   expect(
80 |     fs
81 |       .readFileSync(
82 |         "test-crawls/collections/retry-fail/pages/pages.jsonl",
83 |         "utf8",
84 |       ).trim().split("\n").length
85 |   ).toBe(3);
86 | });
87 | 
88 | 
89 | test("run crawl with retries for 503, enough retries to succeed", async () => {
90 |   requests = 0;
91 |   success = false;
92 | 
93 |   const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --collection retry-fail-2 --retries 2 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
94 | 
95 |   let status = 0;
96 |   let resolve = null;
97 |   const crawlFinished = new Promise(r => resolve = r);
98 | 
99 |   // detect crawler exit
100 |   let crawler_exited = false;
101 |   child.on("exit", function (code) {
102 |     status = code;
103 |     resolve();
104 |   });
105 | 
106 |   await crawlFinished;
107 | 
108 |   expect(status).toBe(0);
109 | 
110 |   // (1 + 2) * 3 == 9 requests
111 |   expect(requests).toBe(9);
112 |   expect(success).toBe(true);
113 | });
114 | 
115 | 
116 | test("run crawl with retries for 503, not enough retries, fail", async () => {
117 |   requests = 0;
118 |   success = false;
119 | 
120 |   const child = exec(`docker run -v 
$PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --collection retry-fail-3 --retries 1 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
121 | 
122 |   let status = 0;
123 |   let resolve = null;
124 |   const crawlFinished = new Promise(r => resolve = r);
125 | 
126 |   // detect crawler exit
127 |   let crawler_exited = false;
128 |   child.on("exit", function (code) {
129 |     status = code;
130 |     resolve();
131 |   });
132 | 
133 |   await crawlFinished;
134 | 
135 |   expect(status).toBe(1);
136 |   // (1 + 1) * 3 requests == 6 requests
137 |   expect(requests).toBe(6);
138 |   expect(success).toBe(false);
139 | });
140 | 
141 | 
142 | test("run crawl with retries for 503, no retries, fail", async () => {
143 |   requests = 0;
144 |   success = false;
145 | 
146 |   const child = exec(`docker run -v $PWD/test-crawls:/crawls --rm webrecorder/browsertrix-crawler crawl --url http://${DOCKER_HOST_NAME}:31501 --url https://example.com/ --limit 2 --collection retry-fail-4 --retries 0 --failOnInvalidStatus --failOnFailedSeed --logging stats,debug`);
147 | 
148 |   let status = 0;
149 |   let resolve = null;
150 |   const crawlFinished = new Promise(r => resolve = r);
151 | 
152 |   // detect crawler exit
153 |   let crawler_exited = false;
154 |   child.on("exit", function (code) {
155 |     status = code;
156 |     resolve();
157 |   });
158 | 
159 |   await crawlFinished;
160 | 
161 |   expect(status).toBe(1);
162 |   // (1) * 3 requests == 3 requests
163 |   expect(requests).toBe(3);
164 |   expect(success).toBe(false);
165 | });
166 | 
167 | 
168 | 
--------------------------------------------------------------------------------
/tests/rollover-writer.test.js:
--------------------------------------------------------------------------------
1 | import child_process from "child_process";
2 | import fs from "fs";
3 | 
4 | test("set rollover to 500K and ensure individual WARCs rollover, including screenshots", async () => {
5 |   child_process.execSync(
6 |     "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 5 --exclude community --collection rollover-500K --rolloverSize 500000 --screenshot view --logging debug"
7 |   );
8 | 
9 |   const warcLists = fs.readdirSync("test-crawls/collections/rollover-500K/archive");
10 | 
11 |   let main = 0;
12 |   let screenshots = 0;
13 | 
14 |   for (const name of warcLists) {
15 |     if (name.startsWith("rec-")) {
16 |       main++;
17 |     } else if (name.startsWith("screenshots-")) {
18 |       screenshots++;
19 |     }
20 |   }
21 | 
22 |   // expect at least 6 main WARCs
23 |   expect(main).toBeGreaterThan(5);
24 | 
25 |   // expect at least 2 screenshot WARCs
26 |   expect(screenshots).toBeGreaterThan(1);
27 | 
28 | });
29 | 
--------------------------------------------------------------------------------
/tests/saved-state.test.js:
--------------------------------------------------------------------------------
1 | import { execSync } from "child_process";
2 | import fs from "fs";
3 | import path from "path";
4 | import yaml from "js-yaml";
5 | import Redis from "ioredis";
6 | 
7 | 
8 | const pagesFile = "test-crawls/collections/int-state-test/pages/pages.jsonl";
9 | const extraPagesFile = "test-crawls/collections/int-state-test/pages/extraPages.jsonl";
10 | 
11 | 
12 | function sleep(ms) {
13 |   return new Promise((resolve) => setTimeout(resolve, ms));
14 | }
15 | 
16 | async function waitContainerDone(containerId) {
17 |   // containerId is initially the full id, but docker ps
18 |   // only prints the short id (first 12 characters)
19 |   containerId 
= containerId.slice(0, 12); 20 | 21 | while (true) { 22 | try { 23 | const res = execSync("docker ps -q", { encoding: "utf-8" }); 24 | if (res.indexOf(containerId) < 0) { 25 | return; 26 | } 27 | } catch (e) { 28 | console.error(e); 29 | } 30 | await sleep(500); 31 | } 32 | } 33 | 34 | async function killContainer(containerId) { 35 | try { 36 | execSync(`docker kill -s SIGINT ${containerId}`); 37 | } catch (e) { 38 | return; 39 | } 40 | 41 | await waitContainerDone(containerId); 42 | } 43 | 44 | 45 | let savedStateFile; 46 | let state; 47 | let numDone; 48 | let numQueued; 49 | let finished; 50 | 51 | test("check crawl interrupted + saved state written", async () => { 52 | let containerId = null; 53 | 54 | try { 55 | containerId = execSync( 56 | "docker run -d -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url http://old.webrecorder.net/ --limit 10 --behaviors \"\" --exclude community", 57 | { encoding: "utf-8" }, 58 | //wait.callback, 59 | ); 60 | } catch (error) { 61 | console.log(error); 62 | } 63 | 64 | // remove existing pagesFile to support reentrancy 65 | try { 66 | fs.unlinkSync(pagesFile); 67 | } catch (e) { 68 | // ignore 69 | } 70 | 71 | while (true) { 72 | try { 73 | const pages = fs 74 | .readFileSync(pagesFile, { encoding: "utf-8" }) 75 | .trim() 76 | .split("\n"); 77 | 78 | if (pages.length >= 2) { 79 | break; 80 | } 81 | } catch (e) { 82 | // ignore 83 | } 84 | 85 | await sleep(500); 86 | } 87 | 88 | await killContainer(containerId); 89 | 90 | const savedStates = fs.readdirSync( 91 | "test-crawls/collections/int-state-test/crawls", 92 | ); 93 | expect(savedStates.length > 0).toEqual(true); 94 | 95 | savedStateFile = savedStates[savedStates.length - 1]; 96 | }); 97 | 98 | test("check parsing saved state + page done + queue present", () => { 99 | expect(savedStateFile).toBeTruthy(); 100 | 101 | const savedState = fs.readFileSync( 102 | path.join("test-crawls/collections/int-state-test/crawls", savedStateFile), 103 | "utf-8", 104 | ); 105 | 106 | const saved = yaml.load(savedState); 107 | 108 | state = saved.state; 109 | finished = state.finished; 110 | 111 | numDone = finished.length; 112 | numQueued = state.queued.length; 113 | 114 | expect(!!state).toBe(true); 115 | expect(numDone > 0).toEqual(true); 116 | expect(numQueued > 0).toEqual(true); 117 | expect(numDone + numQueued).toEqual(10); 118 | 119 | // ensure extra seeds also set 120 | expect(state.extraSeeds).toEqual([ 121 | `{"origSeedId":0,"newUrl":"https://old.webrecorder.net/"}`, 122 | ]); 123 | }); 124 | 125 | test("check crawl restarted with saved state", async () => { 126 | let containerId = null; 127 | 128 | const port = 36379; 129 | 130 | try { 131 | containerId = execSync( 132 | `docker run -d -p ${port}:6379 -e CRAWL_ID=test -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection int-state-test --url https://old.webrecorder.net/ --config /crawls/collections/int-state-test/crawls/${savedStateFile} --debugAccessRedis --limit 10 --behaviors "" --exclude community`, 133 | { encoding: "utf-8" }, 134 | ); 135 | } catch (error) { 136 | console.log(error); 137 | } 138 | 139 | await sleep(2000); 140 | 141 | const redis = new Redis(`redis://127.0.0.1:${port}/0`, { lazyConnect: true, retryStrategy: () => null }); 142 | 143 | try { 144 | await redis.connect({ 145 | maxRetriesPerRequest: 100, 146 | }); 147 | 148 | await sleep(2000); 149 | 150 | expect(await 
redis.get("test:d")).toBe(numDone + ""); 151 | 152 | for (const url of finished) { 153 | const res = await redis.sismember("test:s", url); 154 | expect(res).toBe(1); 155 | } 156 | } catch (e) { 157 | console.log(e); 158 | } finally { 159 | await waitContainerDone(containerId); 160 | } 161 | }); 162 | 163 | test("ensure correct number of pages was written to pages + extraPages", () => { 164 | const pages = fs 165 | .readFileSync(pagesFile, { encoding: "utf-8" }) 166 | .trim() 167 | .split("\n"); 168 | 169 | // first line is the header 170 | expect(pages.length).toBe(2); 171 | 172 | const extraPages = fs 173 | .readFileSync(extraPagesFile, { encoding: "utf-8" }) 174 | .trim() 175 | .split("\n"); 176 | 177 | // first line is the header 178 | expect(extraPages.length).toBe(10); 179 | }); 180 | -------------------------------------------------------------------------------- /tests/screenshot.test.js: -------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import fs from "fs"; 3 | 4 | // screenshot 5 | 6 | function screenshotWarcExists(name) { 7 | const warcList = fs.readdirSync(`test-crawls/collections/${name}/archive/`); 8 | 9 | for (const warc of warcList) { 10 | if (warc.startsWith("screenshots-")) { 11 | return true; 12 | } 13 | } 14 | 15 | return false; 16 | } 17 | 18 | 19 | test("ensure basic crawl run with --screenshot passes", async () => { 20 | child_process.execSync( 21 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection test-with-screenshots --url http://www.example.com/ --screenshot view --workers 2", 22 | ); 23 | }); 24 | 25 | test("check that a screenshots warc file exists in the test collection", () => { 26 | expect(screenshotWarcExists("test-with-screenshots")).toBe(true); 27 | }); 28 | 29 | // fullPageScreenshot 30 | 31 | test("ensure basic crawl run with --fullPageScreenshot passes", async () => { 32 | child_process.execSync( 33 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection fullpage --url http://www.example.com/ --screenshot fullPage --workers 2", 34 | ); 35 | }); 36 | 37 | test("check that a screenshots warc file exists in the fullpage collection", () => { 38 | expect(screenshotWarcExists("fullpage")).toBe(true); 39 | }); 40 | 41 | // thumbnail 42 | 43 | test("ensure basic crawl run with --thumbnail passes", async () => { 44 | child_process.execSync( 45 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection thumbnail --url http://www.example.com/ --screenshot thumbnail --workers 2", 46 | ); 47 | }); 48 | 49 | test("check that a screenshots warc file exists in the thumbnail collection", () => { 50 | expect(screenshotWarcExists("thumbnail")).toBe(true); 51 | }); 52 | 53 | // combination 54 | 55 | test("ensure basic crawl run with multiple screenshot types and --generateWACZ passes", async () => { 56 | child_process.execSync( 57 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection combined --url http://www.example.com/ --screenshot thumbnail,view,fullPage --generateWACZ --workers 2", 58 | ); 59 | }); 60 | 61 | test("check that a screenshots warc file exists in the combined collection", () => { 62 | expect(screenshotWarcExists("combined")).toBe(true); 63 | }); 64 | 65 | test("check that a wacz file exists in the combined collection", () => { 66 | const waczExists = fs.existsSync( 67 | "test-crawls/collections/combined/combined.wacz", 68 | ); 
69 | expect(waczExists).toBe(true); 70 | }); 71 | -------------------------------------------------------------------------------- /tests/seeds.test.js: -------------------------------------------------------------------------------- 1 | import util from "util"; 2 | import { exec as execCallback } from "child_process"; 3 | 4 | const exec = util.promisify(execCallback); 5 | 6 | test("ensure one invalid seed doesn't end crawl if failOnFailedSeed is not set", async () => { 7 | let passed = true; 8 | try { 9 | await exec( 10 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://example.invalid --generateWACZ --limit 2 --collection invalidseed", 11 | ); 12 | } catch (error) { 13 | console.log(error); 14 | passed = false; 15 | } 16 | expect(passed).toBe(true); 17 | }); 18 | 19 | test("ensure one invalid seed fails crawl if failOnFailedSeed is set", async () => { 20 | let passed = true; 21 | try { 22 | await exec( 23 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url example.invalid --generateWACZ --limit 2 --failOnFailedSeed --collection failseed", 24 | ); 25 | } catch (error) { 26 | expect(error.code).toEqual(1); 27 | passed = false; 28 | } 29 | expect(passed).toBe(false); 30 | }); 31 | 32 | test("ensure seed with network error fails crawl if failOnFailedSeed and failOnInvalidStatus is set", async () => { 33 | let passed = true; 34 | try { 35 | await exec( 36 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://example.invalid --generateWACZ --limit 2 --failOnFailedSeed --failOnInvalidStatus --collection failseedstatus", 37 | ); 38 | } catch (error) { 39 | expect(error.code).toEqual(1); 40 | passed = false; 41 | } 42 | expect(passed).toBe(false); 43 | }); 44 | 45 | test("ensure seed with 4xx/5xx response fails crawl if failOnFailedSeed and failOnInvalidStatus are set", async () => { 46 | let passed = true; 47 | try { 48 | await exec( 49 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://old.webrecorder.net/doesntexist --generateWACZ --limit 2 --failOnFailedSeed --failOnInvalidStatus --collection failseed404status", 50 | ); 51 | } catch (error) { 52 | expect(error.code).toEqual(1); 53 | passed = false; 54 | } 55 | expect(passed).toBe(false); 56 | }); 57 | 58 | test("ensure seed with 4xx/5xx response succeeds if failOnInvalidStatus is not set", async () => { 59 | let passed = true; 60 | try { 61 | await exec( 62 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://www.iana.org/ --url https://old.webrecorder.net/doesntexist --generateWACZ --limit 2 --failOnFailedSeed --collection failseedwithoutinvalidstatus", 63 | ); 64 | } catch (error) { 65 | console.log(error); 66 | passed = false; 67 | } 68 | expect(passed).toBe(true); 69 | }); 70 | 71 | test("ensure crawl fails if no valid seeds are passed", async () => { 72 | let passed = true; 73 | try { 74 | await exec( 75 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url iana.org/ --url example.invalid --generateWACZ --limit 2 --collection allinvalidseeds", 76 | ); 77 | } catch (error) { 78 | expect(error.code).toEqual(17); 79 | passed = false; 80 | } 81 | expect(passed).toBe(false); 82 | }); 83 | -------------------------------------------------------------------------------- /tests/sitemap-parse.test.js: 
-------------------------------------------------------------------------------- 1 | import child_process from "child_process"; 2 | import Redis from "ioredis"; 3 | 4 | function sleep(ms) { 5 | return new Promise((resolve) => setTimeout(resolve, ms)); 6 | } 7 | 8 | async function waitContainer(containerId) { 9 | try { 10 | child_process.execSync(`docker kill -s SIGINT ${containerId}`); 11 | } catch (e) { 12 | return; 13 | } 14 | 15 | // containerId is initially the full id, but docker ps 16 | // only prints the short id (first 12 characters) 17 | containerId = containerId.slice(0, 12); 18 | 19 | while (true) { 20 | try { 21 | const res = child_process.execSync("docker ps -q", { encoding: "utf-8" }); 22 | if (res.indexOf(containerId) < 0) { 23 | return; 24 | } 25 | } catch (e) { 26 | console.error(e); 27 | } 28 | await sleep(500); 29 | } 30 | } 31 | 32 | async function runCrawl(numExpected, url, sitemap="", limit=0, numExpectedLessThan=0, extra="") { 33 | const command = `docker run -d -p 36381:6379 -e CRAWL_ID=test webrecorder/browsertrix-crawler crawl --url ${url} --sitemap ${sitemap} --limit ${limit} --context sitemap --logging debug --debugAccessRedis ${extra}`; 34 | const containerId = child_process.execSync(command, {encoding: "utf-8"}); 35 | 36 | await sleep(3000); 37 | 38 | const redis = new Redis("redis://127.0.0.1:36381/0", { lazyConnect: true, retryStrategy: () => null }); 39 | 40 | let finished = 0; 41 | 42 | try { 43 | await redis.connect({ 44 | maxRetriesPerRequest: 100, 45 | }); 46 | 47 | while (true) { 48 | finished = await redis.zcard("test:q"); 49 | 50 | if (await redis.get("test:sitemapDone")) { 51 | break; 52 | } 53 | if (finished >= numExpected) { 54 | break; 55 | } 56 | } 57 | } catch (e) { 58 | console.error(e); 59 | } finally { 60 | await waitContainer(containerId); 61 | } 62 | 63 | expect(finished).toBeGreaterThanOrEqual(numExpected); 64 | 65 | if (numExpectedLessThan) { 66 | expect(finished).toBeLessThanOrEqual(numExpectedLessThan); 67 | } 68 | } 69 | 70 | test("test sitemap fully finish", async () => { 71 | await runCrawl(3500, "https://developer.mozilla.org/", "", 0); 72 | }); 73 | 74 | test("test sitemap with limit", async () => { 75 | await runCrawl(1900, "https://developer.mozilla.org/", "", 2000); 76 | }); 77 | 78 | test("test sitemap with limit, specific URL", async () => { 79 | await runCrawl(1900, "https://developer.mozilla.org/", "https://developer.mozilla.org/sitemap.xml", 2000); 80 | }); 81 | 82 | test("test sitemap with application/xml content-type", async () => { 83 | await runCrawl(10, "https://bitarchivist.net/", "", 0); 84 | }); 85 | 86 | test("test sitemap with narrow scope, extraHops, to ensure out-of-scope sitemap URLs do not count as extraHops", async () => { 87 | await runCrawl(0, "https://www.mozilla.org/", "", 2000, 100, "--extraHops 1 --scopeType page"); 88 | }); 89 | -------------------------------------------------------------------------------- /tests/storage.test.js: -------------------------------------------------------------------------------- 1 | import { 2 | calculatePercentageUsed, 3 | checkDiskUtilization, 4 | } from "../dist/util/storage.js"; 5 | 6 | test("ensure calculatePercentageUsed returns expected values", () => { 7 | expect(calculatePercentageUsed(30, 100)).toEqual(30); 8 | 9 | expect(calculatePercentageUsed(1507, 35750)).toEqual(4); 10 | 11 | expect(calculatePercentageUsed(33819, 35750)).toEqual(95); 12 | 13 | expect(calculatePercentageUsed(140, 70)).toEqual(200); 14 | 15 | expect(calculatePercentageUsed(0, 
5)).toEqual(0); 16 | }); 17 | 18 | test("verify end-to-end disk utilization not exceeded threshold", async () => { 19 | const params = { 20 | diskUtilization: 90, 21 | combineWARC: true, 22 | generateWACZ: true, 23 | }; 24 | 25 | const mockDfOutput = `\ 26 | Filesystem 1K-blocks Used Available Use% Mounted on 27 | grpcfuse 1000000 285000 715000 28% /crawls`; 28 | 29 | // with combineWARC + generateWACZ, projected is 285k + 4 * 5k = 310k = 31% 30 | // does not exceed 90% threshold 31 | const returnValue = await checkDiskUtilization( 32 | '/crawls', 33 | params, 34 | 5000 * 1024, 35 | mockDfOutput, 36 | false 37 | ); 38 | expect(returnValue).toEqual({ 39 | stop: false, 40 | used: 28, 41 | projected: 31, 42 | threshold: 90, 43 | }); 44 | }); 45 | 46 | test("verify end-to-end disk utilization exceeds threshold", async () => { 47 | const params = { 48 | diskUtilization: 90, 49 | combineWARC: false, 50 | generateWACZ: true, 51 | }; 52 | 53 | const mockDfOutput = `\ 54 | Filesystem 1K-blocks Used Available Use% Mounted on 55 | grpcfuse 100000 85000 15000 85% /crawls`; 56 | 57 | // with generateWACZ, projected is 85k + 3k x 2 = 91k = 91% 58 | // exceeds 90% threshold 59 | const returnValue = await checkDiskUtilization( 60 | '/crawls', 61 | params, 62 | 3000 * 1024, 63 | mockDfOutput, 64 | false 65 | ); 66 | expect(returnValue).toEqual({ 67 | stop: true, 68 | used: 85, 69 | projected: 91, 70 | threshold: 90, 71 | }); 72 | }); 73 | -------------------------------------------------------------------------------- /tests/text-extract.test.js: -------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import child_process from "child_process"; 3 | 4 | test("check that urn:text and urn:textfinal records are written to WARC", async () => { 5 | try { 6 | child_process.execSync( 7 | "docker run -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --collection text-extract --url https://www.nytimes.com/ --scopeType page --generateCDX --text to-warc,final-to-warc", 8 | ); 9 | } catch (error) { 10 | //console.log(new TextDecoder().decode(error)); 11 | console.log(error.stderr); 12 | } 13 | 14 | const data = fs.readFileSync( 15 | "test-crawls/collections/text-extract/indexes/index.cdxj", 16 | { encoding: "utf-8" }, 17 | ); 18 | 19 | expect(data.indexOf("urn:text:https://www.nytimes.com/") > 0).toBe(true); 20 | 21 | expect(data.indexOf("urn:textFinal:https://www.nytimes.com/") > 0).toBe(true); 22 | }); 23 | -------------------------------------------------------------------------------- /tests/upload-wacz.test.js: -------------------------------------------------------------------------------- 1 | import { execSync, exec } from "child_process"; 2 | import fs from "fs"; 3 | import { Redis } from "ioredis"; 4 | 5 | 6 | const sleep = (ms) => new Promise((res) => setTimeout(res, ms)); 7 | 8 | let minioId; 9 | 10 | beforeAll(() => { 11 | execSync("docker network create upload-test-net"); 12 | minioId = execSync("docker run --rm -d -p 9000:9000 -p 9001:9001 --name minio --network=upload-test-net minio/minio server /data --console-address ':9001'", {encoding: "utf-8"}); 13 | }); 14 | 15 | 16 | afterAll(async () => { 17 | execSync(`docker kill -s SIGINT ${minioId}`); 18 | await sleep(5000); 19 | execSync("docker network rm upload-test-net"); 20 | }); 21 | 22 | test("run crawl with upload", async () => { 23 | 24 | execSync(`docker exec ${minioId.trim()} mc mb /data/test-bucket`); 25 | 26 | const child = exec( 27 | "docker run --rm " + 28 | "-e 
STORE_ENDPOINT_URL=http://minio:9000/test-bucket/ " + 29 | "-e STORE_ACCESS_KEY=minioadmin " + 30 | "-e STORE_SECRET_KEY=minioadmin " + 31 | "-e STORE_PATH=prefix/ " + 32 | "--network=upload-test-net " + 33 | "-p 36390:6379 -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --url https://old.webrecorder.net/ --limit 2 --collection upload-test --crawlId upload-test --writePagesToRedis --debugAccessRedis --generateWACZ", 34 | ); 35 | 36 | let resolve = null; 37 | const crawlFinished = new Promise(r => resolve = r); 38 | 39 | // detect crawler exit 40 | let crawler_exited = false; 41 | child.on("exit", function () { 42 | crawler_exited = true; 43 | resolve(); 44 | }); 45 | 46 | const redis = new Redis("redis://127.0.0.1:36390/0", { lazyConnect: true, retryStrategy: () => null }); 47 | 48 | await sleep(3000); 49 | 50 | await redis.connect({ maxRetriesPerRequest: 50 }); 51 | 52 | let filename; 53 | 54 | while (!crawler_exited) { 55 | const res = await redis.lpop("upload-test:pages"); 56 | if (!res) { 57 | await sleep(100); 58 | continue; 59 | } 60 | const json = JSON.parse(res); 61 | expect(json).toHaveProperty("id"); 62 | expect(json).toHaveProperty("url"); 63 | expect(json).toHaveProperty("ts"); 64 | expect(json).toHaveProperty("title"); 65 | expect(json).toHaveProperty("loadState"); 66 | expect(json).toHaveProperty("filename"); 67 | expect(json).toHaveProperty("depth"); 68 | expect(json).toHaveProperty("seed"); 69 | expect(json).toHaveProperty("favIconUrl"); 70 | filename = json.filename; 71 | break; 72 | } 73 | 74 | // ensure bucket is public 75 | execSync(`docker exec ${minioId.trim()} mc config host add local http://127.0.0.1:9000 minioadmin minioadmin`); 76 | execSync(`docker exec ${minioId.trim()} mc anonymous set download local/test-bucket`); 77 | 78 | // wait for crawler to finish 79 | await crawlFinished; 80 | 81 | // ensure WACZ exists at the specified filename 82 | const resp = await fetch(`http://127.0.0.1:9000/test-bucket/prefix/${filename}`); 83 | expect(resp.status).toBe(200); 84 | }); 85 | -------------------------------------------------------------------------------- /tests/url_file_list.test.js: -------------------------------------------------------------------------------- 1 | import util from "util"; 2 | import { exec as execCallback } from "child_process"; 3 | import fs from "fs"; 4 | 5 | const exec = util.promisify(execCallback); 6 | 7 | test("check that URLs in seed-list are crawled", async () => { 8 | try { 9 | await exec( 10 | "docker run -v $PWD/test-crawls:/crawls -v $PWD/tests/fixtures:/tests/fixtures webrecorder/browsertrix-crawler crawl --collection filelisttest --urlFile /tests/fixtures/urlSeedFile.txt --timeout 90000", 11 | ); 12 | } catch (error) { 13 | console.log(error); 14 | } 15 | 16 | let crawled_pages = fs.readFileSync( 17 | "test-crawls/collections/filelisttest/pages/pages.jsonl", 18 | "utf8", 19 | ); 20 | let seed_file = fs 21 | .readFileSync("tests/fixtures/urlSeedFile.txt", "utf8") 22 | .split("\n") 23 | .sort(); 24 | 25 | let seed_file_list = []; 26 | for (var j = 0; j < seed_file.length; j++) { 27 | if (seed_file[j] != undefined) { 28 | seed_file_list.push(seed_file[j]); 29 | } 30 | } 31 | 32 | let foundSeedUrl = true; 33 | 34 | for (var i = 1; i < seed_file_list.length; i++) { 35 | if (crawled_pages.indexOf(seed_file_list[i]) == -1) { 36 | foundSeedUrl = false; 37 | } 38 | } 39 | expect(foundSeedUrl).toBe(true); 40 | }); 41 | -------------------------------------------------------------------------------- /tests/warcinfo.test.js: 
-------------------------------------------------------------------------------- 1 | import fs from "fs"; 2 | import zlib from "zlib"; 3 | import path from "path"; 4 | import child_process from "child_process"; 5 | 6 | test("run crawl", async() => { 7 | let success = false; 8 | 9 | try { 10 | const configYaml = fs.readFileSync("tests/fixtures/crawl-2.yaml", "utf8"); 11 | const proc = child_process.execSync( 12 | "docker run -i -v $PWD/test-crawls:/crawls webrecorder/browsertrix-crawler crawl --config stdin --limit 1 --collection warcinfo --combineWARC", 13 | { input: configYaml, stdin: "inherit", encoding: "utf8" }, 14 | ); 15 | 16 | //console.log(proc); 17 | success = true; 18 | } catch (error) { 19 | console.log(error); 20 | } 21 | 22 | expect(success).toBe(true); 23 | }); 24 | 25 | test("check that the warcinfo for individual WARC is as expected", async () => { 26 | 27 | const warcs = fs.readdirSync("test-crawls/collections/warcinfo/archive/"); 28 | 29 | let filename = ""; 30 | 31 | for (const name of warcs) { 32 | if (name.startsWith("rec-")) { 33 | filename = path.join("test-crawls/collections/warcinfo/archive/", name); 34 | break; 35 | } 36 | } 37 | 38 | const warcData = fs.readFileSync(filename); 39 | 40 | const data = zlib.gunzipSync(warcData); 41 | 42 | const string = data.toString("utf8"); 43 | 44 | expect(string.indexOf("operator: test")).toBeGreaterThan(-1); 45 | expect(string.indexOf("host: hostname")).toBeGreaterThan(-1); 46 | expect( 47 | string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/), 48 | ).not.toEqual(null); 49 | expect(string.indexOf("format: WARC File Format 1.1")).toBeGreaterThan(-1); 50 | }); 51 | 52 | test("check that the warcinfo for combined WARC file is as expected", async () => { 53 | const warcData = fs.readFileSync( 54 | "test-crawls/collections/warcinfo/warcinfo_0.warc.gz", 55 | ); 56 | 57 | const data = zlib.gunzipSync(warcData); 58 | 59 | const string = data.toString("utf8"); 60 | 61 | expect(string.indexOf("operator: test")).toBeGreaterThan(-1); 62 | expect(string.indexOf("host: hostname")).toBeGreaterThan(-1); 63 | expect( 64 | string.match(/Browsertrix-Crawler \d[\w.-]+ \(with warcio.js \d[\w.-]+\)/), 65 | ).not.toEqual(null); 66 | expect(string.indexOf("format: WARC File Format 1.1")).toBeGreaterThan(-1); 67 | }); 68 | -------------------------------------------------------------------------------- /tsconfig.eslint.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "noEmit": true 4 | }, 5 | "extends": "./tsconfig.json", 6 | "include": ["**/*.ts", "**/*.js", ".*.js"], 7 | "exclude": ["dist", "configs", "crawls"] 8 | } 9 | --------------------------------------------------------------------------------
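
A closing note on the warcinfo checks above: the same warcio WARCParser API already used in pageinfo-records.test.js can read the gzipped WARC directly, as an alternative to the gunzip-plus-indexOf string matching. A hedged sketch, where the helper name and the key/value parsing of the warcinfo payload are assumptions for illustration:

import fs from "fs";
import { WARCParser } from "warcio";

// Sketch: collect the fields of the first warcinfo record in a (gzipped)
// WARC file as a plain object. The warcinfo payload is a block of
// "key: value" lines, e.g. "operator: test" or "format: WARC File Format 1.1".
async function readWarcinfoFields(warcPath) {
  const parser = new WARCParser(fs.createReadStream(warcPath));
  for await (const record of parser) {
    if (record.warcType === "warcinfo") {
      const text = await record.contentText();
      return Object.fromEntries(
        text
          .trim()
          .split("\n")
          .map((line) => line.split(": "))
          .map(([key, ...rest]) => [key, rest.join(": ")]),
      );
    }
  }
  return null;
}

// illustrative usage against the combined WARC checked above:
// const fields = await readWarcinfoFields("test-crawls/collections/warcinfo/warcinfo_0.warc.gz");
// expect(fields.operator).toBe("test");
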