├── .dockerignore ├── .env.example ├── .eslintignore ├── .eslintrc.js ├── .github └── workflows │ ├── main.yml │ ├── release.yml │ ├── release_docker.yml │ └── renovate.yml ├── .gitignore ├── .npmrc ├── .nvmrc ├── .prettierrc.js ├── .yarn └── releases │ └── yarn-4.0.2.cjs ├── .yarnrc.yml ├── CHANGELOG.md ├── CONTRIBUTING.md ├── Dockerfile ├── README.md ├── jest.config.js ├── jest.setup.ts ├── nodemon.json ├── package.json ├── public ├── index.html ├── static │ └── main.webmanifest ├── test-website │ ├── async.html │ ├── basic.html │ ├── blocked-requests.html │ ├── iframe.html │ ├── js-redirect-hash.html │ ├── js-redirect-history.html │ ├── js-redirect-path.html │ ├── js-redirect.html │ ├── login-double-password.html │ ├── login-multiple-input.html │ ├── meta-refresh-5.html │ ├── meta-refresh.html │ ├── page-crash.html │ └── slow.html └── views │ ├── login-2steps-js.ejs │ ├── login-step1.ejs │ ├── login-step2.ejs │ └── login.ejs ├── release.config.js ├── renovate.json ├── scripts ├── build.sh ├── start.sh ├── test_image.sh └── update_adblock_hosts.sh ├── src ├── __tests__ │ ├── __snapshots__ │ │ ├── async.test.ts.snap │ │ └── login.test.ts.snap │ ├── api.test.ts │ ├── async.test.ts │ ├── blockedRequests.test.ts │ ├── errors.test.ts │ ├── helpers.ts │ ├── index.test.ts │ ├── list.test.ts │ ├── login.real.test.ts │ ├── login.test.ts │ ├── redirect.test.ts │ └── tasksManager.test.ts ├── api │ ├── @types │ │ ├── getHealthy.ts │ │ ├── getList.ts │ │ ├── getRoot.ts │ │ ├── postLogin.ts │ │ ├── postRender.ts │ │ └── responses.ts │ ├── constants.ts │ ├── helpers │ │ ├── alt.ts │ │ ├── buildUrl.ts │ │ ├── errors.ts │ │ ├── getForwardedHeaders.ts │ │ ├── logger.ts │ │ └── requestLogger.ts │ ├── index.ts │ └── routes │ │ ├── healthy.ts │ │ ├── list.ts │ │ ├── login.ts │ │ ├── privates │ │ └── login.ts │ │ ├── ready.ts │ │ ├── render.ts │ │ └── root.ts ├── global.d.ts ├── helpers │ ├── errorReporting.ts │ ├── gracefulClose.ts │ ├── logger.ts │ ├── projectRoot.ts │ ├── 
promiseWithTimeout.ts │ ├── stats.ts │ ├── wait.ts │ └── waitForPendingRequests.ts ├── index.ts └── lib │ ├── TasksManager.ts │ ├── browser │ ├── Adblocker.ts │ ├── Browser.ts │ ├── Page.ts │ ├── TimeBudget.test.ts │ ├── TimeBudget.ts │ └── constants.ts │ ├── constants.ts │ ├── helpers │ ├── errors.ts │ ├── getInput.ts │ ├── injectBaseHref.ts │ └── validateURL.ts │ ├── singletons.ts │ ├── tasks │ ├── Login.ts │ ├── Render.ts │ └── Task.ts │ └── types.ts ├── tsconfig.json └── yarn.lock /.dockerignore: -------------------------------------------------------------------------------- 1 | # Only ignore files / folders here that are also in .gitignore 2 | # 3 | # This file uses the same format as .gitignore, 4 | # except that it's not recursive by default 5 | # 6 | # | .gitignore | .dockerignore | 7 | # |------------|---------------| 8 | # | .DS_Store | **/.DS_Store | 9 | # | /.env | .env | 10 | # 11 | # Only bother including: 12 | # * big files / folder to lower the build context 13 | # * often updated files to help the docker cache 14 | 15 | # Dependencies 16 | **/node_modules 17 | dist/ 18 | .env 19 | 20 | # Logs 21 | **/*.log 22 | 23 | .git/ 24 | .github/ 25 | .githooks/ 26 | .circleci/ 27 | .nodemon.json 28 | .editorconfig 29 | .gitattributes 30 | .prettierignore 31 | .prettierrc.js 32 | .eslintrc.js 33 | .nvmrc 34 | .npmrc 35 | .eslintignore 36 | .eslinrcjs 37 | .vscode 38 | .env.example 39 | .yarn/cache/ 40 | release.config.js 41 | nodemon.json 42 | cypress.json 43 | README.md 44 | CHANGELOG.md 45 | CONTRIBUTING.md 46 | **/*.test.ts 47 | renovate.json 48 | **/jest* 49 | **/.DS_Store 50 | **/.storybook/ 51 | **/__fixtures__/ 52 | **/__snapshots__/ 53 | **/__mocks__/ 54 | **/__mock__/ 55 | **/__tests__/ 56 | **/tsconfig.tsbuildinfo 57 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # Allow calls on localhost IPs 2 |
ALLOW_LOCALHOST="true" 3 | 4 | # Change the minimum level of the log that will be output 5 | LOG_LEVEL="info" 6 | 7 | # Comma-separated list of prefixes to whitelist when `ALLOW_LOCALHOST` is set to true. 8 | # Example: `IP_PREFIXES_WHITELIST=127.,0.,::1` (these are the default values used when the variable is not provided alongside `ALLOW_LOCALHOST`) 9 | IP_PREFIXES_WHITELIST= 10 | 11 | # Comma-separated list of headers to forward on navigation request 12 | # Example: `HEADERS_TO_FORWARD=Cookie,Authorization` (default value) 13 | HEADERS_TO_FORWARD= 14 | 15 | # Report errors to this Sentry URL. 16 | SENTRY_DSN= 17 | 18 | # Login credentials for testing 19 | # example: LOGIN_CREDENTIALS={"login.live.com":{"username":"FOOBAR@outlook.com","password":"FOOBAR"}} 20 | LOGIN_CREDENTIALS= 21 | -------------------------------------------------------------------------------- /.eslintignore: -------------------------------------------------------------------------------- 1 | dist/ 2 | pw-browsers/ 3 | dist/ 4 | coverage/ 5 | node_modules/ 6 | 7 | .yarnrc.yml 8 | .yarn/ 9 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | // eslint-disable-next-line import/no-commonjs 2 | module.exports = { 3 | env: { 4 | browser: true, // For frontend only 5 | es2020: true, 6 | jest: true, 7 | }, 8 | extends: [ 9 | 'algolia', 10 | 'algolia/jest', 11 | 'algolia/typescript', 12 | 'plugin:import/typescript', 13 | ], 14 | parser: '@typescript-eslint/parser', 15 | parserOptions: { 16 | ecmaVersion: 11, 17 | sourceType: 'module', 18 | }, 19 | settings: { 20 | 'import/resolver': { 21 | typescript: {}, 22 | }, 23 | }, 24 | 25 | plugins: ['prettier', '@typescript-eslint', 'import', 'algolia'], 26 | rules: { 27 | 'algolia/func-style-toplevel': 'error', 28 | 29 | 'no-console': 'off', 30 | 'no-continue': 'off', 31 | 'no-loop-func': 'off', 32 | 'consistent-return': 'off', 33 
| 34 | '@typescript-eslint/explicit-member-accessibility': [ 35 | 'error', 36 | { accessibility: 'no-public' }, 37 | ], 38 | 'eslint-comments/disable-enable-pair': ['error', { allowWholeFile: true }], 39 | 40 | 'no-param-reassign': [ 41 | 'error', 42 | { props: true, ignorePropertyModificationsFor: ['res', 'req'] }, // http://expressjs.com/en/api.html#res.locals 43 | ], 44 | 45 | // TMP 46 | 'jsdoc/check-examples': ['off'], 47 | '@typescript-eslint/prefer-optional-chain': ['off'], // to re-enable when this is fixed: https://github.com/typescript-eslint/typescript-eslint/issues/6024 48 | }, 49 | }; 50 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: Renderscript 2 | 3 | on: 4 | push: 5 | branches: 6 | - 'master' 7 | - 'renovate/**' 8 | pull_request: 9 | 10 | env: 11 | COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} 12 | 13 | jobs: 14 | lint: 15 | runs-on: ubuntu-latest 16 | name: Lint 17 | steps: 18 | - uses: actions/checkout@v4 19 | 20 | - name: Install Node 21 | uses: actions/setup-node@v3 22 | with: 23 | node-version-file: .nvmrc 24 | cache: yarn 25 | 26 | - run: yarn install --frozen-lockfile 27 | 28 | - name: Run Linter 29 | run: yarn lint 30 | 31 | tests: 32 | runs-on: ubuntu-latest 33 | name: Tests 34 | needs: lint 35 | steps: 36 | - uses: actions/checkout@v4 37 | 38 | - name: Install Node 39 | uses: actions/setup-node@v3 40 | with: 41 | node-version-file: .nvmrc 42 | cache: yarn 43 | 44 | - run: yarn install --frozen-lockfile 45 | 46 | - name: Install Playwright browsers 47 | run: yarn playwright install 48 | 49 | - name: Build 50 | run: yarn build 51 | 52 | - name: Background process 53 | run: | 54 | yarn ci:start & 55 | 56 | - name: Run test 57 | run: yarn test 58 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: 
-------------------------------------------------------------------------------- 1 | name: Release Version 2 | on: 3 | workflow_dispatch: 4 | inputs: 5 | dry_run: 6 | required: true 7 | type: boolean 8 | default: true 9 | description: 'DryRun?' 10 | 11 | env: 12 | COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} 13 | 14 | jobs: 15 | release: 16 | runs-on: ubuntu-latest 17 | name: Release 18 | env: 19 | GH_TOKEN: ${{ secrets.GH_TOKEN }} 20 | GIT_AUTHOR_NAME: ${{ secrets.GH_USER_NAME }} 21 | GIT_AUTHOR_EMAIL: ${{ secrets.GH_USER_EMAIL }} 22 | GIT_COMMITTER_NAME: ${{ secrets.GH_USER_NAME }} 23 | GIT_COMMITTER_EMAIL: ${{ secrets.GH_USER_EMAIL }} 24 | 25 | steps: 26 | - uses: actions/checkout@v4 27 | with: 28 | # Make sure the release step uses its own credentials. 29 | persist-credentials: false 30 | 31 | - name: Install Node 32 | uses: actions/setup-node@v3 33 | with: 34 | node-version-file: .nvmrc 35 | cache: yarn 36 | 37 | - name: Release (--dry-run) 38 | if: (github.event_name == 'workflow_dispatch' && github.event.inputs.dry_run == 'true') 39 | run: | 40 | yarn install 41 | yarn semantic-release --dry-run 42 | 43 | - name: Release 44 | if: (github.event_name == 'workflow_dispatch' && github.event.inputs.dry_run != 'true') 45 | run: | 46 | yarn install 47 | yarn semantic-release 48 | -------------------------------------------------------------------------------- /.github/workflows/release_docker.yml: -------------------------------------------------------------------------------- 1 | name: Release Docker 2 | on: 3 | release: 4 | types: [published] 5 | 6 | workflow_dispatch: 7 | inputs: 8 | dry_run: 9 | required: true 10 | type: boolean 11 | default: true 12 | description: 'DryRun?' 
13 | 14 | env: 15 | COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} 16 | 17 | jobs: 18 | build-docker: 19 | runs-on: ubuntu-latest 20 | name: Build Dockers 21 | 22 | steps: 23 | - uses: actions/checkout@v4 24 | 25 | - name: Install Node 26 | uses: actions/setup-node@v3 27 | with: 28 | node-version-file: .nvmrc 29 | 30 | - name: Setting env var 31 | id: env_var 32 | shell: bash 33 | run: | 34 | echo "RENDERSCRIPT_VERSION=$(node -e 'console.log(require("./package.json").version)')" >> $GITHUB_OUTPUT 35 | echo "PLAYWRIGHT_VERSION=$(node -e 'console.log(require("./package.json").dependencies.playwright)')" >> $GITHUB_OUTPUT 36 | 37 | - name: Set up Docker Buildx 38 | uses: docker/setup-buildx-action@v3 39 | 40 | - uses: docker/login-action@v2 41 | with: 42 | username: ${{ secrets.DOCKERHUB_USERNAME }} 43 | password: ${{ secrets.DOCKERHUB_TOKEN }} 44 | 45 | - name: Set up Docker QEMU for arm64 docker builds 46 | uses: docker/setup-qemu-action@v3 47 | with: 48 | platforms: arm64 49 | 50 | - name: Build Image 51 | uses: docker/build-push-action@v4.2.1 52 | with: 53 | file: Dockerfile 54 | context: . 55 | platforms: linux/amd64 # buildx does not support multi-arch load 56 | push: false 57 | load: true 58 | tags: | 59 | algolia/renderscript:latest 60 | algolia/renderscript:${{ steps.env_var.outputs.RENDERSCRIPT_VERSION }} 61 | algolia/renderscript:${{ env.COMMIT_SHA }} 62 | cache-from: type=gha 63 | cache-to: type=gha,mode=max 64 | build-args: | 65 | VERSION=${{ steps.env_var.outputs.RENDERSCRIPT_VERSION }} 66 | PLAYWRIGHT_VERSION=${{ steps.env_var.outputs.PLAYWRIGHT_VERSION }} 67 | 68 | - name: Test Image 69 | run: ./scripts/test_image.sh ${{ env.COMMIT_SHA }} 70 | 71 | # Cache should be reused from prev execution 72 | - name: Push 73 | if: (github.event_name == 'release') || (github.event_name == 'workflow_dispatch' && github.event.inputs.dry_run != 'true') 74 | uses: docker/build-push-action@v4 75 | with: 76 | file: Dockerfile 77 | context: . 
78 | platforms: linux/amd64,linux/arm64/v8 79 | push: true 80 | tags: | 81 | algolia/renderscript:latest 82 | algolia/renderscript:${{ steps.env_var.outputs.RENDERSCRIPT_VERSION }} 83 | cache-from: type=gha 84 | cache-to: type=gha,mode=max 85 | build-args: | 86 | VERSION=${{ steps.env_var.outputs.RENDERSCRIPT_VERSION }} 87 | PLAYWRIGHT_VERSION=${{ steps.env_var.outputs.PLAYWRIGHT_VERSION }} 88 | -------------------------------------------------------------------------------- /.github/workflows/renovate.yml: -------------------------------------------------------------------------------- 1 | name: Renovate 2 | on: 3 | schedule: 4 | - cron: '0 14 * * 5' 5 | workflow_dispatch: 6 | 7 | jobs: 8 | renovate: 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - name: Renovate Automatic Branch 13 | uses: bodinsamuel/renovate-automatic-branch@v1 14 | with: 15 | github-token: ${{ secrets.GITHUB_TOKEN }} 16 | repo-owner: algolia 17 | repo-name: renderscript 18 | branch-base: master 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ 2 | yarn-error.log 3 | 4 | dist/ 5 | vendors/ 6 | 7 | .env 8 | 9 | # Editor files 10 | .exrc 11 | .idea 12 | 13 | # https://yarnpkg.com/getting-started/qa#which-files-should-be-gitignored 14 | .yarn/* 15 | !.yarn/releases 16 | !.yarn/plugins 17 | 18 | .idea 19 | .DS_Store 20 | .vscode 21 | .scannerwork 22 | *~ 23 | -------------------------------------------------------------------------------- /.npmrc: -------------------------------------------------------------------------------- 1 | update-notifier=false 2 | -------------------------------------------------------------------------------- /.nvmrc: -------------------------------------------------------------------------------- 1 | 18.18.2 2 | -------------------------------------------------------------------------------- /.prettierrc.js: 
-------------------------------------------------------------------------------- 1 | module.exports = { 2 | trailingComma: 'es5', 3 | tabWidth: 2, 4 | semi: true, 5 | singleQuote: true, 6 | printWidth: 80, 7 | } 8 | -------------------------------------------------------------------------------- /.yarnrc.yml: -------------------------------------------------------------------------------- 1 | compressionLevel: mixed 2 | 3 | enableGlobalCache: false 4 | 5 | enableTelemetry: false 6 | 7 | nodeLinker: node-modules 8 | 9 | yarnPath: .yarn/releases/yarn-4.0.2.cjs 10 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | ## Running it locally 4 | 5 | Development: 6 | 7 | ```sh 8 | yarn 9 | yarn dev 10 | ``` 11 | 12 | Docker image: 13 | 14 | ```sh 15 | yarn docker:build 16 | docker run -p 23000:3000 algolia/renderscript 17 | open "http://localhost:23000/render?url=https%3A%2F%2Fwww.algolia.com&ua=Test+Renderscript" 18 | ``` 19 | 20 | ### Env Variables 21 | 22 | See `.env.example` 23 | 24 | ## Releasing 25 | 26 | Releases are built using GitHub actions. You can release a new version by triggering the [Release Version](https://github.com/algolia/renderscript/actions/workflows/release.yml) workflow. 
27 | 28 | ### Manual Release Locally 29 | 30 | ```sh 31 | yarn docker:build 32 | 33 | docker push "algolia/renderscript" 34 | docker push "algolia/renderscript:${VERSION}" 35 | docker push "algolia/renderscript:${GIT_HASH}" 36 | ``` 37 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # ------------------ 2 | # Build playwright 3 | # ------------------ 4 | FROM ubuntu:jammy as base 5 | 6 | # For tzdata 7 | ARG DEBIAN_FRONTEND=noninteractive 8 | ARG TZ=America/Los_Angeles 9 | 10 | # === INSTALL Node.js === 11 | RUN apt-get update && \ 12 | # Install Node.js 18 (version set by the NodeSource setup_18.x script below) 13 | apt-get install -y curl wget && \ 14 | curl -sL https://deb.nodesource.com/setup_18.x | bash - && \ 15 | apt-get install -y nodejs && \ 16 | # Feature-parity with node.js base images. 17 | apt-get install -y --no-install-recommends git openssh-client && \ 18 | npm install -g yarn && \ 19 | # clean apt cache 20 | rm -rf /var/lib/apt/lists/* && \ 21 | # Create the pwuser 22 | adduser pwuser 23 | 24 | # === BAKE BROWSERS INTO IMAGE === 25 | ARG PLAYWRIGHT_VERSION 26 | ENV PLAYWRIGHT_VERSION ${PLAYWRIGHT_VERSION} 27 | ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright 28 | 29 | # Browsers will be downloaded in `/ms-playwright`.
30 | RUN mkdir /ms-playwright \ 31 | && PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=true npm install -g playwright@$PLAYWRIGHT_VERSION \ 32 | && npx playwright install --with-deps chromium \ 33 | && npx playwright install --with-deps firefox \ 34 | # Clean cache 35 | && rm -rf /var/lib/apt/lists/* \ 36 | && chmod -R 777 /ms-playwright 37 | 38 | 39 | # ------------------ 40 | # package.json cache 41 | # ------------------ 42 | FROM apteno/alpine-jq:2022-09-25 AS deps 43 | 44 | # To prevent cache invalidation from changes in fields other than dependencies 45 | COPY package.json /tmp 46 | RUN jq 'walk(if type == "object" then with_entries(select(.key | test("^jest|prettier|eslint|semantic|dotenv|nodemon") | not)) else . end) | { name, dependencies, devDependencies, packageManager }' < /tmp/package.json > /tmp/deps.json 47 | 48 | 49 | # ------------------ 50 | # New base image 51 | # ------------------ 52 | FROM base as tmp 53 | 54 | ENV IN_DOCKER true 55 | ENV PLAYWRIGHT_BROWSERS_PATH="/ms-playwright" 56 | ENV PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD="true" 57 | 58 | # Setup the app WORKDIR 59 | WORKDIR /app/tmp 60 | 61 | # Copy and install dependencies separately from the app's code 62 | # To leverage Docker's cache when no dependency has changed 63 | COPY --from=deps /tmp/deps.json ./package.json 64 | COPY yarn.lock .yarnrc.yml ./ 65 | COPY .yarn .yarn 66 | 67 | # Install dev dependencies 68 | RUN true \ 69 | && yarn install 70 | 71 | # This step will invalidate the cache 72 | COPY .
./ 73 | 74 | # Builds the UI, install chrome and remove dev dependencies 75 | RUN true \ 76 | && ls -lah /app/tmp \ 77 | && yarn build \ 78 | && yarn workspaces focus --all --production \ 79 | && rm -rf .yarn/ 80 | 81 | # ------------------ 82 | # New final image that only contains built code 83 | # ------------------ 84 | FROM base as final 85 | 86 | ARG VERSION 87 | ENV VERSION ${VERSION:-dev} 88 | 89 | # Autolink repository https://docs.github.com/en/packages/learn-github-packages/connecting-a-repository-to-a-package 90 | LABEL org.opencontainers.image.source=https://github.com/algolia/renderscript 91 | LABEL org.opencontainers.image.revision=$VERSION 92 | 93 | ENV NODE_ENV production 94 | ENV IN_DOCKER true 95 | ENV PLAYWRIGHT_BROWSERS_PATH="/ms-playwright" 96 | ENV PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD="true" 97 | 98 | # Do not use root to run the app 99 | USER pwuser 100 | 101 | # Copy install from previous stage 102 | WORKDIR /app/renderscript 103 | COPY --from=tmp --chown=pwuser:pwuser /app/tmp /app/renderscript 104 | 105 | CMD [ "node", "dist/index.js" ] 106 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Renderscript 2 | 3 | > An API to render a page inside a real Chromium (with JavaScript enabled) and send back the raw HTML. 4 | 5 | This project is directly written for and consumed by [Algolia Crawler](https://www.algolia.com/products/search-and-discovery/crawler/). 6 | 7 | 🔐 **Secure** 8 | Leverages `Context` to isolate each page, prevent cookie sharing, control redirection, etc... 9 | 10 | 🚀 **Performant**: 11 | Ignores unnecessary resources for rendering HTML (e.g. `images`, `video`, `font`, etc...) and bundles an AdBlocker by default. 12 | 13 | 🤖 **Automated**: 14 | Renderscript has everything abstracted to render a page and log in to a website with minimal configuration required.
15 | 16 | ## Usage 17 | 18 | ### Local 19 | 20 | ```sh 21 | yarn dev 22 | ``` 23 | 24 | **Goto**: <http://localhost:3000> 25 | 26 | ### Docker 27 | 28 | ```sh 29 | docker build . -t algolia/renderscript 30 | docker run -p 3000:3000 -it algolia/renderscript 31 | 32 | curl -X POST http://localhost:3000/render \ 33 | -H 'Content-Type: application/json' \ 34 | -d '{"url": "https://www.algolia.com/", "ua": "local_renderscript"}' 35 | ``` 36 | 37 | ## API 38 | 39 | - [`POST /render`](#post-render) 40 | - [`GET /render`](#get-render) 41 | - [`POST /login`](#post-login) 42 | - [`GET /list`](#get-list) 43 | - [`GET /healthy`, `GET /ready`](#get-healthy--get-ready) 44 | 45 | --- 46 | 47 | ### `POST /render` 48 | 49 | Main endpoint. Renders the page and dumps a JSON with all the page information. 50 | 51 | #### Body parameters: 52 | 53 | ```ts 54 | { 55 | /** 56 | * URL to render (for hash and query params support, use `encodeURIComponent` on it) 57 | */ 58 | url: string; 59 | 60 | /** 61 | * User-Agent to use. 62 | */ 63 | ua: string; 64 | 65 | /** 66 | * Enables AdBlocker 67 | */ 68 | adblock?: boolean; 69 | 70 | /** 71 | * Define the range of time. 72 | * Minimum and maximum execution time. 73 | */ 74 | waitTime?: { 75 | min?: number; 76 | max?: number; 77 | }; 78 | 79 | /** 80 | * Headers to Forward on navigation 81 | */ 82 | headersToForward?: { 83 | [s: string]: string; 84 | }; 85 | } 86 | ``` 87 | 88 | #### Response `application/json`: 89 | 90 | ```ts 91 | { 92 | /** 93 | * HTTP Code of the rendered page. 94 | */ 95 | statusCode: number | null; 96 | 97 | /** 98 | * HTTP Headers of the rendered page. 99 | */ 100 | headers: Record<string, string>; 101 | 102 | /** 103 | * Body of the rendered page. 104 | */ 105 | body: string | null; 106 | 107 | /** 108 | * Metrics from different tasks during the rendering. 109 | */ 110 | metrics: Metrics; 111 | 112 | /** 113 | * The redirection renderscript caught. 114 | */ 115 | resolvedUrl: string | null; 116 | 117 | /** 118 | * Has the page reached timeout?
119 | * When timeout has been reached we continue the rendering as usual 120 | * but reduce other timeout to a minimum. 121 | */ 122 | timeout: boolean; 123 | 124 | /** 125 | * Any error encountered along the way. 126 | * If this field is filled that means the rest of the payload is partial. 127 | */ 128 | error: string | null; 129 | } 130 | ``` 131 | 132 | --- 133 | 134 | ### `GET /render` 135 | 136 | Used for debug purposes. Dumps directly the HTML for easy inspection in your browser. 137 | 138 | #### Query parameters: 139 | 140 | > see `POST /render` parameters 141 | 142 | #### Response `text/html`. 143 | 144 | CSP headers are set to prevent script execution on the rendered page. 145 | 146 | --- 147 | 148 | ### `POST /login` 149 | 150 | This endpoint will load a given login page, look for `input` fields, enter the given credentials and validate the form. 151 | It allows retrieving programmatically a session-cookie from websites with [CSRF](https://en.wikipedia.org/wiki/Cross-site_request_forgery) protection. 152 | 153 | #### Body parameters 154 | 155 | ```ts 156 | { 157 | /** 158 | * URL to render (for hash and query params support, use `encodeURIComponent` on it) 159 | */ 160 | url: string; 161 | 162 | /** 163 | * User-Agent to use. 164 | */ 165 | ua: string; 166 | 167 | /** 168 | * Username to enter on the login form. Renderscript expects to find an `input[type=text]` or `input[type=email]` on the login page. 169 | */ 170 | username: string; 171 | 172 | /** 173 | * Password to enter on the login form. Renderscript expects to find an `input[type=password]` on the login page. 174 | */ 175 | password: string; 176 | 177 | /** 178 | * Define the range of time. 179 | * Minimum and maximum execution time. 180 | */ 181 | waitTime?: { 182 | min?: number; 183 | max?: number; 184 | }; 185 | 186 | /** 187 | * Boolean (optional). 188 | * If set to true, Renderscript will return the rendered HTML after the login request. Useful to debug visually. 
189 | */ 190 | renderHTML?: boolean; 191 | } 192 | ``` 193 | 194 | #### Response `application/json` 195 | 196 | ```ts 197 | { 198 | /** 199 | * HTTP Code of the rendered page. 200 | */ 201 | statusCode: number | null; 202 | 203 | /** 204 | * HTTP Headers of the rendered page. 205 | */ 206 | headers: Record<string, string>; 207 | 208 | /** 209 | * Metrics from different tasks during the rendering. 210 | */ 211 | metrics: Metrics; 212 | 213 | /** 214 | * Has the page reached timeout? 215 | * When timeout has been reached we continue the rendering as usual 216 | * but reduce other timeout to a minimum. 217 | */ 218 | timeout: boolean; 219 | 220 | /** 221 | * Any error encountered along the way. 222 | * If this field is filled that means the rest of the payload is partial. 223 | */ 224 | error: string | null; 225 | 226 | /** 227 | * Cookie generated from a successful login. 228 | */ 229 | cookies: Cookie[]; 230 | 231 | /** 232 | * The URL at the end of a successful login. 233 | */ 234 | resolvedUrl: string | null; 235 | 236 | /** 237 | * Body at the end of a successful login. 238 | */ 239 | body: string | null; 240 | } 241 | ``` 242 | 243 | #### Response `text/html` 244 | 245 | If `renderHTML: true`, returns `text/html`. 246 | CSP headers are set to prevent script execution on the rendered page. 247 | 248 | --- 249 | 250 | ### `GET /list` 251 | 252 | List currently open pages. 253 | Useful to debug. 254 | 255 | --- 256 | 257 | ### `GET /healthy`, `GET /ready` 258 | 259 | Health Check for Kubernetes and others. 260 | 261 | --- 262 | 263 | ## Credits 264 | 265 | This project was heavily inspired by [`GoogleChrome/rendertron`](https://github.com/GoogleChrome/rendertron). 266 | It was based on [`puppeteer-core`](https://github.com/GoogleChrome/puppeteer) but we switched to [Playwright](https://playwright.dev/).
267 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | // eslint-disable-next-line import/no-commonjs 2 | module.exports = { 3 | preset: 'ts-jest', 4 | testEnvironment: 'node', 5 | testPathIgnorePatterns: ['/node_modules/', '/dist/'], 6 | testMatch: ['/src/**/*.test.[jt]s'], 7 | globalSetup: '/jest.setup.ts', 8 | setupFiles: ['dotenv/config'], 9 | maxWorkers: 1, 10 | }; 11 | -------------------------------------------------------------------------------- /jest.setup.ts: -------------------------------------------------------------------------------- 1 | import { request } from 'undici'; 2 | 3 | import { wait } from './src/helpers/wait'; 4 | 5 | // eslint-disable-next-line @typescript-eslint/explicit-function-return-type 6 | export default async function setup() { 7 | const max = 50; 8 | let curr = 0; 9 | 10 | while (curr < max) { 11 | curr += 1; 12 | try { 13 | const { statusCode } = await request('http://localhost:3000/ready'); 14 | console.log('API statusCode:', statusCode, `(retries: ${curr})`); 15 | 16 | if (statusCode === 200) { 17 | console.log('API Ready'); 18 | return; 19 | } 20 | } catch (err: any) { 21 | console.log(err.message); 22 | } finally { 23 | await wait(1000); 24 | } 25 | } 26 | 27 | throw Error('API did not reach ready status'); 28 | } 29 | -------------------------------------------------------------------------------- /nodemon.json: -------------------------------------------------------------------------------- 1 | { 2 | "ignore": [ 3 | ".git", 4 | "node_modules", 5 | "dist", 6 | "__tests__/" 7 | ], 8 | "watch": [ 9 | "src" 10 | ], 11 | "exec": "yarn dev:run", 12 | "ext": "ts" 13 | } 14 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@algolia/renderscript", 3 | 
"version": "2.3.6", 4 | "description": "A custom JavaScript rendering engine based on Playwright", 5 | "main": "dist/index.js", 6 | "scripts": { 7 | "build": "yarn clean && yarn tsc && yarn browser:adblocks", 8 | "ci:start": "ALLOW_LOCALHOST=true yarn start", 9 | "clean": "rm -rf dist/", 10 | "dev": "nodemon", 11 | "dev:run": "yarn build && NODE_ENV=development node -r dotenv/config dist/index.js", 12 | "docker:build": "./scripts/build.sh", 13 | "browser:adblocks": "./scripts/update_adblock_hosts.sh", 14 | "lint": "eslint --ext=jsx,ts,tsx,js .", 15 | "start": "UV_THREADPOOL_SIZE=100 node dist/index.js", 16 | "semantic-release": "semantic-release", 17 | "test": "jest src/" 18 | }, 19 | "repository": { 20 | "type": "git", 21 | "url": "https://github.com/algolia/renderscript.git" 22 | }, 23 | "keywords": [ 24 | "algolia", 25 | "playwright", 26 | "js renderer", 27 | "rendertron", 28 | "prerender", 29 | "javascript rendering", 30 | "ssr" 31 | ], 32 | "author": "Algolia ", 33 | "license": "MIT", 34 | "bugs": { 35 | "url": "https://github.com/algolia/renderscript/issues" 36 | }, 37 | "engines": { 38 | "node": "18.18.2" 39 | }, 40 | "homepage": "https://github.com/algolia/renderscript#readme", 41 | "devDependencies": { 42 | "@semantic-release/changelog": "6.0.3", 43 | "@semantic-release/git": "10.0.1", 44 | "@types/cookie-parser": "1.4.6", 45 | "@types/csurf": "1.11.5", 46 | "@types/express": "4.17.21", 47 | "@types/jest": "29.5.8", 48 | "@types/node": "18.18.10", 49 | "@types/uuid": "9.0.7", 50 | "@typescript-eslint/eslint-plugin": "6.11.0", 51 | "@typescript-eslint/parser": "6.11.0", 52 | "dotenv": "16.3.1", 53 | "ejs": "3.1.9", 54 | "eslint": "8.54.0", 55 | "eslint-config-algolia": "22.0.0", 56 | "eslint-config-prettier": "9.0.0", 57 | "eslint-config-standard": "17.1.0", 58 | "eslint-import-resolver-typescript": "3.6.1", 59 | "eslint-plugin-algolia": "2.0.0", 60 | "eslint-plugin-eslint-comments": "3.2.0", 61 | "eslint-plugin-import": "2.29.0", 62 | "eslint-plugin-jest": 
"27.6.0", 63 | "eslint-plugin-jsdoc": "46.9.0", 64 | "eslint-plugin-node": "11.1.0", 65 | "eslint-plugin-prettier": "5.0.1", 66 | "eslint-plugin-promise": "6.1.1", 67 | "jest": "29.7.0", 68 | "nodemon": "3.0.1", 69 | "pino-pretty": "10.2.3", 70 | "prettier": "3.1.0", 71 | "semantic-release": "22.0.8", 72 | "ts-jest": "29.1.1", 73 | "ts-node": "10.9.2", 74 | "typescript": "5.2.2" 75 | }, 76 | "dependencies": { 77 | "@algolia/dns-filter": "1.1.25", 78 | "@sentry/node": "7.80.1", 79 | "altheia-async-data-validator": "5.0.15", 80 | "body-parser": "1.20.2", 81 | "cookie-parser": "1.4.6", 82 | "csurf": "1.11.0", 83 | "express": "4.19.2", 84 | "hot-shots": "10.0.0", 85 | "pino": "8.16.2", 86 | "playwright": "1.49.0", 87 | "undici": "5.28.4", 88 | "uuid": "9.0.1" 89 | }, 90 | "resolutions": { 91 | "chalk": "4.1.2", 92 | "@semantic-release/npm": "10.0.6" 93 | }, 94 | "packageManager": "yarn@4.0.2" 95 | } 96 | -------------------------------------------------------------------------------- /public/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Renderscript 6 | 8 | 9 | 19 | 20 | 21 | 22 |
23 |
24 | 25 | 26 | 29 | 32 | 33 | 34 |
35 |

renderscript

36 |
37 | 38 |
39 |
40 |

Render

41 |
42 |
43 |
44 | 45 |
46 | 47 |
48 |
49 | 50 |
51 | 52 |
53 | 54 |
55 |
56 |
57 | 58 |
59 |
60 | 61 | 62 |
63 |
64 | 65 | 69 |
70 |
71 |
72 |
73 | 74 |
75 | 76 |
77 |
78 |
79 | 85 | 86 |
87 | 89 | 92 |
93 |
94 |
95 |
96 |

Login

97 |
98 |
99 |
100 |
101 | 102 |
103 | 105 |
106 |
107 |
108 | 109 |
110 | 112 |
113 |
114 |
115 | 116 |
117 |
118 | 119 |
121 |
122 |
123 |
124 | 125 |
127 |
128 |
129 |
130 |
131 | 132 | 133 |
134 | 135 |
136 | 138 |
139 |
140 |
141 |
142 | 143 | 144 | 145 | -------------------------------------------------------------------------------- /public/static/main.webmanifest: -------------------------------------------------------------------------------- 1 | { 2 | "name": "", 3 | "short_name": "", 4 | "icons": [ 5 | { 6 | "src": "/android-chrome-192x192.png", 7 | "sizes": "192x192", 8 | "type": "image/png" 9 | } 10 | ], 11 | "theme_color": "#ffffff", 12 | "background_color": "#ffffff", 13 | "display": "standalone" 14 | } 15 | -------------------------------------------------------------------------------- /public/test-website/async.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 | 12 | 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /public/test-website/basic.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | A basic page 6 | 7 | 8 | -------------------------------------------------------------------------------- /public/test-website/blocked-requests.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 44 | 45 | 46 | 47 | A basic page 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | Hello 58 | 59 |
Hello
60 |
Foo
61 |
Img Bg
62 | 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /public/test-website/iframe.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | A basic page 8 |
9 |
10 | 11 | 12 | 13 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /public/test-website/js-redirect-hash.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /public/test-website/js-redirect-history.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /public/test-website/js-redirect-path.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | -------------------------------------------------------------------------------- /public/test-website/js-redirect.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 18 | 19 | 20 | 21 | -------------------------------------------------------------------------------- /public/test-website/login-double-password.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /public/test-website/login-multiple-input.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 | 10 | 11 | 12 |
13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /public/test-website/meta-refresh-5.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Redirecting... 8 | 9 | 10 | -------------------------------------------------------------------------------- /public/test-website/meta-refresh.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | Redirecting... 8 | 9 | 10 | -------------------------------------------------------------------------------- /public/test-website/page-crash.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /public/test-website/slow.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
9 |
10 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /public/views/login-2steps-js.ejs: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Log-in 5 | 6 | 15 | 16 | 41 | 42 | 43 | 44 |

2-steps JavaScript login form

45 | 46 |
47 | 48 | 49 | 50 |
51 |
52 | 53 |
54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /public/views/login-step1.ejs: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

2-steps login form

6 | 7 |
8 | 9 |
10 |

11 | 12 |
13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /public/views/login-step2.ejs: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

2-steps login form

6 | 7 |
8 | 9 | 10 |
11 |

12 | 13 |
14 | 15 | 16 | 17 | -------------------------------------------------------------------------------- /public/views/login.ejs: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 |

Protected login form

6 | 7 |
8 | 9 |
10 |
11 |
12 |

13 | 14 |

15 | 16 |
17 | 18 | 25 | 26 | 27 | 28 | -------------------------------------------------------------------------------- /release.config.js: -------------------------------------------------------------------------------- 1 | /* eslint-disable import/no-commonjs */ 2 | /* eslint-disable no-template-curly-in-string */ 3 | module.exports = { 4 | branch: 'master', 5 | verifyConditions: ['@semantic-release/github'], 6 | prepare: [ 7 | { 8 | path: '@semantic-release/changelog', 9 | changelogFile: 'CHANGELOG.md', 10 | }, 11 | '@semantic-release/npm', 12 | { 13 | path: '@semantic-release/git', 14 | assets: ['package.json', 'CHANGELOG.md'], 15 | message: 16 | 'chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}', 17 | }, 18 | ], 19 | publish: '@semantic-release/github', 20 | success: [], 21 | fail: [], 22 | npmPublish: false, 23 | }; 24 | -------------------------------------------------------------------------------- /renovate.json: -------------------------------------------------------------------------------- 1 | { 2 | "extends": [ 3 | "config:js-app", 4 | "github>algolia/renovate-config-algolia" 5 | ], 6 | "baseBranches": [ 7 | "chore/renovateBaseBranch" 8 | ], 9 | "lockFileMaintenance": { "enabled": false }, 10 | "automergeType": "branch", 11 | "prHourlyLimit": 2, 12 | "prConcurrentLimit": 5 13 | } 14 | -------------------------------------------------------------------------------- /scripts/build.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/sh 2 | 3 | set -ex 4 | 5 | hash=$(git rev-parse HEAD) 6 | current=$(node -e "console.log(require('./package.json').version)") 7 | playwright_version=$(node -e 'console.log(require("./package.json").dependencies.playwright)') 8 | echo "Releasing: $current ; Playwright version: $playwright_version" 9 | echo "" 10 | 11 | # Build renderscript 12 | 13 | # To run locally on your mac m1, you need to change platform to linux/arm64/v8 14 | # For deploy, it should be linux/amd64 15 | docker buildx build \ 16 | --platform linux/amd64 \ 17 | --progress plain \ 18 | -t algolia/renderscript \ 19 | -t "algolia/renderscript:${current}" \ 20 | -t "algolia/renderscript:${hash}" \ 21 | -t "algolia/renderscript:latest" \ 22 | --build-arg "VERSION=${current}" \ 23 | --build-arg "PLAYWRIGHT_VERSION=${playwright_version}" \ 24 | --load \ 25 | . 26 | -------------------------------------------------------------------------------- /scripts/start.sh: -------------------------------------------------------------------------------- 1 | #! /bin/sh 2 | 3 | if [ -f .env ]; then 4 | source .env 5 | fi 6 | 7 | if [ -z "$EXTENSIONS" ]; then 8 | # Headless Chrome, just launch the API 9 | node dist/api/index.js 10 | else 11 | cleanup() { 12 | echo "start.sh: Gracefully exiting" 13 | 14 | # Kill the API first, then XVFB 15 | kill -TERM $api_pid 16 | wait $api_pid >/dev/null || true 17 | 18 | echo "start.sh: Gracefully exited node process" 19 | 20 | kill -TERM $xvfb_pid 21 | wait $xvfb_pid >/dev/null || true 22 | 23 | echo "start.sh: Gracefully exited xfvb" 24 | } 25 | 26 | trap cleanup INT 27 | trap cleanup TERM 28 | 29 | DISPLAY=:95 30 | 31 | Xvfb $DISPLAY -screen 0 1920x1080x16 & 32 | xvfb_pid=$! 33 | DISPLAY="$DISPLAY" node dist/api/index.js & 34 | api_pid=$! 
35 | 36 | wait $api_pid 37 | wait $xvfb_pid 38 | fi 39 | -------------------------------------------------------------------------------- /scripts/test_image.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | set -e 4 | 5 | hash=$1 # the last commit change because of semantic-release 6 | docker run -d --name renderscript_test -p 3000:3000 algolia/renderscript:$hash 7 | 8 | ATTEMPTS=10 9 | until $(curl -o /dev/null -s -f http://localhost:3000/ready); do 10 | echo "waiting for docker..." 11 | sleep 1 12 | ATTEMPTS=$((ATTEMPTS-1)) 13 | if [[ $ATTEMPTS -eq "0" ]]; then 14 | echo "Timed out, check the logs of renderscript_test container:" 15 | docker logs renderscript_test -n 50 16 | exit 1 17 | fi 18 | done 19 | 20 | logs=$(docker logs renderscript_test 2>&1) 21 | echo $logs 22 | 23 | if echo $logs | grep -q '"svc":"brws","msg":"Ready"'; then 24 | echo "Browser ready" 25 | else 26 | echo "Browser not ready" 27 | exit 1 28 | fi 29 | 30 | curl --silent --request POST \ 31 | --url http://localhost:3000/render \ 32 | --header 'Content-Type: application/json' \ 33 | --data '{ 34 | "url": "https://www.example.com", 35 | "ua": "Renderscript CI", 36 | "waitTime": { 37 | "min": 1000, 38 | "max": 3000 39 | } 40 | }' >/dev/null 41 | 42 | logs=$(docker logs renderscript_test 2>&1) 43 | echo $logs 44 | 45 | if echo $logs | grep -q '"msg":"Done","data":'; then 46 | echo "Rendered" 47 | else 48 | echo "Not rendered" 49 | exit 1 50 | fi 51 | 52 | echo "Image OK" 53 | docker stop renderscript_test && docker rm renderscript_test 54 | -------------------------------------------------------------------------------- /scripts/update_adblock_hosts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # adblock hosts file URL 4 | URL="https://raw.githubusercontent.com/badmojr/1Hosts/master/Pro/domains.txt" 5 | 6 | TARGET_DIR="./dist/lib/browser" 7 | TARGET_FILE="adblock_hosts.txt" 8 
| 9 | if curl -o "${TARGET_DIR}/${TARGET_FILE}" "$URL" -s; then 10 | echo "✅ adblock hosts download successful." 11 | else 12 | echo "❌ adblock hosts download failed." 13 | exit 1 14 | fi 15 | -------------------------------------------------------------------------------- /src/__tests__/__snapshots__/async.test.ts.snap: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`async should render async page on chromium 1`] = `"
Algolia Crawler
1. Init - 2. DOMContentLoaded - 3. window.onload"`; 4 | 5 | exports[`async should render async page on firefox 1`] = `"
Algolia Crawler
1. Init - 2. DOMContentLoaded - 3. window.onload"`; 6 | -------------------------------------------------------------------------------- /src/__tests__/__snapshots__/login.test.ts.snap: -------------------------------------------------------------------------------- 1 | // Jest Snapshot v1, https://goo.gl/fbAQLP 2 | 3 | exports[`JavaScript redirect should not try to render the body if renderHTML was not requested 1`] = ` 4 | { 5 | "domain": "localhost", 6 | "expires": -1, 7 | "httpOnly": false, 8 | "name": "sessionToken", 9 | "path": "/secure", 10 | "sameSite": "Strict", 11 | "secure": false, 12 | "value": "53cu23_535510n", 13 | } 14 | `; 15 | 16 | exports[`login should works even with a 2-steps login 1`] = ` 17 | { 18 | "domain": "localhost", 19 | "expires": -1, 20 | "httpOnly": false, 21 | "name": "sessionToken", 22 | "path": "/secure", 23 | "sameSite": "Strict", 24 | "secure": false, 25 | "value": "53cu23_535510n", 26 | } 27 | `; 28 | 29 | exports[`login should works with a 2-steps JS login 1`] = ` 30 | { 31 | "domain": "localhost", 32 | "expires": -1, 33 | "httpOnly": false, 34 | "name": "sessionToken", 35 | "path": "/secure", 36 | "sameSite": "Strict", 37 | "secure": false, 38 | "value": "53cu23_535510n", 39 | } 40 | `; 41 | 42 | exports[`login should works with correct credentials 1`] = ` 43 | { 44 | "domain": "localhost", 45 | "expires": -1, 46 | "httpOnly": false, 47 | "name": "sessionToken", 48 | "path": "/secure", 49 | "sameSite": "Strict", 50 | "secure": false, 51 | "value": "53cu23_535510n", 52 | } 53 | `; 54 | -------------------------------------------------------------------------------- /src/__tests__/api.test.ts: -------------------------------------------------------------------------------- 1 | import type { PostRenderSuccess } from '../api/@types/postRender'; 2 | 3 | import { postRender, request } from './helpers'; 4 | 5 | /** 6 | * Test the schema only on this file. 
7 | */ 8 | describe('POST /render', () => { 9 | it('should validate 200', async () => { 10 | const { res, body } = await postRender({ 11 | url: 'http://localhost:3000/test-website/async.html', 12 | ua: 'Algolia Crawler', 13 | }); 14 | expect(res.statusCode).toBe(200); 15 | 16 | const json: PostRenderSuccess = JSON.parse(body); 17 | expect(json).toStrictEqual({ 18 | body: expect.any(String), 19 | error: null, 20 | rawError: null, 21 | headers: { 22 | 'accept-ranges': 'bytes', 23 | 'cache-control': 'public, max-age=0', 24 | connection: 'keep-alive', 25 | 'content-length': expect.any(String), 26 | 'content-type': 'text/html; charset=UTF-8', 27 | date: expect.any(String), 28 | etag: expect.any(String), 29 | 'keep-alive': 'timeout=5', 30 | 'last-modified': expect.any(String), 31 | }, 32 | statusCode: 200, 33 | resolvedUrl: null, 34 | timeout: false, 35 | metrics: { 36 | renderingBudget: { 37 | consumed: expect.any(Number), 38 | max: 20000, 39 | }, 40 | timings: { 41 | context: expect.any(Number), 42 | equiv: expect.any(Number), 43 | goto: expect.any(Number), 44 | minWait: null, 45 | ready: expect.any(Number), 46 | serialize: expect.any(Number), 47 | close: expect.any(Number), 48 | total: expect.any(Number), 49 | }, 50 | page: { 51 | contentLength: { 52 | main: 763, 53 | total: 763, 54 | }, 55 | mem: { 56 | jsHeapTotalSize: 0, 57 | jsHeapUsedSize: 0, 58 | }, 59 | requests: { 60 | blocked: 0, 61 | pending: 0, 62 | total: 1, 63 | }, 64 | timings: { 65 | download: expect.any(Number), 66 | }, 67 | }, 68 | }, 69 | }); 70 | }); 71 | 72 | it('should handle bad json', async () => { 73 | const res = await request('http://localhost:3000/render', { 74 | method: 'POST', 75 | headers: { 76 | 'content-type': 'application/json', 77 | }, 78 | body: '{"url": "https://example.com", "ua": "test}', 79 | }); 80 | 81 | expect(JSON.parse(res.body)).toStrictEqual({ 82 | status: 400, 83 | error: 'Invalid json: Unexpected end of JSON input', 84 | code: 'invalid_json', 85 | }); 86 | }); 87 | }); 
88 | -------------------------------------------------------------------------------- /src/__tests__/async.test.ts: -------------------------------------------------------------------------------- 1 | import type { PostRenderSuccess } from '../api/@types/postRender'; 2 | 3 | import { cleanString, postRender, request } from './helpers'; 4 | 5 | jest.setTimeout(10000); 6 | 7 | describe('async', () => { 8 | it.each(['chromium', 'firefox'])( 9 | 'should render async page on %s', 10 | async (browser) => { 11 | const { res, body } = await request( 12 | `http://localhost:3000/render?url=http%3A%2F%2Flocalhost%3A3000%2Ftest-website%2Fasync.html&ua=Algolia+Crawler&browser=${browser}` 13 | ); 14 | 15 | expect(res.statusCode).toBe(200); 16 | expect(res.headers).toEqual({ 17 | connection: 'keep-alive', 18 | 'content-length': expect.any(String), 19 | 'content-security-policy': 20 | "default-src 'none'; style-src * 'unsafe-inline'; img-src * data:; font-src *", 21 | 'content-type': 'text/html; charset=utf-8', 22 | date: expect.any(String), 23 | etag: expect.any(String), 24 | 'keep-alive': 'timeout=5', 25 | }); 26 | 27 | expect(cleanString(body)).toMatchSnapshot(); 28 | } 29 | ); 30 | 31 | it('should wait by default for 0ms', async () => { 32 | const { res, body } = await postRender({ 33 | url: 'http://localhost:3000/test-website/async.html', 34 | ua: 'Algolia Crawler', 35 | }); 36 | 37 | const json: PostRenderSuccess = JSON.parse(body); 38 | expect(res.statusCode).toBe(200); 39 | expect(json.metrics.timings.total).toBeLessThanOrEqual(2000); 40 | expect(json.body).not.toMatch('4. 
setTimeout 1000'); 41 | }); 42 | 43 | it('should wait at least 6000ms', async () => { 44 | const { res, body } = await postRender({ 45 | url: 'http://localhost:3000/test-website/async.html', 46 | ua: 'Algolia Crawler', 47 | waitTime: { 48 | min: 6000, 49 | }, 50 | }); 51 | 52 | const json: PostRenderSuccess = JSON.parse(body); 53 | expect(res.statusCode).toBe(200); 54 | 55 | expect(json.metrics.timings.minWait).toBeGreaterThanOrEqual(5000); 56 | expect(json.metrics.timings.total).toBeGreaterThanOrEqual(6000); 57 | expect(json.metrics.timings.total).toBeLessThanOrEqual(7000); 58 | expect(json.body).toMatch('5. setTimeout 5000'); 59 | }); 60 | 61 | it('should wait at most 5000ms', async () => { 62 | const { res, body } = await postRender({ 63 | url: 'http://localhost:3000/test-website/slow.html', 64 | ua: 'Algolia Crawler', 65 | waitTime: { 66 | min: 4000, 67 | max: 5000, 68 | }, 69 | }); 70 | 71 | const json: PostRenderSuccess = JSON.parse(body); 72 | expect(res.statusCode).toBe(200); 73 | expect(json.metrics.timings.goto).toBeLessThanOrEqual(50); 74 | 75 | // In that case the page is slow so min is not used 76 | expect(json.metrics.timings.minWait).toBeNull(); 77 | 78 | expect(json.metrics.timings.ready).toBeLessThanOrEqual(5020); 79 | expect(json.metrics.timings.total).toBeGreaterThanOrEqual(4000); 80 | expect(json.metrics.timings.total).toBeLessThanOrEqual(5120); 81 | 82 | // We count the dot because there is no way to have precise execution 83 | // There should be around 25 dots (one fetch every 200ms during 5s = 25 dots) 84 | // We check for 20 to have some margin 85 | // And no more than 30 to check that it was not executed more than 5s 86 | expect(json.body).toMatch('.'.repeat(20)); 87 | expect(json.body).not.toMatch('.'.repeat(30)); 88 | }); 89 | }); 90 | -------------------------------------------------------------------------------- /src/__tests__/blockedRequests.test.ts: -------------------------------------------------------------------------------- 1 | 
import type { PostRenderSuccess } from '../api/@types/postRender'; 2 | 3 | import { postRender } from './helpers'; 4 | 5 | jest.setTimeout(10000); 6 | 7 | describe('native', () => { 8 | it('should block basic unecessary requests', async () => { 9 | const { res, body } = await postRender({ 10 | url: 'http://localhost:3000/test-website/blocked-requests.html', 11 | ua: 'Algolia Crawler', 12 | }); 13 | 14 | const json: PostRenderSuccess = JSON.parse(body); 15 | 16 | expect(res.statusCode).toBe(200); 17 | expect(json.metrics.page!.requests).toStrictEqual({ 18 | total: 11, 19 | pending: 0, 20 | blocked: 6, 21 | }); 22 | }); 23 | }); 24 | 25 | describe('adblocker', () => { 26 | it('should use adblock', async () => { 27 | const { res, body } = await postRender({ 28 | url: 'http://localhost:3000/test-website/blocked-requests.html', 29 | ua: 'Algolia Crawler', 30 | adblock: true, 31 | }); 32 | 33 | const json: PostRenderSuccess = JSON.parse(body); 34 | 35 | expect(res.statusCode).toBe(200); 36 | expect(json.metrics.page!.requests).toStrictEqual({ 37 | total: 11, 38 | pending: 0, 39 | blocked: 9, 40 | }); 41 | /** 42 | * @example 43 | * https://www.google-analytics.com/analytics.js 44 | * https://static.ads-twitter.com/uwt.js 45 | * https://www.googletagmanager.com/gtm.js?id=GTM-FOOBAR&l=dataLayer 46 | * https://via.placeholder.com/150 47 | * https://via.placeholder.com/152 48 | * http://localhost:3000/301 49 | * https://res.cloudinary.com/hilnmyskv/image/upload/v1623928136/ui-library/nav/search.svg 50 | * https://fonts.gstatic.com/s/qahiri/v1/tsssAp1RZy0C_hGeVHqgjHq-pg.woff2 51 | * https://fonts.gstatic.com/s/roboto/v30/KFOiCnqEu92Fr1Mu51QrIzc.ttf 52 | */ 53 | }); 54 | }); 55 | -------------------------------------------------------------------------------- /src/__tests__/errors.test.ts: -------------------------------------------------------------------------------- 1 | import type { PostRenderSuccess } from '../api/@types/postRender'; 2 | import type { BrowserEngine } from 
'../lib/browser/Browser'; 3 | 4 | import { postRender } from './helpers'; 5 | 6 | jest.setTimeout(30000); 7 | 8 | describe('errors', () => { 9 | it('should catch DNS error', async () => { 10 | const { res, body } = await postRender({ 11 | url: 'http://thisisnota-domain.thistld.does.not.exist', 12 | ua: 'Algolia Crawler', 13 | }); 14 | 15 | const json: PostRenderSuccess = JSON.parse(body); 16 | expect(res.statusCode).toBe(200); 17 | expect(json.body).toBeNull(); 18 | expect(json.error).toBe('dns_error'); 19 | }); 20 | 21 | // Firefox doesn't crash reliably on the CI 22 | it.each(['chromium' /* , 'firefox' */])( 23 | '%s should catch Page Crashed', 24 | async (browser) => { 25 | const { res, body } = await postRender({ 26 | url: 'http://localhost:3000/test-website/page-crash.html', 27 | ua: 'Algolia Crawler', 28 | browser: browser as BrowserEngine, 29 | waitTime: { 30 | max: 10000, 31 | }, 32 | }); 33 | 34 | const json: PostRenderSuccess = JSON.parse(body); 35 | expect(res.statusCode).toBe(500); 36 | expect(json.body).toBeNull(); 37 | expect(json.error).toBe('body_serialisation_failed'); 38 | } 39 | ); 40 | }); 41 | -------------------------------------------------------------------------------- /src/__tests__/helpers.ts: -------------------------------------------------------------------------------- 1 | import type { IncomingHttpHeaders } from 'http'; 2 | 3 | import type { Cookie } from 'playwright'; 4 | import { request as req } from 'undici'; 5 | import type Dispatcher from 'undici/types/dispatcher'; 6 | 7 | import type { 8 | PostLoginParams, 9 | PostLoginSuccess, 10 | } from '../api/@types/postLogin'; 11 | import type { PostRenderParams } from '../api/@types/postRender'; 12 | 13 | /** Sends a request through undici's `request` and concatenates every chunk of the response body into one string; returns the raw response and that string. */ export async function request( 14 | url: string, 15 | params?: Parameters<typeof req>[1] 16 | ): Promise<{ res: Dispatcher.ResponseData; body: string }> { 17 | const res = await req(url, params); 18 | 19 | let body = ''; 20 | for await (const chunk of res.body) { 21 | body += chunk.toString(); 22
| } 23 | 24 | return { res, body }; 25 | } 26 | 27 | /** POSTs `opts` as JSON to the local render endpoint; `ua` defaults to 'Algolia Crawler' unless `opts` overrides it, and `headers` are merged after the content-type header. */ export async function postRender( 28 | opts: Partial<PostRenderParams>, 29 | headers?: IncomingHttpHeaders 30 | ): Promise<{ res: Dispatcher.ResponseData; body: string }> { 31 | return await request('http://localhost:3000/render', { 32 | method: 'POST', 33 | headers: { 34 | 'content-type': 'application/json', 35 | ...headers, 36 | }, 37 | body: JSON.stringify({ 38 | ua: 'Algolia Crawler', 39 | ...opts, 40 | }), 41 | }); 42 | } 43 | 44 | /** POSTs `opts` as JSON to the local login endpoint; `ua` defaults to 'Algolia Crawler' unless `opts` overrides it. */ export async function sendLoginRequest( 45 | opts: Partial<PostLoginParams> 46 | ): Promise<{ res: Dispatcher.ResponseData; body: string }> { 47 | return await request('http://localhost:3000/login', { 48 | method: 'POST', 49 | headers: { 50 | 'content-type': 'application/json', 51 | }, 52 | body: JSON.stringify({ 53 | ua: 'Algolia Crawler', 54 | ...opts, 55 | }), 56 | }); 57 | } 58 | 59 | /** Removes every CR and LF, then every run of two or more whitespace characters (a single space is kept). */ export function cleanString(body: string): string { 60 | return body.replace(/\n|\r/g, '').replace(/\s\s+/g, ''); 61 | } 62 | 63 | /** Returns each cookie without its value, expires, httpOnly, secure and sameSite fields. */ export function cleanCookies( 64 | cookies: PostLoginSuccess['cookies'] 65 | ): Array< 66 | Omit<Cookie, 'value' | 'expires' | 'httpOnly' | 'secure' | 'sameSite'> 67 | > { 68 | return cookies.map( 69 | ({ value, expires, httpOnly, secure, sameSite, ...rest }) => { 70 | return rest; 71 | } 72 | ); 73 | } 74 | 75 | /** Serializes cookies as `name=value` pairs joined by '; '; returns an empty string when `cookies` is falsy. */ export function cookiesToString(cookies: PostLoginSuccess['cookies']): string { 76 | if (!cookies) { 77 | return ''; 78 | } 79 | return cookies.map((cookie) => `${cookie.name}=${cookie.value}`).join('; '); 80 | } 81 | -------------------------------------------------------------------------------- /src/__tests__/index.test.ts: -------------------------------------------------------------------------------- 1 | import { cleanString, request } from './helpers'; 2 | 3 | jest.setTimeout(30 * 1000); 4 | 5 | describe('main', () => { 6 | it('should error when no url', async () => { 7 | const { res, body } = await request('http://localhost:3000/render?'); 8 | 9 | expect(res.statusCode).toBe(400); 10 | 11 | expect(JSON.parse(body)).toEqual({ 12 | details: [ 13 | {
label: 'url', message: 'url is required', type: 'required' }, 14 | { label: 'ua', message: 'ua is required', type: 'required' }, 15 | ], 16 | error: true, 17 | message: 'Bad Request', 18 | }); 19 | }); 20 | 21 | it('should error when no user agent', async () => { 22 | const { res, body } = await request( 23 | 'http://localhost:3000/render?url=http%3A%2F%2Flocalhost%3A3000%2Ftest-website%2Fbasic.html' 24 | ); 25 | 26 | expect(res.statusCode).toBe(400); 27 | 28 | expect(JSON.parse(body)).toEqual({ 29 | error: true, 30 | message: 'Bad Request', 31 | details: [{ label: 'ua', type: 'required', message: 'ua is required' }], 32 | }); 33 | }); 34 | 35 | it('should validate waitTime', async () => { 36 | const { res, body } = await request( 37 | 'http://localhost:3000/render?url=http%3A%2F%2Flocalhost%3A3000%2Ftest-website%2Fbasic.html&ua=Algolia+Crawler&waitTime[min]=foo&waitTime[max]=bar' 38 | ); 39 | 40 | expect(res.statusCode).toBe(400); 41 | 42 | expect(JSON.parse(body)).toEqual({ 43 | error: true, 44 | message: 'Bad Request', 45 | details: [ 46 | { 47 | errors: [ 48 | { 49 | label: 'min', 50 | message: 'min must be a valid number', 51 | type: 'number.typeof', 52 | }, 53 | { 54 | label: 'max', 55 | message: 'max must be a valid number', 56 | type: 'number.typeof', 57 | }, 58 | ], 59 | label: 'waitTime', 60 | message: 'waitTime does not match its schema', 61 | type: 'object.schema', 62 | }, 63 | ], 64 | }); 65 | }); 66 | 67 | it.each(['chromium', 'firefox'])( 68 | 'should render basic page on %s', 69 | async (browser) => { 70 | const { res, body } = await request( 71 | `http://localhost:3000/render?url=http%3A%2F%2Flocalhost%3A3000%2Ftest-website%2Fbasic.html&ua=Algolia+Crawler&browser=${browser}` 72 | ); 73 | 74 | expect(res.statusCode).toBe(200); 75 | expect(res.headers).toEqual({ 76 | connection: 'keep-alive', 77 | 'content-length': '79', 78 | 'content-security-policy': 79 | "default-src 'none'; style-src * 'unsafe-inline'; img-src * data:; font-src *", 80 | 
'content-type': 'text/html; charset=utf-8', 81 | date: expect.any(String), 82 | etag: 'W/"4f-3aYUmdp4dkv6HiR9rJEG+VKiCsw"', 83 | 'keep-alive': 'timeout=5', 84 | }); 85 | 86 | expect(cleanString(body)).toBe( 87 | `A basic page` 88 | ); 89 | } 90 | ); 91 | }); 92 | -------------------------------------------------------------------------------- /src/__tests__/list.test.ts: -------------------------------------------------------------------------------- 1 | import { wait } from '../helpers/wait'; 2 | 3 | import { request } from './helpers'; 4 | 5 | jest.setTimeout(25000); 6 | 7 | describe('list', () => { 8 | it('should list nothing', async () => { 9 | const { res, body } = await request('http://localhost:3000/list'); 10 | 11 | expect(res.statusCode).toBe(200); 12 | const parsed = JSON.parse(body); 13 | expect(parsed).toEqual({ 14 | open: { 15 | chromium: [], 16 | firefox: [], 17 | }, 18 | }); 19 | }); 20 | 21 | it('should list current page', async () => { 22 | const r = request('http://localhost:3000/render', { 23 | method: 'POST', 24 | headers: { 25 | 'content-type': 'application/json', 26 | }, 27 | body: JSON.stringify({ 28 | url: 'http://localhost:3000/test-website/slow.html', 29 | ua: 'Algolia Crawler', 30 | waitTime: { 31 | min: 2000, 32 | max: 3000, 33 | }, 34 | }), 35 | }); 36 | 37 | await wait(1000); 38 | 39 | // Currently processing 40 | const res1 = await request('http://localhost:3000/list'); 41 | const parsed1 = JSON.parse(res1.body); 42 | expect(parsed1).toEqual({ 43 | open: { 44 | chromium: ['http://localhost:3000/test-website/slow.html'], 45 | firefox: [], 46 | }, 47 | }); 48 | 49 | await r; 50 | 51 | // Cleared 52 | const res2 = await request('http://localhost:3000/list'); 53 | const parsed2 = JSON.parse(res2.body); 54 | expect(parsed2).toEqual({ 55 | open: { 56 | chromium: [], 57 | firefox: [], 58 | }, 59 | }); 60 | }); 61 | }); 62 | -------------------------------------------------------------------------------- /src/__tests__/login.real.test.ts: 
-------------------------------------------------------------------------------- 1 | import type { Cookie } from 'playwright'; 2 | 3 | import type { PostLoginSuccess } from '../api/@types/postLogin'; 4 | import type { PostRenderSuccess } from '../api/@types/postRender'; 5 | 6 | import { 7 | cleanCookies, 8 | cookiesToString, 9 | postRender, 10 | sendLoginRequest, 11 | } from './helpers'; 12 | 13 | const rawCreds = process.env.LOGIN_CREDENTIALS; 14 | const canExec = process.env.CI || rawCreds; 15 | 16 | jest.setTimeout(25000); 17 | 18 | // !--- Not working right now 19 | // eslint-disable-next-line jest/no-disabled-tests 20 | describe.skip('Real Login', () => { 21 | let creds: { [name: string]: { username: string; password: string } }; 22 | 23 | beforeAll(() => { 24 | if (!canExec) { 25 | throw new Error('can only exec in CI or with LOGIN_CREDENTIALS'); 26 | } 27 | 28 | creds = JSON.parse(rawCreds!); 29 | }); 30 | 31 | describe('login.live.com', () => { 32 | let cookies: Cookie[]; 33 | let cred: { username: string; password: string }; 34 | beforeAll(() => { 35 | cred = creds['login.live.com']; 36 | }); 37 | 38 | it('should not be logged', async () => { 39 | const { res, body } = await postRender({ 40 | url: 'https://account.microsoft.com/billing/orders?refd=account.microsoft.com', 41 | }); 42 | 43 | expect(res.statusCode).toBe(200); 44 | const parsed: PostRenderSuccess = JSON.parse(body); 45 | expect(parsed.statusCode).toBe(302); 46 | expect( 47 | parsed.resolvedUrl?.startsWith('https://login.live.com/login.srf') 48 | ).toBe(true); 49 | }); 50 | 51 | it('get proper cookies', async () => { 52 | const { res, body } = await sendLoginRequest({ 53 | url: 'https://account.microsoft.com/billing/orders?refd=account.microsoft.com', 54 | username: cred.username, 55 | password: cred.password, 56 | }); 57 | 58 | expect(res.statusCode).toBe(200); 59 | const parsed: PostLoginSuccess = JSON.parse(body); 60 | const tmp = cleanCookies(parsed.cookies); 61 | [ 62 | { domain: 
'.account.live.com', name: 'RPSMaybe', path: '/' }, 63 | { domain: '.account.microsoft.com', name: 'AMCSecAuth', path: '/' }, 64 | { domain: '.account.microsoft.com', name: 'ANON', path: '/' }, 65 | { domain: '.account.microsoft.com', name: 'NAP', path: '/' }, 66 | { domain: '.live.com', name: 'amsc', path: '/' }, 67 | { domain: '.live.com', name: 'ANON', path: '/' }, 68 | { domain: '.live.com', name: 'mkt', path: '/' }, 69 | { domain: '.live.com', name: 'mkt1', path: '/' }, 70 | { domain: '.live.com', name: 'MSPAuth', path: '/' }, 71 | { domain: '.live.com', name: 'MSPProf', path: '/' }, 72 | { domain: '.live.com', name: 'NAP', path: '/' }, 73 | { domain: '.live.com', name: 'PPLState', path: '/' }, 74 | { domain: '.live.com', name: 'wlidperf', path: '/' }, 75 | { domain: '.live.com', name: 'WLSSC', path: '/' }, 76 | { domain: '.login.live.com', name: 'JSH', path: '/' }, 77 | { domain: '.login.live.com', name: 'JSHP', path: '/' }, 78 | { domain: '.login.live.com', name: 'MSCC', path: '/' }, 79 | { domain: '.login.live.com', name: 'MSPBack', path: '/' }, 80 | { domain: '.login.live.com', name: 'MSPOK', path: '/' }, 81 | { domain: '.login.live.com', name: 'MSPRequ', path: '/' }, 82 | { domain: '.login.live.com', name: 'MSPRequ', path: '/' }, 83 | { domain: '.login.live.com', name: 'MSPSoftVis', path: '/' }, 84 | { domain: '.login.live.com', name: 'OParams', path: '/' }, 85 | { domain: '.login.live.com', name: 'SDIDC', path: '/' }, 86 | { domain: '.login.live.com', name: 'uaid', path: '/' }, 87 | { domain: '.login.live.com', name: 'uaid', path: '/' }, 88 | { domain: '.microsoft.com', name: 'display-culture', path: '/' }, 89 | { domain: '.microsoft.com', name: 'market', path: '/' }, 90 | { domain: 'account.microsoft.com', name: 'ai_session', path: '/' }, 91 | { domain: 'account.microsoft.com', name: 'AMC-MS-CV', path: '/' }, 92 | { domain: 'account.microsoft.com', name: 'authBounced', path: '/' }, 93 | { domain: 'account.microsoft.com', name: 'canary', path: '/' }, 94 
| { domain: 'account.microsoft.com', name: 'GRNID', path: '/' }, 95 | { domain: 'account.microsoft.com', name: 'GroupIds', path: '/' }, 96 | { domain: 'account.microsoft.com', name: 'ShCLSessionID', path: '/' }, 97 | // { domain: 'login.live.com', name: '__Host-MSAAUTH', path: '/' }, seems optional 98 | { domain: 'login.live.com', name: '__Host-MSAAUTHP', path: '/' }, 99 | ].forEach((cookie) => { 100 | expect( 101 | tmp.find((c) => c.name === cookie.name && c.domain === cookie.domain) 102 | ).toStrictEqual(cookie); 103 | }); 104 | 105 | cookies = parsed.cookies; 106 | }); 107 | 108 | it('should be logged', async () => { 109 | const { res, body } = await postRender( 110 | { 111 | url: 'https://account.microsoft.com/billing/orders?refd=account.microsoft.com', 112 | }, 113 | { 114 | Cookie: cookiesToString(cookies), 115 | } 116 | ); 117 | 118 | expect(res.statusCode).toBe(200); 119 | const parsed: PostRenderSuccess = JSON.parse(body); 120 | expect(parsed.statusCode).toBe(200); 121 | }); 122 | }); 123 | }); 124 | -------------------------------------------------------------------------------- /src/__tests__/login.test.ts: -------------------------------------------------------------------------------- 1 | import type { Cookie } from 'playwright'; 2 | 3 | import type { PostLoginSuccess } from '../api/@types/postLogin'; 4 | 5 | import { sendLoginRequest } from './helpers'; 6 | 7 | jest.setTimeout(45000); 8 | 9 | describe('login', () => { 10 | it('should error when no username', async () => { 11 | const { res, body } = await sendLoginRequest({ 12 | url: 'http://localhost:3000/secure/login', 13 | username: '', 14 | password: 'password', 15 | }); 16 | 17 | expect(res.statusCode).toBe(400); 18 | 19 | expect(JSON.parse(body)).toEqual({ 20 | details: [ 21 | { 22 | label: 'username', 23 | message: 'username is required', 24 | type: 'required', 25 | }, 26 | ], 27 | error: true, 28 | message: 'Bad Request', 29 | }); 30 | }); 31 | 32 | it('should error when no password', async () 
=> { 33 | const { res, body } = await sendLoginRequest({ 34 | url: 'http://localhost:3000/secure/login', 35 | username: 'admin', 36 | password: '', 37 | }); 38 | 39 | expect(res.statusCode).toBe(400); 40 | 41 | expect(JSON.parse(body)).toEqual({ 42 | details: [ 43 | { 44 | label: 'password', 45 | message: 'password is required', 46 | type: 'required', 47 | }, 48 | ], 49 | error: true, 50 | message: 'Bad Request', 51 | }); 52 | }); 53 | 54 | it('should error multiple text input', async () => { 55 | const { res, body } = await sendLoginRequest({ 56 | url: 'http://localhost:3000/test-website/login-multiple-input.html', 57 | username: 'admin', 58 | password: 'paswword', 59 | }); 60 | 61 | expect(res.statusCode).toBe(500); 62 | const parsed: PostLoginSuccess = JSON.parse(body); 63 | expect(parsed.error).toBe('no_cookies'); 64 | expect(parsed.rawError).toBeNull(); 65 | }); 66 | 67 | it('should error double password', async () => { 68 | const { res, body } = await sendLoginRequest({ 69 | url: 'http://localhost:3000/test-website/login-double-password.html', 70 | username: 'admin', 71 | password: 'paswword', 72 | }); 73 | 74 | expect(res.statusCode).toBe(200); 75 | const parsed: PostLoginSuccess = JSON.parse(body); 76 | expect(parsed.error).toBe('too_many_fields'); 77 | expect(parsed.rawError?.message).toBe( 78 | 'Too many input found for "input[type=password]:not([aria-hidden="true"])", found "2"' 79 | ); 80 | }); 81 | 82 | it('should works with correct credentials', async () => { 83 | const { res, body } = await sendLoginRequest({ 84 | url: 'http://localhost:3000/secure/login', 85 | username: 'admin', 86 | password: 'password', 87 | }); 88 | 89 | expect(res.statusCode).toBe(200); 90 | 91 | const parsed: PostLoginSuccess = JSON.parse(body); 92 | expect( 93 | parsed.cookies.find((cookie) => cookie.name === 'sessionToken') 94 | ).toMatchSnapshot(); 95 | // Check that we actually went through the form 96 | expect( 97 | parsed.cookies.find((cookie) => cookie.name === '_csrf') 
98 | ).toBeDefined(); 99 | }); 100 | 101 | it('should works even with a 2-steps login', async () => { 102 | const { res, body } = await sendLoginRequest({ 103 | url: 'http://localhost:3000/secure/login/step1', 104 | username: 'admin', 105 | password: 'password', 106 | }); 107 | 108 | expect(res.statusCode).toBe(200); 109 | 110 | const cookies: Cookie[] = JSON.parse(body).cookies; 111 | expect( 112 | cookies.find((cookie) => cookie.name === 'sessionToken') 113 | ).toMatchSnapshot(); 114 | // Check that we actually went through the form 115 | expect(cookies.find((cookie) => cookie.name === '_csrf')).toBeDefined(); 116 | }); 117 | 118 | it('should works with a 2-steps JS login', async () => { 119 | const { res, body } = await sendLoginRequest({ 120 | url: 'http://localhost:3000/secure/login/2steps', 121 | username: 'admin', 122 | password: 'password', 123 | }); 124 | 125 | expect(res.statusCode).toBe(200); 126 | 127 | const cookies: Cookie[] = JSON.parse(body).cookies; 128 | expect( 129 | cookies.find((cookie) => cookie.name === 'sessionToken') 130 | ).toMatchSnapshot(); 131 | // Check that we actually went through the form 132 | expect(cookies.find((cookie) => cookie.name === '_csrf')).toBeDefined(); 133 | }); 134 | 135 | it('should works but not get a session token with bad credentials', async () => { 136 | const { res, body } = await sendLoginRequest({ 137 | url: 'http://localhost:3000/secure/login', 138 | username: 'admin', 139 | password: 'admin', 140 | }); 141 | 142 | expect(res.statusCode).toBe(200); 143 | 144 | const parsed: PostLoginSuccess = JSON.parse(body); 145 | expect(parsed.cookies).toHaveLength(1); 146 | expect( 147 | parsed.cookies.find((cookie) => cookie.name === 'sessionToken') 148 | ).toBeUndefined(); 149 | // Check that we actually went through the form 150 | expect( 151 | parsed.cookies.find((cookie) => cookie.name === '_csrf') 152 | ).toBeDefined(); 153 | }); 154 | }); 155 | 156 | describe('JavaScript redirect', () => { 157 | it('should fail to 
renderHTML because of the JS redirect', async () => { 158 | const { res, body } = await sendLoginRequest({ 159 | url: 'http://localhost:3000/secure/login?redirect=true', 160 | username: 'admin', 161 | password: 'password', 162 | renderHTML: true, 163 | waitTime: { 164 | min: 1000, 165 | }, 166 | }); 167 | 168 | expect(res.statusCode).toBe(200); 169 | expect(body).toBe( 170 | 'OK(/test)' 171 | ); 172 | }); 173 | 174 | it('should not try to render the body if renderHTML was not requested', async () => { 175 | const { res, body } = await sendLoginRequest({ 176 | url: 'http://localhost:3000/secure/login?redirect=true', 177 | username: 'admin', 178 | password: 'password', 179 | waitTime: { 180 | min: 1000, 181 | }, 182 | }); 183 | 184 | // Since we didn't try to render, it returns the current cookies, even if there is an ongoing JS redirection 185 | expect(res.statusCode).toBe(200); 186 | 187 | const parsed: PostLoginSuccess = JSON.parse(body); 188 | expect(parsed.body).toBe( 189 | 'OK(/test)' 190 | ); 191 | expect(parsed.statusCode).toBe(200); 192 | expect(parsed.metrics.timings.total).toBeGreaterThan(1000); 193 | expect(parsed.resolvedUrl).toBe('http://localhost:3000/secure/test'); 194 | expect( 195 | parsed.cookies.find((cookie) => cookie.name === 'sessionToken') 196 | ).toMatchSnapshot(); 197 | }); 198 | }); 199 | -------------------------------------------------------------------------------- /src/__tests__/redirect.test.ts: -------------------------------------------------------------------------------- 1 | import type { PostRenderSuccess } from '../api/@types/postRender'; 2 | 3 | import { cleanString, postRender, request } from './helpers'; 4 | 5 | describe('server redirect', () => { 6 | it('should return the redirection', async () => { 7 | // !--- 8 | // Server Redirect are flaky since Playwright do not catch 301 9 | // You might want to relaunch the test if it failed. 
10 | const { res, body } = await postRender({ 11 | url: 'http://localhost:3000/301', 12 | waitTime: { 13 | min: 5000, // wait long to be sure we end up being redirected 14 | }, 15 | }); 16 | 17 | const json: PostRenderSuccess = JSON.parse(body); 18 | expect(res.statusCode).toBe(200); 19 | 20 | expect(json.body).toBeNull(); 21 | expect(json.headers).toMatchObject({ 22 | location: '/test-website/basic.html', 23 | }); 24 | expect(json.statusCode).toBe(301); 25 | expect(json.timeout).toBe(false); 26 | expect(json.resolvedUrl).toBe( 27 | 'http://localhost:3000/test-website/basic.html' 28 | ); 29 | 30 | // Make sure execution was interrupted gracefully 31 | expect(json.metrics.timings.total).toBeGreaterThan(0); 32 | expect(json.metrics.timings.serialize).toBeNull(); 33 | expect(json.metrics.timings.close).toBeGreaterThan(0); 34 | }); 35 | }); 36 | 37 | describe('meta refresh', () => { 38 | it('should return the redirection', async () => { 39 | const { res, body } = await postRender({ 40 | url: 'http://localhost:3000/test-website/meta-refresh.html', 41 | ua: 'Algolia Crawler', 42 | }); 43 | 44 | const json: PostRenderSuccess = JSON.parse(body); 45 | expect(res.statusCode).toBe(200); 46 | 47 | expect(json.statusCode).toBe(200); 48 | expect(json.body).toBeNull(); 49 | expect(json.resolvedUrl).toBe( 50 | 'http://localhost:3000/test-website/basic.html' 51 | ); 52 | expect(json.error).toBe('redirection'); 53 | 54 | // Make sure execution was interrupted gracefully 55 | expect(json.metrics.timings.total).toBeGreaterThan(0); 56 | expect(json.metrics.timings.serialize).toBeNull(); 57 | expect(json.metrics.timings.close).toBeGreaterThan(0); 58 | }); 59 | 60 | it('should return the redirection even if not executed yet', async () => { 61 | const { res, body } = await postRender({ 62 | // The client redirection happens after 5sec but we only wait 2sec 63 | url: 'http://localhost:3000/test-website/meta-refresh-5.html', 64 | ua: 'Algolia Crawler', 65 | waitTime: { 66 | max: 2000, 67 | 
}, 68 | }); 69 | 70 | const json: PostRenderSuccess = JSON.parse(body); 71 | expect(res.statusCode).toBe(200); 72 | 73 | expect(json.statusCode).toBe(200); 74 | expect(json.body).toBeNull(); 75 | expect(json.resolvedUrl).toBe( 76 | 'http://localhost:3000/test-website/basic.html' 77 | ); 78 | expect(json.error).toBe('redirection'); 79 | 80 | // Make sure execution was interrupted gracefully 81 | expect(json.metrics.timings.total).toBeGreaterThan(0); 82 | expect(json.metrics.timings.serialize).toBeNull(); 83 | expect(json.metrics.timings.close).toBeGreaterThan(0); 84 | }); 85 | }); 86 | 87 | describe('js redirects', () => { 88 | it('should catch redirection', async () => { 89 | const { res, body } = await postRender({ 90 | url: 'http://localhost:3000/test-website/js-redirect.html?to=/test-website/basic.html', 91 | ua: 'Algolia Crawler', 92 | waitTime: { 93 | max: 2000, 94 | }, 95 | }); 96 | 97 | const json: PostRenderSuccess = JSON.parse(body); 98 | expect(res.statusCode).toBe(200); 99 | 100 | expect(json.statusCode).toBe(200); 101 | expect(json.body).toBeNull(); 102 | expect(json.resolvedUrl).toBe( 103 | 'http://localhost:3000/test-website/basic.html' 104 | ); 105 | expect(json.error).toBe('redirection'); 106 | 107 | // Make sure execution was interrupted gracefully 108 | expect(json.metrics.timings.total).toBeGreaterThan(0); 109 | expect(json.metrics.timings.serialize).toBeNull(); 110 | expect(json.metrics.timings.close).toBeGreaterThanOrEqual(0); 111 | }); 112 | 113 | it('should catch path', async () => { 114 | const { res, body } = await postRender({ 115 | url: 'http://localhost:3000/test-website/js-redirect-path.html', 116 | ua: 'Algolia Crawler', 117 | waitTime: { 118 | min: 2000, 119 | }, 120 | }); 121 | 122 | const json: PostRenderSuccess = JSON.parse(body); 123 | expect(res.statusCode).toBe(200); 124 | 125 | expect(json.statusCode).toBe(200); 126 | expect(json.body).toBeNull(); 127 | expect(json.resolvedUrl).toBe( 128 | 
'http://localhost:3000/test-website/basic.html' 129 | ); 130 | expect(json.error).toBe('redirection'); 131 | 132 | // Make sure execution was interrupted gracefully 133 | expect(json.metrics.timings.total).toBeGreaterThan(0); 134 | expect(json.metrics.timings.serialize).toBeNull(); 135 | expect(json.metrics.timings.close).toBeGreaterThanOrEqual(0); 136 | }); 137 | 138 | it('should catch history pushState', async () => { 139 | const { res, body } = await postRender({ 140 | url: 'http://localhost:3000/test-website/js-redirect-history.html', 141 | ua: 'Algolia Crawler', 142 | waitTime: { 143 | min: 2000, 144 | }, 145 | }); 146 | 147 | const json: PostRenderSuccess = JSON.parse(body); 148 | expect(res.statusCode).toBe(200); 149 | 150 | expect(json.statusCode).toBe(200); 151 | expect(json.body).toBeNull(); 152 | expect(json.resolvedUrl).toBe( 153 | 'http://localhost:3000/test-website/basic.html' 154 | ); 155 | expect(json.error).toBe('redirection'); 156 | 157 | // Make sure execution was interrupted gracefully 158 | expect(json.metrics.timings.total).toBeGreaterThan(0); 159 | expect(json.metrics.timings.serialize).toBeNull(); 160 | expect(json.metrics.timings.close).toBeGreaterThanOrEqual(0); 161 | }); 162 | 163 | it('should catch hash but render normally', async () => { 164 | const { res, body } = await postRender({ 165 | url: 'http://localhost:3000/test-website/js-redirect-hash.html', 166 | ua: 'Algolia Crawler', 167 | waitTime: { 168 | min: 2000, 169 | }, 170 | }); 171 | 172 | const json: PostRenderSuccess = JSON.parse(body); 173 | expect(res.statusCode).toBe(200); 174 | 175 | expect(json.statusCode).toBe(200); 176 | expect(json.body).toBe( 177 | ` \n\n\n \n\n\n\n` 178 | ); 179 | expect(json.error).toBeNull(); 180 | 181 | // Make sure execution was interrupted gracefully 182 | expect(json.metrics.timings.total).toBeGreaterThan(0); 183 | expect(json.metrics.timings.serialize).toBeGreaterThan(0); 184 | expect(json.metrics.timings.close).toBeGreaterThanOrEqual(0); 185 | 
}); 186 | 187 | it('should output 307', async () => { 188 | const { res, body } = await request( 189 | `http://localhost:3000/render?url=http%3A%2F%2Flocalhost%3A3000%2Ftest-website%2Fjs-redirect.html?to=${encodeURIComponent( 190 | '/test-website/basic.html' 191 | )}&ua=Algolia+Crawler` 192 | ); 193 | 194 | expect(res.statusCode).toBe(307); 195 | expect(res.headers).toEqual({ 196 | connection: 'keep-alive', 197 | 'content-length': '0', 198 | date: expect.any(String), 199 | 'keep-alive': 'timeout=5', 200 | location: 'http://localhost:3000/test-website/basic.html', 201 | }); 202 | 203 | expect(cleanString(body)).toBe(''); 204 | }); 205 | }); 206 | -------------------------------------------------------------------------------- /src/__tests__/tasksManager.test.ts: -------------------------------------------------------------------------------- 1 | import type { GetHealthySuccess } from '../api/@types/getHealthy'; 2 | 3 | import { postRender, request } from './helpers'; 4 | 5 | describe('manager', () => { 6 | it('should properly close page after done', async () => { 7 | // Before 8 | const { res, body } = await request('http://localhost:3000/healthy'); 9 | expect(res.statusCode).toBe(200); 10 | 11 | const before: GetHealthySuccess = JSON.parse(body); 12 | expect(before).toEqual({ 13 | ready: true, 14 | tasksRunning: 0, 15 | pagesOpen: 0, 16 | totalRun: expect.any(Number), 17 | }); 18 | 19 | // Process something 20 | const { res: resRender } = await postRender({ 21 | url: 'http://localhost:3000/test-website/async.html', 22 | ua: 'Algolia Crawler', 23 | }); 24 | expect(resRender.statusCode).toBe(200); 25 | 26 | // After 27 | const { res: resAfter, body: bodyAfter } = await request( 28 | 'http://localhost:3000/healthy' 29 | ); 30 | expect(resAfter.statusCode).toBe(200); 31 | 32 | const after: GetHealthySuccess = JSON.parse(bodyAfter); 33 | expect(after).toEqual({ 34 | ready: true, 35 | tasksRunning: 0, 36 | pagesOpen: 0, 37 | totalRun: expect.any(Number), 38 | }); 39 | 40 
| // Compare before/after instead of an exact value: we can't know how many tasks earlier runs already executed 41 | expect(after.totalRun).toBeGreaterThan(0); 42 | expect(before.totalRun).toBeLessThan(after.totalRun); 43 | }); 44 | }); 45 | -------------------------------------------------------------------------------- /src/api/@types/getHealthy.ts: -------------------------------------------------------------------------------- 1 | export interface GetHealthySuccess { 2 | ready: boolean; 3 | tasksRunning: number; 4 | pagesOpen: number; 5 | totalRun: number; 6 | } 7 | -------------------------------------------------------------------------------- /src/api/@types/getList.ts: -------------------------------------------------------------------------------- 1 | export interface GetListSuccess { 2 | open: { [engine: string]: string[] }; 3 | } 4 | -------------------------------------------------------------------------------- /src/api/@types/getRoot.ts: -------------------------------------------------------------------------------- 1 | export interface GetRoot { 2 | version: string; 3 | } 4 | -------------------------------------------------------------------------------- /src/api/@types/postLogin.ts: -------------------------------------------------------------------------------- 1 | import type { Cookie } from 'playwright'; 2 | 3 | import type { Metrics, TaskBaseParams } from '../../lib/types'; 4 | 5 | import type { Res500 } from './responses'; 6 | 7 | export type PostLoginParams = Omit< 8 | TaskBaseParams, 9 | 'type' | 'url' | 'userAgent' 10 | > & { 11 | url: string; 12 | ua: string; 13 | username: string; 14 | password: string; 15 | renderHTML: boolean; 16 | }; 17 | 18 | export type PostLoginResponse = PostLoginSuccess | Res500; 19 | 20 | export interface PostLoginSuccess { 21 | /** 22 | * HTTP Code of the rendered page. 23 | */ 24 | statusCode: number | null; 25 | 26 | /** 27 | * HTTP Headers of the rendered page.
28 | */ 29 | headers: Record; 30 | 31 | /** 32 | * Metrics from different tasks during the rendering. 33 | */ 34 | metrics: Metrics; 35 | 36 | /** 37 | * Has the page reached timeout? 38 | * When timeout has been reached we continue the rendering as usual 39 | * but reduce other timeout to a minimum. 40 | */ 41 | timeout: boolean; 42 | 43 | /** 44 | * Any error encountered along the way. 45 | * If this field is filled that means the rest of the payload is partial. 46 | */ 47 | error: string | null; 48 | rawError: { message: string; stack?: string } | null; 49 | 50 | /** 51 | * Cookies generated from a successful login. 52 | */ 53 | cookies: Cookie[]; 54 | 55 | /** 56 | * The URL at the end of a successful login. 57 | */ 58 | resolvedUrl: string | null; 59 | 60 | /** 61 | * Body at the end of a successful login. 62 | */ 63 | body: string | null; 64 | } 65 | -------------------------------------------------------------------------------- /src/api/@types/postRender.ts: -------------------------------------------------------------------------------- 1 | import type { 2 | HandledError, 3 | Metrics, 4 | TaskBaseParams, 5 | UnhandledError, 6 | } from '../../lib/types'; 7 | 8 | import type { Res500 } from './responses'; 9 | 10 | export type PostRenderParams = Omit< 11 | TaskBaseParams, 12 | 'type' | 'url' | 'userAgent' 13 | > & { 14 | url: string; 15 | ua: string; 16 | }; 17 | 18 | export type PostRenderResponse = PostRenderSuccess | Res500; 19 | 20 | export interface PostRenderSuccess { 21 | /** 22 | * HTTP Code of the rendered page. 23 | */ 24 | statusCode: number | null; 25 | 26 | /** 27 | * HTTP Headers of the rendered page. 28 | */ 29 | headers: Record; 30 | 31 | /** 32 | * Body of the rendered page. 33 | */ 34 | body: string | null; 35 | 36 | /** 37 | * Metrics from different tasks during the rendering. 38 | */ 39 | metrics: Metrics; 40 | 41 | /** 42 | * The redirection renderscript caught.
43 | */ 44 | resolvedUrl: string | null; 45 | 46 | /** 47 | * Has the page reached timeout? 48 | * When timeout has been reached we continue the rendering as usual 49 | * but reduce other timeout to a minimum. 50 | */ 51 | timeout: boolean; 52 | 53 | /** 54 | * Any error encountered along the way. 55 | * If this field is filled that means the rest of the payload is partial. 56 | */ 57 | error: HandledError | UnhandledError | null; 58 | rawError: { message: string; stack?: string } | null; 59 | } 60 | -------------------------------------------------------------------------------- /src/api/@types/responses.ts: -------------------------------------------------------------------------------- 1 | export interface Res500 { 2 | error: string; 3 | } 4 | -------------------------------------------------------------------------------- /src/api/constants.ts: -------------------------------------------------------------------------------- 1 | export const HEADERS_TO_FORWARD = process.env.HEADERS_TO_FORWARD 2 | ? 
/**
 * Replace the hostname of `url` when it is exactly `from`, keeping the port.
 *
 * The URL is modified in place and the same instance is returned.
 * Hosts that merely start with `from` (e.g. `localhost.example.com` when
 * `from` is `localhost`) are left untouched.
 *
 * @param url - URL whose host may be rewritten (mutated).
 * @param from - Hostname to look for, matched literally.
 * @param to - Hostname to put in its place.
 * @returns The same `url` instance.
 */
export function replaceHost(url: URL, from: string, to: string): URL {
  // `from` is a plain hostname, not a pattern: escape regex metacharacters so
  // that the dots of `host.docker.internal` only match literal dots.
  const escapedFrom = from.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const fromRegex = new RegExp(`^${escapedFrom}(:|$)`);
  const host = url.host || '';
  // eslint-disable-next-line no-param-reassign
  url.host = host.replace(fromRegex, `${to}$1`);
  return url;
}
/**
 * HTTP API: wires the Express application (middlewares, public routes and,
 * outside production, the private test routes) onto a Node `http.Server`.
 */
export class Api {
  server: http.Server;
  private _app: express.Express;
  private _csrfProtection: express.RequestHandler;

  constructor() {
    this._csrfProtection = csurf({
      cookie: { maxAge: 120, sameSite: 'strict' },
    });
    this._app = express();
    this.server = http.createServer(this._app);
  }

  /**
   * Register middlewares and routes, then start listening on `port`.
   */
  start(port: number): void {
    this._setup();
    this._routes();
    if (process.env.NODE_ENV !== 'production') {
      this._privateRoutes();
    } else {
      this._app.get('/', root);
    }

    // 404
    this._app.use('*', (req, res) => {
      res.status(404).json({
        status: 404,
        error: 'Endpoint not found',
        code: 'not_found',
      });
    });

    // Error handler.
    // Express recognises error-handling middleware by its arity: the function
    // MUST declare four parameters, otherwise it is registered as a regular
    // middleware and skipped whenever an error is forwarded. `_next` is
    // therefore required even though it is not called.
    this._app.use(
      (
        err: any,
        req: express.Request,
        res: express.Response,
        _next: express.NextFunction
      ) => {
        if (err?.code !== 'EBADCSRFTOKEN') {
          res.status(500).json({
            status: 500,
            error: 'Internal Server Error',
            code: 'internal_server_error',
          });
          return;
        }

        // CSRF token errors
        res.status(403).json({
          status: 403,
          error: 'The form has expired',
          code: 'form_expired',
        });
      }
    );

    this.server.listen(port, () => {
      log.info(`Ready http://localhost:${port}`);
    });
  }

  /**
   * Stop accepting connections; `cb` is passed to `server.close`.
   */
  stop(cb: () => any): void {
    this.server.close(cb);
  }

  /**
   * Global middlewares: body parsing, request logging, cookies and views.
   */
  private _setup(): void {
    const jsonParser = json({ limit: '1mb' });
    this._app.disable('x-powered-by');

    this._app.use(urlencoded({ limit: '1mb', extended: true }));
    // Wrap the JSON parser so that a malformed body yields a JSON 400
    // response instead of being forwarded to the error handler.
    this._app.use((req, res, next) => {
      return jsonParser(req, res, (err) => {
        if (!err) {
          return next();
        }

        return res.status(400).json({
          status: 400,
          error: `Invalid json: ${err.message}`,
          code: 'invalid_json',
        });
      });
    });

    this._app.use(requestLogger);
    this._app.use(cookieParser());
    this._app.set('views', path.join(projectRoot, '/public/views'));
    this._app.set('view engine', 'ejs');
  }

  /**
   * Public routes, always registered.
   */
  private _routes(): void {
    this._app
      .get('/ready', ready)
      .get('/healthy', healthy)
      .get('/list', list)
      .get('/render', routeRender.validate, routeRender.render)
      .post('/render', routeRender.validate, routeRender.renderJSON)
      .post('/login', routeLogin.validate, routeLogin.processLogin);
  }

  /**
   * Routes registered only when NODE_ENV is not `production`:
   * static test website and the fake login forms.
   */
  private _privateRoutes(): void {
    this._app.use(expressStatic(path.join(projectRoot, '/public')));

    this._app.get('/301', (req, res) =>
      res.redirect(301, '/test-website/basic.html')
    );

    // Login form with CSRF protection
    this._app
      .get('/secure/login', this._csrfProtection, getLogin)
      .post('/secure/login', this._csrfProtection, postLogin)
      .get('/secure/test', getTest)

      // 2-steps login form with CSRF protection
      .get('/secure/login/step1', this._csrfProtection, getStep1)
      .post('/secure/login/step2', this._csrfProtection, postStep2)
      .get('/secure/login/2steps', this._csrfProtection, getTwoSteps);
  }
}
/**
 * Health endpoint handler.
 * Publishes the current task/page gauges and the `renderscript.up` check,
 * reports when the manager is unhealthy because of old tasks, then answers
 * 200 (ready) or 503 (not ready) with the same counters as JSON.
 */
export function healthy(
  req: express.Request,
  res: express.Response
): void {
  const { ready, oldTasks } = tasksManager.getHealth();
  const tasksRunning = tasksManager.currentConcurrency;

  // Sum the pages open in every browser; a browser that is not set counts as 0
  let pagesOpen = 0;
  for (const browser of tasksManager.currentBrowsers.values()) {
    pagesOpen += browser?.getCurrentConcurrency() || 0;
  }
  const totalRun = tasksManager.totalRun;

  // Those stats could be computed from .task.count
  // but we want to double check that no task or tab has been forgotten
  stats.gauge('renderscript.tasks.running', tasksRunning);
  stats.gauge('renderscript.pages.open', pagesOpen);
  const upStatus = ready ? stats.CHECKS.OK : stats.CHECKS.CRITICAL;
  stats.check('renderscript.up', upStatus, { hostname });

  const hasOldTasks = oldTasks.length > 0;
  if (!ready && hasOldTasks) {
    report(new Error('Reporting not healthy'), {
      tasks: oldTasks,
      max: UNHEALTHY_TASK_TTL,
      tasksRunning,
      pagesOpen,
      totalRun,
    });
  }

  const httpCode = ready ? 200 : 503;
  res.status(httpCode).json({ ready, tasksRunning, pagesOpen, totalRun });
}
/**
 * Answer with the URL of every page currently open, grouped by engine.
 * Handy to spot pages that were never closed.
 */
export function list(
  req: express.Request,
  res: express.Response
): void {
  const open: { [engine: string]: string[] } = {
    chromium: [],
    firefox: [],
  };

  for (const [engine, browser] of tasksManager.currentBrowsers) {
    // Engine not set: keep its (empty) list
    if (!browser) {
      continue;
    }

    for (const ctx of browser.instance!.contexts()) {
      for (const page of ctx.pages()) {
        open[engine].push(page.url());
      }
    }
  }

  res.status(200).json({ open });
}
renderHTML, waitTime, browser } = req.body; 41 | const headersToForward = getForwardedHeadersFromRequest(req); 42 | const url = new URL(buildUrl(req.body.url)); 43 | 44 | try { 45 | const task = await tasksManager.task( 46 | new LoginTask({ 47 | url: new URL(url), 48 | headersToForward, 49 | userAgent: ua, 50 | login: { 51 | username, 52 | password, 53 | }, 54 | browser, 55 | renderHTML, 56 | waitTime, 57 | }) 58 | ); 59 | 60 | if (renderHTML) { 61 | res 62 | .status(200) 63 | .header('Content-Type', 'text/html') 64 | .header('Content-Security-Policy', CSP_HEADERS) 65 | .send(task.body); 66 | return; 67 | } 68 | 69 | const resolvedUrl = revertUrl(task.resolvedUrl)?.href || null; 70 | const code = 71 | task.error && 72 | retryableErrors.includes(task.error) && 73 | task.error !== 'redirection' 74 | ? 500 75 | : 200; 76 | 77 | res.status(code).json({ 78 | headers: task.headers, 79 | metrics: task.metrics, 80 | statusCode: task.statusCode, 81 | timeout: task.timeout, 82 | error: task.error, 83 | cookies: task.cookies, 84 | resolvedUrl, 85 | body: task.body, 86 | rawError: task.rawError 87 | ? 
/**
 * Render the `login` view with the base URL and a fresh CSRF token.
 */
export function getLogin(req: Request, res: Response): void {
  const locals = {
    baseUrl: req.baseUrl,
    csrfToken: req.csrfToken(),
  };
  res.render('login', locals);
}
/**
 * Second step of the 2-steps login: render the `login-step2` view, carrying
 * over the username posted by the first step along with a fresh CSRF token.
 */
export function postStep2(req: Request, res: Response): void {
  const username = req.body.username;
  const locals = {
    baseUrl: req.baseUrl,
    csrfToken: req.csrfToken(),
    username,
  };
  res.render('login-step2', locals);
}
200 : 503).json({ ready: isHealthy }); 8 | } 9 | -------------------------------------------------------------------------------- /src/api/routes/render.ts: -------------------------------------------------------------------------------- 1 | import type express from 'express'; 2 | 3 | import { report } from '../../helpers/errorReporting'; 4 | import { retryableErrors } from '../../lib/helpers/errors'; 5 | import { tasksManager } from '../../lib/singletons'; 6 | import { RenderTask } from '../../lib/tasks/Render'; 7 | import type { 8 | PostRenderParams, 9 | PostRenderResponse, 10 | } from '../@types/postRender'; 11 | import type { Res500 } from '../@types/responses'; 12 | import { CSP_HEADERS } from '../constants'; 13 | import { getDefaultParams, alt } from '../helpers/alt'; 14 | import { buildUrl, revertUrl } from '../helpers/buildUrl'; 15 | import { badRequest } from '../helpers/errors'; 16 | import { getForwardedHeadersFromRequest } from '../helpers/getForwardedHeaders'; 17 | 18 | export async function validate( 19 | req: express.Request, 20 | res: express.Response, 21 | next: express.NextFunction 22 | ): Promise { 23 | const errors = await alt(getDefaultParams()) 24 | .body(req.method === 'GET' ? 
/**
 * GET /render handler: render the page given in the query string and answer
 * with its HTML.
 *
 * - 307 with a `Location` header when the task resolved to another URL.
 * - 400 with `{ error }` when the task reported an error.
 * - Otherwise the page status code and body, served as `text/html` with the
 *   restrictive CSP header.
 * - 500 with `{ error }` (and a report) when the task itself throws.
 */
export async function render(
  req: express.Request,
  res: express.Response
): Promise<void> {
  const { url: rawUrl, ua, waitTime, adblock, browser } = req.query;
  const headersToForward = getForwardedHeadersFromRequest(req);
  const url = new URL(buildUrl(rawUrl));

  try {
    const renderTask = new RenderTask({
      url,
      headersToForward,
      userAgent: ua,
      browser,
      waitTime,
      adblock,
    });
    const result = await tasksManager.task(renderTask);

    // The page ended up somewhere else: let the client follow the redirection
    if (result.resolvedUrl && result.resolvedUrl !== url.href) {
      const reverted = revertUrl(result.resolvedUrl);
      const location = reverted?.href || url.href;
      res.status(307).header('Location', location).send();
      return;
    }

    if (result.error) {
      res.status(400).json({ error: result.error });
      return;
    }

    res
      .status(result.statusCode!)
      .header('Content-Type', 'text/html')
      .header('Content-Security-Policy', CSP_HEADERS)
      .send(result.body);
  } catch (err: any) {
    res.status(500).json({
      error: err.message,
    });
    report(err, { type: 'render', url: rawUrl, browser });
  }
}
/**
 * Root endpoint: answer 200 with the running version,
 * read from the `VERSION` environment variable (`dev` when not set or empty).
 */
export function root(
  req: express.Request,
  res: express.Response
): void {
  const version = process.env.VERSION || 'dev';
  res.status(200).json({ version });
}
/**
 * Report an error.
 *
 * Without `SENTRY_DSN` the error and its extra data are only printed with
 * `console.error`. Otherwise the message is logged and the exception is sent
 * to Sentry with the given extra data and tags.
 *
 * @param err - The error to report.
 * @param extra - Arbitrary context attached to the report.
 * @param tags - Key/value tags attached to this report.
 */
export function report(
  err: Error,
  extra: any = {},
  tags: SentryTag[] = []
): void {
  if (!process.env.SENTRY_DSN) {
    console.error({ err, extra });
    return;
  }

  log.error(err.message, extra);
  Sentry.withScope((scope) => {
    // Set tags on the temporary scope handed to us rather than through the
    // global `Sentry.setTag`, so they are explicitly tied to this one event.
    tags.forEach((tag) => {
      scope.setTag(tag.key, tag.value);
    });

    scope.setExtras(extra);
    Sentry.captureException(err);
  });
}
/**
 * Shut everything down once, then exit the process.
 *
 * Runs, in order: API + tasks manager close, error-reporting drain, stats
 * client close. A failure in any step is logged and skips the remaining
 * steps. The logger is then flushed and the process exits with code 0 on the
 * next tick, whether or not a step failed.
 */
export async function gracefulClose(opts: Params): Promise<void> {
  // If we receive multiple signals, only the first one triggers the shutdown
  if (gracefullyClosing) {
    return;
  }
  gracefullyClosing = true;

  log.info('Starting graceful close...');

  // Executed strictly one after the other
  const shutdownSteps: Array<() => Promise<unknown>> = [
    () => close(opts),
    () => reporting.drain(),
    () => stats.close(),
  ];

  try {
    for (const step of shutdownSteps) {
      await step();
    }
  } catch (err) {
    log.error('Graceful exit failed', err);
  }

  log.flush();

  nextTick(() => {
    // eslint-disable-next-line no-process-exit
    process.exit(0);
  });
}
/**
 * Error used to reject when the timeout of `promiseWithTimeout` fires first.
 */
export class PromiseWithTimeoutError extends Error {}

/**
 * Settle like `promise`, unless it takes longer than `timeout` milliseconds,
 * in which case reject with a `PromiseWithTimeoutError`.
 *
 * The timer is always cleared once the race is settled. The given promise is
 * not cancelled when the timeout wins.
 *
 * @param promise - The promise to race against the timer.
 * @param timeout - Maximum wait, in milliseconds.
 * @returns The value `promise` resolved with.
 */
export async function promiseWithTimeout<T>(
  promise: Readonly<Promise<T>>,
  timeout: number
): Promise<T> {
  let timer: ReturnType<typeof setTimeout> | undefined;

  // Never resolves: it can only reject, once the delay has elapsed
  const deadline = new Promise<never>((_resolve, reject) => {
    const expire = (): void => {
      reject(new PromiseWithTimeoutError('Renderscript Controlled Timeout'));
    };
    timer = setTimeout(expire, timeout);
  });

  try {
    const winner = await Promise.race([promise, deadline]);
    return winner;
  } finally {
    clearTimeout(timer);
  }
}
// waitForNavigation({ waitUntil: 'networkidle' }) or waitForLoadState('networkidle')
// can be flaky and return too soon:
// https://github.com/microsoft/playwright/issues/4664#issuecomment-742691215
// https://github.com/microsoft/playwright/issues/2515#issuecomment-724163391
// This helper permits to manually wait, if the page still has pending requests.
/**
 * Poll `page.pendingRequests` until it drops to 0 or `timeout` ms have elapsed.
 *
 * The page is checked at most once per second. The last sleep is shortened to
 * the time left, so the function does not run past `timeout` by a whole
 * polling interval.
 *
 * @param page - Page whose pending requests counter is watched.
 * @param timeout - Maximum total wait, in milliseconds.
 */
export async function waitForPendingRequests(
  page: BrowserPage,
  timeout: number
): Promise<void> {
  const pollIntervalMs = 1000;
  const startTime = Date.now();

  while (page.pendingRequests > 0) {
    const elapsed = Date.now() - startTime;
    // Written as a negated `<` so that a NaN timeout stops the loop
    if (!(elapsed < timeout)) {
      break;
    }

    log.debug(
      { pageUrl: page.ref?.url() },
      `Waiting for ${page.pendingRequests} requests to complete... Wait time:${
        elapsed
      }, timeout: ${timeout}`
    );
    // Never sleep longer than what is left of the budget
    await setTimeout(Math.min(pollIntervalMs, timeout - elapsed));
  }
}
/**
 * Keeps one Browser per engine (chromium, firefox), registers the tasks
 * currently being executed and exposes a health status computed from them.
 */
export class TasksManager {
  #chromium: Browser | null = null;
  #firefox: Browser | null = null;
  // Starts as `true`: only launch() flips it to `false`
  #stopping: boolean = true;
  #tasks: Map<string, TaskObject> = new Map();
  #totalRun: number = 0;

  /**
   * Compute the health of the manager.
   *
   * @returns `ready: false` when stopping, when at least one task is older
   * than UNHEALTHY_TASK_TTL (listed in `oldTasks`), or when a browser is
   * missing or not ready.
   */
  getHealth(): { ready: boolean; reason?: string; oldTasks: string[][] } {
    const oldTasks: any[][] = [];

    if (this.#stopping) {
      return { ready: false, reason: 'stopping', oldTasks };
    }

    // Tasks lifecycle
    this.#tasks.forEach((task) => {
      const duration = Date.now() - task.ref.createdAt!.getTime();
      if (duration < UNHEALTHY_TASK_TTL) {
        return;
      }
      oldTasks.push([
        duration,
        task.ref.id,
        task.ref.params.url.href,
        JSON.stringify(task.ref.results),
        JSON.stringify(task.ref.metrics),
        task.ref.isDone,
      ]);
    });

    if (oldTasks.length > 0) {
      return { ready: false, reason: 'oldTasks', oldTasks };
    }

    if (this.#chromium && this.#firefox) {
      return {
        ready: this.#chromium.isReady && this.#firefox.isReady,
        reason: `browser(s) not ready: chromium: ${
          this.#chromium.isReady ? '✅' : '❌'
        } ; firefox: ${this.#firefox.isReady ? '✅' : '❌'}`,
        oldTasks,
      };
    }

    return { ready: false, oldTasks };
  }

  /** Current browser instances, keyed by engine name. */
  get currentBrowsers(): Map<BrowserEngine, Browser | null> {
    return new Map([
      ['chromium', this.#chromium],
      ['firefox', this.#firefox],
    ]);
  }

  /** Number of tasks currently registered. */
  get currentConcurrency(): number {
    return this.#tasks.size;
  }

  /** Number of tasks registered since the manager was created. */
  get totalRun(): number {
    return this.#totalRun;
  }

  /**
   * Create both browsers (chromium first, then firefox) and mark the
   * manager as no longer stopping.
   */
  async launch(): Promise<void> {
    const chromium = new Browser('chromium');
    await chromium.create();
    const firefox = new Browser('firefox');
    await firefox.create();

    this.#chromium = chromium;
    this.#firefox = firefox;
    this.#stopping = false;
    log.info('Ready');
  }

  /**
   * Register and execute a task.
   *
   * @throws Error when the node is unhealthy and the browser requested by
   * the task is not ready.
   */
  async task(task: Task): Promise<TaskFinal> {
    const health = this.getHealth();
    if (!health.ready) {
      // The process can be marked as not ready because one of the browsers is not up
      // If we receive a job for a browser that is ready, only report and process it.
      if (
        (!task.params.browser || task.params.browser === 'chromium') &&
        this.#chromium?.isReady
      ) {
        report(new Error('Unhealthy node received a job but can process it'), {
          url: task.params.url,
          browser: 'chromium',
          reason: health.reason,
        });
      } else if (task.params.browser === 'firefox' && this.#firefox?.isReady) {
        report(new Error('Unhealthy node received a job but can process it'), {
          url: task.params.url,
          browser: 'firefox',
          reason: health.reason,
        });
      } else {
        throw new Error(`Unhealthy node received a job: ${health.reason}`);
      }
    }

    try {
      const promise = this.#exec(task);
      this.#totalRun += 1;
      this.#tasks.set(task.id, {
        ref: task,
        promise,
      });

      return await promise;
    } finally {
      this.#tasks.delete(task.id);
    }
  }

  /**
   * Stop the task manager.
   * Waits for the registered tasks to settle, then stops both browsers.
   */
  async stop(): Promise<void> {
    this.#stopping = true;
    log.info('[Manager] stopping...');

    // We wait for all tasks to finish before closing
    const promises: Array<Promise<void>> = [];
    this.#tasks.forEach((task) => {
      promises.push(this.#removeTask(task.ref.id));
    });
    await Promise.all(promises);

    this.#tasks.clear();

    // Always attempt to stop both browsers: a failure while stopping chromium
    // must not leave firefox running. The error still propagates afterwards.
    try {
      await this.#chromium?.stop();
    } finally {
      this.#chromium = null;
      try {
        await this.#firefox?.stop();
      } finally {
        this.#firefox = null;
      }
    }
  }

  /**
   * Actual execution of a task.
   * It picks the browser matching the task, creates the task context,
   * processes the task (render, login), saves metrics and closes the task.
   * Errors raised while processing are stored on `task.results`; only the
   * "stopping" and "no_browser" pre-checks throw.
   */
  async #exec(task: Task): Promise<TaskFinal> {
    if (this.#stopping) {
      throw new Error('Task can not be executed: stopping');
    }

    const engine: BrowserEngine = task.params.browser || 'chromium';
    const browser = engine === 'firefox' ? this.#firefox : this.#chromium;
    if (!browser || !browser.isReady) {
      throw new Error('Task can not be executed: no_browser');
    }

    const id = task.id;
    const url = task.params.url.href;
    const type = task.constructor.name;
    log.info('Processing', { id, url, type });

    const start = Date.now();

    try {
      await task.createContext(browser);
      await task.process();
    } catch (err: any) {
      /* eslint-disable no-param-reassign */
      if (!(err instanceof ErrorIsHandledError)) {
        task.results.error = task.results.error || cleanErrorMessage(err);
        task.results.rawError = err;
        report(err, { url }, [
          {
            key: RENDERSCRIPT_TASK_URL_TAG,
            value: url,
          },
          {
            key: RENDERSCRIPT_TASK_TYPE_TAG,
            value: type,
          },
        ]);
      }
      /* eslint-enable no-param-reassign */
    }

    try {
      await task.saveMetrics();
    } catch (err: any) {
      // Task itself should never break the whole execution
      report(err, { url });
    }

    // No matter what happen we want to kill everything gracefully
    try {
      await task.close();
      this.#tasks.delete(id);
    } catch (err: any) {
      // Don't let close errors crash the process.
      // Guard the message access: a rejection is not guaranteed to be an Error.
      const message: string =
        typeof err?.message === 'string' ? err.message : String(err);
      if (RESPONSE_IGNORED_ERRORS.some((msg) => message.includes(msg))) {
        // Expected error when browser is already closed
        log.debug('Expected close error', { err: message, url });
      } else {
        report(new Error('Error during close'), { err, url });
      }
    }

    // ---- Reporting
    const total = Date.now() - start;
    stats.timing('renderscript.task', total, undefined, { type });

    if (task.metrics.page) {
      const mp = task.metrics.page;
      /* eslint-disable prettier/prettier */
      stats.timing(`renderscript.task.download`, mp.timings.download!);
      stats.histogram(`renderscript.task.requests`, mp.requests.total);
      stats.increment(`renderscript.task.requests.amount`, mp.requests.total);
      stats.histogram(`renderscript.task.blockedRequests`, mp.requests.blocked);
      stats.increment(`renderscript.task.blockedRequests.amount`, mp.requests.blocked);
      stats.increment(`renderscript.task.contentLength.amount`, mp.contentLength.main);
      stats.histogram(`renderscript.task.contentLength`, mp.contentLength.main);
      stats.increment(`renderscript.task.contentLengthTotal.amount`, mp.contentLength.total);
      stats.histogram(`renderscript.task.contentLengthTotal`, mp.contentLength.total);
      /* eslint-enable prettier/prettier */
    }

    log.info(
      { id, url, code: task.results.error, metrics: task.metrics },
      'Done'
    );
    const res = task.results;
    return {
      ...res,
      timeout: task.page?.hasTimeout || false,
      metrics: task.metrics,
    };
  }

  /**
   * Wait for a registered task to settle, whatever its outcome.
   *
   * @throws Error when no task is registered under this id.
   */
  async #removeTask(id: string): Promise<void> {
    const task = this.#tasks.get(id);
    if (!task) {
      throw new Error(`Could not find task: ${id}`);
    }

    try {
      await task.promise;
    } catch (err) {
      // Deliberately ignored: we only wait for the task to settle
    }
  }
}
/**
 * Dead simple adblocking by exact hostname.
 */
export class Adblocker {
  #hostnames: Set<string> = new Set();

  /**
   * Load the hostnames from `adblock_hosts.txt` (one hostname per line,
   * lines starting with `#` are comments).
   * A failure is reported, never thrown: the set is simply left as-is.
   */
  async load(): Promise<void> {
    try {
      const data = await fs.readFile(`${__dirname}/adblock_hosts.txt`, 'utf8');

      for (const rawLine of data.split(/[\r\n]+/)) {
        // Trim so surrounding whitespace can not prevent an exact match
        const line = rawLine.trim();

        // Skip blank lines (e.g. the one produced by a trailing newline),
        // otherwise the empty string would be registered as a hostname
        if (line === '' || line.startsWith('#')) {
          continue;
        }
        this.#hostnames.add(line);
      }

      log.info('Ready', {
        entries: this.#hostnames.size,
      });
    } catch (err: any) {
      report(new Error('Error while setting up adblocker'), { err });
    }
  }

  /**
   * @returns `true` when the hostname of the URL is in the loaded list.
   */
  match(url: URL): boolean {
    return this.#hostnames.has(url.hostname);
  }
}
/**
 * Create a Playwright instance.
 * A launch failure is reported, not thrown. In that case the browser is
 * NOT marked as ready and "Ready" is not logged.
 */
async create(): Promise<void> {
  log.info(`Creating ${this.#engine}...`, { id: this.#id });

  const env: { [s: string]: string } = {};
  if (process.env.DISPLAY) {
    env.DISPLAY = process.env.DISPLAY;
  }

  const start = Date.now();
  let launched = false;
  try {
    const browser = this.#engine === 'firefox' ? firefox : chromium;
    this.#browser = await browser.launch({
      headless: true,
      env,
      handleSIGINT: false,
      handleSIGHUP: false,
      handleSIGTERM: false,
      args: flags,
    });
    this.#browser.on('disconnected', () => {
      // A disconnection we did not ask for: report and relaunch
      if (!this.#stopping) {
        this.#ready = false;
        report(
          new Error(
            `Browser disconnected (engine: ${this.#engine}). Relaunching...`
          )
        );
        this.create();
      }
    });
    launched = true;
  } catch (e: any) {
    report(e, { browser: this.#engine });
  }
  stats.timing('renderscript.create', Date.now() - start, {
    browser: this.#engine,
  });

  if (!launched) {
    // Do not claim readiness for a browser that failed to launch
    return;
  }

  this.#ready = true;
  log.info('Ready', { id: this.#id, browser: this.#engine });
}

/**
 * Mark the browser as stopping (so the 'disconnected' handler does not
 * relaunch it) and close it.
 */
async stop(): Promise<void> {
  this.#stopping = true;
  await this.#browser?.close();
}

/**
 * @returns The number of pages currently open across all contexts,
 * or 0 when there is no browser.
 */
getCurrentConcurrency(): number {
  if (!this.#browser) {
    return 0;
  }

  return this.#browser.contexts().reduce((i, ctx) => {
    return i + ctx.pages().length;
  }, 0);
}

/**
 * Create a new browser context with the default options below;
 * any key present in `opts` overrides the default.
 *
 * @throws Error when the browser is missing or disconnected.
 */
async getNewContext(opts: BrowserContextOptions): Promise<BrowserContext> {
  if (!this.#browser?.isConnected()) {
    throw new Error(`No browser available (engine=${this.#engine})`);
  }

  const start = Date.now();
  const ctx = await this.#browser!.newContext({
    acceptDownloads: false,
    bypassCSP: false,
    hasTouch: false,
    isMobile: false,
    javaScriptEnabled: true,
    locale: 'en-GB',
    timezoneId: 'Europe/Paris',
    offline: false,
    permissions: [],
    userAgent: 'Algolia Crawler Renderscript',
    viewport: { height: HEIGHT, width: WIDTH },
    extraHTTPHeaders: {
      'Accept-Encoding': 'gzip, deflate',
      Accept:
        'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    },
    ...opts,
  });
  stats.timing('renderscript.context.create', Date.now() - start);

  return ctx;
}
/** Underlying Playwright page, `undefined` before create() and after close(). */
get ref(): Page | undefined {
  return this.#ref;
}

/** Browser context given to the constructor. */
get context(): BrowserContext | undefined {
  return this.#context;
}

/** `true` when both the page and the context are set. */
get isReady(): boolean {
  return Boolean(this.#ref && this.#context);
}

/** `true` only when a page exists and reports itself as closed. */
get isClosed(): boolean {
  return this.#ref?.isClosed() === true;
}

/** `true` once throwIfNotTimeout() has seen a timeout error. */
get hasTimeout(): boolean {
  return this.#hasTimeout;
}

/** URL of the redirection recorded by setDisableNavigation(), if any. */
get redirection(): string | undefined {
  return this.#redirection;
}

/** First response recorded by the response handler, if any. */
get initialResponse(): Response | undefined {
  return this.#initialResponse;
}

/** Number of requests started but not yet finished or failed. */
get pendingRequests(): number {
  return this.#metrics.requests.pending;
}

constructor(context: BrowserContext, engine?: BrowserEngine) {
  this.#context = context;
  this.#engine = engine || DEFAULT_ENGINE;
}

/**
 * Create an empty page in a browser and register the listeners that
 * report crashes/popups and maintain the pending requests counter.
 */
async create(): Promise<void> {
  const start = Date.now();
  const page = await this.#context!.newPage();

  stats.timing('renderscript.page.create', Date.now() - start);
  this.#ref = page;

  page.on('crash', () => {
    // e.g: crash happen on OOM.
    report(new Error('Page crashed'), { pageUrl: page.url() });
  });
  page.on('popup', () => {
    report(new Error('Popup created'), { pageUrl: page.url() });
  });
  page.on('request', (req) => {
    log.debug('request_start', { url: req.url(), pageUrl: page.url() });
    this.#metrics.requests.pending += 1;
  });
  page.on('requestfailed', (req) => {
    log.debug('request_failed', { url: req.url(), pageUrl: page.url() });
    this.#metrics.requests.pending -= 1;
  });
  page.on('requestfinished', async (req) => {
    // This listener is async: anything thrown here would surface as an
    // unhandled rejection, so the logging part must never reject.
    try {
      if (log.isLevelEnabled('trace')) {
        const response = await req.response();
        log.trace('request_finished', {
          url: req.url(),
          pageUrl: page.url(),
          requestHeaders: req.headers(),
          responseStatus: response?.status(),
        });
      } else if (log.isLevelEnabled('debug')) {
        const response = await req.response();
        log.debug('request_finished', {
          url: req.url(),
          pageUrl: page.url(),
          responseStatus: response?.status(),
        });
      }
    } catch (err: any) {
      // Errors caused by the page being closed in the meantime are ignored
      const ignored = RESPONSE_IGNORED_ERRORS.some((msg) =>
        err?.message?.includes(msg)
      );
      if (!ignored) {
        report(err, { context: 'requestfinished', pageUrl: page.url() });
      }
    } finally {
      // The request is finished whatever happened while logging it
      this.#metrics.requests.pending -= 1;
    }
  });
}

/**
 * Close the page and drop its reference.
 * Only the page is closed here; this method does not touch the context.
 */
async close(): Promise<void> {
  await this.#ref?.close();
  this.#ref = undefined;
}
/**
 * Navigate to `url`, tolerating timeouts.
 *
 * A timeout makes Playwright's goto throw, yet the document may still be
 * usable: the first 'response' event is therefore kept as a fallback.
 *
 * @throws Error('goto_no_response') when no response was obtained at all.
 */
async goto(
  url: string,
  opts: Parameters<Page['goto']>[1]
): Promise<Response> {
  let response: Response | null = null;

  // Keeps only the first response seen
  const keepFirstResponse = (res: Response): void => {
    response = response || res;
  };
  this.#ref!.once('response', keepFirstResponse);

  const startedAt = Date.now();
  try {
    // Response can be assigned here or by the listener above
    response = await this.#ref!.goto(url, opts);
  } catch (err: any) {
    const expected =
      Boolean(this.redirection) || err.message.includes('ERR_ABORTED');
    if (!expected) {
      this.throwIfNotTimeout(err);
    }
  } finally {
    // No further response is wanted once goto has settled
    this.#ref!.removeListener('response', keepFirstResponse);
  }

  const tags = {
    success: response ? 'true' : 'false',
    waitUntil: opts?.waitUntil || 'unknown',
  };
  stats.timing('renderscript.page.goto', Date.now() - startedAt, undefined, tags);

  if (response) {
    return response;
  }

  // Can happen in case of chrome crash
  throw new Error('goto_no_response');
}
/**
 * Wait for the page to reach `opts.waitUntil`, then for a response whose
 * status is in the [200, 400) range. Timeouts are tolerated.
 *
 * @returns The matching response, else the first response observed while
 * waiting, else `null`.
 */
async waitForNavigation(opts: {
  timeout: number;
  waitUntil: Parameters<Page['waitForLoadState']>[0];
}): Promise<Response | null> {
  let response: Response | null = null;

  // Waiting will throw on timeout but we still want a response to work with
  const keepFirstResponse = (res: Response): void => {
    response = response || res;
  };
  this.#ref!.once('response', keepFirstResponse);

  const isSuccessOrRedirect = (res: Response): boolean => {
    const status = res.status();
    return status >= 200 && status < 400;
  };

  try {
    if (this.#ref) {
      await this.#ref.waitForLoadState(opts.waitUntil, opts);
      response = await this.#ref.waitForResponse(isSuccessOrRedirect, opts);
    }
  } catch (err: any) {
    this.throwIfNotTimeout(err);
  } finally {
    // No further response is wanted once the wait has settled
    this.#ref!.removeListener('response', keepFirstResponse);
  }

  return response;
}
/**
 * Get performance metrics from the page.
 * This function can fail silently because it's non-critical resource.
 * If that happens it will return the previous metrics.
 *
 * @returns The metrics object of this page, updated when the evaluation
 * succeeded.
 */
async saveMetrics(): Promise<PageMetrics> {
  try {
    if (!this.#ref || this.#ref.isClosed()) {
      // page has been closed or not yet open
      return this.#metrics;
    }

    // Only serialize what is read below: dumping every performance entry
    // is wasted work inside a 200ms budget.
    const evaluate = await promiseWithTimeout(
      this.#ref!.evaluate(() => {
        return JSON.stringify({
          curr: performance.getEntriesByType('navigation')[0],
          // @ts-expect-error only exists in chromium
          mem: performance.memory || {},
        });
      }),
      200
    );

    if (!evaluate) {
      throw new Error('Getting perf error');
    }
    const perf: Perf = JSON.parse(evaluate);

    // `curr` is absent from the JSON when the page has no navigation entry
    this.#metrics.timings.download = Math.round(perf.curr?.duration || 0);
    this.#metrics.mem = {
      jsHeapUsedSize: perf.mem.usedJSHeapSize || 0,
      jsHeapTotalSize: perf.mem.totalJSHeapSize || 0,
    };
  } catch (err: any) {
    if (!METRICS_IGNORED_ERRORS.some((msg) => err.message.includes(msg))) {
      report(new Error('Error saving metrics'), { err });
    }
  }

  return this.#metrics;
}
/**
 * Serialize the page content as it is at the moment of the call.
 *
 * @param silent - When true, errors are reported instead of thrown.
 * Timeout errors are always reported, never thrown.
 * @returns The content, or `null` when it is empty or could not be read.
 */
async renderBody(
  { silent }: { silent: boolean } = { silent: false }
): Promise<string | null> {
  const readContent = async (): Promise<string | null> => {
    const startedAt = Date.now();
    const html = await this.#ref?.content();
    stats.timing('renderscript.renderBody', Date.now() - startedAt, {
      browser: this.#engine as string,
    });
    return html || null;
  };

  try {
    // this is the most important part so we try hard
    return await promiseWithTimeout(readContent(), 10000);
  } catch (err: any) {
    const isTimeout = err instanceof PromiseWithTimeoutError;
    if (!isTimeout && !silent) {
      throw err;
    }

    report(err, {
      url: this.ref?.url(),
      browser: this.#engine,
      action: 'renderBody',
    });
    return null;
  }
}

/**
 * Add the cookies found in the forwarded `cookie` header to the context.
 * A failure to add them is reported, not thrown.
 */
async setCookies({ url, headersToForward }: TaskBaseParams): Promise<void> {
  const pairs = headersToForward!.cookie.split('; ');
  const cookies = pairs.map((pair) => {
    // Everything after the first "=" belongs to the value
    const separator = pair.indexOf('=');
    const name = separator === -1 ? pair : pair.slice(0, separator);
    const value = separator === -1 ? '' : pair.slice(separator + 1);
    return { domain: url.hostname, path: '/', name, value };
  });

  try {
    await this.#context!.addCookies(cookies);
  } catch (err) {
    report(new Error('Failed to set cookie'), { err, url });
  }
}

/**
 * Disable service workers, this is recommended.
 * Removes `navigator.serviceWorker` through an init script and reports
 * any worker that still gets created.
 */
async setDisableServiceWorker(): Promise<void> {
  const removeServiceWorker = (): void => {
    // @ts-expect-error read-only prop
    delete window.navigator.serviceWorker;
  };
  await this.#context!.addInitScript(removeServiceWorker);

  this.#ref!.on('worker', () => {
    const pageUrl = this.#ref!.url();
    report(new Error('WebWorker disabled but created'), { pageUrl });
  });
}
/**
 * Disable navigation. Only opt-in because Login requires navigation.
 * Playwright can not cancel a redirection directly, so this is not
 * bulletproof: the request will most likely be interrupted, but it can
 * still have time to reach the backend.
 *
 * @param originalUrl - URL the page is expected to stay on.
 * @param onNavigation - Called with the target URL (hash removed) of any
 * main frame navigation to another URL.
 */
setDisableNavigation(
  originalUrl: string,
  onNavigation: (url: string) => Promise<void>
): void {
  this.#ref?.on('framenavigated', async (frame) => {
    const destination = new URL(frame.url());
    destination.hash = '';
    const to = destination.href;

    // Ignored: same page, sub frames, and chrome's error page (page crashed)
    if (
      originalUrl === to ||
      frame.parentFrame() ||
      to === 'chrome-error://chromewebdata/'
    ) {
      return;
    }

    // Can happen that on('framenavigated') event comes before on('request')
    if (!this.#redirection) {
      this.#redirection = to;
    }

    await onNavigation(to);

    // We still report just in case.
    log.warn({ pageUrl: originalUrl, to }, 'Unexpected navigation');
  });

  this.#ref?.on('request', async (req) => {
    const destination = new URL(req.url());

    // Playwright does not route redirection to route() so we need to manually catch them
    const isMainFrame = req.frame().parentFrame() === null;
    const isNavigation = req.isNavigationRequest();

    if (!isNavigation || !isMainFrame || originalUrl === destination.href) {
      return;
    }

    // Compare again once the hash is removed
    destination.hash = '';
    const to = destination.href;
    if (originalUrl === to) {
      return;
    }

    log.info('Will navigate', { pageUrl: originalUrl, url: to });

    this.#redirection = to;
    await onNavigation(to);
  });
}
/**
 * Rethrow anything that is not a timeout error, so the response can be
 * reused easily. A timeout is recorded on the page and returned.
 */
throwIfNotTimeout(err: any): Error {
  const isTimeout = err instanceof Error && err.name === 'TimeoutError';
  if (!isTimeout) {
    throw err;
  }

  // This error is expected as most pages will reach timeout
  // we want to continue because we can still have a response
  this.#hasTimeout = true;
  return err;
}

/**
 * Build a generic request handler (route).
 * It aborts data URIs, sub frame requests, ignored resource types,
 * adblocked hostnames (when `adblock` is set) and URLs refused by
 * isURLAllowed; everything else continues. Navigation requests also
 * receive the forwarded headers.
 */
getOnRequestHandler({
  url,
  adblock,
  headersToForward,
}: TaskBaseParams): (route: Route) => Promise<void> {
  return async (route: Route): Promise<void> => {
    const req = route.request();
    const reqUrl = req.url();
    this.#metrics.requests.total += 1;

    try {
      if (this.#hasTimeout) {
        // If the page was killed in the meantime we don't want to process anything else
        await route.abort('blockedbyclient');
        return;
      }

      // Checks run in order and stop at the first match:
      // data URIs, iframes, ignored resources, adblocking, then ssrf
      // attempts (= page that redirects to localhost for example)
      const mustBlock =
        DATA_REGEXP.test(reqUrl) ||
        Boolean(req.frame().parentFrame()) ||
        IGNORED_RESOURCES.includes(req.resourceType()) ||
        (adblock && adblocker.match(new URL(reqUrl))) ||
        !(await isURLAllowed(reqUrl));

      if (mustBlock) {
        this.#metrics.requests.blocked += 1;
        await route.abort('blockedbyclient');
        return;
      }

      if (!req.isNavigationRequest()) {
        await route.continue();
        return;
      }

      const headers = await req.allHeaders();
      await route.continue({
        // headers ignore values set for `Cookie`, relies to page.setCookie instead
        headers: { ...headers, ...headersToForward },
      });
    } catch (err: any) {
      if (REQUEST_IGNORED_ERRORS.some((msg) => err.message.includes(msg))) {
        return;
      }

      report(err, {
        context: 'onRequest',
        url: url.href,
        with: reqUrl,
        browser: this.#engine,
      });
    }
  };
}

/**
 * Build a response handler that records the initial response and
 * accumulates content lengths into the page metrics.
 * Errors are reported, never thrown, as this runs in an event callback.
 */
getOnResponseHandler({
  url,
}: TaskBaseParams): (res: Response) => Promise<void> {
  const matchesAny = (needles: readonly string[], err: any): boolean =>
    needles.some((msg) => err.message.includes(msg));

  return async (res: Response): Promise<void> => {
    try {
      // If the page was killed or closed in the meantime we don't want to process anything else
      if (this.#hasTimeout || this.isClosed) {
        return;
      }

      // Check if response is still valid before accessing properties
      const stillValid = await res.request().response();
      if (!stillValid) {
        return;
      }

      const reqUrl = res.url();

      // Check if headers can be accessed safely
      let headers;
      try {
        headers = await res.allHeaders();
      } catch (headersErr: any) {
        if (matchesAny(REQUEST_IGNORED_ERRORS, headersErr)) {
          return;
        }
        throw headersErr;
      }

      // Store initial response in case of navigation
      if (!this.#initialResponse) {
        this.#initialResponse = res;
      }

      const declaredLength = headers['content-length'];
      let length = declaredLength ? parseInt(declaredLength, 10) : 0;

      // Redirections do not have a body
      const status = res.status();
      if (status > 300 && status < 400) {
        return;
      }

      try {
        if (length === 0 && !this.isClosed) {
          // Not every request has the content-length header, the byteLength match perfectly
          // but does not necessarily represent what was transferred (if it was gzipped for example)
          try {
            const body = await res.body();
            length = body.byteLength;
          } catch (bodyErr: any) {
            if (matchesAny(REQUEST_IGNORED_ERRORS, bodyErr)) {
              return;
            }
            throw bodyErr;
          }
        }

        if (reqUrl === url.href) {
          // If this is our original URL we log it to a dedicated metric
          this.#metrics.contentLength.main = length;
        }

        this.#metrics.contentLength.total += length;
      } catch (err: any) {
        if (matchesAny(RESPONSE_IGNORED_ERRORS, err)) {
          return;
        }

        // We can not throw in callback, it will go directly into unhandled
        report(err, { context: 'onResponse', pageUrl: url.href, reqUrl });
      }
    } catch (err: any) {
      if (matchesAny(RESPONSE_IGNORED_ERRORS, err)) {
        return;
      }
      report(err, { context: 'onResponseHandler', pageUrl: url.href });
    }
  };
}
/**
 * Look for a `<meta http-equiv="refresh">` element in the page.
 *
 * @param timeout - Timeout given to the element lookup.
 * @returns The redirection URL (resolved against the page URL) if found,
 * nothing otherwise.
 */
async checkForHttpEquivRefresh({
  timeout,
}: {
  timeout: number;
}): Promise<URL | void> {
  if (!this.#ref) {
    return;
  }

  try {
    const url = new URL(this.#ref.url());
    const metaRefreshElement = this.#ref.locator(
      'meta[http-equiv="refresh"]'
    );

    if ((await metaRefreshElement.count()) <= 0) {
      return;
    }

    const el = (await metaRefreshElement.elementHandle({ timeout }))!;
    const metaRefreshContent = await el.getProperty('content');
    const refreshContent = await metaRefreshContent?.jsonValue();

    // Accepts the usual variants: "0;url=x", "0; URL=x", "0 ; Url = x", "0,url=x".
    // The delay, a separator and a case-insensitive `url=` are required.
    const match = refreshContent?.match(
      /\d+(?:\s*[;,]\s*|\s+)url\s*=\s*(.*)/i
    );
    if (!match) {
      return;
    }

    // Sometimes URLs are surrounded by quotes (single or double)
    const matchedURL = match[1].replace(/['"]/g, '').trim();
    const redirectURL = new URL(matchedURL, url);

    log.debug('Meta refresh found', { redir: redirectURL.href });

    return redirectURL;
  } catch (err: any) {
    // Errors that cleanErrorMessage recognises are not reported
    if (err instanceof Error && cleanErrorMessage(err) !== 'unknown_error') {
      return;
    }
    report(new Error('Error while trying to check for meta refresh'), {
      err,
      timeout: this.#hasTimeout,
    });
  }
}
/**
 * Tracks how much of a time allowance (same unit as `Date.now()`,
 * i.e. milliseconds) has been spent between successive `consume()` calls.
 */
export class TimeBudget {
  // Total allowance
  max: number;
  // Sum of everything accounted for by consume()
  consumed: number = 0;
  // Timestamp of the last consume() call (or of the construction)
  lastConsumption: number = Date.now();

  constructor(max: number) {
    this.max = max;
  }

  /**
   * Consume budget.
   *
   * @returns Number - What was consumed compared to prev call.
   */
  consume(): number {
    // Read the clock once: with two reads, the time elapsed between them
    // would never be counted by any call.
    const now = Date.now();
    const consumed = now - this.lastConsumption;
    this.consumed += consumed;
    this.lastConsumption = now;
    return consumed;
  }

  /**
   * @returns What is left of the budget, never less than 1.
   */
  get(): number {
    // Not 0, because 0 === unlimited
    return Math.max(1, this.max - this.consumed);
  }

  /**
   * @returns What is left of the budget, but at least `min`
   * (even if the budget does not allow it).
   */
  min(min: number): number {
    return Math.max(min, this.get());
  }

  /**
   * @returns What is left of the budget clamped to [min, max];
   * `min` wins when it is greater than what is left.
   */
  getRange(min: number, max: number): number {
    return Math.max(min, Math.min(max, this.get()));
  }
}
| export const flags = [ 36 | // Disable sandboxing when not available 37 | '--no-sandbox', 38 | '--disable-setuid-sandbox', 39 | '--no-zygote', 40 | // No GPU available inside Docker 41 | '--disable-gpu', 42 | // Seems like a powerful hack, not sure why 43 | // https://github.com/Codeception/CodeceptJS/issues/561 44 | "--proxy-server='direct://'", 45 | '--proxy-bypass-list=*', 46 | // Disable cache 47 | // '--disk-cache-dir=/dev/null', 48 | '--media-cache-size=1', 49 | '--disk-cache-size=1', 50 | // Disable useless UI features 51 | '--disable-extensions', 52 | '--disable-features=Translate', 53 | '--disable-infobars', 54 | '--disable-notifications', 55 | '--disable-translate', 56 | '--no-default-browser-check', 57 | '--no-first-run', // screen on very first run 58 | '--noerrdialogs', 59 | '--disable-background-timer-throttling', 60 | '--disable-backgrounding-occluded-windows', 61 | '--disable-password-generation', 62 | '--disable-prompt-on-repos', 63 | '--disable-save-password-bubble', 64 | '--disable-single-click-autofill', 65 | '--disable-restore-session-state', 66 | '--disable-translate', 67 | '--disable-new-profile-management', 68 | '--disable-new-avatar-menu', 69 | '--disable-infobars', 70 | '--disable-device-discovery-notifications', 71 | '--disable-client-side-phishing-detection', 72 | '--disable-notifications', 73 | '--disable-component-extensions-with-background-pages', 74 | // Disable dev-shm 75 | // See https://github.com/GoogleChrome/puppeteer/blob/master/docs/troubleshooting.md#tips 76 | '--disable-dev-shm-usage', 77 | 78 | '--enable-automation', 79 | '--disable-print-preview', 80 | // https://github.com/cypress-io/cypress/issues/5132 81 | '--disable-ipc-flooding-protection', 82 | 83 | // Taken from https://github.com/cypress-io/cypress/blob/develop/packages/server/lib/browsers/chrome.ts 84 | // "--disable-background-networking" 85 | '--disable-web-resources', 86 | '--safebrowsing-disable-auto-update', 87 | 
'--safebrowsing-disable-download-protection', 88 | '--disable-client-side-phishing-detection', 89 | '--disable-component-update', 90 | '--disable-default-apps', 91 | 92 | // Crash reporter 93 | '--disable-breakpad', 94 | '--disable-crash-reporter', 95 | ]; 96 | -------------------------------------------------------------------------------- /src/lib/constants.ts: -------------------------------------------------------------------------------- 1 | import { PRIVATE_IP_PREFIXES } from '@algolia/dns-filter'; 2 | 3 | export const IP_PREFIXES_WHITELIST = process.env.IP_PREFIXES_WHITELIST 4 | ? process.env.IP_PREFIXES_WHITELIST.split(',') 5 | : ['127.', '0.', '::1']; 6 | 7 | export const RESTRICTED_IPS = 8 | process.env.ALLOW_LOCALHOST === 'true' 9 | ? PRIVATE_IP_PREFIXES.filter( 10 | (prefix: string) => !IP_PREFIXES_WHITELIST.includes(prefix) 11 | ) // relax filtering 12 | : PRIVATE_IP_PREFIXES; 13 | 14 | export const IGNORED_RESOURCES = [ 15 | 'font', 16 | 'image', 17 | 'media', 18 | 'websocket', 19 | 'manifest', 20 | 'texttrack', 21 | ]; 22 | 23 | export const DATA_REGEXP = /^data:/i; 24 | 25 | export const WAIT_TIME = { 26 | min: 500, 27 | max: 20000, 28 | }; 29 | 30 | export const MAX_WAIT_FOR_NEW_PAGE = process.env.MAX_WAIT_FOR_NEW_PAGE 31 | ? 
parseInt(process.env.MAX_WAIT_FOR_NEW_PAGE, 10) 32 | : 6000; // In feb 2022 p95 < 6s 33 | 34 | export const UNHEALTHY_TASK_TTL = (MAX_WAIT_FOR_NEW_PAGE + WAIT_TIME.max) * 3; 35 | -------------------------------------------------------------------------------- /src/lib/helpers/errors.ts: -------------------------------------------------------------------------------- 1 | import type { HandledError, UnhandledError } from '../types'; 2 | 3 | export const retryableErrors: Array = [ 4 | 'body_serialisation_failed', 5 | 'connection_error', 6 | 'fetch_aborted', 7 | 'fetch_timeout', 8 | 'no_cookies', 9 | 'no_response_after_login', 10 | 'page_closed_too_soon', 11 | 'page_crashed', 12 | 'timedout', 13 | 'unknown_error', 14 | 'error_reading_response', 15 | ]; 16 | 17 | // eslint-disable-next-line eslint-comments/disable-enable-pair 18 | /* eslint-disable complexity */ 19 | export function cleanErrorMessage(error: Error): HandledError | UnhandledError { 20 | if ( 21 | error.message.includes('ERR_NAME_NOT_RESOLVED') || 22 | error.message.includes('ERR_ADDRESS_UNREACHABLE') 23 | ) { 24 | return 'dns_error'; 25 | } 26 | if ( 27 | error.message.includes('ERR_CONNECTION_REFUSED') || 28 | error.message.includes('ERR_CONNECTION_ABORTED') || 29 | error.message.includes('ERR_CONNECTION_CLOSED') || 30 | error.message.includes('ERR_CONNECTION_FAILED') || 31 | error.message.includes('ERR_INTERNET_DISCONNECTED') || 32 | error.message.includes('ERR_CONNECTION_RESET') 33 | ) { 34 | return 'connection_error'; 35 | } 36 | if (error.message.includes('ERR_ABORTED')) { 37 | return 'fetch_aborted'; 38 | } 39 | if ( 40 | error.message.includes('ETIMEDOUT') || 41 | error.message.includes('ESOCKETTIMEDOUT') 42 | ) { 43 | return 'fetch_timeout'; 44 | } 45 | if ( 46 | error.message.includes('Navigation failed because page was closed') || 47 | error.message.includes('Target closed') || 48 | error.message.includes('Target page, context or browser has been closed') || 49 | error.message.includes('Target 
has been closed') || 50 | error.message.includes('Browser has been disconnected') 51 | ) { 52 | return 'page_closed_too_soon'; 53 | } 54 | if ( 55 | error.message.includes('goto_no_response') || 56 | error.message.includes('Navigation failed because page crashed') || 57 | error.message.includes('ERR_FAILED') || 58 | error.message.includes('Element is not attached to the DOM') 59 | ) { 60 | return 'page_crashed'; 61 | } 62 | if (error.message.includes('ERR_BLOCKED_BY_RESPONSE')) { 63 | return 'forbidden_by_website'; 64 | } 65 | if (error.message.includes('ERR_TIMED_OUT')) { 66 | // This is a generic error from playwright 67 | return 'timedout'; 68 | } 69 | 70 | return `unknown_error`; 71 | } 72 | 73 | export class ErrorIsHandledError extends Error {} 74 | -------------------------------------------------------------------------------- /src/lib/helpers/getInput.ts: -------------------------------------------------------------------------------- 1 | import type { Locator } from 'playwright'; 2 | 3 | import type { BrowserPage } from '../browser/Page'; 4 | import type { HandledError } from '../types'; 5 | 6 | /** 7 | * Get input for selector. 8 | */ 9 | export async function getInput( 10 | page: BrowserPage | undefined, 11 | sel: string 12 | ): Promise { 13 | const textInputLoc = page?.ref?.locator(sel); 14 | 15 | const count = textInputLoc ? await textInputLoc.count() : 0; 16 | if (!textInputLoc || count <= 0) { 17 | return { 18 | error: 'field_not_found', 19 | rawError: new Error(`Field not found "${sel}"`), 20 | }; 21 | } 22 | 23 | if (count > 1) { 24 | // sometimes another input can be hidden using CSS, 25 | // wait for the page to be fully loaded 26 | await page?.waitForNavigation({ 27 | waitUntil: 'load', 28 | timeout: 10_000, 29 | }); 30 | 31 | // check again but this time only for visible elements 32 | const visibleInputLoc = await textInputLoc.locator('visible=true'); 33 | const visibleCount = visibleInputLoc ? 
await visibleInputLoc.count() : 0; 34 | if (visibleCount === 1) { 35 | return visibleInputLoc; 36 | } 37 | 38 | return { 39 | error: 'too_many_fields', 40 | rawError: new Error( 41 | `Too many input found for "${sel}", found "${count}"` 42 | ), 43 | }; 44 | } 45 | 46 | return textInputLoc; 47 | } 48 | -------------------------------------------------------------------------------- /src/lib/helpers/injectBaseHref.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * Injects a tag which allows other resources to load on the 3 | * page without trying to get them from the `renderscript` server. 4 | * It has no effect on serialised output, but allows it to verify render 5 | * quality. 6 | */ 7 | export function injectBaseHref(origin: string): void { 8 | const base = document.createElement('base'); 9 | base.setAttribute('href', origin); 10 | 11 | const bases = document.head.querySelectorAll('base'); 12 | if (bases.length) { 13 | // Patch existing if it is relative. 14 | const existingBase = bases[0].getAttribute('href') || ''; 15 | if (existingBase.startsWith('/')) { 16 | bases[0].setAttribute('href', origin + existingBase); 17 | } 18 | } else { 19 | // Only inject if it doesn't already exist. 
20 | document.head.insertAdjacentElement('afterbegin', base); 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /src/lib/helpers/validateURL.ts: -------------------------------------------------------------------------------- 1 | import { validateURL } from '@algolia/dns-filter'; 2 | 3 | import { report } from '../../helpers/errorReporting'; 4 | import { VALIDATE_URL_IGNORED_ERRORS } from '../browser/constants'; 5 | import { RESTRICTED_IPS } from '../constants'; 6 | 7 | export async function isURLAllowed(url: string): Promise<boolean> { 8 | try { 9 | // Check for valid URL before validation 10 | // eslint-disable-next-line no-new 11 | new URL(url); 12 | } catch (e) { 13 | report(new Error('Invalid url'), { url, err: e }); 14 | return false; 15 | } 16 | try { 17 | await validateURL({ 18 | url, 19 | ipPrefixes: RESTRICTED_IPS, 20 | }); 21 | } catch (err: any) { 22 | // Thrown values without a message fall back to their string form 23 | const message = String(err?.message ?? err); 24 | if (!VALIDATE_URL_IGNORED_ERRORS.some((msg) => message.includes(msg))) { 25 | report(new Error('Blocked url'), { err, url }); 26 | return false; 27 | } 28 | return true; 29 | } 30 | 31 | return true; 32 | } 33 | -------------------------------------------------------------------------------- /src/lib/singletons.ts: -------------------------------------------------------------------------------- 1 | import { report } from '../helpers/errorReporting'; 2 | import { log } from '../helpers/logger'; 3 | 4 | import { TasksManager } from './TasksManager'; 5 | import { Adblocker } from './browser/Adblocker'; 6 | 7 | export const tasksManager = new TasksManager(); 8 | export const adblocker = new Adblocker(); 9 | 10 | export async function init(): Promise<void> { 11 | try { 12 | await tasksManager.launch(); 13 | await adblocker.load(); 14 | } catch (err: any) { 15 | report(new Error('Error during launch'), { err }); 16 | 17 | log.info('Exit'); 18 | setTimeout(() => { 19 | // eslint-disable-next-line no-process-exit 20 | process.exit(1); 21 | }, 1); 22 | } 23 | } 24 | 
-------------------------------------------------------------------------------- /src/lib/tasks/Login.ts: -------------------------------------------------------------------------------- 1 | import type { ElementHandle, Response, Request, Locator } from 'playwright'; 2 | 3 | import { report } from '../../helpers/errorReporting'; 4 | import { waitForPendingRequests } from '../../helpers/waitForPendingRequests'; 5 | import { cleanErrorMessage } from '../helpers/errors'; 6 | import { getInput } from '../helpers/getInput'; 7 | import type { LoginTaskParams } from '../types'; 8 | 9 | import { Task } from './Task'; 10 | 11 | const usernameSelectors = [ 12 | 'input[type=email][id*=login i]', 13 | 'input[type=email][name*=login i]', 14 | 'input[type=text][id*=login i]', 15 | 'input[type=text][id*=email i]', 16 | 'input[type=text][id*=username i]', 17 | 'input[type=text][name*=login i]', 18 | 'input[type=text][name*=email i]', 19 | 'input[type=text][name*=username i]', 20 | 'input[type=email]', 21 | 'input[type=text]', 22 | ]; 23 | const passwordSel = 'input[type=password]:not([aria-hidden="true"])'; 24 | 25 | export class LoginTask extends Task { 26 | async process(): Promise { 27 | if (!this.page) { 28 | throw new Error('Calling process before createContext()'); 29 | } 30 | 31 | /* Setup */ 32 | const { url } = this.params; 33 | let response: Response; 34 | 35 | try { 36 | response = await this.page.goto(url.href, { 37 | timeout: this.timeBudget.get(), 38 | waitUntil: 'networkidle', 39 | }); 40 | } catch (err: any) { 41 | return this.throwHandledError({ error: err.message, rawError: err }); 42 | } finally { 43 | this.setMetric('goto'); 44 | } 45 | 46 | await this.saveStatus(response); 47 | 48 | const usernameInput = await this.#typeUsername(); 49 | await this.saveStatus(response); 50 | 51 | // Get the password input 52 | const passwordInput = await this.#typePasswordInput({ 53 | textInput: usernameInput!, 54 | step: '1', 55 | }); 56 | await this.saveStatus(response); 57 | 
58 | // Submit 59 | await this.#submitForm(passwordInput!); 60 | 61 | await this.saveStatus(response); 62 | 63 | await this.minWait(); 64 | await this.saveStatus(response); 65 | if (!this.page.ref) { 66 | return; 67 | } 68 | 69 | /* Transforming */ 70 | this.results.resolvedUrl = this.page.ref.url(); 71 | // we get the cookie for the requested domain 72 | // this is not ideal for some SSO, returning valid cookies but missing some of them 73 | this.results.cookies = await this.page.ref 74 | ?.context() 75 | .cookies([url.href, this.results.resolvedUrl]); 76 | 77 | if (this.results.cookies.length <= 0) { 78 | return this.throwHandledError({ error: 'no_cookies' }); 79 | } 80 | 81 | const body = await this.page.renderBody(); 82 | this.results.body = body; 83 | this.setMetric('serialize'); 84 | } 85 | 86 | /** 87 | * Get username input and type the value in it. 88 | */ 89 | async #typeUsername(): Promise<ElementHandle< 90 | HTMLElement | SVGElement 91 | > | void> { 92 | const { log, page, params } = this; 93 | const { login } = params; 94 | 95 | try { 96 | // We first check if there is a form 97 | // Try multiple selectors, from the most to the least precise 98 | let usernameInputLoc: Locator | null = null; 99 | for (const usernameSel of usernameSelectors) { 100 | const input = await getInput(page, usernameSel); 101 | if (!('error' in input)) { 102 | usernameInputLoc = input; 103 | break; 104 | } 105 | } 106 | if (!usernameInputLoc) { 107 | return this.throwHandledError({ 108 | error: 'field_not_found', 109 | rawError: new Error('Username field not found'), 110 | }); 111 | } 112 | 113 | log.info('Entering username...', { userName: login.username }); 114 | 115 | const usernameInput = await usernameInputLoc.elementHandle({ 116 | timeout: 500, 117 | }); 118 | // https://playwright.dev/docs/release-notes#version-138 119 | await usernameInput?.fill(login.username, { 120 | noWaitAfter: true, 121 | timeout: this.timeBudget.getRange(2000, 3000), 122 | }); 123 | 124 | return usernameInput!; 125 | } finally { 126 | 
this.timeBudget.consume(); 127 | } 128 | } 129 | 130 | /** 131 | * Get password input. 132 | */ 133 | async #typePasswordInput({ 134 | textInput, 135 | step, 136 | }: { 137 | textInput: ElementHandle; 138 | step: '1' | '2'; 139 | }): Promise<ElementHandle<HTMLElement | SVGElement> | null | void> { 140 | const { page, params } = this; 141 | const { login } = params; 142 | 143 | try { 144 | // Find the input 145 | const passwordInputLoc = await getInput(page, passwordSel); 146 | if (!('error' in passwordInputLoc)) { 147 | this.log.info('Entering password...'); 148 | await passwordInputLoc.fill(login.password, { 149 | noWaitAfter: true, 150 | timeout: this.timeBudget.getRange(2000, 3000), 151 | }); 152 | 153 | return passwordInputLoc.elementHandle(); 154 | } 155 | 156 | if (passwordInputLoc.error === 'too_many_fields') { 157 | return this.throwHandledError(passwordInputLoc); 158 | } 159 | 160 | if (step === '2' && passwordInputLoc.error === 'field_not_found') { 161 | return this.throwHandledError(passwordInputLoc); 162 | } 163 | 164 | return await this.#handleFirstStepForm({ textInput }); 165 | } finally { 166 | this.timeBudget.consume(); 167 | } 168 | } 169 | 170 | /** 171 | * Try to submit first step form to get the password input. 
172 | */ 173 | async #handleFirstStepForm({ 174 | textInput, 175 | }: { 176 | textInput: ElementHandle; 177 | }): Promise<ElementHandle<HTMLElement | SVGElement> | null | void> { 178 | const log = this.log; 179 | 180 | // It can be that we are in a "two step form" 181 | log.info('No password input found: validating username...'); 182 | 183 | // Submit the form to see if the second step appears 184 | await textInput.press('Enter', { 185 | noWaitAfter: true, 186 | timeout: this.timeBudget.getRange(2000, 3000), 187 | }); 188 | this.timeBudget.consume(); 189 | 190 | // And wait for a new input to be there maybe 191 | // page!.waitForNavigation() doesn't work with Okta for example, it's JS based 192 | await this.page!.ref?.waitForSelector(passwordSel, { 193 | timeout: this.timeBudget.min(3000), 194 | }); 195 | this.timeBudget.consume(); 196 | 197 | log.debug('Current URL', { pageUrl: this.page!.ref?.url() }); 198 | return this.#typePasswordInput({ textInput, step: '2' }); 199 | } 200 | 201 | /** 202 | * Submit form and wait for response or something to happen. 
203 | */ 204 | async #submitForm( 205 | passwordInput: ElementHandle 206 | ): Promise { 207 | const log = this.log; 208 | const { url } = this.params; 209 | let res: Response | null = null; 210 | 211 | try { 212 | log.debug(`Submit login form`); 213 | // We don't submit form directly because sometimes there is no form 214 | // We wait both at the same time because navigation happens quickly 215 | [res] = await Promise.all([ 216 | this.page!.waitForNavigation({ 217 | timeout: this.timeBudget.min(3000), 218 | waitUntil: 'domcontentloaded', 219 | }), 220 | passwordInput.press('Enter', { 221 | noWaitAfter: true, 222 | timeout: this.timeBudget.getRange(2000, 3000), 223 | }), 224 | ]); 225 | } catch (err: any) { 226 | this.page!.throwIfNotTimeout(err); 227 | } finally { 228 | this.timeBudget.consume(); 229 | } 230 | 231 | try { 232 | log.debug(`Login wait for network idle`); 233 | const timeBudget = this.timeBudget.get(); 234 | const startWaitTime = Date.now(); 235 | 236 | // After it is submitted there can be quite a lot of redirections, so we wait a bit more 237 | // we could do it before, but it's easier to split domcontentloaded and networkidle for debug 238 | const [resAfterNetwork] = await Promise.all([ 239 | this.page!.waitForNavigation({ 240 | timeout: this.timeBudget.min(5000), 241 | waitUntil: 'networkidle', 242 | }), 243 | ]); 244 | if (resAfterNetwork) { 245 | // if no navigation happened, resAfterNetwork is null 246 | // but we don't want to erase res because it is most of the time normal if we already reached the final page 247 | res = resAfterNetwork; 248 | } 249 | const timeWaited = Date.now() - startWaitTime; 250 | await waitForPendingRequests(this.page!, timeBudget - timeWaited); 251 | } catch (err: any) { 252 | report(new Error('Error waiting to submit form'), { 253 | err: err.message, 254 | pageUrl: this.page!.ref?.url(), 255 | }); 256 | return this.throwHandledError({ 257 | error: cleanErrorMessage(err), 258 | rawError: err, 259 | }); 260 | } finally { 
261 | this.timeBudget.consume(); 262 | } 263 | 264 | const hasSpecialCase = this.#needSpecialCase(); 265 | if (hasSpecialCase) { 266 | log.debug(`Login wait for spec`); 267 | try { 268 | const [resAfterSpec] = await Promise.all([ 269 | this.page!.waitForNavigation({ 270 | timeout: this.timeBudget.min(5000), 271 | waitUntil: 'networkidle', 272 | }), 273 | this.#handleSpecialCaseForm({ name: hasSpecialCase }), 274 | ]); 275 | if (resAfterSpec) { 276 | res = resAfterSpec; 277 | } 278 | } catch (err: any) { 279 | this.page!.throwIfNotTimeout(err); 280 | } finally { 281 | this.timeBudget.consume(); 282 | } 283 | } 284 | 285 | if (!res) { 286 | if (this.page!.ref?.url() === url.href) { 287 | // Return an error if we got no login response and are still on the same URL 288 | return this.throwHandledError({ error: 'no_response_after_login' }); 289 | } 290 | 291 | // Can happen if navigation was done through History API 292 | log.debug('No login response, but redirected', { 293 | pageUrl: this.page!.ref?.url(), 294 | }); 295 | return; 296 | } 297 | 298 | // Computing redirection chain. 
299 | const chain = []; 300 | let prev: Request | null = res.request(); 301 | while (prev) { 302 | prev = prev.redirectedFrom(); 303 | if (!prev) { 304 | prev = null; 305 | break; 306 | } 307 | chain.push(prev.url()); 308 | } 309 | log.debug('Login after redirections', { 310 | pageUrl: this.page!.ref?.url(), 311 | chain, 312 | }); 313 | } 314 | 315 | #needSpecialCase(): 'login.live.com' | false { 316 | if (!this.page?.ref) { 317 | return false; 318 | } 319 | 320 | const currentUrl = this.page.ref.url(); 321 | if (currentUrl.startsWith('https://login.live.com')) { 322 | return 'login.live.com'; 323 | } 324 | 325 | return false; 326 | } 327 | 328 | async #handleSpecialCaseForm({ 329 | name, 330 | }: { 331 | name: 'login.live.com'; 332 | }): Promise { 333 | const { log } = this; 334 | if (!this.page?.ref) { 335 | return; 336 | } 337 | 338 | // Spec for Microsoft SSO 339 | if (name === 'login.live.com') { 340 | log.debug('MSFT: Entering specs'); 341 | 342 | // There is a "Keep me sign in?" checkbox now 343 | const confirm = this.page.ref.locator('#KmsiCheckboxField'); 344 | const submit = this.page.ref.locator('input[type=submit]'); 345 | 346 | if ((await confirm.count()) === 1 && (await submit.count()) === 1) { 347 | log.debug('MSFT: found confirm and submit'); 348 | 349 | await confirm.click({ 350 | timeout: this.timeBudget.getRange(200, 500), 351 | noWaitAfter: true, // Otherwise wait for navigation 352 | }); 353 | 354 | await submit.click({ 355 | timeout: this.timeBudget.getRange(200, 500), 356 | noWaitAfter: true, // Otherwise wait for navigation 357 | }); 358 | } 359 | } 360 | } 361 | } 362 | -------------------------------------------------------------------------------- /src/lib/tasks/Render.ts: -------------------------------------------------------------------------------- 1 | import type { Response } from 'playwright'; 2 | 3 | import { 4 | promiseWithTimeout, 5 | PromiseWithTimeoutError, 6 | } from '../../helpers/promiseWithTimeout'; 7 | import { 
waitForPendingRequests } from '../../helpers/waitForPendingRequests'; 8 | import { RESPONSE_IGNORED_ERRORS } from '../browser/constants'; 9 | import { cleanErrorMessage } from '../helpers/errors'; 10 | import type { RenderTaskParams } from '../types'; 11 | 12 | import { Task } from './Task'; 13 | 14 | export class RenderTask extends Task { 15 | async process(): Promise { 16 | if (!this.page) { 17 | throw new Error('Calling process before createContext()'); 18 | } 19 | 20 | /* Setup */ 21 | const { url } = this.params; 22 | let response: Response; 23 | 24 | // Important to catch any redirect 25 | this.page.setDisableNavigation(url.href, async (newUrl) => { 26 | this.results.error = 'redirection'; 27 | this.results.resolvedUrl = newUrl; 28 | 29 | // We save the status of the page before the navigation (hopefully) 30 | await this.page?.saveMetrics(); 31 | 32 | // Hard close of the page to avoid reaching the backend 33 | await this.page?.close(); 34 | }); 35 | 36 | try { 37 | response = await this.page.goto(url.href, { 38 | timeout: this.timeBudget.get(), 39 | waitUntil: 'domcontentloaded', 40 | }); 41 | } catch (err: any) { 42 | return this.throwHandledError({ 43 | error: this.results.error || cleanErrorMessage(err), 44 | rawError: err, 45 | }); 46 | } finally { 47 | this.setMetric('goto'); 48 | } 49 | 50 | // --- At this point we have just the DOM, but we want to do some checks 51 | await this.saveMetrics(); 52 | 53 | // In case of redirection, initialResponse is preferred since response is probably now incorrect 54 | await this.saveStatus(this.page.initialResponse || response); 55 | 56 | if (this.page.redirection) { 57 | this.results.resolvedUrl = 58 | this.results.resolvedUrl || this.page.redirection; 59 | return this.throwHandledError({ 60 | error: this.results.error || 'redirection', 61 | }); 62 | } 63 | 64 | // Check for html refresh 65 | try { 66 | const redirect = await promiseWithTimeout( 67 | this.page.checkForHttpEquivRefresh({ 68 | timeout: 
this.timeBudget.getRange(1000, 3000), 69 | }), 70 | 3000 71 | ); 72 | if (redirect) { 73 | this.results.resolvedUrl = redirect.href; 74 | return this.throwHandledError({ 75 | error: this.results.error || 'redirection', 76 | }); 77 | } 78 | } catch (err) { 79 | if (!(err instanceof PromiseWithTimeoutError)) { 80 | throw err; 81 | } 82 | } finally { 83 | this.setMetric('equiv'); 84 | } 85 | 86 | if (this.results.statusCode !== 200) { 87 | // Everything is different than OK is not worth processing 88 | this.results.body = await this.page.renderBody(); 89 | return; 90 | } 91 | 92 | // --- Basic checks passed we wait a bit more to page to render 93 | try { 94 | const timeBudget = this.timeBudget.get(); 95 | const startWaitTime = Date.now(); 96 | 97 | try { 98 | await this.page.ref?.waitForLoadState('networkidle', { 99 | timeout: timeBudget, 100 | }); 101 | } catch (waitErr: any) { 102 | // Check if this is a redirection first 103 | if (this.page.redirection) { 104 | this.results.resolvedUrl = 105 | this.results.resolvedUrl || this.page.redirection; 106 | return this.throwHandledError({ 107 | error: this.results.error || 'redirection', 108 | rawError: waitErr, 109 | }); 110 | } 111 | if ( 112 | RESPONSE_IGNORED_ERRORS.some((msg) => waitErr.message.includes(msg)) 113 | ) { 114 | // Page was closed while waiting 115 | return this.throwHandledError({ 116 | error: 'page_closed_too_soon', 117 | rawError: waitErr, 118 | }); 119 | } 120 | throw waitErr; // Re-throw if it's not a target closed error 121 | } 122 | 123 | const timeWaited = Date.now() - startWaitTime; 124 | await waitForPendingRequests(this.page!, timeBudget - timeWaited); 125 | } catch (err: any) { 126 | this.page.throwIfNotTimeout(err); 127 | } finally { 128 | this.setMetric('ready'); 129 | } 130 | 131 | await this.minWait(); 132 | 133 | this.checkFinalURL(); 134 | 135 | /* Transforming */ 136 | // await page.evaluate(injectBaseHref, baseHref); 137 | const body = await this.page.renderBody({ silent: true }); 138 
| if (body === null) { 139 | return this.throwHandledError({ error: 'body_serialisation_failed' }); 140 | } 141 | 142 | this.results.body = body; 143 | this.setMetric('serialize'); 144 | } 145 | 146 | private checkFinalURL(): void { 147 | const newUrl = this.page!.ref?.url() ? new URL(this.page!.ref.url()) : null; 148 | if (!newUrl) { 149 | // Redirected to nowhere 150 | this.results.resolvedUrl = 'about:blank/'; 151 | return this.throwHandledError({ error: 'wrong_redirection' }); 152 | } 153 | 154 | newUrl.hash = ''; 155 | if (newUrl.href !== this.params.url.href) { 156 | // Redirection was not caught this should not happen 157 | this.results.resolvedUrl = newUrl.href; 158 | return this.throwHandledError({ error: 'wrong_redirection' }); 159 | } 160 | } 161 | } 162 | -------------------------------------------------------------------------------- /src/lib/tasks/Task.ts: -------------------------------------------------------------------------------- 1 | import type { Logger } from 'pino'; 2 | import type { BrowserContext, Response } from 'playwright'; 3 | import { v4 as uuid } from 'uuid'; 4 | 5 | import { report } from '../../helpers/errorReporting'; 6 | import { log } from '../../helpers/logger'; 7 | import { stats } from '../../helpers/stats'; 8 | import type { Browser } from '../browser/Browser'; 9 | import { BrowserPage } from '../browser/Page'; 10 | import { TimeBudget } from '../browser/TimeBudget'; 11 | import { RESPONSE_IGNORED_ERRORS } from '../browser/constants'; 12 | import { WAIT_TIME } from '../constants'; 13 | import { ErrorIsHandledError } from '../helpers/errors'; 14 | import type { 15 | ErrorReturn, 16 | Metrics, 17 | TaskBaseParams, 18 | TaskResult, 19 | } from '../types'; 20 | 21 | export abstract class Task { 22 | id: string; 23 | params; 24 | page?: BrowserPage; 25 | createdAt?: Date; 26 | startedAt?: Date; 27 | results: TaskResult = { 28 | statusCode: null, 29 | body: null, 30 | headers: {}, 31 | error: null, 32 | rawError: null, 33 | 
resolvedUrl: null, 34 | cookies: [], 35 | }; 36 | log: Logger; 37 | timeBudget: TimeBudget; 38 | #metrics: Metrics = { 39 | timings: { 40 | context: null, 41 | goto: null, 42 | equiv: null, 43 | ready: null, 44 | minWait: null, 45 | serialize: null, 46 | close: null, 47 | total: null, 48 | }, 49 | renderingBudget: { 50 | max: 0, 51 | consumed: 0, 52 | }, 53 | page: null, 54 | }; 55 | 56 | #closed: boolean = false; 57 | #context?: BrowserContext; 58 | 59 | constructor(params: TTaskType, logger?: Logger) { 60 | this.id = uuid(); 61 | // Do not print this or pass it to reporting, it contains secrets 62 | this.params = { 63 | ...params, 64 | waitTime: { 65 | ...WAIT_TIME, 66 | ...params.waitTime, 67 | }, 68 | }; 69 | this.createdAt = new Date(); 70 | this.timeBudget = new TimeBudget(this.params.waitTime.max); 71 | this.#metrics.renderingBudget.max = this.timeBudget.max; 72 | this.log = logger ?? log.child({ svc: 'task', ctx: { id: this.id } }); 73 | } 74 | 75 | get metrics(): Metrics { 76 | return this.#metrics; 77 | } 78 | 79 | get isDone(): boolean { 80 | return this.#closed; 81 | } 82 | 83 | async close(): Promise { 84 | if (this.#closed) { 85 | return; 86 | } 87 | 88 | this.#closed = true; 89 | await this.page?.close(); 90 | await this.#context?.close(); 91 | this.setMetric('close'); 92 | 93 | this.metrics.timings.total = Date.now() - this.startedAt!.getTime(); 94 | this.#metrics.renderingBudget.consumed = this.timeBudget.consumed; 95 | this.#context = undefined; 96 | } 97 | 98 | /** 99 | * Create the incognito context and the page so each task has a fresh start. 
100 | */ 101 | async createContext(browser: Browser): Promise { 102 | this.timeBudget.lastConsumption = Date.now(); 103 | this.startedAt = new Date(); 104 | 105 | const context = await browser.getNewContext({ 106 | userAgent: this.params.userAgent, 107 | }); 108 | context.setDefaultTimeout(WAIT_TIME.min); 109 | context.setDefaultNavigationTimeout(WAIT_TIME.max); 110 | 111 | const page = new BrowserPage(context, this.params.browser); 112 | this.page = page; 113 | this.#context = context; 114 | 115 | await page.create(); 116 | 117 | if (this.params.headersToForward?.cookie) { 118 | await page.setCookies(this.params); 119 | } 120 | 121 | await context.route('**/*', page.getOnRequestHandler(this.params)); 122 | // does not work await page.setDisableServiceWorker(); 123 | 124 | page.ref?.on('response', page.getOnResponseHandler(this.params)); 125 | 126 | this.setMetric('context'); 127 | } 128 | 129 | /** 130 | * Save status in results. 131 | */ 132 | async saveStatus(response: Response): Promise { 133 | try { 134 | this.results.statusCode = response.status(); 135 | this.results.headers = await response.allHeaders(); 136 | } catch (err: any) { 137 | return this.throwHandledError({ 138 | error: 'error_reading_response', 139 | rawError: err, 140 | }); 141 | } 142 | } 143 | 144 | /** 145 | * Wait for browser to execute more stuff before we kill the page. 146 | */ 147 | async minWait(): Promise { 148 | const minWait = this.params.waitTime!.min; 149 | const todo = minWait - this.timeBudget.consumed; 150 | if (todo <= 0) { 151 | return; 152 | } 153 | 154 | this.log.debug(`Waiting ${todo} extra ms...`); 155 | await this.page!.ref?.waitForTimeout(todo); 156 | this.setMetric('minWait'); 157 | } 158 | 159 | /** 160 | * Log metric and reduce time budget. 
*
   * Stores the value returned by `timeBudget.consume()` under `name` and
   * sends the same value to stats as `renderscript.page.<name>`.
   *
   * @param name - Key of `Metrics['timings']` to record.
   */
  setMetric(name: keyof Metrics['timings']): void {
    this.#metrics.timings[name] = this.timeBudget.consume();
    stats.timing(`renderscript.page.${name}`, this.#metrics.timings[name]!);
  }

  /**
   * Save page metrics.
   *
   * Does nothing when there is no page or the page is already closed.
   * Errors whose message contains one of `RESPONSE_IGNORED_ERRORS` are
   * dropped; any other error is reported but never rethrown.
   */
  async saveMetrics(): Promise<void> {
    try {
      if (!this.page || this.page.isClosed) {
        // page has been closed
        return;
      }
      this.#metrics.page = await this.page.saveMetrics();
    } catch (err: any) {
      // Anything can be thrown, not only Error instances: normalise the
      // message so the ignore-list check cannot itself throw from this catch.
      const message =
        typeof err?.message === 'string' ? err.message : String(err);

      // Can happen if target is already closed or redirection
      if (RESPONSE_IGNORED_ERRORS.some((msg) => message.includes(msg))) {
        // Expected error when page is closed, no need to report
        return;
      }
      // Report other unexpected errors
      report(err, { context: 'saveMetrics' });
    }
  }

  /**
   * Shortcut everything.
   *
   * Stores the error (and the raw error, or `null`) in `results`, increments
   * the `renderscript.task.handlederror` counter, then always throws
   * `ErrorIsHandledError`; this method never returns normally.
   *
   * @param res - Handled error code and optional underlying error.
   * @throws ErrorIsHandledError always.
   */
  throwHandledError(res: ErrorReturn): void {
    this.results.error = res.error;
    this.results.rawError = res.rawError || null;
    stats.increment('renderscript.task.handlederror', {
      error: res.error || 'no_error',
    });
    throw new ErrorIsHandledError();
  }

  /**
   * Run the task. Implemented by each concrete task.
   */
  // NOTE(review): the type argument was missing here; `void` is assumed,
  // matching the other async methods of this class. Confirm against callers.
  abstract process(): Promise<void>;
}

--------------------------------------------------------------------------------
/src/lib/types.ts:
--------------------------------------------------------------------------------
import type { Cookie } from 'playwright';

import type { BrowserEngine } from './browser/Browser';
import type { Task } from './tasks/Task';

/** Error codes a task can store in `TaskResult.error` after handling them. */
export type HandledError =
  | HandledLoginError
  | 'body_serialisation_failed'
  | 'connection_error'
  | 'dns_error'
  | 'error_reading_response'
  | 'fetch_aborted'
  | 'fetch_timeout'
  | 'forbidden_by_website'
  | 'no_cookies'
  | 'page_closed_too_soon'
  | 'page_crashed'
  | 'redirection'
  | 'timedout'
  | 'wrong_redirection';

/** Handled error codes specific to login. */
export type HandledLoginError =
  | 'field_not_found'
  | 'no_response_after_login'
  | 'too_many_fields';

/** Error code for errors that were not handled. */
export type UnhandledError = 'unknown_error';

/** Parameters shared by every task. */
export interface TaskBaseParams {
  url: URL;
  userAgent: string;
  adblock?: boolean;
  browser?: BrowserEngine;
  /** Partial override; `Task` merges it over the `WAIT_TIME` defaults. */
  waitTime?: {
    min?: number;
    max?: number;
  };
  /** A `cookie` entry makes `Task.createContext()` set cookies on the page. */
  headersToForward?: {
    [s: string]: string;
  };
}

export interface Perf {
  curr: PerformanceNavigationTiming;
  all: PerformanceEntryList;
  mem: {
    jsHeapSizeLimit?: number;
    totalJSHeapSize?: number;
    usedJSHeapSize?: number;
  };
}

export type RenderTaskParams = TaskBaseParams;

export interface LoginTaskParams extends TaskBaseParams {
  login: {
    username: string;
    password: string;
  };
  renderHTML?: boolean;
}

export type TaskParams = LoginTaskParams | RenderTaskParams;

/** Task result together with its metrics and a timeout flag. */
export interface TaskFinal extends TaskResult {
  metrics: Metrics;
  timeout: boolean;
}

export interface TaskResult {
  statusCode: number | null;
  body: string | null;
  error: HandledError | UnhandledError | null;
  rawError: Error | null;
  /** Filled from the response's `allHeaders()` in `Task.saveStatus()`. */
  headers: Record<string, string>;
  resolvedUrl: string | null;
  cookies: Cookie[];
}

/**
 * Argument of `Task.throwHandledError()`: the error code, plus the
 * underlying error when there is one.
 */
export type ErrorReturn = Optional<
  Pick<TaskResult, 'error' | 'rawError'>,
  'rawError'
>;

export interface Metrics {
  /** One entry per step recorded by `Task.setMetric()`; `null` until set. */
  timings: {
    context: number | null;
    goto: number | null;
    equiv: number | null;
    ready: number | null;
    minWait: number | null;
    serialize: number | null;
    close: number | null;
    /** Milliseconds between `startedAt` and the end of `Task.close()`. */
    total: number | null;
  };
  renderingBudget: {
    max: number;
    consumed: number;
  };
  page: PageMetrics | null;
}

export interface PageMetrics {
  timings: {
    download: number | null;
  };
  mem: {
    jsHeapUsedSize: number | null;
    jsHeapTotalSize: number | null;
  };
  requests: {
    total: number;
    blocked: number;
    pending: number;
  };
  contentLength: {
    main: number;
    total: number;
  };
}

export interface TaskObject {
  ref: Task;
  // NOTE(review): the type argument was missing here; `void` is assumed.
  // Confirm against where this promise is created.
  promise: Promise<void>;
}

/**
 * Take an interface and list the keys that are optional.
 *
 * @example
 *  interface Hello {
 *    foo?: string;
 *    bar?: string;
 *    baz: string;
 *  }
 *
 *  OptionalKeys<Hello>;
 *
 *  Will result in:
 *  'foo' | 'bar'
 */
export type OptionalKeys<T> = {
  [K in keyof T]: undefined extends T[K] ? K : never;
}[keyof T];

/**
 * Take an interface and choose which properties should become optional.
 * Properties that are already optional stay optional.
 *
 * @example
 *  interface Hello {
 *    foo: string;
 *    bar: string;
 *    baz?: string;
 *  };
 *
 *  Optional<Hello, 'bar'>;
 *
 *  Will result in:
 *  {
 *    foo: string;
 *    bar?: string;
 *    baz?: string;
 *  }
 *
 */
export type Optional<T, K extends keyof T> = {
  // Keys of T that are either requested (K) or already optional.
  [P in Exclude<keyof T, Exclude<keyof T, K | OptionalKeys<T>>>]?: T[P];
} & {
  // Every other key stays required.
  [P in Exclude<keyof T, K | OptionalKeys<T>>]: T[P];
};

--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
{
  "compilerOptions": {
    "target": "ES2022",
    "lib": [
      "dom",
    ],
    "module": "CommonJS",
    "strict": true,
    "outDir": "dist/",
    "types": [
      "node",
      "jest"
    ],
    "esModuleInterop": true,
    "allowSyntheticDefaultImports": true,
    "sourceMap": true,
    "declaration": true,
    "declarationMap": true
  },
  "include": [
    "src/**/*"
  ],
  "exclude": [
    "node_modules"
  ],
  // NOTE(review): "typeRoots" is a compilerOptions setting; placed at the top
  // level like this it is not read by tsc. Move it or drop it after checking.
  "typeRoots": [
    "node_modules/@types"
  ]
}

--------------------------------------------------------------------------------