├── .nvmrc
├── src
├── global.d.ts
├── api
│ ├── @types
│ │ ├── getRoot.ts
│ │ ├── responses.ts
│ │ ├── getList.ts
│ │ ├── getHealthy.ts
│ │ ├── postRender.ts
│ │ └── postLogin.ts
│ ├── helpers
│ │ ├── logger.ts
│ │ ├── requestLogger.ts
│ │ ├── getForwardedHeaders.ts
│ │ ├── errors.ts
│ │ ├── alt.ts
│ │ └── buildUrl.ts
│ ├── routes
│ │ ├── root.ts
│ │ ├── ready.ts
│ │ ├── list.ts
│ │ ├── healthy.ts
│ │ ├── privates
│ │ │ └── login.ts
│ │ ├── login.ts
│ │ └── render.ts
│ ├── constants.ts
│ └── index.ts
├── helpers
│ ├── projectRoot.ts
│ ├── wait.ts
│ ├── promiseWithTimeout.ts
│ ├── stats.ts
│ ├── waitForPendingRequests.ts
│ ├── logger.ts
│ ├── errorReporting.ts
│ └── gracefulClose.ts
├── lib
│ ├── singletons.ts
│ ├── browser
│ │ ├── TimeBudget.ts
│ │ ├── Adblocker.ts
│ │ ├── TimeBudget.test.ts
│ │ ├── constants.ts
│ │ ├── Browser.ts
│ │ └── Page.ts
│ ├── helpers
│ │ ├── injectBaseHref.ts
│ │ ├── validateURL.ts
│ │ ├── getInput.ts
│ │ └── errors.ts
│ ├── constants.ts
│ ├── types.ts
│ ├── tasks
│ │ ├── Render.ts
│ │ ├── Task.ts
│ │ └── Login.ts
│ └── TasksManager.ts
├── __tests__
│ ├── __snapshots__
│ │ ├── login.test.ts.snap
│ │ └── async.test.ts.snap
│ ├── errors.test.ts
│ ├── tasksManager.test.ts
│ ├── list.test.ts
│ ├── blockedRequests.test.ts
│ ├── helpers.ts
│ ├── api.test.ts
│ ├── index.test.ts
│ ├── async.test.ts
│ ├── login.real.test.ts
│ ├── login.test.ts
│ └── redirect.test.ts
└── index.ts
├── .npmrc
├── .eslintignore
├── public
├── test-website
│ ├── basic.html
│ ├── js-redirect-hash.html
│ ├── js-redirect-path.html
│ ├── meta-refresh.html
│ ├── meta-refresh-5.html
│ ├── page-crash.html
│ ├── js-redirect-history.html
│ ├── login-double-password.html
│ ├── login-multiple-input.html
│ ├── js-redirect.html
│ ├── iframe.html
│ ├── slow.html
│ ├── async.html
│ └── blocked-requests.html
├── static
│ └── main.webmanifest
├── views
│ ├── login-step1.ejs
│ ├── login-step2.ejs
│ ├── login.ejs
│ └── login-2steps-js.ejs
└── index.html
├── .prettierrc.js
├── .yarnrc.yml
├── nodemon.json
├── renovate.json
├── .gitignore
├── jest.config.js
├── scripts
├── update_adblock_hosts.sh
├── start.sh
├── build.sh
└── test_image.sh
├── .github
└── workflows
│ ├── renovate.yml
│ ├── main.yml
│ ├── release.yml
│ └── release_docker.yml
├── tsconfig.json
├── release.config.js
├── jest.setup.ts
├── .env.example
├── CONTRIBUTING.md
├── .dockerignore
├── .eslintrc.js
├── package.json
├── Dockerfile
└── README.md
/.nvmrc:
--------------------------------------------------------------------------------
1 | 18.18.2
2 |
--------------------------------------------------------------------------------
/src/global.d.ts:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.npmrc:
--------------------------------------------------------------------------------
1 | update-notifier=false
2 |
--------------------------------------------------------------------------------
/src/api/@types/getRoot.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * JSON body returned by the root route.
3 |  */
4 | export interface GetRoot {
5 |   /** `process.env.VERSION`, or `'dev'` when that variable is unset or empty. */
6 |   version: string;
7 | }
8 |
--------------------------------------------------------------------------------
/src/api/@types/responses.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Shape of an error response body.
3 |  *
4 |  * NOTE(review): the name suggests it describes HTTP 500 payloads — confirm
5 |  * against the routes that use it.
6 |  */
7 | export interface Res500 {
8 |   /** Error text. */
9 |   error: string;
10 | }
11 |
--------------------------------------------------------------------------------
/src/api/@types/getList.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * JSON body returned by the list route.
3 |  */
4 | export interface GetListSuccess {
5 |   /** URLs of the currently open pages, grouped by browser engine name. */
6 |   open: Record<string, string[]>;
7 | }
8 |
--------------------------------------------------------------------------------
/.eslintignore:
--------------------------------------------------------------------------------
1 | dist/
2 | pw-browsers/
3 | dist/
4 | coverage/
5 | node_modules/
6 |
7 | .yarnrc.yml
8 | .yarn/
9 |
--------------------------------------------------------------------------------
/src/helpers/projectRoot.ts:
--------------------------------------------------------------------------------
1 | import * as path from 'path';
2 |
3 | /** Directory two levels above the directory that contains this module. */
4 | const projectRoot = path.join(__dirname, '..', '..');
5 |
6 | export default projectRoot;
7 |
--------------------------------------------------------------------------------
/public/test-website/basic.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | A basic page
6 |
7 |
8 |
--------------------------------------------------------------------------------
/src/api/helpers/logger.ts:
--------------------------------------------------------------------------------
1 | import { log as mainLog } from '../../helpers/logger';
2 |
3 | /**
4 |  * Child of the main logger whose entries carry `svc: 'api '`.
5 |  * The trailing space in the tag is part of the value and is kept as-is.
6 |  */
7 | export const log = mainLog.child({ svc: 'api ' });
8 |
--------------------------------------------------------------------------------
/.prettierrc.js:
--------------------------------------------------------------------------------
1 | module.exports = {
2 | trailingComma: 'es5',
3 | tabWidth: 2,
4 | semi: true,
5 | singleQuote: true,
6 | printWidth: 80,
7 | }
8 |
--------------------------------------------------------------------------------
/src/api/@types/getHealthy.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Health snapshot of the service.
3 |  *
4 |  * NOTE(review): the per-field descriptions are inferred from the field names;
5 |  * confirm them against the code that builds this object.
6 |  */
7 | export interface GetHealthySuccess {
8 |   /** Whether the service considers itself ready. */
9 |   ready: boolean;
10 |   /** Presumably the number of tasks currently running. */
11 |   tasksRunning: number;
12 |   /** Presumably the number of browser pages currently open. */
13 |   pagesOpen: number;
14 |   /** Presumably the total number of tasks run so far. */
15 |   totalRun: number;
16 | }
17 |
--------------------------------------------------------------------------------
/.yarnrc.yml:
--------------------------------------------------------------------------------
1 | compressionLevel: mixed
2 |
3 | enableGlobalCache: false
4 |
5 | enableTelemetry: false
6 |
7 | nodeLinker: node-modules
8 |
9 | yarnPath: .yarn/releases/yarn-4.0.2.cjs
10 |
--------------------------------------------------------------------------------
/src/helpers/wait.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Resolve after `waitTime` milliseconds.
3 |  *
4 |  * Promise wrapper around the global `setTimeout`, so callers can simply
5 |  * `await wait(ms)`.
6 |  *
7 |  * @param waitTime - Delay in milliseconds.
8 |  * @returns A promise that resolves, with no value, once the timer fires.
9 |  */
10 | export function wait(waitTime: number): Promise<void> {
11 |   return new Promise<void>((resolve) => {
12 |     setTimeout(resolve, waitTime);
13 |   });
14 | }
15 |
--------------------------------------------------------------------------------
/public/test-website/js-redirect-hash.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/nodemon.json:
--------------------------------------------------------------------------------
1 | {
2 | "ignore": [
3 | ".git",
4 | "node_modules",
5 | "dist",
6 | "__tests__/"
7 | ],
8 | "watch": [
9 | "src"
10 | ],
11 | "exec": "yarn dev:run",
12 | "ext": "ts"
13 | }
14 |
--------------------------------------------------------------------------------
/public/test-website/js-redirect-path.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
--------------------------------------------------------------------------------
/public/test-website/meta-refresh.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Redirecting...
8 |
9 |
10 |
--------------------------------------------------------------------------------
/public/test-website/meta-refresh-5.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | Redirecting...
8 |
9 |
10 |
--------------------------------------------------------------------------------
/public/test-website/page-crash.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/public/test-website/js-redirect-history.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/public/test-website/login-double-password.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
--------------------------------------------------------------------------------
/src/api/routes/root.ts:
--------------------------------------------------------------------------------
1 | import type express from 'express';
2 |
3 | import type { GetRoot } from '../@types/getRoot';
4 |
5 | /**
6 |  * Root route: reports which version of the service is running.
7 |  *
8 |  * Always answers 200 with `{ version }`, where `version` is
9 |  * `process.env.VERSION`, falling back to `'dev'` when it is unset or empty.
10 |  *
11 |  * @param req - Incoming request (unused).
12 |  * @param res - Response, typed with the `GetRoot` body it sends.
13 |  */
14 | export function root(
15 |   req: express.Request,
16 |   res: express.Response<GetRoot>
17 | ): void {
18 |   // `||` rather than `??` on purpose: an empty VERSION also means "dev".
19 |   const body: GetRoot = { version: process.env.VERSION || 'dev' };
20 |   res.status(200).json(body);
21 | }
22 |
--------------------------------------------------------------------------------
/src/api/routes/ready.ts:
--------------------------------------------------------------------------------
1 | import type express from 'express';
2 |
3 | import { tasksManager } from '../../lib/singletons';
4 |
5 | /**
6 |  * Readiness route.
7 |  *
8 |  * Replies 200 when the tasks manager reports itself ready and 503 otherwise;
9 |  * the body is `{ ready }` in both cases.
10 |  */
11 | export function ready(req: express.Request, res: express.Response): void {
12 |   const { ready: isReady } = tasksManager.getHealth();
13 |   const statusCode = isReady ? 200 : 503;
14 |
15 |   res.status(statusCode).json({ ready: isReady });
16 | }
17 |
--------------------------------------------------------------------------------
/renovate.json:
--------------------------------------------------------------------------------
1 | {
2 | "extends": [
3 | "config:js-app",
4 | "github>algolia/renovate-config-algolia"
5 | ],
6 | "baseBranches": [
7 | "chore/renovateBaseBranch"
8 | ],
9 | "lockFileMaintenance": { "enabled": false },
10 | "automergeType": "branch",
11 | "prHourlyLimit": 2,
12 | "prConcurrentLimit": 5
13 | }
14 |
--------------------------------------------------------------------------------
/public/static/main.webmanifest:
--------------------------------------------------------------------------------
1 | {
2 | "name": "",
3 | "short_name": "",
4 | "icons": [
5 | {
6 | "src": "/android-chrome-192x192.png",
7 | "sizes": "192x192",
8 | "type": "image/png"
9 | }
10 | ],
11 | "theme_color": "#ffffff",
12 | "background_color": "#ffffff",
13 | "display": "standalone"
14 | }
15 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | node_modules/
2 | yarn-error.log
3 |
4 | dist/
5 | vendors/
6 |
7 | .env
8 |
9 | # Editor files
10 | .exrc
11 | .idea
12 |
13 | # https://yarnpkg.com/getting-started/qa#which-files-should-be-gitignored
14 | .yarn/*
15 | !.yarn/releases
16 | !.yarn/plugins
17 |
18 | .idea
19 | .DS_Store
20 | .vscode
21 | .scannerwork
22 | *~
23 |
--------------------------------------------------------------------------------
/jest.config.js:
--------------------------------------------------------------------------------
1 | // eslint-disable-next-line import/no-commonjs
2 | module.exports = {
3 | preset: 'ts-jest',
4 | testEnvironment: 'node',
5 | testPathIgnorePatterns: ['/node_modules/', '/dist/'],
6 | testMatch: ['/src/**/*.test.[jt]s'],
7 | globalSetup: '/jest.setup.ts',
8 | setupFiles: ['dotenv/config'],
9 | maxWorkers: 1,
10 | };
11 |
--------------------------------------------------------------------------------
/scripts/update_adblock_hosts.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | #
3 | # Download the adblock hosts list to dist/lib/browser/adblock_hosts.txt.
4 | # Exits 1 when the download fails.
5 |
6 | # adblock hosts file URL
7 | URL="https://raw.githubusercontent.com/badmojr/1Hosts/master/Pro/domains.txt"
8 |
9 | TARGET_DIR="./dist/lib/browser"
10 | TARGET_FILE="adblock_hosts.txt"
11 |
12 | # --fail: treat HTTP errors (404, 500, ...) as failures instead of saving the
13 | #         error page as the hosts list.
14 | # --silent --show-error: no progress meter, but still print why it failed.
15 | if curl --fail --silent --show-error -o "${TARGET_DIR}/${TARGET_FILE}" "$URL"; then
16 |   echo "✅ adblock hosts download successful."
17 | else
18 |   echo "❌ adblock hosts download failed."
19 |   exit 1
20 | fi
21 |
--------------------------------------------------------------------------------
/public/test-website/login-multiple-input.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/src/api/helpers/requestLogger.ts:
--------------------------------------------------------------------------------
1 | import type express from 'express';
2 |
3 | import { log } from './logger';
4 |
5 | /**
6 |  * Express middleware that logs every incoming request except those whose URL
7 |  * is exactly `/ready` or `/healthy`.
8 |  *
9 |  * The comparison is made against the raw `req.url`, so one of those URLs
10 |  * carrying a query string is still logged.
11 |  *
12 |  * NOTE(review): the whole request body is logged; if any route accepts
13 |  * credentials in its body they end up in the logs — confirm this is intended.
14 |  */
15 | export function requestLogger(
16 |   req: express.Request,
17 |   res: express.Response,
18 |   next: express.NextFunction
19 | ): void {
20 |   const isSilenced = ['/ready', '/healthy'].includes(req.url);
21 |
22 |   if (!isSilenced) {
23 |     log.info('Received', { method: req.method, path: req.url, body: req.body });
24 |   }
25 |
26 |   next();
27 | }
28 |
--------------------------------------------------------------------------------
/.github/workflows/renovate.yml:
--------------------------------------------------------------------------------
1 | name: Renovate
2 | on:
3 | schedule:
4 | - cron: '0 14 * * 5'
5 | workflow_dispatch:
6 |
7 | jobs:
8 | renovate:
9 | runs-on: ubuntu-latest
10 |
11 | steps:
12 | - name: Renovate Automatic Branch
13 | uses: bodinsamuel/renovate-automatic-branch@v1
14 | with:
15 | github-token: ${{ secrets.GITHUB_TOKEN }}
16 | repo-owner: algolia
17 | repo-name: renderscript
18 | branch-base: master
19 |
--------------------------------------------------------------------------------
/public/views/login-step1.ejs:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | 2-steps login form
6 |
7 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/public/test-website/js-redirect.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/public/views/login-step2.ejs:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | 2-steps login form
6 |
7 |
14 |
15 |
16 |
17 |
--------------------------------------------------------------------------------
/public/test-website/iframe.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | A basic page
8 |
9 |
10 |
11 |
12 |
13 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/public/test-website/slow.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
23 |
24 |
25 |
26 |
--------------------------------------------------------------------------------
/src/api/helpers/getForwardedHeaders.ts:
--------------------------------------------------------------------------------
1 | import type express from 'express';
2 |
3 | import { HEADERS_TO_FORWARD } from '../constants';
4 |
5 | /**
6 |  * Pick from the incoming request the headers listed in `HEADERS_TO_FORWARD`.
7 |  *
8 |  * Configured names are trimmed and lower-cased before the lookup, and the
9 |  * returned keys are those lower-cased names. Headers that are missing or
10 |  * empty are left out.
11 |  *
12 |  * @param req - Incoming Express request.
13 |  * @returns Map of lower-cased header name to header value.
14 |  */
15 | export function getForwardedHeadersFromRequest(
16 |   req: express.Request
17 | ): Record<string, string> {
18 |   const headersToForward: Record<string, string> = {};
19 |
20 |   for (const headerName of HEADERS_TO_FORWARD) {
21 |     const name = headerName.trim().toLowerCase();
22 |     const value = req.headers[name];
23 |     if (!value) {
24 |       continue;
25 |     }
26 |
27 |     // Node types a header value as `string | string[]`; collapse the array
28 |     // form so the declared string-valued return type actually holds.
29 |     headersToForward[name] = Array.isArray(value) ? value.join(', ') : value;
30 |   }
31 |
32 |   return headersToForward;
33 | }
34 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "target": "ES2022",
4 | "lib": [
5 | "dom",
6 | ],
7 | "module": "CommonJS",
8 | "strict": true,
9 | "outDir": "dist/",
10 | "types": [
11 | "node",
12 | "jest"
13 | ],
14 | "esModuleInterop": true,
15 | "allowSyntheticDefaultImports": true,
16 | "sourceMap": true,
17 | "declaration": true,
18 | "declarationMap": true
19 | },
20 | "include": [
21 | "src/**/*"
22 | ],
23 | "exclude": [
24 | "node_modules"
25 | ],
26 | "typeRoots": [
27 | "node_modules/@types"
28 | ]
29 | }
30 |
--------------------------------------------------------------------------------
/src/api/constants.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Names of the request headers to forward.
3 |  *
4 |  * Taken from the comma-separated `HEADERS_TO_FORWARD` environment variable;
5 |  * entries are trimmed and empty ones dropped, so `Cookie, Authorization`
6 |  * (with a space) works. Defaults to `Cookie` and `Authorization` when the
7 |  * variable is unset or empty.
8 |  */
9 | export const HEADERS_TO_FORWARD = process.env.HEADERS_TO_FORWARD
10 |   ? process.env.HEADERS_TO_FORWARD.split(',')
11 |       .map((name) => name.trim())
12 |       .filter((name) => name.length > 0)
13 |   : ['Cookie', 'Authorization'];
14 |
15 | // Only whitelist loading styles resources when testing
16 | // (will not change programmatic use of this system)
17 | export const CSP_HEADERS = [
18 |   "default-src 'none'",
19 |   "style-src * 'unsafe-inline'",
20 |   'img-src * data:',
21 |   'font-src *',
22 | ].join('; ');
23 |
24 | /** Cookie string carrying a fixed session token. */
25 | export const SESSION_COOKIE = 'sessionToken=53cu23_535510n';
26 |
27 | /** Cookie string that clears `sessionToken` by expiring it in the past. */
28 | export const DELETE_COOKIE =
29 |   'sessionToken=; expires=Thu, 01 Jan 1970 00:00:00 GMT';
30 |
--------------------------------------------------------------------------------
/src/helpers/promiseWithTimeout.ts:
--------------------------------------------------------------------------------
1 | /** Rejection reason used when `promiseWithTimeout` gives up waiting. */
2 | export class PromiseWithTimeoutError extends Error {}
3 |
4 | /**
5 |  * Await `promise`, but give up after `timeout` milliseconds.
6 |  *
7 |  * The given promise is raced against a timer. The timer is always cleared
8 |  * once the race settles, so it never keeps the process alive. The underlying
9 |  * promise is not cancelled when the timeout wins; it simply stops being
10 |  * awaited here.
11 |  *
12 |  * @param promise - Promise to wait for.
13 |  * @param timeout - Maximum time to wait, in milliseconds.
14 |  * @returns Whatever `promise` resolves to.
15 |  * @throws PromiseWithTimeoutError when the timer fires first; otherwise any
16 |  * rejection of `promise` is propagated unchanged.
17 |  */
18 | export async function promiseWithTimeout<T>(
19 |   promise: Readonly<Promise<T>>,
20 |   timeout: number
21 | ): Promise<T> {
22 |   let timeoutId: ReturnType<typeof setTimeout> | undefined = undefined;
23 |   // This promise only ever rejects, hence `never`.
24 |   const timeoutPromise = new Promise<never>((_resolve, reject) => {
25 |     timeoutId = setTimeout(() => {
26 |       reject(new PromiseWithTimeoutError('Renderscript Controlled Timeout'));
27 |     }, timeout);
28 |   });
29 |
30 |   try {
31 |     return await Promise.race([promise, timeoutPromise]);
32 |   } finally {
33 |     clearTimeout(timeoutId);
34 |   }
35 | }
36 |
--------------------------------------------------------------------------------
/src/api/helpers/errors.ts:
--------------------------------------------------------------------------------
1 | import type express from 'express';
2 |
3 | /** Arguments of the generic error responder. */
4 | interface AnyParams {
5 |   res: express.Response;
6 |   status: number;
7 |   message: string;
8 |   details?: any;
9 | }
10 |
11 | /**
12 |  * Send an error response with the given HTTP status.
13 |  * The JSON body is `{ error: true, message, details }`.
14 |  */
15 | function any(params: AnyParams): void {
16 |   const { res, status, message, details } = params;
17 |   const body = { error: true, message, details };
18 |
19 |   res.status(status).json(body);
20 | }
21 |
22 | /** Arguments of `badRequest`; `message` defaults to `'Bad Request'`. */
23 | interface BadRequestParams {
24 |   res: express.Response;
25 |   message?: string;
26 |   details?: any;
27 | }
28 |
29 | /**
30 |  * Reply with HTTP 400.
31 |  *
32 |  * @param params - Response object plus an optional message and details.
33 |  */
34 | export function badRequest(params: BadRequestParams): void {
35 |   const { res, details, message = 'Bad Request' } = params;
36 |
37 |   any({ res, status: 400, message, details });
38 | }
39 |
--------------------------------------------------------------------------------
/release.config.js:
--------------------------------------------------------------------------------
1 | /* eslint-disable import/no-commonjs */
2 | /* eslint-disable no-template-curly-in-string */
3 | module.exports = {
4 | branch: 'master',
5 | verifyConditions: ['@semantic-release/github'],
6 | prepare: [
7 | {
8 | path: '@semantic-release/changelog',
9 | changelogFile: 'CHANGELOG.md',
10 | },
11 | '@semantic-release/npm',
12 | {
13 | path: '@semantic-release/git',
14 | assets: ['package.json', 'CHANGELOG.md'],
15 | message:
16 | 'chore(release): ${nextRelease.version} [skip ci]\n\n${nextRelease.notes}',
17 | },
18 | ],
19 | publish: '@semantic-release/github',
20 | success: [],
21 | fail: [],
22 | npmPublish: false,
23 | };
24 |
--------------------------------------------------------------------------------
/src/lib/singletons.ts:
--------------------------------------------------------------------------------
1 | import { report } from '../helpers/errorReporting';
2 | import { log } from '../helpers/logger';
3 |
4 | import { TasksManager } from './TasksManager';
5 | import { Adblocker } from './browser/Adblocker';
6 |
7 | /** Process-wide tasks manager instance. */
8 | export const tasksManager = new TasksManager();
9 | /** Process-wide adblocker instance. */
10 | export const adblocker = new Adblocker();
11 |
12 | /**
13 |  * Launch the tasks manager, then load the adblocker list.
14 |  *
15 |  * Never rejects: any failure is reported, "Exit" is logged and the process
16 |  * is terminated with exit code 1.
17 |  */
18 | export async function init(): Promise<void> {
19 |   try {
20 |     await tasksManager.launch();
21 |     await adblocker.load();
22 |   } catch (err: any) {
23 |     report(new Error('Error during launch'), { err });
24 |
25 |     log.info('Exit');
26 |     // Exit from a timer rather than synchronously.
27 |     // NOTE(review): presumably to give the report/log calls above a chance to
28 |     // flush — confirm.
29 |     setTimeout(() => {
30 |       // eslint-disable-next-line no-process-exit
31 |       process.exit(1);
32 |     }, 1);
33 |   }
34 | }
35 |
--------------------------------------------------------------------------------
/jest.setup.ts:
--------------------------------------------------------------------------------
1 | import { request } from 'undici';
2 |
3 | import { wait } from './src/helpers/wait';
4 |
5 | const READY_URL = 'http://localhost:3000/ready';
6 | const MAX_ATTEMPTS = 50;
7 | const RETRY_DELAY_MS = 1000;
8 |
9 | /**
10 |  * Jest global setup: block until the local API answers 200 on `/ready`.
11 |  *
12 |  * Polls up to `MAX_ATTEMPTS` times, pausing `RETRY_DELAY_MS` between attempts.
13 |  * The pause happens only between attempts, so a successful probe returns
14 |  * immediately and the final failure is reported without a trailing wait.
15 |  *
16 |  * @throws Error when the API never reports ready.
17 |  */
18 | export default async function setup(): Promise<void> {
19 |   for (let attempt = 1; attempt <= MAX_ATTEMPTS; attempt += 1) {
20 |     try {
21 |       const { statusCode } = await request(READY_URL);
22 |       console.log('API statusCode:', statusCode, `(retries: ${attempt})`);
23 |
24 |       if (statusCode === 200) {
25 |         console.log('API Ready');
26 |         return;
27 |       }
28 |     } catch (err: unknown) {
29 |       // Connection errors are expected while the API is still booting.
30 |       console.log(err instanceof Error ? err.message : err);
31 |     }
32 |
33 |     if (attempt < MAX_ATTEMPTS) {
34 |       await wait(RETRY_DELAY_MS);
35 |     }
36 |   }
37 |
38 |   throw new Error('API did not reach ready status');
39 | }
40 |
--------------------------------------------------------------------------------
/src/helpers/stats.ts:
--------------------------------------------------------------------------------
1 | import { StatsD } from 'hot-shots';
2 |
3 | import { report } from './errorReporting';
4 |
5 | /**
6 |  * StatsD client.
7 |  *
8 |  * Host and metric prefix come from `DOGSTATSD_HOST` / `DOGSTATSD_PREFIX`,
9 |  * defaulting to `localhost` and `alg.crawler.`. The client is created with
10 |  * `mock: true` unless `NODE_ENV` is `production`. Client errors are passed to
11 |  * the error reporter.
12 |  */
13 | const client = new StatsD({
14 |   host: process.env.DOGSTATSD_HOST || 'localhost',
15 |   port: 8125,
16 |   prefix: process.env.DOGSTATSD_PREFIX || 'alg.crawler.',
17 |   mock: process.env.NODE_ENV !== 'production',
18 |   globalTags: {
19 |     env: process.env.NODE_ENV === 'production' ? 'prod' : 'dev',
20 |   },
21 |   errorHandler(error: Error): void {
22 |     report(error);
23 |   },
24 | });
25 |
26 | /**
27 |  * Close the StatsD client.
28 |  *
29 |  * @returns A promise that resolves once the client has closed and rejects
30 |  * with the error the client passes to its close callback, if any.
31 |  */
32 | export function close(): Promise<void> {
33 |   return new Promise<void>((resolve, reject) => {
34 |     client.close((err) => {
35 |       if (err) {
36 |         reject(err);
37 |         return;
38 |       }
39 |       resolve();
40 |     });
41 |   });
42 | }
43 |
44 | export const stats = client;
45 |
--------------------------------------------------------------------------------
/src/api/helpers/alt.ts:
--------------------------------------------------------------------------------
1 | import Altheia from 'altheia-async-data-validator';
2 |
3 | export const alt = Altheia.instance();
4 | alt.lang('protocol_not_allowed', () => 'Only HTTP protocol is allowed');
5 |
6 | /**
7 |  * Build the validators for the request parameters listed below.
8 |  *
9 |  * - `url`: required; must be a URL whose protocol is `http:` or `https:`.
10 |  * - `ua`: required string.
11 |  * - `waitTime`: optional object with `min` (1000-19000) and `max`
12 |  *   (2000-20000), both cast to numbers.
13 |  * - `adblock`: optional, cast to boolean.
14 |  * - `browser`: optional string.
15 |  *
16 |  * NOTE(review): the value type of the returned record was lost from this
17 |  * listing; `any` is used so existing callers keep compiling — tighten it to
18 |  * the validator type if one is exported.
19 |  *
20 |  * @returns A fresh set of validators on every call.
21 |  */
22 | // eslint-disable-next-line @typescript-eslint/no-explicit-any
23 | export function getDefaultParams(): Record<string, any> {
24 |   return {
25 |     url: alt
26 |       .internet()
27 |       .url()
28 |       .custom('protocol_not_allowed', (val) => {
29 |         // `new URL` throws on unparsable input; treat that as "not allowed"
30 |         // instead of letting the exception escape the validator.
31 |         try {
32 |           return ['http:', 'https:'].includes(new URL(val).protocol);
33 |         } catch {
34 |           return false;
35 |         }
36 |       })
37 |       .required(),
38 |     ua: alt.string().required(),
39 |     waitTime: alt.object().schema(
40 |       alt({
41 |         min: alt.number().cast().min(1000).max(19000),
42 |         max: alt.number().cast().min(2000).max(20000),
43 |       })
44 |     ),
45 |     adblock: alt.boolean().cast(),
46 |     browser: alt.string(),
47 |   };
48 | }
49 |
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | # Allow calls on localhost IPs
2 | ALLOW_LOCALHOST="true"
3 |
4 | # Change the minimum level of the log that will be output
5 | LOG_LEVEL="info"
6 |
7 | # Comma-separated list of prefixes to whitelist when `ALLOW_LOCALHOST` is set to true.
8 | # Example: `IP_PREFIXES_WHITELIST=127.,0.,::1` (these are the default values used when the variable is not provided alongside `ALLOW_LOCALHOST`)
9 | IP_PREFIXES_WHITELIST=
10 |
11 | # Comma-separated list of headers to forward on navigation request
12 | # Example: `HEADERS_TO_FORWARD=Cookie,Authorization` (default value)
13 | HEADERS_TO_FORWARD=
14 |
15 | # Report errors to this Sentry URL.
16 | SENTRY_DSN=
17 |
18 | # Login credentials for testing
19 | # example: LOGIN_CREDENTIALS={"login.live.com":{"username":"FOOBAR@outlook.com","password":"FOOBAR"}}
20 | LOGIN_CREDENTIALS=
21 |
--------------------------------------------------------------------------------
/src/api/routes/list.ts:
--------------------------------------------------------------------------------
1 | import type express from 'express';
2 |
3 | import { tasksManager } from '../../lib/singletons';
4 | import type { GetListSuccess } from '../@types/getList';
5 |
6 | /**
7 |  * List currently opened pages.
8 |  * Useful to debug non-killed page.
9 |  *
10 |  * Replies 200 with `{ open }`, where `open` maps each browser engine to the
11 |  * URLs of every page in every context of that browser. `chromium` and
12 |  * `firefox` are always present, even when empty. Browsers that have no
13 |  * running instance are skipped instead of crashing the request.
14 |  */
15 | export function list(
16 |   req: express.Request,
17 |   res: express.Response<GetListSuccess>
18 | ): void {
19 |   const open: GetListSuccess['open'] = {
20 |     chromium: [],
21 |     firefox: [],
22 |   };
23 |
24 |   tasksManager.currentBrowsers.forEach((browser, engine) => {
25 |     const instance = browser?.instance;
26 |     if (!instance) {
27 |       return;
28 |     }
29 |
30 |     // Tolerate an engine that is not pre-seeded above.
31 |     if (!open[engine]) {
32 |       open[engine] = [];
33 |     }
34 |
35 |     for (const ctx of instance.contexts()) {
36 |       for (const page of ctx.pages()) {
37 |         open[engine].push(page.url());
38 |       }
39 |     }
40 |   });
41 |
42 |   res.status(200).json({ open });
43 | }
44 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | ## Running it locally
4 |
5 | Development:
6 |
7 | ```sh
8 | yarn
9 | yarn dev
10 | ```
11 |
12 | Docker image:
13 |
14 | ```sh
15 | yarn docker:build
16 | docker run -p 23000:3000 algolia/renderscript
17 | open "http://localhost:23000/render?url=https%3A%2F%2Fwww.algolia.com&ua=Test+Renderscript"
18 | ```
19 |
20 | ### Env Variables
21 |
22 | See `.env.example`
23 |
24 | ## Releasing
25 |
26 | Releases are built using GitHub Actions. You can release a new version by triggering the [Release Version](https://github.com/algolia/renderscript/actions/workflows/release.yml) workflow.
27 |
28 | ### Manual Release Locally
29 |
30 | ```sh
31 | yarn docker:build
32 |
33 | docker push "algolia/renderscript"
34 | docker push "algolia/renderscript:${VERSION}"
35 | docker push "algolia/renderscript:${GIT_HASH}"
36 | ```
37 |
--------------------------------------------------------------------------------
/scripts/start.sh:
--------------------------------------------------------------------------------
1 | #! /bin/sh
2 | #
3 | # Start the API. When $EXTENSIONS is set, the API runs against an Xvfb virtual
4 | # display, and on INT/TERM both processes are stopped, API first.
5 |
6 | if [ -f .env ]; then
7 |   # `source` is a bashism; POSIX sh only guarantees `.`, and the explicit `./`
8 |   # keeps the lookup from going through $PATH.
9 |   . ./.env
10 | fi
11 |
12 | if [ -z "$EXTENSIONS" ]; then
13 |   # Headless Chrome, just launch the API
14 |   node dist/api/index.js
15 | else
16 |   cleanup() {
17 |     echo "start.sh: Gracefully exiting"
18 |
19 |     # Kill the API first, then XVFB
20 |     kill -TERM "$api_pid"
21 |     wait "$api_pid" >/dev/null || true
22 |
23 |     echo "start.sh: Gracefully exited node process"
24 |
25 |     kill -TERM "$xvfb_pid"
26 |     wait "$xvfb_pid" >/dev/null || true
27 |
28 |     echo "start.sh: Gracefully exited xvfb"
29 |   }
30 |
31 |   trap cleanup INT TERM
32 |
33 |   DISPLAY=:95
34 |
35 |   Xvfb "$DISPLAY" -screen 0 1920x1080x16 &
36 |   xvfb_pid=$!
37 |   DISPLAY="$DISPLAY" node dist/api/index.js &
38 |   api_pid=$!
39 |
40 |   wait "$api_pid"
41 |   wait "$xvfb_pid"
42 | fi
43 |
--------------------------------------------------------------------------------
/scripts/build.sh:
--------------------------------------------------------------------------------
1 | #! /bin/sh
2 | #
3 | # Build the renderscript Docker image, tagged with the package version, the
4 | # current git commit hash and `latest`.
5 |
6 | set -ex
7 |
8 | hash="$(git rev-parse HEAD)"
9 | current="$(node -p "require('./package.json').version")"
10 | playwright_version="$(node -p 'require("./package.json").dependencies.playwright')"
11 | echo "Releasing: $current ; Playwright version: $playwright_version"
12 | echo ""
13 |
14 | # Build renderscript
15 |
16 | # The platform below is the one used for deploys (linux/amd64).
17 | # To run the image locally on an Apple M1, change it to linux/arm64/v8.
18 | docker buildx build \
19 |   --platform linux/amd64 \
20 |   --progress plain \
21 |   -t algolia/renderscript \
22 |   -t "algolia/renderscript:${current}" \
23 |   -t "algolia/renderscript:${hash}" \
24 |   -t "algolia/renderscript:latest" \
25 |   --build-arg "VERSION=${current}" \
26 |   --build-arg "PLAYWRIGHT_VERSION=${playwright_version}" \
27 |   --load \
28 |   .
29 |
--------------------------------------------------------------------------------
/src/lib/browser/TimeBudget.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Tracks how much of a fixed time allowance (in milliseconds) has been used.
3 |  *
4 |  * Time is only accounted when `consume()` is called: each call adds the time
5 |  * elapsed since the previous call (or since construction) to `consumed`.
6 |  */
7 | export class TimeBudget {
8 |   /** Total allowance, in milliseconds. */
9 |   max: number;
10 |   /** Milliseconds accounted so far. */
11 |   consumed: number = 0;
12 |   /** Timestamp (`Date.now()`) of the last accounting. */
13 |   lastConsumption: number = Date.now();
14 |
15 |   constructor(max: number) {
16 |     this.max = max;
17 |   }
18 |
19 |   /**
20 |    * Consume budget.
21 |    *
22 |    * The clock is read once, so the time between two reads can no longer slip
23 |    * through unaccounted.
24 |    *
25 |    * @returns Number - What was consumed compared to prev call.
26 |    */
27 |   consume(): number {
28 |     const now = Date.now();
29 |     const elapsed = now - this.lastConsumption;
30 |
31 |     this.consumed += elapsed;
32 |     this.lastConsumption = now;
33 |     return elapsed;
34 |   }
35 |
36 |   /**
37 |    * @returns Remaining budget in milliseconds, never less than 1.
38 |    */
39 |   get(): number {
40 |     // Not 0, because 0 === unlimited
41 |     return Math.max(1, this.max - this.consumed);
42 |   }
43 |
44 |   /**
45 |    * @returns The remaining budget, raised to `min` when less is left.
46 |    */
47 |   min(min: number): number {
48 |     return Math.max(min, this.get());
49 |   }
50 |
51 |   /**
52 |    * @returns The remaining budget clamped to `[min, max]`; `min` wins when
53 |    * the two bounds conflict.
54 |    */
55 |   getRange(min: number, max: number): number {
56 |     return Math.max(min, Math.min(max, this.get()));
57 |   }
58 | }
59 |
--------------------------------------------------------------------------------
/public/test-website/async.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
12 |
27 |
28 |
29 |
30 |
--------------------------------------------------------------------------------
/src/lib/helpers/injectBaseHref.ts:
--------------------------------------------------------------------------------
1 | /**
2 |  * Injects a `<base>` tag which allows other resources to load on the
3 |  * page without trying to get them from the `renderscript` server.
4 |  * It has no effect on serialised output, but allows it to verify render
5 |  * quality.
6 |  *
7 |  * When the document already has a `<base>`, only its first occurrence is
8 |  * considered, and it is patched only if its `href` is root-relative
9 |  * (`/path`). Protocol-relative hrefs (`//host/path`) already name a host and
10 |  * are left untouched, as is every other form.
11 |  *
12 |  * The function touches nothing outside its own body and the global
13 |  * `document`.
14 |  * NOTE(review): presumably because it is evaluated inside the rendered page —
15 |  * confirm against the caller before adding outside references.
16 |  *
17 |  * @param origin - Origin to use as the base, e.g. `https://example.com`.
18 |  */
19 | export function injectBaseHref(origin: string): void {
20 |   const existing = document.head.querySelector('base');
21 |
22 |   if (!existing) {
23 |     // Only inject if it doesn't already exist.
24 |     const base = document.createElement('base');
25 |     base.setAttribute('href', origin);
26 |     document.head.insertAdjacentElement('afterbegin', base);
27 |     return;
28 |   }
29 |
30 |   // Patch existing if it is relative.
31 |   const existingHref = existing.getAttribute('href') || '';
32 |   const isRootRelative =
33 |     existingHref.startsWith('/') && !existingHref.startsWith('//');
34 |   if (isRootRelative) {
35 |     existing.setAttribute('href', origin + existingHref);
36 |   }
37 | }
38 |
--------------------------------------------------------------------------------
/src/lib/helpers/validateURL.ts:
--------------------------------------------------------------------------------
1 | import { validateURL } from '@algolia/dns-filter';
2 |
3 | import { report } from '../../helpers/errorReporting';
4 | import { VALIDATE_URL_IGNORED_ERRORS } from '../browser/constants';
5 | import { RESTRICTED_IPS } from '../constants';
6 |
7 | /**
8 |  * Decide whether `url` may be requested.
9 |  *
10 |  * A URL is rejected when it cannot be parsed, or when `validateURL` (checked
11 |  * against `RESTRICTED_IPS`) fails with a message that contains none of the
12 |  * `VALIDATE_URL_IGNORED_ERRORS` entries. Every rejection is reported.
13 |  * Validation failures whose message matches an ignored entry are treated as
14 |  * allowed.
15 |  *
16 |  * @param url - URL to check.
17 |  * @returns `true` when the URL is allowed, `false` otherwise. Never rejects.
18 |  */
19 | export async function isURLAllowed(url: string): Promise<boolean> {
20 |   try {
21 |     // Check for valid URL before validation
22 |     // eslint-disable-next-line no-new
23 |     new URL(url);
24 |   } catch (e) {
25 |     report(new Error('Invalid url'), { url, err: e });
26 |     return false;
27 |   }
28 |
29 |   try {
30 |     await validateURL({
31 |       url,
32 |       ipPrefixes: RESTRICTED_IPS,
33 |     });
34 |     return true;
35 |   } catch (err: any) {
36 |     // A rejection is not guaranteed to be an Error with a string message.
37 |     const message = typeof err?.message === 'string' ? err.message : String(err);
38 |     const isIgnored = VALIDATE_URL_IGNORED_ERRORS.some((msg) =>
39 |       message.includes(msg)
40 |     );
41 |     if (isIgnored) {
42 |       return true;
43 |     }
44 |
45 |     report(new Error('Blocked url'), { err, url });
46 |     return false;
47 |   }
48 | }
49 |
--------------------------------------------------------------------------------
/public/views/login.ejs:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Protected login form
6 |
7 |
17 |
18 |
25 |
26 |
27 |
28 |
--------------------------------------------------------------------------------
/src/lib/browser/Adblocker.ts:
--------------------------------------------------------------------------------
1 | import { promises as fs } from 'fs';
2 |
3 | import { report } from '../../helpers/errorReporting';
4 | import { log as mainLog } from '../../helpers/logger';
5 |
6 | const log = mainLog.child({ svc: 'adbk' });
7 |
8 | /**
9 |  * Dead simple adblocking by exact hostname.
10 |  */
11 | export class Adblocker {
12 |   /** Blocked hostnames, one per non-comment line of the hosts file. */
13 |   #hostnames: Set<string> = new Set();
14 |
15 |   /**
16 |    * Read `adblock_hosts.txt` (next to this module) into memory.
17 |    *
18 |    * Lines starting with `#` and blank lines are skipped. Blank lines matter:
19 |    * storing `''` would make every URL with an empty hostname match.
20 |    * Never rejects; a read failure is reported and the list stays as it was.
21 |    */
22 |   async load(): Promise<void> {
23 |     try {
24 |       const data = await fs.readFile(`${__dirname}/adblock_hosts.txt`, 'utf8');
25 |
26 |       for (const rawLine of data.split(/[\r\n]+/)) {
27 |         const hostname = rawLine.trim();
28 |         if (hostname.length === 0 || hostname.startsWith('#')) {
29 |           continue;
30 |         }
31 |         this.#hostnames.add(hostname);
32 |       }
33 |
34 |       log.info('Ready', {
35 |         entries: this.#hostnames.size,
36 |       });
37 |     } catch (err: any) {
38 |       report(new Error('Error while setting up adblocker'), { err });
39 |     }
40 |   }
41 |
42 |   /**
43 |    * @param url - URL of the request to check.
44 |    * @returns `true` when the URL's hostname is exactly one of the loaded
45 |    * entries.
46 |    */
47 |   match(url: URL): boolean {
48 |     return this.#hostnames.has(url.hostname);
49 |   }
50 | }
51 |
--------------------------------------------------------------------------------
/src/lib/constants.ts:
--------------------------------------------------------------------------------
1 | import { PRIVATE_IP_PREFIXES } from '@algolia/dns-filter';
2 |
/**
 * IP prefixes removed from the restricted list when localhost is allowed.
 * Read from the comma-separated `IP_PREFIXES_WHITELIST` environment variable;
 * entries are trimmed and empty ones dropped so `"127., 10."` behaves like
 * `"127.,10."`. Defaults to the loopback prefixes.
 */
export const IP_PREFIXES_WHITELIST = process.env.IP_PREFIXES_WHITELIST
  ? process.env.IP_PREFIXES_WHITELIST.split(',')
      .map((prefix) => prefix.trim())
      .filter((prefix) => prefix !== '')
  : ['127.', '0.', '::1'];
6 |
/**
 * IP prefixes handed to the URL validator as forbidden.
 * With `ALLOW_LOCALHOST=true` the whitelisted prefixes are removed from
 * `PRIVATE_IP_PREFIXES` (relaxed filtering); otherwise the list is used as is.
 */
const allowLocalhost = process.env.ALLOW_LOCALHOST === 'true';

export const RESTRICTED_IPS = allowLocalhost
  ? PRIVATE_IP_PREFIXES.filter((prefix: string) => {
      const isWhitelisted = IP_PREFIXES_WHITELIST.includes(prefix);
      return !isWhitelisted;
    })
  : PRIVATE_IP_PREFIXES;
13 |
/** Resource type names listed as ignored. */
export const IGNORED_RESOURCES = [
  'font',
  'image',
  'media',
  'websocket',
  'manifest',
  'texttrack',
];

/** Matches strings starting with the `data:` scheme, case-insensitively. */
export const DATA_REGEXP = /^data:/i;

/** Lower and upper bounds for wait times (presumably milliseconds — confirm at call sites). */
export const WAIT_TIME = {
  min: 500,
  max: 20000,
};

const DEFAULT_MAX_WAIT_FOR_NEW_PAGE = 6000; // In feb 2022 p95 < 6s

const parsedMaxWaitForNewPage = parseInt(
  process.env.MAX_WAIT_FOR_NEW_PAGE ?? '',
  10
);

/**
 * Value of the `MAX_WAIT_FOR_NEW_PAGE` environment variable, parsed as a
 * base-10 integer. Falls back to the default when the variable is unset,
 * empty or not a number, so a typo cannot turn the derived values into `NaN`.
 */
export const MAX_WAIT_FOR_NEW_PAGE = Number.isNaN(parsedMaxWaitForNewPage)
  ? DEFAULT_MAX_WAIT_FOR_NEW_PAGE
  : parsedMaxWaitForNewPage;

/** Three times the sum of the new-page wait and the maximum wait time. */
export const UNHEALTHY_TASK_TTL = (MAX_WAIT_FOR_NEW_PAGE + WAIT_TIME.max) * 3;
35 |
--------------------------------------------------------------------------------
/src/helpers/waitForPendingRequests.ts:
--------------------------------------------------------------------------------
1 | import { setTimeout } from 'timers/promises';
2 |
3 | import type { BrowserPage } from '../lib/browser/Page';
4 |
5 | import { log } from './logger';
6 |
// waitForNavigation({ waitUntil: 'networkidle' }) or waitForLoadState('networkidle')
// can be flaky and return too soon:
// https://github.com/microsoft/playwright/issues/4664#issuecomment-742691215
// https://github.com/microsoft/playwright/issues/2515#issuecomment-724163391
// This helper lets us wait manually while the page still has pending requests.
/**
 * Poll `page.pendingRequests` until it reaches zero or `timeout` elapses.
 *
 * @param page - Page whose pending request counter is polled.
 * @param timeout - Maximum time to wait, in milliseconds.
 * @returns Resolves once nothing is pending or the timeout is reached; never
 * rejects because of the timeout itself.
 */
export async function waitForPendingRequests(
  page: BrowserPage,
  timeout: number
): Promise<void> {
  const pollIntervalMs = 1000;
  const startTime = Date.now();

  while (page.pendingRequests > 0) {
    const elapsed = Date.now() - startTime;
    // Written as a negated `<` so a NaN timeout stops the loop, as before.
    if (!(elapsed < timeout)) {
      break;
    }

    log.debug(
      { pageUrl: page.ref?.url() },
      `Waiting for ${page.pendingRequests} requests to complete... Wait time:${elapsed}, timeout: ${timeout}`
    );
    // Never sleep past the deadline: a full interval could overshoot the
    // requested timeout by almost one second.
    await setTimeout(Math.min(pollIntervalMs, timeout - elapsed));
  }
}
27 |
--------------------------------------------------------------------------------
/src/api/helpers/buildUrl.ts:
--------------------------------------------------------------------------------
1 | import { report } from '../../helpers/errorReporting';
2 |
// Hostname swapped with `localhost` by `buildUrl` / `revertUrl`.
const DOCKER_LOCALHOST = 'host.docker.internal';

// Host rewriting is active only when USE_DOCKER_LOCALHOST is exactly 'true'.
const USE_DOCKER_LOCALHOST = process.env.USE_DOCKER_LOCALHOST === 'true';
6 |
/**
 * Replace the hostname of `url` when it is exactly `from`, keeping the port.
 *
 * The match is anchored: `from` must be the whole hostname, optionally
 * followed by `:port`. `from` is matched literally; regex metacharacters such
 * as the dots in `host.docker.internal` are escaped.
 *
 * @param url - URL to rewrite. It is mutated in place.
 * @param from - Hostname to look for.
 * @param to - Hostname to use instead.
 * @returns The same `url` instance.
 */
export function replaceHost(url: URL, from: string, to: string): URL {
  const escapedFrom = from.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const fromRegex = new RegExp(`^${escapedFrom}(:|$)`);
  const host = url.host || '';
  // A replacer function keeps `$` sequences in `to` from being interpreted.
  // eslint-disable-next-line no-param-reassign
  url.host = host.replace(
    fromRegex,
    (_match: string, separator: string) => `${to}${separator}`
  );
  return url;
}
14 |
/**
 * Parse `href` and, when docker host rewriting is enabled, map the docker
 * host back to `localhost`.
 *
 * @param href - URL string, or `null`.
 * @returns The parsed URL, or `null` when `href` is empty or cannot be
 * parsed (the latter is reported).
 */
export function revertUrl(href: string | null): URL | null {
  if (!href) return null;

  try {
    const parsed = new URL(href);
    return USE_DOCKER_LOCALHOST
      ? replaceHost(parsed, DOCKER_LOCALHOST, 'localhost')
      : parsed;
  } catch {
    report(new Error('invalid revertUrl'), { href });
    return null;
  }
}
31 |
/**
 * Parse `href` and, when docker host rewriting is enabled, point `localhost`
 * at the docker host.
 *
 * @param href - URL string to parse.
 * @returns The parsed (and possibly rewritten) URL.
 * @throws When `href` is not a valid URL: the `URL` constructor error is not
 * caught here.
 */
export function buildUrl(href: string): URL {
  const parsed = new URL(href);
  return USE_DOCKER_LOCALHOST
    ? replaceHost(parsed, 'localhost', DOCKER_LOCALHOST)
    : parsed;
}
39 |
--------------------------------------------------------------------------------
/.dockerignore:
--------------------------------------------------------------------------------
1 | # Only ignore files / folders here that are also in .gitignore
2 | #
3 | # This file uses the same format as .gitignore,
4 | # except that it's not recursive by default
5 | #
6 | # | .gitignore | .dockerignore |
7 | # |------------|---------------|
8 | # | .DS_Store | **/.DS_Store |
9 | # | /.env | .env |
10 | #
11 | # Only bother including:
12 | # * big files / folder to lower the build context
13 | # * often updated files to help the docker cache
14 |
15 | # Dependencies
16 | **/node_modules
17 | dist/
18 | .env
19 |
20 | # Logs
21 | **/*.log
22 |
23 | .git/
24 | .github/
25 | .githooks/
26 | .circleci/
27 | .nodemon.json
28 | .editorconfig
29 | .gitattributes
30 | .prettierignore
31 | .prettierrc.js
32 | .eslintrc.js
33 | .nvmrc
34 | .npmrc
35 | .eslintignore
36 | .eslinrcjs
37 | .vscode
38 | .env.example
39 | .yarn/cache/
40 | release.config.js
41 | nodemon.json
42 | cypress.json
43 | README.md
44 | CHANGELOG.md
45 | CONTRIBUTING.md
46 | **/*.test.ts
47 | renovate.json
48 | **/jest*
49 | **/.DS_Store
50 | **/.storybook/
51 | **/__fixtures__/
52 | **/__snapshots__/
53 | **/__mocks__/
54 | **/__mock__/
55 | **/__tests__/
56 | **/tsconfig.tsbuildinfo
57 |
--------------------------------------------------------------------------------
/src/helpers/logger.ts:
--------------------------------------------------------------------------------
1 | import { pino } from 'pino';
2 |
const isProd = process.env.NODE_ENV === 'production';

/**
 * Application logger.
 *
 * Level comes from `LOG_LEVEL` (default `info`). Outside production the
 * output goes through `pino-pretty`.
 *
 * Every call is normalised by the `logMethod` hook into a single object
 * `{ msg, data, err? }`:
 * - string arguments are concatenated into `msg`;
 * - an `Error` argument, or the `err` / `error` property of an object
 *   argument, becomes `err`;
 * - every other object property is merged into `data`.
 */
export const log = pino({
  level: process.env.LOG_LEVEL || 'info',
  timestamp: true,
  base: {},
  formatters: {
    level(label) {
      return { level: label };
    },
  },
  hooks: {
    // By default pino does Sprintf instead we merge objects.
    logMethod(args, method) {
      const final: Record<string, any> = { msg: '', data: {} };
      for (const m of args as unknown[]) {
        if (typeof m === 'string') {
          final.msg += m;
          continue;
        }
        // `typeof null === 'object'`: skip nullish and primitive arguments
        // instead of crashing on a property access.
        if (m === null || typeof m !== 'object') {
          continue;
        }
        if (m instanceof Error) {
          final.err = m;
          continue;
        }

        const { err, error, ...rest } = m as Record<string, any>;
        if (err || error) {
          final.err = err || error;
          // Keep the sibling keys (e.g. `url`) instead of dropping them.
          final.data = { ...final.data, ...rest };
        } else {
          final.data = { ...final.data, ...(m as Record<string, any>) };
        }
      }
      method.apply(this, [final as unknown as string]);
    },
  },
  prettifier: !isProd,
  transport: !isProd
    ? {
        target: 'pino-pretty',
        options: {
          colorize: true,
          singleLine: true,
          messageFormat: '{svc} \x1B[37m{msg}',
          translateTime: 'HH:MM',
          ignore: 'svc',
        },
      }
    : undefined,
});
42 |
--------------------------------------------------------------------------------
/src/helpers/errorReporting.ts:
--------------------------------------------------------------------------------
1 | import * as Sentry from '@sentry/node';
2 |
3 | import { log } from './logger';
4 |
// Tag keys exported for callers of `report`.
// NOTE(review): presumably used as `SentryTag.key` values — confirm at call sites.
export const RENDERSCRIPT_TASK_URL_TAG = 'renderscript:task:url';
export const RENDERSCRIPT_TASK_TYPE_TAG = 'renderscript:task:type';

// Key/value pair attached as a tag to a reported error (see `report`).
type SentryTag = {
  key: string;
  value: string;
};

// Sentry is initialised as a side effect of importing this module.
// `report` only forwards to Sentry when SENTRY_DSN is set.
Sentry.init({
  dsn: process.env.SENTRY_DSN,
  release: process.env.npm_package_version,
  // CLUSTER_NAME takes precedence over NODE_ENV when both are set.
  environment: process.env.CLUSTER_NAME || process.env.NODE_ENV,
  serverName: 'renderscript',
  ignoreErrors: [],
  maxBreadcrumbs: 10,
});
21 |
/**
 * Report an error.
 *
 * Without `SENTRY_DSN` the error and its extras are only printed with
 * `console.error`. Otherwise the error is logged and sent to Sentry inside a
 * temporary scope carrying `extra` and `tags`.
 *
 * @param err - Error to report.
 * @param extra - Additional context attached to the event.
 * @param tags - Tags attached to the event.
 */
export function report(
  err: Error,
  extra: any = {},
  tags: SentryTag[] = []
): void {
  if (!process.env.SENTRY_DSN) {
    console.error({ err, extra });
    return;
  }

  log.error(err.message, extra);
  Sentry.withScope((scope) => {
    // Set tags on the scope handed to us rather than through the global
    // helper, so they are guaranteed to apply to this event only.
    for (const { key, value } of tags) {
      scope.setTag(key, value);
    }

    scope.setExtras(extra);
    Sentry.captureException(err);
  });
}
42 |
/**
 * Close the current Sentry client, giving it 2000 ms to finish.
 *
 * @returns The result of `client.close`, or `true` when there is no client.
 */
export async function drain(): Promise<boolean> {
  const client = Sentry.getCurrentHub().getClient();
  if (!client) {
    return true;
  }

  return await client.close(2000);
}
51 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: Renderscript
2 |
3 | on:
4 | push:
5 | branches:
6 | - 'master'
7 | - 'renovate/**'
8 | pull_request:
9 |
10 | env:
11 | COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
12 |
13 | jobs:
14 | lint:
15 | runs-on: ubuntu-latest
16 | name: Lint
17 | steps:
18 | - uses: actions/checkout@v4
19 |
20 | - name: Install Node
21 | uses: actions/setup-node@v3
22 | with:
23 | node-version-file: .nvmrc
24 | cache: yarn
25 |
26 | - run: yarn install --frozen-lockfile
27 |
28 | - name: Run Linter
29 | run: yarn lint
30 |
31 | tests:
32 | runs-on: ubuntu-latest
33 | name: Tests
34 | needs: lint
35 | steps:
36 | - uses: actions/checkout@v4
37 |
38 | - name: Install Node
39 | uses: actions/setup-node@v3
40 | with:
41 | node-version-file: .nvmrc
42 | cache: yarn
43 |
44 | - run: yarn install --frozen-lockfile
45 |
46 | - name: Install Playwright browsers
47 | run: yarn playwright install
48 |
49 | - name: Build
50 | run: yarn build
51 |
52 | - name: Background process
53 | run: |
54 | yarn ci:start &
55 |
56 | - name: Run test
57 | run: yarn test
58 |
--------------------------------------------------------------------------------
/src/__tests__/__snapshots__/login.test.ts.snap:
--------------------------------------------------------------------------------
1 | // Jest Snapshot v1, https://goo.gl/fbAQLP
2 |
3 | exports[`JavaScript redirect should not try to render the body if renderHTML was not requested 1`] = `
4 | {
5 | "domain": "localhost",
6 | "expires": -1,
7 | "httpOnly": false,
8 | "name": "sessionToken",
9 | "path": "/secure",
10 | "sameSite": "Strict",
11 | "secure": false,
12 | "value": "53cu23_535510n",
13 | }
14 | `;
15 |
16 | exports[`login should works even with a 2-steps login 1`] = `
17 | {
18 | "domain": "localhost",
19 | "expires": -1,
20 | "httpOnly": false,
21 | "name": "sessionToken",
22 | "path": "/secure",
23 | "sameSite": "Strict",
24 | "secure": false,
25 | "value": "53cu23_535510n",
26 | }
27 | `;
28 |
29 | exports[`login should works with a 2-steps JS login 1`] = `
30 | {
31 | "domain": "localhost",
32 | "expires": -1,
33 | "httpOnly": false,
34 | "name": "sessionToken",
35 | "path": "/secure",
36 | "sameSite": "Strict",
37 | "secure": false,
38 | "value": "53cu23_535510n",
39 | }
40 | `;
41 |
42 | exports[`login should works with correct credentials 1`] = `
43 | {
44 | "domain": "localhost",
45 | "expires": -1,
46 | "httpOnly": false,
47 | "name": "sessionToken",
48 | "path": "/secure",
49 | "sameSite": "Strict",
50 | "secure": false,
51 | "value": "53cu23_535510n",
52 | }
53 | `;
54 |
--------------------------------------------------------------------------------
/src/__tests__/errors.test.ts:
--------------------------------------------------------------------------------
1 | import type { PostRenderSuccess } from '../api/@types/postRender';
2 | import type { BrowserEngine } from '../lib/browser/Browser';
3 |
4 | import { postRender } from './helpers';
5 |
// These tests drive a real browser through the API; allow up to 30 seconds.
jest.setTimeout(30000);

// Checks how rendering failures are surfaced in the /render response payload.
describe('errors', () => {
  it('should catch DNS error', async () => {
    const { res, body } = await postRender({
      url: 'http://thisisnota-domain.thistld.does.not.exist',
      ua: 'Algolia Crawler',
    });

    // The HTTP call succeeds; the failure is described in the JSON payload.
    const json: PostRenderSuccess = JSON.parse(body);
    expect(res.statusCode).toBe(200);
    expect(json.body).toBeNull();
    expect(json.error).toBe('dns_error');
  });

  // Firefox doesn't crash reliably on the CI
  it.each(['chromium' /* , 'firefox' */])(
    '%s should catch Page Crashed',
    async (browser) => {
      const { res, body } = await postRender({
        url: 'http://localhost:3000/test-website/page-crash.html',
        ua: 'Algolia Crawler',
        browser: browser as BrowserEngine,
        waitTime: {
          max: 10000,
        },
      });

      // A crashed page is answered with a 500 and no body.
      const json: PostRenderSuccess = JSON.parse(body);
      expect(res.statusCode).toBe(500);
      expect(json.body).toBeNull();
      expect(json.error).toBe('body_serialisation_failed');
    }
  );
});
41 |
--------------------------------------------------------------------------------
/scripts/test_image.sh:
--------------------------------------------------------------------------------
#! /bin/bash
#
# Smoke test for a renderscript Docker image.
# Usage: test_image.sh <image-tag>
#
# Starts the image, waits for /ready, checks the logs for the browser "Ready"
# line, triggers one render and checks the logs for the "Done" line.

set -e

hash=$1 # the last commit change because of semantic-release
docker run -d --name renderscript_test -p 3000:3000 "algolia/renderscript:$hash"

# Poll the readiness endpoint once per second, 10 attempts at most.
ATTEMPTS=10
until curl -o /dev/null -s -f http://localhost:3000/ready; do
  echo "waiting for docker..."
  sleep 1
  ATTEMPTS=$((ATTEMPTS-1))
  if [[ $ATTEMPTS -eq "0" ]]; then
    echo "Timed out, check the logs of renderscript_test container:"
    docker logs renderscript_test -n 50
    exit 1
  fi
done

# Quote "$logs" everywhere: unquoted, the shell would word-split the logs and
# glob-expand any `*` or `?` they contain.
logs=$(docker logs renderscript_test 2>&1)
echo "$logs"

if echo "$logs" | grep -q '"svc":"brws","msg":"Ready"'; then
  echo "Browser ready"
else
  echo "Browser not ready"
  exit 1
fi

curl --silent --request POST \
  --url http://localhost:3000/render \
  --header 'Content-Type: application/json' \
  --data '{
    "url": "https://www.example.com",
    "ua": "Renderscript CI",
    "waitTime": {
      "min": 1000,
      "max": 3000
    }
  }' >/dev/null

logs=$(docker logs renderscript_test 2>&1)
echo "$logs"

if echo "$logs" | grep -q '"msg":"Done","data":'; then
  echo "Rendered"
else
  echo "Not rendered"
  exit 1
fi

echo "Image OK"
docker stop renderscript_test && docker rm renderscript_test
54 |
--------------------------------------------------------------------------------
/src/helpers/gracefulClose.ts:
--------------------------------------------------------------------------------
1 | import { nextTick } from 'process';
2 |
3 | import type { Api } from '../api/index';
4 | import type { TasksManager } from '../lib/TasksManager';
5 |
6 | import * as reporting from './errorReporting';
7 | import { log } from './logger';
8 | import * as stats from './stats';
9 |
10 | interface Params {
11 | api: Api;
12 | tasksManager: TasksManager;
13 | }
14 |
15 | let gracefullyClosing = false;
16 |
/**
 * Stop the API first, then the tasks manager, logging each step.
 *
 * @param params - The `api` and `tasksManager` instances to stop.
 */
async function close({ api, tasksManager }: Params): Promise<void> {
  log.info('[API] Stopping...');
  // `api.stop` is callback based: wrap it so it can be awaited.
  await new Promise<void>((resolve) => {
    api.stop(() => {
      log.info('[API] stopped');
      resolve();
    });
  });

  await tasksManager.stop();

  log.info('Gracefully stopped everything');
}
31 |
/**
 * Shut everything down once, then exit the process with code 0.
 *
 * Later calls, e.g. from repeated signals, return immediately. A failure
 * while closing is logged and does not prevent the exit.
 *
 * @param opts - The `api` and `tasksManager` instances to stop.
 */
export async function gracefulClose(opts: Params): Promise<void> {
  // If we receive multiple signals, swallow them
  if (gracefullyClosing) return;
  gracefullyClosing = true;

  log.info('Starting graceful close...');

  try {
    await close(opts);
    await reporting.drain();
    await stats.close();
  } catch (err) {
    log.error('Graceful exit failed', err);
  }

  log.flush();

  // eslint-disable-next-line no-process-exit
  nextTick(() => process.exit(0));
}
55 |
--------------------------------------------------------------------------------
/src/lib/helpers/getInput.ts:
--------------------------------------------------------------------------------
1 | import type { Locator } from 'playwright';
2 |
3 | import type { BrowserPage } from '../browser/Page';
4 | import type { HandledError } from '../types';
5 |
6 | /**
7 | * Get input for selector.
8 | */
/**
 * Resolve `sel` to a single input locator on the page.
 *
 * @param page - Page to search; may be undefined.
 * @param sel - Selector of the wanted input.
 * @returns The locator, or an error object: `field_not_found` when nothing
 * matches, `too_many_fields` when several elements match and there is not
 * exactly one visible element among them.
 */
export async function getInput(
  page: BrowserPage | undefined,
  sel: string
): Promise<Locator | { error: HandledError; rawError: Error }> {
  const candidates = page?.ref?.locator(sel);
  const count = candidates ? await candidates.count() : 0;

  if (!candidates || count <= 0) {
    return {
      error: 'field_not_found',
      rawError: new Error(`Field not found "${sel}"`),
    };
  }

  if (!(count > 1)) {
    return candidates;
  }

  // sometimes another input can be hidden using CSS,
  // wait for the page to be fully loaded
  await page?.waitForNavigation({
    waitUntil: 'load',
    timeout: 10_000,
  });

  // check again but this time only for visible elements
  const visibleOnly = await candidates.locator('visible=true');
  const visibleCount = visibleOnly ? await visibleOnly.count() : 0;

  return visibleCount === 1
    ? visibleOnly
    : {
        error: 'too_many_fields',
        rawError: new Error(
          `Too many input found for "${sel}", found "${count}"`
        ),
      };
}
48 |
--------------------------------------------------------------------------------
/src/api/@types/postRender.ts:
--------------------------------------------------------------------------------
1 | import type {
2 | HandledError,
3 | Metrics,
4 | TaskBaseParams,
5 | UnhandledError,
6 | } from '../../lib/types';
7 |
8 | import type { Res500 } from './responses';
9 |
/**
 * Render request parameters: the task base parameters without `type`, `url`
 * and `userAgent`, plus the string `url` and `ua` fields.
 */
export type PostRenderParams = Omit<
  TaskBaseParams,
  'type' | 'url' | 'userAgent'
> & {
  url: string;
  ua: string;
};

/**
 * Either the render payload or a 500 response.
 */
export type PostRenderResponse = PostRenderSuccess | Res500;

export interface PostRenderSuccess {
  /**
   * HTTP Code of the rendered page.
   */
  statusCode: number | null;

  /**
   * HTTP Headers of the rendered page.
   */
  headers: Record<string, string>;

  /**
   * Body of the rendered page.
   */
  body: string | null;

  /**
   * Metrics from different tasks during the rendering.
   */
  metrics: Metrics;

  /**
   * The redirection renderscript caught.
   */
  resolvedUrl: string | null;

  /**
   * Has the page reached timeout?
   * When timeout has been reached we continue the rendering as usual
   * but reduce other timeout to a minimum.
   */
  timeout: boolean;

  /**
   * Any error encountered along the way.
   * If this field is filled that means the rest of the payload is partial.
   */
  error: HandledError | UnhandledError | null;
  rawError: { message: string; stack?: string } | null;
}
60 |
--------------------------------------------------------------------------------
/.eslintrc.js:
--------------------------------------------------------------------------------
1 | // eslint-disable-next-line import/no-commonjs
// ESLint configuration: Algolia presets (base, jest, typescript) on top of the
// TypeScript parser, with a handful of project-specific rule overrides.
module.exports = {
  env: {
    browser: true, // For frontend only
    es2020: true,
    jest: true,
  },
  extends: [
    'algolia',
    'algolia/jest',
    'algolia/typescript',
    'plugin:import/typescript',
  ],
  parser: '@typescript-eslint/parser',
  parserOptions: {
    ecmaVersion: 11,
    sourceType: 'module',
  },
  settings: {
    'import/resolver': {
      typescript: {},
    },
  },

  plugins: ['prettier', '@typescript-eslint', 'import', 'algolia'],
  rules: {
    'algolia/func-style-toplevel': 'error',

    // Relaxed rules
    'no-console': 'off',
    'no-continue': 'off',
    'no-loop-func': 'off',
    'consistent-return': 'off',

    // Class members carry no explicit `public` modifier.
    '@typescript-eslint/explicit-member-accessibility': [
      'error',
      { accessibility: 'no-public' },
    ],
    'eslint-comments/disable-enable-pair': ['error', { allowWholeFile: true }],

    'no-param-reassign': [
      'error',
      { props: true, ignorePropertyModificationsFor: ['res', 'req'] }, // http://expressjs.com/en/api.html#res.locals
    ],

    // TMP
    'jsdoc/check-examples': ['off'],
    '@typescript-eslint/prefer-optional-chain': ['off'], // to re-enable when this is fixed: https://github.com/typescript-eslint/typescript-eslint/issues/6024
  },
};
50 |
--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
1 | name: Release Version
2 | on:
3 | workflow_dispatch:
4 | inputs:
5 | dry_run:
6 | required: true
7 | type: boolean
8 | default: true
9 | description: 'DryRun?'
10 |
11 | env:
12 | COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
13 |
14 | jobs:
15 | release:
16 | runs-on: ubuntu-latest
17 | name: Release
18 | env:
19 | GH_TOKEN: ${{ secrets.GH_TOKEN }}
20 | GIT_AUTHOR_NAME: ${{ secrets.GH_USER_NAME }}
21 | GIT_AUTHOR_EMAIL: ${{ secrets.GH_USER_EMAIL }}
22 | GIT_COMMITTER_NAME: ${{ secrets.GH_USER_NAME }}
23 | GIT_COMMITTER_EMAIL: ${{ secrets.GH_USER_EMAIL }}
24 |
25 | steps:
26 | - uses: actions/checkout@v4
27 | with:
28 | # Make sure the release step uses its own credentials.
29 | persist-credentials: false
30 |
31 | - name: Install Node
32 | uses: actions/setup-node@v3
33 | with:
34 | node-version-file: .nvmrc
35 | cache: yarn
36 |
37 | - name: Release (--dry-run)
38 | if: (github.event_name == 'workflow_dispatch' && github.event.inputs.dry_run == 'true')
39 | run: |
40 | yarn install
41 | yarn semantic-release --dry-run
42 |
43 | - name: Release
44 | if: (github.event_name == 'workflow_dispatch' && github.event.inputs.dry_run != 'true')
45 | run: |
46 | yarn install
47 | yarn semantic-release
48 |
--------------------------------------------------------------------------------
/src/__tests__/tasksManager.test.ts:
--------------------------------------------------------------------------------
1 | import type { GetHealthySuccess } from '../api/@types/getHealthy';
2 |
3 | import { postRender, request } from './helpers';
4 |
// Uses the /healthy endpoint to check that a render leaves no task or page behind.
describe('manager', () => {
  it('should properly close page after done', async () => {
    // Before
    const { res, body } = await request('http://localhost:3000/healthy');
    expect(res.statusCode).toBe(200);

    const before: GetHealthySuccess = JSON.parse(body);
    expect(before).toEqual({
      ready: true,
      tasksRunning: 0,
      pagesOpen: 0,
      totalRun: expect.any(Number),
    });

    // Process something
    const { res: resRender } = await postRender({
      url: 'http://localhost:3000/test-website/async.html',
      ua: 'Algolia Crawler',
    });
    expect(resRender.statusCode).toBe(200);

    // After
    const { res: resAfter, body: bodyAfter } = await request(
      'http://localhost:3000/healthy'
    );
    expect(resAfter.statusCode).toBe(200);

    const after: GetHealthySuccess = JSON.parse(bodyAfter);
    expect(after).toEqual({
      ready: true,
      tasksRunning: 0,
      pagesOpen: 0,
      totalRun: expect.any(Number),
    });

    // Compare relatively: other tests may have run before this one, so the
    // absolute number of tasks run cannot be known.
    expect(after.totalRun).toBeGreaterThan(0);
    expect(before.totalRun).toBeLessThan(after.totalRun);
  });
});
45 |
--------------------------------------------------------------------------------
/src/api/@types/postLogin.ts:
--------------------------------------------------------------------------------
1 | import type { Cookie } from 'playwright';
2 |
3 | import type { Metrics, TaskBaseParams } from '../../lib/types';
4 |
5 | import type { Res500 } from './responses';
6 |
/**
 * Login request parameters: the task base parameters without `type`, `url`
 * and `userAgent`, plus the target `url`, `ua`, the credentials and whether
 * the final HTML should be rendered.
 */
export type PostLoginParams = Omit<
  TaskBaseParams,
  'type' | 'url' | 'userAgent'
> & {
  url: string;
  ua: string;
  username: string;
  password: string;
  renderHTML: boolean;
};

/**
 * Either the login payload or a 500 response.
 */
export type PostLoginResponse = PostLoginSuccess | Res500;

export interface PostLoginSuccess {
  /**
   * HTTP Code of the rendered page.
   */
  statusCode: number | null;

  /**
   * HTTP Headers of the rendered page.
   */
  headers: Record<string, string>;

  /**
   * Metrics from different tasks during the rendering.
   */
  metrics: Metrics;

  /**
   * Has the page reached timeout?
   * When timeout has been reached we continue the rendering as usual
   * but reduce other timeout to a minimum.
   */
  timeout: boolean;

  /**
   * Any error encountered along the way.
   * If this field is filled that means the rest of the payload is partial.
   */
  error: string | null;
  rawError: { message: string; stack?: string } | null;

  /**
   * Cookies generated from a successful login.
   */
  cookies: Cookie[];

  /**
   * The URL at the end of a successful login.
   */
  resolvedUrl: string | null;

  /**
   * Body at the end of a successful login.
   */
  body: string | null;
}
65 |
--------------------------------------------------------------------------------
/src/__tests__/__snapshots__/async.test.ts.snap:
--------------------------------------------------------------------------------
1 | // Jest Snapshot v1, https://goo.gl/fbAQLP
2 |
3 | exports[`async should render async page on chromium 1`] = `"Algolia Crawler
1. Init - 2. DOMContentLoaded - 3. window.onload"`;
4 |
5 | exports[`async should render async page on firefox 1`] = `"Algolia Crawler
1. Init - 2. DOMContentLoaded - 3. window.onload"`;
6 |
--------------------------------------------------------------------------------
/src/__tests__/list.test.ts:
--------------------------------------------------------------------------------
1 | import { wait } from '../helpers/wait';
2 |
3 | import { request } from './helpers';
4 |
// The second test waits on a deliberately slow page; allow up to 25 seconds.
jest.setTimeout(25000);

// Checks that /list reports the pages currently open, per browser engine.
describe('list', () => {
  it('should list nothing', async () => {
    const { res, body } = await request('http://localhost:3000/list');

    expect(res.statusCode).toBe(200);
    const parsed = JSON.parse(body);
    expect(parsed).toEqual({
      open: {
        chromium: [],
        firefox: [],
      },
    });
  });

  it('should list current page', async () => {
    // Start a render but do not await it yet, so /list can be queried
    // while the page is still open.
    const r = request('http://localhost:3000/render', {
      method: 'POST',
      headers: {
        'content-type': 'application/json',
      },
      body: JSON.stringify({
        url: 'http://localhost:3000/test-website/slow.html',
        ua: 'Algolia Crawler',
        waitTime: {
          min: 2000,
          max: 3000,
        },
      }),
    });

    // Give the render one second to open its page.
    await wait(1000);

    // Currently processing
    const res1 = await request('http://localhost:3000/list');
    const parsed1 = JSON.parse(res1.body);
    expect(parsed1).toEqual({
      open: {
        chromium: ['http://localhost:3000/test-website/slow.html'],
        firefox: [],
      },
    });

    await r;

    // Cleared
    const res2 = await request('http://localhost:3000/list');
    const parsed2 = JSON.parse(res2.body);
    expect(parsed2).toEqual({
      open: {
        chromium: [],
        firefox: [],
      },
    });
  });
});
62 |
--------------------------------------------------------------------------------
/src/api/routes/healthy.ts:
--------------------------------------------------------------------------------
1 | import os from 'os';
2 |
3 | import type express from 'express';
4 |
5 | import { report } from '../../helpers/errorReporting';
6 | import { stats } from '../../helpers/stats';
7 | import { UNHEALTHY_TASK_TTL } from '../../lib/constants';
8 | import { tasksManager } from '../../lib/singletons';
9 | import type { GetHealthySuccess } from '../@types/getHealthy';
10 |
11 | const hostname = os.hostname();
12 |
/**
 * Health endpoint handler.
 *
 * Publishes the running-task and open-page gauges plus the
 * `renderscript.up` check, reports when the service is not ready because of
 * old tasks, and answers 200 (ready) or 503 (not ready) with the counters.
 *
 * @param req - Incoming request (unused).
 * @param res - Response that receives the JSON health payload.
 */
export function healthy(
  req: express.Request,
  res: express.Response
): void {
  const health = tasksManager.getHealth();
  const tasksRunning = tasksManager.currentConcurrency;

  // Sum the open pages of every browser; a missing browser counts as 0.
  let pagesOpen = 0;
  tasksManager.currentBrowsers.forEach((browser) => {
    const open = browser?.getCurrentConcurrency() || 0;
    pagesOpen += open;
  });
  const { totalRun } = tasksManager;

  // These numbers could be derived from .task.count, but publishing them
  // separately double checks that no task or tab is forgotten.
  stats.gauge('renderscript.tasks.running', tasksRunning);
  stats.gauge('renderscript.pages.open', pagesOpen);

  const upStatus = health.ready ? stats.CHECKS.OK : stats.CHECKS.CRITICAL;
  stats.check('renderscript.up', upStatus, { hostname });

  if (!health.ready && health.oldTasks.length > 0) {
    const context = {
      tasks: health.oldTasks,
      max: UNHEALTHY_TASK_TTL,
      tasksRunning,
      pagesOpen,
      totalRun,
    };
    report(new Error('Reporting not healthy'), context);
  }

  const httpStatus = health.ready ? 200 : 503;
  res
    .status(httpStatus)
    .json({ ready: health.ready, tasksRunning, pagesOpen, totalRun });
}
51 |
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
1 | import { Api } from './api/index';
2 | import { report } from './helpers/errorReporting';
3 | import { gracefulClose } from './helpers/gracefulClose';
4 | import { log } from './helpers/logger';
5 | import * as singletons from './lib/singletons';
6 |
// Port the API listens on: PORT environment variable, 3000 by default.
const PORT = parseInt(process.env.PORT || '3000', 10);

/**
 * Report a fatal process-level error, then exit with code 1.
 *
 * @param message - Message of the reported error.
 * @param eventName - Name of the process event, used in the log line.
 * @param reason - Value received from the process event.
 */
function reportAndExit(
  message: string,
  eventName: string,
  reason: unknown
): void {
  report(new Error(message), { err: reason });

  log.info(`Hard exit after ${eventName}`);
  // We are not sure if it's stable or not
  setTimeout(() => {
    // eslint-disable-next-line no-process-exit
    process.exit(1);
  }, 1);
}

// Uncaught Promise Rejection
process.on('unhandledRejection', (reason) => {
  reportAndExit('unhandled rejection', 'unhandledRejection', reason);
});

process.on('uncaughtException', (reason) => {
  reportAndExit('uncaught exception', 'uncaughtException', reason);
});

// Bootstrap: start the API, initialise the singletons, then register the
// shutdown handlers. A rejection here reaches the handler above.
(async (): Promise<void> => {
  log.info('Starting...', {
    env: process.env.NODE_ENV,
    v: process.env.VERSION,
  });

  const api = new Api();
  api.start(PORT);

  await singletons.init();

  // Handle SIGINT
  // It doesn't seem to handle it correctly, but it's just `yarn` messing up
  // Try running
  //
  // yarn build && NODE_ENV=development node dist/index.js
  //
  // to see that it works fine
  const gracefulCloseParams = { api, tasksManager: singletons.tasksManager };
  const boundGracefulClose = gracefulClose.bind(null, gracefulCloseParams);
  process.on('SIGINT', boundGracefulClose);
  process.on('SIGTERM', boundGracefulClose);
})();
55 |
--------------------------------------------------------------------------------
/src/__tests__/blockedRequests.test.ts:
--------------------------------------------------------------------------------
1 | import type { PostRenderSuccess } from '../api/@types/postRender';
2 |
3 | import { postRender } from './helpers';
4 |
// Rendering goes through a real browser; allow up to 10 seconds per test.
jest.setTimeout(10000);

// Request blocking without the adblocker option.
describe('native', () => {
  it('should block basic unecessary requests', async () => {
    const { res, body } = await postRender({
      url: 'http://localhost:3000/test-website/blocked-requests.html',
      ua: 'Algolia Crawler',
    });

    const json: PostRenderSuccess = JSON.parse(body);

    expect(res.statusCode).toBe(200);
    // The test page issues 11 requests; 6 of them are expected to be blocked.
    expect(json.metrics.page!.requests).toStrictEqual({
      total: 11,
      pending: 0,
      blocked: 6,
    });
  });
});
24 |
// Same page as the `native` suite, rendered with `adblock: true`.
describe('adblocker', () => {
  it('should use adblock', async () => {
    const { res, body } = await postRender({
      url: 'http://localhost:3000/test-website/blocked-requests.html',
      ua: 'Algolia Crawler',
      adblock: true,
    });

    const json: PostRenderSuccess = JSON.parse(body);

    expect(res.statusCode).toBe(200);
    // With the adblocker on, 9 of the 11 requests are expected to be blocked.
    expect(json.metrics.page!.requests).toStrictEqual({
      total: 11,
      pending: 0,
      blocked: 9,
    });
    /**
     * @example
     * https://www.google-analytics.com/analytics.js
     * https://static.ads-twitter.com/uwt.js
     * https://www.googletagmanager.com/gtm.js?id=GTM-FOOBAR&l=dataLayer
     * https://via.placeholder.com/150
     * https://via.placeholder.com/152
     * http://localhost:3000/301
     * https://res.cloudinary.com/hilnmyskv/image/upload/v1623928136/ui-library/nav/search.svg
     * https://fonts.gstatic.com/s/qahiri/v1/tsssAp1RZy0C_hGeVHqgjHq-pg.woff2
     * https://fonts.gstatic.com/s/roboto/v30/KFOiCnqEu92Fr1Mu51QrIzc.ttf
     */
  });
});
55 |
--------------------------------------------------------------------------------
/src/lib/browser/TimeBudget.test.ts:
--------------------------------------------------------------------------------
1 | import { wait } from '../../helpers/wait';
2 |
3 | import { TimeBudget } from './TimeBudget';
4 |
5 | describe('consume()', () => {
6 | it('should consume correctly', async () => {
7 | const tb = new TimeBudget(100);
8 | tb.consume();
9 | expect(tb.get()).toBeGreaterThan(98);
10 | expect(tb.consumed).toBeGreaterThanOrEqual(0);
11 | expect(tb.consumed).toBeLessThanOrEqual(2);
12 |
13 | await wait(10);
14 | tb.consume();
15 | expect(tb.get()).toBeGreaterThanOrEqual(80);
16 | expect(tb.get()).toBeLessThanOrEqual(90);
17 |
18 | expect(tb.consumed).toBeGreaterThanOrEqual(10);
19 | });
20 | });
21 |
22 | describe('get()', () => {
23 | it('should return correct get', async () => {
24 | const tb = new TimeBudget(100);
25 | expect(tb.get()).toBeGreaterThanOrEqual(99);
26 |
27 | await wait(100);
28 | tb.consume();
29 |
30 | expect(tb.get()).toBeGreaterThanOrEqual(1);
31 | expect(tb.get()).toBeLessThanOrEqual(2);
32 | });
33 | });
34 |
35 | describe('min()', () => {
36 | it('should return correct min', async () => {
37 | const tb = new TimeBudget(100);
38 | expect(tb.min(99)).toBeGreaterThanOrEqual(99);
39 |
40 | await wait(60);
41 | tb.consume();
42 |
43 | // Still 99 even if budget does not allow
44 | expect(tb.min(99)).toBeGreaterThanOrEqual(99);
45 | });
46 | });
47 |
48 | describe('getRange()', () => {
49 | it('should return correct inside range', () => {
50 | const tb = new TimeBudget(100);
51 | expect(tb.getRange(0, 10)).toBe(10);
52 | });
53 | it('should return correct outside range', () => {
54 | const tb = new TimeBudget(100);
55 | expect(tb.getRange(0, 200)).toBe(100);
56 | });
57 | it('should return correct outside range but forced', () => {
58 | const tb = new TimeBudget(100);
59 | expect(tb.getRange(200, 300)).toBe(200);
60 | });
61 | });
62 |
--------------------------------------------------------------------------------
/public/views/login-2steps-js.ejs:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | Log-in
5 |
6 |
15 |
16 |
41 |
42 |
43 |
44 | 2-steps JavaScript login form
45 |
46 |
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/src/__tests__/helpers.ts:
--------------------------------------------------------------------------------
1 | import type { IncomingHttpHeaders } from 'http';
2 |
3 | import type { Cookie } from 'playwright';
4 | import { request as req } from 'undici';
5 | import type Dispatcher from 'undici/types/dispatcher';
6 |
7 | import type {
8 | PostLoginParams,
9 | PostLoginSuccess,
10 | } from '../api/@types/postLogin';
11 | import type { PostRenderParams } from '../api/@types/postRender';
12 |
13 | export async function request(
14 | url: string,
15 | params?: Parameters[1]
16 | ): Promise<{ res: Dispatcher.ResponseData; body: string }> {
17 | const res = await req(url, params);
18 |
19 | let body = '';
20 | for await (const chunk of res.body) {
21 | body += chunk.toString();
22 | }
23 |
24 | return { res, body };
25 | }
26 |
27 | export async function postRender(
28 | opts: Partial<PostRenderParams>,
29 | headers?: IncomingHttpHeaders
30 | ): Promise<{ res: Dispatcher.ResponseData; body: string }> {
31 | return await request('http://localhost:3000/render', {
32 | method: 'POST',
33 | headers: {
34 | 'content-type': 'application/json',
35 | ...headers,
36 | },
37 | body: JSON.stringify({
38 | ua: 'Algolia Crawler',
39 | ...opts,
40 | }),
41 | });
42 | }
43 |
44 | export async function sendLoginRequest(
45 | opts: Partial<PostLoginParams>
46 | ): Promise<{ res: Dispatcher.ResponseData; body: string }> {
47 | return await request('http://localhost:3000/login', {
48 | method: 'POST',
49 | headers: {
50 | 'content-type': 'application/json',
51 | },
52 | body: JSON.stringify({
53 | ua: 'Algolia Crawler',
54 | ...opts,
55 | }),
56 | });
57 | }
58 |
59 | export function cleanString(body: string): string {
60 | return body.replace(/\n|\r/g, '').replace(/\s\s+/g, '');
61 | }
62 |
63 | export function cleanCookies(
64 | cookies: PostLoginSuccess['cookies']
65 | ): Array<
66 | Omit<Cookie, 'value' | 'expires' | 'httpOnly' | 'secure' | 'sameSite'>
67 | > {
68 | return cookies.map(
69 | ({ value, expires, httpOnly, secure, sameSite, ...rest }) => {
70 | return rest;
71 | }
72 | );
73 | }
74 |
75 | export function cookiesToString(cookies: PostLoginSuccess['cookies']): string {
76 | if (!cookies) {
77 | return '';
78 | }
79 | return cookies.map((cookie) => `${cookie.name}=${cookie.value}`).join('; ');
80 | }
81 |
--------------------------------------------------------------------------------
/public/test-website/blocked-requests.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
44 |
45 |
46 |
47 | A basic page
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
58 |
59 | Hello
60 | Foo
61 | Img Bg
62 |
63 |
64 |
65 |
66 |
67 |
68 |
--------------------------------------------------------------------------------
/src/lib/helpers/errors.ts:
--------------------------------------------------------------------------------
1 | import type { HandledError, UnhandledError } from '../types';
2 |
3 | export const retryableErrors: Array<HandledError | UnhandledError> = [
4 | 'body_serialisation_failed',
5 | 'connection_error',
6 | 'fetch_aborted',
7 | 'fetch_timeout',
8 | 'no_cookies',
9 | 'no_response_after_login',
10 | 'page_closed_too_soon',
11 | 'page_crashed',
12 | 'timedout',
13 | 'unknown_error',
14 | 'error_reading_response',
15 | ];
16 |
17 | // eslint-disable-next-line eslint-comments/disable-enable-pair
18 | /* eslint-disable complexity */
19 | export function cleanErrorMessage(error: Error): HandledError | UnhandledError {
20 | if (
21 | error.message.includes('ERR_NAME_NOT_RESOLVED') ||
22 | error.message.includes('ERR_ADDRESS_UNREACHABLE')
23 | ) {
24 | return 'dns_error';
25 | }
26 | if (
27 | error.message.includes('ERR_CONNECTION_REFUSED') ||
28 | error.message.includes('ERR_CONNECTION_ABORTED') ||
29 | error.message.includes('ERR_CONNECTION_CLOSED') ||
30 | error.message.includes('ERR_CONNECTION_FAILED') ||
31 | error.message.includes('ERR_INTERNET_DISCONNECTED') ||
32 | error.message.includes('ERR_CONNECTION_RESET')
33 | ) {
34 | return 'connection_error';
35 | }
36 | if (error.message.includes('ERR_ABORTED')) {
37 | return 'fetch_aborted';
38 | }
39 | if (
40 | error.message.includes('ETIMEDOUT') ||
41 | error.message.includes('ESOCKETTIMEDOUT')
42 | ) {
43 | return 'fetch_timeout';
44 | }
45 | if (
46 | error.message.includes('Navigation failed because page was closed') ||
47 | error.message.includes('Target closed') ||
48 | error.message.includes('Target page, context or browser has been closed') ||
49 | error.message.includes('Target has been closed') ||
50 | error.message.includes('Browser has been disconnected')
51 | ) {
52 | return 'page_closed_too_soon';
53 | }
54 | if (
55 | error.message.includes('goto_no_response') ||
56 | error.message.includes('Navigation failed because page crashed') ||
57 | error.message.includes('ERR_FAILED') ||
58 | error.message.includes('Element is not attached to the DOM')
59 | ) {
60 | return 'page_crashed';
61 | }
62 | if (error.message.includes('ERR_BLOCKED_BY_RESPONSE')) {
63 | return 'forbidden_by_website';
64 | }
65 | if (error.message.includes('ERR_TIMED_OUT')) {
66 | // This is a generic error from playwright
67 | return 'timedout';
68 | }
69 |
70 | return `unknown_error`;
71 | }
72 |
73 | export class ErrorIsHandledError extends Error {}
74 |
--------------------------------------------------------------------------------
/src/__tests__/api.test.ts:
--------------------------------------------------------------------------------
1 | import type { PostRenderSuccess } from '../api/@types/postRender';
2 |
3 | import { postRender, request } from './helpers';
4 |
5 | /**
6 | * Test the schema only on this file.
7 | */
8 | describe('POST /render', () => {
9 | it('should validate 200', async () => {
10 | const { res, body } = await postRender({
11 | url: 'http://localhost:3000/test-website/async.html',
12 | ua: 'Algolia Crawler',
13 | });
14 | expect(res.statusCode).toBe(200);
15 |
16 | const json: PostRenderSuccess = JSON.parse(body);
17 | expect(json).toStrictEqual({
18 | body: expect.any(String),
19 | error: null,
20 | rawError: null,
21 | headers: {
22 | 'accept-ranges': 'bytes',
23 | 'cache-control': 'public, max-age=0',
24 | connection: 'keep-alive',
25 | 'content-length': expect.any(String),
26 | 'content-type': 'text/html; charset=UTF-8',
27 | date: expect.any(String),
28 | etag: expect.any(String),
29 | 'keep-alive': 'timeout=5',
30 | 'last-modified': expect.any(String),
31 | },
32 | statusCode: 200,
33 | resolvedUrl: null,
34 | timeout: false,
35 | metrics: {
36 | renderingBudget: {
37 | consumed: expect.any(Number),
38 | max: 20000,
39 | },
40 | timings: {
41 | context: expect.any(Number),
42 | equiv: expect.any(Number),
43 | goto: expect.any(Number),
44 | minWait: null,
45 | ready: expect.any(Number),
46 | serialize: expect.any(Number),
47 | close: expect.any(Number),
48 | total: expect.any(Number),
49 | },
50 | page: {
51 | contentLength: {
52 | main: 763,
53 | total: 763,
54 | },
55 | mem: {
56 | jsHeapTotalSize: 0,
57 | jsHeapUsedSize: 0,
58 | },
59 | requests: {
60 | blocked: 0,
61 | pending: 0,
62 | total: 1,
63 | },
64 | timings: {
65 | download: expect.any(Number),
66 | },
67 | },
68 | },
69 | });
70 | });
71 |
72 | it('should handle bad json', async () => {
73 | const res = await request('http://localhost:3000/render', {
74 | method: 'POST',
75 | headers: {
76 | 'content-type': 'application/json',
77 | },
78 | body: '{"url": "https://example.com", "ua": "test}',
79 | });
80 |
81 | expect(JSON.parse(res.body)).toStrictEqual({
82 | status: 400,
83 | error: 'Invalid json: Unexpected end of JSON input',
84 | code: 'invalid_json',
85 | });
86 | });
87 | });
88 |
--------------------------------------------------------------------------------
/src/api/routes/privates/login.ts:
--------------------------------------------------------------------------------
1 | import type { Request, Response } from 'express';
2 |
3 | import { DELETE_COOKIE, SESSION_COOKIE } from '../../constants';
4 | import { log } from '../../helpers/logger';
5 |
6 | export function getLogin(req: Request, res: Response): void {
7 | res.render('login', {
8 | baseUrl: req.baseUrl,
9 | csrfToken: req.csrfToken(),
10 | });
11 | }
12 |
13 | export function postLogin(req: Request, res: Response): void {
14 | const { username, password, redirect } = req.body;
15 | renderLogin({
16 | username,
17 | password,
18 | redirect,
19 | res,
20 | });
21 | }
22 |
23 | export function getTest(req: Request, res: Response): void {
24 | const cookie = req.get('Cookie') || '';
25 | const cookies = cookie.split(';').map((c) => c.trim());
26 | const granted = cookies.includes(SESSION_COOKIE);
27 | log.debug(`[/secure/test] granted: ${granted}, received cookie: ${cookie}`);
28 | res
29 | .contentType('text/html')
30 | .status(granted ? 200 : 401)
31 | .send(
32 | `${
33 | granted ? 'OK' : 'NOK'
34 | }(/test)`
35 | );
36 | }
37 |
38 | export function getStep1(req: Request, res: Response): void {
39 | res.render('login-step1', {
40 | baseUrl: req.baseUrl,
41 | csrfToken: req.csrfToken(),
42 | });
43 | }
44 |
45 | export function postStep2(req: Request, res: Response): void {
46 | const { username } = req.body;
47 | res.render('login-step2', {
48 | baseUrl: req.baseUrl,
49 | csrfToken: req.csrfToken(),
50 | username,
51 | });
52 | }
53 |
54 | export function getTwoSteps(req: Request, res: Response): void {
55 | const { username } = req.body;
56 | res.render('login-2steps-js', {
57 | baseUrl: req.baseUrl,
58 | csrfToken: req.csrfToken(),
59 | username,
60 | });
61 | }
62 |
63 | function renderLogin({
64 | username,
65 | password,
66 | redirect,
67 | res,
68 | }: {
69 | username: string;
70 | password: string;
71 | redirect?: boolean;
72 | res: Response;
73 | }): void {
74 | const granted = username === 'admin' && password === 'password';
75 | const setCookie = `${
76 | granted ? SESSION_COOKIE : DELETE_COOKIE
77 | }; SameSite=Strict`;
78 | log.debug('renderLogin', {
79 | username,
80 | password,
81 | setCookie,
82 | });
83 |
84 | res
85 | .contentType('text/html')
86 | .set('Set-Cookie', setCookie)
87 | .status(granted ? 200 : 401)
88 | .send(
89 | `${
90 | redirect
91 | ? ``
97 | : ''
98 | }${granted ? 'OK' : 'NOK'}(/login)`
99 | );
100 | }
101 |
--------------------------------------------------------------------------------
/src/api/routes/login.ts:
--------------------------------------------------------------------------------
1 | import type express from 'express';
2 |
3 | import { report } from '../../helpers/errorReporting';
4 | import { retryableErrors } from '../../lib/helpers/errors';
5 | import { tasksManager } from '../../lib/singletons';
6 | import { LoginTask } from '../../lib/tasks/Login';
7 | import type { PostLoginParams, PostLoginResponse } from '../@types/postLogin';
8 | import { CSP_HEADERS } from '../constants';
9 | import { getDefaultParams, alt } from '../helpers/alt';
10 | import { buildUrl, revertUrl } from '../helpers/buildUrl';
11 | import { badRequest } from '../helpers/errors';
12 | import { getForwardedHeadersFromRequest } from '../helpers/getForwardedHeaders';
13 |
14 | export async function validate(
15 | req: express.Request,
16 | res: express.Response,
17 | next: express.NextFunction
18 | ): Promise<void> {
19 | const errors = await alt({
20 | ...getDefaultParams(),
21 | username: alt.string().required(),
22 | password: alt.string().required(),
23 | renderHTML: alt.boolean().cast(),
24 | })
25 | .body(req.body)
26 | .validate();
27 |
28 | if (errors) {
29 | badRequest({ res, details: errors });
30 | return;
31 | }
32 |
33 | next();
34 | }
35 |
36 | export async function processLogin(
37 | req: express.Request,
38 | res: express.Response
39 | ): Promise<void> {
40 | const { ua, username, password, renderHTML, waitTime, browser } = req.body;
41 | const headersToForward = getForwardedHeadersFromRequest(req);
42 | const url = new URL(buildUrl(req.body.url));
43 |
44 | try {
45 | const task = await tasksManager.task(
46 | new LoginTask({
47 | url: new URL(url),
48 | headersToForward,
49 | userAgent: ua,
50 | login: {
51 | username,
52 | password,
53 | },
54 | browser,
55 | renderHTML,
56 | waitTime,
57 | })
58 | );
59 |
60 | if (renderHTML) {
61 | res
62 | .status(200)
63 | .header('Content-Type', 'text/html')
64 | .header('Content-Security-Policy', CSP_HEADERS)
65 | .send(task.body);
66 | return;
67 | }
68 |
69 | const resolvedUrl = revertUrl(task.resolvedUrl)?.href || null;
70 | const code =
71 | task.error &&
72 | retryableErrors.includes(task.error) &&
73 | task.error !== 'redirection'
74 | ? 500
75 | : 200;
76 |
77 | res.status(code).json({
78 | headers: task.headers,
79 | metrics: task.metrics,
80 | statusCode: task.statusCode,
81 | timeout: task.timeout,
82 | error: task.error,
83 | cookies: task.cookies,
84 | resolvedUrl,
85 | body: task.body,
86 | rawError: task.rawError
87 | ? {
88 | message: task.rawError.message,
89 | stack: task.rawError.stack,
90 | }
91 | : null,
92 | });
93 | } catch (err: any) {
94 | res.status(500).json({ error: err.message });
95 | report(err, { url, type: 'login' });
96 | }
97 | }
98 |
--------------------------------------------------------------------------------
/src/__tests__/index.test.ts:
--------------------------------------------------------------------------------
1 | import { cleanString, request } from './helpers';
2 |
3 | jest.setTimeout(30 * 1000);
4 |
5 | describe('main', () => {
6 | it('should error when no url', async () => {
7 | const { res, body } = await request('http://localhost:3000/render?');
8 |
9 | expect(res.statusCode).toBe(400);
10 |
11 | expect(JSON.parse(body)).toEqual({
12 | details: [
13 | { label: 'url', message: 'url is required', type: 'required' },
14 | { label: 'ua', message: 'ua is required', type: 'required' },
15 | ],
16 | error: true,
17 | message: 'Bad Request',
18 | });
19 | });
20 |
21 | it('should error when no user agent', async () => {
22 | const { res, body } = await request(
23 | 'http://localhost:3000/render?url=http%3A%2F%2Flocalhost%3A3000%2Ftest-website%2Fbasic.html'
24 | );
25 |
26 | expect(res.statusCode).toBe(400);
27 |
28 | expect(JSON.parse(body)).toEqual({
29 | error: true,
30 | message: 'Bad Request',
31 | details: [{ label: 'ua', type: 'required', message: 'ua is required' }],
32 | });
33 | });
34 |
35 | it('should validate waitTime', async () => {
36 | const { res, body } = await request(
37 | 'http://localhost:3000/render?url=http%3A%2F%2Flocalhost%3A3000%2Ftest-website%2Fbasic.html&ua=Algolia+Crawler&waitTime[min]=foo&waitTime[max]=bar'
38 | );
39 |
40 | expect(res.statusCode).toBe(400);
41 |
42 | expect(JSON.parse(body)).toEqual({
43 | error: true,
44 | message: 'Bad Request',
45 | details: [
46 | {
47 | errors: [
48 | {
49 | label: 'min',
50 | message: 'min must be a valid number',
51 | type: 'number.typeof',
52 | },
53 | {
54 | label: 'max',
55 | message: 'max must be a valid number',
56 | type: 'number.typeof',
57 | },
58 | ],
59 | label: 'waitTime',
60 | message: 'waitTime does not match its schema',
61 | type: 'object.schema',
62 | },
63 | ],
64 | });
65 | });
66 |
67 | it.each(['chromium', 'firefox'])(
68 | 'should render basic page on %s',
69 | async (browser) => {
70 | const { res, body } = await request(
71 | `http://localhost:3000/render?url=http%3A%2F%2Flocalhost%3A3000%2Ftest-website%2Fbasic.html&ua=Algolia+Crawler&browser=${browser}`
72 | );
73 |
74 | expect(res.statusCode).toBe(200);
75 | expect(res.headers).toEqual({
76 | connection: 'keep-alive',
77 | 'content-length': '79',
78 | 'content-security-policy':
79 | "default-src 'none'; style-src * 'unsafe-inline'; img-src * data:; font-src *",
80 | 'content-type': 'text/html; charset=utf-8',
81 | date: expect.any(String),
82 | etag: 'W/"4f-3aYUmdp4dkv6HiR9rJEG+VKiCsw"',
83 | 'keep-alive': 'timeout=5',
84 | });
85 |
86 | expect(cleanString(body)).toBe(
87 | `A basic page`
88 | );
89 | }
90 | );
91 | });
92 |
--------------------------------------------------------------------------------
/.github/workflows/release_docker.yml:
--------------------------------------------------------------------------------
1 | name: Release Docker
2 | on:
3 | release:
4 | types: [published]
5 |
6 | workflow_dispatch:
7 | inputs:
8 | dry_run:
9 | required: true
10 | type: boolean
11 | default: true
12 | description: 'DryRun?'
13 |
14 | env:
15 | COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
16 |
17 | jobs:
18 | build-docker:
19 | runs-on: ubuntu-latest
20 | name: Build Dockers
21 |
22 | steps:
23 | - uses: actions/checkout@v4
24 |
25 | - name: Install Node
26 | uses: actions/setup-node@v3
27 | with:
28 | node-version-file: .nvmrc
29 |
30 | - name: Setting env var
31 | id: env_var
32 | shell: bash
33 | run: |
34 | echo "RENDERSCRIPT_VERSION=$(node -e 'console.log(require("./package.json").version)')" >> $GITHUB_OUTPUT
35 | echo "PLAYWRIGHT_VERSION=$(node -e 'console.log(require("./package.json").dependencies.playwright)')" >> $GITHUB_OUTPUT
36 |
37 | - name: Set up Docker Buildx
38 | uses: docker/setup-buildx-action@v3
39 |
40 | - uses: docker/login-action@v2
41 | with:
42 | username: ${{ secrets.DOCKERHUB_USERNAME }}
43 | password: ${{ secrets.DOCKERHUB_TOKEN }}
44 |
45 | - name: Set up Docker QEMU for arm64 docker builds
46 | uses: docker/setup-qemu-action@v3
47 | with:
48 | platforms: arm64
49 |
50 | - name: Build Image
51 | uses: docker/build-push-action@v4.2.1
52 | with:
53 | file: Dockerfile
54 | context: .
55 | platforms: linux/amd64 # buildx does not support multi-arch load
56 | push: false
57 | load: true
58 | tags: |
59 | algolia/renderscript:latest
60 | algolia/renderscript:${{ steps.env_var.outputs.RENDERSCRIPT_VERSION }}
61 | algolia/renderscript:${{ env.COMMIT_SHA }}
62 | cache-from: type=gha
63 | cache-to: type=gha,mode=max
64 | build-args: |
65 | VERSION=${{ steps.env_var.outputs.RENDERSCRIPT_VERSION }}
66 | PLAYWRIGHT_VERSION=${{ steps.env_var.outputs.PLAYWRIGHT_VERSION }}
67 |
68 | - name: Test Image
69 | run: ./scripts/test_image.sh ${{ env.COMMIT_SHA }}
70 |
71 | # Cache should be reused from prev execution
72 | - name: Push
73 | if: (github.event_name == 'release') || (github.event_name == 'workflow_dispatch' && github.event.inputs.dry_run != 'true')
74 | uses: docker/build-push-action@v4
75 | with:
76 | file: Dockerfile
77 | context: .
78 | platforms: linux/amd64,linux/arm64/v8
79 | push: true
80 | tags: |
81 | algolia/renderscript:latest
82 | algolia/renderscript:${{ steps.env_var.outputs.RENDERSCRIPT_VERSION }}
83 | cache-from: type=gha
84 | cache-to: type=gha,mode=max
85 | build-args: |
86 | VERSION=${{ steps.env_var.outputs.RENDERSCRIPT_VERSION }}
87 | PLAYWRIGHT_VERSION=${{ steps.env_var.outputs.PLAYWRIGHT_VERSION }}
88 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@algolia/renderscript",
3 | "version": "2.3.6",
4 | "description": "A custom JavaScript rendering engine based on Playwright",
5 | "main": "dist/index.js",
6 | "scripts": {
7 | "build": "yarn clean && yarn tsc && yarn browser:adblocks",
8 | "ci:start": "ALLOW_LOCALHOST=true yarn start",
9 | "clean": "rm -rf dist/",
10 | "dev": "nodemon",
11 | "dev:run": "yarn build && NODE_ENV=development node -r dotenv/config dist/index.js",
12 | "docker:build": "./scripts/build.sh",
13 | "browser:adblocks": "./scripts/update_adblock_hosts.sh",
14 | "lint": "eslint --ext=jsx,ts,tsx,js .",
15 | "start": "UV_THREADPOOL_SIZE=100 node dist/index.js",
16 | "semantic-release": "semantic-release",
17 | "test": "jest src/"
18 | },
19 | "repository": {
20 | "type": "git",
21 | "url": "https://github.com/algolia/renderscript.git"
22 | },
23 | "keywords": [
24 | "algolia",
25 | "playwright",
26 | "js renderer",
27 | "rendertron",
28 | "prerender",
29 | "javascript rendering",
30 | "ssr"
31 | ],
32 | "author": "Algolia ",
33 | "license": "MIT",
34 | "bugs": {
35 | "url": "https://github.com/algolia/renderscript/issues"
36 | },
37 | "engines": {
38 | "node": "18.18.2"
39 | },
40 | "homepage": "https://github.com/algolia/renderscript#readme",
41 | "devDependencies": {
42 | "@semantic-release/changelog": "6.0.3",
43 | "@semantic-release/git": "10.0.1",
44 | "@types/cookie-parser": "1.4.6",
45 | "@types/csurf": "1.11.5",
46 | "@types/express": "4.17.21",
47 | "@types/jest": "29.5.8",
48 | "@types/node": "18.18.10",
49 | "@types/uuid": "9.0.7",
50 | "@typescript-eslint/eslint-plugin": "6.11.0",
51 | "@typescript-eslint/parser": "6.11.0",
52 | "dotenv": "16.3.1",
53 | "ejs": "3.1.9",
54 | "eslint": "8.54.0",
55 | "eslint-config-algolia": "22.0.0",
56 | "eslint-config-prettier": "9.0.0",
57 | "eslint-config-standard": "17.1.0",
58 | "eslint-import-resolver-typescript": "3.6.1",
59 | "eslint-plugin-algolia": "2.0.0",
60 | "eslint-plugin-eslint-comments": "3.2.0",
61 | "eslint-plugin-import": "2.29.0",
62 | "eslint-plugin-jest": "27.6.0",
63 | "eslint-plugin-jsdoc": "46.9.0",
64 | "eslint-plugin-node": "11.1.0",
65 | "eslint-plugin-prettier": "5.0.1",
66 | "eslint-plugin-promise": "6.1.1",
67 | "jest": "29.7.0",
68 | "nodemon": "3.0.1",
69 | "pino-pretty": "10.2.3",
70 | "prettier": "3.1.0",
71 | "semantic-release": "22.0.8",
72 | "ts-jest": "29.1.1",
73 | "ts-node": "10.9.2",
74 | "typescript": "5.2.2"
75 | },
76 | "dependencies": {
77 | "@algolia/dns-filter": "1.1.25",
78 | "@sentry/node": "7.80.1",
79 | "altheia-async-data-validator": "5.0.15",
80 | "body-parser": "1.20.2",
81 | "cookie-parser": "1.4.6",
82 | "csurf": "1.11.0",
83 | "express": "4.19.2",
84 | "hot-shots": "10.0.0",
85 | "pino": "8.16.2",
86 | "playwright": "1.49.0",
87 | "undici": "5.28.4",
88 | "uuid": "9.0.1"
89 | },
90 | "resolutions": {
91 | "chalk": "4.1.2",
92 | "@semantic-release/npm": "10.0.6"
93 | },
94 | "packageManager": "yarn@4.0.2"
95 | }
96 |
--------------------------------------------------------------------------------
/src/__tests__/async.test.ts:
--------------------------------------------------------------------------------
1 | import type { PostRenderSuccess } from '../api/@types/postRender';
2 |
3 | import { cleanString, postRender, request } from './helpers';
4 |
5 | jest.setTimeout(10000);
6 |
7 | describe('async', () => {
8 | it.each(['chromium', 'firefox'])(
9 | 'should render async page on %s',
10 | async (browser) => {
11 | const { res, body } = await request(
12 | `http://localhost:3000/render?url=http%3A%2F%2Flocalhost%3A3000%2Ftest-website%2Fasync.html&ua=Algolia+Crawler&browser=${browser}`
13 | );
14 |
15 | expect(res.statusCode).toBe(200);
16 | expect(res.headers).toEqual({
17 | connection: 'keep-alive',
18 | 'content-length': expect.any(String),
19 | 'content-security-policy':
20 | "default-src 'none'; style-src * 'unsafe-inline'; img-src * data:; font-src *",
21 | 'content-type': 'text/html; charset=utf-8',
22 | date: expect.any(String),
23 | etag: expect.any(String),
24 | 'keep-alive': 'timeout=5',
25 | });
26 |
27 | expect(cleanString(body)).toMatchSnapshot();
28 | }
29 | );
30 |
31 | it('should wait by default for 0ms', async () => {
32 | const { res, body } = await postRender({
33 | url: 'http://localhost:3000/test-website/async.html',
34 | ua: 'Algolia Crawler',
35 | });
36 |
37 | const json: PostRenderSuccess = JSON.parse(body);
38 | expect(res.statusCode).toBe(200);
39 | expect(json.metrics.timings.total).toBeLessThanOrEqual(2000);
40 | expect(json.body).not.toMatch('4. setTimeout 1000');
41 | });
42 |
43 | it('should wait at least 6000ms', async () => {
44 | const { res, body } = await postRender({
45 | url: 'http://localhost:3000/test-website/async.html',
46 | ua: 'Algolia Crawler',
47 | waitTime: {
48 | min: 6000,
49 | },
50 | });
51 |
52 | const json: PostRenderSuccess = JSON.parse(body);
53 | expect(res.statusCode).toBe(200);
54 |
55 | expect(json.metrics.timings.minWait).toBeGreaterThanOrEqual(5000);
56 | expect(json.metrics.timings.total).toBeGreaterThanOrEqual(6000);
57 | expect(json.metrics.timings.total).toBeLessThanOrEqual(7000);
58 | expect(json.body).toMatch('5. setTimeout 5000');
59 | });
60 |
61 | it('should wait at most 5000ms', async () => {
62 | const { res, body } = await postRender({
63 | url: 'http://localhost:3000/test-website/slow.html',
64 | ua: 'Algolia Crawler',
65 | waitTime: {
66 | min: 4000,
67 | max: 5000,
68 | },
69 | });
70 |
71 | const json: PostRenderSuccess = JSON.parse(body);
72 | expect(res.statusCode).toBe(200);
73 | expect(json.metrics.timings.goto).toBeLessThanOrEqual(50);
74 |
75 | // In that case the page is slow so min is not used
76 | expect(json.metrics.timings.minWait).toBeNull();
77 |
78 | expect(json.metrics.timings.ready).toBeLessThanOrEqual(5020);
79 | expect(json.metrics.timings.total).toBeGreaterThanOrEqual(4000);
80 | expect(json.metrics.timings.total).toBeLessThanOrEqual(5120);
81 |
82 | // We count the dots because there is no way to have precise execution timing
83 | // There should be around 25 dots (one fetch every 200ms during 5s = 25 dots)
84 | // We check for 20 to have some margin
85 | // And no more than 30 to check that it was not executed more than 5s
86 | expect(json.body).toMatch('.'.repeat(20));
87 | expect(json.body).not.toMatch('.'.repeat(30));
88 | });
89 | });
90 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # ------------------
2 | # Build playwright
3 | # ------------------
4 | FROM ubuntu:jammy as base
5 |
6 | # For tzdata
7 | ARG DEBIAN_FRONTEND=noninteractive
8 | ARG TZ=America/Los_Angeles
9 |
10 | # === INSTALL Node.js ===
11 | RUN apt-get update && \
12 | # Install Node.js 18
13 | apt-get install -y curl wget && \
14 | curl -sL https://deb.nodesource.com/setup_18.x | bash - && \
15 | apt-get install -y nodejs && \
16 | # Feature-parity with node.js base images.
17 | apt-get install -y --no-install-recommends git openssh-client && \
18 | npm install -g yarn && \
19 | # clean apt cache
20 | rm -rf /var/lib/apt/lists/* && \
21 | # Create the pwuser
22 | adduser pwuser
23 |
24 | # === BAKE BROWSERS INTO IMAGE ===
25 | ARG PLAYWRIGHT_VERSION
26 | ENV PLAYWRIGHT_VERSION ${PLAYWRIGHT_VERSION}
27 | ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
28 |
29 | # Browsers will be downloaded in `/ms-playwright`.
30 | RUN mkdir /ms-playwright \
31 | && PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=true npm install -g playwright@$PLAYWRIGHT_VERSION \
32 | && npx playwright install --with-deps chromium \
33 | && npx playwright install --with-deps firefox \
34 | # Clean cache
35 | && rm -rf /var/lib/apt/lists/* \
36 | && chmod -R 777 /ms-playwright
37 |
38 |
39 | # ------------------
40 | # package.json cache
41 | # ------------------
42 | FROM apteno/alpine-jq:2022-09-25 AS deps
43 |
44 | # To prevent cache invalidation from changes in fields other than dependencies
45 | COPY package.json /tmp
46 | RUN jq 'walk(if type == "object" then with_entries(select(.key | test("^jest|prettier|eslint|semantic|dotenv|nodemon") | not)) else . end) | { name, dependencies, devDependencies, packageManager }' < /tmp/package.json > /tmp/deps.json
47 |
48 |
49 | # ------------------
50 | # New base image
51 | # ------------------
52 | FROM base as tmp
53 |
54 | ENV IN_DOCKER true
55 | ENV PLAYWRIGHT_BROWSERS_PATH="/ms-playwright"
56 | ENV PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD="true"
57 |
58 | # Setup the app WORKDIR
59 | WORKDIR /app/tmp
60 |
61 | # Copy and install dependencies separately from the app's code
62 | # To leverage Docker's cache when no dependency has changed
63 | COPY --from=deps /tmp/deps.json ./package.json
64 | COPY yarn.lock .yarnrc.yml ./
65 | COPY .yarn .yarn
66 |
67 | # Install dev dependencies
68 | RUN true \
69 | && yarn install
70 |
71 | # This step will invalidate the cache
72 | COPY . ./
73 |
74 | # Build the app, then remove dev dependencies and the .yarn directory
75 | RUN true \
76 | && ls -lah /app/tmp \
77 | && yarn build \
78 | && yarn workspaces focus --all --production \
79 | && rm -rf .yarn/
80 |
81 | # ------------------
82 | # New final image that only contains built code
83 | # ------------------
84 | FROM base as final
85 |
86 | ARG VERSION
87 | ENV VERSION ${VERSION:-dev}
88 |
89 | # Autolink repository https://docs.github.com/en/packages/learn-github-packages/connecting-a-repository-to-a-package
90 | LABEL org.opencontainers.image.source=https://github.com/algolia/renderscript
91 | LABEL org.opencontainers.image.revision=$VERSION
92 |
93 | ENV NODE_ENV production
94 | ENV IN_DOCKER true
95 | ENV PLAYWRIGHT_BROWSERS_PATH="/ms-playwright"
96 | ENV PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD="true"
97 |
98 | # Do not use root to run the app
99 | USER pwuser
100 |
101 | # Copy install from previous stage
102 | WORKDIR /app/renderscript
103 | COPY --from=tmp --chown=pwuser:pwuser /app/tmp /app/renderscript
104 |
105 | CMD [ "node", "dist/index.js" ]
106 |
--------------------------------------------------------------------------------
/src/lib/browser/constants.ts:
--------------------------------------------------------------------------------
1 | export const RESPONSE_IGNORED_ERRORS = [
2 | // 200 no body, HEAD, OPTIONS
3 | 'No data found for resource with given identifier',
4 | 'No resource with given identifier found',
5 | // Too big to fit in memory, or memory filled
6 | 'Request content was evicted from inspector cache',
7 | // Protocol error, js redirect or options
8 | 'This might happen if the request is a preflight request',
9 | // Can happen if the page that triggered this response was closed in the meantime
10 | 'Target closed',
11 | 'Target page, context or browser has been closed',
12 | 'Target has been closed',
13 | 'Browser has been disconnected',
14 | ];
15 |
16 | export const REQUEST_IGNORED_ERRORS = ['Request is already handled'];
17 |
18 | export const GOTO_IGNORED_ERRORS = ['Navigation timeout'];
19 |
20 | export const VALIDATE_URL_IGNORED_ERRORS = ['ENOTFOUND', 'EAI_AGAIN'];
21 |
22 | export const METRICS_IGNORED_ERRORS = [
23 | // Navigation or page closed, okay for metrics
24 | 'Target closed',
25 | 'Target page, context or browser has been closed',
26 | 'Target has been closed',
27 | 'Browser has been disconnected',
28 | 'Execution context was destroyed',
29 | 'Renderscript Controlled Timeout',
30 | ];
31 |
32 | export const WIDTH = 1280;
33 | export const HEIGHT = 1024;
34 |
/**
 * Command-line switches passed to the browser at launch.
 *
 * Each switch appears exactly once; order follows the first occurrence of
 * each switch in the historical list.
 */
export const flags = [
  // Disable sandboxing when not available
  '--no-sandbox',
  '--disable-setuid-sandbox',
  '--no-zygote',
  // No GPU available inside Docker
  '--disable-gpu',
  // Seems like a powerful hack, not sure why
  // https://github.com/Codeception/CodeceptJS/issues/561
  "--proxy-server='direct://'",
  '--proxy-bypass-list=*',
  // Disable cache
  // '--disk-cache-dir=/dev/null',
  '--media-cache-size=1',
  '--disk-cache-size=1',
  // Disable useless UI features
  '--disable-extensions',
  '--disable-features=Translate',
  '--disable-infobars',
  '--disable-notifications',
  '--disable-translate',
  '--no-default-browser-check',
  '--no-first-run', // screen on very first run
  '--noerrdialogs',
  '--disable-background-timer-throttling',
  '--disable-backgrounding-occluded-windows',
  '--disable-password-generation',
  '--disable-prompt-on-repos',
  '--disable-save-password-bubble',
  '--disable-single-click-autofill',
  '--disable-restore-session-state',
  '--disable-new-profile-management',
  '--disable-new-avatar-menu',
  '--disable-device-discovery-notifications',
  '--disable-client-side-phishing-detection',
  '--disable-component-extensions-with-background-pages',
  // Disable dev-shm
  // See https://github.com/GoogleChrome/puppeteer/blob/master/docs/troubleshooting.md#tips
  '--disable-dev-shm-usage',

  '--enable-automation',
  '--disable-print-preview',
  // https://github.com/cypress-io/cypress/issues/5132
  '--disable-ipc-flooding-protection',

  // Taken from https://github.com/cypress-io/cypress/blob/develop/packages/server/lib/browsers/chrome.ts
  // "--disable-background-networking"
  '--disable-web-resources',
  '--safebrowsing-disable-auto-update',
  '--safebrowsing-disable-download-protection',
  '--disable-component-update',
  '--disable-default-apps',

  // Crash reporter
  '--disable-breakpad',
  '--disable-crash-reporter',
];
96 |
--------------------------------------------------------------------------------
/src/lib/browser/Browser.ts:
--------------------------------------------------------------------------------
1 | import type {
2 | Browser as BrowserInterface,
3 | BrowserContext,
4 | BrowserContextOptions,
5 | } from 'playwright';
6 | import { chromium, firefox } from 'playwright';
7 | import { v4 as uuid } from 'uuid';
8 |
9 | import { report } from '../../helpers/errorReporting';
10 | import { log as mainLog } from '../../helpers/logger';
11 | import { stats } from '../../helpers/stats';
12 |
13 | import { flags, HEIGHT, WIDTH } from './constants';
14 |
15 | const log = mainLog.child({ svc: 'brws' });
16 |
17 | export type BrowserEngine = 'chromium' | 'firefox';
18 | export const DEFAULT_ENGINE: BrowserEngine = 'chromium';
19 |
/**
 * Owns a single Playwright browser process for one engine and hands out
 * fresh browser contexts from it.
 *
 * When the underlying browser disconnects while the wrapper is not being
 * stopped, the wrapper reports the event and launches a replacement.
 */
export class Browser {
  #id: string;
  #engine: BrowserEngine;
  #ready: boolean = false;
  #stopping: boolean = false;
  #browser: BrowserInterface | undefined;

  /**
   * @param engine - Engine to launch; falls back to chromium when omitted.
   */
  constructor(engine?: BrowserEngine) {
    this.#id = uuid();
    this.#engine = engine || 'chromium';
  }

  /**
   * True only when a launch succeeded and the browser is still connected.
   */
  get isReady(): boolean {
    return (
      this.#ready &&
      typeof this.#browser !== 'undefined' &&
      this.#browser.isConnected()
    );
  }

  /**
   * The underlying Playwright browser, or undefined before the first
   * successful launch.
   */
  get instance(): BrowserInterface | undefined {
    return this.#browser;
  }

  /**
   * Create a Playwright instance.
   *
   * Launch failures are reported and swallowed (the promise still resolves),
   * but the wrapper is only flagged as ready when the launch succeeded.
   */
  async create(): Promise<void> {
    log.info(`Creating ${this.#engine}...`, { id: this.#id });

    const env: { [s: string]: string } = {};
    if (process.env.DISPLAY) {
      env.DISPLAY = process.env.DISPLAY;
    }

    const start = Date.now();
    try {
      const launcher = this.#engine === 'firefox' ? firefox : chromium;
      const browser = await launcher.launch({
        headless: true,
        env,
        handleSIGINT: false,
        handleSIGHUP: false,
        handleSIGTERM: false,
        args: flags,
      });
      browser.on('disconnected', () => {
        if (this.#stopping) {
          return;
        }

        this.#ready = false;
        report(
          new Error(
            `Browser disconnected (engine: ${this.#engine}). Relaunching...`
          )
        );
        // Fire-and-forget relaunch: never leave the promise unhandled.
        this.create().catch((err) => {
          report(err, { browser: this.#engine });
        });
      });
      this.#browser = browser;
    } catch (e: any) {
      report(e, { browser: this.#engine });
      // Launch failed: do not mark the browser as ready.
      return;
    } finally {
      // Timing is recorded for failed launches as well.
      stats.timing('renderscript.create', Date.now() - start, {
        browser: this.#engine,
      });
    }

    this.#ready = true;
    log.info('Ready', { id: this.#id, browser: this.#engine });
  }

  /**
   * Close the browser and prevent the disconnect handler from relaunching it.
   */
  async stop(): Promise<void> {
    this.#stopping = true;
    await this.#browser?.close();
  }

  /**
   * Count the pages currently open across every context of this browser.
   *
   * @returns 0 when no browser has been launched yet.
   */
  getCurrentConcurrency(): number {
    if (!this.#browser) {
      return 0;
    }

    return this.#browser.contexts().reduce((i, ctx) => {
      return i + ctx.pages().length;
    }, 0);
  }

  /**
   * Open a new browser context with the service defaults.
   *
   * @param opts - Context options; spread last, so they override the defaults.
   * @returns The newly created context.
   * @throws Error when no connected browser is available.
   */
  async getNewContext(opts: BrowserContextOptions): Promise<BrowserContext> {
    const browser = this.#browser;
    if (!browser || !browser.isConnected()) {
      throw new Error(`No browser available (engine=${this.#engine})`);
    }

    const start = Date.now();
    const ctx = await browser.newContext({
      acceptDownloads: false,
      bypassCSP: false,
      hasTouch: false,
      isMobile: false,
      javaScriptEnabled: true,
      locale: 'en-GB',
      timezoneId: 'Europe/Paris',
      offline: false,
      permissions: [],
      userAgent: 'Algolia Crawler Renderscript',
      viewport: { height: HEIGHT, width: WIDTH },
      extraHTTPHeaders: {
        'Accept-Encoding': 'gzip, deflate',
        Accept:
          'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
      },
      ...opts,
    });
    stats.timing('renderscript.context.create', Date.now() - start);

    return ctx;
  }
}
133 |
--------------------------------------------------------------------------------
/src/lib/types.ts:
--------------------------------------------------------------------------------
1 | import type { Cookie } from 'playwright';
2 |
3 | import type { BrowserEngine } from './browser/Browser';
4 | import type { Task } from './tasks/Task';
5 |
6 | export type HandledError =
7 | | HandledLoginError
8 | | 'body_serialisation_failed'
9 | | 'connection_error'
10 | | 'dns_error'
11 | | 'error_reading_response'
12 | | 'fetch_aborted'
13 | | 'fetch_timeout'
14 | | 'forbidden_by_website'
15 | | 'no_cookies'
16 | | 'page_closed_too_soon'
17 | | 'page_crashed'
18 | | 'redirection'
19 | | 'timedout'
20 | | 'wrong_redirection';
21 |
22 | export type HandledLoginError =
23 | | 'field_not_found'
24 | | 'no_response_after_login'
25 | | 'too_many_fields';
26 |
27 | export type UnhandledError = 'unknown_error';
28 |
29 | export interface TaskBaseParams {
30 | url: URL;
31 | userAgent: string;
32 | adblock?: boolean;
33 | browser?: BrowserEngine;
34 | waitTime?: {
35 | min?: number;
36 | max?: number;
37 | };
38 | headersToForward?: {
39 | [s: string]: string;
40 | };
41 | }
42 |
43 | export interface Perf {
44 | curr: PerformanceNavigationTiming;
45 | all: PerformanceEntryList;
46 | mem: {
47 | jsHeapSizeLimit?: number;
48 | totalJSHeapSize?: number;
49 | usedJSHeapSize?: number;
50 | };
51 | }
52 |
53 | export type RenderTaskParams = TaskBaseParams;
54 |
55 | export interface LoginTaskParams extends TaskBaseParams {
56 | login: {
57 | username: string;
58 | password: string;
59 | };
60 | renderHTML?: boolean;
61 | }
62 |
63 | export type TaskParams = LoginTaskParams | RenderTaskParams;
64 |
65 | export interface TaskFinal extends TaskResult {
66 | metrics: Metrics;
67 | timeout: boolean;
68 | }
69 |
70 | export interface TaskResult {
71 | statusCode: number | null;
72 | body: string | null;
73 | error: HandledError | UnhandledError | null;
74 | rawError: Error | null;
75 | headers: Record;
76 | resolvedUrl: string | null;
77 | cookies: Cookie[];
78 | }
79 |
80 | export type ErrorReturn = Optional<
81 | Pick,
82 | 'rawError'
83 | >;
84 |
85 | export interface Metrics {
86 | timings: {
87 | context: number | null;
88 | goto: number | null;
89 | equiv: number | null;
90 | ready: number | null;
91 | minWait: number | null;
92 | serialize: number | null;
93 | close: number | null;
94 | total: number | null;
95 | };
96 | renderingBudget: {
97 | max: number;
98 | consumed: number;
99 | };
100 | page: PageMetrics | null;
101 | }
102 |
103 | export interface PageMetrics {
104 | timings: {
105 | download: number | null;
106 | };
107 | mem: {
108 | jsHeapUsedSize: number | null;
109 | jsHeapTotalSize: number | null;
110 | };
111 | requests: {
112 | total: number;
113 | blocked: number;
114 | pending: number;
115 | };
116 | contentLength: {
117 | main: number;
118 | total: number;
119 | };
120 | }
121 |
122 | export interface TaskObject {
123 | ref: Task;
124 | promise: Promise;
125 | }
126 |
127 | /**
128 | * Take an interface and list the keys that are optional.
129 | *
130 | * @example
131 | * interface Hello {
132 | * foo?: string;
133 | * bar?: string;
134 | * baz: string;
135 | * }
136 | *
137 | * OptionalKeys<Hello>;
138 | *
139 | * Will result in:
140 | * 'foo' | 'bar'
141 | */
142 | export type OptionalKeys = {
143 | [K in keyof T]: undefined extends T[K] ? K : never;
144 | }[keyof T];
145 |
146 | /**
147 | * Take an interface and choose which properties should become optional.
148 | *
149 | * @example
150 | * interface Hello {
151 | * foo: string;
152 | * bar: string;
153 | * baz?: string;
154 | * };
155 | *
156 | * Optional<Hello, 'bar'>;
157 | *
158 | * Will result in:
159 | * {
160 | * foo: string;
161 | * bar?: string;
162 | * baz?: string;
163 | * }
164 | *
165 | */
166 | export type Optional = {
167 | [P in Exclude>>]?: T[P];
168 | } & {
169 | [P in Exclude>]: T[P];
170 | };
171 |
--------------------------------------------------------------------------------
/src/api/routes/render.ts:
--------------------------------------------------------------------------------
1 | import type express from 'express';
2 |
3 | import { report } from '../../helpers/errorReporting';
4 | import { retryableErrors } from '../../lib/helpers/errors';
5 | import { tasksManager } from '../../lib/singletons';
6 | import { RenderTask } from '../../lib/tasks/Render';
7 | import type {
8 | PostRenderParams,
9 | PostRenderResponse,
10 | } from '../@types/postRender';
11 | import type { Res500 } from '../@types/responses';
12 | import { CSP_HEADERS } from '../constants';
13 | import { getDefaultParams, alt } from '../helpers/alt';
14 | import { buildUrl, revertUrl } from '../helpers/buildUrl';
15 | import { badRequest } from '../helpers/errors';
16 | import { getForwardedHeadersFromRequest } from '../helpers/getForwardedHeaders';
17 |
18 | export async function validate(
19 | req: express.Request,
20 | res: express.Response,
21 | next: express.NextFunction
22 | ): Promise {
23 | const errors = await alt(getDefaultParams())
24 | .body(req.method === 'GET' ? req.query : req.body)
25 | .validate();
26 |
27 | if (errors) {
28 | badRequest({ res, details: errors });
29 | return;
30 | }
31 |
32 | next();
33 | }
34 |
35 | export async function render(
36 | req: express.Request,
37 | res: express.Response
38 | ): Promise {
39 | const { url: rawUrl, ua, waitTime, adblock, browser } = req.query;
40 | const headersToForward = getForwardedHeadersFromRequest(req);
41 | const url = new URL(buildUrl(rawUrl));
42 |
43 | try {
44 | const { error, statusCode, body, resolvedUrl } = await tasksManager.task(
45 | new RenderTask({
46 | url,
47 | headersToForward,
48 | userAgent: ua,
49 | browser,
50 | waitTime,
51 | adblock,
52 | })
53 | );
54 |
55 | if (resolvedUrl && resolvedUrl !== url.href) {
56 | const location = revertUrl(resolvedUrl)?.href || url.href;
57 | res.status(307).header('Location', location).send();
58 | return;
59 | }
60 |
61 | if (error) {
62 | res.status(400).json({ error });
63 | return;
64 | }
65 |
66 | res
67 | .status(statusCode!)
68 | .header('Content-Type', 'text/html')
69 | .header('Content-Security-Policy', CSP_HEADERS)
70 | .send(body);
71 | } catch (err: any) {
72 | res.status(500).json({
73 | error: err.message,
74 | });
75 | report(err, { type: 'render', url: rawUrl, browser });
76 | }
77 | }
78 |
79 | export async function renderJSON(
80 | req: express.Request,
81 | res: express.Response
82 | ): Promise {
83 | const { url: rawUrl, ua, waitTime, adblock, browser } = req.body;
84 | const headersToForward = getForwardedHeadersFromRequest(req);
85 | const url = new URL(buildUrl(rawUrl));
86 |
87 | try {
88 | const task = await tasksManager.task(
89 | new RenderTask({
90 | url,
91 | headersToForward,
92 | userAgent: ua,
93 | browser,
94 | waitTime,
95 | adblock,
96 | })
97 | );
98 |
99 | if (!task.error && !task.body) {
100 | // Tmp while trying to understand the issue.
101 | report(new Error('No error but no body'), {
102 | task,
103 | url,
104 | waitTime,
105 | browser,
106 | });
107 | task.error = 'body_serialisation_failed';
108 | }
109 |
110 | const resolvedUrl = revertUrl(task.resolvedUrl)?.href || null;
111 | const code =
112 | task.error &&
113 | retryableErrors.includes(task.error) &&
114 | task.error !== 'redirection'
115 | ? 500
116 | : 200;
117 | res.status(code).json({
118 | body: task.body,
119 | headers: task.headers,
120 | metrics: task.metrics,
121 | resolvedUrl,
122 | statusCode: task.statusCode,
123 | timeout: task.timeout,
124 | error: task.error,
125 | rawError: task.rawError
126 | ? {
127 | message: task.rawError.message,
128 | stack: task.rawError.stack,
129 | }
130 | : null,
131 | });
132 | } catch (err: any) {
133 | res.status(500).json({ error: err.message });
134 | report(err, { type: 'renderJSON', url: rawUrl, browser });
135 | }
136 | }
137 |
--------------------------------------------------------------------------------
/src/api/index.ts:
--------------------------------------------------------------------------------
1 | import * as http from 'http';
2 | import * as path from 'path';
3 |
4 | import { urlencoded, json } from 'body-parser';
5 | import cookieParser from 'cookie-parser';
6 | import csurf from 'csurf';
7 | import express, { static as expressStatic } from 'express';
8 |
9 | import projectRoot from '../helpers/projectRoot';
10 |
11 | import { log } from './helpers/logger';
12 | import { requestLogger } from './helpers/requestLogger';
13 | import { healthy } from './routes/healthy';
14 | import { list } from './routes/list';
15 | import * as routeLogin from './routes/login';
16 | import {
17 | getLogin,
18 | getStep1,
19 | getTest,
20 | getTwoSteps,
21 | postLogin,
22 | postStep2,
23 | } from './routes/privates/login';
24 | import { ready } from './routes/ready';
25 | import * as routeRender from './routes/render';
26 | import { root } from './routes/root';
27 |
/**
 * HTTP API of the service: an Express application mounted on a Node HTTP
 * server.
 */
export class Api {
  server: http.Server;
  private _app: express.Express;
  private _csrfProtection: express.RequestHandler;

  constructor() {
    this._csrfProtection = csurf({
      cookie: { maxAge: 120, sameSite: 'strict' },
    });
    this._app = express();
    this.server = http.createServer(this._app);
  }

  /**
   * Register middlewares and routes, then start listening.
   *
   * Outside production the private test routes are mounted; in production
   * only the root route is added on top of the public routes.
   *
   * @param port - TCP port to listen on.
   */
  start(port: number): void {
    this._setup();
    this._routes();
    if (process.env.NODE_ENV !== 'production') {
      this._privateRoutes();
    } else {
      this._app.get('/', root);
    }

    // 404
    this._app.use('*', (req, res) => {
      res.status(404).json({
        status: 404,
        error: 'Endpoint not found',
        code: 'not_found',
      });
    });

    // error handler
    // Express only treats a middleware as an error handler when it declares
    // exactly four parameters, so `next` must stay in the signature.
    this._app.use(
      (
        err: any,
        req: express.Request,
        res: express.Response,
        next: express.NextFunction
      ) => {
        if (res.headersSent) {
          // Response already started: let Express close the connection.
          next(err);
          return;
        }

        if (err?.code !== 'EBADCSRFTOKEN') {
          res.status(500).json({
            status: 500,
            error: 'Internal Server Error',
            code: 'internal_server_error',
          });
          return;
        }

        // CSRF token errors
        res.status(403).json({
          status: 403,
          error: 'The form has expired',
          code: 'form_expired',
        });
      }
    );

    this.server.listen(port, () => {
      log.info(`Ready http://localhost:${port}`);
    });
  }

  /**
   * Stop accepting connections.
   *
   * @param cb - Passed to `server.close()`.
   */
  stop(cb: () => any): void {
    this.server.close(cb);
  }

  /**
   * Install body parsers, request logging, cookie parsing and the view engine.
   */
  private _setup(): void {
    const jsonParser = json({ limit: '1mb' });
    this._app.disable('x-powered-by');

    this._app.use(urlencoded({ limit: '1mb', extended: true }));
    // Wrap the JSON parser so malformed payloads get a JSON 400 response.
    this._app.use((req, res, next) => {
      return jsonParser(req, res, (err) => {
        if (!err) {
          return next();
        }

        return res.status(400).json({
          status: 400,
          error: `Invalid json: ${err.message}`,
          code: 'invalid_json',
        });
      });
    });

    this._app.use(requestLogger);
    this._app.use(cookieParser());
    this._app.set('views', path.join(projectRoot, '/public/views'));
    this._app.set('view engine', 'ejs');
  }

  /**
   * Public routes, available in every environment.
   */
  private _routes(): void {
    this._app
      .get('/ready', ready)
      .get('/healthy', healthy)
      .get('/list', list)
      .get('/render', routeRender.validate, routeRender.render)
      .post('/render', routeRender.validate, routeRender.renderJSON)
      .post('/login', routeLogin.validate, routeLogin.processLogin);
  }

  /**
   * Static files and login fixtures, mounted only outside production.
   */
  private _privateRoutes(): void {
    this._app.use(expressStatic(path.join(projectRoot, '/public')));

    this._app.get('/301', (req, res) =>
      res.redirect(301, '/test-website/basic.html')
    );

    // Login form with CSRF protection
    this._app
      .get('/secure/login', this._csrfProtection, getLogin)
      .post('/secure/login', this._csrfProtection, postLogin)
      .get('/secure/test', getTest)

      // 2-steps login form with CSRF protection
      .get('/secure/login/step1', this._csrfProtection, getStep1)
      .post('/secure/login/step2', this._csrfProtection, postStep2)
      .get('/secure/login/2steps', this._csrfProtection, getTwoSteps);
  }
}
141 |
--------------------------------------------------------------------------------
/src/__tests__/login.real.test.ts:
--------------------------------------------------------------------------------
1 | import type { Cookie } from 'playwright';
2 |
3 | import type { PostLoginSuccess } from '../api/@types/postLogin';
4 | import type { PostRenderSuccess } from '../api/@types/postRender';
5 |
6 | import {
7 | cleanCookies,
8 | cookiesToString,
9 | postRender,
10 | sendLoginRequest,
11 | } from './helpers';
12 |
13 | const rawCreds = process.env.LOGIN_CREDENTIALS;
14 | const canExec = process.env.CI || rawCreds;
15 |
16 | jest.setTimeout(25000);
17 |
18 | // !--- Not working right now
19 | // eslint-disable-next-line jest/no-disabled-tests
20 | describe.skip('Real Login', () => {
21 | let creds: { [name: string]: { username: string; password: string } };
22 |
23 | beforeAll(() => {
24 | if (!canExec) {
25 | throw new Error('can only exec in CI or with LOGIN_CREDENTIALS');
26 | }
27 |
28 | creds = JSON.parse(rawCreds!);
29 | });
30 |
31 | describe('login.live.com', () => {
32 | let cookies: Cookie[];
33 | let cred: { username: string; password: string };
34 | beforeAll(() => {
35 | cred = creds['login.live.com'];
36 | });
37 |
38 | it('should not be logged', async () => {
39 | const { res, body } = await postRender({
40 | url: 'https://account.microsoft.com/billing/orders?refd=account.microsoft.com',
41 | });
42 |
43 | expect(res.statusCode).toBe(200);
44 | const parsed: PostRenderSuccess = JSON.parse(body);
45 | expect(parsed.statusCode).toBe(302);
46 | expect(
47 | parsed.resolvedUrl?.startsWith('https://login.live.com/login.srf')
48 | ).toBe(true);
49 | });
50 |
51 | it('get proper cookies', async () => {
52 | const { res, body } = await sendLoginRequest({
53 | url: 'https://account.microsoft.com/billing/orders?refd=account.microsoft.com',
54 | username: cred.username,
55 | password: cred.password,
56 | });
57 |
58 | expect(res.statusCode).toBe(200);
59 | const parsed: PostLoginSuccess = JSON.parse(body);
60 | const tmp = cleanCookies(parsed.cookies);
61 | [
62 | { domain: '.account.live.com', name: 'RPSMaybe', path: '/' },
63 | { domain: '.account.microsoft.com', name: 'AMCSecAuth', path: '/' },
64 | { domain: '.account.microsoft.com', name: 'ANON', path: '/' },
65 | { domain: '.account.microsoft.com', name: 'NAP', path: '/' },
66 | { domain: '.live.com', name: 'amsc', path: '/' },
67 | { domain: '.live.com', name: 'ANON', path: '/' },
68 | { domain: '.live.com', name: 'mkt', path: '/' },
69 | { domain: '.live.com', name: 'mkt1', path: '/' },
70 | { domain: '.live.com', name: 'MSPAuth', path: '/' },
71 | { domain: '.live.com', name: 'MSPProf', path: '/' },
72 | { domain: '.live.com', name: 'NAP', path: '/' },
73 | { domain: '.live.com', name: 'PPLState', path: '/' },
74 | { domain: '.live.com', name: 'wlidperf', path: '/' },
75 | { domain: '.live.com', name: 'WLSSC', path: '/' },
76 | { domain: '.login.live.com', name: 'JSH', path: '/' },
77 | { domain: '.login.live.com', name: 'JSHP', path: '/' },
78 | { domain: '.login.live.com', name: 'MSCC', path: '/' },
79 | { domain: '.login.live.com', name: 'MSPBack', path: '/' },
80 | { domain: '.login.live.com', name: 'MSPOK', path: '/' },
81 | { domain: '.login.live.com', name: 'MSPRequ', path: '/' },
82 | { domain: '.login.live.com', name: 'MSPRequ', path: '/' },
83 | { domain: '.login.live.com', name: 'MSPSoftVis', path: '/' },
84 | { domain: '.login.live.com', name: 'OParams', path: '/' },
85 | { domain: '.login.live.com', name: 'SDIDC', path: '/' },
86 | { domain: '.login.live.com', name: 'uaid', path: '/' },
87 | { domain: '.login.live.com', name: 'uaid', path: '/' },
88 | { domain: '.microsoft.com', name: 'display-culture', path: '/' },
89 | { domain: '.microsoft.com', name: 'market', path: '/' },
90 | { domain: 'account.microsoft.com', name: 'ai_session', path: '/' },
91 | { domain: 'account.microsoft.com', name: 'AMC-MS-CV', path: '/' },
92 | { domain: 'account.microsoft.com', name: 'authBounced', path: '/' },
93 | { domain: 'account.microsoft.com', name: 'canary', path: '/' },
94 | { domain: 'account.microsoft.com', name: 'GRNID', path: '/' },
95 | { domain: 'account.microsoft.com', name: 'GroupIds', path: '/' },
96 | { domain: 'account.microsoft.com', name: 'ShCLSessionID', path: '/' },
97 | // { domain: 'login.live.com', name: '__Host-MSAAUTH', path: '/' }, seems optional
98 | { domain: 'login.live.com', name: '__Host-MSAAUTHP', path: '/' },
99 | ].forEach((cookie) => {
100 | expect(
101 | tmp.find((c) => c.name === cookie.name && c.domain === cookie.domain)
102 | ).toStrictEqual(cookie);
103 | });
104 |
105 | cookies = parsed.cookies;
106 | });
107 |
108 | it('should be logged', async () => {
109 | const { res, body } = await postRender(
110 | {
111 | url: 'https://account.microsoft.com/billing/orders?refd=account.microsoft.com',
112 | },
113 | {
114 | Cookie: cookiesToString(cookies),
115 | }
116 | );
117 |
118 | expect(res.statusCode).toBe(200);
119 | const parsed: PostRenderSuccess = JSON.parse(body);
120 | expect(parsed.statusCode).toBe(200);
121 | });
122 | });
123 | });
124 |
--------------------------------------------------------------------------------
/src/lib/tasks/Render.ts:
--------------------------------------------------------------------------------
1 | import type { Response } from 'playwright';
2 |
3 | import {
4 | promiseWithTimeout,
5 | PromiseWithTimeoutError,
6 | } from '../../helpers/promiseWithTimeout';
7 | import { waitForPendingRequests } from '../../helpers/waitForPendingRequests';
8 | import { RESPONSE_IGNORED_ERRORS } from '../browser/constants';
9 | import { cleanErrorMessage } from '../helpers/errors';
10 | import type { RenderTaskParams } from '../types';
11 |
12 | import { Task } from './Task';
13 |
14 | export class RenderTask extends Task {
15 | async process(): Promise {
16 | if (!this.page) {
17 | throw new Error('Calling process before createContext()');
18 | }
19 |
20 | /* Setup */
21 | const { url } = this.params;
22 | let response: Response;
23 |
24 | // Important to catch any redirect
25 | this.page.setDisableNavigation(url.href, async (newUrl) => {
26 | this.results.error = 'redirection';
27 | this.results.resolvedUrl = newUrl;
28 |
29 | // We save the status of the page before the navigation (hopefully)
30 | await this.page?.saveMetrics();
31 |
32 | // Hard close of the page to avoid reaching the backend
33 | await this.page?.close();
34 | });
35 |
36 | try {
37 | response = await this.page.goto(url.href, {
38 | timeout: this.timeBudget.get(),
39 | waitUntil: 'domcontentloaded',
40 | });
41 | } catch (err: any) {
42 | return this.throwHandledError({
43 | error: this.results.error || cleanErrorMessage(err),
44 | rawError: err,
45 | });
46 | } finally {
47 | this.setMetric('goto');
48 | }
49 |
50 | // --- At this point we have just the DOM, but we want to do some checks
51 | await this.saveMetrics();
52 |
53 | // In case of redirection, initialResponse is preferred since response is probably now incorrect
54 | await this.saveStatus(this.page.initialResponse || response);
55 |
56 | if (this.page.redirection) {
57 | this.results.resolvedUrl =
58 | this.results.resolvedUrl || this.page.redirection;
59 | return this.throwHandledError({
60 | error: this.results.error || 'redirection',
61 | });
62 | }
63 |
64 | // Check for html refresh
65 | try {
66 | const redirect = await promiseWithTimeout(
67 | this.page.checkForHttpEquivRefresh({
68 | timeout: this.timeBudget.getRange(1000, 3000),
69 | }),
70 | 3000
71 | );
72 | if (redirect) {
73 | this.results.resolvedUrl = redirect.href;
74 | return this.throwHandledError({
75 | error: this.results.error || 'redirection',
76 | });
77 | }
78 | } catch (err) {
79 | if (!(err instanceof PromiseWithTimeoutError)) {
80 | throw err;
81 | }
82 | } finally {
83 | this.setMetric('equiv');
84 | }
85 |
86 | if (this.results.statusCode !== 200) {
87 | // Anything other than OK is not worth processing
88 | this.results.body = await this.page.renderBody();
89 | return;
90 | }
91 |
92 | // --- Basic checks passed, we wait a bit more for the page to render
93 | try {
94 | const timeBudget = this.timeBudget.get();
95 | const startWaitTime = Date.now();
96 |
97 | try {
98 | await this.page.ref?.waitForLoadState('networkidle', {
99 | timeout: timeBudget,
100 | });
101 | } catch (waitErr: any) {
102 | // Check if this is a redirection first
103 | if (this.page.redirection) {
104 | this.results.resolvedUrl =
105 | this.results.resolvedUrl || this.page.redirection;
106 | return this.throwHandledError({
107 | error: this.results.error || 'redirection',
108 | rawError: waitErr,
109 | });
110 | }
111 | if (
112 | RESPONSE_IGNORED_ERRORS.some((msg) => waitErr.message.includes(msg))
113 | ) {
114 | // Page was closed while waiting
115 | return this.throwHandledError({
116 | error: 'page_closed_too_soon',
117 | rawError: waitErr,
118 | });
119 | }
120 | throw waitErr; // Re-throw if it's not a target closed error
121 | }
122 |
123 | const timeWaited = Date.now() - startWaitTime;
124 | await waitForPendingRequests(this.page!, timeBudget - timeWaited);
125 | } catch (err: any) {
126 | this.page.throwIfNotTimeout(err);
127 | } finally {
128 | this.setMetric('ready');
129 | }
130 |
131 | await this.minWait();
132 |
133 | this.checkFinalURL();
134 |
135 | /* Transforming */
136 | // await page.evaluate(injectBaseHref, baseHref);
137 | const body = await this.page.renderBody({ silent: true });
138 | if (body === null) {
139 | return this.throwHandledError({ error: 'body_serialisation_failed' });
140 | }
141 |
142 | this.results.body = body;
143 | this.setMetric('serialize');
144 | }
145 |
146 | private checkFinalURL(): void {
147 | const newUrl = this.page!.ref?.url() ? new URL(this.page!.ref.url()) : null;
148 | if (!newUrl) {
149 | // Redirected to nowhere
150 | this.results.resolvedUrl = 'about:blank/';
151 | return this.throwHandledError({ error: 'wrong_redirection' });
152 | }
153 |
154 | newUrl.hash = '';
155 | if (newUrl.href !== this.params.url.href) {
156 | // Redirection was not caught; this should not happen
157 | this.results.resolvedUrl = newUrl.href;
158 | return this.throwHandledError({ error: 'wrong_redirection' });
159 | }
160 | }
161 | }
162 |
--------------------------------------------------------------------------------
/src/lib/tasks/Task.ts:
--------------------------------------------------------------------------------
1 | import type { Logger } from 'pino';
2 | import type { BrowserContext, Response } from 'playwright';
3 | import { v4 as uuid } from 'uuid';
4 |
5 | import { report } from '../../helpers/errorReporting';
6 | import { log } from '../../helpers/logger';
7 | import { stats } from '../../helpers/stats';
8 | import type { Browser } from '../browser/Browser';
9 | import { BrowserPage } from '../browser/Page';
10 | import { TimeBudget } from '../browser/TimeBudget';
11 | import { RESPONSE_IGNORED_ERRORS } from '../browser/constants';
12 | import { WAIT_TIME } from '../constants';
13 | import { ErrorIsHandledError } from '../helpers/errors';
14 | import type {
15 | ErrorReturn,
16 | Metrics,
17 | TaskBaseParams,
18 | TaskResult,
19 | } from '../types';
20 |
21 | export abstract class Task {
22 | id: string;
23 | params;
24 | page?: BrowserPage;
25 | createdAt?: Date;
26 | startedAt?: Date;
27 | results: TaskResult = {
28 | statusCode: null,
29 | body: null,
30 | headers: {},
31 | error: null,
32 | rawError: null,
33 | resolvedUrl: null,
34 | cookies: [],
35 | };
36 | log: Logger;
37 | timeBudget: TimeBudget;
38 | #metrics: Metrics = {
39 | timings: {
40 | context: null,
41 | goto: null,
42 | equiv: null,
43 | ready: null,
44 | minWait: null,
45 | serialize: null,
46 | close: null,
47 | total: null,
48 | },
49 | renderingBudget: {
50 | max: 0,
51 | consumed: 0,
52 | },
53 | page: null,
54 | };
55 |
56 | #closed: boolean = false;
57 | #context?: BrowserContext;
58 |
59 | constructor(params: TTaskType, logger?: Logger) {
60 | this.id = uuid();
61 | // Do not print this or pass it to reporting, it contains secrets
62 | this.params = {
63 | ...params,
64 | waitTime: {
65 | ...WAIT_TIME,
66 | ...params.waitTime,
67 | },
68 | };
69 | this.createdAt = new Date();
70 | this.timeBudget = new TimeBudget(this.params.waitTime.max);
71 | this.#metrics.renderingBudget.max = this.timeBudget.max;
72 | this.log = logger ?? log.child({ svc: 'task', ctx: { id: this.id } });
73 | }
74 |
75 | get metrics(): Metrics {
76 | return this.#metrics;
77 | }
78 |
79 | get isDone(): boolean {
80 | return this.#closed;
81 | }
82 |
/**
 * Close the page and the browser context, then finalize the metrics.
 *
 * Only the first call does any work; later calls return immediately.
 * When the task was never started (`startedAt` unset), the total timing
 * is left as `null` instead of throwing.
 */
async close(): Promise<void> {
  if (this.#closed) {
    return;
  }

  this.#closed = true;
  await this.page?.close();
  await this.#context?.close();
  this.setMetric('close');

  // `startedAt` is only set by createContext(); guard against an early close.
  this.metrics.timings.total = this.startedAt
    ? Date.now() - this.startedAt.getTime()
    : null;
  this.#metrics.renderingBudget.consumed = this.timeBudget.consumed;
  this.#context = undefined;
}
97 |
98 | /**
99 | * Create the incognito context and the page so each task has a fresh start.
100 | */
101 | async createContext(browser: Browser): Promise {
102 | this.timeBudget.lastConsumption = Date.now();
103 | this.startedAt = new Date();
104 |
105 | const context = await browser.getNewContext({
106 | userAgent: this.params.userAgent,
107 | });
108 | context.setDefaultTimeout(WAIT_TIME.min);
109 | context.setDefaultNavigationTimeout(WAIT_TIME.max);
110 |
111 | const page = new BrowserPage(context, this.params.browser);
112 | this.page = page;
113 | this.#context = context;
114 |
115 | await page.create();
116 |
117 | if (this.params.headersToForward?.cookie) {
118 | await page.setCookies(this.params);
119 | }
120 |
121 | await context.route('**/*', page.getOnRequestHandler(this.params));
122 | // does not work await page.setDisableServiceWorker();
123 |
124 | page.ref?.on('response', page.getOnResponseHandler(this.params));
125 |
126 | this.setMetric('context');
127 | }
128 |
129 | /**
130 | * Save status in results.
131 | */
132 | async saveStatus(response: Response): Promise {
133 | try {
134 | this.results.statusCode = response.status();
135 | this.results.headers = await response.allHeaders();
136 | } catch (err: any) {
137 | return this.throwHandledError({
138 | error: 'error_reading_response',
139 | rawError: err,
140 | });
141 | }
142 | }
143 |
144 | /**
145 | * Wait for browser to execute more stuff before we kill the page.
146 | */
147 | async minWait(): Promise {
148 | const minWait = this.params.waitTime!.min;
149 | const todo = minWait - this.timeBudget.consumed;
150 | if (todo <= 0) {
151 | return;
152 | }
153 |
154 | this.log.debug(`Waiting ${todo} extra ms...`);
155 | await this.page!.ref?.waitForTimeout(todo);
156 | this.setMetric('minWait');
157 | }
158 |
159 | /**
160 | * Log metric and reduce time budget.
161 | */
162 | setMetric(name: keyof Metrics['timings']): void {
163 | this.#metrics.timings[name] = this.timeBudget.consume();
164 | stats.timing(`renderscript.page.${name}`, this.#metrics.timings[name]!);
165 | }
166 |
167 | /**
168 | * Save page metrics.
169 | */
170 | async saveMetrics(): Promise {
171 | try {
172 | if (!this.page || this.page.isClosed) {
173 | // page has been closed
174 | return;
175 | }
176 | this.#metrics.page = await this.page.saveMetrics();
177 | } catch (err: any) {
178 | // Can happen if target is already closed or redirection
179 | if (RESPONSE_IGNORED_ERRORS.some((msg) => err.message.includes(msg))) {
180 | // Expected error when page is closed, no need to report
181 | return;
182 | }
183 | // Report other unexpected errors
184 | report(err, { context: 'saveMetrics' });
185 | }
186 | }
187 |
188 | /**
189 | * Shortcut everything.
190 | */
191 | throwHandledError(res: ErrorReturn): void {
192 | this.results.error = res.error;
193 | this.results.rawError = res.rawError || null;
194 | stats.increment('renderscript.task.handlederror', {
195 | error: res.error || 'no_error',
196 | });
197 | throw new ErrorIsHandledError();
198 | }
199 |
200 | abstract process(): Promise;
201 | }
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Renderscript
2 |
3 | > An API to render a page inside a real Chromium (with JavaScript enabled) and send back the raw HTML.
4 |
5 | This project is directly written for and consumed by [Algolia Crawler](https://www.algolia.com/products/search-and-discovery/crawler/).
6 |
7 | 🔐 **Secure**
8 | Leverages `Context` to isolate each page, prevent cookie sharing, control redirection, etc...
9 |
10 | 🚀 **Performant**:
11 | Ignores unnecessary resources for rendering HTML (e.g. `images`, `video`, `font`, etc...) and bundles an AdBlocker by default.
12 |
13 | 🤖 **Automated**:
14 | Renderscript abstracts everything needed to render a page and log in to a website, with minimal configuration required.
15 |
16 | ## Usage
17 |
18 | ### Local
19 |
20 | ```sh
21 | yarn dev
22 | ```
23 |
24 | **Goto**: http://localhost:3000
25 |
26 | ### Docker
27 |
28 | ```sh
29 | docker build . -t algolia/renderscript
30 | docker run -p 3000:3000 -it algolia/renderscript
31 |
32 | curl -X POST http://localhost:3000/render \
33 | -H 'Content-Type: application/json' \
34 | -d '{"url": "https://www.algolia.com/", "ua": "local_renderscript"}'
35 | ```
36 |
37 | ## API
38 |
39 | - [`POST /render`](#post-render)
40 | - [`GET /render`](#get-render)
41 | - [`POST /login`](#post-login)
42 | - [`GET /list`](#get-list)
43 | - [`GET /healthy`, `GET /ready`](#get-healthy--get-ready)
44 |
45 | ---
46 |
47 | ### `POST /render`
48 |
49 | Main endpoint. Renders the page and dumps a JSON with all the page information.
50 |
51 | #### Body parameters:
52 |
53 | ```ts
54 | {
55 | /**
56 | * URL to render (for hash and query params support, use `encodeURIComponent` on it)
57 | */
58 | url: string;
59 |
60 | /**
61 | * User-Agent to use.
62 | */
63 | ua: string;
64 |
65 | /**
66 | * Enables AdBlocker
67 | */
68 | adblock?: boolean;
69 |
70 | /**
71 | * Define the range of time.
72 | * Minimum and maximum execution time.
73 | */
74 | waitTime?: {
75 | min?: number;
76 | max?: number;
77 | };
78 |
79 | /**
80 | * Headers to Forward on navigation
81 | */
82 | headersToForward?: {
83 | [s: string]: string;
84 | };
85 | }
86 | ```
87 |
88 | #### Response `application/json`:
89 |
90 | ```ts
91 | {
92 | /**
93 | * HTTP Code of the rendered page.
94 | */
95 | statusCode: number | null;
96 |
97 | /**
98 | * HTTP Headers of the rendered page.
99 | */
100 | headers: Record<string, string>;
101 |
102 | /**
103 | * Body of the rendered page.
104 | */
105 | body: string | null;
106 |
107 | /**
108 | * Metrics from different tasks during the rendering.
109 | */
110 | metrics: Metrics;
111 |
112 | /**
113 | * The redirection renderscript caught.
114 | */
115 | resolvedUrl: string | null;
116 |
117 | /**
118 | * Has the page reached timeout?
119 | * When timeout has been reached we continue the rendering as usual
120 | * but reduce other timeout to a minimum.
121 | */
122 | timeout: boolean;
123 |
124 | /**
125 | * Any error encountered along the way.
126 | * If this field is filled that means the rest of the payload is partial.
127 | */
128 | error: string | null;
129 | }
130 | ```
131 |
132 | ---
133 |
134 | ### `GET /render`
135 |
136 | Used for debug purposes. Dumps directly the HTML for easy inspection in your browser.
137 |
138 | #### Query parameters:
139 |
140 | > see `POST /render` parameters
141 |
142 | #### Response `text/html`.
143 |
144 | CSP headers are set to prevent script execution on the rendered page.
145 |
146 | ---
147 |
148 | ### `POST /login`
149 |
150 | This endpoint will load a given login page, look for `input` fields, enter the given credentials and validate the form.
151 | It allows programmatically retrieving a session cookie from websites with [CSRF](https://en.wikipedia.org/wiki/Cross-site_request_forgery) protection.
152 |
153 | #### Body parameters
154 |
155 | ```ts
156 | {
157 | /**
158 | * URL to render (for hash and query params support, use `encodeURIComponent` on it)
159 | */
160 | url: string;
161 |
162 | /**
163 | * User-Agent to use.
164 | */
165 | ua: string;
166 |
167 | /**
168 | * Username to enter on the login form. Renderscript expects to find an `input[type=text]` or `input[type=email]` on the login page.
169 | */
170 | username: string;
171 |
172 | /**
173 | * Password to enter on the login form. Renderscript expects to find an `input[type=password]` on the login page.
174 | */
175 | password: string;
176 |
177 | /**
178 | * Define the range of time.
179 | * Minimum and maximum execution time.
180 | */
181 | waitTime?: {
182 | min?: number;
183 | max?: number;
184 | };
185 |
186 | /**
187 | * Boolean (optional).
188 | * If set to true, Renderscript will return the rendered HTML after the login request. Useful to debug visually.
189 | */
190 | renderHTML?: boolean;
191 | }
192 | ```
193 |
194 | #### Response `application/json`
195 |
196 | ```ts
197 | {
198 | /**
199 | * HTTP Code of the rendered page.
200 | */
201 | statusCode: number | null;
202 |
203 | /**
204 | * HTTP Headers of the rendered page.
205 | */
206 | headers: Record<string, string>;
207 |
208 | /**
209 | * Metrics from different tasks during the rendering.
210 | */
211 | metrics: Metrics;
212 |
213 | /**
214 | * Has the page reached timeout?
215 | * When timeout has been reached we continue the rendering as usual
216 | * but reduce other timeout to a minimum.
217 | */
218 | timeout: boolean;
219 |
220 | /**
221 | * Any error encountered along the way.
222 | * If this field is filled that means the rest of the payload is partial.
223 | */
224 | error: string | null;
225 |
226 | /**
227 | * Cookie generated from a successful login.
228 | */
229 | cookies: Cookie[];
230 |
231 | /**
232 | * The URL at the end of a successful login.
233 | */
234 | resolvedUrl: string | null;
235 |
236 | /**
237 | * Body at the end of a successful login.
238 | */
239 | body: string | null;
240 | }
241 | ```
242 |
243 | #### Response `text/html`
244 |
245 | If `renderHTML: true`, returns `text/html`.
246 | CSP headers are set to prevent script execution on the rendered page.
247 |
248 | ---
249 |
250 | ### `GET /list`
251 |
252 | List currently open pages.
253 | Useful to debug.
254 |
255 | ---
256 |
257 | ### `GET /healthy`, `GET /ready`
258 |
259 | Health Check for Kubernetes and others.
260 |
261 | ---
262 |
263 | ## Credits
264 |
265 | This project was heavily inspired by [`GoogleChrome/rendertron`](https://github.com/GoogleChrome/rendertron).
266 | It was based on [`puppeteer-core`](https://github.com/GoogleChrome/puppeteer) but we switched to [Playwright](https://playwright.dev/).
267 |
--------------------------------------------------------------------------------
/src/__tests__/login.test.ts:
--------------------------------------------------------------------------------
1 | import type { Cookie } from 'playwright';
2 |
3 | import type { PostLoginSuccess } from '../api/@types/postLogin';
4 |
5 | import { sendLoginRequest } from './helpers';
6 |
// Allow each test in this file up to 45 seconds.
jest.setTimeout(45000);

describe('login', () => {
  const secureLoginUrl = 'http://localhost:3000/secure/login';

  /** Finds a cookie by name in a list of cookies. */
  const cookieNamed = <TCookie extends { name: string }>(
    jar: TCookie[],
    name: string
  ): TCookie | undefined => jar.find((cookie) => cookie.name === name);

  // Both required-field checks share the same shape: only the empty field differs.
  const missingFieldCases = [
    { field: 'username', username: '', password: 'password' },
    { field: 'password', username: 'admin', password: '' },
  ];

  for (const { field, username, password } of missingFieldCases) {
    it(`should error when no ${field}`, async () => {
      const { res, body } = await sendLoginRequest({
        url: secureLoginUrl,
        username,
        password,
      });

      expect(res.statusCode).toBe(400);

      expect(JSON.parse(body)).toEqual({
        error: true,
        message: 'Bad Request',
        details: [
          {
            label: field,
            message: `${field} is required`,
            type: 'required',
          },
        ],
      });
    });
  }

  it('should error multiple text input', async () => {
    const { res, body } = await sendLoginRequest({
      url: 'http://localhost:3000/test-website/login-multiple-input.html',
      username: 'admin',
      password: 'paswword',
    });

    expect(res.statusCode).toBe(500);
    const payload: PostLoginSuccess = JSON.parse(body);
    expect(payload.error).toBe('no_cookies');
    expect(payload.rawError).toBeNull();
  });

  it('should error double password', async () => {
    const { res, body } = await sendLoginRequest({
      url: 'http://localhost:3000/test-website/login-double-password.html',
      username: 'admin',
      password: 'paswword',
    });

    expect(res.statusCode).toBe(200);
    const payload: PostLoginSuccess = JSON.parse(body);
    expect(payload.error).toBe('too_many_fields');
    expect(payload.rawError?.message).toBe(
      'Too many input found for "input[type=password]:not([aria-hidden="true"])", found "2"'
    );
  });

  it('should works with correct credentials', async () => {
    const { res, body } = await sendLoginRequest({
      url: secureLoginUrl,
      username: 'admin',
      password: 'password',
    });

    expect(res.statusCode).toBe(200);

    const payload: PostLoginSuccess = JSON.parse(body);
    expect(cookieNamed(payload.cookies, 'sessionToken')).toMatchSnapshot();
    // Check that we actually went through the form
    expect(cookieNamed(payload.cookies, '_csrf')).toBeDefined();
  });

  it('should works even with a 2-steps login', async () => {
    const { res, body } = await sendLoginRequest({
      url: 'http://localhost:3000/secure/login/step1',
      username: 'admin',
      password: 'password',
    });

    expect(res.statusCode).toBe(200);

    const jar: Cookie[] = JSON.parse(body).cookies;
    expect(cookieNamed(jar, 'sessionToken')).toMatchSnapshot();
    // Check that we actually went through the form
    expect(cookieNamed(jar, '_csrf')).toBeDefined();
  });

  it('should works with a 2-steps JS login', async () => {
    const { res, body } = await sendLoginRequest({
      url: 'http://localhost:3000/secure/login/2steps',
      username: 'admin',
      password: 'password',
    });

    expect(res.statusCode).toBe(200);

    const jar: Cookie[] = JSON.parse(body).cookies;
    expect(cookieNamed(jar, 'sessionToken')).toMatchSnapshot();
    // Check that we actually went through the form
    expect(cookieNamed(jar, '_csrf')).toBeDefined();
  });

  it('should works but not get a session token with bad credentials', async () => {
    const { res, body } = await sendLoginRequest({
      url: secureLoginUrl,
      username: 'admin',
      password: 'admin',
    });

    expect(res.statusCode).toBe(200);

    const payload: PostLoginSuccess = JSON.parse(body);
    expect(payload.cookies).toHaveLength(1);
    expect(cookieNamed(payload.cookies, 'sessionToken')).toBeUndefined();
    // Check that we actually went through the form
    expect(cookieNamed(payload.cookies, '_csrf')).toBeDefined();
  });
});
155 |
describe('JavaScript redirect', () => {
  // Same login request for both tests; only `renderHTML` differs.
  const redirectingLogin = {
    url: 'http://localhost:3000/secure/login?redirect=true',
    username: 'admin',
    password: 'password',
    waitTime: {
      min: 1000,
    },
  };

  it('should fail to renderHTML because of the JS redirect', async () => {
    const { res, body } = await sendLoginRequest({
      ...redirectingLogin,
      renderHTML: true,
    });

    expect(res.statusCode).toBe(200);
    expect(body).toBe(
      'OK(/test)'
    );
  });

  it('should not try to render the body if renderHTML was not requested', async () => {
    const { res, body } = await sendLoginRequest({ ...redirectingLogin });

    // Since we didn't try to render, it returns the current cookies, even if there is an ongoing JS redirection
    expect(res.statusCode).toBe(200);

    const payload: PostLoginSuccess = JSON.parse(body);
    expect(payload.body).toBe(
      'OK(/test)'
    );
    expect(payload.statusCode).toBe(200);
    expect(payload.metrics.timings.total).toBeGreaterThan(1000);
    expect(payload.resolvedUrl).toBe('http://localhost:3000/secure/test');

    const sessionCookie = payload.cookies.find(
      ({ name }) => name === 'sessionToken'
    );
    expect(sessionCookie).toMatchSnapshot();
  });
});
199 |
--------------------------------------------------------------------------------
/src/__tests__/redirect.test.ts:
--------------------------------------------------------------------------------
1 | import type { PostRenderSuccess } from '../api/@types/postRender';
2 |
3 | import { cleanString, postRender, request } from './helpers';
4 |
describe('server redirect', () => {
  it('should return the redirection', async () => {
    // !---
    // Server Redirect are flaky since Playwright do not catch 301
    // You might want to relaunch the test if it failed.
    const longMinWait = { min: 5000 }; // wait long to be sure we end up being redirected
    const { res, body } = await postRender({
      url: 'http://localhost:3000/301',
      waitTime: longMinWait,
    });

    const rendered: PostRenderSuccess = JSON.parse(body);
    expect(res.statusCode).toBe(200);

    expect(rendered.body).toBeNull();
    expect(rendered.headers).toMatchObject({
      location: '/test-website/basic.html',
    });
    expect(rendered.statusCode).toBe(301);
    expect(rendered.timeout).toBe(false);
    expect(rendered.resolvedUrl).toBe(
      'http://localhost:3000/test-website/basic.html'
    );

    // Make sure execution was interrupted gracefully
    const { total, serialize, close } = rendered.metrics.timings;
    expect(total).toBeGreaterThan(0);
    expect(serialize).toBeNull();
    expect(close).toBeGreaterThan(0);
  });
});
36 |
describe('meta refresh', () => {
  /**
   * Assertions shared by both scenarios: the API answers 200, the redirection
   * target is reported and no body has been serialized.
   */
  const expectReportedRedirection = (
    apiStatus: unknown,
    rendered: PostRenderSuccess
  ): void => {
    expect(apiStatus).toBe(200);

    expect(rendered.statusCode).toBe(200);
    expect(rendered.body).toBeNull();
    expect(rendered.resolvedUrl).toBe(
      'http://localhost:3000/test-website/basic.html'
    );
    expect(rendered.error).toBe('redirection');

    // Make sure execution was interrupted gracefully
    const { total, serialize, close } = rendered.metrics.timings;
    expect(total).toBeGreaterThan(0);
    expect(serialize).toBeNull();
    expect(close).toBeGreaterThan(0);
  };

  it('should return the redirection', async () => {
    const { res, body } = await postRender({
      url: 'http://localhost:3000/test-website/meta-refresh.html',
      ua: 'Algolia Crawler',
    });

    const rendered: PostRenderSuccess = JSON.parse(body);
    expectReportedRedirection(res.statusCode, rendered);
  });

  it('should return the redirection even if not executed yet', async () => {
    const { res, body } = await postRender({
      // The client redirection happens after 5sec but we only wait 2sec
      url: 'http://localhost:3000/test-website/meta-refresh-5.html',
      ua: 'Algolia Crawler',
      waitTime: {
        max: 2000,
      },
    });

    const rendered: PostRenderSuccess = JSON.parse(body);
    expectReportedRedirection(res.statusCode, rendered);
  });
});
86 |
describe('js redirects', () => {
  /**
   * Assertions shared by the scenarios where the redirection is caught:
   * the API answers 200, the target is reported and no body is serialized.
   */
  const expectCaughtRedirection = (
    apiStatus: unknown,
    rendered: PostRenderSuccess
  ): void => {
    expect(apiStatus).toBe(200);

    expect(rendered.statusCode).toBe(200);
    expect(rendered.body).toBeNull();
    expect(rendered.resolvedUrl).toBe(
      'http://localhost:3000/test-website/basic.html'
    );
    expect(rendered.error).toBe('redirection');

    // Make sure execution was interrupted gracefully
    const { total, serialize, close } = rendered.metrics.timings;
    expect(total).toBeGreaterThan(0);
    expect(serialize).toBeNull();
    expect(close).toBeGreaterThanOrEqual(0);
  };

  it('should catch redirection', async () => {
    const { res, body } = await postRender({
      url: 'http://localhost:3000/test-website/js-redirect.html?to=/test-website/basic.html',
      ua: 'Algolia Crawler',
      waitTime: {
        max: 2000,
      },
    });

    const rendered: PostRenderSuccess = JSON.parse(body);
    expectCaughtRedirection(res.statusCode, rendered);
  });

  it('should catch path', async () => {
    const { res, body } = await postRender({
      url: 'http://localhost:3000/test-website/js-redirect-path.html',
      ua: 'Algolia Crawler',
      waitTime: {
        min: 2000,
      },
    });

    const rendered: PostRenderSuccess = JSON.parse(body);
    expectCaughtRedirection(res.statusCode, rendered);
  });

  it('should catch history pushState', async () => {
    const { res, body } = await postRender({
      url: 'http://localhost:3000/test-website/js-redirect-history.html',
      ua: 'Algolia Crawler',
      waitTime: {
        min: 2000,
      },
    });

    const rendered: PostRenderSuccess = JSON.parse(body);
    expectCaughtRedirection(res.statusCode, rendered);
  });

  it('should catch hash but render normally', async () => {
    const { res, body } = await postRender({
      url: 'http://localhost:3000/test-website/js-redirect-hash.html',
      ua: 'Algolia Crawler',
      waitTime: {
        min: 2000,
      },
    });

    const rendered: PostRenderSuccess = JSON.parse(body);
    expect(res.statusCode).toBe(200);

    expect(rendered.statusCode).toBe(200);
    expect(rendered.body).toBe(
      ` \n\n\n \n\n\n\n`
    );
    expect(rendered.error).toBeNull();

    // Make sure execution was interrupted gracefully
    const { total, serialize, close } = rendered.metrics.timings;
    expect(total).toBeGreaterThan(0);
    expect(serialize).toBeGreaterThan(0);
    expect(close).toBeGreaterThanOrEqual(0);
  });

  it('should output 307', async () => {
    const encodedTarget = encodeURIComponent('/test-website/basic.html');
    const { res, body } = await request(
      `http://localhost:3000/render?url=http%3A%2F%2Flocalhost%3A3000%2Ftest-website%2Fjs-redirect.html?to=${encodedTarget}&ua=Algolia+Crawler`
    );

    expect(res.statusCode).toBe(307);
    expect(res.headers).toEqual({
      connection: 'keep-alive',
      'content-length': '0',
      date: expect.any(String),
      'keep-alive': 'timeout=5',
      location: 'http://localhost:3000/test-website/basic.html',
    });

    expect(cleanString(body)).toBe('');
  });
});
206 |
--------------------------------------------------------------------------------
/src/lib/TasksManager.ts:
--------------------------------------------------------------------------------
1 | import {
2 | RENDERSCRIPT_TASK_TYPE_TAG,
3 | RENDERSCRIPT_TASK_URL_TAG,
4 | report,
5 | } from '../helpers/errorReporting';
6 | import { log as mainLog } from '../helpers/logger';
7 | import { stats } from '../helpers/stats';
8 |
9 | import type { BrowserEngine } from './browser/Browser';
10 | import { Browser } from './browser/Browser';
11 | import { RESPONSE_IGNORED_ERRORS } from './browser/constants';
12 | import { UNHEALTHY_TASK_TTL } from './constants';
13 | import { cleanErrorMessage, ErrorIsHandledError } from './helpers/errors';
14 | import type { Task } from './tasks/Task';
15 | import type { TaskObject, TaskFinal } from './types';
16 |
17 | export const log = mainLog.child({ svc: 'mngr' });
18 |
/**
 * Owns the two browsers (chromium and firefox) and runs tasks against them.
 *
 * The manager starts in the "stopping" state; `launch()` must complete before
 * `task()` accepts work, and `stop()` waits for the registered tasks before
 * stopping the browsers.
 */
export class TasksManager {
  #chromium: Browser | null = null;
  #firefox: Browser | null = null;
  #stopping: boolean = true;
  #tasks: Map<string, TaskObject> = new Map();
  #totalRun: number = 0;

  /**
   * Compute the health of this node.
   *
   * Not ready when the manager is stopping, when a registered task was created
   * at least `UNHEALTHY_TASK_TTL` ms ago, or when a browser is missing or not
   * ready. `oldTasks` lists the offending tasks, if any.
   */
  getHealth(): { ready: boolean; reason?: string; oldTasks: string[][] } {
    const oldTasks: any[][] = [];

    if (this.#stopping) {
      return { ready: false, reason: 'stopping', oldTasks };
    }

    // Tasks lifecycle
    this.#tasks.forEach((task) => {
      const duration = Date.now() - task.ref.createdAt!.getTime();
      if (duration < UNHEALTHY_TASK_TTL) {
        return;
      }
      oldTasks.push([
        duration,
        task.ref.id,
        task.ref.params.url.href,
        JSON.stringify(task.ref.results),
        JSON.stringify(task.ref.metrics),
        task.ref.isDone,
      ]);
    });

    if (oldTasks.length > 0) {
      return { ready: false, reason: 'oldTasks', oldTasks };
    }

    if (this.#chromium && this.#firefox) {
      return {
        ready: this.#chromium.isReady && this.#firefox.isReady,
        reason: `browser(s) not ready: chromium: ${
          this.#chromium.isReady ? '✅' : '❌'
        } ; firefox: ${this.#firefox.isReady ? '✅' : '❌'}`,
        oldTasks,
      };
    }

    return { ready: false, oldTasks };
  }

  /** Browsers currently held by the manager, keyed by engine name. */
  get currentBrowsers(): Map<BrowserEngine, Browser | null> {
    return new Map<BrowserEngine, Browser | null>([
      ['chromium', this.#chromium],
      ['firefox', this.#firefox],
    ]);
  }

  /** Number of tasks currently registered. */
  get currentConcurrency(): number {
    return this.#tasks.size;
  }

  /** Number of tasks registered since the manager was created. */
  get totalRun(): number {
    return this.#totalRun;
  }

  /**
   * Create both browsers and start accepting tasks.
   */
  async launch(): Promise<void> {
    const chromium = new Browser('chromium');
    await chromium.create();
    const firefox = new Browser('firefox');
    await firefox.create();

    this.#chromium = chromium;
    this.#firefox = firefox;
    this.#stopping = false;
    log.info('Ready');
  }

  /**
   * Register and execute a task.
   *
   * @throws Error when the node is unhealthy and the browser requested by the
   * task is not ready.
   */
  async task(task: Task): Promise<TaskFinal> {
    const health = this.getHealth();
    if (!health.ready) {
      // The process can be marked as not ready because one of the browsers is not up
      // If we receive a job for a browser that is ready, only report and process it.
      if (
        (!task.params.browser || task.params.browser === 'chromium') &&
        this.#chromium?.isReady
      ) {
        report(new Error('Unhealthy node received a job but can process it'), {
          url: task.params.url,
          browser: 'chromium',
          reason: health.reason,
        });
      } else if (task.params.browser === 'firefox' && this.#firefox?.isReady) {
        report(new Error('Unhealthy node received a job but can process it'), {
          url: task.params.url,
          browser: 'firefox',
          reason: health.reason,
        });
      } else {
        throw new Error(`Unhealthy node received a job: ${health.reason}`);
      }
    }

    try {
      const promise = this.#exec(task);
      this.#totalRun += 1;
      this.#tasks.set(task.id, {
        ref: task,
        promise,
      });

      return await promise;
    } finally {
      this.#tasks.delete(task.id);
    }
  }

  /**
   * Stop the task manager.
   *
   * Waits for every registered task, then stops both browsers. Firefox is
   * stopped even when stopping chromium fails, so one failing browser can not
   * leave the other one running; the failure is still propagated.
   */
  async stop(): Promise<void> {
    this.#stopping = true;
    log.info('[Manager] stopping...');

    // We wait for all tasks to finish before closing
    const promises: Array<Promise<void>> = [];
    this.#tasks.forEach((task) => {
      promises.push(this.#removeTask(task.ref.id));
    });
    await Promise.all(promises);

    this.#tasks.clear();

    try {
      await this.#chromium?.stop();
    } finally {
      this.#chromium = null;
      try {
        await this.#firefox?.stop();
      } finally {
        this.#firefox = null;
      }
    }
  }

  /**
   * Actual execution of a task.
   * It will create a browser, a page, launch the task (render, login), close everything.
   * Any unexpected error will be thrown.
   */
  async #exec(task: Task): Promise<TaskFinal> {
    if (this.#stopping) {
      throw new Error('Task can not be executed: stopping');
    }

    const engine: BrowserEngine = task.params.browser || 'chromium';
    const browser = engine === 'firefox' ? this.#firefox : this.#chromium;
    if (!browser || !browser.isReady) {
      throw new Error('Task can not be executed: no_browser');
    }

    const id = task.id;
    const url = task.params.url.href;
    const type = task.constructor.name;
    log.info('Processing', { id, url, type });

    const start = Date.now();

    try {
      await task.createContext(browser);
      await task.process();
    } catch (err: any) {
      /* eslint-disable no-param-reassign */
      // Handled errors already stored their details in `task.results`.
      if (!(err instanceof ErrorIsHandledError)) {
        task.results.error = task.results.error || cleanErrorMessage(err);
        task.results.rawError = err;
        report(err, { url }, [
          {
            key: RENDERSCRIPT_TASK_URL_TAG,
            value: url,
          },
          {
            key: RENDERSCRIPT_TASK_TYPE_TAG,
            value: type,
          },
        ]);
      }
      /* eslint-enable no-param-reassign */
    }

    try {
      await task.saveMetrics();
    } catch (err: any) {
      // Task itself should never break the whole execution
      report(err, { url });
    }

    // No matter what happen we want to kill everything gracefully
    try {
      await task.close();
      this.#tasks.delete(id);
    } catch (err: any) {
      // Don't let close errors crash the process.
      // The thrown value is not guaranteed to carry a string `message`;
      // normalise it so this handler can not throw itself.
      const message: string =
        typeof err?.message === 'string' ? err.message : String(err);

      if (RESPONSE_IGNORED_ERRORS.some((msg) => message.includes(msg))) {
        // Expected error when browser is already closed
        log.debug('Expected close error', { err: message, url });
      } else {
        report(new Error('Error during close'), { err, url });
      }
    }

    // ---- Reporting
    const total = Date.now() - start;
    stats.timing('renderscript.task', total, undefined, { type });

    if (task.metrics.page) {
      const mp = task.metrics.page;
      /* eslint-disable prettier/prettier */
      stats.timing(`renderscript.task.download`, mp.timings.download!);
      stats.histogram(`renderscript.task.requests`, mp.requests.total);
      stats.increment(`renderscript.task.requests.amount`, mp.requests.total);
      stats.histogram(`renderscript.task.blockedRequests`, mp.requests.blocked);
      stats.increment(`renderscript.task.blockedRequests.amount`, mp.requests.blocked);
      stats.increment(`renderscript.task.contentLength.amount`, mp.contentLength.main);
      stats.histogram(`renderscript.task.contentLength`, mp.contentLength.main);
      stats.increment(`renderscript.task.contentLengthTotal.amount`, mp.contentLength.total);
      stats.histogram(`renderscript.task.contentLengthTotal`, mp.contentLength.total);
      /* eslint-enable prettier/prettier */
    }

    log.info(
      { id, url, code: task.results.error, metrics: task.metrics },
      'Done'
    );
    const res = task.results;
    return {
      ...res,
      timeout: task.page?.hasTimeout || false,
      metrics: task.metrics,
    };
  }

  /**
   * Wait for a registered task to settle; its own failure is ignored.
   *
   * @throws Error when no task is registered under `id`.
   */
  async #removeTask(id: string): Promise<void> {
    const task = this.#tasks.get(id);
    if (!task) {
      throw new Error(`Could not find task: ${id}`);
    }

    try {
      await task.promise;
    } catch (err) {
      // Deliberately ignored: only the settlement of the task matters here.
    }
  }
}
271 |
--------------------------------------------------------------------------------
/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Renderscript
6 |
8 |
9 |
19 |
20 |
21 |
22 |
23 |
24 |
34 |
35 | renderscript
36 |
37 |
38 |
142 |
143 |
144 |
145 |
--------------------------------------------------------------------------------
/src/lib/tasks/Login.ts:
--------------------------------------------------------------------------------
1 | import type { ElementHandle, Response, Request, Locator } from 'playwright';
2 |
3 | import { report } from '../../helpers/errorReporting';
4 | import { waitForPendingRequests } from '../../helpers/waitForPendingRequests';
5 | import { cleanErrorMessage } from '../helpers/errors';
6 | import { getInput } from '../helpers/getInput';
7 | import type { LoginTaskParams } from '../types';
8 |
9 | import { Task } from './Task';
10 |
/**
 * CSS selectors tried, in order, to locate the username/email field.
 * They go from the most to the least specific; the first selector that
 * resolves to a usable input wins, so the order is significant.
 * Declared readonly because the list is shared by every login task.
 */
const usernameSelectors: readonly string[] = [
  'input[type=email][id*=login i]',
  'input[type=email][name*=login i]',
  'input[type=text][id*=login i]',
  'input[type=text][id*=email i]',
  'input[type=text][id*=username i]',
  'input[type=text][name*=login i]',
  'input[type=text][name*=email i]',
  'input[type=text][name*=username i]',
  'input[type=email]',
  'input[type=text]',
];

/** Selector for the password field, skipping inputs flagged `aria-hidden="true"`. */
const passwordSel = 'input[type=password]:not([aria-hidden="true"])';
24 |
export class LoginTask extends Task<LoginTaskParams> {
  /**
   * Run the login flow: open the page, type the username and the password
   * (handling two-step forms), submit, then store the resolved URL, the
   * cookies and the rendered body in `this.results`.
   *
   * @throws Error when called before a page exists on the task.
   */
  async process(): Promise<void> {
    if (!this.page) {
      throw new Error('Calling process before createContext()');
    }

    /* Setup */
    const { url } = this.params;
    let response: Response;

    try {
      response = await this.page.goto(url.href, {
        timeout: this.timeBudget.get(),
        waitUntil: 'networkidle',
      });
    } catch (err: any) {
      return this.throwHandledError({ error: err.message, rawError: err });
    } finally {
      this.setMetric('goto');
    }

    await this.saveStatus(response);

    const usernameInput = await this.#typeUsername();
    await this.saveStatus(response);
    if (!usernameInput) {
      // #typeUsername() resolves to nothing only after it already went
      // through throwHandledError(), so there is nothing left to do.
      return;
    }

    // Get the password input
    const passwordInput = await this.#typePasswordInput({
      textInput: usernameInput,
      step: '1',
    });
    await this.saveStatus(response);
    if (passwordInput === null) {
      // The locator matched but no element handle could be obtained:
      // report it instead of crashing on `null.press()` while submitting.
      return this.throwHandledError({
        error: 'field_not_found',
        rawError: new Error('Password field not found'),
      });
    }
    if (!passwordInput) {
      // Same as above: a handled error was already reported.
      return;
    }

    // Submit
    await this.#submitForm(passwordInput);

    await this.saveStatus(response);

    await this.minWait();
    await this.saveStatus(response);

    const pageRef = this.page.ref;
    if (!pageRef) {
      return;
    }

    /* Transforming */
    this.results.resolvedUrl = pageRef.url();
    // we get the cookie for the requested domain
    // this is not ideal for some SSO, returning valid cookies but missing some of them
    this.results.cookies = await pageRef
      .context()
      .cookies([url.href, this.results.resolvedUrl]);

    if (this.results.cookies.length <= 0) {
      return this.throwHandledError({ error: 'no_cookies' });
    }

    const body = await this.page.renderBody();
    this.results.body = body;
    this.setMetric('serialize');
  }

  /**
   * Get username input and type the value in it.
   *
   * @returns The handle of the filled input, or nothing when a handled
   * error (`field_not_found`) was reported instead.
   */
  async #typeUsername(): Promise<ElementHandle<
    HTMLElement | SVGElement
  > | void> {
    const { log, page, params } = this;
    const { login } = params;

    try {
      // Try multiple selectors, from the most to the least precise,
      // and keep the first one that resolves without error.
      let usernameInputLoc: Locator | null = null;
      for (const usernameSel of usernameSelectors) {
        const input = await getInput(page, usernameSel);
        if (!('error' in input)) {
          usernameInputLoc = input;
          break;
        }
      }
      if (!usernameInputLoc) {
        return this.throwHandledError({
          error: 'field_not_found',
          rawError: new Error('Username field not found'),
        });
      }

      log.info('Entering username...', { userName: login.username });

      const usernameInput = await usernameInputLoc.elementHandle({
        timeout: 500,
      });
      if (!usernameInput) {
        // Without a handle nothing would be typed; fail explicitly rather
        // than continuing with an empty username field.
        return this.throwHandledError({
          error: 'field_not_found',
          rawError: new Error('Username field not found'),
        });
      }

      // https://playwright.dev/docs/release-notes#version-138
      await usernameInput.fill(login.username, {
        noWaitAfter: true,
        timeout: this.timeBudget.getRange(2000, 3000),
      });

      return usernameInput;
    } finally {
      this.timeBudget.consume();
    }
  }

  /**
   * Get the password input and type the password in it.
   * When no password input exists during step '1', the username is
   * submitted first to reveal a possible second step.
   *
   * @param textInput - Handle of the username input, used to submit step one.
   * @param step - '1' on the first attempt, '2' after the username was submitted.
   * @returns The password input handle (possibly null), or nothing when a
   * handled error was reported.
   */
  async #typePasswordInput({
    textInput,
    step,
  }: {
    textInput: ElementHandle<HTMLElement | SVGElement>;
    step: '1' | '2';
  }): Promise<ElementHandle<HTMLElement | SVGElement> | null | void> {
    const { page, params } = this;
    const { login } = params;

    try {
      // Find the input
      const passwordInputLoc = await getInput(page, passwordSel);
      if (!('error' in passwordInputLoc)) {
        this.log.info('Entering password...');
        await passwordInputLoc.fill(login.password, {
          noWaitAfter: true,
          timeout: this.timeBudget.getRange(2000, 3000),
        });

        return passwordInputLoc.elementHandle();
      }

      if (passwordInputLoc.error === 'too_many_fields') {
        return this.throwHandledError(passwordInputLoc);
      }

      // Still nothing after the username was submitted: give up.
      if (step === '2' && passwordInputLoc.error === 'field_not_found') {
        return this.throwHandledError(passwordInputLoc);
      }

      return await this.#handleFirstStepForm({ textInput });
    } finally {
      this.timeBudget.consume();
    }
  }

  /**
   * Try to submit first step form to get the password input.
   *
   * @param textInput - Handle of the username input that receives "Enter".
   */
  async #handleFirstStepForm({
    textInput,
  }: {
    textInput: ElementHandle<HTMLElement | SVGElement>;
  }): Promise<ElementHandle<HTMLElement | SVGElement> | null | void> {
    const log = this.log;

    // It can be that we are in a "two step form"
    log.info('No password input found: validating username...');

    // Submit the form to see if the second step appears
    await textInput.press('Enter', {
      noWaitAfter: true,
      timeout: this.timeBudget.getRange(2000, 3000),
    });
    this.timeBudget.consume();

    // And wait for a new input to be there maybe
    // page!.waitForNavigation() doesn't work with Okta for example, it's JS based
    await this.page!.ref?.waitForSelector(passwordSel, {
      timeout: this.timeBudget.min(3000),
    });
    this.timeBudget.consume();

    log.debug('Current URL', { pageUrl: this.page!.ref?.url() });
    return this.#typePasswordInput({ textInput, step: '2' });
  }

  /**
   * Submit form and wait for response or something to happen.
   *
   * @param passwordInput - Handle of the password input that receives "Enter".
   */
  async #submitForm(
    passwordInput: ElementHandle<HTMLElement | SVGElement>
  ): Promise<void> {
    const log = this.log;
    const { url } = this.params;
    let res: Response | null = null;

    try {
      log.debug(`Submit login form`);
      // We don't submit form directly because sometimes there are no form
      // We wait both at the same time because navigation happens quickly
      [res] = await Promise.all([
        this.page!.waitForNavigation({
          timeout: this.timeBudget.min(3000),
          waitUntil: 'domcontentloaded',
        }),
        passwordInput.press('Enter', {
          noWaitAfter: true,
          timeout: this.timeBudget.getRange(2000, 3000),
        }),
      ]);
    } catch (err: any) {
      this.page!.throwIfNotTimeout(err);
    } finally {
      this.timeBudget.consume();
    }

    try {
      log.debug(`Login wait for network idle`);
      const remainingBudget = this.timeBudget.get();
      const startWaitTime = Date.now();

      // After it is submitted there can be quite a lot of redirections, so we wait a bit more
      // we could do it before, but it's easier to split domcontentloaded and networkidle for debug
      const resAfterNetwork = await this.page!.waitForNavigation({
        timeout: this.timeBudget.min(5000),
        waitUntil: 'networkidle',
      });
      if (resAfterNetwork) {
        // if no navigation happened, resAfterNetwork is null
        // but we don't want to erase res because it is most of the time normal if we already reached the final page
        res = resAfterNetwork;
      }
      const timeWaited = Date.now() - startWaitTime;
      await waitForPendingRequests(this.page!, remainingBudget - timeWaited);
    } catch (err: any) {
      report(new Error('Error waiting to submit form'), {
        err: err.message,
        pageUrl: this.page!.ref?.url(),
      });
      return this.throwHandledError({
        error: cleanErrorMessage(err),
        rawError: err,
      });
    } finally {
      this.timeBudget.consume();
    }

    const hasSpecialCase = this.#needSpecialCase();
    if (hasSpecialCase) {
      log.debug(`Login wait for spec`);
      try {
        const [resAfterSpec] = await Promise.all([
          this.page!.waitForNavigation({
            timeout: this.timeBudget.min(5000),
            waitUntil: 'networkidle',
          }),
          this.#handleSpecialCaseForm({ name: hasSpecialCase }),
        ]);
        if (resAfterSpec) {
          res = resAfterSpec;
        }
      } catch (err: any) {
        this.page!.throwIfNotTimeout(err);
      } finally {
        this.timeBudget.consume();
      }
    }

    if (!res) {
      if (this.page!.ref?.url() === url.href) {
        // Return an error if we got no login response and are still on the same URL
        return this.throwHandledError({ error: 'no_response_after_login' });
      }

      // Can happen if navigation was done through History API
      log.debug('No login response, but redirected', {
        pageUrl: this.page!.ref?.url(),
      });
      return;
    }

    // Computing redirection chain: walk back through every request that
    // was redirected into the final one (the final request itself is not listed).
    const chain: string[] = [];
    let prev: Request | null = res.request().redirectedFrom();
    while (prev) {
      chain.push(prev.url());
      prev = prev.redirectedFrom();
    }
    log.debug('Login after redirections', {
      pageUrl: this.page!.ref?.url(),
      chain,
    });
  }

  /**
   * Tell whether the current page needs a site-specific extra step.
   *
   * @returns The name of the special case, or false when none applies.
   */
  #needSpecialCase(): 'login.live.com' | false {
    if (!this.page?.ref) {
      return false;
    }

    try {
      // Compare the parsed host rather than a string prefix, so that
      // look-alike hosts such as `login.live.com.example.org` do not match.
      const { protocol, hostname } = new URL(this.page.ref.url());
      if (protocol === 'https:' && hostname === 'login.live.com') {
        return 'login.live.com';
      }
    } catch {
      // The current URL could not be parsed: no special case applies.
    }

    return false;
  }

  /**
   * Perform the extra interaction required by a known special case.
   *
   * @param name - Special case to handle, as returned by `#needSpecialCase()`.
   */
  async #handleSpecialCaseForm({
    name,
  }: {
    name: 'login.live.com';
  }): Promise<void> {
    const { log } = this;
    if (!this.page?.ref) {
      return;
    }

    // Spec for Microsoft SSO
    if (name === 'login.live.com') {
      log.debug('MSFT: Entering specs');

      // There is a "Keep me signed in?" checkbox now
      const confirm = this.page.ref.locator('#KmsiCheckboxField');
      const submit = this.page.ref.locator('input[type=submit]');

      if ((await confirm.count()) === 1 && (await submit.count()) === 1) {
        log.debug('MSFT: found confirm and submit');

        await confirm.click({
          timeout: this.timeBudget.getRange(200, 500),
          noWaitAfter: true, // Otherwise wait for navigation
        });

        await submit.click({
          timeout: this.timeBudget.getRange(200, 500),
          noWaitAfter: true, // Otherwise wait for navigation
        });
      }
    }
  }
}
362 |
--------------------------------------------------------------------------------
/src/lib/browser/Page.ts:
--------------------------------------------------------------------------------
1 | import type { BrowserContext, Page, Route, Response } from 'playwright';
2 |
3 | import { report } from '../../helpers/errorReporting';
4 | import { log } from '../../helpers/logger';
5 | import {
6 | promiseWithTimeout,
7 | PromiseWithTimeoutError,
8 | } from '../../helpers/promiseWithTimeout';
9 | import { stats } from '../../helpers/stats';
10 | import { DATA_REGEXP, IGNORED_RESOURCES } from '../constants';
11 | import { cleanErrorMessage } from '../helpers/errors';
12 | import { isURLAllowed } from '../helpers/validateURL';
13 | import { adblocker } from '../singletons';
14 | import type { PageMetrics, Perf, TaskBaseParams } from '../types';
15 |
16 | import type { BrowserEngine } from './Browser';
17 | import { DEFAULT_ENGINE } from './Browser';
18 | import {
19 | METRICS_IGNORED_ERRORS,
20 | REQUEST_IGNORED_ERRORS,
21 | RESPONSE_IGNORED_ERRORS,
22 | } from './constants';
23 |
/**
 * Abstract some logic around playwright pages: creation, navigation with
 * timeout tolerance, request/response filtering and metrics collection.
 */
export class BrowserPage {
  #ref: Page | undefined;
  #context: BrowserContext | undefined;
  #engine: BrowserEngine;
  #metrics: PageMetrics = {
    timings: {
      download: 0,
    },
    requests: {
      total: 0,
      blocked: 0,
      pending: 0,
    },
    contentLength: {
      main: 0,
      total: 0,
    },
    mem: {
      jsHeapUsedSize: null,
      jsHeapTotalSize: null,
    },
  };
  #redirection?: string;
  #hasTimeout: boolean = false;
  #initialResponse?: Response;

  /** Underlying playwright page; undefined before `create()` and after `close()`. */
  get ref(): Page | undefined {
    return this.#ref;
  }

  /** Browser context this page lives in. */
  get context(): BrowserContext | undefined {
    return this.#context;
  }

  /** True once both the page and its context are available. */
  get isReady(): boolean {
    return Boolean(this.#ref && this.#context);
  }

  /** True only when a page exists and playwright reports it as closed. */
  get isClosed(): boolean {
    return this.#ref?.isClosed() === true;
  }

  /** True once a timeout error went through `throwIfNotTimeout()`. */
  get hasTimeout(): boolean {
    return this.#hasTimeout;
  }

  /** URL of a detected navigation away from the original URL, if any. */
  get redirection(): string | undefined {
    return this.#redirection;
  }

  /** First response seen by the handler from `getOnResponseHandler()`. */
  get initialResponse(): Response | undefined {
    return this.#initialResponse;
  }

  /** Number of requests started but neither finished nor failed yet. */
  get pendingRequests(): number {
    return this.#metrics.requests.pending;
  }

  /**
   * @param context - Browser context the page will be created in.
   * @param engine - Browser engine name, defaults to `DEFAULT_ENGINE`.
   */
  constructor(context: BrowserContext, engine?: BrowserEngine) {
    this.#context = context;
    this.#engine = engine || DEFAULT_ENGINE;
  }

  /**
   * Create an empty page in a browser and register the listeners that
   * report crashes/popups and keep the pending requests counter up to date.
   */
  async create(): Promise<void> {
    const start = Date.now();
    const page = await this.#context!.newPage();

    stats.timing('renderscript.page.create', Date.now() - start);
    this.#ref = page;

    page.on('crash', () => {
      // e.g: crash happen on OOM.
      report(new Error('Page crashed'), { pageUrl: page.url() });
    });
    page.on('popup', () => {
      report(new Error('Popup created'), { pageUrl: page.url() });
    });
    page.on('request', (req) => {
      log.debug('request_start', { url: req.url(), pageUrl: page.url() });
      this.#metrics.requests.pending += 1;
    });
    page.on('requestfailed', (req) => {
      log.debug('request_failed', { url: req.url(), pageUrl: page.url() });
      this.#metrics.requests.pending -= 1;
    });
    page.on('requestfinished', async (req) => {
      // Settle the counter first: the logging below awaits the response and
      // must never be able to leave the pending count too high.
      this.#metrics.requests.pending -= 1;

      const traceEnabled = log.isLevelEnabled('trace');
      if (!traceEnabled && !log.isLevelEnabled('debug')) {
        return;
      }

      try {
        const response = await req.response();
        if (traceEnabled) {
          log.trace('request_finished', {
            url: req.url(),
            pageUrl: page.url(),
            requestHeaders: req.headers(),
            responseStatus: response?.status(),
          });
        } else {
          log.debug('request_finished', {
            url: req.url(),
            pageUrl: page.url(),
            responseStatus: response?.status(),
          });
        }
      } catch {
        // Logging is best effort; a rejection here (presumably when the page
        // is already gone) would otherwise surface as an unhandled rejection.
      }
    });
  }

  /**
   * Close the page and drop the reference to it.
   */
  async close(): Promise<void> {
    await this.#ref?.close();
    this.#ref = undefined;
  }

  /**
   * We wrap goto to handle timeout.
   * On timeout the first response seen on the page is used instead, so the
   * document can still be processed.
   *
   * @param url - URL to navigate to.
   * @param opts - Options forwarded to playwright's `page.goto()`.
   * @returns The navigation response.
   * @throws Error `goto_no_response` when no response was received at all,
   * and any non-timeout navigation error (unless a redirection was detected
   * or the navigation was aborted).
   */
  async goto(
    url: string,
    opts: Parameters<Page['goto']>[1]
  ): Promise<Response> {
    // Keep a local reference: close() may clear #ref while we are waiting,
    // and the cleanup in `finally` must still be able to run.
    const page = this.#ref;
    if (!page) {
      throw new Error('Calling goto before create()');
    }

    let response: Response | null = null;

    const onResponse = (res: Response): void => {
      // We listen to response because "goto" will throw on timeout but we still want to process the doc in that case
      if (!response) {
        response = res;
      }
    };
    page.once('response', onResponse);

    const start = Date.now();
    try {
      // Response can be assigned here or on('response')
      response = await page.goto(url, opts);
    } catch (err: any) {
      if (!this.redirection && !err.message.includes('ERR_ABORTED')) {
        this.throwIfNotTimeout(err);
      }
    } finally {
      // We remove listener, because we don't want more response
      page.removeListener('response', onResponse);
    }

    stats.timing('renderscript.page.goto', Date.now() - start, undefined, {
      success: response ? 'true' : 'false',
      waitUntil: opts?.waitUntil || 'unknown',
    });

    if (!response) {
      // Can happen in case of chrome crash
      throw new Error('goto_no_response');
    }

    return response;
  }

  /**
   * Wait for navigation with timeout handling.
   * Waits for the requested load state, then for a response with a
   * status in [200, 400). A timeout is not an error: the first response
   * seen meanwhile (or null) is returned.
   *
   * @param opts - `timeout` in milliseconds and the load state to wait for.
   * @returns The matching response, the first response seen before a
   * timeout, or null when nothing was received or no page exists.
   */
  async waitForNavigation(opts: {
    timeout: number;
    waitUntil: Parameters<Page['waitForLoadState']>[0];
  }): Promise<Response | null> {
    // Keep a local reference: close() may clear #ref while we are waiting,
    // and the cleanup in `finally` must still be able to run.
    const page = this.#ref;
    if (!page) {
      return null;
    }

    let response: Response | null = null;
    const onResponse = (res: Response): void => {
      // The waits below throw on timeout but we still want to keep
      // whatever response was received in the meantime
      if (!response) {
        response = res;
      }
    };
    page.once('response', onResponse);

    try {
      await page.waitForLoadState(opts.waitUntil, opts);
      response = await page.waitForResponse(
        (res) => res.status() >= 200 && res.status() < 400,
        opts
      );
    } catch (err: any) {
      this.throwIfNotTimeout(err);
    } finally {
      // We remove listener, because we don't want more response
      page.removeListener('response', onResponse);
    }

    return response;
  }

  /**
   * Get performance metrics from the page.
   * This function can fail silently because it's non-critical resource.
   * If that happen it will return previous metrics.
   *
   * @returns The metrics object of this page, updated when possible.
   */
  async saveMetrics(): Promise<PageMetrics> {
    try {
      if (!this.#ref || this.#ref.isClosed()) {
        // page has been closed or not yet open
        return this.#metrics;
      }

      // Serialized inside the page and capped at 200ms.
      const evaluate = await promiseWithTimeout(
        this.#ref.evaluate(() => {
          return JSON.stringify({
            curr: performance.getEntriesByType('navigation')[0],
            all: performance.getEntries(),
            // @ts-expect-error only exists in chromium
            mem: performance.memory || {},
          });
        }),
        200
      );

      if (!evaluate) {
        throw new Error('Getting perf error');
      }
      const perf: Perf = JSON.parse(evaluate);

      // `curr` is absent from the JSON when the page has no navigation entry.
      this.#metrics.timings.download = Math.round(perf.curr?.duration || 0);
      this.#metrics.mem = {
        jsHeapUsedSize: perf.mem?.usedJSHeapSize || 0,
        jsHeapTotalSize: perf.mem?.totalJSHeapSize || 0,
      };
    } catch (err: any) {
      if (!METRICS_IGNORED_ERRORS.some((msg) => err.message.includes(msg))) {
        report(new Error('Error saving metrics'), { err });
      }
    }

    return this.#metrics;
  }

  /**
   * Output body as a string at the moment it is requested.
   *
   * @param options - With `silent: true` errors are reported instead of thrown.
   * A timeout of the serialization is always reported, never thrown.
   * @returns The page content, or null when empty or unavailable.
   */
  async renderBody(
    { silent }: { silent: boolean } = { silent: false }
  ): Promise<string | null> {
    try {
      return await promiseWithTimeout(
        (async (): Promise<string | null> => {
          const start = Date.now();
          const content = await this.#ref?.content();
          stats.timing('renderscript.renderBody', Date.now() - start, {
            browser: this.#engine as string,
          });
          return content || null;
        })(),
        10000 // this is the most important part so we try hard
      );
    } catch (err: any) {
      const isTimeout = err instanceof PromiseWithTimeoutError;
      if (!isTimeout && !silent) {
        throw err;
      }
      report(err, {
        url: this.ref?.url(),
        browser: this.#engine,
        action: 'renderBody',
      });
    }
    return null;
  }

  /**
   * Add the cookies of the forwarded `cookie` header to the context,
   * scoped to the hostname of `url` and to the path `/`.
   * Failures are reported, not thrown.
   */
  async setCookies({ url, headersToForward }: TaskBaseParams): Promise<void> {
    const header = headersToForward?.cookie;
    if (!header) {
      // Nothing to forward
      return;
    }

    const cookies = header
      .split(';') // the space after `;` is optional, so we trim instead
      .map((pair) => pair.trim())
      .filter((pair) => pair !== '')
      .map((pair) => {
        // Only the first `=` separates name and value
        const [key, ...v] = pair.split('=');
        return {
          domain: url.hostname,
          path: '/',
          name: key.trim(),
          value: v.join('='),
        };
      })
      .filter((cookie) => cookie.name !== '');

    if (cookies.length === 0) {
      return;
    }

    try {
      await this.#context!.addCookies(cookies);
    } catch (err) {
      report(new Error('Failed to set cookie'), { err, url });
    }
  }

  /**
   * Disable service workers, this is recommended.
   * Any worker created on the page afterwards is reported.
   */
  async setDisableServiceWorker(): Promise<void> {
    await this.#context!.addInitScript(() => {
      // @ts-expect-error read-only prop
      delete window.navigator.serviceWorker;
    });

    // Local reference so the listener keeps working after close() cleared #ref
    const page = this.#ref!;
    page.on('worker', () => {
      report(new Error('WebWorker disabled but created'), {
        pageUrl: page.url(),
      });
    });
  }

  /**
   * Disable navigation. Only opt-in because Login requires navigation.
   * Because playwright has some limitation we can't cancel redirection directly, so it's not bulletproof.
   * Request will most likely be interrupted but due to code lag and events we can still have time to reach the backend.
   *
   * @param originalUrl - URL the page is expected to stay on (compared without hash).
   * @param onNavigation - Called with the target URL of each detected navigation.
   */
  setDisableNavigation(
    originalUrl: string,
    onNavigation: (url: string) => Promise<void>
  ): void {
    this.#ref?.on('framenavigated', async (frame) => {
      const newUrl = new URL(frame.url());
      newUrl.hash = '';
      if (originalUrl === newUrl.href) {
        return;
      }
      if (frame.parentFrame()) {
        // Sub Frame we don't care
        return;
      }
      if (newUrl.href === 'chrome-error://chromewebdata/') {
        // Page crashed
        return;
      }
      if (!this.#redirection) {
        // Can happen that on('framenavigated') event comes before on('request')
        this.#redirection = newUrl.href;
      }

      await onNavigation(newUrl.href);

      // We still report just in case.
      log.warn(
        {
          pageUrl: originalUrl,
          to: newUrl.href,
        },
        'Unexpected navigation'
      );
    });

    this.#ref?.on('request', async (req) => {
      const newUrl = new URL(req.url());

      // Playwright does not route redirection to route() so we need to manually catch them
      const main = req.frame().parentFrame() === null;
      const redir = req.isNavigationRequest();

      // Only main-frame navigation requests to another URL are relevant
      if (!redir || !main || originalUrl === newUrl.href) {
        return;
      }

      newUrl.hash = '';
      if (originalUrl === newUrl.href) {
        return;
      }

      log.info('Will navigate', { pageUrl: originalUrl, url: newUrl.href });

      this.#redirection = newUrl.href;
      await onNavigation(newUrl.href);
    });
  }

  /**
   * Helper to throw if an error is not timeout so we can reuse the response easily.
   *
   * @param err - Error caught around a playwright call.
   * @returns The error when it is a `TimeoutError`; the page is then flagged as timed out.
   * @throws The given value when it is anything but a `TimeoutError`.
   */
  throwIfNotTimeout(err: any): Error {
    if (!(err instanceof Error) || err.name !== 'TimeoutError') {
      throw err;
    }

    // This error is expected as most pages will reach timeout
    // we want to continue because we can still have a response
    this.#hasTimeout = true;
    return err;
  }

  /**
   * Get a generic request handler (route).
   * It aborts data URIs, sub-frame requests, ignored resource types,
   * adblocked URLs (when `adblock` is set) and URLs rejected by
   * `isURLAllowed`; everything else is continued. Navigation requests
   * additionally receive the forwarded headers.
   */
  getOnRequestHandler({
    url,
    adblock,
    headersToForward,
  }: TaskBaseParams): (route: Route) => Promise<void> {
    return async (route: Route): Promise<void> => {
      const req = route.request();
      const reqUrl = req.url();
      this.#metrics.requests.total += 1;

      // Count the request as blocked, then abort it
      const block = async (): Promise<void> => {
        this.#metrics.requests.blocked += 1;
        await route.abort('blockedbyclient');
      };

      try {
        if (this.#hasTimeout) {
          // If the page was killed in the meantime we don't want to process anything else
          // (not counted as blocked)
          await route.abort('blockedbyclient');
          return;
        }

        // Skip data URIs
        if (DATA_REGEXP.test(reqUrl)) {
          await block();
          return;
        }

        // Iframe block
        if (req.frame().parentFrame()) {
          await block();
          return;
        }

        // Ignore some type of resources
        if (IGNORED_RESOURCES.includes(req.resourceType())) {
          await block();
          return;
        }

        // Adblocking
        if (adblock && adblocker.match(new URL(reqUrl))) {
          await block();
          return;
        }

        // Check for ssrf attempts = page that redirects to localhost for example
        if (!(await isURLAllowed(reqUrl))) {
          await block();
          return;
        }

        if (req.isNavigationRequest()) {
          const headers = await req.allHeaders();
          await route.continue({
            // headers ignore values set for `Cookie`, relies to page.setCookie instead
            headers: { ...headers, ...headersToForward },
          });
          return;
        }

        await route.continue();
      } catch (err: any) {
        if (REQUEST_IGNORED_ERRORS.some((msg) => err.message.includes(msg))) {
          return;
        }

        report(err, {
          context: 'onRequest',
          url: url.href,
          with: reqUrl,
          browser: this.#engine,
        });
      }
    };
  }

  /**
   * Get a response handler that records the first response and
   * accumulates content length metrics. It never throws: unexpected
   * errors are reported, known benign ones are ignored.
   */
  getOnResponseHandler({
    url,
  }: TaskBaseParams): (res: Response) => Promise<void> {
    return async (res: Response) => {
      try {
        if (this.#hasTimeout) {
          // If the page was killed in the meantime we don't want to process anything else
          return;
        }

        if (this.isClosed) {
          return;
        }

        // Check if response is still valid before accessing properties
        const reqRes = await res.request().response();
        if (!reqRes) {
          // Response is no longer valid
          return;
        }

        const reqUrl = res.url();

        // Check if headers can be accessed safely
        let headers: Record<string, string>;
        try {
          headers = await res.allHeaders();
        } catch (err: any) {
          if (REQUEST_IGNORED_ERRORS.some((msg) => err.message.includes(msg))) {
            return;
          }
          throw err;
        }

        let length = 0;

        // Store initial response in case of navigation
        if (!this.#initialResponse) {
          this.#initialResponse = res;
        }

        if (headers['content-length']) {
          const declared = parseInt(headers['content-length'], 10);
          // A malformed header parses to NaN, which would turn the running
          // total into NaN for good; fall back to measuring the body instead.
          if (Number.isFinite(declared) && declared >= 0) {
            length = declared;
          }
        }

        const status = res.status();

        // Redirections do not have a body
        if (status > 300 && status < 400) {
          return;
        }

        try {
          if (length === 0 && !this.isClosed) {
            // Not every request has the content-length header, the byteLength match perfectly
            // but does not necessarily represent what was transferred (if it was gzipped for example)
            try {
              length = (await res.body()).byteLength;
            } catch (bodyErr: any) {
              // eslint-disable-next-line max-depth
              if (
                REQUEST_IGNORED_ERRORS.some((msg) =>
                  bodyErr.message.includes(msg)
                )
              ) {
                return;
              }
              throw bodyErr;
            }
          }

          if (reqUrl === url.href) {
            // If this is our original URL we log it to a dedicated metric
            this.#metrics.contentLength.main = length;
          }

          this.#metrics.contentLength.total += length;
        } catch (err: any) {
          if (
            RESPONSE_IGNORED_ERRORS.some((msg) => err.message.includes(msg))
          ) {
            return;
          }

          // We can not throw in callback, it will go directly into unhandled
          report(err, { context: 'onResponse', pageUrl: url.href, reqUrl });
        }
      } catch (err: any) {
        if (RESPONSE_IGNORED_ERRORS.some((msg) => err.message.includes(msg))) {
          return;
        }
        report(err, { context: 'onResponseHandler', pageUrl: url.href });
      }
    };
  }

  /**
   * Look for a `<meta http-equiv="refresh">` redirection in the page.
   *
   * @param options - `timeout` in milliseconds to obtain the element.
   * @returns The redirection URL (resolved against the page URL) if found.
   */
  async checkForHttpEquivRefresh({
    timeout,
  }: {
    timeout: number;
  }): Promise<URL | void> {
    if (!this.#ref) {
      return;
    }

    try {
      const url = new URL(this.#ref.url());
      const metaRefreshElement = this.#ref.locator(
        'meta[http-equiv="refresh"]'
      );

      if ((await metaRefreshElement.count()) <= 0) {
        return;
      }

      // Several tags can be present; `first()` avoids a strict mode
      // violation when resolving the locator to a single element.
      const el = await metaRefreshElement.first().elementHandle({ timeout });
      if (!el) {
        return;
      }
      const metaRefreshContent = await el.getProperty('content');
      const refreshContent: unknown = await metaRefreshContent?.jsonValue();
      if (typeof refreshContent !== 'string') {
        return;
      }

      // Accepts `5; url=x`, `5;URL=x`, `5, Url = x`: separator spacing is
      // optional and the `url` keyword is case-insensitive.
      const match = refreshContent.match(/\d+\s*[;,]\s*url\s*=\s*(.*)/i);
      if (!match) {
        return;
      }

      // Sometimes URLs are surrounded by (single or double) quotes
      const matchedURL = match[1].trim().replace(/^['"]|['"]$/g, '');
      const redirectURL = new URL(matchedURL, url);

      log.debug('Meta refresh found', { redir: redirectURL.href });

      return redirectURL;
    } catch (err: any) {
      // Errors that cleanErrorMessage() recognizes are expected here
      if (err instanceof Error && cleanErrorMessage(err) !== 'unknown_error') {
        return;
      }
      report(new Error('Error while trying to check for meta refresh'), {
        err,
        timeout: this.#hasTimeout,
      });
    }
  }
}
633 |
--------------------------------------------------------------------------------