├── .devcontainer ├── devcontainer.json └── docker-compose.yml ├── .dockerignore ├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── README.md ├── chrome.json ├── ci.sh ├── docker-compose.yml ├── jest.config.js ├── nodemon.json ├── package-lock.json ├── package.json ├── src ├── browser.ts ├── cache.ts ├── healthcheck.ts ├── loader.ts ├── logging.ts ├── main.ts ├── params.test.ts ├── params.ts ├── pool.ts ├── queue.test.ts ├── queue.ts └── server.ts ├── title.png └── tsconfig.json /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "html-spitter-dev", 3 | "dockerComposeFile": ["docker-compose.yml"], 4 | "service": "vscode", 5 | "runServices": ["vscode"], 6 | "shutdownAction": "stopCompose", 7 | "postCreateCommand": "npm install", 8 | "workspaceFolder": "/workspace", 9 | // "overrideCommand": "", 10 | "extensions": [ 11 | "dbaeumer.vscode-eslint", 12 | "ms-vscode.vscode-typescript-tslint-plugin", 13 | "esbenp.prettier-vscode", 14 | "visualstudioexptteam.vscodeintellicode", 15 | "christian-kohler.npm-intellisense", 16 | "IBM.output-colorizer", 17 | "eamodio.gitlens", 18 | "mhutchie.git-graph", 19 | "davidanson.vscode-markdownlint", 20 | "shardulm94.trailing-spaces", 21 | "alefragnani.Bookmarks", 22 | "Gruntfuggly.todo-tree", 23 | "quicktype.quicktype", 24 | "spikespaz.vscode-smoothtype", 25 | "stkb.rewrap", 26 | "vscode-icons-team.vscode-icons", 27 | "ms-azuretools.vscode-docker" 28 | ], 29 | "settings": { 30 | // General settings 31 | "files.eol": "\n", 32 | // Docker 33 | "remote.extensionKind": { 34 | "ms-azuretools.vscode-docker": "workspace" 35 | }, 36 | "editor.codeActionsOnSave": { 37 | "source.fixAll.tslint": true 38 | }, 39 | "[javascript]": { 40 | "editor.defaultFormatter": "esbenp.prettier-vscode", 41 | "editor.formatOnSave": true 42 | }, 43 | "[typescript]": { 44 | "editor.defaultFormatter": "esbenp.prettier-vscode", 45 | "editor.formatOnSave": false 46 | }, 47 | 
"eslint.autoFixOnSave": true, 48 | "eslint.validate": [ 49 | "javascript", 50 | { 51 | "autoFix": true, 52 | "language": "typescript" 53 | } 54 | ], 55 | "prettier.eslintIntegration": true 56 | } 57 | } -------------------------------------------------------------------------------- /.devcontainer/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3.7" 2 | 3 | services: 4 | vscode: 5 | image: qmcgaw/nodedevcontainer 6 | volumes: 7 | - ../:/workspace 8 | - ~/.ssh:/home/vscode/.ssh:ro 9 | - ~/.ssh:/root/.ssh:ro 10 | - /var/run/docker.sock:/var/run/docker.sock 11 | cap_add: 12 | - SYS_PTRACE 13 | security_opt: 14 | - seccomp:unconfined 15 | entrypoint: zsh -c "while sleep 1000; do :; done" 16 | # ports: 17 | # - 3000:3000/tcp 18 | environment: 19 | - PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1 20 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | .devcontainer/ 2 | .git/ 3 | build/ 4 | node_modules/ 5 | .gitignore 6 | .travis.yml 7 | chrome.json 8 | docker-compose.yml 9 | LICENSE 10 | nodemon.json 11 | README.md 12 | title.png 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | build/ 2 | node_modules/ 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | dist: xenial 2 | sudo: required 3 | git: 4 | quiet: true 5 | depth: 1 6 | env: 7 | global: 8 | - DOCKER_REPO=qmcgaw/htmlspitter 9 | before_install: 10 | - curl -fsSL https://get.docker.com | sh 11 | - echo '{"experimental":"enabled"}' | sudo tee /etc/docker/daemon.json 12 | - mkdir -p $HOME/.docker 13 | - echo '{"experimental":"enabled"}' | sudo tee $HOME/.docker/config.json 
14 | - sudo service docker start 15 | install: 16 | - docker run --rm --privileged multiarch/qemu-user-static --reset -p yes 17 | - docker buildx create --name xbuilder --use 18 | script: bash ci.sh 19 | after_success: 20 | - curl -X POST https://hooks.microbadger.com/images/$DOCKER_REPO/WEBHOOK_LINK || exit 0 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG NODE_VERSION=13.2 2 | 3 | FROM node:${NODE_VERSION}-buster-slim AS builder 4 | WORKDIR /htmlspitter 5 | COPY package.json package-lock.json ./ 6 | RUN npm install 7 | COPY . ./ 8 | RUN npm t 9 | RUN npm run build 10 | 11 | FROM node:${NODE_VERSION}-buster-slim 12 | ARG GOOGLE_CHROME_BRANCH=beta 13 | ARG VERSION 14 | ARG BUILD_DATE 15 | ARG VCS_REF 16 | LABEL \ 17 | org.opencontainers.image.authors="quentin.mcgaw@gmail.com" \ 18 | org.opencontainers.image.created=$BUILD_DATE \ 19 | org.opencontainers.image.version="$VERSION" \ 20 | org.opencontainers.image.revision=$VCS_REF \ 21 | org.opencontainers.image.url="https://github.com/qdm12/htmlspitter" \ 22 | org.opencontainers.image.documentation="https://github.com/qdm12/htmlspitter/blob/master/README.md" \ 23 | org.opencontainers.image.source="https://github.com/qdm12/htmlspitter" \ 24 | org.opencontainers.image.title="HTMLSpitter" \ 25 | org.opencontainers.image.description="Lightweight Docker image with NodeJS server to spit out HTML from loaded JS using Puppeteer and Chrome" 26 | WORKDIR /htmlspitter 27 | EXPOSE 8000 28 | RUN apt-get -qq update && \ 29 | apt-get -qq install -y --no-install-recommends gnupg2 wget && \ 30 | wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \ 31 | apt-get -qq remove -y wget gnupg2 && \ 32 | sh -c 'echo "deb http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' && \ 33 | apt-get -qq update && \ 34 | apt-get -qq install -y 
--no-install-recommends google-chrome-${GOOGLE_CHROME_BRANCH} && \ 35 | rm -rf /var/lib/apt/lists/* 36 | RUN groupadd -r nonrootgroup && \ 37 | useradd -r -g nonrootgroup -G audio,video nonrootuser && \ 38 | mkdir -p /home/nonrootuser/Downloads && \ 39 | chown -R nonrootuser:nonrootgroup /home/nonrootuser && \ 40 | chown -R nonrootuser:nonrootgroup /htmlspitter 41 | ENV CHROME_BIN=/usr/bin/google-chrome-${GOOGLE_CHROME_BRANCH} \ 42 | NODE_ENV=production 43 | HEALTHCHECK --interval=10s --timeout=3s --start-period=5s --retries=1 CMD [ "node", "./healthcheck.js" ] 44 | ENTRYPOINT [ "node", "./main.js" ] 45 | COPY package.json package-lock.json ./ 46 | RUN npm install --only=prod 47 | COPY --from=builder --chown=nonrootuser:nonrootgroup /htmlspitter/build /htmlspitter 48 | USER nonrootuser -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Quentin McGaw 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HTMLSpitter 2 | 3 | *Lightweight Docker image with NodeJS server to spit out HTML from loaded JS using Puppeteer and Chrome* 4 | 5 | [Medium story: HTML from the Javascript world](https://medium.com/@quentin.mcgaw/html-from-the-javascript-world-c536f88d51df) 6 | 7 | [![htmlspitter](https://github.com/qdm12/htmlspitter/raw/master/title.png)](https://hub.docker.com/r/qmcgaw/htmlspitter) 8 | 9 | [![Build Status](https://travis-ci.org/qdm12/htmlspitter.svg?branch=master)](https://travis-ci.org/qdm12/htmlspitter) 10 | [![Docker Pulls](https://img.shields.io/docker/pulls/qmcgaw/htmlspitter.svg)](https://hub.docker.com/r/qmcgaw/htmlspitter) 11 | [![Docker Stars](https://img.shields.io/docker/stars/qmcgaw/htmlspitter.svg)](https://hub.docker.com/r/qmcgaw/htmlspitter) 12 | [![Image size](https://images.microbadger.com/badges/image/qmcgaw/htmlspitter.svg)](https://microbadger.com/images/qmcgaw/htmlspitter) 13 | [![Image version](https://images.microbadger.com/badges/version/qmcgaw/htmlspitter.svg)](https://microbadger.com/images/qmcgaw/htmlspitter) 14 | 15 | [![Join Slack channel](https://img.shields.io/badge/slack-@qdm12-yellow.svg?logo=slack)](https://join.slack.com/t/qdm12/shared_invite/enQtOTE0NjcxNTM1ODc5LTYyZmVlOTM3MGI4ZWU0YmJkMjUxNmQ4ODQ2OTAwYzMxMTlhY2Q1MWQyOWUyNjc2ODliNjFjMDUxNWNmNzk5MDk) 16 | [![GitHub last commit](https://img.shields.io/github/last-commit/qdm12/htmlspitter.svg)](https://github.com/qdm12/htmlspitter/issues) 17 | [![GitHub commit 
activity](https://img.shields.io/github/commit-activity/y/qdm12/htmlspitter.svg)](https://github.com/qdm12/htmlspitter/issues) 18 | [![GitHub issues](https://img.shields.io/github/issues/qdm12/htmlspitter.svg)](https://github.com/qdm12/htmlspitter/issues) 19 | 20 | | Image size | RAM usage | 21 | | --- | --- | 22 | | 558MB | 110MB+ | 23 | 24 |
Click to show base components

25 | 26 | - [node:13.2-buster-slim](https://hub.docker.com/_/node/) 27 | - [Google Chrome 79 Beta](https://www.ubuntuupdates.org/package/google_chrome/stable/main/base/google-chrome-beta) 28 | - [Puppeteer v2.0.0](https://github.com/GoogleChrome/puppeteer/releases/tag/v2.0.0) 29 | 30 |

31 | 32 | The program is written in NodeJS with Typescript, in the [src](src) directory. 33 | 34 | ## Description 35 | 36 | Runs a NodeJS server accepting HTTP requests with two URL parameters: 37 | 38 | - `url` which is the URL to prerender into HTML 39 | - `wait` which is the **optional** load event to wait for before stopping the prerendering. It can be: 40 | - `load` (wait for the `load` event) 41 | - `domcontentloaded` (wait for the `DOMContentLoaded` event) 42 | - `networkidle0` (**default**, wait until there are no network connections for at least 500 ms) 43 | - `networkidle2` (wait until there are fewer than 3 network connections for at least 500 ms) 44 | 45 | For example: 46 | 47 | ``` 48 | http://localhost:8000/?url=https://github.com/qdm12/htmlspitter 49 | ``` 50 | 51 | - The server scales up Chromium instances if needed 52 | - It limits the number of opened pages per instance to prevent one page crashing all the other pages 53 | - It has a 1 hour cache for loaded HTML 54 | - It has a queue system for requests once the maximum number of pages/chromium instances is reached 55 | - **Not compatible** with architectures other than amd64 as Chrome-Beta is only built for `amd64` for now and is required. 56 | 57 | ## Usage 58 | 59 | 1. Run the container 60 | 61 | ```sh 62 | docker run -it --rm --init -p 8000:8000 qmcgaw/htmlspitter 63 | ``` 64 | 65 | You can also use [docker-compose.yml](https://github.com/qdm12/htmlspitter/blob/master/docker-compose.yml). 
66 | 67 | ## Environment variables 68 | 69 | | Name | Default | Possible values | Description | 70 | | --- | --- | --- | --- | 71 | | `MAX_PAGES` | `10` | `-1` or integer larger than `0` | Max number of pages per Chromium instance at any time, `-1` for no max | 72 | | `MAX_HITS` | `300` | `-1` or integer larger than `0` | Max number of pages opened per Chromium instance during its lifetime (before relaunch), `-1` for no max | 73 | | `MAX_AGE_UNUSED` | `60` | `-1` or integer larger than `0` | Max age in seconds of inactivity before the browser is closed, `-1` for no max | 74 | | `MAX_BROWSERS` | `10` | `-1` or integer larger than `0` | Max number of Chromium instances at any time, `-1` for no max | 75 | | `MAX_CACHE_SIZE` | `10` | `-1` or integer larger than `0` | Max number of MB stored in the cache, `-1` for no max | 76 | | `MAX_QUEUE_SIZE` | `100` | `-1` or integer larger than `0` | Max size of queue of pages per Chromium instance, `-1` for no max | 77 | | `LOG` | `normal` | `normal` or `json` | Format to use to print logs | 78 | | `TIMEOUT` | `15000` | `-1` or integer larger than `0` | Timeout in ms to load a page, `-1` for no timeout | 79 | 80 | ## Troubleshooting 81 | 82 | ### Chrome fails to launch 83 | 84 | If you obtain the error: 85 | 86 | ```json 87 | {"error":"Error: Failed to launch chrome!\nFailed to move to new namespace: PID namespaces supported, Network namespace supported, but failed: errno = Operation not permitted\n\n\nTROUBLESHOOTING: https://github.com/GoogleChrome/puppeteer/blob/master/docs/troubleshooting.md\n"} 88 | ``` 89 | 90 | Then you might need to use **seccomp** with the [chrome.json](https://github.com/qdm12/htmlspitter/blob/master/chrome.json) file of this repository: 91 | 92 | ```sh 93 | wget https://raw.githubusercontent.com/qdm12/htmlspitter/master/chrome.json 94 | docker run -it --rm --init --security-opt seccomp=$(pwd)/chrome.json -p 8000:8000 qmcgaw/htmlspitter 95 | ``` 96 | 97 | ## Details 98 | 99 | ### Program 100 | 101 | - A 
built-in local memory cache holds HTML content obtained in the last hour and is limited in the size of characters it contains. 102 | - A built-in pool of Chromium instances creates and removes Chromium instances according to the server load. 103 | - Each Chromium instance has a limited number of pages so that if one page crashes Chromium, not all page loads are lost. 104 | - As Chromium caches content, each instance is destroyed and re-created once it reaches a certain number of page loads. 105 | 106 | ### Docker 107 | 108 | - [chrome.json](https://github.com/qdm12/htmlspitter/blob/master/chrome.json) may be required depending on your host OS. 109 | - The `--init` flag is added to prevent potential zombie Chromium processes from remaining when the container stops the main NodeJS program. 110 | - A built-in healthcheck is implemented by running `node build/healthcheck.js` against a running instance. 111 | 112 | ### Performance considerations 113 | 114 | - Chromium is written in C++ and multi-threaded so it scales well with more CPU cores 115 | - The NodeJS program should not be the bottleneck because all the work is done by Chromium 116 | - The bottleneck will be CPU and especially RAM used by Chromium instance(s) 117 | - You can **scale up** by having multiple machines running the program, behind a load balancer 118 | 119 | ## Development 120 | 121 | - Either use the Docker container development image with Visual Studio Code and the remote development extension 122 | - Or install Node and NPM on your machine 123 | 124 | ```sh 125 | # Install all dependencies 126 | npm i 127 | # Transcompile the Typescript code to Javascript and run build/main.js 128 | npm run start 129 | ``` 130 | 131 | Test it with, for example: 132 | 133 | ```sh 134 | wget -qO- http://localhost:8000/?url=https://github.com/qdm12/htmlspitter 135 | ``` 136 | 137 | You can also: 138 | 139 | - Run tests 140 | 141 | ```sh 142 | npm t 143 | ``` 144 | 145 | - Run the server with hot reload (performs `npm run 
start` on each .ts change) 146 | 147 | ```sh 148 | npx nodemon 149 | ``` 150 | 151 | - Build Docker 152 | 153 | ```sh 154 | docker build -t qmcgaw/htmlspitter . 155 | ``` 156 | 157 | You can also specify the branch of Google Chrome from `beta` (default), `stable` and `unstable` 158 | 159 | ```sh 160 | docker build -t qmcgaw/htmlspitter --build-arg GOOGLE_CHROME_BRANCH=unstable . 161 | ``` 162 | 163 | - There are two environment variables you might find useful: 164 | - `PORT` to set the HTTP server listening port 165 | - `CHROME_BIN` which is the path to the Chrome binary or `Puppeteer-bundled` 166 | 167 | ### TODOs 168 | 169 | - [ ] Show Chrome version at start 170 | - [ ] Fake user agents 171 | - [ ] Prevent recursive calls to localhost 172 | - [ ] Format JSON or raw HTML 173 | - [ ] Limit Chromium instances in terms of RAM 174 | - [ ] Compression Gzip 175 | - [ ] Sync same URL with Redis (not getting twice the same URL) 176 | - [ ] Sync Cache with Postgresql or Redis depending on size 177 | - [ ] Limit data size in Postgresql according to time created 178 | - [ ] Unit testing 179 | - [ ] ReactJS GUI 180 | - [ ] Static binary in Scratch Docker image 181 | 182 | ## Credits 183 | 184 | - Credits to [jessfraz](https://github.com/jessfraz) for [chrome.json](chrome.json) 185 | - The Google Chrome team 186 | - The Puppeteer developers 187 | 188 | ## License 189 | 190 | This repository is under an [MIT license](https://github.com/qdm12/htmlspitter/blob/master/LICENSE) 191 | -------------------------------------------------------------------------------- /chrome.json: -------------------------------------------------------------------------------- 1 | { 2 | "defaultAction": "SCMP_ACT_ERRNO", 3 | "syscalls": [ 4 | { 5 | "name": "accept", 6 | "action": "SCMP_ACT_ALLOW", 7 | "args": null 8 | }, 9 | { 10 | "name": "accept4", 11 | "action": "SCMP_ACT_ALLOW", 12 | "args": null 13 | }, 14 | { 15 | "name": "access", 16 | "action": "SCMP_ACT_ALLOW", 17 | "args": null 18 | }, 19 | { 20 | 
"name": "alarm", 21 | "action": "SCMP_ACT_ALLOW", 22 | "args": null 23 | }, 24 | { 25 | "name": "arch_prctl", 26 | "action": "SCMP_ACT_ALLOW", 27 | "args": null 28 | }, 29 | { 30 | "name": "bind", 31 | "action": "SCMP_ACT_ALLOW", 32 | "args": null 33 | }, 34 | { 35 | "name": "brk", 36 | "action": "SCMP_ACT_ALLOW", 37 | "args": null 38 | }, 39 | { 40 | "name": "capget", 41 | "action": "SCMP_ACT_ALLOW", 42 | "args": null 43 | }, 44 | { 45 | "name": "capset", 46 | "action": "SCMP_ACT_ALLOW", 47 | "args": null 48 | }, 49 | { 50 | "name": "chdir", 51 | "action": "SCMP_ACT_ALLOW", 52 | "args": null 53 | }, 54 | { 55 | "name": "chmod", 56 | "action": "SCMP_ACT_ALLOW", 57 | "args": null 58 | }, 59 | { 60 | "name": "chown", 61 | "action": "SCMP_ACT_ALLOW", 62 | "args": null 63 | }, 64 | { 65 | "name": "chown32", 66 | "action": "SCMP_ACT_ALLOW", 67 | "args": null 68 | }, 69 | { 70 | "name": "chroot", 71 | "action": "SCMP_ACT_ALLOW", 72 | "args": null 73 | }, 74 | { 75 | "name": "clock_getres", 76 | "action": "SCMP_ACT_ALLOW", 77 | "args": null 78 | }, 79 | { 80 | "name": "clock_gettime", 81 | "action": "SCMP_ACT_ALLOW", 82 | "args": null 83 | }, 84 | { 85 | "name": "clock_nanosleep", 86 | "action": "SCMP_ACT_ALLOW", 87 | "args": null 88 | }, 89 | { 90 | "name": "clone", 91 | "action": "SCMP_ACT_ALLOW", 92 | "args": null 93 | }, 94 | { 95 | "name": "close", 96 | "action": "SCMP_ACT_ALLOW", 97 | "args": null 98 | }, 99 | { 100 | "name": "connect", 101 | "action": "SCMP_ACT_ALLOW", 102 | "args": null 103 | }, 104 | { 105 | "name": "creat", 106 | "action": "SCMP_ACT_ALLOW", 107 | "args": null 108 | }, 109 | { 110 | "name": "dup", 111 | "action": "SCMP_ACT_ALLOW", 112 | "args": null 113 | }, 114 | { 115 | "name": "dup2", 116 | "action": "SCMP_ACT_ALLOW", 117 | "args": null 118 | }, 119 | { 120 | "name": "dup3", 121 | "action": "SCMP_ACT_ALLOW", 122 | "args": null 123 | }, 124 | { 125 | "name": "epoll_create", 126 | "action": "SCMP_ACT_ALLOW", 127 | "args": null 128 | }, 129 | { 
130 | "name": "epoll_create1", 131 | "action": "SCMP_ACT_ALLOW", 132 | "args": null 133 | }, 134 | { 135 | "name": "epoll_ctl", 136 | "action": "SCMP_ACT_ALLOW", 137 | "args": null 138 | }, 139 | { 140 | "name": "epoll_ctl_old", 141 | "action": "SCMP_ACT_ALLOW", 142 | "args": null 143 | }, 144 | { 145 | "name": "epoll_pwait", 146 | "action": "SCMP_ACT_ALLOW", 147 | "args": null 148 | }, 149 | { 150 | "name": "epoll_wait", 151 | "action": "SCMP_ACT_ALLOW", 152 | "args": null 153 | }, 154 | { 155 | "name": "epoll_wait_old", 156 | "action": "SCMP_ACT_ALLOW", 157 | "args": null 158 | }, 159 | { 160 | "name": "eventfd", 161 | "action": "SCMP_ACT_ALLOW", 162 | "args": null 163 | }, 164 | { 165 | "name": "eventfd2", 166 | "action": "SCMP_ACT_ALLOW", 167 | "args": null 168 | }, 169 | { 170 | "name": "execve", 171 | "action": "SCMP_ACT_ALLOW", 172 | "args": null 173 | }, 174 | { 175 | "name": "execveat", 176 | "action": "SCMP_ACT_ALLOW", 177 | "args": null 178 | }, 179 | { 180 | "name": "exit", 181 | "action": "SCMP_ACT_ALLOW", 182 | "args": null 183 | }, 184 | { 185 | "name": "exit_group", 186 | "action": "SCMP_ACT_ALLOW", 187 | "args": null 188 | }, 189 | { 190 | "name": "faccessat", 191 | "action": "SCMP_ACT_ALLOW", 192 | "args": null 193 | }, 194 | { 195 | "name": "fadvise64", 196 | "action": "SCMP_ACT_ALLOW", 197 | "args": null 198 | }, 199 | { 200 | "name": "fadvise64_64", 201 | "action": "SCMP_ACT_ALLOW", 202 | "args": null 203 | }, 204 | { 205 | "name": "fallocate", 206 | "action": "SCMP_ACT_ALLOW", 207 | "args": null 208 | }, 209 | { 210 | "name": "fanotify_init", 211 | "action": "SCMP_ACT_ALLOW", 212 | "args": null 213 | }, 214 | { 215 | "name": "fanotify_mark", 216 | "action": "SCMP_ACT_ALLOW", 217 | "args": null 218 | }, 219 | { 220 | "name": "fchdir", 221 | "action": "SCMP_ACT_ALLOW", 222 | "args": null 223 | }, 224 | { 225 | "name": "fchmod", 226 | "action": "SCMP_ACT_ALLOW", 227 | "args": null 228 | }, 229 | { 230 | "name": "fchmodat", 231 | "action": 
"SCMP_ACT_ALLOW", 232 | "args": null 233 | }, 234 | { 235 | "name": "fchown", 236 | "action": "SCMP_ACT_ALLOW", 237 | "args": null 238 | }, 239 | { 240 | "name": "fchown32", 241 | "action": "SCMP_ACT_ALLOW", 242 | "args": null 243 | }, 244 | { 245 | "name": "fchownat", 246 | "action": "SCMP_ACT_ALLOW", 247 | "args": null 248 | }, 249 | { 250 | "name": "fcntl", 251 | "action": "SCMP_ACT_ALLOW", 252 | "args": null 253 | }, 254 | { 255 | "name": "fcntl64", 256 | "action": "SCMP_ACT_ALLOW", 257 | "args": null 258 | }, 259 | { 260 | "name": "fdatasync", 261 | "action": "SCMP_ACT_ALLOW", 262 | "args": null 263 | }, 264 | { 265 | "name": "fgetxattr", 266 | "action": "SCMP_ACT_ALLOW", 267 | "args": null 268 | }, 269 | { 270 | "name": "flistxattr", 271 | "action": "SCMP_ACT_ALLOW", 272 | "args": null 273 | }, 274 | { 275 | "name": "flock", 276 | "action": "SCMP_ACT_ALLOW", 277 | "args": null 278 | }, 279 | { 280 | "name": "fork", 281 | "action": "SCMP_ACT_ALLOW", 282 | "args": null 283 | }, 284 | { 285 | "name": "fremovexattr", 286 | "action": "SCMP_ACT_ALLOW", 287 | "args": null 288 | }, 289 | { 290 | "name": "fsetxattr", 291 | "action": "SCMP_ACT_ALLOW", 292 | "args": null 293 | }, 294 | { 295 | "name": "fstat", 296 | "action": "SCMP_ACT_ALLOW", 297 | "args": null 298 | }, 299 | { 300 | "name": "fstat64", 301 | "action": "SCMP_ACT_ALLOW", 302 | "args": null 303 | }, 304 | { 305 | "name": "fstatat64", 306 | "action": "SCMP_ACT_ALLOW", 307 | "args": null 308 | }, 309 | { 310 | "name": "fstatfs", 311 | "action": "SCMP_ACT_ALLOW", 312 | "args": null 313 | }, 314 | { 315 | "name": "fstatfs64", 316 | "action": "SCMP_ACT_ALLOW", 317 | "args": null 318 | }, 319 | { 320 | "name": "fsync", 321 | "action": "SCMP_ACT_ALLOW", 322 | "args": null 323 | }, 324 | { 325 | "name": "ftruncate", 326 | "action": "SCMP_ACT_ALLOW", 327 | "args": null 328 | }, 329 | { 330 | "name": "ftruncate64", 331 | "action": "SCMP_ACT_ALLOW", 332 | "args": null 333 | }, 334 | { 335 | "name": "futex", 336 | 
"action": "SCMP_ACT_ALLOW", 337 | "args": null 338 | }, 339 | { 340 | "name": "futimesat", 341 | "action": "SCMP_ACT_ALLOW", 342 | "args": null 343 | }, 344 | { 345 | "name": "getcpu", 346 | "action": "SCMP_ACT_ALLOW", 347 | "args": null 348 | }, 349 | { 350 | "name": "getcwd", 351 | "action": "SCMP_ACT_ALLOW", 352 | "args": null 353 | }, 354 | { 355 | "name": "getdents", 356 | "action": "SCMP_ACT_ALLOW", 357 | "args": null 358 | }, 359 | { 360 | "name": "getdents64", 361 | "action": "SCMP_ACT_ALLOW", 362 | "args": null 363 | }, 364 | { 365 | "name": "getegid", 366 | "action": "SCMP_ACT_ALLOW", 367 | "args": null 368 | }, 369 | { 370 | "name": "getegid32", 371 | "action": "SCMP_ACT_ALLOW", 372 | "args": null 373 | }, 374 | { 375 | "name": "geteuid", 376 | "action": "SCMP_ACT_ALLOW", 377 | "args": null 378 | }, 379 | { 380 | "name": "geteuid32", 381 | "action": "SCMP_ACT_ALLOW", 382 | "args": null 383 | }, 384 | { 385 | "name": "getgid", 386 | "action": "SCMP_ACT_ALLOW", 387 | "args": null 388 | }, 389 | { 390 | "name": "getgid32", 391 | "action": "SCMP_ACT_ALLOW", 392 | "args": null 393 | }, 394 | { 395 | "name": "getgroups", 396 | "action": "SCMP_ACT_ALLOW", 397 | "args": null 398 | }, 399 | { 400 | "name": "getgroups32", 401 | "action": "SCMP_ACT_ALLOW", 402 | "args": null 403 | }, 404 | { 405 | "name": "getitimer", 406 | "action": "SCMP_ACT_ALLOW", 407 | "args": null 408 | }, 409 | { 410 | "name": "getpeername", 411 | "action": "SCMP_ACT_ALLOW", 412 | "args": null 413 | }, 414 | { 415 | "name": "getpgid", 416 | "action": "SCMP_ACT_ALLOW", 417 | "args": null 418 | }, 419 | { 420 | "name": "getpgrp", 421 | "action": "SCMP_ACT_ALLOW", 422 | "args": null 423 | }, 424 | { 425 | "name": "getpid", 426 | "action": "SCMP_ACT_ALLOW", 427 | "args": null 428 | }, 429 | { 430 | "name": "getppid", 431 | "action": "SCMP_ACT_ALLOW", 432 | "args": null 433 | }, 434 | { 435 | "name": "getpriority", 436 | "action": "SCMP_ACT_ALLOW", 437 | "args": null 438 | }, 439 | { 440 | 
"name": "getrandom", 441 | "action": "SCMP_ACT_ALLOW", 442 | "args": null 443 | }, 444 | { 445 | "name": "getresgid", 446 | "action": "SCMP_ACT_ALLOW", 447 | "args": null 448 | }, 449 | { 450 | "name": "getresgid32", 451 | "action": "SCMP_ACT_ALLOW", 452 | "args": null 453 | }, 454 | { 455 | "name": "getresuid", 456 | "action": "SCMP_ACT_ALLOW", 457 | "args": null 458 | }, 459 | { 460 | "name": "getresuid32", 461 | "action": "SCMP_ACT_ALLOW", 462 | "args": null 463 | }, 464 | { 465 | "name": "getrlimit", 466 | "action": "SCMP_ACT_ALLOW", 467 | "args": null 468 | }, 469 | { 470 | "name": "get_robust_list", 471 | "action": "SCMP_ACT_ALLOW", 472 | "args": null 473 | }, 474 | { 475 | "name": "getrusage", 476 | "action": "SCMP_ACT_ALLOW", 477 | "args": null 478 | }, 479 | { 480 | "name": "getsid", 481 | "action": "SCMP_ACT_ALLOW", 482 | "args": null 483 | }, 484 | { 485 | "name": "getsockname", 486 | "action": "SCMP_ACT_ALLOW", 487 | "args": null 488 | }, 489 | { 490 | "name": "getsockopt", 491 | "action": "SCMP_ACT_ALLOW", 492 | "args": null 493 | }, 494 | { 495 | "name": "get_thread_area", 496 | "action": "SCMP_ACT_ALLOW", 497 | "args": null 498 | }, 499 | { 500 | "name": "gettid", 501 | "action": "SCMP_ACT_ALLOW", 502 | "args": null 503 | }, 504 | { 505 | "name": "gettimeofday", 506 | "action": "SCMP_ACT_ALLOW", 507 | "args": null 508 | }, 509 | { 510 | "name": "getuid", 511 | "action": "SCMP_ACT_ALLOW", 512 | "args": null 513 | }, 514 | { 515 | "name": "getuid32", 516 | "action": "SCMP_ACT_ALLOW", 517 | "args": null 518 | }, 519 | { 520 | "name": "getxattr", 521 | "action": "SCMP_ACT_ALLOW", 522 | "args": null 523 | }, 524 | { 525 | "name": "inotify_add_watch", 526 | "action": "SCMP_ACT_ALLOW", 527 | "args": null 528 | }, 529 | { 530 | "name": "inotify_init", 531 | "action": "SCMP_ACT_ALLOW", 532 | "args": null 533 | }, 534 | { 535 | "name": "inotify_init1", 536 | "action": "SCMP_ACT_ALLOW", 537 | "args": null 538 | }, 539 | { 540 | "name": "inotify_rm_watch", 541 | 
"action": "SCMP_ACT_ALLOW", 542 | "args": null 543 | }, 544 | { 545 | "name": "io_cancel", 546 | "action": "SCMP_ACT_ALLOW", 547 | "args": null 548 | }, 549 | { 550 | "name": "ioctl", 551 | "action": "SCMP_ACT_ALLOW", 552 | "args": null 553 | }, 554 | { 555 | "name": "io_destroy", 556 | "action": "SCMP_ACT_ALLOW", 557 | "args": null 558 | }, 559 | { 560 | "name": "io_getevents", 561 | "action": "SCMP_ACT_ALLOW", 562 | "args": null 563 | }, 564 | { 565 | "name": "ioprio_get", 566 | "action": "SCMP_ACT_ALLOW", 567 | "args": null 568 | }, 569 | { 570 | "name": "ioprio_set", 571 | "action": "SCMP_ACT_ALLOW", 572 | "args": null 573 | }, 574 | { 575 | "name": "io_setup", 576 | "action": "SCMP_ACT_ALLOW", 577 | "args": null 578 | }, 579 | { 580 | "name": "io_submit", 581 | "action": "SCMP_ACT_ALLOW", 582 | "args": null 583 | }, 584 | { 585 | "name": "kill", 586 | "action": "SCMP_ACT_ALLOW", 587 | "args": null 588 | }, 589 | { 590 | "name": "lchown", 591 | "action": "SCMP_ACT_ALLOW", 592 | "args": null 593 | }, 594 | { 595 | "name": "lchown32", 596 | "action": "SCMP_ACT_ALLOW", 597 | "args": null 598 | }, 599 | { 600 | "name": "lgetxattr", 601 | "action": "SCMP_ACT_ALLOW", 602 | "args": null 603 | }, 604 | { 605 | "name": "link", 606 | "action": "SCMP_ACT_ALLOW", 607 | "args": null 608 | }, 609 | { 610 | "name": "linkat", 611 | "action": "SCMP_ACT_ALLOW", 612 | "args": null 613 | }, 614 | { 615 | "name": "listen", 616 | "action": "SCMP_ACT_ALLOW", 617 | "args": null 618 | }, 619 | { 620 | "name": "listxattr", 621 | "action": "SCMP_ACT_ALLOW", 622 | "args": null 623 | }, 624 | { 625 | "name": "llistxattr", 626 | "action": "SCMP_ACT_ALLOW", 627 | "args": null 628 | }, 629 | { 630 | "name": "_llseek", 631 | "action": "SCMP_ACT_ALLOW", 632 | "args": null 633 | }, 634 | { 635 | "name": "lremovexattr", 636 | "action": "SCMP_ACT_ALLOW", 637 | "args": null 638 | }, 639 | { 640 | "name": "lseek", 641 | "action": "SCMP_ACT_ALLOW", 642 | "args": null 643 | }, 644 | { 645 | "name": 
"lsetxattr", 646 | "action": "SCMP_ACT_ALLOW", 647 | "args": null 648 | }, 649 | { 650 | "name": "lstat", 651 | "action": "SCMP_ACT_ALLOW", 652 | "args": null 653 | }, 654 | { 655 | "name": "lstat64", 656 | "action": "SCMP_ACT_ALLOW", 657 | "args": null 658 | }, 659 | { 660 | "name": "madvise", 661 | "action": "SCMP_ACT_ALLOW", 662 | "args": null 663 | }, 664 | { 665 | "name": "memfd_create", 666 | "action": "SCMP_ACT_ALLOW", 667 | "args": null 668 | }, 669 | { 670 | "name": "mincore", 671 | "action": "SCMP_ACT_ALLOW", 672 | "args": null 673 | }, 674 | { 675 | "name": "mkdir", 676 | "action": "SCMP_ACT_ALLOW", 677 | "args": null 678 | }, 679 | { 680 | "name": "mkdirat", 681 | "action": "SCMP_ACT_ALLOW", 682 | "args": null 683 | }, 684 | { 685 | "name": "mknod", 686 | "action": "SCMP_ACT_ALLOW", 687 | "args": null 688 | }, 689 | { 690 | "name": "mknodat", 691 | "action": "SCMP_ACT_ALLOW", 692 | "args": null 693 | }, 694 | { 695 | "name": "mlock", 696 | "action": "SCMP_ACT_ALLOW", 697 | "args": null 698 | }, 699 | { 700 | "name": "mlockall", 701 | "action": "SCMP_ACT_ALLOW", 702 | "args": null 703 | }, 704 | { 705 | "name": "mmap", 706 | "action": "SCMP_ACT_ALLOW", 707 | "args": null 708 | }, 709 | { 710 | "name": "mmap2", 711 | "action": "SCMP_ACT_ALLOW", 712 | "args": null 713 | }, 714 | { 715 | "name": "mprotect", 716 | "action": "SCMP_ACT_ALLOW", 717 | "args": null 718 | }, 719 | { 720 | "name": "mq_getsetattr", 721 | "action": "SCMP_ACT_ALLOW", 722 | "args": null 723 | }, 724 | { 725 | "name": "mq_notify", 726 | "action": "SCMP_ACT_ALLOW", 727 | "args": null 728 | }, 729 | { 730 | "name": "mq_open", 731 | "action": "SCMP_ACT_ALLOW", 732 | "args": null 733 | }, 734 | { 735 | "name": "mq_timedreceive", 736 | "action": "SCMP_ACT_ALLOW", 737 | "args": null 738 | }, 739 | { 740 | "name": "mq_timedsend", 741 | "action": "SCMP_ACT_ALLOW", 742 | "args": null 743 | }, 744 | { 745 | "name": "mq_unlink", 746 | "action": "SCMP_ACT_ALLOW", 747 | "args": null 748 | }, 749 | { 
750 | "name": "mremap", 751 | "action": "SCMP_ACT_ALLOW", 752 | "args": null 753 | }, 754 | { 755 | "name": "msgctl", 756 | "action": "SCMP_ACT_ALLOW", 757 | "args": null 758 | }, 759 | { 760 | "name": "msgget", 761 | "action": "SCMP_ACT_ALLOW", 762 | "args": null 763 | }, 764 | { 765 | "name": "msgrcv", 766 | "action": "SCMP_ACT_ALLOW", 767 | "args": null 768 | }, 769 | { 770 | "name": "msgsnd", 771 | "action": "SCMP_ACT_ALLOW", 772 | "args": null 773 | }, 774 | { 775 | "name": "msync", 776 | "action": "SCMP_ACT_ALLOW", 777 | "args": null 778 | }, 779 | { 780 | "name": "munlock", 781 | "action": "SCMP_ACT_ALLOW", 782 | "args": null 783 | }, 784 | { 785 | "name": "munlockall", 786 | "action": "SCMP_ACT_ALLOW", 787 | "args": null 788 | }, 789 | { 790 | "name": "munmap", 791 | "action": "SCMP_ACT_ALLOW", 792 | "args": null 793 | }, 794 | { 795 | "name": "name_to_handle_at", 796 | "action": "SCMP_ACT_ALLOW", 797 | "args": null 798 | }, 799 | { 800 | "name": "nanosleep", 801 | "action": "SCMP_ACT_ALLOW", 802 | "args": null 803 | }, 804 | { 805 | "name": "newfstatat", 806 | "action": "SCMP_ACT_ALLOW", 807 | "args": null 808 | }, 809 | { 810 | "name": "_newselect", 811 | "action": "SCMP_ACT_ALLOW", 812 | "args": null 813 | }, 814 | { 815 | "name": "open", 816 | "action": "SCMP_ACT_ALLOW", 817 | "args": null 818 | }, 819 | { 820 | "name": "open_by_handle_at", 821 | "action": "SCMP_ACT_ALLOW", 822 | "args": null 823 | }, 824 | { 825 | "name": "openat", 826 | "action": "SCMP_ACT_ALLOW", 827 | "args": null 828 | }, 829 | { 830 | "name": "pause", 831 | "action": "SCMP_ACT_ALLOW", 832 | "args": null 833 | }, 834 | { 835 | "name": "pipe", 836 | "action": "SCMP_ACT_ALLOW", 837 | "args": null 838 | }, 839 | { 840 | "name": "pipe2", 841 | "action": "SCMP_ACT_ALLOW", 842 | "args": null 843 | }, 844 | { 845 | "name": "poll", 846 | "action": "SCMP_ACT_ALLOW", 847 | "args": null 848 | }, 849 | { 850 | "name": "ppoll", 851 | "action": "SCMP_ACT_ALLOW", 852 | "args": null 853 | }, 854 | 
{ 855 | "name": "prctl", 856 | "action": "SCMP_ACT_ALLOW", 857 | "args": null 858 | }, 859 | { 860 | "name": "pread64", 861 | "action": "SCMP_ACT_ALLOW", 862 | "args": null 863 | }, 864 | { 865 | "name": "preadv", 866 | "action": "SCMP_ACT_ALLOW", 867 | "args": null 868 | }, 869 | { 870 | "name": "prlimit64", 871 | "action": "SCMP_ACT_ALLOW", 872 | "args": null 873 | }, 874 | { 875 | "name": "pselect6", 876 | "action": "SCMP_ACT_ALLOW", 877 | "args": null 878 | }, 879 | { 880 | "name": "pwrite64", 881 | "action": "SCMP_ACT_ALLOW", 882 | "args": null 883 | }, 884 | { 885 | "name": "pwritev", 886 | "action": "SCMP_ACT_ALLOW", 887 | "args": null 888 | }, 889 | { 890 | "name": "read", 891 | "action": "SCMP_ACT_ALLOW", 892 | "args": null 893 | }, 894 | { 895 | "name": "readahead", 896 | "action": "SCMP_ACT_ALLOW", 897 | "args": null 898 | }, 899 | { 900 | "name": "readlink", 901 | "action": "SCMP_ACT_ALLOW", 902 | "args": null 903 | }, 904 | { 905 | "name": "readlinkat", 906 | "action": "SCMP_ACT_ALLOW", 907 | "args": null 908 | }, 909 | { 910 | "name": "readv", 911 | "action": "SCMP_ACT_ALLOW", 912 | "args": null 913 | }, 914 | { 915 | "name": "recvfrom", 916 | "action": "SCMP_ACT_ALLOW", 917 | "args": null 918 | }, 919 | { 920 | "name": "recvmmsg", 921 | "action": "SCMP_ACT_ALLOW", 922 | "args": null 923 | }, 924 | { 925 | "name": "recvmsg", 926 | "action": "SCMP_ACT_ALLOW", 927 | "args": null 928 | }, 929 | { 930 | "name": "remap_file_pages", 931 | "action": "SCMP_ACT_ALLOW", 932 | "args": null 933 | }, 934 | { 935 | "name": "removexattr", 936 | "action": "SCMP_ACT_ALLOW", 937 | "args": null 938 | }, 939 | { 940 | "name": "rename", 941 | "action": "SCMP_ACT_ALLOW", 942 | "args": null 943 | }, 944 | { 945 | "name": "renameat", 946 | "action": "SCMP_ACT_ALLOW", 947 | "args": null 948 | }, 949 | { 950 | "name": "renameat2", 951 | "action": "SCMP_ACT_ALLOW", 952 | "args": null 953 | }, 954 | { 955 | "name": "rmdir", 956 | "action": "SCMP_ACT_ALLOW", 957 | "args": null 
958 | }, 959 | { 960 | "name": "rt_sigaction", 961 | "action": "SCMP_ACT_ALLOW", 962 | "args": null 963 | }, 964 | { 965 | "name": "rt_sigpending", 966 | "action": "SCMP_ACT_ALLOW", 967 | "args": null 968 | }, 969 | { 970 | "name": "rt_sigprocmask", 971 | "action": "SCMP_ACT_ALLOW", 972 | "args": null 973 | }, 974 | { 975 | "name": "rt_sigqueueinfo", 976 | "action": "SCMP_ACT_ALLOW", 977 | "args": null 978 | }, 979 | { 980 | "name": "rt_sigreturn", 981 | "action": "SCMP_ACT_ALLOW", 982 | "args": null 983 | }, 984 | { 985 | "name": "rt_sigsuspend", 986 | "action": "SCMP_ACT_ALLOW", 987 | "args": null 988 | }, 989 | { 990 | "name": "rt_sigtimedwait", 991 | "action": "SCMP_ACT_ALLOW", 992 | "args": null 993 | }, 994 | { 995 | "name": "rt_tgsigqueueinfo", 996 | "action": "SCMP_ACT_ALLOW", 997 | "args": null 998 | }, 999 | { 1000 | "name": "sched_getaffinity", 1001 | "action": "SCMP_ACT_ALLOW", 1002 | "args": null 1003 | }, 1004 | { 1005 | "name": "sched_getattr", 1006 | "action": "SCMP_ACT_ALLOW", 1007 | "args": null 1008 | }, 1009 | { 1010 | "name": "sched_getparam", 1011 | "action": "SCMP_ACT_ALLOW", 1012 | "args": null 1013 | }, 1014 | { 1015 | "name": "sched_get_priority_max", 1016 | "action": "SCMP_ACT_ALLOW", 1017 | "args": null 1018 | }, 1019 | { 1020 | "name": "sched_get_priority_min", 1021 | "action": "SCMP_ACT_ALLOW", 1022 | "args": null 1023 | }, 1024 | { 1025 | "name": "sched_getscheduler", 1026 | "action": "SCMP_ACT_ALLOW", 1027 | "args": null 1028 | }, 1029 | { 1030 | "name": "sched_rr_get_interval", 1031 | "action": "SCMP_ACT_ALLOW", 1032 | "args": null 1033 | }, 1034 | { 1035 | "name": "sched_setaffinity", 1036 | "action": "SCMP_ACT_ALLOW", 1037 | "args": null 1038 | }, 1039 | { 1040 | "name": "sched_setattr", 1041 | "action": "SCMP_ACT_ALLOW", 1042 | "args": null 1043 | }, 1044 | { 1045 | "name": "sched_setparam", 1046 | "action": "SCMP_ACT_ALLOW", 1047 | "args": null 1048 | }, 1049 | { 1050 | "name": "sched_setscheduler", 1051 | "action": 
"SCMP_ACT_ALLOW", 1052 | "args": null 1053 | }, 1054 | { 1055 | "name": "sched_yield", 1056 | "action": "SCMP_ACT_ALLOW", 1057 | "args": null 1058 | }, 1059 | { 1060 | "name": "seccomp", 1061 | "action": "SCMP_ACT_ALLOW", 1062 | "args": null 1063 | }, 1064 | { 1065 | "name": "select", 1066 | "action": "SCMP_ACT_ALLOW", 1067 | "args": null 1068 | }, 1069 | { 1070 | "name": "semctl", 1071 | "action": "SCMP_ACT_ALLOW", 1072 | "args": null 1073 | }, 1074 | { 1075 | "name": "semget", 1076 | "action": "SCMP_ACT_ALLOW", 1077 | "args": null 1078 | }, 1079 | { 1080 | "name": "semop", 1081 | "action": "SCMP_ACT_ALLOW", 1082 | "args": null 1083 | }, 1084 | { 1085 | "name": "semtimedop", 1086 | "action": "SCMP_ACT_ALLOW", 1087 | "args": null 1088 | }, 1089 | { 1090 | "name": "sendfile", 1091 | "action": "SCMP_ACT_ALLOW", 1092 | "args": null 1093 | }, 1094 | { 1095 | "name": "sendfile64", 1096 | "action": "SCMP_ACT_ALLOW", 1097 | "args": null 1098 | }, 1099 | { 1100 | "name": "sendmmsg", 1101 | "action": "SCMP_ACT_ALLOW", 1102 | "args": null 1103 | }, 1104 | { 1105 | "name": "sendmsg", 1106 | "action": "SCMP_ACT_ALLOW", 1107 | "args": null 1108 | }, 1109 | { 1110 | "name": "sendto", 1111 | "action": "SCMP_ACT_ALLOW", 1112 | "args": null 1113 | }, 1114 | { 1115 | "name": "setdomainname", 1116 | "action": "SCMP_ACT_ALLOW", 1117 | "args": null 1118 | }, 1119 | { 1120 | "name": "setfsgid", 1121 | "action": "SCMP_ACT_ALLOW", 1122 | "args": null 1123 | }, 1124 | { 1125 | "name": "setfsgid32", 1126 | "action": "SCMP_ACT_ALLOW", 1127 | "args": null 1128 | }, 1129 | { 1130 | "name": "setfsuid", 1131 | "action": "SCMP_ACT_ALLOW", 1132 | "args": null 1133 | }, 1134 | { 1135 | "name": "setfsuid32", 1136 | "action": "SCMP_ACT_ALLOW", 1137 | "args": null 1138 | }, 1139 | { 1140 | "name": "setgid", 1141 | "action": "SCMP_ACT_ALLOW", 1142 | "args": null 1143 | }, 1144 | { 1145 | "name": "setgid32", 1146 | "action": "SCMP_ACT_ALLOW", 1147 | "args": null 1148 | }, 1149 | { 1150 | "name": 
"setgroups", 1151 | "action": "SCMP_ACT_ALLOW", 1152 | "args": null 1153 | }, 1154 | { 1155 | "name": "setgroups32", 1156 | "action": "SCMP_ACT_ALLOW", 1157 | "args": null 1158 | }, 1159 | { 1160 | "name": "sethostname", 1161 | "action": "SCMP_ACT_ALLOW", 1162 | "args": null 1163 | }, 1164 | { 1165 | "name": "setitimer", 1166 | "action": "SCMP_ACT_ALLOW", 1167 | "args": null 1168 | }, 1169 | { 1170 | "name": "setns", 1171 | "action": "SCMP_ACT_ALLOW", 1172 | "args": null 1173 | }, 1174 | { 1175 | "name": "setpgid", 1176 | "action": "SCMP_ACT_ALLOW", 1177 | "args": null 1178 | }, 1179 | { 1180 | "name": "setpriority", 1181 | "action": "SCMP_ACT_ALLOW", 1182 | "args": null 1183 | }, 1184 | { 1185 | "name": "setregid", 1186 | "action": "SCMP_ACT_ALLOW", 1187 | "args": null 1188 | }, 1189 | { 1190 | "name": "setregid32", 1191 | "action": "SCMP_ACT_ALLOW", 1192 | "args": null 1193 | }, 1194 | { 1195 | "name": "setresgid", 1196 | "action": "SCMP_ACT_ALLOW", 1197 | "args": null 1198 | }, 1199 | { 1200 | "name": "setresgid32", 1201 | "action": "SCMP_ACT_ALLOW", 1202 | "args": null 1203 | }, 1204 | { 1205 | "name": "setresuid", 1206 | "action": "SCMP_ACT_ALLOW", 1207 | "args": null 1208 | }, 1209 | { 1210 | "name": "setresuid32", 1211 | "action": "SCMP_ACT_ALLOW", 1212 | "args": null 1213 | }, 1214 | { 1215 | "name": "setreuid", 1216 | "action": "SCMP_ACT_ALLOW", 1217 | "args": null 1218 | }, 1219 | { 1220 | "name": "setreuid32", 1221 | "action": "SCMP_ACT_ALLOW", 1222 | "args": null 1223 | }, 1224 | { 1225 | "name": "setrlimit", 1226 | "action": "SCMP_ACT_ALLOW", 1227 | "args": null 1228 | }, 1229 | { 1230 | "name": "set_robust_list", 1231 | "action": "SCMP_ACT_ALLOW", 1232 | "args": null 1233 | }, 1234 | { 1235 | "name": "setsid", 1236 | "action": "SCMP_ACT_ALLOW", 1237 | "args": null 1238 | }, 1239 | { 1240 | "name": "setsockopt", 1241 | "action": "SCMP_ACT_ALLOW", 1242 | "args": null 1243 | }, 1244 | { 1245 | "name": "set_thread_area", 1246 | "action": "SCMP_ACT_ALLOW", 
1247 | "args": null 1248 | }, 1249 | { 1250 | "name": "set_tid_address", 1251 | "action": "SCMP_ACT_ALLOW", 1252 | "args": null 1253 | }, 1254 | { 1255 | "name": "setuid", 1256 | "action": "SCMP_ACT_ALLOW", 1257 | "args": null 1258 | }, 1259 | { 1260 | "name": "setuid32", 1261 | "action": "SCMP_ACT_ALLOW", 1262 | "args": null 1263 | }, 1264 | { 1265 | "name": "setxattr", 1266 | "action": "SCMP_ACT_ALLOW", 1267 | "args": null 1268 | }, 1269 | { 1270 | "name": "shmat", 1271 | "action": "SCMP_ACT_ALLOW", 1272 | "args": null 1273 | }, 1274 | { 1275 | "name": "shmctl", 1276 | "action": "SCMP_ACT_ALLOW", 1277 | "args": null 1278 | }, 1279 | { 1280 | "name": "shmdt", 1281 | "action": "SCMP_ACT_ALLOW", 1282 | "args": null 1283 | }, 1284 | { 1285 | "name": "shmget", 1286 | "action": "SCMP_ACT_ALLOW", 1287 | "args": null 1288 | }, 1289 | { 1290 | "name": "shutdown", 1291 | "action": "SCMP_ACT_ALLOW", 1292 | "args": null 1293 | }, 1294 | { 1295 | "name": "sigaltstack", 1296 | "action": "SCMP_ACT_ALLOW", 1297 | "args": null 1298 | }, 1299 | { 1300 | "name": "signalfd", 1301 | "action": "SCMP_ACT_ALLOW", 1302 | "args": null 1303 | }, 1304 | { 1305 | "name": "signalfd4", 1306 | "action": "SCMP_ACT_ALLOW", 1307 | "args": null 1308 | }, 1309 | { 1310 | "name": "socket", 1311 | "action": "SCMP_ACT_ALLOW", 1312 | "args": null 1313 | }, 1314 | { 1315 | "name": "socketpair", 1316 | "action": "SCMP_ACT_ALLOW", 1317 | "args": null 1318 | }, 1319 | { 1320 | "name": "splice", 1321 | "action": "SCMP_ACT_ALLOW", 1322 | "args": null 1323 | }, 1324 | { 1325 | "name": "stat", 1326 | "action": "SCMP_ACT_ALLOW", 1327 | "args": null 1328 | }, 1329 | { 1330 | "name": "stat64", 1331 | "action": "SCMP_ACT_ALLOW", 1332 | "args": null 1333 | }, 1334 | { 1335 | "name": "statfs", 1336 | "action": "SCMP_ACT_ALLOW", 1337 | "args": null 1338 | }, 1339 | { 1340 | "name": "statfs64", 1341 | "action": "SCMP_ACT_ALLOW", 1342 | "args": null 1343 | }, 1344 | { 1345 | "name": "symlink", 1346 | "action": 
"SCMP_ACT_ALLOW", 1347 | "args": null 1348 | }, 1349 | { 1350 | "name": "symlinkat", 1351 | "action": "SCMP_ACT_ALLOW", 1352 | "args": null 1353 | }, 1354 | { 1355 | "name": "sync", 1356 | "action": "SCMP_ACT_ALLOW", 1357 | "args": null 1358 | }, 1359 | { 1360 | "name": "sync_file_range", 1361 | "action": "SCMP_ACT_ALLOW", 1362 | "args": null 1363 | }, 1364 | { 1365 | "name": "syncfs", 1366 | "action": "SCMP_ACT_ALLOW", 1367 | "args": null 1368 | }, 1369 | { 1370 | "name": "sysinfo", 1371 | "action": "SCMP_ACT_ALLOW", 1372 | "args": null 1373 | }, 1374 | { 1375 | "name": "syslog", 1376 | "action": "SCMP_ACT_ALLOW", 1377 | "args": null 1378 | }, 1379 | { 1380 | "name": "tee", 1381 | "action": "SCMP_ACT_ALLOW", 1382 | "args": null 1383 | }, 1384 | { 1385 | "name": "tgkill", 1386 | "action": "SCMP_ACT_ALLOW", 1387 | "args": null 1388 | }, 1389 | { 1390 | "name": "time", 1391 | "action": "SCMP_ACT_ALLOW", 1392 | "args": null 1393 | }, 1394 | { 1395 | "name": "timer_create", 1396 | "action": "SCMP_ACT_ALLOW", 1397 | "args": null 1398 | }, 1399 | { 1400 | "name": "timer_delete", 1401 | "action": "SCMP_ACT_ALLOW", 1402 | "args": null 1403 | }, 1404 | { 1405 | "name": "timerfd_create", 1406 | "action": "SCMP_ACT_ALLOW", 1407 | "args": null 1408 | }, 1409 | { 1410 | "name": "timerfd_gettime", 1411 | "action": "SCMP_ACT_ALLOW", 1412 | "args": null 1413 | }, 1414 | { 1415 | "name": "timerfd_settime", 1416 | "action": "SCMP_ACT_ALLOW", 1417 | "args": null 1418 | }, 1419 | { 1420 | "name": "timer_getoverrun", 1421 | "action": "SCMP_ACT_ALLOW", 1422 | "args": null 1423 | }, 1424 | { 1425 | "name": "timer_gettime", 1426 | "action": "SCMP_ACT_ALLOW", 1427 | "args": null 1428 | }, 1429 | { 1430 | "name": "timer_settime", 1431 | "action": "SCMP_ACT_ALLOW", 1432 | "args": null 1433 | }, 1434 | { 1435 | "name": "times", 1436 | "action": "SCMP_ACT_ALLOW", 1437 | "args": null 1438 | }, 1439 | { 1440 | "name": "tkill", 1441 | "action": "SCMP_ACT_ALLOW", 1442 | "args": null 1443 | }, 1444 
| { 1445 | "name": "truncate", 1446 | "action": "SCMP_ACT_ALLOW", 1447 | "args": null 1448 | }, 1449 | { 1450 | "name": "truncate64", 1451 | "action": "SCMP_ACT_ALLOW", 1452 | "args": null 1453 | }, 1454 | { 1455 | "name": "ugetrlimit", 1456 | "action": "SCMP_ACT_ALLOW", 1457 | "args": null 1458 | }, 1459 | { 1460 | "name": "umask", 1461 | "action": "SCMP_ACT_ALLOW", 1462 | "args": null 1463 | }, 1464 | { 1465 | "name": "uname", 1466 | "action": "SCMP_ACT_ALLOW", 1467 | "args": null 1468 | }, 1469 | { 1470 | "name": "unlink", 1471 | "action": "SCMP_ACT_ALLOW", 1472 | "args": null 1473 | }, 1474 | { 1475 | "name": "unlinkat", 1476 | "action": "SCMP_ACT_ALLOW", 1477 | "args": null 1478 | }, 1479 | { 1480 | "name": "unshare", 1481 | "action": "SCMP_ACT_ALLOW", 1482 | "args": null 1483 | }, 1484 | { 1485 | "name": "utime", 1486 | "action": "SCMP_ACT_ALLOW", 1487 | "args": null 1488 | }, 1489 | { 1490 | "name": "utimensat", 1491 | "action": "SCMP_ACT_ALLOW", 1492 | "args": null 1493 | }, 1494 | { 1495 | "name": "utimes", 1496 | "action": "SCMP_ACT_ALLOW", 1497 | "args": null 1498 | }, 1499 | { 1500 | "name": "vfork", 1501 | "action": "SCMP_ACT_ALLOW", 1502 | "args": null 1503 | }, 1504 | { 1505 | "name": "vhangup", 1506 | "action": "SCMP_ACT_ALLOW", 1507 | "args": null 1508 | }, 1509 | { 1510 | "name": "vmsplice", 1511 | "action": "SCMP_ACT_ALLOW", 1512 | "args": null 1513 | }, 1514 | { 1515 | "name": "wait4", 1516 | "action": "SCMP_ACT_ALLOW", 1517 | "args": null 1518 | }, 1519 | { 1520 | "name": "waitid", 1521 | "action": "SCMP_ACT_ALLOW", 1522 | "args": null 1523 | }, 1524 | { 1525 | "name": "write", 1526 | "action": "SCMP_ACT_ALLOW", 1527 | "args": null 1528 | }, 1529 | { 1530 | "name": "writev", 1531 | "action": "SCMP_ACT_ALLOW", 1532 | "args": null 1533 | } 1534 | ] 1535 | } 1536 | -------------------------------------------------------------------------------- /ci.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 

# Target platforms passed to `docker buildx build --platform`.
ARCHS=linux/amd64

echo -e "\n\nPull request: $TRAVIS_PULL_REQUEST\nRelease tag: $TRAVIS_TAG\nBranch: $TRAVIS_BRANCH\n\nTarget arch: $ARCHS\n\n"

# Pull requests are only built, never pushed.
if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then
  echo -e "\n\nBuilding pull request without pushing to Docker Hub\n\n"
  docker buildx build \
    --progress plain \
    --platform="$ARCHS" \
    .
  exit $?
fi

# Quote the password so it is not word-split or glob-expanded, and stop
# early when the login fails instead of failing later at --push.
if ! printf '%s\n' "$DOCKER_PASSWORD" | docker login -u qmcgaw --password-stdin 2>&1; then
  echo "Docker login failed" >&2
  exit 1
fi

# Tag selection: release tag if set, otherwise "latest" on master,
# otherwise the branch name.
TAG="$TRAVIS_TAG"
if [ -z "$TAG" ]; then
  TAG=latest
  if [ "$TRAVIS_BRANCH" != "master" ]; then
    TAG="$TRAVIS_BRANCH"
  fi
fi

echo -e "\n\nBuilding Docker images for \"$DOCKER_REPO:$TAG\"\n\n"
docker buildx build \
  --progress plain \
  --platform="$ARCHS" \
  --build-arg BUILD_DATE="$(date -u +"%Y-%m-%dT%H:%M:%SZ")" \
  --build-arg VCS_REF="$(git rev-parse --short HEAD)" \
  --build-arg VERSION="$TAG" \
  -t "$DOCKER_REPO:$TAG" \
  --push \
  .
-------------------------------------------------------------------------------- /nodemon.json: -------------------------------------------------------------------------------- 1 | { 2 | "ext": "ts", 3 | "exec": "npm start" 4 | } -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "htmlspitter", 3 | "version": "1.0.0", 4 | "description": "NodeJS server to spit out HTML from loaded JS using Puppeteer", 5 | "main": "./build/main.js", 6 | "scripts": { 7 | "build": "tsc", 8 | "start": "tsc && node ./build/main.js", 9 | "test": "jest" 10 | }, 11 | "repository": { 12 | "type": "git", 13 | "url": "git+https://github.com/qdm12/htmlspitter.git" 14 | }, 15 | "keywords": [ 16 | "puppeteer", 17 | "html", 18 | "javascript", 19 | "typescript", 20 | "scrap", 21 | "scrapper" 22 | ], 23 | "author": "Quentin McGaw", 24 | "license": "MIT", 25 | "bugs": { 26 | "url": "https://github.com/qdm12/htmlspitter/issues" 27 | }, 28 | "homepage": "https://github.com/qdm12/htmlspitter#readme", 29 | "dependencies": { 30 | "debug": "^4.1.1", 31 | "express": "^4.16.4", 32 | "node-fetch": "^2.3.0", 33 | "puppeteer-core": "^2.0.0", 34 | "supports-color": "^6.1.0", 35 | "valid-url": "^1.0.9", 36 | "winston": "^3.2.1" 37 | }, 38 | "devDependencies": { 39 | "@types/debug": "^4.1.4", 40 | "@types/express": "^4.16.1", 41 | "@types/jest": "^24.0.12", 42 | "@types/node-fetch": "^2.3.2", 43 | "@types/puppeteer-core": "^2.0.0", 44 | "@types/valid-url": "^1.0.2", 45 | "@types/winston": "^2.4.4", 46 | "eslint": "^6.7.2", 47 | "jest": "^24.7.1", 48 | "nodemon": "^1.18.10", 49 | "ts-jest": "^24.0.2", 50 | "tslint": "^5.20.1", 51 | "typescript": "^3.4.2" 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/browser.ts: -------------------------------------------------------------------------------- 1 | import { Queue } from 
/**
 * Wraps one Puppeteer browser instance and tracks how many pages it has
 * open (`stats.pages`), how many it has opened in total (`stats.hits`) and
 * when it was last used (`stats.lastUsedAt`).
 *
 * The constructor starts the launch asynchronously; callers must await
 * `launched` before calling `createPage()`.
 */
export class Browser {
    browser: puppeteer.Browser | null;
    launched: Promise<void>;
    queue: Queue; // queue of pages to create
    params: paramsType;
    stats: statsType;
    // Pages currently counted in stats.pages; makes releasing a page idempotent.
    private openPages: Set<puppeteer.Page>;
    constructor(
        executablePath: string,
        maxPages: number,
        maxHits: number,
        maxAgeUnused: number,
        maxQueueSize: number,
    ) {
        debugLog.browser("creating");
        this.browser = null;
        this.openPages = new Set();
        // need to await browser.launched
        this.launched = this.launch(executablePath);
        this.queue = new Queue(maxQueueSize);
        this.params = {
            maxPages,
            maxHits,
            maxAgeUnused,
        }
        this.stats = {
            pages: 0,
            hits: 0,
            lastUsedAt: new Date(),
        }
    }
    /**
     * Launches the headless browser and stores it in `this.browser`.
     * When `executablePathStr` is the sentinel "Puppeteer-bundled", the
     * executable path is left undefined in the launch options.
     */
    async launch(executablePathStr: string): Promise<void> {
        let executablePath: string | undefined = undefined;
        if (executablePathStr !== "Puppeteer-bundled") {
            executablePath = executablePathStr;
        }
        this.browser = await puppeteer.launch({
            headless: true,
            executablePath,
            args: [
                "--disable-dev-shm-usage",
                "--disable-background-networking",
                "--disable-default-apps",
                "--disable-extensions",
                "--disable-gpu",
                "--disable-sync",
                "--disable-translate",
                "--hide-scrollbars",
                "--metrics-recording-only",
                "--mute-audio",
                "--no-first-run",
                "--safebrowsing-disable-auto-update"
            ]
        });
        debugLog.browser("created");
    }
    /** Returns true once more than `params.maxHits` pages have been opened. */
    isOverHitLimit() {
        const result = this.stats.hits > this.params.maxHits;
        if (result) {
            debugLog.browser("over hit limit");
        }
        return result;
    }
    /** Returns true when no page was requested for more than `params.maxAgeUnused` seconds. */
    isUnused() {
        const t = new Date().valueOf() / 1000;
        const lastUsed = this.stats.lastUsedAt.valueOf() / 1000;
        const result = t - lastUsed > this.params.maxAgeUnused;
        if (result) {
            debugLog.browser("unused");
        }
        return result;
    }
    // Set a periodic function to check on renewMe()
    /** Returns true when the hit limit is exceeded and no page is open anymore. */
    renewMe() {
        const result = this.isOverHitLimit() && this.stats.pages === 0;
        if (result) {
            debugLog.browser("renew me");
        }
        return result;
    }
    /** Returns true when fewer than `params.maxPages` pages are open or reserved. */
    pageAvailable() {
        return this.stats.pages < this.params.maxPages;
    }
    /**
     * Waits (FIFO) for a free page slot and opens a new page.
     *
     * Throws when the browser is not launched yet, when the hit limit is
     * exceeded, or when the local queue refuses the request.
     */
    async createPage() {
        debugLog.browser("creating page");
        const browser = this.browser;
        if (browser === null) {
            throw Error("cannot create page for null browser");
        }
        if (this.isOverHitLimit()) {
            throw Error("browser has reached its hit limit of pages");
        }
        // Browser local queue to create new pages (FIFO)
        const id = this.queue.push(); // raise an error if maximum is reached
        while (!(this.pageAvailable() && this.queue.isFirst(id))) {
            debugLog.browser("waiting to create page");
            await sleepAsync(100);
        }
        this.queue.shift();
        this.stats.lastUsedAt = new Date();
        // Reserve the slot BEFORE awaiting, otherwise the next waiter in the
        // queue sees the old count and maxPages can be exceeded.
        this.stats.pages++;
        let page: puppeteer.Page;
        try {
            page = await browser.newPage();
        } catch (e) {
            this.stats.pages--; // give the reserved slot back
            throw e;
        }
        // Count the page towards the lifetime limit checked by isOverHitLimit().
        this.stats.hits++;
        this.openPages.add(page);
        // Release the slot even when the page is closed without closePage()
        // (for example by calling page.close() directly).
        page.once("close", () => this.releasePage(page));
        debugLog.browser("created page");
        return page;
    }
    /** Closes `page` and releases its slot, even when closing fails. */
    async closePage(page: puppeteer.Page) {
        debugLog.browser("closing page");
        try {
            await page.close();
        } finally {
            this.releasePage(page);
        }
        debugLog.browser("closed page");
    }
    /** Closes the underlying browser if it was launched. */
    async close() {
        debugLog.browser("closing browser");
        if (this.browser === null) {
            debugLog.browser("browser is already null");
            return;
        }
        await this.browser.close();
        debugLog.browser("closed browser");
    }
    // Decrements stats.pages at most once per page created by createPage().
    private releasePage(page: puppeteer.Page) {
        if (this.openPages.delete(page)) {
            this.stats.pages--;
        }
    }
}
/** One cached page: its URL, HTML, approximate size and creation time. */
class Value {
    url: string;
    html: string;
    size: number; // bytes
    created: Date;
    constructor(url: string, html: string) {
        this.url = url;
        this.html = html;
        // 2 bytes per string element of the URL and the HTML
        this.size = 2 * (url.length + html.length);
        this.created = new Date();
    }
}

/**
 * In-memory cache of HTML keyed by URL, with a running total of the cached
 * size in bytes (`size`) and a size budget (`maxSize`).
 */
export class CacheHTML {
    map: Map<string, Value>;
    maxSize: number;
    size: number; // bytes
    constructor(maxSize: number) {
        this.map = new Map();
        this.maxSize = maxSize;
        this.size = 0;
    }
    /** Returns the cached entry for `url`; throws when there is none. */
    getValue(url: string) {
        const value = this.map.get(url);
        if (value === undefined) {
            // Template literal so the URL is actually interpolated.
            throw Error(`value of cache for URL ${url} is undefined`);
        }
        return value;
    }
    getValueHTML(url: string) {
        return this.getValue(url).html;
    }
    /** Creation time of the entry, in seconds since the epoch. */
    getValueTimestamp(url: string) {
        return this.getValue(url).created.valueOf() / 1000;
    }
    getValueSize(url: string) {
        return this.getValue(url).size;
    }
    /** Seconds elapsed since the entry was created (never negative in practice). */
    getValueAge(url: string) { // seconds
        // now - created; the reverse order yields a non-positive age and
        // nothing would ever expire in cleanOld().
        return new Date().valueOf() / 1000 - this.getValueTimestamp(url);
    }
    /** Stores `html` for `url`, replacing any previous entry for that URL. */
    setValue(url: string, html: string) {
        const previous = this.map.get(url);
        if (previous !== undefined) {
            // Do not count the replaced entry twice in the running total.
            this.size -= previous.size;
        }
        const value = new Value(url, html);
        this.size += value.size;
        this.map.set(url, value);
    }
    hasValue(url: string) {
        return this.map.has(url);
    }
    /** Removes the entry for `url`; throws when there is none. */
    deleteValue(url: string) {
        this.size -= this.getValueSize(url);
        this.map.delete(url);
    }
    /** Returns all cached URLs, oldest entry first. */
    getKeysSortedByAge() {
        const sortedKeys = Array.from(this.map.keys());
        sortedKeys.sort((k1, k2) =>
            this.getValueTimestamp(k1) - this.getValueTimestamp(k2));
        return sortedKeys;
    }
    /**
     * Removes entries older than `maxAge` seconds (1 hour by default).
     */
    cleanOld(maxAge: number = 3600) {
        // Collect first, delete afterwards, to avoid mutating the map
        // while iterating over it.
        const expiredKeys: Set<string> = new Set();
        this.map.forEach(
            (v, url) => {
                if (this.getValueAge(url) > maxAge) {
                    expiredKeys.add(url);
                }
            }
        );
        for (const url of expiredKeys.values()) {
            debugLog.cache("cleaning old URL " + url);
            this.deleteValue(url);
        }
    }
    /** Evicts the oldest entries until the total size is within `maxSize`. */
    reduceSize() {
        if (this.size < this.maxSize) {
            return;
        }
        // Iterating the key list bounds the loop by the number of entries,
        // so it can never index past the end of the list.
        for (const url of this.getKeysSortedByAge()) {
            if (this.size <= this.maxSize) {
                break;
            }
            debugLog.cache("reducing size removing URL " + url);
            this.deleteValue(url);
        }
    }
}
22 | throw Error(url + " is a confirmed bad URL"); 23 | } 24 | // Check if it's in our cache 25 | if (this.cache.hasValue(url)) { 26 | debugLog.loader("cache has HTML for URL " + url) 27 | return this.cache.getValueHTML(url); 28 | } 29 | // Get a browser instance and create a page 30 | const browser = await this.pool.getBrowser(); 31 | const page = await browser.createPage(); 32 | const tasks = []; 33 | tasks.push(page.setCacheEnabled(false)); 34 | // Avoid all unecessary HTTP requests 35 | tasks.push(page.setRequestInterception(true)); 36 | page.on('request', req => { 37 | if (Loader.requestIsAllowed(req)) { 38 | req.continue(); 39 | } else { 40 | req.abort(); 41 | } 42 | }); 43 | // Load and wait for the page 44 | debugLog.loader("going to page " + url); 45 | let html: string; 46 | try { 47 | await Promise.all(tasks); 48 | await page.goto(url, { 49 | waitUntil: Loader.buildWaitUntil(wait), 50 | timeout: this.timeout, 51 | }); 52 | html = await page.content(); 53 | } catch (e) { 54 | this.recordBadURL(url); 55 | throw e; 56 | } 57 | // Cleaning up 58 | page.close(); // async but no need to wait 59 | this.cache.reduceSize(); // async but no need to wait 60 | this.cache.cleanOld(); // async but no need to wait 61 | this.cache.setValue(url, html); // async but no need to wait 62 | debugLog.loader("spitting HTML of URL " + url); 63 | return html; 64 | } 65 | recordBadURL = (url: string) => { 66 | if (this.badURLsConfirmed.has(url)) { 67 | return; 68 | } else if (this.badURLs.has(url)) { 69 | this.badURLs.delete(url); 70 | this.badURLsConfirmed.add(url); 71 | } else { 72 | this.badURLs.add(url); 73 | } 74 | } 75 | static buildWaitUntil = (wait: string | undefined) => { 76 | let waitUntil: puppeteer.LoadEvent; 77 | switch (wait) { 78 | case "load": 79 | waitUntil = "load"; 80 | break 81 | case "domcontentloaded": 82 | waitUntil = "domcontentloaded"; 83 | break; 84 | case "2": 85 | waitUntil = "networkidle0"; 86 | break; 87 | case "3": 88 | waitUntil = "networkidle2"; 89 
/**
 * Switches the format of the shared `logger`.
 *
 * @param s "json" for JSON lines with a timestamp, "normal" for the
 *          colorized CLI format.
 * @throws Error when `s` is neither "json" nor "normal".
 */
export const setLoggerFormat = (s: string) => {
    switch (s) {
        case "json":
            // Formats run in the order given: the timestamp has to be added
            // before json() serializes the entry, otherwise it is missing
            // from the output.
            logger.format = format.combine(format.timestamp(), format.json());
            break;
        case "normal":
            logger.format = format.combine(format.cli(), format.colorize());
            break;
        default:
            throw Error(`Logger format '${s}' is unrecognized`);
    }
}
/**
 * Program entry point: parses the environment into parameters, prints the
 * banner in "normal" log mode, builds the browser pool, the HTML cache and
 * the HTTP server, and registers a SIGTERM handler that closes the server
 * and then the pool before exiting.
 */
const main = async () => {
    const config = new Params();
    try {
        config.parse(process.env);
    } catch (error) {
        // Invalid configuration: report it and stop.
        logger.error(error);
        process.exit(1);
    }
    if (config.log === "normal") {
        const banner = [
            "\n =========================================",
            " =========================================",
            " ============== HTMLSpitter ==============",
            " =========================================",
            " =========================================",
            " == by github.com/qdm12 - Quentin McGaw ==\n",
        ];
        banner.forEach((line) => console.log(line));
    }
    debugLog.main("Starting");
    setLoggerFormat(config.log);
    logger.info(config.toString());

    debugLog.main("Creating pool of browsers");
    const browserPool = new Pool(
        config.maxBrowsers,
        config.maxPages,
        config.maxHits,
        config.maxAgeUnused,
        config.executablePath,
        config.maxQueueSize,
    );

    debugLog.main("Creating cache");
    const htmlCache = new CacheHTML(config.maxCacheSize * 1000000);

    debugLog.main("Launching server");
    const httpServer = new Server(config.port, browserPool, htmlCache, config.timeout);

    // Runs once the server has been closed: close the browsers, then exit.
    const onServerClosed = async () => {
        debugLog.main("Closing pool of browsers");
        await browserPool.close();
        process.exit(0);
    };
    process.on('SIGTERM', () => {
        debugLog.main("Closing server");
        httpServer.close(onServerClosed);
    });
}
54 | LOG: "json", 55 | TIMEOUT: "8000", 56 | }; 57 | const params = new Params(); 58 | params.parse(env); 59 | expect(params.port).toBe(8888); 60 | expect(params.executablePath).toBe("/path"); 61 | expect(params.maxPages).toBe(100); 62 | expect(params.maxHits).toBe(3000); 63 | expect(params.maxAgeUnused).toBe(600); 64 | expect(params.maxBrowsers).toBe(100); 65 | expect(params.maxCacheSize).toBe(100); 66 | expect(params.maxQueueSize).toBe(1000); 67 | expect(params.log).toBe("json"); 68 | expect(params.timeout).toBe(8000); 69 | expect(fs.existsSync).toHaveBeenCalled(); 70 | }); 71 | it("raises an error", () => { 72 | const env: NodeJS.ProcessEnv = { 73 | PORT: "troll", 74 | CHROME_BIN: "/", 75 | MAX_PAGES: "100", 76 | MAX_HITS: "3000", 77 | MAX_AGE_UNUSED: "600", 78 | MAX_BROWSERS: "100", 79 | MAX_CACHE_SIZE: "100", 80 | MAX_QUEUE_SIZE: "1000", 81 | LOG: "json", 82 | TIMEOUT: "8000", 83 | }; 84 | const params = new Params(); 85 | const f = () => params.parse(env); 86 | expect(f).toThrowError("Environment variable PORT 'troll' is not a number"); 87 | }); 88 | }); 89 | 90 | describe("getPort", () => { 91 | it("returns default 0", () => { 92 | const port = Params.getPort(undefined, 0); 93 | expect(port).toBe(0); 94 | }); 95 | it("throws an error when it's not a number", () => { 96 | const f = () => Params.getPort("troll", 0) 97 | expect(f).toThrowError("Environment variable PORT 'troll' is not a number"); 98 | }); 99 | it("throws an error when it's not an integer", () => { 100 | const f = () => Params.getPort("1.2", 0) 101 | expect(f).toThrowError("Environment variable PORT 1.2 is not an integer"); 102 | }); 103 | it("throws an error when it's not a positive integer", () => { 104 | const f = () => Params.getPort("-5", 0) 105 | expect(f).toThrowError("Environment variable PORT -5 must be positive"); 106 | }); 107 | it("returns reserved port when running as root", () => { 108 | const port = Params.getPort("500", 0); 109 | expect(port).toBe(500); 110 | }); 111 | 
it("returns reserved port when running on Windows", () => { 112 | const port = Params.getPort("500", -1); 113 | expect(port).toBe(500); 114 | }); 115 | it("throws an error when it's a reserved port and not running as root or on Windows", () => { 116 | const f = () => Params.getPort("500", 1) 117 | expect(f).toThrowError("Environment variable PORT 500 cannot be in the reserved system ports range (1 to 1023) when running without root"); 118 | }); 119 | it("throws an error when it's above 65535", () => { 120 | const f = () => Params.getPort("65536", 0) 121 | expect(f).toThrowError("Environment variable PORT 65536 cannot be higher than 65535"); 122 | }); 123 | it("returns port", () => { 124 | const port = Params.getPort("8888", 0); 125 | expect(port).toBe(8888); 126 | }); 127 | }); 128 | 129 | describe("getExecutablePath", () => { 130 | it("returns default ''", () => { 131 | const path = Params.getExecutablePath(undefined); 132 | expect(path).toBe(""); 133 | }); 134 | it("throws an error when the file does not exist", () => { 135 | jest.resetAllMocks(); 136 | fs.existsSync.mockReturnValue(false); 137 | const f = () => Params.getExecutablePath("/path") 138 | expect(f).toThrowError("/path does not exist"); 139 | expect(fs.existsSync).toHaveBeenCalled(); 140 | }); 141 | it("returns the executable path", () => { 142 | jest.resetAllMocks(); 143 | fs.existsSync.mockReturnValue(true); 144 | const path = Params.getExecutablePath("/path") 145 | expect(path).toBe("/path"); 146 | expect(fs.existsSync).toHaveBeenCalled(); 147 | }); 148 | }); 149 | 150 | describe("getMax", () => { 151 | it("returns default 0", () => { 152 | const n = Params.getMax(undefined, "X"); 153 | expect(n).toBe(0); 154 | }); 155 | it("throws an error when it's not a number", () => { 156 | const f = () => Params.getMax("troll", "X"); 157 | expect(f).toThrowError("Environment variable X 'troll' is not a number"); 158 | }); 159 | it("throws an error when it's not an integer", () => { 160 | const f = () => 
Params.getMax("1.2", "X"); 161 | expect(f).toThrowError("Environment variable X 1.2 is not an integer"); 162 | }); 163 | it("returns Infinity on -1", () => { 164 | const n = Params.getMax("-1", "X"); 165 | expect(n).toBe(Infinity); 166 | }); 167 | it("throws an error when it's zero", () => { 168 | const f = () => Params.getMax("0", "X"); 169 | expect(f).toThrowError("Environment variable X must be a positive integer or -1 (infinite)"); 170 | }); 171 | it("throws an error when it's negative and not -1", () => { 172 | const f = () => Params.getMax("-2", "X"); 173 | expect(f).toThrowError("Environment variable X must be a positive integer or -1 (infinite)"); 174 | }); 175 | it("returns the number", () => { 176 | const n = Params.getMax("15", "X"); 177 | expect(n).toBe(15); 178 | }); 179 | }); 180 | 181 | describe("getLog", () => { 182 | it("returns default", () => { 183 | const log = Params.getLog(undefined); 184 | expect(log).toBe(""); 185 | }); 186 | it("throws an error when it's not valid", () => { 187 | const f = () => Params.getLog("troll") 188 | expect(f).toThrowError("Environment variable LOG 'troll' is unrecognized"); 189 | }); 190 | it("returns the log", () => { 191 | const log = Params.getLog("json"); 192 | expect(log).toBe("json"); 193 | }); 194 | }); 195 | 196 | describe("getTimeout", () => { 197 | it("returns default 0", () => { 198 | const port = Params.getTimeout(undefined); 199 | expect(port).toBe(0); 200 | }); 201 | it("returns Infinity for -1", () => { 202 | const port = Params.getTimeout("-1"); 203 | expect(port).toBe(Infinity); 204 | }); 205 | it("returns timeout", () => { 206 | const port = Params.getTimeout("8888"); 207 | expect(port).toBe(8888); 208 | }); 209 | it("throws an error when it's not a number", () => { 210 | const f = () => Params.getTimeout("troll") 211 | expect(f).toThrowError("Environment variable TIMEOUT 'troll' is not a number"); 212 | }); 213 | it("throws an error when it's not an integer", () => { 214 | const f = () => 
// Runtime configuration of the service. Every field starts at its default
// and may be overridden from environment variables through parse().
export class Params {
    port: number = 8000;
    executablePath: string = "Puppeteer-bundled";
    maxPages: number = 10;
    maxHits: number = 300;
    maxAgeUnused: number = 60;
    maxBrowsers: number = 10;
    maxCacheSize: number = 10;
    maxQueueSize: number = 100;
    log: string = "normal";
    timeout: number = 7000;

    // Overrides the current values with the ones found in env.
    // Each getter returns a falsy value (0 or "") when its variable is unset
    // or empty, in which case the current value is kept.
    // Throws an Error on the first invalid variable; the fields assigned
    // before that point keep their new value.
    parse(env: NodeJS.ProcessEnv) {
        debugLog.params("reading parameters");
        const uid = Params.currentUid();
        this.port = Params.getPort(env.PORT, uid) || this.port;
        this.executablePath = Params.getExecutablePath(env.CHROME_BIN) || this.executablePath;
        this.maxPages = Params.getMax(env.MAX_PAGES, "MAX_PAGES") || this.maxPages;
        this.maxHits = Params.getMax(env.MAX_HITS, "MAX_HITS") || this.maxHits;
        this.maxAgeUnused = Params.getMax(env.MAX_AGE_UNUSED, "MAX_AGE_UNUSED") || this.maxAgeUnused;
        this.maxBrowsers = Params.getMax(env.MAX_BROWSERS, "MAX_BROWSERS") || this.maxBrowsers;
        this.maxCacheSize = Params.getMax(env.MAX_CACHE_SIZE, "MAX_CACHE_SIZE") || this.maxCacheSize;
        this.maxQueueSize = Params.getMax(env.MAX_QUEUE_SIZE, "MAX_QUEUE_SIZE") || this.maxQueueSize;
        this.log = Params.getLog(env.LOG) || this.log;
        this.timeout = Params.getTimeout(env.TIMEOUT) || this.timeout;
    }

    // Returns the effective user ID, or -1 when process.geteuid() throws.
    private static currentUid(): number {
        try {
            return process.geteuid();
        } catch (error) {
            return -1;
        }
    }

    // Converts raw to a number and throws an Error naming envName when the
    // result is NaN or not an integer.
    private static toInteger(raw: string, envName: string): number {
        const value = Number(raw);
        if (Number.isNaN(value)) {
            throw Error(`Environment variable ${envName} '${raw}' is not a number`);
        }
        if (!Number.isInteger(value)) {
            throw Error(`Environment variable ${envName} ${value} is not an integer`);
        }
        return value;
    }

    // Parses the PORT value.
    // Returns 0 when s is undefined or empty, otherwise the port number.
    // Throws when s is not an integer between 1 and 65535, or when it is
    // below 1024 and uid is neither 0 nor -1.
    static getPort(s: string | undefined, uid: number): number {
        if (!s) {
            return 0; // set to default
        }
        const port = Params.toInteger(s, "PORT");
        if (port < 1) {
            throw Error(`Environment variable PORT ${port} must be positive`);
        }
        if (port > 65535) {
            throw Error(`Environment variable PORT ${port} cannot be higher than 65535`);
        }
        if (port < 1024) {
            switch (uid) {
                case 0:
                    logger.warn(`Environment variable PORT ${port} is allowed to be in the reserved system ports range as you are running as root`);
                    break;
                case -1:
                    logger.warn(`Environment variable PORT ${port} is allowed to be in the reserved system ports range as you are running in Windows`);
                    break;
                default:
                    throw Error(`Environment variable PORT ${port} cannot be in the reserved system ports range (1 to 1023) when running without root`);
            }
        } else if (port > 49151) {
            logger.warn(`Environment variable PORT ${port} is in the dynamic/private ports range (above 49151)`);
        }
        return port;
    }

    // Parses the CHROME_BIN value.
    // Returns "" when s is undefined or empty, otherwise s itself.
    // Throws when existsSync(s) is false.
    static getExecutablePath(s: string | undefined): string {
        if (!s) {
            return ""; // set to default
        }
        if (existsSync(s)) {
            return s;
        }
        throw Error(`${s} does not exist`);
    }

    // Parses one of the MAX_* values; envName is only used in error messages.
    // Returns 0 when s is undefined or empty, Infinity for -1, otherwise the
    // positive integer. Throws for anything else.
    static getMax(s: string | undefined, envName: string): number {
        if (!s) {
            return 0; // set to default
        }
        const max = Params.toInteger(s, envName);
        if (max === -1) {
            return Infinity;
        }
        if (max < 1) {
            throw Error(`Environment variable ${envName} must be a positive integer or -1 (infinite)`);
        }
        return max;
    }

    // Parses the LOG value.
    // Returns "" when s is undefined or empty, otherwise s when it is
    // "normal" or "json". Throws for any other value.
    static getLog(s: string | undefined): string {
        if (!s) {
            return ""; // set to default
        }
        const recognized = ["normal", "json"];
        if (recognized.indexOf(s) === -1) {
            throw Error(`Environment variable LOG '${s}' is unrecognized`);
        }
        return s;
    }

    // Parses the TIMEOUT value.
    // Returns 0 when s is undefined or empty, Infinity for -1, otherwise the
    // positive integer. Throws for anything else.
    static getTimeout(s: string | undefined): number {
        if (!s) {
            return 0; // set to default
        }
        const timeout = Params.toInteger(s, "TIMEOUT");
        if (timeout === -1) {
            return Infinity;
        }
        if (timeout < 1) {
            throw Error(`Environment variable TIMEOUT ${timeout} must be positive`);
        }
        return timeout;
    }

    // Serializes the instance fields to JSON, in declaration order.
    toString() {
        return JSON.stringify(this);
    }
}
// Limits and launch settings handed to every browser the pool creates.
interface paramsType {
    maxBrowsers: number,
    maxPages: number,
    maxHits: number,
    maxAgeUnused: number,
    executablePath: string,
    maxQueueSize: number,
}

// Pool of browsers
export class Pool {
    // Browsers currently in the pool, keyed by their numeric ID
    pool: Map<number, Browser>;
    params: paramsType;
    // Handle of the interval started by periodicChecks(), cleared by close()
    periodicTimer: NodeJS.Timeout;
    constructor(
        maxBrowsers: number,
        maxPages: number,
        maxHits: number,
        maxAgeUnused: number,
        executablePath: string,
        maxQueueSize: number,
    ) {
        this.pool = new Map<number, Browser>();
        this.params = {
            maxBrowsers,
            maxPages,
            maxHits,
            maxAgeUnused,
            executablePath,
            maxQueueSize,
        };
        this.periodicTimer = this.periodicChecks();
    }
    // Starts an interval firing every 10000 ms which closes the browsers
    // reporting isUnused() and renews the ones reporting renewMe().
    // Returns the interval handle.
    periodicChecks() {
        return setInterval(
            () => {
                debugLog.pool("periodic checks");
                this.pool.forEach((b, id) => {
                    // Nothing awaits these promises, so failures are logged
                    // here instead of surfacing as unhandled rejections.
                    if (b.isUnused()) {
                        this.closeBrowser(id).catch((e) => {
                            debugLog.pool("cannot close browser with ID " + id + ": " + String(e));
                        });
                    } else if (b.renewMe()) {
                        this.renewBrowser(id).catch((e) => {
                            debugLog.pool("cannot renew browser with ID " + id + ": " + String(e));
                        });
                    }
                });
            }, 10000
        );
    }
    // Returns true while the pool holds fewer browsers than maxBrowsers.
    canAddBrowser() {
        return this.pool.size < this.params.maxBrowsers;
    }
    // Returns the smallest non-negative ID not used in the pool.
    // Throws an Error if the pool is already full.
    newBrowserID() {
        if (!this.canAddBrowser()) {
            throw Error("cannot add a browser");
        }
        let nextID = 0;
        while (this.pool.has(nextID)) {
            nextID++;
        }
        return nextID;
    }
    // Creates a browser, waits for it to be launched, stores it in the pool
    // and returns it. Throws an Error if the pool is already full.
    async addBrowser() {
        debugLog.pool("adding browser");
        if (!this.canAddBrowser()) {
            throw Error("reached maximum number of browsers: " + this.params.maxBrowsers);
        }
        const id = this.newBrowserID();
        const browser = new Browser(
            this.params.executablePath,
            this.params.maxPages,
            this.params.maxHits,
            this.params.maxAgeUnused,
            this.params.maxQueueSize,
        );
        // NOTE(review): the ID is only registered after this await, so two
        // overlapping addBrowser() calls can pick the same ID and both pass
        // the capacity check - confirm whether callers can overlap.
        await browser.launched;
        this.pool.set(id, browser);
        debugLog.pool("added browser with ID " + id);
        return browser;
    }
    // Closes the browser registered under id and removes it from the pool.
    // Throws an Error if no browser is registered under id.
    async closeBrowser(id: number) {
        debugLog.pool("closing browser with ID " + id);
        const browser = this.pool.get(id);
        if (browser === undefined) {
            throw Error("browser for id " + id + " does not exist");
        }
        await browser.close();
        this.pool.delete(id);
    }
    // Stops the periodic checks and closes the browsers one after the other.
    async close() {
        debugLog.pool("closing pool");
        // The handle comes from setInterval, hence clearInterval
        clearInterval(this.periodicTimer);
        for (const id of this.pool.keys()) {
            await this.closeBrowser(id);
        }
    }
    // Returns the browser with the lowest stats.pages.
    // Throws an Error if the pool is empty.
    getBrowserLeastPages() {
        let minID = 0, minPages = Infinity;
        this.pool.forEach((b, id) => {
            if (b.stats.pages < minPages) {
                minPages = b.stats.pages;
                minID = id;
            }
        });
        const browser = this.pool.get(minID);
        if (browser === undefined) {
            throw Error("browser for id " + minID + " does not exist");
        }
        debugLog.pool("got browser with least pages, ID " + minID + ", pages: " + browser.stats.pages);
        return browser;
    }
    // Gets the first browser which has not reached the
    // maximum number of pages yet.
    async getBrowser() {
        debugLog.pool("getting a browser");
        for (const browser of this.pool.values()) {
            if (browser.stats.pages < this.params.maxPages) {
                return browser;
            }
        }
        // No browser or
        // all browsers reached their maximum capacity of pages
        if (this.canAddBrowser()) {
            return await this.addBrowser();
        }
        // Max number of browsers so enqueue to the least busy browser
        return this.getBrowserLeastPages();
    }
    // Closes the browser registered under id, then adds a fresh one.
    async renewBrowser(id: number) {
        debugLog.pool("renewing browser with ID " + id);
        await this.closeBrowser(id);
        await this.addBrowser();
    }
}
expect(q.ids.length).toBe(1); 48 | }); 49 | it("multiple times", () => { 50 | const q = new Queue(10); 51 | q.push(); 52 | q.push(); 53 | expect(q.ids.length).toBe(2); 54 | }); 55 | it("too many times", () => { 56 | const q = new Queue(2); 57 | q.push(); 58 | q.push(); 59 | const f = () => q.push(); 60 | expect(f).toThrowError("queue reached its maximum size") 61 | expect(q.ids.length).toBe(2); 62 | }); 63 | }); 64 | 65 | describe("shift", () => { 66 | it("once for queue of size 2", () => { 67 | const q = new Queue(10); 68 | q.push(); 69 | q.push(); 70 | q.shift(); 71 | expect(q.ids.length).toBe(1); 72 | }); 73 | it("all in queue", () => { 74 | const q = new Queue(10); 75 | q.push(); 76 | q.push(); 77 | q.shift(); 78 | q.shift(); 79 | expect(q.ids.length).toBe(0); 80 | }); 81 | it("too many times", () => { 82 | const q = new Queue(10); 83 | q.push(); 84 | q.shift(); 85 | const f = () => q.shift(); 86 | expect(f).toThrowError("cannot shift because queue is empty") 87 | expect(q.ids.length).toBe(0); 88 | }); 89 | }); -------------------------------------------------------------------------------- /src/queue.ts: -------------------------------------------------------------------------------- 1 | export class Queue { 2 | ids: number[]; 3 | maxSize: number; 4 | constructor(maxSize: number) { // -1 for unlimited queue 5 | this.maxSize = maxSize; 6 | this.ids = []; 7 | } 8 | // Adds an element to the back of the queue 9 | push() { 10 | if (this.ids.length === this.maxSize) { 11 | throw Error("queue reached its maximum size") 12 | } 13 | const id = this.getNextID(); 14 | this.ids.push(id); 15 | return id; 16 | } 17 | getNextID() { // use a set for higher performance 18 | let nextID = -1; 19 | let found = false; 20 | while (!found) { 21 | nextID++; 22 | found = this.ids.indexOf(nextID) === -1; 23 | } 24 | return nextID; 25 | } 26 | shift() { 27 | const idRemoved = this.ids.shift(); 28 | if (idRemoved === undefined) { 29 | throw Error("cannot shift because queue is empty"); 
// HTTP server exposing GET / (returns the HTML of the page at the `url`
// query parameter) and GET /healthcheck.
export class Server {
    app: Express;
    server: http.Server;
    loader: Loader;
    // Registers the routes and starts listening on port right away.
    constructor(port: number, pool: Pool, cache: CacheHTML, timeout: number) {
        this.app = express();
        this.loader = new Loader(pool, cache, timeout);
        // The handlers are async: forward any rejection to Express through
        // next instead of leaving the promise unhandled.
        this.app.get('/', (req, res, next) => {
            this.getRootHandler(req, res).catch(next);
        });
        this.app.get('/healthcheck', (req, res, next) => {
            this.getHealthcheckHandler(req, res).catch(next);
        });
        this.server = this.app.listen(
            port,
            () => logger.info("server listening on port " + port),
        );
    }
    // Handles GET /.
    // Responds 200 with {"html": ...} on success, or 403 with {"error": ...}
    // when the url parameter is rejected or the page cannot be loaded.
    // NOTE(review): 403 is kept for both failure cases so existing clients
    // are not affected; 400 and 5xx may be a better fit - confirm.
    async getRootHandler(req: Request, res: Response) {
        logger.info("received HTTP GET: " + req.url);
        let url: string;
        try {
            url = Server.verifyURL(req.query["url"])
        } catch (e) {
            return res.status(403).send({
                "error": String(e)
            });
        }
        const wait = req.query["wait"];
        try {
            const html = await this.loader.spitHTML(url, wait);
            return res.status(200).send({
                "html": html
            });
        } catch (e) {
            logger.error(String(e));
            return res.status(403).send({
                "error": String(e)
            });
        }
    }
    // Handles GET /healthcheck.
    // Responds 200 "healthy", or 500 "unhealthy".
    async getHealthcheckHandler(req: Request, res: Response) {
        debugLog.server("received GET /healthcheck request: " + req.url);
        const healthy = true; // TODO
        if (healthy) {
            // status() alone only sets the code; send() is what actually
            // ends the response.
            return res.status(200).send("healthy");
        }
        logger.warn("unhealthy");
        return res.status(500).send("unhealthy");
    }
    // Returns url unchanged if it is a non-empty web URI.
    // Throws an Error if it is missing, empty, or not a valid web URI.
    static verifyURL(url: string | undefined): string {
        if (url === undefined || url === "") {
            throw new Error("url parameter not provided")
        } else if (typeof url !== "string" || validUrl.isWebUri(url) === undefined) {
            // The typeof guard covers query values parsed as arrays or
            // objects, e.g. when the parameter is repeated.
            throw new Error("url parameter is not a valid URL")
        }
        return url;
    }
    // Stops accepting connections; callback is passed to http.Server.close.
    close(callback?: () => void) {
        this.server.close(callback);
    }
}
*/ 16 | "rootDir": "./src", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. */ 17 | // "composite": true, /* Enable project compilation */ 18 | // "incremental": true, /* Enable incremental compilation */ 19 | // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ 20 | // "removeComments": true, /* Do not emit comments to output. */ 21 | // "noEmit": true, /* Do not emit outputs. */ 22 | // "importHelpers": true, /* Import emit helpers from 'tslib'. */ 23 | // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ 24 | // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ 25 | 26 | /* Strict Type-Checking Options */ 27 | "strict": true, /* Enable all strict type-checking options. */ 28 | // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ 29 | // "strictNullChecks": true, /* Enable strict null checks. */ 30 | // "strictFunctionTypes": true, /* Enable strict checking of function types. */ 31 | // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */ 32 | // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ 33 | // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ 34 | // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ 35 | 36 | /* Additional Checks */ 37 | // "noUnusedLocals": true, /* Report errors on unused locals. */ 38 | // "noUnusedParameters": true, /* Report errors on unused parameters. */ 39 | // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. 
*/ 40 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ 41 | 42 | /* Module Resolution Options */ 43 | // "moduleResolution": "node", /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */ 44 | // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */ 45 | // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */ 46 | // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */ 47 | // "typeRoots": [], /* List of folders to include type definitions from. */ 48 | // "types": [], /* Type declaration files to be included in compilation. */ 49 | // "allowSyntheticDefaultImports": true, /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */ 50 | "esModuleInterop": true /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */ 51 | // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */ 52 | 53 | /* Source Map Options */ 54 | // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ 55 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 56 | // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ 57 | // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. */ 58 | 59 | /* Experimental Options */ 60 | // "experimentalDecorators": true, /* Enables experimental support for ES7 decorators. */ 61 | // "emitDecoratorMetadata": true, /* Enables experimental support for emitting type metadata for decorators. 
*/ 62 | } 63 | } 64 | --------------------------------------------------------------------------------