├── img
├── amazon-kindle.png
└── wikipedia-offline.png
├── .github
└── workflows
│ └── main.yml
├── test
├── restore.test.js
├── record.test.js
└── quopri.test.js
├── package.json
├── LICENSE
├── cli
├── restore.js
├── stats.js
└── record.js
├── .gitignore
├── src
├── util.js
├── quopri.js
├── restore.js
└── record.js
├── block
├── update_bad_hosts.py
└── blocklist.txt
└── README.md
/img/amazon-kindle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zytedata/web-snap/HEAD/img/amazon-kindle.png
--------------------------------------------------------------------------------
/img/wikipedia-offline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zytedata/web-snap/HEAD/img/wikipedia-offline.png
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | - push
5 | - pull_request
6 |
7 | jobs:
8 | test:
9 | runs-on: ubuntu-latest
10 | steps:
11 | - uses: actions/checkout@v5
12 | - uses: actions/setup-node@v4
13 | with:
14 | node-version: 22
15 | - run: npm install
16 | - run: |
17 | npx playwright install
18 | npm test
19 |
--------------------------------------------------------------------------------
/test/restore.test.js:
--------------------------------------------------------------------------------
1 | import test from 'ava';
2 | import { restorePage } from '../src/restore.js';
3 |
// Restores a minimal in-memory snapshot (no file on disk) and checks that the
// page URL, the document base URI and the rendered content match the record.
test('basic restore page', async (t) => {
  const RECORD = {
    url: 'http://example.com',
    base_url: 'http://example.com/',
    html: 'Example page',
    responses: {},
  };
  const { page, browser } = await restorePage({ RECORD, timeout: 1, wait: 1, headless: true });

  try {
    const url = page.url();
    t.is(RECORD.base_url, url);
    const base_url = await page.evaluate('document.baseURI');
    t.is(RECORD.base_url, base_url);
    const html = (await page.content()).trim();
    // The expected literal used to be split across two physical lines, which
    // is not valid inside a quoted string; assert on the snapshot text itself.
    t.true(html.includes('Example page'));
  } finally {
    // Always release the browser, even when a step above throws.
    await browser.close();
  }
});
22 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "authors": [
3 | "Cristi Constantin "
4 | ],
5 | "dependencies": {
6 | "@ghostery/adblocker-playwright": "2.11.3",
7 | "cross-fetch": "4.1.0",
8 | "html-minifier-terser": "7.2.0",
9 | "lightningcss": "1.30.1",
10 | "mri": "1.2.0",
11 | "playwright": "1.57.0",
12 | "pretty-bytes": "7.0.1",
13 | "purgecss": "7.0.2"
14 | },
15 | "devDependencies": {
16 | "ava": "6.4.1",
17 | "express": "5.2.1"
18 | },
19 | "main": "src",
20 | "type": "module",
21 | "name": "web-snap",
22 | "description": "Create perfect snapshots of web pages",
23 | "license": "MIT",
24 | "version": "0.1",
25 | "repository": {
26 | "type": "git",
27 | "url": "git://github.com/croqaz/web-snap.git"
28 | },
29 | "bin": {
30 | "web-record": "cli/record.js",
31 | "web-restore": "cli/restore.js",
32 | "snap-stats": "cli/stats.js"
33 | },
34 | "scripts": {
35 | "test": "npx ava --verbose test/*.test.js"
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Cristi Constantin
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/test/record.test.js:
--------------------------------------------------------------------------------
1 | import test from 'ava';
2 | import http from 'http';
3 | import express from 'express';
4 | import { recordPage } from '../src/record.js';
5 |
6 | const PORT = 12345;
7 |
/**
 * Build an Express application wrapped in a plain Node HTTP server.
 * ETag generation is switched off on the app.
 *
 * @returns {{ app: object, server: object }} the Express app and its HTTP server (not yet listening)
 */
function createTestServer() {
  const app = express();
  const server = http.createServer(app);
  app.set('etag', false);
  return { app, server };
}
14 |
// Serves a one-line page from a local server, records it and checks the
// resulting snapshot fields.
test('basic record page', async (t) => {
  const { app, server } = createTestServer();
  app.get('/', function (_, res) {
    res.send('Hello world');
  });
  await new Promise((resolve) => server.listen(PORT, resolve));

  let browser;
  try {
    const recorded = await recordPage({
      url: `http://localhost:${PORT}`,
      timeout: 3,
      imgTimeout: 3,
      headless: true,
    });
    browser = recorded.browser;
    const { snapshot } = recorded;

    t.is(snapshot.url, `http://localhost:${PORT}`);
    t.is(snapshot.base_url, `http://localhost:${PORT}/`);
    t.true(snapshot.html.includes('Hello world'));
    t.deepEqual(snapshot.responses, {});
  } finally {
    // Release the browser and the port even when recording or a check throws,
    // so a failure here cannot leave the fixed PORT occupied.
    if (browser) await browser.close();
    server.close();
  }
});
36 |
--------------------------------------------------------------------------------
/cli/restore.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | import mri from 'mri';
3 |
4 | import pkg from '../package.json' with { type: 'json' };
5 | import { restorePage } from '../src/restore.js';
6 | import { delay } from '../src/util.js';
7 |
// Command line options understood by `web-restore` (parsed with mri).
const options = {
  boolean: ['help', 'version'],
  alias: {
    i: 'input',
    v: 'version',
    rm: 'removeElems',
    // c: 'config',
  },
  default: {
    headless: null, // visible browser window
    js: 'yes', // JS execution on restore
    offline: 'yes', // force browser offline
    timeout: 15, // navigation timeout
    wait: 120, // keep the browser open (seconds)
    overwrite: null, // overwrite body HTML with HTML from snapshot
    removeElems: '', // remove page elements
  },
};

// Entry point: restore the snapshot, keep the browser open for `wait`, then close it.
(async () => {
  const args = mri(process.argv.slice(2), options);

  if (args.version) {
    console.log('Web-Snap v' + pkg.version);
    return;
  }

  // restorePage() logs the reason and returns nothing when the snapshot is
  // missing or invalid; destructuring that result would throw a TypeError.
  const restored = await restorePage(args);
  if (!restored) {
    process.exitCode = 1;
    return;
  }
  const { page, browser } = restored;
  page.on('close', () => process.exit());
  browser.on('disconnected', () => process.exit());

  // restorePage() has already converted args.wait from seconds to milliseconds.
  await delay(args.wait);
  await browser.close();
})();
42 |
--------------------------------------------------------------------------------
/test/quopri.test.js:
--------------------------------------------------------------------------------
1 | import test from 'ava';
2 | import { encode, decode } from '../src/quopri.js';
3 |
4 | test('quopri test', async (t) => {
5 | t.is(decode(' =3D=20'), ' = ');
6 | t.is(decode('foo\r\nbar='), 'foo\r\nbar');
7 | t.is(decode('=E4=BD=A0=E5=A5=BD'), 'ä½ å¥½'); // 你好
8 | t.is(
9 | decode('I=C3=B1t=C3=ABrn=C3=A2ti=C3=B4n=C3=A0liz=C3=A6ti=C3=B8n=E2=98=83=F0=9F=92=\r\n=A9'),
10 | 'Iñtërnâtiônà lizætiønâ\x98\x83ð\x9F\x92©',
11 | ); // Iñtërnâtiônàlizætiøn☃💩
12 | t.is(
13 | decode('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxXYZ=20'),
14 | 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxXYZ ',
15 | );
16 |
17 | t.is(encode(' = '), ' =3D=20');
18 | t.is(encode('foo\t'), 'foo=09');
19 | t.is(
20 | encode('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxXYZ='),
21 | 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxXYZ=3D',
22 | );
23 | t.is(
24 | encode('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxXYZ '),
25 | 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxXYZ=20',
26 | );
27 |
28 | t.is(decode(encode('a\nb\nc\n')), 'a\nb\nc\n');
29 | });
30 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Logs
2 | logs
3 | *.log
4 | npm-debug.log*
5 | yarn-debug.log*
6 | yarn-error.log*
7 | lerna-debug.log*
8 |
9 | # Diagnostic reports (https://nodejs.org/api/report.html)
10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
11 |
12 | # Runtime data
13 | pids
14 | *.pid
15 | *.seed
16 | *.pid.lock
17 |
18 | # Directory for instrumented libs generated by jscoverage/JSCover
19 | lib-cov
20 |
21 | # Coverage directory used by tools like istanbul
22 | coverage
23 | *.lcov
24 |
25 | # nyc test coverage
26 | .nyc_output
27 |
28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
29 | .grunt
30 |
31 | # Bower dependency directory (https://bower.io/)
32 | bower_components
33 |
34 | # node-waf configuration
35 | .lock-wscript
36 |
37 | # Compiled binary addons (https://nodejs.org/api/addons.html)
38 | build/Release
39 |
40 | # Dependency directories
41 | node_modules/
42 | jspm_packages/
43 |
44 | # TypeScript v1 declaration files
45 | typings/
46 |
47 | # TypeScript cache
48 | *.tsbuildinfo
49 |
50 | # Optional npm cache directory
51 | .npm
52 |
53 | # Optional eslint cache
54 | .eslintcache
55 |
56 | # Microbundle cache
57 | .rpt2_cache/
58 | .rts2_cache_cjs/
59 | .rts2_cache_es/
60 | .rts2_cache_umd/
61 |
62 | # Optional REPL history
63 | .node_repl_history
64 |
65 | # Output of 'npm pack'
66 | *.tgz
67 |
68 | # Yarn Integrity file
69 | .yarn-integrity
70 |
71 | # dotenv environment variables file
72 | .env
73 | .env.test
74 |
75 | # parcel-bundler cache (https://parceljs.org/)
76 | .cache
77 |
78 | # Next.js build output
79 | .next
80 |
81 | # Nuxt.js build / generate output
82 | .nuxt
83 | dist
84 |
85 | # Gatsby files
86 | .cache/
87 | # Comment in the public line in if your project uses Gatsby and *not* Next.js
88 | # https://nextjs.org/blog/next-9-1#public-directory-support
89 | # public
90 |
91 | # vuepress build output
92 | .vuepress/dist
93 |
94 | # Serverless directories
95 | .serverless/
96 |
97 | # FuseBox cache
98 | .fusebox/
99 |
100 | # DynamoDB Local files
101 | .dynamodb/
102 |
103 | # TernJS port file
104 | .tern-port
105 |
106 | # Snapshot files
107 | snap*.json
108 | snap*.json.gz
109 |
--------------------------------------------------------------------------------
/src/util.js:
--------------------------------------------------------------------------------
1 | /*
2 | * Common utils
3 | */
4 | import fs from 'fs';
5 | import { gunzip } from 'zlib';
6 | import { promisify } from 'util';
7 |
8 | import { encode, decode } from './quopri.js';
9 |
/**
 * Promise-based sleep.
 *
 * @param {number} time - delay passed straight to setTimeout
 * @returns {Promise<void>} resolves once the timer fires
 */
export function delay(time) {
  return new Promise((done) => {
    setTimeout(done, time);
  });
}
13 |
/**
 * Build the lookup key of a request: "<method>:<url>".
 *
 * @param {{ method: Function, url: Function }} r - object exposing method() and url()
 * @returns {string}
 */
export function requestKey(r) {
  const method = r.method();
  const url = r.url();
  return `${method}:${url}`;
}
17 |
/**
 * Normalize a URL for comparison: trailing slashes are stripped from the raw
 * string, the result is parsed with the WHATWG URL class and the fragment is
 * removed.
 *
 * @param {string} url
 * @returns {string} the normalized URL, or '' for a falsy input
 * @throws {TypeError} when the string cannot be parsed as a URL
 */
export function normalizeURL(url) {
  if (!url) {
    return '';
  }
  const withoutTrailingSlashes = url.replace(/\/+$/, '');
  const parsed = new URL(withoutTrailingSlashes);
  parsed.hash = '';
  return parsed.href;
}
24 |
/**
 * Tell whether the given name is one of: chromium, firefox, webkit.
 *
 * @param {*} str - candidate browser name
 * @returns {boolean}
 */
export function checkBrowser(str) {
  return str === 'chromium' || str === 'firefox' || str === 'webkit';
}
28 |
/**
 * Interpret a command line value as a boolean.
 *
 * Falsy values give `false`; non-string truthy values are returned unchanged;
 * the strings "false", "off", "no" and "0" (any letter case) give `false`
 * and every other non-empty string gives `true`.
 *
 * @param {*} str
 * @returns {*} a boolean, or the original value when it is truthy and not a string
 */
export function toBool(str) {
  if (!str) {
    return false;
  }
  if (typeof str !== 'string') {
    return str;
  }
  const negatives = ['false', 'off', 'no', '0'];
  return !negatives.includes(str.toLowerCase());
}
36 |
/**
 * Split a list given as one string on runs of commas, semicolons and spaces.
 * Pieces that are empty or whitespace-only are dropped; kept pieces are not trimmed.
 *
 * @param {*} str - the raw list; non-string truthy values are returned as-is
 * @returns {*} an array of pieces ([] for a falsy input)
 */
export function smartSplit(str) {
  if (!str) return [];
  if (typeof str !== 'string') return str;
  return str.split(/[,; ]+/).filter((piece) => piece.trim());
}
48 |
/**
 * Read a snapshot file and parse it as JSON.
 * Files whose name ends with ".gz" are gunzipped first.
 *
 * @param {string} fname - path of the snapshot file
 * @returns {Promise<*>} the parsed JSON value
 */
export async function parseSnapshot(fname) {
  const raw = await fs.promises.readFile(fname);
  const payload = fname.endsWith('.gz') ? await promisify(gunzip)(raw) : raw;
  return JSON.parse(payload);
}
56 |
/**
 * Serialize a response body for storage in the snapshot.
 *
 * Bodies of document/stylesheet/script/manifest resources, and bodies whose
 * content type starts with text/, image/svg+xml or application/json, are
 * stored as "QUOPRI:" + quoted-printable text. Everything else is stored as
 * "BASE64:" + base64.
 *
 * @param {string} resourceType
 * @param {string} contentType - may be empty or missing
 * @param {Buffer} buffer - raw body bytes
 * @returns {string} the prefixed encoded body, or '' for an empty/missing body
 */
export function encodeBody(resourceType, contentType, buffer) {
  if (!buffer || buffer.length === 0) return '';

  const textResources = ['document', 'stylesheet', 'script', 'manifest'];
  const textPrefixes = ['text/', 'image/svg+xml', 'application/json'];
  const isTextual =
    textResources.includes(resourceType) ||
    (contentType && textPrefixes.some((prefix) => contentType.startsWith(prefix)));

  if (isTextual) {
    return `QUOPRI:${encode(buffer)}`;
  }
  return `BASE64:${buffer.toString('base64')}`;
}
77 |
/**
 * Inverse of encodeBody().
 *
 * "QUOPRI:" bodies are passed through the quoted-printable decoder and come
 * back as a string; "BASE64:" bodies, and bodies with no known prefix, are
 * base64-decoded into a Buffer.
 *
 * NOTE(review): the quoted-printable result is a string, not a Buffer —
 * confirm that callers handle non-ASCII bytes correctly.
 *
 * @param {string} body - stored body
 * @returns {string|Buffer} decoded body, or '' for an empty/missing body
 */
export function decodeBody(body) {
  if (!body || body.length === 0) return '';

  const QP_PREFIX = 'QUOPRI:';
  const B64_PREFIX = 'BASE64:';
  if (body.startsWith(QP_PREFIX)) {
    return decode(body.slice(QP_PREFIX.length));
  }
  const payload = body.startsWith(B64_PREFIX) ? body.slice(B64_PREFIX.length) : body;
  return Buffer.from(payload, 'base64');
}
84 |
--------------------------------------------------------------------------------
/cli/stats.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | import mri from 'mri';
3 | import prettyBytes from 'pretty-bytes';
4 |
5 | import { parseSnapshot } from '../src/util.js';
6 |
7 | const options = {
8 | alias: {
9 | i: 'input',
10 | },
11 | };
12 |
/**
 * Render `value` as a horizontal bar scaled so that `maxValue` is 100 blocks wide.
 * Idea from https://github.com/morishin/ascii-horizontal-barchart
 *
 * @param {number} value - quantity to draw
 * @param {number} maxValue - quantity that corresponds to a full-width bar
 * @returns {string} the bar, or '' when nothing can be drawn
 */
function bar(value, maxValue) {
  const fractions = ['▏', '▎', '▍', '▋', '▊', '▉'];
  const barLength = (value * 100) / maxValue;
  // A zero or non-finite scale gives NaN/Infinity and a negative ratio gives a
  // negative count; String.prototype.repeat throws a RangeError on those.
  if (!Number.isFinite(barLength) || barLength <= 0) return '';
  const wholeNumberPart = Math.floor(barLength);
  const fractionalPart = barLength - wholeNumberPart;
  let txt = fractions[fractions.length - 1].repeat(wholeNumberPart);
  if (fractionalPart > 0) txt += fractions[Math.floor(fractionalPart * fractions.length)];
  return txt;
}
23 |
// Entry point: print size statistics for a snapshot file.
(async () => {
  const args = mri(process.argv.slice(2), options);

  // The positional argument wins, `--input` / `-i` is the fallback.
  // (`a ? b : null || c` parses as `a ? b : (null || c)`, and mri always sets
  // `args._`, so the old expression never looked at `args.input`.)
  const fname = (args._ && args._[0]) || args.input;
  if (!fname) {
    console.error('Missing snapshot file! Usage: snap-stats <snapshot.json>');
    process.exitCode = 1;
    return;
  }
  const snap = await parseSnapshot(fname);

  const bodySize = (v) => (v.body ? v.body.length : 0);
  const responses = Object.entries(snap.responses);

  // Largest stored body; 0 when there are no responses at all.
  const maxValue = responses.reduce((max, [, v]) => Math.max(max, bodySize(v)), 0);

  // Count resources by the first half of their content type.
  const resourceTypes = {};
  const data = responses
    .map(([k, v]) => {
      const t = v.headers && v.headers['content-type'] ? v.headers['content-type'].split('/')[0] : 'other';
      if (resourceTypes[t]) resourceTypes[t] += 1;
      else resourceTypes[t] = 1;
      return [k, bodySize(v)];
    })
    // keep only the resources that are worth a line in the chart
    .filter(([_, v]) => v >= maxValue / 20 && v > 100);
  const totSize = data.reduce((sum, curr) => sum + curr[1], 0);

  console.log(`\nHTML body size: ${prettyBytes(snap.html.length, { minimumFractionDigits: 2 })}`);
  console.log(`Resources size: ${prettyBytes(totSize, { minimumFractionDigits: 2 })}`);
  console.log(`There are ${responses.length} resources in total`);

  data.push(['GET:HTML body', snap.html.length]);

  // The HTML body is charted too, so it must take part in the scale; otherwise
  // its bar can be wider than 100 blocks, or invalid when there are no resources.
  const sizeScale = Math.max(maxValue, snap.html.length);

  data.sort((a, b) => b[1] - a[1]);
  console.log('\nTop resources by size::');
  for (const [txt, nr] of data.slice(0, 10)) {
    const barText = bar(nr, sizeScale);
    const suffix = ' ' + prettyBytes(nr, { minimumFractionDigits: 2 });
    // drop the "METHOD:" prefix of the key, then the scheme and a leading "www."
    let http = txt.split(':').slice(1).join(':');
    http = http.replace(/^https?:\/\/(w+?\.)?/, '');
    if (http.length > 165) http = http.slice(0, 160) + ' ... ' + http.slice(-5);
    console.log(http);
    console.log(barText + suffix);
  }

  console.log('\nResources by type::');
  const typeCounts = Object.entries(resourceTypes).sort((a, b) => b[1] - a[1]);
  // A snapshot without responses has no types; indexing [0][1] would throw.
  const maxCount = typeCounts.length > 0 ? typeCounts[0][1] : 0;
  for (const [txt, nr] of typeCounts) {
    const barText = bar(nr, maxCount);
    const suffix = ' ' + nr;
    console.log(txt + '\n' + barText + suffix);
  }
})();
70 |
--------------------------------------------------------------------------------
/src/quopri.js:
--------------------------------------------------------------------------------
/**
 * Decode a quoted-printable string.
 *
 * Every `=XX` escape becomes the single character with that code, so the
 * result is a string of byte values rather than decoded UTF-8 text.
 *
 * Reference: https://mths.be/quoted-printable by @mathias | MIT license
 *
 * @param {string} input - quoted-printable text
 * @returns {string}
 */
export function decode(input) {
  // RFC 2045 section 6.7, rule 3: trailing white space on a line must be
  // deleted when decoding, as it was added by intermediate transport agents.
  const withoutTrailingBlank = input.replace(/[\t\x20]$/gm, '');

  // Soft line breaks: `=` followed by CRLF, a lone CR, a lone LF, or the end
  // of the input.
  const joined = withoutTrailingBlank.replace(/=(?:\r\n?|\n|$)/g, '');

  // `=XX` escapes; lowercase hexadecimal digits are accepted as well.
  return joined.replace(/=([a-fA-F0-9]{2})/g, (_match, hex) => String.fromCharCode(parseInt(hex, 16)));
}
27 |
/**
 * Quoted-printable encode a string or a Buffer.
 *
 * Strings are converted to UTF-8 bytes first. Tab, LF, CR and the bytes
 * 0x20-0x3C and 0x3E-0x7E are copied as-is, except for a space or tab that
 * is the last byte or is followed by LF/CR; every other byte is written as
 * `=XX` (uppercase hex). No line wrapping is performed.
 *
 * Reference: https://npmjs.com/package/libqp by Andris Reinman | MIT license
 * See https://tools.ietf.org/html/rfc2045#section-6.7
 *
 * @param {string|Buffer} buffer
 * @returns {string}
 */
export function encode(buffer) {
  const bytes = typeof buffer === 'string' ? Buffer.from(buffer, 'utf-8') : buffer;
  const total = bytes.length;
  const pieces = [];

  for (let pos = 0; pos < total; pos += 1) {
    const byte = bytes[pos];
    // bytes that may appear literally (everything printable except `=`)
    const literal =
      byte === 0x09 ||
      byte === 0x0a ||
      byte === 0x0d ||
      (byte >= 0x20 && byte <= 0x3c) ||
      (byte >= 0x3e && byte <= 0x7e);
    // white space at the end of a line has to be escaped
    const blank = byte === 0x20 || byte === 0x09;
    const atLineEnd = pos === total - 1 || bytes[pos + 1] === 0x0a || bytes[pos + 1] === 0x0d;

    if (literal && !(blank && atLineEnd)) {
      pieces.push(String.fromCharCode(byte));
    } else {
      pieces.push('=' + (byte < 0x10 ? '0' : '') + byte.toString(16).toUpperCase());
    }
  }

  return pieces.join('');
}
66 |
/**
 * Tell whether a number is covered by any of the given ranges.
 * A one-element range matches that exact value, a two-element range is an
 * inclusive [low, high] interval; ranges of any other length never match.
 *
 * @param {number} nr
 * @param {number[][]} ranges
 * @returns {boolean}
 */
function checkRanges(nr, ranges) {
  for (const range of ranges) {
    const size = range.length;
    if (size === 1 && nr === range[0]) return true;
    if (size === 2 && nr >= range[0] && nr <= range[1]) return true;
  }
  return false;
}
84 |
--------------------------------------------------------------------------------
/block/update_bad_hosts.py:
--------------------------------------------------------------------------------
1 | """
2 | Example usage:
3 | python block/update_bad_hosts.py block/blocklist.txt
4 | """
5 | import sys
6 | import requests
7 |
8 |
def upd_easylist():
    """
    Download the Easylist and Easyprivacy host lists and merge them.

    The first 5 lines of each file are skipped (presumably a header - confirm),
    as well as blank lines, `#` comments and entries shorter than 4 characters.

    Returns a `(name, hosts)` tuple where `hosts` is a set of strings.
    Raises `requests.RequestException` on a network error, a timeout or an
    HTTP error status.
    """
    name = 'EASYLIST'
    timeout = 30  # seconds; without it a stalled server blocks forever
    urls = (
        # https://easylist.to/easylist/easylist.txt
        'https://v.firebog.net/hosts/Easylist.txt',
        # https://easylist.to/easylist/easyprivacy.txt
        'https://v.firebog.net/hosts/Easyprivacy.txt',
    )
    hosts = set()
    for URL in urls:
        r = requests.get(URL, timeout=timeout)
        print(r, URL)
        # do not parse an error page as if it were a host list
        r.raise_for_status()
        for line in r.text.split('\n')[5:]:
            line = line.strip()
            if not line or line[0] == '#' or len(line) < 4:
                continue
            hosts.add(line)

    print(f'{name} found hosts: {len(hosts)}')
    return name, hosts
34 |
35 |
def upd_adaway():
    """
    Download the AdAway hosts file and collect the host names.

    Only lines starting with `127.0.0.1 ` are used; blank lines and `#`
    comments are skipped.

    Returns a `(name, hosts)` tuple where `hosts` is a set of strings.
    Raises `requests.RequestException` on a network error, a timeout or an
    HTTP error status.
    """
    name = 'ADAWAY'
    URL = 'https://adaway.org/hosts.txt'
    # timeout in seconds; without it a stalled server blocks forever
    r = requests.get(URL, timeout=30)
    print(r, URL)
    # do not parse an error page as if it were a host list
    r.raise_for_status()
    hosts = set()
    for line in r.text.split('\n'):
        line = line.strip()
        if not line or line[0] == '#':
            continue
        if line.startswith('127.0.0.1 '):
            # drop the address, keep what follows it
            hosts.add(line[9:].strip())
    print(f'{name} found hosts: {len(hosts)}')
    return name, hosts
50 |
51 |
def upd_disconnect():
    """
    Download the Disconnect simple_ad list and collect its entries.

    The first 3 lines are skipped (presumably a header - confirm), as well as
    blank lines and `#` comments.

    Returns a `(name, hosts)` tuple where `hosts` is a set of strings.
    Raises `requests.RequestException` on a network error, a timeout or an
    HTTP error status.
    """
    name = 'DISCONNECT'
    URL = 'https://s3.amazonaws.com/lists.disconnect.me/simple_ad.txt'
    # timeout in seconds; without it a stalled server blocks forever
    r = requests.get(URL, timeout=30)
    print(r, URL)
    # do not parse an error page as if it were a host list
    r.raise_for_status()
    hosts = set()
    for line in r.text.split('\n')[3:]:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        hosts.add(line)
    print(f'{name} found hosts: {len(hosts)}')
    return name, hosts
65 |
66 |
def upd_w3kbl():
    """
    Download the w3kbl list and collect the first space-separated field of
    every entry.

    The first 6 lines are skipped (presumably a header - confirm), as well as
    blank lines and `#` comments.

    Returns a `(name, hosts)` tuple where `hosts` is a set of strings.
    Raises `requests.RequestException` on a network error, a timeout or an
    HTTP error status.
    """
    name = 'W3KBL'
    URL = 'https://v.firebog.net/hosts/static/w3kbl.txt'
    # timeout in seconds; without it a stalled server blocks forever
    r = requests.get(URL, timeout=30)
    print(r, URL)
    # do not parse an error page as if it were a host list
    r.raise_for_status()
    hosts = set()
    for line in r.text.split('\n')[6:]:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        hosts.add(line.split(" ")[0])
    print(f'{name} found hosts: {len(hosts)}')
    return name, hosts
80 |
81 |
def save_result():
    """
    Build the block list and write it to a file, one entry per line.

    The output path is the first command line argument, or 'blocklist.txt'.
    An entry is kept when it is in the custom list, or when it appears in one
    of these pairs of downloaded lists: adaway+w3kbl, adaway+easylist,
    easylist+w3kbl, easylist+disconnect, w3kbl+disconnect.
    Entries shorter than 5 characters are not written.
    """
    OUTPUT = sys.argv[1] if len(sys.argv) > 1 else 'blocklist.txt'

    # Custom list of block rules
    CUSTOM = {
        # google
        'google-analytics.com',
        'google.com/adsense/search',
        'google.com/recaptcha',
        'googleads.g.doubleclick.net/pagead',
        'googleoptimize.com',
        'gstatic.com/recaptcha/releases',
        # amazon
        'fls-na.amazon.com',
        'cloudfront-labs.amazonaws.com',
        'unagi.amazon.com/\\d/events',
        # other
        'match.adsrvr.org/track',
        # cookie popups
        'cdn.cookielaw.org',
    }
    # popular lists
    _, easy = upd_easylist()
    _, adaway = upd_adaway()
    _, disco = upd_disconnect()
    _, w3kbl = upd_w3kbl()

    hosts = CUSTOM | (adaway & w3kbl) | (adaway & easy) | (easy & w3kbl) | (easy & disco) | (w3kbl & disco)
    # Filter before writing so that the reported count matches the file content.
    kept = sorted(x for x in hosts if len(x) >= 5)
    with open(OUTPUT, 'w', encoding='utf-8') as fd:
        fd.write('# Generated from update_bad_hosts.py\n')
        for x in kept:
            fd.write(f'{x}\n')
    print(f'Written {len(kept)} hosts in {OUTPUT}')


if __name__ == '__main__':
    save_result()
120 |
--------------------------------------------------------------------------------
/cli/record.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 | import fs from 'fs';
3 | import { gzip } from 'zlib';
4 | import { promisify } from 'util';
5 | import { minify } from 'html-minifier-terser';
6 | import prettyBytes from 'pretty-bytes';
7 | import mri from 'mri';
8 |
9 | import pkg from '../package.json' with { type: 'json' };
10 | import { recordPage } from '../src/record.js';
11 | import { delay } from '../src/util.js';
12 |
// Command line options understood by `web-record` (parsed with mri).
const options = {
  boolean: ['help', 'version'],
  alias: {
    i: 'input',
    o: 'output',
    v: 'version',
    z: 'gzip',
    css: 'addCSS',
    rm: 'removeElems',
    drop: 'dropRequests',
  },
  default: {
    // browser: 'chromium', // only Chromium supported for now
    gzip: null, // compress final JSON
    headless: null, // visible browser window
    blockAds: null, // enable AdBlocker?
    blockList: null, // block domains from custom list
    extraMeta: null, // extract meta from HTML?
    iframes: null, // capture iframes?
    js: 'on', // JS execution and capturing
    minify: null, // min final HTML before save
    minCSS: null, // min final CSS before save
    purgeCSS: null, // purge unused CSS and generate 1 single CSS file
    timeout: 15, // navigation timeout
    imgTimeout: 15,
    wait: 5, // wait for user interaction (seconds)
    // headers: 'content-type, date', // Content-Type header is pretty important
    headers: 'content-type, content-length, content-range, date, content-language, last-modified', // extended version
    userAgent: '', // custom user agent
    // userAgent: Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36
    dropRequests: '', // drop matching requests
    dropStatus: '', // drop matching statuses
    removeElems: '', // remove page elements
    addCSS: '', // add extra CSS
    console: null, // print browser's console msgs
  },
};

/**
 * Minify `snapshot.html` in place and print the size change.
 * A minifier failure is logged and the HTML is left as it was.
 */
async function minifySnapshotHTML(snapshot) {
  const before = snapshot.html.length;
  try {
    snapshot.html = await minify(snapshot.html, {
      caseSensitive: true,
      collapseBooleanAttributes: true,
      collapseWhitespace: true,
      conservativeCollapse: true,
      continueOnParseError: true,
      quoteCharacter: "'",
      removeAttributeQuotes: true,
      removeStyleLinkTypeAttributes: true,
      sortAttributes: true,
      sortClassName: true,
    });
    const after = snapshot.html.length;
    const pretty = prettyBytes(after, { maximumFractionDigits: 2 });
    const ratio = ((after / before) * 100).toFixed(2);
    const fmt = Intl.NumberFormat('en');
    console.log(`Body HTML minify efficiency ${ratio}% from ${fmt.format(before)} to ${fmt.format(after)} (${pretty})`);
  } catch (err) {
    console.error('Cannot minify HTML!', err);
  }
}

/**
 * Serialize the snapshot as JSON and write it to `fname`:
 * compact and gzipped when `compress` is truthy, pretty-printed otherwise.
 */
async function writeSnapshot(fname, snapshot, compress) {
  const payload = compress
    ? await promisify(gzip)(JSON.stringify(snapshot))
    : JSON.stringify(snapshot, null, 2);
  await fs.promises.writeFile(fname, payload, { encoding: 'utf8' });
}

// Entry point: record the page, wait, then close the browser; the snapshot
// file is written by the page 'close' handler.
async function main() {
  const args = mri(process.argv.slice(2), options);

  if (args.version) {
    console.log('Web-Snap v' + pkg.version);
    return;
  }

  const { snapshot, page, browser } = await recordPage(args);

  page.on('close', async () => {
    if (args.minify) {
      await minifySnapshotHTML(snapshot);
    }
    await writeSnapshot(args.OUT, snapshot, args.gzip);
    console.log(`Snapshot file: "${args.OUT}" was saved`);
    process.exit();
  });

  console.log(`Waiting ${args.wait / 1000} sec...`);
  await delay(args.wait);
  await browser.close();
}

main();
102 |
--------------------------------------------------------------------------------
/src/restore.js:
--------------------------------------------------------------------------------
1 | /*
2 | * Restore a recorded page.
3 | */
4 | import { chromium } from 'playwright';
5 |
6 | import { requestKey, normalizeURL, toBool, smartSplit, parseSnapshot, decodeBody } from './util.js';
7 |
/**
 * Normalize the restore options in place.
 *
 * Flags are converted to booleans, `timeout` and `wait` from seconds to
 * milliseconds, `removeElems` is split into the `REMOVE` list, and when a
 * snapshot path was given the parsed file is stored in `args.RECORD`.
 *
 * @param {object} args - options object (mutated)
 * @returns {Promise<void>}
 */
async function processArgs(args) {
  args.js = toBool(args.js);
  args.headless = toBool(args.headless); // debug & tests
  args.offline = toBool(args.offline);
  args.timeout = parseInt(args.timeout, 10) * 1000;
  args.wait = parseInt(args.wait, 10) * 1000;
  args.REMOVE = smartSplit(args.removeElems);

  // The positional argument wins, `--input` / `-i` is the fallback.
  // (`a ? b : null || c` parses as `a ? b : (null || c)`, so with an empty
  // positional list the old expression never looked at `args.input`.)
  const snap = (args._ && args._[0]) || args.input;
  if (snap) {
    args.RECORD = await parseSnapshot(snap);
  }
}
21 |
/**
 * Open a Chromium page and replay a recorded snapshot in it.
 *
 * Every request of the browser context is intercepted: the index URL is
 * answered with the snapshot HTML, recorded responses are answered from the
 * snapshot, and anything else is logged as missing and passed through.
 *
 * @param {object} args - restore options; normalized in place. `args.RECORD`
 *   may carry an already parsed snapshot instead of a file path.
 * @returns {Promise<{page: object, context: object, browser: object}|undefined>}
 *   the Playwright handles, or undefined (after logging the reason) when the
 *   snapshot is missing or invalid
 */
export async function restorePage(args) {
  await processArgs(args);
  const record = args.RECORD;

  if (!record) {
    console.error('Empty snapshot file! Cannot launch!');
    return;
  }
  if (!((record.url || record.base_url) && record.html && record.responses)) {
    console.error('Invalid snapshot file! Cannot launch!');
    return;
  }

  // Named targetURL so the global URL class is not shadowed in this function.
  const targetURL = normalizeURL(record.base_url || record.url);
  console.log('Restoring URL:', targetURL);

  const browser = await chromium.launch({
    headless: args.headless,
    args: [
      '--allow-running-insecure-content',
      '--disable-background-networking',
      '--disable-breakpad',
      '--disable-crash-reporter',
      '--disable-default-apps',
      '--disable-demo-mode',
      '--disable-extensions',
      '--disable-features=IsolateOrigins',
      '--disable-site-isolation-trials',
      '--disable-speech-api',
      '--disable-sync',
      '--disable-web-security',
    ],
  });

  const context = await browser.newContext({
    bypassCSP: true,
    acceptInsecureCerts: true,
    ignoreHTTPSErrors: true,
    javaScriptEnabled: args.js,
    offline: args.offline,
    // serviceWorkers: 'block',
    viewport: null,
  });

  const page = await context.newPage();

  page.on('console', async (msg) => {
    if (msg.text().startsWith('Failed to load resource')) return;
    console.log(`CONSOLE ${msg.type()}: ${msg.text()}`);
  });

  page.setDefaultTimeout(args.timeout);
  await context.route('**', async (route) => {
    const r = route.request();
    const u = normalizeURL(r.url());

    if (u === targetURL) {
      console.log(`Restored INDEX from CACHE: ${u}`);
      await route.fulfill({
        contentType: 'text/html; charset=utf-8',
        body: record.html,
      });
      return;
    }

    const key = requestKey(r);
    const cached = record.responses[key];
    if (cached && cached.status) {
      // a recorded response may carry no headers at all
      const contentType = cached.headers?.['content-type'];
      // ignore all javascript requests on restore, when JS disabled
      if (
        !args.js &&
        contentType &&
        (contentType.startsWith('text/javascript') ||
          contentType.startsWith('application/javascript') ||
          contentType.startsWith('application/x-javascript'))
      ) {
        // HTTP 204 = NO CONTENT
        await route.fulfill({ status: 204 });
        return;
      }
      console.log(`Restored from CACHE: ${key}`);
      // Quoted-printable bodies decode to a string holding one character per
      // original byte. A string body would be sent as UTF-8, which re-encodes
      // every byte above 0x7F; 'latin1' maps each character back to its byte.
      const decoded = decodeBody(cached.body);
      await route.fulfill({
        contentType: contentType || '',
        body: typeof decoded === 'string' ? Buffer.from(decoded, 'latin1') : decoded,
        // the status recorded for THIS response (the snapshot root has none)
        status: cached.status,
        headers: cached.headers, // Some headers may be useful here
      });
      return;
    }

    // else
    console.log(`MISSING resource: ${key}`);
    await route.continue(); // or abort ??
  });

  // navigate to the resolved URL instead of the user provided one
  try {
    await page.goto(targetURL, { waitUntil: 'networkidle' });
  } catch (err) {
    console.error('Page timeout:', err);
  }

  // overwrite page content with the one from the snapshot, to fix potential JS issues
  if (args.overwrite && args.js) {
    console.log('REWRITE page content from snapshot..');
    try {
      // awaited so the element removal below runs on the rewritten page
      await page.setContent(record.html);
    } catch (err) {
      console.error('Page rewrite failed:', err);
    }
  }

  for (const selector of args.REMOVE) {
    console.log('REMOVE element selector:', selector);
    await page.evaluate((s) => {
      for (const el of document.querySelectorAll(s)) {
        el.parentNode.removeChild(el);
      }
    }, selector);
  }

  return { page, context, browser };
}
142 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Web-snapshots
2 |
3 | Create "perfect" snapshots of web pages.
4 |
5 |
6 | ## Install
7 |
8 | ``` shell
9 | $ npm install git+https://github.com/croqaz/web-snap.git
10 | ```
11 |
12 | ## Usage
13 |
14 | ``` shell
15 | $ web-record https://en.wikipedia.org/wiki/Online_and_offline
16 | ```
17 |
18 | This will open a Chrome-like browser, show you the page and create an output file called by default: "snapshot_en.wikipedia.org.json"
19 | To restore this snapshot file, you can use:
20 |
21 | ``` shell
22 | $ web-restore snapshot_en.wikipedia.org.json
23 | ```
24 |
25 | This will open a Chrome-like browser, show the page and you can read it even if you're offline.
26 |
27 | You can also save and restore more complicated pages, like Amazon products:
28 |
29 | ``` shell
30 | $ web-record https://www.amazon.com/dp/B07978J597/
31 | $ web-restore snapshot_amazon.com.json
32 | ```
33 |
34 | Note that on some pages you should scroll a little and hover over some elements, to make sure the whole page and all its images are loaded before the snapshot is taken.
35 | This is not a limitation of web-snap, it's how modern browsers and pages are intentionally built to load resources lazily, on demand.
36 |
37 | For a complete example, with all the flags:
38 |
39 | ``` shell
40 | $ web-record https://en.wikipedia.org/wiki/Online_and_offline --gzip \
41 | --rm 'script, #mw-navigation, #mw-page-base, #mw-head-base, #footer-icons' \
42 | --css '#content{margin-left:0 !important}' --drop '.png$, .css$' --wait 10 \
43 | --js off --minify --purgeCSS
44 | ```
45 |
46 | 
47 |
48 | This will store the page just like before, but it will do a lot of pre-processing, to reduce the snapshot size from *1.3MB*, to only *27K* (48x smaller), without losing any useful information.
49 |
50 | The `--gzip` flag will archive the JSON using GZIP. It is totally safe to use.
51 | The `--rm` flag, or `--removeElems`, will remove the specified page elements, using selectors. This can be used to remove useless elements so you can focus on the important content and reduce the snapshot size.
52 | The `--css` flag, or `--addCSS`, will add custom CSS on the page, before creating the snapshot. This can be used to change the font size, or move some elements to make the page look nicer.
53 | The `--drop`, or `--dropRequests` flag, will drop all HTTP requests matching, with regex. This can be used to stop tracking requests and reduce the final snapshot size.
54 | The `--wait` flag sets how long the browser page will stay open (in seconds), to allow the user to interact with the page, eg: accept cookies, close popups, scroll a little, hover some images.
55 | The `--js` flag will stop the browser from executing Javascript and will drop all Javascript requests, which usually reduces the snapshot size by A LOT. NOTE that this option will completely break many pages.
56 | The `--minify` flag will try to compress the final HTML as much as possible, to reduce the snapshot size. NOTE that this can crash for some pages with lots of Javascript.
57 | The `--purgeCSS` flag will purge all unused CSS and replace all styles with this processed CSS. This can reduce the snapshot size by A LOT, but will completely break some pages.
58 |
59 | And a last example, how to capture an Amazon page:
60 |
61 | ``` shell
62 | web-record https://www.amazon.com/dp/B086CV781H --gzip \
63 | --rm 'script #nav-main #mars-fs-wrapper #rhf #navFooter #navBackToTop' \
64 | --blockAds yes --blockList block/blocklist.txt --drop '//unagi.amazon.com/1' \
65 | --js off --minify --wait 10
66 | ```
67 |
68 | 
69 |
70 | These options will reduce the Amazon snapshot from ~*21MB*, to *857K* (24x smaller), without losing any useful information.
71 |
72 | If you care about the snapshot size, you need to try different options depending on the domain, to see what works, because some options will break the page on restore.
73 |
74 |
75 | ## File format
76 |
77 | The `snapshot.json` file format is simple:
78 |
79 | - url - is the URL specified when creating the snapshot
80 | - base_url - this is the resolved URL, after redirects (eg: may redirect to HTTPS and www.)
81 | - canonical_url - (optional) this is the canonical URL of the page
82 | - title - (optional) this is the title of the page
83 | - html - is the final, settled HTML of the page
84 | - responses - contains all the resources of the page (CSS, JS, images, etc) as key-value pairs:
85 | - body - the resource body saved as Quopri or Base64
86 | - headers - a limited subset of the response headers
87 | - request_url - the initial resource URL
88 | - response_url - (optional) the final response URL, after redirects (if it's different than the request URL)
89 | - status - a number representing the HTTP status
90 |
91 | The format is subject to change, ideally to simplify it.
92 |
93 |
94 | ## Limitations
95 |
96 | This format doesn't usually capture the audio and video of the page.
97 | This means you can't completely capture Youtube, Vimeo, or Spotify pages. (YET? or never?)
98 | This limitation may change in the future, but it's not the primary goal of the project.
99 |
100 | There are also issues with some iframes and shadow DOM nodes.
101 |
102 | Read my article that compares WARC, rrWeb and "recorded":
103 | https://crlf.link/log/entries/220803-web-snap/
104 |
105 |
106 | ## Similar
107 |
108 | - https://github.com/Y2Z/monolith
109 | - https://github.com/go-shiori/obelisk
110 | - https://github.com/danburzo/percollate
111 | - https://github.com/croqaz/clean-mark
112 | - https://github.com/gildas-lormeau/SingleFile
113 | - https://github.com/sindresorhus/capture-website
114 |
115 | Also check:
116 |
117 | - https://crlf.link/mem/offline
118 | - https://crlf.link/mem/web-archiving
119 |
--------------------------------------------------------------------------------
/block/blocklist.txt:
--------------------------------------------------------------------------------
1 | # Generated from update_bad_hosts.py
2 | 360yield.com
3 | 3gl.net
4 | 3p-geo.yahoo.com
5 | 3p-udc.yahoo.com
6 | 600z.com
7 | a-ads.com
8 | a-reporting.nytimes.com
9 | a.mobify.com
10 | a.ucoz.net
11 | a1.api.bbc.co.uk
12 | aamt.nbcnews.com
13 | aamt.today.com
14 | acdn.adnxs.com
15 | activemetering.com
16 | ad-balancer.net
17 | ad-cdn.technoratimedia.com
18 | ad-delivery.net
19 | ad-serverparc.nl
20 | ad-stir.com
21 | ad.daum.net
22 | ad.doubleclick.net
23 | ad.wsod.com
24 | adblockanalytics.com
25 | adbrite.com
26 | adbro.me
27 | adbutler.com
28 | adc-ad-assets.adtilt.com
29 | adcash.com
30 | adcloud.net
31 | ade.googlesyndication.com
32 | adform.net
33 | adgardener.com
34 | adlc-exchange.toast.com
35 | adlog.com.com
36 | admedo.com
37 | admitad.com
38 | admixer.net
39 | admob.com
40 | adnetworkperformance.com
41 | adnxs.com
42 | adocean.pl
43 | adotube.com
44 | adpacks.com
45 | adperium.com
46 | adrta.com
47 | ads-twitter.com
48 | ads.betfair.com
49 | ads.linkedin.com
50 | ads.samsung.com
51 | ads.saymedia.com
52 | ads.servebom.com
53 | ads1-adnow.com
54 | ads3-adnow.com
55 | ads5-adnow.com
56 | adsafeprotected.com
57 | adsame.com
58 | adscale.de
59 | adsdk.com
60 | adserver-2084671375.us-east-1.elb.amazonaws.com
61 | adserverplus.com
62 | adsfac.eu
63 | adsfac.net
64 | adsfac.us
65 | adskape.ru
66 | adsnative.com
67 | adsonar.com
68 | adspeed.net
69 | adspirit.de
70 | adsupply.com
71 | adtng.com
72 | adtoma.com
73 | adtrace.org
74 | adtrue.com
75 | advertica-cdn.com
76 | adviva.net
77 | adx-exchange.toast.com
78 | adxpansion.com
79 | adzmedia.com
80 | affiliate.dtiserv.com
81 | affiliatefuel.com
82 | affiliatefuture.com
83 | affiliates.thrixxx.com
84 | affiliatewindow.com
85 | affiz.net
86 | agkn.com
87 | aimatch.com
88 | alexandria.marfeelcdn.com
89 | alphonso.tv
90 | als-svc.nytimes.com
91 | altitude-arena.com
92 | am15.net
93 | amazon-adsystem.com
94 | amp-error-reporting.appspot.com
95 | amplifypixel.outbrain.com
96 | analytics-prod2.glance-internal.inmobi.com
97 | analytics-production.hapyak.com
98 | analytics-static.ugc.bazaarvoice.com
99 | analytics-tracker.thescore.com
100 | analytics.163.com
101 | analytics.analytics-egain.com
102 | analytics.carambo.la
103 | analytics.chase.com
104 | analytics.edgekey.net
105 | analytics.ff.avast.com
106 | analytics.foresee.com
107 | analytics.getshogun.com
108 | analytics.glance.inmobi.com
109 | analytics.kaltura.com
110 | analytics.kidoz.net
111 | analytics.kongregate.io
112 | analytics.logsss.com
113 | analytics.mailmunch.co
114 | analytics.nike.com
115 | analytics.plex.tv
116 | analytics.reyrey.net
117 | analytics.shareaholic.com
118 | analytics.tiktok.com
119 | analytics.tout.com
120 | analytics.vendemore.com
121 | analytics.wildtangent.com
122 | analytics.yahoo.com
123 | analytics.yolacdn.net
124 | analytics.yomedia.vn
125 | analytics.ziftsolutions.com
126 | andomedia.com
127 | annoyingacoustics.com
128 | api-js.mixpanel.com
129 | api.amplitude.com
130 | api.branch.io
131 | app.adjust.com
132 | appads.com
133 | ariane.abtasty.com
134 | as5000.com
135 | assets.micpn.com
136 | aswpsdkus.com
137 | atdmt.com
138 | atwola.com
139 | audit.median.hu
140 | axonix.com
141 | b.fox.com
142 | banners.adultfriendfinder.com
143 | banners.amigos.com
144 | banners.cams.com
145 | banners.passion.com
146 | banners.videosecrets.com
147 | bannershotlink.perfectgonzo.com
148 | bans.bride.ru
149 | bat.bing.com
150 | batch.upsight-api.com
151 | bats.video.yahoo.com
152 | beacon.flow.io
153 | beacon.qq.com
154 | beacon.riskified.com
155 | beacon.shazam.com
156 | beacon.sina.com.cn
157 | beacon.sojern.com
158 | beacons.mediamelon.com
159 | beap.gemini.yahoo.com
160 | bidder.criteo.com
161 | bids.concert.io
162 | bidswitch.net
163 | blogherads.com
164 | bluekai.com
165 | boomads.com
166 | bootstrap.upsight-api.com
167 | brainient.com
168 | brandreachsys.com
169 | bridgetrack.com
170 | bs.yandex.ru
171 | btloader.com
172 | bttrack.com
173 | c.bigmir.net
174 | c.mgid.com
175 | casalemedia.com
176 | cash4members.com
177 | cashlayer.com
178 | cc.swiftype.com
179 | ccgateway.net
180 | cdn-channels-pixel.ex.co
181 | cdn.cookielaw.org
182 | cdn.districtm.io
183 | cdn.usefathom.com
184 | cdn7.rocks
185 | cdnwidget.com
186 | cgicounter.puretec.de
187 | chanalytics.merchantadvantage.com
188 | checkm8.com
189 | chitika.net
190 | ck.connatix.com
191 | clickbooth.com
192 | clickboothlnk.com
193 | clickthruserver.com
194 | clickxchange.com
195 | client-analytics.braintreegateway.com
196 | clkrev.com
197 | cloudfront-labs.amazonaws.com
198 | cnt.my
199 | collect.banggood.com
200 | collect.igodigital.com
201 | collector.cint.com
202 | collector.xhamster.com
203 | colossusssp.com
204 | confiant-integrations.global.ssl.fastly.net
205 | connextra.com
206 | content.tapjoy.com
207 | contentabc.com
208 | contextweb.com
209 | count-server.sharethis.com
210 | count.rin.ru
211 | counter.bloke.com
212 | counter.cnw.cz
213 | counter.rambler.ru
214 | counter.snackly.co
215 | counter.yadro.ru
216 | cpays.com
217 | cpx.to
218 | cpxinteractive.com
219 | creative-serving.com
220 | creativecdn.com
221 | creatives.livejasmin.com
222 | crwdcntrl.net
223 | ct.pinterest.com
224 | cws.conviva.com
225 | cxad.cxense.com
226 | dapper.net
227 | dc.banggood.com
228 | dcinfos-cache.abtasty.com
229 | dd.nytimes.com
230 | dedicatedmedia.com
231 | demdex.net
232 | detect.rayjump.com
233 | direct-events-collector.spot.im
234 | directaclick.com
235 | directorym.com
236 | domdex.com
237 | doubleclick.com
238 | doubleclick.net
239 | doublepimp.com
240 | dpmsrv.com
241 | drfdisvc.walmart.com
242 | ds-aksb-a.akamaihd.net
243 | eacdn.com
244 | earnify.com
245 | ebuzzing.com
246 | ebz.io
247 | effectivemeasure.net
248 | emediate.dk
249 | emxdgt.com
250 | engine.fyber.com
251 | entrecard.s3.amazonaws.com
252 | eqads.com
253 | ero-advertising.com
254 | error-collector.ted.com
255 | et.nytimes.com
256 | euros4click.de
257 | event.collector.scopely.io
258 | events.attentivemobile.com
259 | events.brightline.tv
260 | events.privy.com
261 | events.redditmedia.com
262 | everesttech.net
263 | exoclick.com
264 | extend.tv
265 | extremereach.io
266 | eyereturn.com
267 | eyeviewads.com
268 | fam-ad.com
269 | fastapi.net
270 | fastclick.net
271 | fimserve.com
272 | firstlightera.com
273 | fls-na.amazon.com
274 | fls.doubleclick.net
275 | fmpub.net
276 | fuse.forbes.com
277 | fusionads.net
278 | fwmrm.net
279 | g.doubleclick.net
280 | ga-beacon.appspot.com
281 | gammaplatform.com
282 | ganon.yahoo.com
283 | gateway.foresee.com
284 | genieessp.com
285 | geo.nbcsports.com
286 | geo.yahoo.com
287 | geobanner.adultfriendfinder.com
288 | geolocation.forbes.com
289 | gj.mmstat.com
290 | gmads.net
291 | google-analytics.com
292 | google.com/adsense/search
293 | google.com/recaptcha
294 | googleads.g.doubleclick.net/pagead
295 | googleoptimize.com
296 | googlesyndication.com
297 | googletagservices.com
298 | greystripe.com
299 | gstatic.com/recaptcha/releases
300 | gwallet.com
301 | h12-media.com
302 | harrenmedianetwork.com
303 | hb.nexage.com
304 | hbopenbid.pubmatic.com
305 | hghit.com
306 | hits.informer.com
307 | hs-analytics.net
308 | hyperbanner.net
309 | iabusprivacy.pmc.com
310 | id5-sync.com
311 | ilyf4amifh.com
312 | imp.optaim.com
313 | imrworldwide.com
314 | in.treasuredata.com
315 | innity.net
316 | insightexpress.com
317 | insightexpressai.com
318 | irs01.com
319 | is-tracking-pixel-api-prod.appspot.com
320 | ja2n2u30a6rgyd.com
321 | jiwire.com
322 | juiceadv.com
323 | k.streamrail.com
324 | kanoodle.com
325 | karma.mdpcdn.com
326 | krxd.net
327 | kvinit-prod.api.kochava.com
328 | l.sharethis.com
329 | lakequincy.com
330 | lciapi.ninthdecimal.com
331 | leadbolt.net
332 | lfstmedia.com
333 | lgsmartad.com
334 | liftdna.com
335 | ligatus.com
336 | lightningcast.net
337 | linkbuddies.com
338 | linkexchange.com
339 | linkreferral.com
340 | log.adplex.co.kr
341 | log.go.com
342 | log.medietall.no
343 | log.outbrain.com
344 | log.pinterest.com
345 | log.sina.cn
346 | log.snapdeal.com
347 | logging.api.intuit.com
348 | logx.optimizely.com
349 | loopme.me
350 | lovelydrum.com
351 | ls.srvcs.tumblr.com
352 | lucidmedia.com
353 | lzjl.com
354 | ma.logsss.com
355 | madadsmedia.com
356 | mainadv.com
357 | marketgid.com
358 | marketing.888.com
359 | match.adsrvr.org/track
360 | match.prod.bidr.io
361 | matheranalytics.com
362 | maxonclick.com
363 | mbid.marfeelrev.com
364 | media6degrees.com
365 | mediaforge.com
366 | medleyads.com
367 | medyanetads.com
368 | metrics-logger.spot.im
369 | metrics.aetn.com
370 | metrics.brightcove.com
371 | metrics.fedex.com
372 | metrics.icloud.com
373 | metrics.kmsmep.com
374 | metrics.roblox.com
375 | metrics.ted.com
376 | metrics.timewarnercable.com
377 | mgid.com
378 | microad.net
379 | millennialmedia.com
380 | ml314.com
381 | mmismm.com
382 | mng-ads.com
383 | mocean.mobi
384 | monetize-api.coronalabs.com
385 | morgdm.ru
386 | mpnrs.com
387 | msads.net
388 | munchkin.marketo.net
389 | mythings.com
390 | nappyattack.com
391 | neocounter.neoworx-blog-tools.net
392 | nervoussummer.com
393 | nexus.ensighten.com
394 | nmcdn.us
395 | nuseek.com
396 | onclickads.net
397 | oneid.mmstat.com
398 | ophan.theguardian.com
399 | optad360.io
400 | osimg.nbcuni.com
401 | outcome-ssp.supersonicads.com
402 | overture.com
403 | oxado.com
404 | p.metrilo.com
405 | p.placed.com
406 | p.skimresources.com
407 | pagead2.googlesyndication.com
408 | pages-stats.rbl.ms
409 | pcash.imlive.com
410 | perf-events.cloud.unity3d.com
411 | perr.h-cdn.com
412 | pgmediaserve.com
413 | pgpartner.com
414 | phonograph2.voxmedia.com
415 | pi.ispot.tv
416 | ping.dozuki.com
417 | pingback.issuu.com
418 | pingjs.qq.com
419 | pings.conviva.com
420 | pippio.com
421 | pix.revjet.com
422 | pix.spot.im
423 | pixel.adsafeprotected.com
424 | pixel.facebook.com
425 | pixel.mtrcs.samba.tv
426 | pixel.wp.com
427 | pixiedust.buzzfeed.com
428 | placements.tapjoy.com
429 | platform.iteratehq.com
430 | player.adtelligent.com
431 | pointroll.com
432 | polarcdn-terrax.com
433 | polyad.net
434 | popads.net
435 | popunder.ru
436 | postrelease.com
437 | powerad.ai
438 | pr-bh.ybp.yahoo.com
439 | prd-collector-anon.ex.co
440 | prg.smartadserver.com
441 | primaryads.com
442 | projectwonderful.com
443 | promobenef.com
444 | promos.fling.com
445 | propellerads.com
446 | psa.carambo.la
447 | pt.ispot.tv
448 | pub.network
449 | pubmatic.com
450 | pushnami.com
451 | px.owneriq.net
452 | qnsr.com
453 | query1.petametrics.com
454 | r.skimresources.com
455 | ravm.tv
456 | rbthre.work
457 | recs.shareaholic.com
458 | referrer.disqus.com
459 | retagro.com
460 | retargeter.com
461 | rev2pub.com
462 | revcontent.com
463 | revmob.com
464 | revrtb.com
465 | rfihub.com
466 | rlcdn.com
467 | rlog.popin.cc
468 | rpc.tapjoy.com
469 | rtbpop.com
470 | rtbpopd.com
471 | rubiconproject.com
472 | rules.quantcount.com
473 | run-syndicate.com
474 | s.beop.io
475 | s.logsss.com
476 | s2d6.com
477 | samsungads.com
478 | sanalytics.disneyplus.com
479 | sanalytics.tbs.com
480 | sanalytics.verizon.com
481 | sanalytics.verizonwireless.com
482 | sbeacon.sina.com.cn
483 | sc-static.net
484 | sdk.appsflyer.com
485 | sdk.iad-01.braze.com
486 | sdk.iad-02.braze.com
487 | sdk.iad-03.braze.com
488 | sdk.iad-06.braze.com
489 | secure.merchantadvantage.com
490 | secure.perk0mean.com
491 | securepubads.g.doubleclick.net
492 | seg.sharethis.com
493 | sekindo.com
494 | servedbyadbutler.com
495 | smaato.net
496 | smadex.com
497 | smartadserver.com
498 | smetrics.aa.com
499 | smetrics.bestbuy.com
500 | smetrics.boston.com
501 | smetrics.chrysler.com
502 | smetrics.cnn.com
503 | smetrics.cox.com
504 | smetrics.dickssportinggoods.com
505 | smetrics.foxnews.com
506 | smetrics.lululemon.com
507 | smetrics.southwest.com
508 | smetrics.walgreens.com
509 | smetrics1.experian.com
510 | smy.iheart.com
511 | socdm.com
512 | sociomantic.com
513 | sofia.trustx.org
514 | solocpm.com
515 | solutions.invocacdn.com
516 | sparkstudios.com
517 | specialdeals.g5e.com
518 | speee-ad.akamaized.net
519 | sponsorpay.com
520 | spotscenered.info
521 | spotxchange.com
522 | srepdata.usatoday.com
523 | srtb.msn.com
524 | sstats.teenvogue.com
525 | star-advertising.com
526 | stas.outbrain.com
527 | static.doubleclick.net
528 | statm.the-adult-company.com
529 | stats-dev.brid.tv
530 | stats.appsflyer.com
531 | stats.bluebillywig.com
532 | stats.olark.com
533 | stats.smartclip.net
534 | stats.wordpress.com
535 | stats.wp.com
536 | stats.zotabox.com
537 | stickyadstv.com
538 | summerhamster.com
539 | sw88.abc.com
540 | sw88.espn.com
541 | sw88.go.com
542 | sweb.ulta.com
543 | sync.adap.tv
544 | t.appsflyer.com
545 | t.indeed.com
546 | t.leady.com
547 | t.metrilo.com
548 | t.sharethis.com
549 | t.skimresources.com
550 | t.wayfair.com
551 | t2.hulu.com
552 | tag.leadplace.fr
553 | tag.mtrcs.samba.tv
554 | tagger.opecloud.com
555 | tags.tiqcdn.com
556 | tapjoyads.com
557 | targeting.washpost.nile.works
558 | telemetrics.klaviyo.com
559 | telemetry.malwarebytes.com
560 | telemetry.sdk.inmobi.com
561 | thrtle.com
562 | tidaltv.com
563 | top-fwz1.mail.ru
564 | totemcash.com
565 | tpc.googlesyndication.com
566 | tr.snapchat.com
567 | trace.qq.com
568 | track.dictionary.com
569 | track.pricespider.com
570 | track.tappx.com
571 | track.tiara.daum.net
572 | track.uc.cn
573 | tracker.icerocket.com
574 | tracker.nbcuas.com
575 | tracker.personizely.net
576 | tracking.adalyser.com
577 | tracking.bloomberg.com
578 | tracking.immobilienscout24.de
579 | tracking.leadlander.com
580 | tracking.lengow.com
581 | tracking.lg.com
582 | tracking.listhub.net
583 | tracking.miui.com
584 | tradeadexchange.com
585 | trafficfactory.biz
586 | traffichunt.com
587 | trafficjunky.net
588 | traxex.gannettdigital.com
589 | tredir.go.com
590 | tremorhub.com
591 | tribalfusion.com
592 | triggers.wfxtriggers.com
593 | trk.clinch.co
594 | tru.am
595 | truoptik.com
596 | tvpixel.com
597 | twittad.com
598 | uimserv.net
599 | unagi.amazon.com/\d/events
600 | unrulymedia.com
601 | usc.adserver.snapads.com
602 | usersegment.wpdigital.net
603 | utarget.co.uk
604 | valueclick.com
605 | valueclickmedia.com
606 | vdopia.com
607 | vendimob.pl
608 | vi-serve.com
609 | videoevents.outbrain.com
610 | vindicosuite.com
611 | vntsm.com
612 | wass.ihsmarkit.com
613 | waust.at
614 | webads.co.nz
615 | webcounter.goweb.de
616 | websdk.appsflyer.com
617 | wetter.pushwoosh.com
618 | whaleads.com
619 | widget-pixels.outbrain.com
620 | widgetbucks.com
621 | wigetmedia.com
622 | wildcard.moatads.com.edgekey.net
623 | www.summerhamster.com
624 | x.disq.us
625 | xad.com
626 | xxxmyself.com
627 | yandexadexchange.net
628 | yieldads.com
629 | yieldlab.net
630 | yieldmanager.net
631 | yieldtraffic.com
632 | youborafds01.com
633 | youradexchange.com
634 | z.cdp-dev.cnn.com
635 | zenkreka.com
636 | zucks.net
637 |
--------------------------------------------------------------------------------
/src/record.js:
--------------------------------------------------------------------------------
1 | /*
2 | * Record a page.
3 | */
4 | import fs from 'fs';
5 | import fetch from 'cross-fetch';
6 | import playwright from 'playwright';
7 | import { transform as minifyCSS } from 'lightningcss';
8 | import { PurgeCSS } from 'purgecss';
9 | import { PlaywrightBlocker } from '@ghostery/adblocker-playwright';
10 |
11 | import { requestKey, normalizeURL, toBool, smartSplit, encodeBody } from './util.js';
12 |
/**
 * Normalize the raw recording options in place.
 *
 * Flags are coerced with `toBool`, the wait/timeout values are turned into
 * numbers and multiplied by 1000, the comma-style list options are split with
 * `smartSplit`, the optional block list file is loaded, and the target URL
 * (`args.URI`) and output file name (`args.OUT`) are resolved.
 *
 * @param {object} args - parsed options; `args._` may hold the positional
 *   values `[url, output]`, with `input`/`url` and `output` as fallbacks
 * @returns {Promise<void>} resolves once `args` has been updated
 * @throws {TypeError} from `new URL()` when no valid URL could be resolved
 */
async function processArgs(args) {
  args.gzip = toBool(args.gzip);
  args.js = toBool(args.js);
  args.blockAds = toBool(args.blockAds);
  args.extraMeta = toBool(args.extraMeta);
  args.headless = toBool(args.headless);
  args.iframes = toBool(args.iframes);
  args.minify = toBool(args.minify);
  args.minCSS = toBool(args.minCSS);
  args.purgeCSS = toBool(args.purgeCSS);
  args.console = toBool(args.console);

  // `--wait` is documented in seconds; the values are scaled by 1000 here.
  // An explicit radix keeps inputs such as "0x10" from being read as hex.
  args.wait = parseInt(args.wait, 10) * 1000;
  args.timeout = parseInt(args.timeout, 10) * 1000;
  args.imgTimeout = parseInt(args.imgTimeout, 10) * 1000;

  args.DROP = smartSplit(args.dropRequests).map((x) => new RegExp(x, 'i'));
  args.HEADERS = smartSplit(args.headers).map((x) => x.toLowerCase());
  args.REMOVE = smartSplit(args.removeElems);
  args.CSS = args.addCSS ? args.addCSS.trim() : '';

  // status patterns may use "x" as a digit wildcard, eg: 4xx
  args.DROPST = smartSplit(args.dropStatus).map((x) => new RegExp(x.replace(/x/gi, '\\d')));
  if (args.blockList) {
    const blockList = await fs.promises.readFile(args.blockList, { encoding: 'utf8' });
    args.DROPLI = blockList
      .split('\n')
      .map((x) => x.trim().replace(/\/+$/, ''))
      .filter((x) => x && !x.startsWith('#') && x.length > 5)
      // Each entry is inserted as regex source, unescaped.
      // The dots of the optional www./m. prefix need a double backslash:
      // inside a template literal "\." collapses to a bare "." (any char).
      .map((x) => new RegExp(`^https?://(www\\.|m\\.)?${x}/.+`, 'i'));
    console.log(`Loaded ${args.DROPLI.length} drop list domains from file`);
  }

  // Positional values win, then the named options. The fallback has to be
  // applied per value: `a ? b : null || c` parses as `a ? b : (null || c)`,
  // which ignored --input/--url/--output whenever `args._` was an empty list.
  const positional = args._ || [];
  args.URI = positional[0] || args.input || args.url;
  let HOST = new URL(args.URI).host;
  if (HOST.startsWith('www.')) HOST = HOST.slice(4);
  let OUT = positional[1] || args.output;
  if (!OUT) OUT = `snapshot_${HOST}.json`;
  if (args.gzip && !OUT.endsWith('.gz')) OUT += '.gz';
  args.OUT = OUT;
}
54 |
/**
 * Launch a Chromium browser, open a fresh page and record a snapshot of it.
 *
 * The options are first normalized by `processArgs`; the recording itself is
 * delegated to `internalRecordPage`.
 *
 * @param {object} args - recording options (mutated by `processArgs`)
 * @returns {Promise<{snapshot: object, page: object, context: object, browser: object}>}
 *   the snapshot together with the Playwright handles, which are returned
 *   still open
 */
export async function recordPage(args) {
  await processArgs(args);

  // only Chromium supported for now
  const launchOptions = {
    headless: args.headless,
    // disable-web-security is needed to access cross-origin resources,
    // eg: CSS rules
    args: ['--disable-default-apps', '--disable-web-security'],
  };
  const browser = await playwright.chromium.launch(launchOptions);

  const contextOptions = {
    javaScriptEnabled: args.js,
    userAgent: args.userAgent,
    bypassCSP: true,
    ignoreHTTPSErrors: true,
    serviceWorkers: 'block',
    viewport: null,
  };
  const context = await browser.newContext(contextOptions);
  const page = await context.newPage();

  // mirror the page's console messages into the Node console
  if (args.console) {
    page.on('console', async (message) => {
      const pendingValues = message.args().map((handle) => handle.jsonValue());
      const values = await Promise.all(pendingValues);
      console.log(`CONSOLE.${message.type().toUpperCase()}:`, ...values);
    });
  }

  if (args.blockAds) {
    const adBlocker = await PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch);
    await adBlocker.enableBlockingInPage(page);
  }

  const snapshot = await internalRecordPage(args, page);
  return { snapshot, page, context, browser };
}
92 |
/**
 * Record everything the page loads and build the snapshot object.
 *
 * Registers request blocking and a response listener on `page`, navigates to
 * `args.URI`, then captures the page HTML after the optional clean-up steps
 * (element removal, custom CSS, CSS minify/purge).
 *
 * @param {object} args - options as prepared by `processArgs`: URI, DROP,
 *   DROPLI, DROPST, HEADERS, REMOVE, CSS plus the flags and timeouts
 * @param {object} page - Playwright page used for the recording
 * @returns {Promise<object>} snapshot with `url`, `base_url`, `html` and
 *   `responses` (plus `date`, `title` and `canonical_url` with `extraMeta`)
 */
async function internalRecordPage(args, page) {
  const { URI, DROP, DROPLI, DROPST, HEADERS, REMOVE, CSS } = args;

  // DROPLI only exists when a block list file was loaded, so both lists are
  // defaulted before merging (spreading `undefined` throws a TypeError)
  const block = [...(DROP || []), ...(DROPLI || [])];
  if (block.length) {
    // the route must be registered before the navigation starts
    await page.route('**', async (route) => {
      const r = route.request();
      const u = normalizeURL(r.url());
      for (const re of block) {
        if (re.test(u)) {
          console.warn('Drop matching request:', re, u);
          await route.abort();
          return;
        }
      }
      await route.continue();
    });
  }

  let snapshot = { url: URI, base_url: '', html: '', responses: {} };
  if (args.extraMeta) {
    snapshot = {
      url: URI,
      base_url: '',
      canonical_url: '',
      date: new Date().toISOString(),
      title: '',
      html: '',
      responses: {},
    };
  }

  page.on('response', async (response) => {
    const r = response.request();
    const u = normalizeURL(r.url());
    if (u.startsWith('data:')) {
      return;
    }
    // ignore the index page, it will be saved at the end
    if (u === normalizeURL(URI)) return;

    const status = response.status();
    if (DROPST && DROPST.length) {
      for (const re of DROPST) {
        if (re.test(status.toString())) {
          console.warn('Drop matching status:', re, status);
          return;
        }
      }
    } else if (status >= 300 && status < 400) {
      // ignore redirect requests, they will be saved after resolved
      console.warn(`Redirect status: ${status}`, u, 'to:', response.headers()['location']);
      return;
    }
    // all the other statuses are allowed

    const key = requestKey(r);
    console.log('Response:', status, key);

    // restrict headers to subset
    const headers = Object.fromEntries(
      Object.entries(response.headers()).filter(([name]) => HEADERS.includes(name)),
    );
    const contentType = headers['content-type'];

    let body;
    try {
      const buffer = await response.body();
      body = encodeBody(r.resourceType(), contentType, buffer);
    } catch (err) {
      // the body could not be read; for frames, fall back to the frame content
      const frame = page.frame({ url: u });
      if (frame && args.iframes) {
        console.log('Capture IFRAME content for:', frame.url());
        const content = (await frame.content()).trim();
        body = encodeBody(r.resourceType(), contentType, Buffer.from(content, 'utf-8'));
      } else if (status !== 204) {
        console.error('ERR saving response for:', status, u, err);
      }
    }

    // save if the request was NOT cached, or it WAS cached
    // and the new response is successful (overwrite with fresh data)
    if (!snapshot.responses[key] || status === 200) {
      const entry = {
        body,
        headers,
        request_url: u,
        status,
      };
      // extend the entry instead of replacing it, so body/headers/status
      // are kept next to the final response URL
      if (u !== response.url()) {
        entry.response_url = response.url();
      }
      snapshot.responses[key] = entry;
    }
  });

  try {
    console.log('Waiting for the page to load...');
    await page.goto(URI, { timeout: args.timeout, waitUntil: 'networkidle' });
  } catch (err) {
    // best effort: a timeout still leaves a page that can be captured
    console.error('Wait timeout:', err);
  }

  // initial snapshot
  snapshot.html = (await page.content()).trim();

  const imgCount = await page.locator('img').count();
  if (imgCount > 0) {
    try {
      console.log('Waiting for images to load...');
      await page.waitForSelector('img', { timeout: args.imgTimeout });
    } catch (err) {
      console.error('Images timeout:', err);
    }
  }

  // resolved base URL
  snapshot.base_url = await page.evaluate('document.baseURI');

  if (args.extraMeta) {
    snapshot.title = (await page.title()).trim();
    // resolved canonical URL
    snapshot.canonical_url = await page.evaluate(
      `(document.querySelector("link[rel='canonical']") || document.createElement('link')).getAttribute('href')`,
    );
    if (!snapshot.canonical_url) delete snapshot.canonical_url;
  }

  // delete possible index duplicates, when user URL != resolved URL
  for (const indexURL of [snapshot.base_url, snapshot.canonical_url]) {
    if (!indexURL) continue;
    const indexKey = `GET:${indexURL}`;
    if (snapshot.responses[indexKey] && snapshot.responses[indexKey].body) {
      delete snapshot.responses[indexKey];
    }
  }

  for (const selector of REMOVE) {
    console.log('Removing element selector:', selector);
    await page.evaluate((s) => {
      for (const el of document.querySelectorAll(s)) {
        el.parentNode.removeChild(el);
      }
    }, selector);
  }

  if (CSS && CSS.length) {
    console.log('Adding custom CSS...');
    await page.evaluate((css) => {
      const cssHack = document.createElement('style');
      cssHack.className = 'hack';
      // textContent keeps the CSS verbatim; innerText would turn
      // line breaks into <br> elements inside the <style> node
      cssHack.textContent = css;
      document.head.appendChild(cssHack);
    }, CSS);
  }

  // second snapshot
  snapshot.html = (await page.content()).trim();

  if (args.minCSS || args.purgeCSS) {
    const [rawCSS, URLs] = await page.evaluate(() => {
      const urls = new Set();
      const css = [];
      console.log(`Collecting ${document.styleSheets.length} CSS styleSheets...`);
      // cycle #1 collect CSS
      for (const style of document.styleSheets) {
        if (style.href) urls.add(style.href);
        let raw;
        try {
          console.log(`Saving CSS rules for: ${style.ownerNode.localName} href=${style.href}`);
          raw = ` /* CSS for ${style.ownerNode.localName} href=${style.href} */ `;
          raw += Array.from(style.cssRules)
            .map((rule) => {
              if (rule.href) urls.add(rule.href);
              if (rule instanceof CSSImportRule) return '';
              return rule.cssText;
            })
            .join(' ');
        } catch (err) {
          console.warn(`Cannot access CSS: ${err}`);
        }
        if (raw) css.push(raw);
      }
      console.log(`Found ${css.length} CSS styleSheets...`);
      return [css, [...urls]];
    });

    let pageCSS = rawCSS.reduce((acc, curr) => `${acc} ${curr}`, ' ');
    const s1 = pageCSS.length;

    if (args.purgeCSS) {
      const purgedCSS = await new PurgeCSS().purge({
        css: [{ raw: pageCSS }],
        content: [{ raw: snapshot.html, extension: 'html' }],
      });
      pageCSS = purgedCSS.map(({ css }) => css).join(' ');
    }

    let finalCSS = '';
    try {
      finalCSS = minifyCSS({
        code: Buffer.from(pageCSS),
        minify: true,
      }).code.toString();
      const s2 = finalCSS.length;

      console.log(
        `CSS styles minify efficiency ${((s2 / s1) * 100).toFixed(2)}% from ` +
          `${Intl.NumberFormat('en').format(s1)} to ${Intl.NumberFormat('en').format(s2)}.`,
      );
    } catch (err) {
      // keep the un-minified CSS rather than losing the styles
      finalCSS = pageCSS;
      console.error(`Minify CSS failed with error: ${err}!`);
    }

    await page.evaluate((css) => {
      // cycle to remove CSS DOM nodes
      // this needs to run after collecting the CSS
      // and has to run a few times to remove all deep nodes ...
      // The loop stops as soon as a full pass removes nothing, so a sheet
      // without a removable owner node cannot keep it spinning forever.
      let removed = true;
      while (document.styleSheets.length > 0 && removed) {
        removed = false;
        for (const style of [...document.styleSheets]) {
          try {
            console.warn(`Removing node: ${style.ownerNode}`);
            style.ownerNode.remove();
            removed = true;
          } catch (err) {
            console.warn(`Cannot remove CSS node: ${err}`);
          }
        }
      }
      const cssHack = document.createElement('style');
      cssHack.className = 'clean';
      // textContent keeps the CSS verbatim (see the custom CSS step)
      cssHack.textContent = css;
      document.head.appendChild(cssHack);
    }, finalCSS);

    // cleanup the recorded CSS resources
    for (const u of URLs) {
      const key = `GET:${u}`;
      if (snapshot.responses[key]) {
        console.log('Removing recorded CSS response:', key);
        delete snapshot.responses[key];
      }
    }
    // check if there are any CSS resources left
    for (const k of Object.keys(snapshot.responses)) {
      const res = snapshot.responses[k];
      if (
        res.headers &&
        res.headers['content-type'] &&
        res.headers['content-type'].startsWith('text/css')
      ) {
        console.log('CSS response not removed:', k);
      }
    }

    // post CSS snapshot
    snapshot.html = (await page.content()).trim();
  }

  return snapshot;
}
360 |
--------------------------------------------------------------------------------