├── img ├── amazon-kindle.png └── wikipedia-offline.png ├── .github └── workflows │ └── main.yml ├── test ├── restore.test.js ├── record.test.js └── quopri.test.js ├── package.json ├── LICENSE ├── cli ├── restore.js ├── stats.js └── record.js ├── .gitignore ├── src ├── util.js ├── quopri.js ├── restore.js └── record.js ├── block ├── update_bad_hosts.py └── blocklist.txt └── README.md /img/amazon-kindle.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zytedata/web-snap/HEAD/img/amazon-kindle.png -------------------------------------------------------------------------------- /img/wikipedia-offline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zytedata/web-snap/HEAD/img/wikipedia-offline.png -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | - push 5 | - pull_request 6 | 7 | jobs: 8 | test: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v5 12 | - uses: actions/setup-node@v4 13 | with: 14 | node-version: 22 15 | - run: npm install 16 | - run: | 17 | npx playwright install 18 | npm test 19 | -------------------------------------------------------------------------------- /test/restore.test.js: -------------------------------------------------------------------------------- 1 | import test from 'ava'; 2 | import { restorePage } from '../src/restore.js'; 3 | 4 | test('basic restore page', async (t) => { 5 | const RECORD = { 6 | url: 'http://example.com', 7 | base_url: 'http://example.com/', 8 | html: 'Example page', 9 | responses: {}, 10 | }; 11 | const { page, browser } = await restorePage({ RECORD, timeout: 1, wait: 1, headless: true }); 12 | 13 | const url = page.url(); 14 | t.is(RECORD.base_url, url); 15 | const base_url = await 
page.evaluate('document.baseURI'); 16 | t.is(RECORD.base_url, base_url); 17 | const html = (await page.content()).trim(); 18 | t.true(html.includes('Example page')); 19 | 20 | await browser.close(); 21 | }); 22 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "authors": [ 3 | "Cristi Constantin " 4 | ], 5 | "dependencies": { 6 | "@ghostery/adblocker-playwright": "2.11.3", 7 | "cross-fetch": "4.1.0", 8 | "html-minifier-terser": "7.2.0", 9 | "lightningcss": "1.30.1", 10 | "mri": "1.2.0", 11 | "playwright": "1.57.0", 12 | "pretty-bytes": "7.0.1", 13 | "purgecss": "7.0.2" 14 | }, 15 | "devDependencies": { 16 | "ava": "6.4.1", 17 | "express": "5.2.1" 18 | }, 19 | "main": "src", 20 | "type": "module", 21 | "name": "web-snap", 22 | "description": "Create perfect snapshots of web pages", 23 | "license": "MIT", 24 | "version": "0.1", 25 | "repository": { 26 | "type": "git", 27 | "url": "git://github.com/croqaz/web-snap.git" 28 | }, 29 | "bin": { 30 | "web-record": "cli/record.js", 31 | "web-restore": "cli/restore.js", 32 | "snap-stats": "cli/stats.js" 33 | }, 34 | "scripts": { 35 | "test": "npx ava --verbose test/*.test.js" 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Cristi Constantin 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | 
// Fixed local port used by the throw-away HTTP server in this test file.
const PORT = 12345;

/**
 * Build an Express app wrapped in a plain Node HTTP server.
 * ETag generation is switched off via `app.set('etag', false)`.
 *
 * @returns {{ app: object, server: object }} the Express app and the (not yet listening) HTTP server
 */
function createTestServer() {
  const app = express();
  const server = http.createServer(app);
  app.set('etag', false);
  return { app, server };
}

test('basic record page', async (t) => {
  const { app, server } = createTestServer();
  app.get('/', function (_, res) {
    res.send('Hello world');
  });
  await new Promise((resolve) => server.listen(PORT, resolve));

  let browser;
  try {
    const recorded = await recordPage({
      url: `http://localhost:${PORT}`,
      timeout: 3,
      imgTimeout: 3,
      headless: true,
    });
    browser = recorded.browser;
    const { snapshot } = recorded;

    t.is(snapshot.url, `http://localhost:${PORT}`);
    t.is(snapshot.base_url, `http://localhost:${PORT}/`);
    t.true(snapshot.html.includes('Hello world'));
    t.deepEqual(snapshot.responses, {});
  } finally {
    // Always release the browser and the listening port, even when an assertion
    // or recordPage() itself throws; otherwise the port stays bound.
    if (browser) await browser.close();
    server.close();
  }
});
// Command-line schema for `web-restore`, consumed by mri in the entry point below.
const options = {
  boolean: ['help', 'version'],
  alias: {
    i: 'input',
    v: 'version',
    rm: 'removeElems',
    // c: 'config',
  },
  default: {
    headless: null, // visible browser window
    js: 'yes', // JS execution on restore
    offline: 'yes', // force browser offline
    timeout: 15, // navigation timeout
    wait: 120, // keep the browser open (seconds)
    overwrite: null, // overwrite body HTML with HTML from snapshot
    removeElems: '', // remove page elements
  },
};

/*
 * CLI entry point: parse the arguments, restore the snapshot in a browser,
 * keep the window open for `args.wait`, then close the browser.
 */
(async () => {
  const args = mri(process.argv.slice(2), options);

  if (args.version) {
    console.log('Web-Snap v' + pkg.version);
    return;
  }

  // restorePage() logs the reason and resolves to undefined when the snapshot
  // is missing or invalid; destructuring that result would throw a TypeError.
  const restored = await restorePage(args);
  if (!restored) {
    process.exitCode = 1;
    return;
  }
  const { page, browser } = restored;

  // Quit as soon as the user closes the page or the browser goes away.
  page.on('close', () => process.exit());
  browser.on('disconnected', () => process.exit());

  // restorePage() has already converted args.wait from seconds to milliseconds.
  await delay(args.wait);
  await browser.close();
})();
14 | 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxXYZ ', 15 | ); 16 | 17 | t.is(encode(' = '), ' =3D=20'); 18 | t.is(encode('foo\t'), 'foo=09'); 19 | t.is( 20 | encode('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxXYZ='), 21 | 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxXYZ=3D', 22 | ); 23 | t.is( 24 | encode('xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxXYZ '), 25 | 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxXYZ=20', 26 | ); 27 | 28 | t.is(decode(encode('a\nb\nc\n')), 'a\nb\nc\n'); 29 | }); 30 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Logs 2 | logs 3 | *.log 4 | npm-debug.log* 5 | yarn-debug.log* 6 | yarn-error.log* 7 | lerna-debug.log* 8 | 9 | # Diagnostic reports (https://nodejs.org/api/report.html) 10 | report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json 11 | 12 | # Runtime data 13 | pids 14 | *.pid 15 | *.seed 16 | *.pid.lock 17 | 18 | # Directory for instrumented libs generated by jscoverage/JSCover 19 | lib-cov 20 | 21 | # Coverage directory used by tools like istanbul 22 | coverage 23 | *.lcov 24 | 25 | # nyc test coverage 26 | .nyc_output 27 | 28 | # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) 29 | .grunt 30 | 31 | # Bower dependency directory (https://bower.io/) 32 | bower_components 33 | 34 | # node-waf configuration 35 | .lock-wscript 36 | 37 | # Compiled binary addons (https://nodejs.org/api/addons.html) 38 | build/Release 39 | 40 | # Dependency directories 41 | node_modules/ 42 | jspm_packages/ 43 | 44 | # TypeScript v1 declaration files 45 | typings/ 46 | 47 | # TypeScript cache 48 | *.tsbuildinfo 49 | 50 | # Optional npm cache directory 51 | .npm 52 | 53 | # Optional eslint cache 54 | .eslintcache 55 | 56 | # Microbundle cache 57 | 
/**
 * Resolve after `time` milliseconds.
 *
 * @param {number} time - delay passed straight to setTimeout
 * @returns {Promise<void>}
 */
export function delay(time) {
  return new Promise((resolve) => setTimeout(resolve, time));
}

/**
 * Build the lookup key for a request: `<METHOD>:<URL>`.
 *
 * @param {{ method: Function, url: Function }} r - object exposing `method()` and `url()`
 * @returns {string}
 */
export function requestKey(r) {
  return `${r.method()}:${r.url()}`;
}

/**
 * Normalize a URL for comparison: trailing slashes are stripped from the raw
 * string, the value is re-serialized through `URL`, and the fragment is dropped.
 *
 * @param {string} url
 * @returns {string} the normalized URL, or '' for an empty/missing input
 * @throws {TypeError} when the value is not a parsable absolute URL
 */
export function normalizeURL(url) {
  if (!url) return '';
  const u = new URL(url.replace(/\/+$/, ''));
  u.hash = '';
  return u.toString();
}

/**
 * Tell whether `str` names one of the known browser engines.
 *
 * @param {string} str
 * @returns {boolean}
 */
export function checkBrowser(str) {
  return ['chromium', 'firefox', 'webkit'].includes(str);
}

/**
 * Convert a CLI-style flag value to a real boolean.
 * The strings "false", "off", "no" and "0" (any letter case) are false;
 * every other non-empty string is true. Non-string values are coerced by
 * truthiness, so the result is always a boolean (a number 1 used to be
 * returned unchanged).
 *
 * @param {*} str
 * @returns {boolean}
 */
export function toBool(str) {
  if (!str) return false;
  if (typeof str !== 'string') return Boolean(str);
  const lowered = str.toLowerCase();
  return !(lowered === 'false' || lowered === 'off' || lowered === 'no' || lowered === '0');
}

/**
 * Split a list given as one string into its items. Items may be separated by
 * any run of commas, semicolons or spaces; empty items are dropped.
 * Arrays are returned untouched. Any other non-empty scalar (for example a
 * number produced by the argument parser) is stringified first, so the result
 * is always iterable.
 *
 * @param {*} str
 * @returns {Array}
 */
export function smartSplit(str) {
  if (!str) return [];
  if (Array.isArray(str)) return str;
  return String(str)
    .split(/[,; ]+/)
    .filter((item) => item.trim());
}

/**
 * Read a snapshot file and parse it as JSON. Files whose name ends in `.gz`
 * are gunzipped before parsing.
 *
 * @param {string} fname - path of the snapshot file
 * @returns {Promise<object>} the parsed snapshot
 */
export async function parseSnapshot(fname) {
  let record = await fs.promises.readFile(fname);
  if (fname.endsWith('.gz')) {
    record = await promisify(gunzip)(record);
  }
  return JSON.parse(record);
}

/**
 * Serialize a response body for storage in the snapshot.
 * Textual resources (by resource type or by content type) are stored as
 * `QUOPRI:<quoted-printable>`, everything else as `BASE64:<base64>`.
 *
 * @param {string} resourceType - e.g. 'document', 'stylesheet', 'script', 'manifest'
 * @param {string} contentType - value of the Content-Type header, may be empty
 * @param {Buffer} buffer - raw body
 * @returns {string} the encoded body, or '' when there is no body
 */
export function encodeBody(resourceType, contentType, buffer) {
  if (!buffer || buffer.length === 0) return '';
  if (
    resourceType === 'document' ||
    resourceType === 'stylesheet' ||
    resourceType === 'script' ||
    resourceType === 'manifest'
  ) {
    return `QUOPRI:${encode(buffer)}`;
  }
  if (
    contentType &&
    (contentType.startsWith('text/') ||
      contentType.startsWith('image/svg+xml') ||
      contentType.startsWith('application/json'))
  ) {
    return `QUOPRI:${encode(buffer)}`;
  }
  return `BASE64:${buffer.toString('base64')}`;
}

/**
 * Reverse of encodeBody().
 * `QUOPRI:` bodies come back as a string, `BASE64:` bodies (and bodies
 * without a prefix, which are treated as base64) come back as a Buffer.
 *
 * @param {string} body
 * @returns {string|Buffer} the decoded body, or '' when there is no body
 */
export function decodeBody(body) {
  if (!body || body.length === 0) return '';
  if (body.startsWith('QUOPRI:')) return decode(body.slice(7));
  if (body.startsWith('BASE64:')) return Buffer.from(body.slice(7), 'base64');
  return Buffer.from(body, 'base64');
}
/**
 * Render a horizontal text bar whose length is `value` as a percentage of
 * `maxValue`: one full block per whole percent plus one partial block for
 * the remainder.
 *
 * @param {number} value
 * @param {number} maxValue
 * @returns {string} the bar, or '' when either number is not positive
 */
function bar(value, maxValue) {
  // https://github.com/morishin/ascii-horizontal-barchart
  const fractions = ['▏', '▎', '▍', '▋', '▊', '▉'];
  // A zero maximum would make the length Infinity and String.repeat() throw.
  if (!(value > 0) || !(maxValue > 0)) return '';
  const barLength = (value * 100) / maxValue;
  const wholeNumberPart = Math.floor(barLength);
  const fractionalPart = barLength - wholeNumberPart;
  let txt = fractions[fractions.length - 1].repeat(wholeNumberPart);
  if (fractionalPart > 0) txt += fractions[Math.floor(fractionalPart * fractions.length)];
  return txt;
}

/*
 * CLI entry point: print size statistics for one snapshot file.
 */
(async () => {
  const args = mri(process.argv.slice(2), options);

  // Positional argument first, then --input / -i. The previous
  // `cond ? a : null || b` form never reached the --input fallback,
  // because the parser always provides the `_` array.
  const fname = (args._ && args._[0]) || args.input;
  if (!fname) {
    console.error('Usage: snap-stats [-i] <snapshot file>');
    process.exitCode = 1;
    return;
  }
  const snap = await parseSnapshot(fname);
  const responses = snap.responses || {};

  const typeCounts = {};
  // Math.max() of an empty list is -Infinity; clamp to 0 for snapshots without resources.
  const maxValue = Math.max(0, ...Object.values(responses).map((v) => (v.body ? v.body.length : 0)));
  const data = Object.entries(responses)
    .map(([k, v]) => {
      const t = v.headers && v.headers['content-type'] ? v.headers['content-type'].split('/')[0] : 'other';
      typeCounts[t] = (typeCounts[t] || 0) + 1;
      return [k, v.body ? v.body.length : 0];
    })
    // keep only resources that are at least 5% of the biggest one and over 100 chars
    .filter(([_, v]) => v >= maxValue / 20 && v > 100);
  const totSize = data.reduce((sum, curr) => sum + curr[1], 0);

  console.log(`\nHTML body size: ${prettyBytes(snap.html.length, { minimumFractionDigits: 2 })}`);
  console.log(`Resources size: ${prettyBytes(totSize, { minimumFractionDigits: 2 })}`);
  console.log(`There are ${Object.keys(responses).length} resources in total`);

  data.push(['GET:HTML body', snap.html.length]);

  data.sort((a, b) => b[1] - a[1]);
  console.log('\nTop resources by size::');
  for (const [txt, nr] of data.slice(0, 10)) {
    const barText = bar(nr, maxValue);
    const suffix = ' ' + prettyBytes(nr, { minimumFractionDigits: 2 });
    // drop the "METHOD:" prefix, the scheme and a leading "www."-like label
    let http = txt.split(':').slice(1).join(':');
    http = http.replace(/^https?:\/\/(w+?\.)?/, '');
    if (http.length > 165) http = http.slice(0, 160) + ' ... ' + http.slice(-5);
    console.log(http);
    console.log(barText + suffix);
  }

  console.log('\nResources by type::');
  const byType = Object.entries(typeCounts).sort((a, b) => b[1] - a[1]);
  // Indexing byType[0] on an empty list used to throw for snapshots without resources.
  if (byType.length > 0) {
    const maxCount = byType[0][1];
    for (const [txt, nr] of byType) {
      const barText = bar(nr, maxCount);
      const suffix = ' ' + nr;
      console.log(txt + '\n' + barText + suffix);
    }
  }
})();
/**
 * Quoted-printable encode a string or a Buffer.
 * Strings are converted to their UTF-8 bytes first. Bytes that are allowed
 * as literals are copied through; every other byte, and any space/tab that
 * sits right before a line break or at the very end, becomes `=XX`
 * (uppercase hex).
 */
export function encode(buffer) {
  // Reference: https://npmjs.com/package/libqp by Andris Reinman | MIT license
  const bytes = typeof buffer === 'string' ? Buffer.from(buffer, 'utf-8') : buffer;

  // byte values that may be emitted unchanged: single values or inclusive [from, to] pairs
  // https://tools.ietf.org/html/rfc2045#section-6.7
  const literalRanges = [
    [0x09], // <TAB>
    [0x0a], // <LF>
    [0x0d], // <CR>
    [0x20, 0x3c], // <SP>!"#$%&'()*+,-./0123456789:;<
    [0x3e, 0x7e], // >?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
  ];
  const isBlank = (b) => b === 0x20 || b === 0x09;
  const isLineBreak = (b) => b === 0x0a || b === 0x0d;

  const total = bytes.length;
  const pieces = [];
  for (let pos = 0; pos < total; pos += 1) {
    const byte = bytes[pos];
    const endsLine = pos === total - 1 || isLineBreak(bytes[pos + 1]);
    // trailing whitespace of a line must be escaped even though it is in an allowed range
    if (checkRanges(byte, literalRanges) && !(isBlank(byte) && endsLine)) {
      pieces.push(String.fromCharCode(byte));
    } else {
      const prefix = byte < 0x10 ? '=0' : '=';
      pieces.push(prefix + byte.toString(16).toUpperCase());
    }
  }

  return pieces.join('');
}

/**
 * Helper: tell whether `nr` falls inside any of the given ranges, where a
 * range is either `[value]` or an inclusive `[from, to]` pair.
 */
function checkRanges(nr, ranges) {
  return ranges.some((range) => {
    if (range.length === 1) return nr === range[0];
    if (range.length === 2) return nr >= range[0] && nr <= range[1];
    return false;
  });
}
def upd_disconnect():
    """Download the Disconnect simple-ad list and return ``(name, hosts)``.

    The first 3 lines of the file are skipped, as are blank lines and
    lines starting with ``#``.
    """
    name = 'DISCONNECT'
    URL = 'https://s3.amazonaws.com/lists.disconnect.me/simple_ad.txt'
    # A timeout keeps the script from hanging forever on a stalled connection.
    r = requests.get(URL, timeout=60)
    print(r, URL)
    # Fail loudly: an error page parsed as a host list would silently corrupt the result.
    r.raise_for_status()
    hosts = set()
    for line in r.text.split('\n')[3:]:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        hosts.add(line)
    print(f'{name} found hosts: {len(hosts)}')
    return name, hosts


def upd_w3kbl():
    """Download the W3KBL list and return ``(name, hosts)``.

    The first 6 lines of the file are skipped, as are blank lines and
    lines starting with ``#``; only the text before the first space of
    each remaining line is kept.
    """
    name = 'W3KBL'
    URL = 'https://v.firebog.net/hosts/static/w3kbl.txt'
    r = requests.get(URL, timeout=60)
    print(r, URL)
    r.raise_for_status()
    hosts = set()
    for line in r.text.split('\n')[6:]:
        line = line.strip()
        if not line or line[0] == '#':
            continue
        hosts.add(line.split(" ")[0])
    print(f'{name} found hosts: {len(hosts)}')
    return name, hosts


def save_result():
    """Merge the downloaded lists with the custom rules and write the block list.

    The output path is ``sys.argv[1]`` when given, otherwise
    ``blocklist.txt``. A downloaded host is kept only when it appears in at
    least one of the list pairs combined below; custom rules are always kept.
    Entries shorter than 5 characters are not written.
    """
    OUTPUT = sys.argv[1] if len(sys.argv) > 1 else 'blocklist.txt'

    # Custom list of block rules
    CUSTOM = set([
        # google
        'google-analytics.com',
        'google.com/adsense/search',
        'google.com/recaptcha',
        'googleads.g.doubleclick.net/pagead',
        'googleoptimize.com',
        'gstatic.com/recaptcha/releases',
        # amazon
        'fls-na.amazon.com',
        'cloudfront-labs.amazonaws.com',
        'unagi.amazon.com/\\d/events',
        # other
        'match.adsrvr.org/track',
        # cookie popups
        'cdn.cookielaw.org',
    ])
    # popular lists
    _, easy = upd_easylist()
    _, adaway = upd_adaway()
    _, disco = upd_disconnect()
    _, w3kbl = upd_w3kbl()

    hosts = CUSTOM | (adaway & w3kbl) | (adaway & easy) | (easy & w3kbl) | (easy & disco) | (w3kbl & disco)
    # Filter first so the reported count matches what is actually written.
    kept = sorted(x for x in hosts if len(x) >= 5)
    with open(OUTPUT, 'w') as fd:
        fd.write('# Generated from update_bad_hosts.py\n')
        for x in kept:
            fd.write(f'{x}\n')
    print(f'Written {len(kept)} hosts in {OUTPUT}')
__name__ == '__main__': 119 | save_result() 120 | -------------------------------------------------------------------------------- /cli/record.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import fs from 'fs'; 3 | import { gzip } from 'zlib'; 4 | import { promisify } from 'util'; 5 | import { minify } from 'html-minifier-terser'; 6 | import prettyBytes from 'pretty-bytes'; 7 | import mri from 'mri'; 8 | 9 | import pkg from '../package.json' with { type: 'json' }; 10 | import { recordPage } from '../src/record.js'; 11 | import { delay } from '../src/util.js'; 12 | 13 | const options = { 14 | boolean: ['help', 'version'], 15 | alias: { 16 | i: 'input', 17 | o: 'output', 18 | v: 'version', 19 | z: 'gzip', 20 | css: 'addCSS', 21 | rm: 'removeElems', 22 | drop: 'dropRequests', 23 | }, 24 | default: { 25 | // browser: 'chromium', // only Chromium supported for now 26 | gzip: null, // compress final JSON 27 | headless: null, // visible browser window 28 | blockAds: null, // enable AdBlocker? 29 | blockList: null, // block domains from custom list 30 | extraMeta: null, // extract meta from HTML? 31 | iframes: null, // capture iframes? 
32 | js: 'on', // disable JS execution and capturing 33 | minify: null, // min final HTML before save 34 | minCSS: null, // min final CSS before save 35 | purgeCSS: null, // purge unused CSS and generate 1 single CSS file 36 | timeout: 15, // navigation timeout 37 | imgTimeout: 15, 38 | wait: 5, // wait for user interaction (seconds) 39 | // headers: 'content-type, date', // Content-Type header is pretty important 40 | headers: 'content-type, content-length, content-range, date, content-language, last-modified', // extended version 41 | userAgent: '', // custom user agent 42 | // userAgent: Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36 43 | dropRequests: '', // drop matching requests 44 | dropStatus: '', // drop matching statuses 45 | removeElems: '', // remove page elements 46 | addCSS: '', // add extra CSS 47 | console: null, // print browser's console msgs 48 | }, 49 | }; 50 | 51 | (async () => { 52 | const args = mri(process.argv.slice(2), options); 53 | 54 | if (args.version) { 55 | console.log('Web-Snap v' + pkg.version); 56 | return; 57 | } 58 | 59 | const { snapshot, page, context, browser } = await recordPage(args); 60 | 61 | page.on('close', async () => { 62 | if (args.minify) { 63 | const s1 = snapshot.html.length; 64 | try { 65 | snapshot.html = await minify(snapshot.html, { 66 | caseSensitive: true, 67 | collapseBooleanAttributes: true, 68 | collapseWhitespace: true, 69 | conservativeCollapse: true, 70 | continueOnParseError: true, 71 | quoteCharacter: "'", 72 | removeAttributeQuotes: true, 73 | removeStyleLinkTypeAttributes: true, 74 | sortAttributes: true, 75 | sortClassName: true, 76 | }); 77 | const s2 = snapshot.html.length; 78 | const s3 = prettyBytes(s2, { maximumFractionDigits: 2 }); 79 | console.log( 80 | `Body HTML minify efficiency ${((s2 / s1) * 100).toFixed(2)}% from ` + 81 | `${Intl.NumberFormat('en').format(s1)} to ${Intl.NumberFormat('en').format(s2)} ` + 82 | `(${s3})`, 83 | ); 84 | 
/**
 * Normalize the raw CLI/test arguments in place:
 * flags become booleans, `timeout` and `wait` go from seconds to
 * milliseconds, `removeElems` becomes the `REMOVE` selector list, and the
 * snapshot named by the first positional argument (or `input`) is loaded
 * into `args.RECORD`. A `RECORD` supplied by the caller is left alone when
 * no file is named.
 */
async function processArgs(args) {
  args.js = toBool(args.js);
  args.headless = toBool(args.headless); // debug & tests
  args.offline = toBool(args.offline);
  args.timeout = Number.parseInt(args.timeout, 10) * 1000;
  args.wait = Number.parseInt(args.wait, 10) * 1000;
  args.REMOVE = smartSplit(args.removeElems);

  // Positional argument first, then --input. The previous
  // `cond ? a : null || b` form skipped --input whenever `args._` existed,
  // even as an empty array.
  const snap = (args._ && args._[0]) || args.input;
  if (snap) {
    args.RECORD = await parseSnapshot(snap);
  }
}

/**
 * Open a Chromium page and replay a recorded snapshot into it.
 * Every request is intercepted: the index URL is answered with the snapshot
 * HTML, recorded resources are answered from the snapshot, and anything else
 * is passed on to the network.
 *
 * @param {object} args - CLI-style options (see processArgs); mutated in place
 * @returns {Promise<{page: object, context: object, browser: object}|undefined>}
 *   the live Playwright handles, or undefined (after logging the reason) when
 *   the snapshot is missing or invalid
 */
export async function restorePage(args) {
  await processArgs(args);
  const record = args.RECORD;

  if (!record) {
    console.error('Empty snapshot file! Cannot launch!');
    return;
  }
  if (!((record.url || record.base_url) && record.html && record.responses)) {
    console.error('Invalid snapshot file! Cannot launch!');
    return;
  }

  // Named targetURL so the global URL constructor is not shadowed.
  const targetURL = normalizeURL(record.base_url || record.url);
  console.log('Restoring URL:', targetURL);

  const browser = await chromium.launch({
    headless: args.headless,
    args: [
      '--allow-running-insecure-content',
      '--disable-background-networking',
      '--disable-breakpad',
      '--disable-crash-reporter',
      '--disable-default-apps',
      '--disable-demo-mode',
      '--disable-extensions',
      '--disable-features=IsolateOrigins',
      '--disable-site-isolation-trials',
      '--disable-speech-api',
      '--disable-sync',
      '--disable-web-security',
    ],
  });

  const context = await browser.newContext({
    bypassCSP: true,
    acceptInsecureCerts: true,
    ignoreHTTPSErrors: true,
    javaScriptEnabled: args.js,
    offline: args.offline,
    // serviceWorkers: 'block',
    viewport: null,
  });

  const page = await context.newPage();

  page.on('console', async (msg) => {
    if (msg.text().startsWith('Failed to load resource')) return;
    console.log(`CONSOLE ${msg.type()}: ${msg.text()}`);
  });

  page.setDefaultTimeout(args.timeout);
  await context.route('**', async (route) => {
    const r = route.request();
    const u = normalizeURL(r.url());

    if (u === targetURL) {
      console.log(`Restored INDEX from CACHE: ${u}`);
      await route.fulfill({
        contentType: 'text/html; charset=utf-8',
        body: record.html,
      });
      return;
    }

    const key = requestKey(r);
    const cached = record.responses[key];
    if (cached && cached.status) {
      // ignore all javascript requests on restore, when JS disabled
      const contentType = cached.headers?.['content-type'];
      if (
        !args.js &&
        contentType &&
        (contentType.startsWith('text/javascript') ||
          contentType.startsWith('application/javascript') ||
          contentType.startsWith('application/x-javascript'))
      ) {
        // HTTP 204 = NO CONTENT
        await route.fulfill({ status: 204 });
        return;
      }
      console.log(`Restored from CACHE: ${key}`);
      // A quoted-printable body decodes to a string holding one character per
      // original byte. Handing that string over as-is would have it encoded
      // again as UTF-8, so turn it back into the original bytes first.
      const decoded = decodeBody(cached.body);
      const body = typeof decoded === 'string' ? Buffer.from(decoded, 'latin1') : decoded;
      await route.fulfill({
        contentType: contentType || '',
        body,
        // the status recorded for THIS resource; the snapshot root has no status field
        status: cached.status,
        headers: cached.headers, // Some headers may be useful here
      });
      return;
    }

    // else
    console.log(`MISSING resource: ${key}`);
    await route.continue(); // or abort ??
  });

  // navigate to the resolved URL instead of the user provided one
  try {
    await page.goto(targetURL, { waitUntil: 'networkidle' });
  } catch (err) {
    console.error('Page timeout:', err);
  }

  // overwrite page content with the one from the snapshot, to fix potential JS issues
  if (args.overwrite && args.js) {
    console.log('REWRITE page content from snapshot..');
    // awaited so the element removal below runs against the rewritten document
    await page.setContent(record.html);
  }

  for (const selector of args.REMOVE) {
    console.log('REMOVE element selector:', selector);
    await page.evaluate((s) => {
      for (const el of document.querySelectorAll(s)) {
        el.parentNode.removeChild(el);
      }
    }, selector);
  }

  return { page, context, browser };
}
4 | 5 | 6 | ## Install 7 | 8 | ``` shell 9 | $ npm install git+https://github.com/croqaz/web-snap.git 10 | ``` 11 | 12 | ## Usage 13 | 14 | ``` shell 15 | $ web-record https://en.wikipedia.org/wiki/Online_and_offline 16 | ``` 17 | 18 | This will open a Chrome-like browser, show you the page and create an output file called by default: "snapshot_en.wikipedia.org.json" 19 | To restore this snapshot file, you can use: 20 | 21 | ``` shell 22 | $ web-restore snapshot_en.wikipedia.org.json 23 | ``` 24 | 25 | This will open a Chrome-like browser, show the page and you can read it even if you're offline. 26 | 27 | You can also save and restore more complicated pages, like Amazon products: 28 | 29 | ``` shell 30 | $ web-record https://www.amazon.com/dp/B07978J597/ 31 | $ web-restore snapshot_amazon.com.json 32 | ``` 33 | 34 | Note that some pages should be scrolled a little bit and hover some elements, to make sure all the page and images are loaded before the snapshot is taken. 35 | This is not a limitation of web-snap, it's how modern browsers and pages are intentionally built to load resources lazily, on demand. 36 | 37 | For a complete example, with all the flags: 38 | 39 | ``` shell 40 | $ web-record https://en.wikipedia.org/wiki/Online_and_offline --gzip \ 41 | --rm 'script, #mw-navigation, #mw-page-base, #mw-head-base, #footer-icons' \ 42 | --css '#content{margin-left:0 !important}' --drop '.png$, .css$' --wait 10 \ 43 | --js off --minify --purgeCSS 44 | ``` 45 | 46 | ![Restored Wikipedia page](img/wikipedia-offline.png) 47 | 48 | This will store the page just like before, but it will do a lot of pre-processing, to reduce the snapshot size from *1.3MB*, to only *27K* (48x smaller), without losing any useful information. 49 | 50 | The `--gzip` flag will archive the JSON using GZIP. It is totally safe to use.
51 | The `--rm` flag, or `--removeElems`, will remove the specified page elements, using selectors. This can be used to remove useless elements so you can focus on the important content and reduce the snapshot size.
52 | The `--css` flag, or `--addCSS`, will add custom CSS on the page, before creating the snapshot. This can be used to change the font size, or move some elements to make the page look nicer.
53 | The `--drop` flag, or `--dropRequests`, will drop all HTTP requests whose URL matches any of the given regular expressions. This can be used to block tracking requests and reduce the final snapshot size.
54 | The `--wait` flag sets how long (in seconds) the browser page will stay open, to allow the user to interact with the page, eg: accept cookies, close popups, scroll a little, hover some images.
55 | The `--js off` flag will stop the browser from executing Javascript and will drop all Javascript requests, which usually reduces the snapshot size by A LOT. NOTE that this option will completely break many pages.
56 | The `--minify` flag will try to compress the final HTML as much as possible, to reduce the snapshot size. NOTE that this can crash for some pages with lots of Javascript.
57 | The `--purgeCSS` flag will purge all unused CSS and replace all styles with this processed CSS. This can reduce the snapshot size by A LOT, but will completely break some pages. 58 | 59 | And a last example, how to capture an Amazon page: 60 | 61 | ``` shell 62 | web-record https://www.amazon.com/dp/B086CV781H --gzip \ 63 | --rm 'script #nav-main #mars-fs-wrapper #rhf #navFooter #navBackToTop' \ 64 | --blockAds yes --blockList block/blocklist.txt --drop '//unagi.amazon.com/1' \ 65 | --js off --minify --wait 10 66 | ``` 67 | 68 | ![Restored Amazon page](img/amazon-kindle.png) 69 | 70 | These options will reduce the Amazon snapshot from ~*21MB*, to *857K* (24x smaller), without losing any useful information. 71 | 72 | If you care about the snapshot size, you need to try different options depending on the domain, to see what works, because some options will break the page on restore. 73 | 74 | 75 | ## File format 76 | 77 | The `snapshot.json` file format is simple: 78 | 79 | - url - is the URL specified when creating the snapshot 80 | - base_url - this is the resolved URL, after redirects (eg: may redirect to HTTPS and www.) 81 | - canonical_url - (optional) this is the canonical URL of the page 82 | - title - (optional) this is the title of the page 83 | - html - is the final, settled HTML of the page 84 | - responses - contains all the resources of the page (CSS, JS, images, etc) as key-value pairs: 85 | - body - the resource body saved as Quopri or Base64 86 | - headers - a limited subset of the response headers 87 | - request_url - the initial resource URL 88 | - response_url - (optional) the final response URL, after redirects (if it's different than the request URL) 89 | - status - a number representing the HTTP status 90 | 91 | The format is subject to change, ideally to simplify it. 92 | 93 | 94 | ## Limitations 95 | 96 | This format doesn't usually capture the audio and video of the page.
97 | This means you can't completely capture Youtube, Vimeo, or Spotify pages. (YET? or never?)
98 | This limitation may change in the future, but it's not the primary goal of the project. 99 | 100 | There are also issues with some iframes and shadow DOM nodes. 101 | 102 | Read my article that compares WARC, rrWeb and "recorded": 103 | https://crlf.link/log/entries/220803-web-snap/ 104 | 105 | 106 | ## Similar 107 | 108 | - https://github.com/Y2Z/monolith 109 | - https://github.com/go-shiori/obelisk 110 | - https://github.com/danburzo/percollate 111 | - https://github.com/croqaz/clean-mark 112 | - https://github.com/gildas-lormeau/SingleFile 113 | - https://github.com/sindresorhus/capture-website 114 | 115 | Also check: 116 | 117 | - https://crlf.link/mem/offline 118 | - https://crlf.link/mem/web-archiving 119 | -------------------------------------------------------------------------------- /block/blocklist.txt: -------------------------------------------------------------------------------- 1 | # Generated from update_bad_hosts.py 2 | 360yield.com 3 | 3gl.net 4 | 3p-geo.yahoo.com 5 | 3p-udc.yahoo.com 6 | 600z.com 7 | a-ads.com 8 | a-reporting.nytimes.com 9 | a.mobify.com 10 | a.ucoz.net 11 | a1.api.bbc.co.uk 12 | aamt.nbcnews.com 13 | aamt.today.com 14 | acdn.adnxs.com 15 | activemetering.com 16 | ad-balancer.net 17 | ad-cdn.technoratimedia.com 18 | ad-delivery.net 19 | ad-serverparc.nl 20 | ad-stir.com 21 | ad.daum.net 22 | ad.doubleclick.net 23 | ad.wsod.com 24 | adblockanalytics.com 25 | adbrite.com 26 | adbro.me 27 | adbutler.com 28 | adc-ad-assets.adtilt.com 29 | adcash.com 30 | adcloud.net 31 | ade.googlesyndication.com 32 | adform.net 33 | adgardener.com 34 | adlc-exchange.toast.com 35 | adlog.com.com 36 | admedo.com 37 | admitad.com 38 | admixer.net 39 | admob.com 40 | adnetworkperformance.com 41 | adnxs.com 42 | adocean.pl 43 | adotube.com 44 | adpacks.com 45 | adperium.com 46 | adrta.com 47 | ads-twitter.com 48 | ads.betfair.com 49 | ads.linkedin.com 50 | ads.samsung.com 51 | ads.saymedia.com 52 | ads.servebom.com 53 | ads1-adnow.com 54 | 
ads3-adnow.com 55 | ads5-adnow.com 56 | adsafeprotected.com 57 | adsame.com 58 | adscale.de 59 | adsdk.com 60 | adserver-2084671375.us-east-1.elb.amazonaws.com 61 | adserverplus.com 62 | adsfac.eu 63 | adsfac.net 64 | adsfac.us 65 | adskape.ru 66 | adsnative.com 67 | adsonar.com 68 | adspeed.net 69 | adspirit.de 70 | adsupply.com 71 | adtng.com 72 | adtoma.com 73 | adtrace.org 74 | adtrue.com 75 | advertica-cdn.com 76 | adviva.net 77 | adx-exchange.toast.com 78 | adxpansion.com 79 | adzmedia.com 80 | affiliate.dtiserv.com 81 | affiliatefuel.com 82 | affiliatefuture.com 83 | affiliates.thrixxx.com 84 | affiliatewindow.com 85 | affiz.net 86 | agkn.com 87 | aimatch.com 88 | alexandria.marfeelcdn.com 89 | alphonso.tv 90 | als-svc.nytimes.com 91 | altitude-arena.com 92 | am15.net 93 | amazon-adsystem.com 94 | amp-error-reporting.appspot.com 95 | amplifypixel.outbrain.com 96 | analytics-prod2.glance-internal.inmobi.com 97 | analytics-production.hapyak.com 98 | analytics-static.ugc.bazaarvoice.com 99 | analytics-tracker.thescore.com 100 | analytics.163.com 101 | analytics.analytics-egain.com 102 | analytics.carambo.la 103 | analytics.chase.com 104 | analytics.edgekey.net 105 | analytics.ff.avast.com 106 | analytics.foresee.com 107 | analytics.getshogun.com 108 | analytics.glance.inmobi.com 109 | analytics.kaltura.com 110 | analytics.kidoz.net 111 | analytics.kongregate.io 112 | analytics.logsss.com 113 | analytics.mailmunch.co 114 | analytics.nike.com 115 | analytics.plex.tv 116 | analytics.reyrey.net 117 | analytics.shareaholic.com 118 | analytics.tiktok.com 119 | analytics.tout.com 120 | analytics.vendemore.com 121 | analytics.wildtangent.com 122 | analytics.yahoo.com 123 | analytics.yolacdn.net 124 | analytics.yomedia.vn 125 | analytics.ziftsolutions.com 126 | andomedia.com 127 | annoyingacoustics.com 128 | api-js.mixpanel.com 129 | api.amplitude.com 130 | api.branch.io 131 | app.adjust.com 132 | appads.com 133 | ariane.abtasty.com 134 | as5000.com 135 | 
assets.micpn.com 136 | aswpsdkus.com 137 | atdmt.com 138 | atwola.com 139 | audit.median.hu 140 | axonix.com 141 | b.fox.com 142 | banners.adultfriendfinder.com 143 | banners.amigos.com 144 | banners.cams.com 145 | banners.passion.com 146 | banners.videosecrets.com 147 | bannershotlink.perfectgonzo.com 148 | bans.bride.ru 149 | bat.bing.com 150 | batch.upsight-api.com 151 | bats.video.yahoo.com 152 | beacon.flow.io 153 | beacon.qq.com 154 | beacon.riskified.com 155 | beacon.shazam.com 156 | beacon.sina.com.cn 157 | beacon.sojern.com 158 | beacons.mediamelon.com 159 | beap.gemini.yahoo.com 160 | bidder.criteo.com 161 | bids.concert.io 162 | bidswitch.net 163 | blogherads.com 164 | bluekai.com 165 | boomads.com 166 | bootstrap.upsight-api.com 167 | brainient.com 168 | brandreachsys.com 169 | bridgetrack.com 170 | bs.yandex.ru 171 | btloader.com 172 | bttrack.com 173 | c.bigmir.net 174 | c.mgid.com 175 | casalemedia.com 176 | cash4members.com 177 | cashlayer.com 178 | cc.swiftype.com 179 | ccgateway.net 180 | cdn-channels-pixel.ex.co 181 | cdn.cookielaw.org 182 | cdn.districtm.io 183 | cdn.usefathom.com 184 | cdn7.rocks 185 | cdnwidget.com 186 | cgicounter.puretec.de 187 | chanalytics.merchantadvantage.com 188 | checkm8.com 189 | chitika.net 190 | ck.connatix.com 191 | clickbooth.com 192 | clickboothlnk.com 193 | clickthruserver.com 194 | clickxchange.com 195 | client-analytics.braintreegateway.com 196 | clkrev.com 197 | cloudfront-labs.amazonaws.com 198 | cnt.my 199 | collect.banggood.com 200 | collect.igodigital.com 201 | collector.cint.com 202 | collector.xhamster.com 203 | colossusssp.com 204 | confiant-integrations.global.ssl.fastly.net 205 | connextra.com 206 | content.tapjoy.com 207 | contentabc.com 208 | contextweb.com 209 | count-server.sharethis.com 210 | count.rin.ru 211 | counter.bloke.com 212 | counter.cnw.cz 213 | counter.rambler.ru 214 | counter.snackly.co 215 | counter.yadro.ru 216 | cpays.com 217 | cpx.to 218 | cpxinteractive.com 219 | 
creative-serving.com 220 | creativecdn.com 221 | creatives.livejasmin.com 222 | crwdcntrl.net 223 | ct.pinterest.com 224 | cws.conviva.com 225 | cxad.cxense.com 226 | dapper.net 227 | dc.banggood.com 228 | dcinfos-cache.abtasty.com 229 | dd.nytimes.com 230 | dedicatedmedia.com 231 | demdex.net 232 | detect.rayjump.com 233 | direct-events-collector.spot.im 234 | directaclick.com 235 | directorym.com 236 | domdex.com 237 | doubleclick.com 238 | doubleclick.net 239 | doublepimp.com 240 | dpmsrv.com 241 | drfdisvc.walmart.com 242 | ds-aksb-a.akamaihd.net 243 | eacdn.com 244 | earnify.com 245 | ebuzzing.com 246 | ebz.io 247 | effectivemeasure.net 248 | emediate.dk 249 | emxdgt.com 250 | engine.fyber.com 251 | entrecard.s3.amazonaws.com 252 | eqads.com 253 | ero-advertising.com 254 | error-collector.ted.com 255 | et.nytimes.com 256 | euros4click.de 257 | event.collector.scopely.io 258 | events.attentivemobile.com 259 | events.brightline.tv 260 | events.privy.com 261 | events.redditmedia.com 262 | everesttech.net 263 | exoclick.com 264 | extend.tv 265 | extremereach.io 266 | eyereturn.com 267 | eyeviewads.com 268 | fam-ad.com 269 | fastapi.net 270 | fastclick.net 271 | fimserve.com 272 | firstlightera.com 273 | fls-na.amazon.com 274 | fls.doubleclick.net 275 | fmpub.net 276 | fuse.forbes.com 277 | fusionads.net 278 | fwmrm.net 279 | g.doubleclick.net 280 | ga-beacon.appspot.com 281 | gammaplatform.com 282 | ganon.yahoo.com 283 | gateway.foresee.com 284 | genieessp.com 285 | geo.nbcsports.com 286 | geo.yahoo.com 287 | geobanner.adultfriendfinder.com 288 | geolocation.forbes.com 289 | gj.mmstat.com 290 | gmads.net 291 | google-analytics.com 292 | google.com/adsense/search 293 | google.com/recaptcha 294 | googleads.g.doubleclick.net/pagead 295 | googleoptimize.com 296 | googlesyndication.com 297 | googletagservices.com 298 | greystripe.com 299 | gstatic.com/recaptcha/releases 300 | gwallet.com 301 | h12-media.com 302 | harrenmedianetwork.com 303 | hb.nexage.com 304 | 
hbopenbid.pubmatic.com 305 | hghit.com 306 | hits.informer.com 307 | hs-analytics.net 308 | hyperbanner.net 309 | iabusprivacy.pmc.com 310 | id5-sync.com 311 | ilyf4amifh.com 312 | imp.optaim.com 313 | imrworldwide.com 314 | in.treasuredata.com 315 | innity.net 316 | insightexpress.com 317 | insightexpressai.com 318 | irs01.com 319 | is-tracking-pixel-api-prod.appspot.com 320 | ja2n2u30a6rgyd.com 321 | jiwire.com 322 | juiceadv.com 323 | k.streamrail.com 324 | kanoodle.com 325 | karma.mdpcdn.com 326 | krxd.net 327 | kvinit-prod.api.kochava.com 328 | l.sharethis.com 329 | lakequincy.com 330 | lciapi.ninthdecimal.com 331 | leadbolt.net 332 | lfstmedia.com 333 | lgsmartad.com 334 | liftdna.com 335 | ligatus.com 336 | lightningcast.net 337 | linkbuddies.com 338 | linkexchange.com 339 | linkreferral.com 340 | log.adplex.co.kr 341 | log.go.com 342 | log.medietall.no 343 | log.outbrain.com 344 | log.pinterest.com 345 | log.sina.cn 346 | log.snapdeal.com 347 | logging.api.intuit.com 348 | logx.optimizely.com 349 | loopme.me 350 | lovelydrum.com 351 | ls.srvcs.tumblr.com 352 | lucidmedia.com 353 | lzjl.com 354 | ma.logsss.com 355 | madadsmedia.com 356 | mainadv.com 357 | marketgid.com 358 | marketing.888.com 359 | match.adsrvr.org/track 360 | match.prod.bidr.io 361 | matheranalytics.com 362 | maxonclick.com 363 | mbid.marfeelrev.com 364 | media6degrees.com 365 | mediaforge.com 366 | medleyads.com 367 | medyanetads.com 368 | metrics-logger.spot.im 369 | metrics.aetn.com 370 | metrics.brightcove.com 371 | metrics.fedex.com 372 | metrics.icloud.com 373 | metrics.kmsmep.com 374 | metrics.roblox.com 375 | metrics.ted.com 376 | metrics.timewarnercable.com 377 | mgid.com 378 | microad.net 379 | millennialmedia.com 380 | ml314.com 381 | mmismm.com 382 | mng-ads.com 383 | mocean.mobi 384 | monetize-api.coronalabs.com 385 | morgdm.ru 386 | mpnrs.com 387 | msads.net 388 | munchkin.marketo.net 389 | mythings.com 390 | nappyattack.com 391 | neocounter.neoworx-blog-tools.net 392 | 
nervoussummer.com 393 | nexus.ensighten.com 394 | nmcdn.us 395 | nuseek.com 396 | onclickads.net 397 | oneid.mmstat.com 398 | ophan.theguardian.com 399 | optad360.io 400 | osimg.nbcuni.com 401 | outcome-ssp.supersonicads.com 402 | overture.com 403 | oxado.com 404 | p.metrilo.com 405 | p.placed.com 406 | p.skimresources.com 407 | pagead2.googlesyndication.com 408 | pages-stats.rbl.ms 409 | pcash.imlive.com 410 | perf-events.cloud.unity3d.com 411 | perr.h-cdn.com 412 | pgmediaserve.com 413 | pgpartner.com 414 | phonograph2.voxmedia.com 415 | pi.ispot.tv 416 | ping.dozuki.com 417 | pingback.issuu.com 418 | pingjs.qq.com 419 | pings.conviva.com 420 | pippio.com 421 | pix.revjet.com 422 | pix.spot.im 423 | pixel.adsafeprotected.com 424 | pixel.facebook.com 425 | pixel.mtrcs.samba.tv 426 | pixel.wp.com 427 | pixiedust.buzzfeed.com 428 | placements.tapjoy.com 429 | platform.iteratehq.com 430 | player.adtelligent.com 431 | pointroll.com 432 | polarcdn-terrax.com 433 | polyad.net 434 | popads.net 435 | popunder.ru 436 | postrelease.com 437 | powerad.ai 438 | pr-bh.ybp.yahoo.com 439 | prd-collector-anon.ex.co 440 | prg.smartadserver.com 441 | primaryads.com 442 | projectwonderful.com 443 | promobenef.com 444 | promos.fling.com 445 | propellerads.com 446 | psa.carambo.la 447 | pt.ispot.tv 448 | pub.network 449 | pubmatic.com 450 | pushnami.com 451 | px.owneriq.net 452 | qnsr.com 453 | query1.petametrics.com 454 | r.skimresources.com 455 | ravm.tv 456 | rbthre.work 457 | recs.shareaholic.com 458 | referrer.disqus.com 459 | retagro.com 460 | retargeter.com 461 | rev2pub.com 462 | revcontent.com 463 | revmob.com 464 | revrtb.com 465 | rfihub.com 466 | rlcdn.com 467 | rlog.popin.cc 468 | rpc.tapjoy.com 469 | rtbpop.com 470 | rtbpopd.com 471 | rubiconproject.com 472 | rules.quantcount.com 473 | run-syndicate.com 474 | s.beop.io 475 | s.logsss.com 476 | s2d6.com 477 | samsungads.com 478 | sanalytics.disneyplus.com 479 | sanalytics.tbs.com 480 | sanalytics.verizon.com 481 | 
sanalytics.verizonwireless.com 482 | sbeacon.sina.com.cn 483 | sc-static.net 484 | sdk.appsflyer.com 485 | sdk.iad-01.braze.com 486 | sdk.iad-02.braze.com 487 | sdk.iad-03.braze.com 488 | sdk.iad-06.braze.com 489 | secure.merchantadvantage.com 490 | secure.perk0mean.com 491 | securepubads.g.doubleclick.net 492 | seg.sharethis.com 493 | sekindo.com 494 | servedbyadbutler.com 495 | smaato.net 496 | smadex.com 497 | smartadserver.com 498 | smetrics.aa.com 499 | smetrics.bestbuy.com 500 | smetrics.boston.com 501 | smetrics.chrysler.com 502 | smetrics.cnn.com 503 | smetrics.cox.com 504 | smetrics.dickssportinggoods.com 505 | smetrics.foxnews.com 506 | smetrics.lululemon.com 507 | smetrics.southwest.com 508 | smetrics.walgreens.com 509 | smetrics1.experian.com 510 | smy.iheart.com 511 | socdm.com 512 | sociomantic.com 513 | sofia.trustx.org 514 | solocpm.com 515 | solutions.invocacdn.com 516 | sparkstudios.com 517 | specialdeals.g5e.com 518 | speee-ad.akamaized.net 519 | sponsorpay.com 520 | spotscenered.info 521 | spotxchange.com 522 | srepdata.usatoday.com 523 | srtb.msn.com 524 | sstats.teenvogue.com 525 | star-advertising.com 526 | stas.outbrain.com 527 | static.doubleclick.net 528 | statm.the-adult-company.com 529 | stats-dev.brid.tv 530 | stats.appsflyer.com 531 | stats.bluebillywig.com 532 | stats.olark.com 533 | stats.smartclip.net 534 | stats.wordpress.com 535 | stats.wp.com 536 | stats.zotabox.com 537 | stickyadstv.com 538 | summerhamster.com 539 | sw88.abc.com 540 | sw88.espn.com 541 | sw88.go.com 542 | sweb.ulta.com 543 | sync.adap.tv 544 | t.appsflyer.com 545 | t.indeed.com 546 | t.leady.com 547 | t.metrilo.com 548 | t.sharethis.com 549 | t.skimresources.com 550 | t.wayfair.com 551 | t2.hulu.com 552 | tag.leadplace.fr 553 | tag.mtrcs.samba.tv 554 | tagger.opecloud.com 555 | tags.tiqcdn.com 556 | tapjoyads.com 557 | targeting.washpost.nile.works 558 | telemetrics.klaviyo.com 559 | telemetry.malwarebytes.com 560 | telemetry.sdk.inmobi.com 561 | thrtle.com 562 
| tidaltv.com 563 | top-fwz1.mail.ru 564 | totemcash.com 565 | tpc.googlesyndication.com 566 | tr.snapchat.com 567 | trace.qq.com 568 | track.dictionary.com 569 | track.pricespider.com 570 | track.tappx.com 571 | track.tiara.daum.net 572 | track.uc.cn 573 | tracker.icerocket.com 574 | tracker.nbcuas.com 575 | tracker.personizely.net 576 | tracking.adalyser.com 577 | tracking.bloomberg.com 578 | tracking.immobilienscout24.de 579 | tracking.leadlander.com 580 | tracking.lengow.com 581 | tracking.lg.com 582 | tracking.listhub.net 583 | tracking.miui.com 584 | tradeadexchange.com 585 | trafficfactory.biz 586 | traffichunt.com 587 | trafficjunky.net 588 | traxex.gannettdigital.com 589 | tredir.go.com 590 | tremorhub.com 591 | tribalfusion.com 592 | triggers.wfxtriggers.com 593 | trk.clinch.co 594 | tru.am 595 | truoptik.com 596 | tvpixel.com 597 | twittad.com 598 | uimserv.net 599 | unagi.amazon.com/\d/events 600 | unrulymedia.com 601 | usc.adserver.snapads.com 602 | usersegment.wpdigital.net 603 | utarget.co.uk 604 | valueclick.com 605 | valueclickmedia.com 606 | vdopia.com 607 | vendimob.pl 608 | vi-serve.com 609 | videoevents.outbrain.com 610 | vindicosuite.com 611 | vntsm.com 612 | wass.ihsmarkit.com 613 | waust.at 614 | webads.co.nz 615 | webcounter.goweb.de 616 | websdk.appsflyer.com 617 | wetter.pushwoosh.com 618 | whaleads.com 619 | widget-pixels.outbrain.com 620 | widgetbucks.com 621 | wigetmedia.com 622 | wildcard.moatads.com.edgekey.net 623 | www.summerhamster.com 624 | x.disq.us 625 | xad.com 626 | xxxmyself.com 627 | yandexadexchange.net 628 | yieldads.com 629 | yieldlab.net 630 | yieldmanager.net 631 | yieldtraffic.com 632 | youborafds01.com 633 | youradexchange.com 634 | z.cdp-dev.cnn.com 635 | zenkreka.com 636 | zucks.net 637 | -------------------------------------------------------------------------------- /src/record.js: -------------------------------------------------------------------------------- 1 | /* 2 | * Record a page. 
/*
 * Record a page: load it in Chromium through Playwright, collect every
 * network response and the settled HTML, and return them as a snapshot.
 */
import fs from 'fs';
import fetch from 'cross-fetch';
import playwright from 'playwright';
import { transform as minifyCSS } from 'lightningcss';
import { PurgeCSS } from 'purgecss';
import { PlaywrightBlocker } from '@ghostery/adblocker-playwright';

import { requestKey, normalizeURL, toBool, smartSplit, encodeBody } from './util.js';

/**
 * Normalize the raw options object IN PLACE.
 *
 * Converts the boolean-like flags with `toBool`, the time options from seconds
 * to milliseconds, and derives the upper-case working fields used by the
 * recorder: DROP, DROPST, DROPLI (arrays of RegExp), HEADERS, REMOVE (arrays of
 * strings), CSS (string), URI (page to record) and OUT (output file name).
 *
 * @param {object} args - raw options; `args._` may hold positional values
 *   ([0] = URL, [1] = output file), with `input`/`url` and `output` as fallbacks
 * @returns {Promise<void>}
 * @throws {TypeError} when no URL was given, or the URL cannot be parsed
 */
async function processArgs(args) {
  const boolFlags = [
    'gzip',
    'js',
    'blockAds',
    'extraMeta',
    'headless',
    'iframes',
    'minify',
    'minCSS',
    'purgeCSS',
    'console',
  ];
  for (const flag of boolFlags) {
    args[flag] = toBool(args[flag]);
  }

  // time options are given in seconds, Playwright expects milliseconds
  for (const option of ['wait', 'timeout', 'imgTimeout']) {
    args[option] = parseInt(args[option], 10) * 1000;
  }

  args.DROP = smartSplit(args.dropRequests).map((x) => new RegExp(x, 'i'));
  args.HEADERS = smartSplit(args.headers).map((x) => x.toLowerCase());
  args.REMOVE = smartSplit(args.removeElems);
  args.CSS = args.addCSS ? args.addCSS.trim() : '';

  // status patterns like "4xx" become the regex 4\d\d
  args.DROPST = smartSplit(args.dropStatus).map((x) => new RegExp(x.replace(/x/gi, '\\d')));

  // always an array, so the recorder can merge it with DROP unconditionally
  args.DROPLI = [];
  if (args.blockList) {
    const blockList = await fs.promises.readFile(args.blockList, { encoding: 'utf8' });
    args.DROPLI = blockList
      .split('\n')
      .map((x) => x.trim().replace(/\/+$/, ''))
      .filter((x) => x && !x.startsWith('#') && x.length > 5)
      // entries are used as regex sources on purpose (the list contains patterns like \d);
      // the backslashes are doubled because this is a template literal, not a regex literal
      .map((x) => new RegExp(`^https?://(www\\.|m\\.)?${x}/.+`, 'i'));
    console.log(`Loaded ${args.DROPLI.length} drop list domains from file`);
  }

  // positional arguments win, then fall back to the named options
  args.URI = (args._ && args._[0]) || args.input || args.url;
  if (!args.URI) {
    throw new TypeError('Missing the URL of the page to record');
  }
  let HOST = new URL(args.URI).host;
  if (HOST.startsWith('www.')) HOST = HOST.slice(4);
  let OUT = (args._ && args._[1]) || args.output;
  if (!OUT) OUT = `snapshot_${HOST}.json`;
  if (args.gzip && !OUT.endsWith('.gz')) OUT += '.gz';
  args.OUT = OUT;
}

/**
 * Launch Chromium, open the requested page and record a snapshot of it.
 *
 * The browser is left open on success so the caller can keep interacting with
 * the page; the caller is responsible for closing it.
 *
 * @param {object} args - raw options, normalized in place by `processArgs`
 * @returns {Promise<{snapshot: object, page: object, context: object, browser: object}>}
 */
export async function recordPage(args) {
  await processArgs(args);

  // only Chromium supported for now
  const browser = await playwright.chromium.launch({
    headless: args.headless,
    // disable-web-security is needed to access cross-origin resources,
    // eg: CSS rules
    args: ['--disable-default-apps', '--disable-web-security'],
  });

  try {
    const context = await browser.newContext({
      javaScriptEnabled: args.js,
      userAgent: args.userAgent,
      bypassCSP: true,
      ignoreHTTPSErrors: true,
      serviceWorkers: 'block',
      viewport: null,
    });
    const page = await context.newPage();

    if (args.console) {
      page.on('console', async (msg) => {
        const label = `CONSOLE.${msg.type().toUpperCase()}:`;
        try {
          const logValues = await Promise.all(msg.args().map((arg) => arg.jsonValue()));
          console.log(label, ...logValues);
        } catch {
          // the handles can be gone if the page navigated; fall back to the plain text
          console.log(label, msg.text());
        }
      });
    }

    if (args.blockAds) {
      const blocker = await PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch);
      await blocker.enableBlockingInPage(page);
    }

    const snapshot = await internalRecordPage(args, page);

    return { snapshot, page, context, browser };
  } catch (err) {
    // the caller never receives the browser handle on failure, so close it here
    await browser.close().catch(() => {});
    throw err;
  }
}

/**
 * Replace all the style sheets of the page with a single <style class="clean">
 * element holding the collected (optionally purged) and minified CSS, then drop
 * the recorded CSS responses from the snapshot and refresh `snapshot.html`.
 *
 * @param {object} args - processed options; reads `purgeCSS`
 * @param {object} page - Playwright page
 * @param {object} snapshot - snapshot being built, mutated in place
 * @returns {Promise<void>}
 */
async function rebuildPageCSS(args, page, snapshot) {
  const [rawCSS, URLs] = await page.evaluate(() => {
    const urls = new Set();
    const css = [];
    console.log(`Collecting ${document.styleSheets.length} CSS styleSheets...`);
    // cycle #1 collect CSS
    for (const style of document.styleSheets) {
      if (style.href) urls.add(style.href);
      let raw;
      try {
        console.log(`Saving CSS rules for: ${style.ownerNode.localName} href=${style.href}`);
        raw = ` /* CSS for ${style.ownerNode.localName} href=${style.href} */ `;
        raw += Array.from(style.cssRules)
          .map((rule) => {
            if (rule.href) urls.add(rule.href);
            if (rule instanceof CSSImportRule) return '';
            return rule.cssText;
          })
          .join(' ');
      } catch (err) {
        console.warn(`Cannot access CSS: ${err}`);
      }
      if (raw) css.push(raw);
    }
    console.log(`Found ${css.length} CSS styleSheets...`);
    return [css, [...urls]];
  });

  let pageCSS = [' ', ...rawCSS].join(' ');
  const s1 = pageCSS.length;

  if (args.purgeCSS) {
    const purgedCSS = await new PurgeCSS().purge({
      css: [{ raw: pageCSS }],
      content: [{ raw: snapshot.html, extension: 'html' }],
    });
    pageCSS = purgedCSS.map(({ css }) => css).join(' ');
  }

  let finalCSS = '';
  try {
    finalCSS = minifyCSS({
      code: Buffer.from(pageCSS),
      minify: true,
    }).code.toString();
    const s2 = finalCSS.length;

    console.log(
      `CSS styles minify efficiency ${((s2 / s1) * 100).toFixed(2)}% from ` +
        `${Intl.NumberFormat('en').format(s1)} to ${Intl.NumberFormat('en').format(s2)}.`,
    );
  } catch (err) {
    // keep the un-minified CSS rather than losing the styles
    finalCSS = pageCSS;
    console.error(`Minify CSS failed with error: ${err}!`);
  }

  await page.evaluate((css) => {
    // cycle to remove CSS DOM nodes
    // this needs to run after collecting the CSS
    // and has to run a few times to remove all deep nodes ...
    // the number of passes is bounded, so a sheet that cannot be removed
    // doesn't hang the recorder forever
    const MAX_PASSES = 50;
    for (let pass = 0; pass < MAX_PASSES && document.styleSheets.length > 0; pass++) {
      // copy the live list, because removing nodes changes it while iterating
      for (const style of Array.from(document.styleSheets)) {
        try {
          console.warn(`Removing node: ${style.ownerNode}`);
          style.ownerNode.remove();
        } catch {
          // no removable owner node; retried on the next pass
        }
      }
    }
    const cssHack = document.createElement('style');
    cssHack.className = 'clean';
    // textContent, not innerText: innerText turns newlines into <br> elements
    cssHack.textContent = css;
    document.head.appendChild(cssHack);
  }, finalCSS);

  // cleanup the recorded CSS resources
  for (const u of URLs) {
    const key = `GET:${u}`;
    if (snapshot.responses[key]) {
      console.log('Removing recorded CSS response:', key);
      delete snapshot.responses[key];
    }
  }
  // check if there are any CSS resources left
  for (const [k, res] of Object.entries(snapshot.responses)) {
    if (res.headers && res.headers['content-type'] && res.headers['content-type'].startsWith('text/css')) {
      console.log('CSS response not removed:', k);
    }
  }

  // post CSS snapshot
  snapshot.html = (await page.content()).trim();
}

/**
 * Record the page: install the request filters, capture all responses while
 * the page loads, then apply the requested clean-ups (remove elements, add
 * CSS, rebuild CSS) and return the snapshot.
 *
 * @param {object} args - processed options (see `processArgs`)
 * @param {object} page - Playwright page, not navigated yet
 * @returns {Promise<object>} snapshot with `url`, `base_url`, `html`, `responses`
 *   and, with `extraMeta`, also `date`, `title` and (when present) `canonical_url`
 */
async function internalRecordPage(args, page) {
  // defaults guard against options that were not produced by processArgs
  const { URI, DROP = [], DROPLI = [], DROPST = [], HEADERS = [], REMOVE = [], CSS = '' } = args;

  const block = [...DROP, ...DROPLI];
  if (block.length) {
    await page.route('**', async (route) => {
      const u = normalizeURL(route.request().url());
      const matched = block.find((re) => re.test(u));
      if (matched) {
        console.warn('Drop matching request:', matched, u);
        await route.abort();
        return;
      }
      await route.continue();
    });
  }

  const snapshot = args.extraMeta
    ? {
        url: URI,
        base_url: '',
        canonical_url: '',
        date: new Date().toISOString(),
        title: '',
        html: '',
        responses: {},
      }
    : { url: URI, base_url: '', html: '', responses: {} };

  page.on('response', async (response) => {
    const r = response.request();
    const u = normalizeURL(r.url());
    if (u.startsWith('data:')) {
      return;
    }
    // ignore the index page, it will be saved at the end
    if (u === normalizeURL(URI)) return;

    const status = response.status();
    if (DROPST.length) {
      const matched = DROPST.find((re) => re.test(status.toString()));
      if (matched) {
        console.warn('Drop matching status:', matched, status);
        return;
      }
    } else if (status >= 300 && status < 400) {
      // ignore redirect requests, they will be saved after resolved
      console.warn(`Redirect status: ${status}`, u, 'to:', response.headers()['location']);
      return;
    }
    // allow all the other statuses

    const key = requestKey(r);
    console.log('Response:', status, key);

    // restrict headers to subset
    const headers = Object.fromEntries(
      Object.entries(response.headers()).filter(([name]) => HEADERS.includes(name)),
    );
    const contentType = headers['content-type'];

    let body;
    try {
      const buffer = await response.body();
      body = encodeBody(r.resourceType(), contentType, buffer);
    } catch (err) {
      const frame = page.frame({ url: u });
      if (frame && args.iframes) {
        try {
          console.log('Capture IFRAME content for:', frame.url());
          const content = (await frame.content()).trim();
          body = encodeBody(r.resourceType(), contentType, Buffer.from(content, 'utf-8'));
        } catch (frameErr) {
          // eg: the frame was detached meanwhile; keep recording the other responses
          console.error('ERR saving response for:', status, u, frameErr);
        }
      } else if (status !== 204) {
        console.error('ERR saving response for:', status, u, err);
      }
    }

    // if the request was NOT cached, or it WAS cached
    // and the new request is successful (overwrite with fresh data)
    if (!snapshot.responses[key] || status === 200) {
      const entry = { body, headers, request_url: u, status };
      if (u !== response.url()) {
        // add the final URL to the record, without discarding the rest of it
        entry.response_url = response.url();
      }
      snapshot.responses[key] = entry;
    }
  });

  try {
    console.log('Waiting for the page to load...');
    await page.goto(URI, { timeout: args.timeout, waitUntil: 'networkidle' });
  } catch (err) {
    // best effort: a slow page is still recorded with whatever was loaded
    console.error('Wait timeout:', err);
  }

  // initial snapshot
  snapshot.html = (await page.content()).trim();

  const imgCount = await page.locator('img').count();
  if (imgCount > 0) {
    try {
      console.log('Waiting for images to load...');
      await page.waitForSelector('img', { timeout: args.imgTimeout });
    } catch (err) {
      console.error('Images timeout:', err);
    }
  }

  // resolved base URL
  snapshot.base_url = await page.evaluate('document.baseURI');

  if (args.extraMeta) {
    snapshot.title = (await page.title()).trim();
    // resolved canonical URL
    snapshot.canonical_url = await page.evaluate(
      `(document.querySelector("link[rel='canonical']") || document.createElement('link')).getAttribute('href')`,
    );
    if (!snapshot.canonical_url) delete snapshot.canonical_url;
  }

  // delete possible index duplicates, when user URL != resolved URL
  for (const indexURL of [snapshot.base_url, snapshot.canonical_url]) {
    if (!indexURL) continue;
    const indexKey = `GET:${indexURL}`;
    if (snapshot.responses[indexKey] && snapshot.responses[indexKey].body) {
      delete snapshot.responses[indexKey];
    }
  }

  for (const selector of REMOVE) {
    console.log('Removing element selector:', selector);
    await page.evaluate((s) => {
      for (const el of document.querySelectorAll(s)) {
        el.remove();
      }
    }, selector);
  }

  if (CSS && CSS.length) {
    console.log('Adding custom CSS...');
    await page.evaluate((css) => {
      const cssHack = document.createElement('style');
      cssHack.className = 'hack';
      // textContent, not innerText: innerText turns newlines into <br> elements
      cssHack.textContent = css;
      document.head.appendChild(cssHack);
    }, CSS);
  }

  // second snapshot
  snapshot.html = (await page.content()).trim();

  if (args.minCSS || args.purgeCSS) {
    await rebuildPageCSS(args, page, snapshot);
  }

  return snapshot;
}
--------------------------------------------------------------------------------