(microlink.io)
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in
13 | all copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
21 | THE SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |

3 |

4 |
5 |
6 |
7 |
8 | 
9 | [](https://coveralls.io/github/microlinkhq/html-get)
10 | [](https://www.npmjs.org/package/html-get)
11 |
12 | > Get the HTML from any website, fine-tuned for correction & speed.
13 |
14 | ## Features
15 |
16 | - Get HTML markup for any URL, including images, video, audio, or pdf.
17 | - Block ad trackers and any other unnecessary network subrequests.
18 | - Handle unreachable or timeout URLs gracefully.
19 | - Ensure HTML markup is appropriately encoded.
20 |
21 | **html-get** takes advantage of [puppeteer](https://github.com/GoogleChrome/puppeteer) headless technology when needed, such as for client-side apps that need to be prerendered.
22 |
23 | ## Install
24 |
25 | ```bash
26 | $ npm install browserless puppeteer html-get --save
27 | ```
28 |
29 | ## Usage
30 |
31 | ```js
32 | const createBrowserless = require('browserless')
33 | const getHTML = require('html-get')
34 |
35 | // Spawn Chromium process once
36 | const browserlessFactory = createBrowserless()
37 |
38 | // Kill the process when Node.js exit
39 | process.on('exit', () => {
40 | console.log('closing resources!')
41 | browserlessFactory.close()
42 | })
43 |
44 | const getContent = async url => {
45 | // create a browser context inside Chromium process
46 | const browserContext = browserlessFactory.createContext()
47 | const getBrowserless = () => browserContext
48 | const result = await getHTML(url, { getBrowserless })
49 | // close the browser context after it's used
50 | await getBrowserless((browser) => browser.destroyContext())
51 | return result
52 | }
53 |
54 | getContent('https://example.com')
55 | .then(content => {
56 | console.log(content)
57 | process.exit()
58 | })
59 | .catch(error => {
60 | console.error(error)
61 | process.exit(1)
62 | })
63 | ```
64 |
65 | ### Command Line
66 |
67 | ```
68 | $ npx html-get https://example.com
69 | ```
70 |
71 | ## API
72 |
73 | ### getHTML(url, [options])
74 |
75 | #### url
76 |
77 | *Required*
78 | Type: `string`
79 |
80 | The target URL for getting the HTML markup.
81 |
82 | #### options
83 |
84 | ##### encoding
85 |
86 | Type: `string`
87 | Default: `'utf-8'`
88 |
89 | It ensures the HTML markup is encoded to the encoded value provided.
90 |
91 | The value will be passed to [`html-encode`](https://github.com/kikobeats/html-encode).
92 |
93 | ##### getBrowserless
94 |
95 | *Required*
96 | Type: `function`
97 |
98 | A function that should return a [browserless](https://browserless.js.org/) instance used to interact with puppeteer.
99 |
100 | ##### getMode
101 |
102 | Type: `function`
103 |
104 | It determines the strategy to use based on the `url`; the possible values are `'fetch'` or `'prerender'`.
105 |
106 | ##### getTemporalFile
107 |
108 | Type: `function`
109 |
110 | It creates a temporal file.
111 |
112 | ##### gotOpts
113 |
114 | Type: `object`
115 |
116 | It passes configuration object to [got](https://www.npmjs.com/package/got) under `'fetch'` strategy.
117 |
118 | ##### headers
119 |
120 | Type: `object`
121 |
122 | Request headers that will be passed to fetch/prerender process.
123 |
124 | ##### mutool
125 |
126 | Type: `function`|`boolean`
127 | Default: `source code`
128 |
129 | It returns a function that executes the [mutool](https://mupdf.com/) binary for turning PDF files into HTML markup.
130 |
131 | It can be explicitly disabled by passing `false`.
132 |
133 | ##### prerender
134 |
135 | Type: `boolean`|`string`
136 | Default: `'auto'`
137 |
138 | Enable or disable prerendering as mechanism for getting the HTML markup explicitly.
139 |
140 | The value `auto` means that it internally uses a list of websites that don't need prerendering by default. This list is used to speed up the process, using `fetch` mode for these websites.
141 |
142 | See the [getMode parameter](#getmode) to know more.
143 |
144 | ##### puppeteerOpts
145 |
146 | Type: `object`
147 |
148 | It passes a configuration object to [puppeteer](https://www.npmjs.com/package/puppeteer) under the `'prerender'` strategy.
149 |
150 | ##### rewriteUrls
151 |
152 | Type: `boolean`
153 | Default: `false`
154 |
155 | When `true`, relative CSS/HTML URLs present in the HTML markup are rewritten into absolute URLs.
156 |
157 | ##### rewriteHtml
158 |
159 | Type: `boolean`
160 | Default: `false`
161 |
162 | When `true`, it rewrites some common mistakes related to HTML meta tags.
163 |
164 | ##### serializeHtml
165 |
166 | It determines how HTML should be serialized before returning.
167 |
168 | It's serialized `$ => ({ html: $.html() })` by default.
169 |
170 | ## License
171 |
172 | **html-get** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.
173 | Authored and maintained by [Kiko Beats](https://kikobeats.com) with help from [contributors](https://github.com/microlinkhq/html-get/contributors).
174 |
175 | > [microlink.io](https://microlink.io) · GitHub [microlinkhq](https://github.com/microlinkhq) · X [@microlinkhq](https://x.com/microlinkhq)
176 |
--------------------------------------------------------------------------------
/benchmark/get-content-type/index.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const NullProtoObj = require('null-prototype-object')
4 | const { parse } = require('content-type')
5 |
/**
 * Parse a raw `content-type` header value.
 *
 * Anything that is not a string short-circuits to an empty descriptor, so
 * the parser is only ever invoked with a string.
 */
const parseContentType = contentType => {
  if (typeof contentType !== 'string') {
    return { type: undefined, parameters: {} }
  }
  return parse(contentType)
}
10 |
/**
 * Build a `headers => parsedContentType` function.
 *
 * With `useCache` enabled, parsed results are memoized in a prototype-less
 * object keyed by the raw header value; otherwise every call parses again.
 */
const createContentTypeFunction = useCache => {
  if (!useCache) {
    return headers => parseContentType(headers['content-type'])
  }

  const memo = new NullProtoObj()

  return headers => {
    const raw = headers['content-type']
    const hit = memo[raw]
    if (hit) return hit

    const parsed = parseContentType(raw)
    memo[raw] = parsed
    return parsed
  }
}
26 |
/**
 * Time `iterations` passes over a fixed set of sample headers, using either
 * the cached or the uncached content-type function.
 */
const benchmark = (iterations, useCache) => {
  const label = useCache ? 'Benchmark with Cache' : 'Benchmark without Cache'

  const headersList = [
    'application/json; charset=utf-8',
    'text/html; charset=utf-8',
    'application/xml; charset=utf-8',
    'text/plain; charset=utf-8',
    'application/json'
  ].map(value => ({ 'content-type': value }))

  const contentTypeFunc = createContentTypeFunction(useCache)

  console.time(label)
  for (let pass = 0; pass < iterations; pass++) {
    for (const headers of headersList) {
      contentTypeFunc(headers)
    }
  }
  console.timeEnd(label)
}
47 |
// Run both variants over the same workload: uncached first, then cached
const iterations = 100000
for (const useCache of [false, true]) {
  benchmark(iterations, useCache)
}
52 |
--------------------------------------------------------------------------------
/benchmark/mupdf/generate.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const { randomBytes } = require('crypto')
4 | const PDFDocument = require('pdfkit')
5 | const bytes = require('bytes-iec')
6 | const path = require('path')
7 | const fs = require('fs')
8 |
/**
 * Write a PDF filled with random base64 text into the `fixtures` directory
 * next to this script, logging the filename once the file is flushed.
 *
 * @param {string} filename - Name of the file to create inside `fixtures`.
 * @param {number} filesize - Target size in bytes.
 */
function generatePdf (filename, filesize) {
  const target = path.join(__dirname, 'fixtures', filename)
  const output = fs.createWriteStream(target)
  output.on('finish', () => console.log(filename))

  const pdf = new PDFDocument()
  pdf.pipe(output)

  // base64 inflates the payload, so only a fraction of the target is generated
  const size = bytes.format(Math.floor(filesize * 0.55))
  const payload = randomBytes(bytes(size)).toString('base64')

  pdf.text(payload, { width: 410, align: 'left' })
  pdf.end()
}
28 |
// Ten fixtures in 100KB steps (100KB … 1000KB)
const hundredKbSteps = Array.from({ length: 10 }, (_, index) => {
  const base = (index + 1) * 100
  return {
    filename: bytes.format(base * 1000).toLowerCase(),
    filesize: bytes(`${base}KB`)
  }
})

// …plus three larger fixtures
const sizes = [
  ...hundredKbSteps,
  { filename: '5mb', filesize: bytes('5MB') },
  { filename: '10mb', filesize: bytes('10MB') },
  { filename: '20mb', filesize: bytes('20MB') }
]

sizes.forEach(({ filename, filesize }) =>
  generatePdf(`${filename}.pdf`, filesize)
)
45 |
--------------------------------------------------------------------------------
/benchmark/mupdf/index.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const { readFile, readdir } = require('fs/promises')
4 | const { defaultMutool } = require('../../src')
5 | const path = require('path')
6 |
7 | const OUTPUT = path.join(__dirname, 'output.pdf')
8 |
/**
 * Minimal benchmark harness.
 *
 * Test cases are registered with `add`, optional result checks with
 * `verification`, and `run` executes every case once in random order,
 * printing the duration of each one plus a summary of the fastest.
 */
class Benchmark {
  /**
   * @param {string} title - Heading printed when the benchmark runs.
   */
  constructor (title) {
    this.title = title
    this.testCases = []
    this.results = []
    this.verifications = []
  }

  /**
   * Register a test case.
   *
   * @param {string} name - Label used in the report.
   * @param {Function} fn - Function (sync or async) producing the result.
   * @returns {Benchmark} The instance, for chaining.
   */
  add (name, fn) {
    this.testCases.push({ name, fn })
    return this
  }

  /**
   * Register a check applied to the result of every test case; it must
   * throw to signal a failure.
   *
   * @param {Function} fn - Receives the test case result.
   * @returns {Benchmark} The instance, for chaining.
   */
  verification (fn) {
    this.verifications.push(fn)
    return this
  }

  /**
   * Execute every registered test case sequentially and print the report.
   * After it resolves, `results` holds `{ name, duration, result }` entries
   * for this run only, sorted from fastest to slowest.
   *
   * @returns {Promise<void>}
   * @throws {Error} When a verification rejects a result.
   */
  async run () {
    console.log(`\n${this.title}\n`)

    // Start from a clean slate so repeated runs don't mix their timings
    this.results = []

    // Fisher–Yates shuffle on a copy: unbiased, and it leaves the
    // registration order in `testCases` untouched
    const queue = [...this.testCases]
    for (let i = queue.length - 1; i > 0; i--) {
      const j = Math.floor(Math.random() * (i + 1))
      const swapped = queue[i]
      queue[i] = queue[j]
      queue[j] = swapped
    }

    for (const [index, { name, fn }] of queue.entries()) {
      const start = Date.now()
      const result = await fn()
      // Stop the clock before verifying so checks don't skew the timing
      const duration = Date.now() - start

      for (const verification of this.verifications) {
        try {
          verification(result)
        } catch (error) {
          throw new Error(`Verification failed for '${name}': ${error.message}`)
        }
      }

      this.results.push({ name, duration, result })
      console.log(`${index + 1}. ${name}: ${duration}ms`)
    }

    const [fastest, secondFastest] = this.results.sort(
      (a, b) => a.duration - b.duration
    )

    // Nothing registered: there is nothing to summarize
    if (fastest === undefined) return

    // A single test case has no runner-up to compare against
    if (secondFastest === undefined) {
      console.log(`\nFastest: "${fastest.name}" with ${fastest.duration}ms`)
      return
    }

    // Guard the division: both cases may have finished within the same ms
    const percentageFaster =
      secondFastest.duration === 0
        ? 0
        : ((secondFastest.duration - fastest.duration) /
            secondFastest.duration) *
          100

    console.log(
      `\nFastest: "${fastest.name}" with ${
        fastest.duration
      }ms (${percentageFaster.toFixed(2)}%)`
    )
  }
}
67 |
/**
 * Benchmark two ways of collecting mutool output for every file in the
 * `fixtures` directory: reading stdout directly versus writing to a file
 * and reading that file back.
 *
 * @throws {Error} When the mutool binary cannot be located.
 */
const main = async () => {
  const mutool = defaultMutool()

  // `defaultMutool` yields nothing when the binary can't be located
  if (typeof mutool !== 'function') {
    throw new Error('mutool binary not found; install it to run this benchmark')
  }

  const fixtures = await readdir(path.join(__dirname, 'fixtures'))

  for (const filename of fixtures) {
    const filepath = path.join(__dirname, 'fixtures', filename)

    await new Benchmark(`Benchmarking mutool ${filename}`)
      .verification(output => {
        if (typeof output !== 'string') {
          throw new TypeError(`Expected a string, got ${typeof output}`)
        }
      })
      .add('write in memory', async () => {
        const result = await mutool(filepath)
        return result.stdout
      })
      .add('write in file, read async', async () => {
        await mutool(`-o ${OUTPUT} ${filepath}`)
        return readFile(OUTPUT, 'utf-8')
      })
      .run()
  }
}

// Report failures and flag the process instead of leaving a floating promise
main().catch(error => {
  console.error(error)
  process.exitCode = 1
})
95 |
--------------------------------------------------------------------------------
/benchmark/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "@html-get/benchmark",
3 | "version": "0.0.0",
4 | "dependencies": {
5 | "bytes-iec": "~3.1.1",
6 | "pdfkit": "~0.16.0"
7 | }
8 | }
9 |
--------------------------------------------------------------------------------
/bin/index.js:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 |
3 | 'use strict'
4 |
5 | const createBrowserless = require('browserless')
6 | const { URL } = require('url')
7 | const mri = require('mri')
8 |
9 | const getHTML = require('..')
10 |
const browserlessFactory = createBrowserless()

const { _: input, debug: isDebug, ...args } = mri(process.argv.slice(2))

// Fail with a readable message instead of an "Invalid URL" stack trace
if (input.length === 0) {
  console.error('Usage: html-get <url> [--debug]')
  process.exit(1)
}

const url = new URL(input[0]).toString()

const browserContext = browserlessFactory.createContext()
const getBrowserless = () => browserContext

/**
 * Release the browser context and then the browser process behind it.
 * `getBrowserless` takes no arguments, so the context has to be awaited
 * and destroyed explicitly.
 */
const closeResources = async () => {
  const browserless = await getBrowserless()
  await browserless.destroyContext()
  await browserlessFactory.close()
}

getHTML(url, { getBrowserless, ...args })
  .then(({ html, stats, headers, statusCode }) => {
    if (isDebug) {
      console.log(`
url: ${url}
html: ${Buffer.from(html).byteLength} bytes (HTTP ${statusCode})
time: ${stats.timing} (${stats.mode})
headers: ${
        headers
          ? Object.keys(headers).reduce(
              (acc, key) => `${acc}${key}=${headers[key]} `,
              ''
            )
          : '-'
      }
`)
    } else {
      console.log(html)
    }
    return 0
  })
  .catch(error => {
    console.error(error)
    return 1
  })
  .then(async exitCode => {
    // Exit only after cleanup; exiting earlier would skip it entirely
    try {
      await closeResources()
    } catch (error) {
      console.error(error)
    }
    process.exit(exitCode)
  })
45 |
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "html-get",
3 | "description": "Get the HTML from any website, fine-tuned for correction & speed",
4 | "homepage": "https://nicedoc.com/microlinkhq/html-get",
5 | "version": "2.21.5",
6 | "main": "src/index.js",
7 | "bin": {
8 | "html-get": "bin/index.js"
9 | },
10 | "author": {
11 | "email": "josefrancisco.verdu@gmail.com",
12 | "name": "Kiko Beats",
13 | "url": "https://kikobeats.com"
14 | },
15 | "contributors": [],
16 | "repository": {
17 | "type": "git",
18 | "url": "git+https://github.com/microlinkhq/html-get.git"
19 | },
20 | "bugs": {
21 | "url": "https://github.com/microlinkhq/html-get/issues"
22 | },
23 | "keywords": [
24 | "audio",
25 | "fetch",
26 | "get",
27 | "got",
28 | "headless",
29 | "html",
30 | "image",
31 | "markup",
32 | "pdf",
33 | "prerender",
34 | "request",
35 | "video"
36 | ],
37 | "dependencies": {
38 | "@kikobeats/time-span": "~1.0.5",
39 | "@metascraper/helpers": "~5.46.1",
40 | "cheerio": "~1.0.0",
41 | "content-type": "~1.0.5",
42 | "css-url-regex": "~4.0.0",
43 | "debug-logfmt": "~1.2.3",
44 | "execall": "~2.0.0",
45 | "got": "~11.8.6",
46 | "html-encode": "~2.1.7",
47 | "html-urls": "~2.4.62",
48 | "is-html-content": "~1.0.0",
49 | "is-local-address": "~2.2.0",
50 | "lodash": "~4.17.21",
51 | "mri": "~1.2.0",
52 | "null-prototype-object": "~1.2.0",
53 | "p-cancelable": "~2.1.0",
54 | "p-retry": "~4.6.0",
55 | "tinyspawn": "~1.5.0",
56 | "top-sites": "~1.1.220"
57 | },
58 | "devDependencies": {
59 | "@commitlint/cli": "latest",
60 | "@commitlint/config-conventional": "latest",
61 | "@ksmithut/prettier-standard": "latest",
62 | "async-listen": "latest",
63 | "ava": "5",
64 | "browserless": "latest",
65 | "c8": "latest",
66 | "ci-publish": "latest",
67 | "finepack": "latest",
68 | "git-authors-cli": "latest",
69 | "github-generate-release": "latest",
70 | "nano-staged": "latest",
71 | "pretty": "latest",
72 | "puppeteer": "latest",
73 | "regex-iso-date": "latest",
74 | "simple-git-hooks": "latest",
75 | "standard": "latest",
76 | "standard-version": "latest"
77 | },
78 | "engines": {
79 | "node": ">= 10"
80 | },
81 | "files": [
82 | "bin",
83 | "scripts",
84 | "src"
85 | ],
86 | "scripts": {
87 | "clean": "rm -rf node_modules",
88 | "contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true",
89 | "lint": "standard",
90 | "postinstall": "node scripts/postinstall",
91 | "postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)",
92 | "pretest": "npm run lint",
93 | "release": "standard-version -a",
94 | "release:github": "github-generate-release",
95 | "release:tags": "git push --follow-tags origin HEAD:master",
96 | "test": "c8 ava"
97 | },
98 | "license": "MIT",
99 | "ava": {
100 | "files": [
101 | "test/**/*.js",
102 | "!test/helpers.js"
103 | ],
104 | "timeout": "2m",
105 | "workerThreads": false
106 | },
107 | "commitlint": {
108 | "extends": [
109 | "@commitlint/config-conventional"
110 | ],
111 | "rules": {
112 | "body-max-line-length": [
113 | 0
114 | ]
115 | }
116 | },
117 | "nano-staged": {
118 | "*.js": [
119 | "prettier-standard",
120 | "standard --fix"
121 | ],
122 | "package.json": [
123 | "finepack"
124 | ]
125 | },
126 | "pnpm": {
127 | "neverBuiltDependencies": []
128 | },
129 | "simple-git-hooks": {
130 | "commit-msg": "npx commitlint --edit",
131 | "pre-commit": "npx nano-staged"
132 | }
133 | }
134 |
--------------------------------------------------------------------------------
/scripts/postinstall:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env node
2 |
3 | 'use strict'
4 |
5 | const { compact, reduce, findIndex } = require('lodash')
6 | const { parseUrl } = require('@metascraper/helpers')
7 | const { writeFile } = require('fs').promises
8 | const topsites = require('top-sites')
9 |
// Sites matched against the full registrable domain
const byDomain = ['abc.net.au', 'x.com']

// Sites matched by name only, whatever their suffix
const byDomainWithoutSuffix = [
  'apple',
  'arxiv',
  'bbc',
  'blogspot',
  'csdn',
  'deviantart',
  'digg',
  'dribbble',
  'engadget',
  'etsy',
  'eventbrite',
  'flickr',
  'ghost',
  'giphy',
  'github',
  'gitlab',
  'google',
  'huffingtonpost',
  'imdb',
  'imgur',
  'instagram',
  'meetup',
  'microsoft',
  'nytimes',
  'pinterest',
  'producthunt',
  'reddit',
  'slideshare',
  'soundcloud',
  'sourceforge',
  'spotify',
  'stackoverflow',
  'substack',
  'techcrunch',
  'telegraph',
  'theguardian',
  'theverge',
  'tumblr',
  'twitter',
  'vimeo',
  'wikipedia',
  'wordpress',
  'ycombinator',
  'yelp',
  'youtube',
  'zoom'
]

// Each entry is a list of `[parsedUrlProperty, expectedValue]` conditions;
// every entry here carries exactly one condition
const domains = [
  ...byDomain.map(value => [['domain', value]]),
  ...byDomainWithoutSuffix.map(value => [['domainWithoutSuffix', value]])
]
60 |
// Entries found in `topsites` are slotted at the index of their match, so
// they keep that list's order; entries with no match are collected in `rest`
const top = new Array(topsites.length)
const rest = []

for (const conditions of domains) {
  let position = -1

  // Use the first condition that matches any top site
  for (const [key, value] of conditions) {
    position = findIndex(
      topsites,
      ({ rootDomain }) => parseUrl(rootDomain)[key] === value
    )
    if (position !== -1) break
  }

  if (position === -1) {
    rest.push(conditions)
  } else {
    top[position] = conditions
  }
}
81 |
// Persist the ordered list: matched top sites first (holes dropped by
// `compact`), then the unmatched entries. The indentation arguments belong
// to `JSON.stringify`, not to `writeFile`.
writeFile(
  './src/auto-domains.json',
  JSON.stringify(compact(top).concat(rest), null, 2)
).catch(error => console.log(error))
85 |
--------------------------------------------------------------------------------
/src/html.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const debug = require('debug-logfmt')('html-get:rewrite')
4 | const { get, castArray, forEach } = require('lodash')
5 | const isLocalAddress = require('is-local-address')
6 | const { TAGS: URL_TAGS } = require('html-urls')
7 | const isHTML = require('is-html-content')
8 | const cssUrl = require('css-url-regex')
9 | const execall = require('execall')
10 | const cheerio = require('cheerio')
11 | const { URL } = require('url')
12 | const path = require('path')
13 |
14 | const {
15 | date: toDate,
16 | isMime,
17 | isUrl,
18 | mimeExtension,
19 | parseUrl
20 | } = require('@metascraper/helpers')
21 |
22 | const { getContentType, getCharset } = require('./util')
23 |
// True unless the given selection (anything exposing `length`) is empty
const has = el => !(el.length === 0)
25 |
// Queue `item` in `collection` only when the selection `el` is empty.
// Returns `false` when nothing was queued, otherwise the result of `push`.
const upsert = (el, collection, item) => {
  if (has(el)) return false
  return collection.push(item)
}
27 |
/**
 * Derive a timestamp from the response headers: `last-modified` wins, then
 * `date`; without either, the `age` header (times 1000) is subtracted from
 * the current time.
 */
const getDate = headers => {
  const explicit = get(headers, 'last-modified') || get(headers, 'date')
  if (explicit) return toDate(explicit)

  const elapsed = Number(get(headers, 'age')) * 1000
  return toDate(Date.now() - elapsed)
}
37 |
/**
 * Complete the document `head` with entries it is missing.
 *
 * Each `upsert` call queues its markup only when the matching selector
 * finds nothing in `head`; the queued tags are appended at the end.
 *
 * NOTE(review): apart from the title, the markup literals below are empty
 * strings as written here — confirm what each one is meant to contain.
 *
 * @param {object} param
 * @param {Function} param.$ - Loaded cheerio document.
 * @param {string} param.url - Document URL; its basename is used for the title.
 * @param {object} param.headers - Response headers, read for charset and date.
 */
const addHead = ({ $, url, headers }) => {
  const tags = []
  const charset = getCharset(headers)
  const date = getDate(headers)
  const { domain } = parseUrl(url)
  const head = $('head')

  // Title fallback, derived from the last path segment of the URL
  upsert(head.find('title'), tags, `${path.basename(url)}`)

  // Site name, only when a domain could be parsed out of the URL
  if (domain) {
    upsert(
      head.find('meta[property="og:site_name"]'),
      tags,
      ``
    )
  }

  // Publication time, only when a date could be inferred from the headers
  if (date) {
    upsert(
      head.find('meta[property="article:published_time"]'),
      tags,
      ``
    )
  }

  // Canonical link
  upsert(
    head.find('link[rel="canonical"]'),
    tags,
    ``
  )

  // Charset declaration, only when the headers expose one
  if (charset) {
    upsert(head.find('meta[charset]'), tags, ``)
  }

  tags.forEach(tag => head.append(tag))
}
75 |
/**
 * Build the markup used when the fetched payload is not HTML, choosing the
 * element by content type: image, video, audio or JSON. Any other content
 * type leaves `element` empty.
 *
 * NOTE(review): the markup inside the template literals below looks
 * incomplete as written here — confirm the intended tags.
 *
 * @param {object} param
 * @param {string} param.url - Resource URL.
 * @param {object} param.headers - Response headers, read for the content type.
 * @param {string} param.html - Raw payload, interpolated for JSON content.
 * @returns {string} The generated markup.
 */
const addBody = ({ url, headers, html }) => {
  const contentType = getContentType(headers)
  let element = ''

  if (isMime(contentType, 'image')) {
    element = `
`
  } else if (isMime(contentType, 'video')) {
    element = ``
  } else if (isMime(contentType, 'audio')) {
    element = ``
  } else if (mimeExtension(contentType) === 'json') {
    element = `${html}
`
  }

  return `${element}`
}
92 |
// True when the attribute value starts with one of the `og:`, `fb:` or
// `al:` prefixes; a missing value counts as no match
const isOpenGraph = (prop = '') => {
  for (const prefix of ['og:', 'fb:', 'al:']) {
    if (prop.startsWith(prefix)) return true
  }
  return false
}
95 |
/**
 * Normalize which attribute (`name` or `property`) each `meta` tag uses.
 * Tags without a `content` value are left alone.
 */
const rewriteMetaTags = ({ $ }) => {
  $('meta').each((_, node) => {
    const meta = $(node)
    if (!meta.attr('content')) return

    const name = meta.attr('name')
    const property = meta.attr('property')

    // Open Graph keys declared through `name` move over to `property`
    const isMisplacedOpenGraph = property !== name && isOpenGraph(name)
    if (isMisplacedOpenGraph) {
      meta.removeAttr('name').attr('property', name)
      debug('og', meta.attr())
      return
    }

    // Non Open Graph keys declared through `property` move over to `name`
    if (property && !isOpenGraph(property)) {
      meta.removeAttr('property').attr('name', property)
      debug('meta', meta.attr())
    }
  })
}
115 |
/**
 * Resolve URL-bearing attributes against the document `url`.
 *
 * For every attribute/tag pairing in `URL_TAGS`, http(s) values are
 * rewritten to their absolute form; elements pointing at a local address
 * are removed. Values that fail to parse are left untouched.
 */
const rewriteHtmlUrls = ({ $, url }) => {
  forEach(URL_TAGS, (tagNames, urlAttr) => {
    const selector = tagNames.join(',')

    $(selector).each((_, node) => {
      const el = $(node)
      const value = el.attr(urlAttr)
      if (typeof value !== 'string') return

      try {
        const target = new URL(value, url)
        if (target.protocol.startsWith('http')) {
          if (isLocalAddress(target.hostname)) {
            el.remove()
          } else {
            el.attr(urlAttr, target.toString())
          }
        }
      } catch (_) {}
    })
  })
}
134 |
/**
 * Replace every `url(...)` reference starting with `/` found in
 * `stylesheet` by its absolute form, resolved against `url`.
 *
 * @returns {string} The stylesheet with the rewritten references.
 */
const replaceCssUrls = (url, stylesheet) => {
  // Collect each distinct reference once
  const references = new Set()
  for (const match of execall(cssUrl(), stylesheet)) {
    for (const subMatch of match.subMatches) {
      references.add(subMatch)
    }
  }

  let output = stylesheet

  for (const reference of references) {
    if (!reference.startsWith('/')) continue
    try {
      const absoluteUrl = new URL(reference, url).toString()
      output = output.replaceAll(`url(${reference})`, `url(${absoluteUrl})`)
    } catch (_) {}
  }

  return output
}
158 |
/**
 * Run `replaceCssUrls` over every place inline CSS can live in the
 * document, returning the same `$` for chaining.
 */
const rewriteCssUrls = ({ $, url }) => {
  // Embedded stylesheets: the content of every `style` element
  $('style').each((_, node) => {
    const styleTag = $(node)
    styleTag.html(replaceCssUrls(url, styleTag.html()))
  })

  // Inline declarations: the `style` attribute of any element
  $('[style]').each((_, node) => {
    const styled = $(node)
    styled.attr('style', replaceCssUrls(url, styled.attr('style')))
  })

  return $
}
174 |
/**
 * Append one entry to `head` per style given. `styles` may be a single
 * value or an array; each value is classified with `isUrl` to pick which
 * markup is appended.
 *
 * NOTE(review): both markup literals are empty strings as written here —
 * confirm what each branch is meant to emit.
 */
const injectStyle = ({ $, styles }) =>
  castArray(styles).forEach(style =>
    $('head').append(
      isUrl(style)
        ? ``
        : ``
    )
  )
183 |
/**
 * Append one entry to `head` per script given. `scripts` may be a single
 * value or an array; each value is classified with `isUrl` to pick which
 * markup is appended.
 *
 * NOTE(review): both markup literals are empty strings as written here and
 * `type` is never read — confirm the intended markup.
 */
const injectScripts = ({ $, scripts, type }) =>
  castArray(scripts).forEach(script =>
    $('head').append(
      isUrl(script)
        ? ``
        : ``
    )
  )
192 |
193 | const addDocType = html =>
194 | html.startsWith('${html}`
195 |
196 | module.exports = ({
197 | html,
198 | url,
199 | headers = {},
200 | styles,
201 | hide,
202 | remove,
203 | rewriteUrls,
204 | rewriteHtml,
205 | scripts,
206 | modules
207 | }) => {
208 | const content = addDocType(
209 | isHTML(html) ? html : addBody({ url, headers, html })
210 | )
211 |
212 | const $ = cheerio.load(content)
213 |
214 | if (rewriteUrls) rewriteHtmlUrls({ $, url })
215 |
216 | if (rewriteHtml) rewriteMetaTags({ $, url })
217 |
218 | addHead({ $, url, headers })
219 |
220 | if (styles) injectStyle({ $, styles })
221 |
222 | if (hide) {
223 | injectStyle({
224 | $,
225 | styles: `${castArray(hide).join(', ')} { visibility: hidden !important; }`
226 | })
227 | }
228 |
229 | if (remove) {
230 | injectStyle({
231 | $,
232 | styles: `${castArray(remove).join(', ')} { display: none !important; }`
233 | })
234 | }
235 |
236 | if (scripts) injectScripts({ $, scripts, type: 'text/javascript' })
237 | if (modules) injectScripts({ $, modules, type: 'module' })
238 |
239 | return rewriteUrls ? rewriteCssUrls({ $, url }) : $
240 | }
241 |
242 | module.exports.getDate = getDate
243 |
--------------------------------------------------------------------------------
/src/index.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const { parseUrl, isMediaUrl, isPdfUrl } = require('@metascraper/helpers')
4 | const { readFile, writeFile } = require('fs/promises')
5 | const timeSpan = require('@kikobeats/time-span')()
6 | const debug = require('debug-logfmt')('html-get')
7 | const { execSync } = require('child_process')
8 | const PCancelable = require('p-cancelable')
9 | const { AbortError } = require('p-retry')
10 | const htmlEncode = require('html-encode')
11 | const crypto = require('crypto')
12 | const $ = require('tinyspawn')
13 | const path = require('path')
14 | const got = require('got')
15 | const os = require('os')
16 |
17 | const { getContentLength, getContentType } = require('./util')
18 | const autoDomains = require('./auto-domains')
19 | const addHtml = require('./html')
20 |
// Default `timeout` for both strategies — presumably milliseconds, since it
// is handed to got as-is (TODO confirm)
const REQ_TIMEOUT = 8000

// Default `abortTypes` forwarded to browserless under the prerender strategy
const ABORT_TYPES = ['image', 'stylesheet', 'font']

// Content length above which mutool writes its output to a file instead of
// returning it through stdout
const PDF_SIZE_TRESHOLD = 150 * 1024 // 150kb
26 |
/**
 * Cancelable `fetch` strategy: request `url` with got and resolve with
 * `{ headers, html, mode, url, statusCode, redirects }`.
 *
 * PDF responses are converted through `mutool` when one is provided; other
 * bodies go through `toEncode`, except media URLs not served as
 * `text/html`, which are stringified as-is.
 *
 * Failures never reject. With `reflect` they resolve to
 * `{ isRejected: true, error }`; otherwise to a payload with empty `html`
 * plus whatever headers and status code the error response carried.
 */
const fetch = PCancelable.fn(
  async (
    url,
    {
      getTemporalFile,
      mutool,
      reflect = false,
      timeout = REQ_TIMEOUT,
      toEncode,
      ...opts
    },
    onCancel
  ) => {
    // In `reflect` mode only half of the timeout budget is spent here
    const reqTimeout = reflect ? timeout / 2 : timeout

    const req = got(url, {
      ...opts,
      timeout: reqTimeout,
      responseType: 'buffer'
    })

    // Ask p-cancelable not to reject the promise when it gets canceled
    onCancel.shouldReject = false

    onCancel(() => {
      debug('fetch:cancel', { url, reflect })
      req.cancel()
    })

    // Record every hop reported by got while following redirects
    const redirects = []
    req.on('redirect', res =>
      redirects.push({ statusCode: res.statusCode, url: res.url })
    )

    try {
      const res = await req

      const html = await (async () => {
        const contentType = getContentType(res.headers)

        if (mutool && contentType === 'application/pdf') {
          // mutool works on files, so the PDF is written to disk first
          const file = getTemporalFile(url, 'pdf')
          await writeFile(file.path, res.body)
          if (getContentLength(res.headers) > PDF_SIZE_TRESHOLD) {
            // Above the threshold: render into a second file and read it back
            const ofile = getTemporalFile(`${url}-pdf`, 'pdf')
            await mutool(`-o ${ofile.path} ${file.path}`)
            return readFile(ofile.path, 'utf-8')
          } else {
            // Otherwise take the markup straight from stdout
            const { stdout } = await mutool(file.path)
            return stdout
          }
        }

        return contentType === 'text/html' || !isMediaUrl(url)
          ? await toEncode(res.body, res.headers['content-type'])
          : res.body.toString()
      })()

      return {
        headers: res.headers,
        html,
        mode: 'fetch',
        url: res.url,
        statusCode: res.statusCode,
        redirects
      }
    } catch (error) {
      debug('fetch:error', { url, message: error.message || error, reflect })
      return reflect
        ? { isRejected: true, error }
        : {
            url,
            html: '',
            mode: 'fetch',
            headers: error.response ? error.response.headers : {},
            statusCode: error.response ? error.response.statusCode : undefined,
            redirects
          }
    }
  }
)
107 |
/**
 * Cancelable `prerender` strategy: load `url` through browserless while a
 * reflected `fetch` of the same URL runs alongside as a fallback.
 *
 * - Prerender succeeds: the fetch is canceled and the browser payload is
 *   returned.
 * - Prerender fails: the fetch outcome is awaited; its payload is returned,
 *   or an empty-html payload when the fetch was rejected as well.
 */
const prerender = PCancelable.fn(
  async (
    url,
    {
      abortTypes = ABORT_TYPES,
      getBrowserless,
      gotOpts,
      headers,
      timeout = REQ_TIMEOUT,
      toEncode,
      ...opts
    },
    onCancel
  ) => {
    let fetchRes
    let data = {}
    let isFetchResRejected = false

    // `fetchRes` is assigned synchronously right below
    onCancel(() => fetchRes.cancel())

    try {
      // Start the fallback request first so it runs next to the browser
      fetchRes = fetch(url, {
        reflect: true,
        toEncode,
        ...gotOpts,
        headers,
        timeout
      })
      const browserless = await getBrowserless()

      const getPayload = browserless.evaluate(
        async (page, response) => {
          if (!response) throw new AbortError('empty response')

          return {
            headers: response.headers(),
            html: await page.content(),
            mode: 'prerender',
            url: response.url(),
            statusCode: response.status(),
            redirects: response
              .request()
              .redirectChain()
              .map(req => ({
                statusCode: req.response().status(),
                url: req.url()
              }))
          }
        },
        {
          timeout,
          headers,
          abortTypes
        }
      )

      const payload = await getPayload(url, opts)
      // The browser won: the fallback request is no longer needed
      await fetchRes.cancel()
      debug('prerender', { url, state: 'success' })
      return payload
    } catch (err) {
      // The browser failed: fall back to whatever the fetch produced
      const { isRejected, ...dataProps } = await fetchRes

      debug('prerender:error', {
        url,
        isRejected,
        error: err.message
      })

      isFetchResRejected = isRejected
      data = dataProps
    }

    // Both strategies failed: answer with an empty document
    return isFetchResRejected
      ? {
          headers: data.headers || {},
          html: '',
          url,
          mode: 'prerender'
        }
      : data
  }
)
191 |
192 | const modes = { fetch, prerender }
193 |
/**
 * Tell whether `url` matches one of the condition sets in `autoDomains`.
 *
 * A condition set matches when every `[prop, value]` pair it contains is
 * strictly equal to the same-named property of the parsed URL.
 *
 * @param {string} url
 * @returns {boolean}
 */
const isFetchMode = url => {
  const parsed = parseUrl(url)
  for (const conditions of autoDomains) {
    const matchesAll = conditions.every(
      ([prop, value]) => parsed[prop] === value
    )
    if (matchesAll) return true
  }
  return false
}
200 |
/**
 * Default strategy for choosing between `'fetch'` and `'prerender'`.
 *
 * - `prerender: false`, media URLs and PDF URLs always use `'fetch'`.
 * - `prerender: true` uses `'prerender'`.
 * - Any other value defers to `isFetchMode(url)`.
 *
 * @param {string} url
 * @param {object} opts
 * @param {boolean|*} opts.prerender
 * @returns {'fetch'|'prerender'}
 */
const defaultGetMode = (url, { prerender }) => {
  const mustFetch = prerender === false || isMediaUrl(url) || isPdfUrl(url)
  if (mustFetch) return 'fetch'
  if (prerender === true) return 'prerender'
  if (isFetchMode(url)) return 'fetch'
  return 'prerender'
}
206 |
/**
 * Build a deterministic file path inside the OS temporary directory.
 *
 * The file name is the hex SHA-256 digest of `input`, optionally followed by
 * `.${ext}`. A missing, `null` or empty `ext` yields a name without extension
 * (previously only `undefined` did, so `null` produced `<hash>.null`).
 *
 * @param {string|Buffer} input - Value hashed to derive the file name.
 * @param {string} [ext] - File extension, without the leading dot.
 * @returns {{ path: string }}
 */
const defaultGetTemporalFile = (input, ext) => {
  const hash = crypto.createHash('sha256').update(input).digest('hex')
  const hasExtension = ext !== undefined && ext !== null && ext !== ''
  const filename = hasExtension ? `${hash}.${ext}` : hash
  return { path: path.join(os.tmpdir(), filename) }
}
215 |
/**
 * Locate the `mutool` binary and return a function that runs
 * `mutool draw -q -F html <args>` through `$`.
 *
 * Detection is best-effort: when `which mutool` fails (binary missing, or no
 * `which` command available) the lookup error is intentionally ignored and
 * `undefined` is returned.
 *
 * Arguments are joined with a single space. Interpolating the rest array
 * directly would join them with commas as soon as more than one is passed.
 *
 * @returns {((...args: string[]) => *)|undefined}
 */
const defaultMutool = () => {
  try {
    const mutoolPath = execSync('which mutool', {
      // stderr is discarded so a missing binary does not print anything.
      stdio: ['pipe', 'pipe', 'ignore']
    })
      .toString()
      .trim()
    return (...args) => $(`${mutoolPath} draw -q -F html ${args.join(' ')}`)
  } catch (_) {
    return undefined
  }
}
227 |
/**
 * Run the requested mode (`modes[mode]`) for `url` and attach a `$` property
 * built by `addHtml` to the resulting content.
 *
 * In fetch mode `gotOpts` are spread into the mode options and
 * `puppeteerOpts` are spread into the `addHtml` input; otherwise
 * `puppeteerOpts` are spread into the mode options and `gotOpts` is passed
 * through as a single property.
 *
 * @param {string} url
 * @param {string} mode - Key of `modes`.
 * @param {object} opts
 * @param {Function} onCancel - Cancellation hook supplied by `PCancelable.fn`.
 * @returns {Promise<object>} The mode result plus `$`.
 */
const getContent = PCancelable.fn(
  (
    url,
    mode,
    {
      getBrowserless,
      getTemporalFile,
      gotOpts,
      headers,
      mutool,
      puppeteerOpts,
      rewriteUrls,
      rewriteHtml,
      toEncode
    },
    onCancel
  ) => {
    const usesFetch = mode === 'fetch'

    let modeOpts
    if (usesFetch) {
      modeOpts = { headers, toEncode, mutool, getTemporalFile, ...gotOpts }
    } else {
      modeOpts = { headers, toEncode, getBrowserless, gotOpts, ...puppeteerOpts }
    }

    const request = modes[mode](url, modeOpts)
    // Propagate cancellation to the underlying mode request.
    onCancel(() => request.cancel())

    return request.then(content => {
      const dom = addHtml({
        ...content,
        ...(usesFetch ? puppeteerOpts : undefined),
        rewriteUrls,
        rewriteHtml
      })

      return { ...content, $: dom }
    })
  }
)
266 |
267 | module.exports = PCancelable.fn(
268 | async (
269 | targetUrl,
270 | {
271 | encoding = 'utf-8',
272 | getBrowserless,
273 | getMode = defaultGetMode,
274 | getTemporalFile = defaultGetTemporalFile,
275 | gotOpts,
276 | headers,
277 | mutool = defaultMutool(),
278 | prerender = 'auto',
279 | puppeteerOpts,
280 | rewriteHtml = false,
281 | rewriteUrls = false,
282 | serializeHtml = $ => ({ html: $.html() })
283 | } = {},
284 | onCancel
285 | ) => {
286 | if (!getBrowserless && prerender !== false) {
287 | throw TypeError(
288 | "Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
289 | )
290 | }
291 |
292 | const toEncode = htmlEncode(encoding)
293 | const reqMode = getMode(targetUrl, { prerender })
294 |
295 | const duration = timeSpan()
296 |
297 | const promise = getContent(targetUrl, reqMode, {
298 | getBrowserless,
299 | getTemporalFile,
300 | gotOpts,
301 | headers,
302 | mutool,
303 | puppeteerOpts,
304 | rewriteUrls,
305 | rewriteHtml,
306 | toEncode
307 | })
308 |
309 | onCancel(() => promise.cancel())
310 |
311 | const { mode, $, ...payload } = await promise
312 |
313 | return Object.assign(payload, {
314 | ...serializeHtml($),
315 | stats: { mode, timing: duration() }
316 | })
317 | }
318 | )
319 |
320 | module.exports.REQ_TIMEOUT = REQ_TIMEOUT
321 | module.exports.ABORT_TYPES = ABORT_TYPES
322 | module.exports.PDF_SIZE_TRESHOLD = PDF_SIZE_TRESHOLD
323 | module.exports.isFetchMode = isFetchMode
324 | module.exports.getContent = getContent
325 | module.exports.defaultMutool = defaultMutool
326 |
--------------------------------------------------------------------------------
/src/util.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const NullProtoObj = require('null-prototype-object')
4 | const { parse } = require('content-type')
5 |
6 | const CACHE = new NullProtoObj()
7 |
/**
 * Parse a `content-type` header value.
 *
 * Non-string values and values that `parse` rejects both produce the same
 * "unknown" shape. `parse` throws on malformed input, and header values come
 * from remote servers, so a broken header is treated like a missing one.
 *
 * @param {*} contentType - Raw header value.
 * @returns {{ type: (string|undefined), parameters: object }}
 */
const parseContentType = contentType => {
  if (typeof contentType !== 'string') {
    return { type: undefined, parameters: {} }
  }
  try {
    return parse(contentType)
  } catch (_) {
    return { type: undefined, parameters: {} }
  }
}
12 |
/**
 * Return the parsed `content-type` of `headers`, memoized in `CACHE` by the
 * raw header value.
 *
 * @param {object} headers
 * @returns {{ type: (string|undefined), parameters: object }}
 */
const contentType = headers => {
  const rawValue = headers['content-type']
  const cached = CACHE[rawValue]
  if (cached) return cached

  const parsed = parseContentType(rawValue)
  CACHE[rawValue] = parsed
  return parsed
}
19 |
/**
 * Media type of the `content-type` header, or `undefined` when unavailable.
 *
 * @param {object} headers
 * @returns {string|undefined}
 */
const getContentType = headers => {
  const { type } = contentType(headers)
  return type
}
21 |
/**
 * Lower-cased `charset` parameter of the `content-type` header, or
 * `undefined` when the parameter is absent.
 *
 * @param {object} headers
 * @returns {string|undefined}
 */
const getCharset = headers => {
  const { charset } = contentType(headers).parameters
  return charset?.toLowerCase()
}
24 |
/**
 * Numeric value of the `content-length` header (`NaN` when it is missing).
 *
 * @param {object} headers
 * @returns {number}
 */
const getContentLength = headers => {
  const { 'content-length': rawLength } = headers
  return Number(rawLength)
}
26 |
27 | module.exports = {
28 | getCharset,
29 | getContentLength,
30 | getContentType
31 | }
32 |
--------------------------------------------------------------------------------
/test/auto-domains.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const test = require('ava')
4 |
5 | const autoDomains = require('../src/auto-domains.json')
6 |
// Checks the value of the first condition of the first rule in `autoDomains`.
test('domains are sorted by popularity', t => {
  const [firstRule] = autoDomains
  const [[, firstValue]] = firstRule
  t.true(['youtube', 'google'].includes(firstValue))
})
10 |
--------------------------------------------------------------------------------
/test/encoding.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const test = require('ava')
4 |
5 | const { runFixtureServer, initBrowserless } = require('./helpers')
6 | const getHTML = require('..')
7 |
8 | const getBrowserless = initBrowserless(test)
9 |
// Register one test per (mode, encoding) pair: every fixture must come back
// with its expected text readable in the resulting `html`.
for (const prerender of [false, true]) {
  const mode = prerender ? 'prerender' : 'fetch'

  const encodings = [
    ['Shift-JIS', '51242_54045.html', '或る日の小せん'],
    ['Windows-1250', 'rp.pl.html', 'majątków'],
    ['UTF-8', 'utf8.with.meta.html', '日本語']
  ]

  for (const [encoding, fixtureName, expectedText] of encodings) {
    test(`${mode} » ${encoding}`, async t => {
      const url = await runFixtureServer(t, fixtureName)
      const { html } = await getHTML(url, { prerender, getBrowserless })
      t.true(html.includes(expectedText))
    })
  }
}
31 |
--------------------------------------------------------------------------------
/test/fixtures/51242_54045.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/fixtures/51242_54045.html
--------------------------------------------------------------------------------
/test/fixtures/browserless.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 | browserless, a puppeter-like Node.js library for interacting with Headless production scenarios.
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
--------------------------------------------------------------------------------
/test/fixtures/rp.pl.html:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/fixtures/rp.pl.html
--------------------------------------------------------------------------------
/test/fixtures/utf8.with.meta.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | 日本語
7 |
8 |
9 |
--------------------------------------------------------------------------------
/test/helpers.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const { default: listen } = require('async-listen')
4 | const createBrowserless = require('browserless')
5 | const dateRegex = require('regex-iso-date')
6 | const { createServer } = require('http')
7 | const pretty = require('pretty')
8 | const path = require('path')
9 | const fs = require('fs')
10 |
/**
 * Curried header factory: `createHeaders(name)(value)` returns an object
 * holding the single own property `name` set to `value`.
 *
 * @param {string} name - Header name.
 * @returns {(contentType: *) => object}
 */
const createHeaders = name => contentType =>
  Object.fromEntries([[name, contentType]])
14 |
/**
 * Promise-based wrapper around `server.close`.
 *
 * @param {{ close: Function }} server
 * @returns {Promise<*>} Settles according to the callback passed to `close`.
 */
const closeServer = server => {
  const { promisify } = require('util')
  const close = promisify(server.close.bind(server))
  return close()
}
17 |
/**
 * Read a file from the `fixtures` directory next to this module.
 *
 * @param {string} name - File name inside `fixtures/`.
 * @returns {Buffer} Raw file content (no encoding is applied).
 */
const fixture = name => {
  const filepath = path.join(__dirname, '/fixtures/', name)
  return fs.readFileSync(filepath)
}
20 |
/**
 * Create a browserless factory for a test file, register its `close` as an
 * `after.always` hook and return a function that creates a context from it.
 *
 * @param {object} test - The AVA `test` object.
 * @returns {() => *} Calls `createContext()` on the shared factory.
 */
const initBrowserless = test => {
  const factory = createBrowserless()
  test.after.always(factory.close)
  return () => {
    return factory.createContext()
  }
}
26 |
/**
 * Start an HTTP server with `fn` as request handler and schedule its shutdown
 * on test teardown.
 *
 * @param {object} t - AVA execution context (its `teardown` is used).
 * @param {Function} fn - Request listener given to `createServer`.
 * @returns {Promise<*>} Whatever `listen(server)` resolves to.
 */
const runServer = async (t, fn) => {
  const httpServer = createServer(fn)
  const address = await listen(httpServer)
  const shutdown = () => closeServer(httpServer)
  t.teardown(shutdown)
  return address
}
33 |
/**
 * Start a server that answers every request with the given fixture, served
 * with a `content-type: text/html` header.
 *
 * @param {object} t - AVA execution context.
 * @param {string} fixturePath - File name inside `fixtures/`.
 * @returns {Promise<*>} The value resolved by `runServer`.
 */
const runFixtureServer = async (t, fixturePath) => {
  const serveFixture = (_, res) => {
    res.setHeader('content-type', 'text/html')
    res.end(fixture(fixturePath))
  }
  return runServer(t, serveFixture)
}
39 |
/**
 * Format markup with `pretty` (`ocd: true`) and replace what `dateRegex()`
 * matches with the `{DATE}` placeholder.
 *
 * @param {string} html
 * @returns {string}
 */
const prettyHtml = html => {
  const formatted = pretty(html, { ocd: true })
  return formatted.replace(dateRegex(), '{DATE}')
}
42 |
43 | module.exports = {
44 | createHeaders,
45 | initBrowserless,
46 | prettyHtml,
47 | runFixtureServer,
48 | runServer
49 | }
50 |
--------------------------------------------------------------------------------
/test/html/get-date.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const test = require('ava')
4 |
5 | const { getDate } = require('../../src/html')
6 |
// `last-modified` is returned as an ISO 8601 string.
test('from `last-modified`', t => {
  const headers = { 'last-modified': 'Fri, 04 Aug 2023 21:10:56 GMT' }
  t.is(getDate(headers), '2023-08-04T21:10:56.000Z')
})
11 |
// The test is titled after the `date` header, so that is the header it has to
// send; it previously passed `last-modified`, duplicating the test above.
// NOTE(review): assumes `getDate` reads `headers.date` — confirm in src/html.
test('from `date`', t => {
  const date = getDate({ date: 'Sat, 05 Aug 2023 09:43:59 GMT' })
  t.is(date, '2023-08-05T09:43:59.000Z')
})
16 |
// An `age` header yields some date; no usable header yields `undefined`.
test('from `age`', t => {
  const fromAge = getDate({ age: '1884' })
  t.truthy(fromAge)

  const fromNoHeaders = getDate({})
  t.is(fromNoHeaders, undefined)
})
27 |
--------------------------------------------------------------------------------
/test/html/index.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const cheerio = require('cheerio')
4 | const test = require('ava')
5 |
6 | const { prettyHtml } = require('../helpers')
7 |
// Calls the module under test and serializes its result via `.html()`.
const html = (...args) => {
  const addHtml = require('../../src/html')
  return addHtml(...args).html()
}
9 |
// Empty markup and no headers: snapshot whatever skeleton gets generated.
test('add minimal html markup', t => {
  const input = { url: 'https://kikobeats.com', html: '', headers: {} }
  t.snapshot(prettyHtml(html(input)))
})
19 |
// Same as the minimal case, but the response declares a charset.
test('add meta charset', t => {
  const input = {
    url: 'https://kikobeats.com',
    html: '',
    headers: { 'content-type': 'text/html; charset=utf-8' }
  }
  t.snapshot(prettyHtml(html(input)))
})
29 |
30 | test('add doctype', t => {
31 | const output = html({
32 | url: 'https://kikobeats.com',
33 | html: `
34 |
35 |
36 | kikobeats.com
37 |
38 |
39 |
40 |
41 |
42 | `,
43 | headers: { 'content-type': 'text/html; charset=utf-8' }
44 | })
45 |
46 | t.snapshot(prettyHtml(output))
47 | })
48 |
49 | test('add json markup', t => {
50 | const output = html({
51 | html: '{"origin":"83.46.149.83","city":"Madrid","alpha2":"ES","alpha3":"ESP","callingCodes":["+34"],"currencies":{"EUR":{"name":"Euro","symbol":"€"}},"eeaMember":true,"euMember":true,"flag":"🇪🇸","languages":{"spa":"Spanish"},"numeric":724,"tld":[".es"],"region":"MD","latitude":"40.4163","longitude":"-3.6934","timezone":"Europe/Madrid","headers":{"accept":"*/*","accept-encoding":"gzip","cdn-loop":"cloudflare","cf-connecting-ip":"83.46.149.83","cf-ipcountry":"ES","cf-ray":"73a29be38cdf37c7-MAD","cf-visitor":"{"scheme":"https"}","connection":"Keep-Alive","host":"geolocation.microlink.io","user-agent":"curl/7.79.1","x-forwarded-for":"172.70.57.171","x-forwarded-host":"geolocation.microlink.io","x-forwarded-proto":"https","x-real-ip":"172.70.57.171","x-vercel-edge-region":"dev","x-vercel-id":"cdg1::x96k9-1660405852783-a0083d276cde","x-vercel-ip-city":"Madrid","x-vercel-ip-country":"ES","x-vercel-ip-country-region":"MD","x-vercel-ip-latitude":"40.4163","x-vercel-ip-longitude":"-3.6934","x-vercel-ip-timezone":"Europe/Madrid","x-vercel-proxied-for":"172.70.57.171"}}',
52 | url: 'https://geolocation.microlink.io/',
53 | headers: { 'content-type': 'application/json' }
54 | })
55 |
56 | t.snapshot(prettyHtml(output))
57 | })
58 |
// No `html` is provided: the markup is derived from an `image/gif` response.
test('add image markup', t => {
  const input = {
    url: 'https://media.giphy.com/media/LqTSLCsIIkCTvQ8X9g/giphy.gif',
    headers: { 'content-type': 'image/gif' }
  }
  t.snapshot(prettyHtml(html(input)))
})
67 |
// No `html` is provided: the markup is derived from an `audio/mp3` response.
test('add audio markup', t => {
  const input = {
    url: 'http://websrvr90va.audiovideoweb.com/va90web25003/companions/Foundations%20of%20Rock/13.01.mp3',
    headers: { 'content-type': 'audio/mp3' }
  }
  t.snapshot(prettyHtml(html(input)))
})
76 |
// No `html` is provided: the markup is derived from a `video/mp4` response.
test('add video markup', t => {
  const input = {
    url: 'https://sample-videos.com/video123/mp4/720/big_buck_bunny_720p_1mb.mp4',
    headers: { 'content-type': 'video/mp4' }
  }
  t.snapshot(prettyHtml(html(input)))
})
85 |
86 | test('styles injection', t => {
87 | const output = html({
88 | url: 'https://kikobeats.com',
89 | html: `
90 |
91 |
92 |
93 |
94 | Document
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 | `,
103 | styles: [
104 | 'https://necolas.github.io/normalize.css/8.0.1/normalize.css',
105 | 'body { background: black; }'
106 | ]
107 | })
108 |
109 | t.true(
110 | output.includes(
111 | ''
112 | )
113 | )
114 |
115 | t.true(output.includes('background: black'))
116 |
117 | t.snapshot(prettyHtml(output))
118 | })
119 |
120 | test('scripts injection', t => {
121 | const output = html({
122 | url: 'https://kikobeats.com',
123 | html: `
124 |
125 |
126 |
127 |
128 | Document
129 |
130 |
131 |
132 |
133 |
134 |
135 |
136 | `,
137 | scripts: [
138 | `
139 | ;(function mutateWindow () {
140 | const iframe = document.createElement('iframe')
141 | iframe.style.display = 'none'
142 | document.body.appendChild(iframe)
143 |
144 | const a = Object.getOwnPropertyNames(iframe.contentWindow)
145 | const b = Object.getOwnPropertyNames(window)
146 |
147 | const diffKeys = b.filter(c => !a.includes(c))
148 | const diffObj = {}
149 | diffKeys.forEach(key => (diffObj[key] = window[key]))
150 |
151 | console.log('Found', diffKeys.length, 'keys mutates on window')
152 | copy(diffObj)
153 | console.log('Copied to clipboard!')
154 | })()`,
155 | 'https://code.jquery.com/jquery-3.5.1.min.js'
156 | ]
157 | })
158 |
159 | t.true(output.includes('mutateWindow'))
160 |
161 | t.true(
162 | output.includes(
163 | ''
164 | )
165 | )
166 |
167 | t.snapshot(prettyHtml(output))
168 | })
169 |
170 | test('hide elements', t => {
171 | const output = html({
172 | url: 'https://kikobeats.com',
173 | html: `
174 |
175 |
176 |
177 |
178 | Document
179 |
180 |
181 |
182 |
183 |
184 |
185 |
186 | `,
187 | hide: '#banner'
188 | })
189 |
190 | t.true(output.includes('#banner { visibility: hidden !important; }'))
191 | t.snapshot(prettyHtml(output))
192 | })
193 |
194 | test('remove elements', t => {
195 | const output = html({
196 | url: 'https://kikobeats.com',
197 | html: `
198 |
199 |
200 |
201 |
202 | Document
203 |
204 |
205 |
206 |
207 |
208 |
209 |
210 | `,
211 | remove: '#banner'
212 | })
213 |
214 | t.true(output.includes('#banner { display: none !important; }'))
215 | t.snapshot(prettyHtml(output))
216 | })
217 |
// An IP-address URL gets no `og:site_name`; a regular hostname gets one.
test('add `og:site_name` when is possible', t => {
  const siteNameFor = url => {
    const $ = cheerio.load(html({ url, html: '', headers: {} }))
    return $('meta[property="og:site_name"]').attr('content')
  }

  t.is(siteNameFor('https://1.1.1.1'), undefined)
  t.is(siteNameFor('https://kikobeats.com'), 'kikobeats.com')
})
236 |
--------------------------------------------------------------------------------
/test/html/rewrite-css-urls.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const execall = require('execall')
4 | const test = require('ava')
5 |
6 | const { prettyHtml } = require('../helpers')
7 |
// Calls the module under test and serializes its result via `.html()`.
const html = (...args) => {
  const addHtml = require('../../src/html')
  return addHtml(...args).html()
}
9 |
10 | test("don't modify html markup", t => {
11 | const output = html({
12 | rewriteUrls: true,
13 | url: 'https://www.rubiomonocoatusa.com/blogs/blog/how-to-apply-oil-plus-2c-to-furniture',
14 | html: `
15 |
16 |
17 |
18 |
19 |
20 |
21 | `,
22 | headers: {
23 | 'content-type': 'text/html; charset=utf-8'
24 | }
25 | })
26 |
27 | t.snapshot(prettyHtml(output))
28 | })
29 |
30 | test('rewrites relative URLs inside stylesheet', t => {
31 | const output = html({
32 | rewriteUrls: true,
33 | url: 'https://kikobeats.com',
34 | html: `
35 |
36 |
37 |
38 |
39 |
40 |
41 | `,
42 | headers: {
43 | 'content-type': 'text/html; charset=utf-8'
44 | }
45 | })
46 |
47 | const results = execall(
48 | /https:\/\/kikobeats.com\/images\/microlink\.jpg/g,
49 | output
50 | )
51 |
52 | t.is(results.length, 2)
53 | t.snapshot(prettyHtml(output))
54 | })
55 |
--------------------------------------------------------------------------------
/test/html/rewrite-html.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const test = require('ava')
4 | const cheerio = require('cheerio')
5 |
6 | const { prettyHtml } = require('../helpers')
7 |
// Calls the module under test and serializes its result via `.html()`.
const html = (...args) => {
  const addHtml = require('../../src/html')
  return addHtml(...args).html()
}
9 |
10 | const composeHtml = meta =>
11 | prettyHtml(`
12 |
13 |
14 |
15 | kikobeats.com
16 |
17 |
18 | ${meta.join('\n')}
19 |
20 |
21 | `)
22 |
23 | ;['fb', 'al'].forEach(prefix => {
24 | test(`treat '${prefix}:' following 'og:' spec`, t => {
25 | const output = html({
26 | rewriteHtml: true,
27 | url: 'https://kikobeats.com',
28 | html: composeHtml([
29 | ``
30 | ]),
31 | headers: { 'content-type': 'text/html; charset=utf-8' }
32 | })
33 |
34 | const $ = cheerio.load(output)
35 | t.is(
36 | $(`meta[property="${prefix}:ios:url"]`).attr('content'),
37 | 'applinks://docs'
38 | )
39 | t.is($(`meta[name="${prefix}:ios:url"]`).attr('content'), undefined)
40 | })
41 | })
42 | ;['twitter', 'fb', 'al', 'og'].forEach(prefix => {
43 | test(`don't rewrite '${prefix}:' if content is empty`, t => {
44 | const output = html({
45 | rewriteHtml: true,
46 | url: 'https://kikobeats.com',
47 | html: composeHtml([``]),
48 | headers: { 'content-type': 'text/html; charset=utf-8' }
49 | })
50 |
51 | const $ = cheerio.load(output)
52 | t.is($(`meta[name="${prefix}:ios:url"]`).attr('content'), '')
53 | t.is($(`meta[property="${prefix}:ios:url"]`).attr('content'), undefined)
54 | })
55 | })
56 |
57 | test("don't rewrite meta if content is empty", t => {
58 | const output = html({
59 | rewriteHtml: true,
60 | url: 'https://kikobeats.com',
61 | html: composeHtml(['']),
62 | headers: { 'content-type': 'text/html; charset=utf-8' }
63 | })
64 |
65 | const $ = cheerio.load(output)
66 | t.is($('meta[property="title"]').attr('content'), '')
67 | t.is($('meta[name="title"]').attr('content'), undefined)
68 | })
69 |
70 | test('rewrite multiple meta wrong markup', t => {
71 | const output = html({
72 | rewriteHtml: true,
73 | url: 'https://kikobeats.com',
74 | html: composeHtml([
75 | '',
76 | '',
77 | ''
78 | ]),
79 | headers: { 'content-type': 'text/html; charset=utf-8' }
80 | })
81 |
82 | const $ = cheerio.load(output)
83 | t.is($('meta[name="title"]').attr('content'), 'Kiko Beats')
84 | t.is($('meta[property="title"]').attr('content'), undefined)
85 | t.is(
86 | $('meta[name="description"]').attr('content'),
87 | 'Personal website of Kiko Beats'
88 | )
89 | t.is($('meta[property="description"]').attr('content'), undefined)
90 | t.is(
91 | $('meta[name="image"]').attr('content'),
92 | 'https://kikobeats.com/image.jpg'
93 | )
94 | t.is($('meta[property="image"]').attr('content'), undefined)
95 | })
96 |
97 | test("rewrite multiple 'twitter:' wrong markup", t => {
98 | const output = html({
99 | rewriteHtml: true,
100 | url: 'https://kikobeats.com',
101 | html: composeHtml([
102 | '',
103 | '',
104 | ''
105 | ]),
106 | headers: { 'content-type': 'text/html; charset=utf-8' }
107 | })
108 |
109 | const $ = cheerio.load(output)
110 | t.is($('meta[name="twitter:title"]').attr('content'), 'Kiko Beats')
111 | t.is($('meta[property="twitter:title"]').attr('content'), undefined)
112 | t.is(
113 | $('meta[name="twitter:description"]').attr('content'),
114 | 'Personal website of Kiko Beats'
115 | )
116 | t.is($('meta[property="twitter:description"]').attr('content'), undefined)
117 | t.is(
118 | $('meta[name="twitter:image"]').attr('content'),
119 | 'https://kikobeats.com/image.jpg'
120 | )
121 | t.is($('meta[property="twitter:image"]').attr('content'), undefined)
122 | })
123 | ;['al', 'fb', 'og'].forEach(prefix => {
124 | test(`rewrite multiple '${prefix}' wrong markup`, t => {
125 | const output = html({
126 | rewriteHtml: true,
127 | url: 'https://kikobeats.com',
128 | html: composeHtml([
129 | ``,
130 | ``
131 | ]),
132 | headers: { 'content-type': 'text/html; charset=utf-8' }
133 | })
134 |
135 | const $ = cheerio.load(output)
136 | t.is($(`meta[property="${prefix}:app_id"]`).attr('content'), '1234')
137 | t.is($(`meta[name="${prefix}:app_id"]`).attr('content'), undefined)
138 | t.is($(`meta[property="${prefix}:session_id"]`).attr('content'), '5678')
139 | t.is($(`meta[name="${prefix}:session_id"]`).attr('content'), undefined)
140 | })
141 | })
142 |
--------------------------------------------------------------------------------
/test/html/rewrite-urls.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const path = require('path')
4 | const test = require('ava')
5 | const fs = require('fs')
6 |
7 | const { prettyHtml } = require('../helpers')
8 |
// Calls the module under test and serializes its result via `.html()`.
const html = (...args) => {
  const addHtml = require('../../src/html')
  return addHtml(...args).html()
}
10 |
11 | test('remove localhost alike URLs', t => {
12 | const output = html({
13 | rewriteUrls: true,
14 | url: 'https://kikobeats.com',
15 | html: `
16 |
17 |
18 | kikobeats.com
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 | Email
29 | FTP Link
30 |
31 |
32 | `,
33 | headers: { 'content-type': 'text/html; charset=utf-8' }
34 | })
35 |
36 | t.snapshot(prettyHtml(output))
37 | })
38 |
39 | test('rewrites relative root URLs inside html markup', t => {
40 | const output = html({
41 | rewriteUrls: true,
42 | url: 'https://browserless.js.org',
43 | html: fs.readFileSync(
44 | path.resolve(__dirname, '../fixtures/browserless.html'),
45 | 'utf8'
46 | ),
47 | headers: {
48 | 'content-type': 'text/html; charset=utf-8'
49 | }
50 | })
51 |
52 | t.true(output.includes('https://browserless.js.org/static/main.min.js'))
53 | t.true(output.includes('https://unpkg.com/docsify/lib/docsify.min.js'))
54 |
55 | t.snapshot(prettyHtml(output))
56 | })
57 |
58 | test('rewrites relative URLs inside html markup', t => {
59 | const output = html({
60 | rewriteUrls: true,
61 | url: 'https://moovility.me/',
62 | html: `
63 |
64 |
65 |
66 |
67 |
68 | `,
69 | headers: {
70 | 'content-type': 'text/html; charset=utf-8'
71 | }
72 | })
73 |
74 | t.true(output.includes('https://moovility.me/img/icons/MOV/icon2-76.png'))
75 |
76 | t.snapshot(prettyHtml(output))
77 | })
78 |
79 | test(" don't modify inline javascript", t => {
80 | const output = html({
81 | rewriteUrls: true,
82 | url: 'https://www.latimes.com/opinion/story/2020-06-07/column-muralist-honors-african-americans-killed-by-police',
83 | html: `
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 | Print
92 |
93 | `,
94 | headers: {
95 | 'content-type': 'text/html;charset=UTF-8'
96 | }
97 | })
98 |
99 | t.true(
100 | output.includes(
101 | 'Print'
102 | )
103 | )
104 |
105 | t.snapshot(prettyHtml(output))
106 | })
107 |
108 | test("don't modify non http protocols", t => {
109 | const output = html({
110 | rewriteUrls: true,
111 | url: 'https://www.latimes.com/opinion/story/2020-06-07/column-muralist-honors-african-americans-killed-by-police',
112 | html: `
113 |
114 |
115 |
116 |
117 |
118 |
119 |
120 |
121 |
122 |
123 |
124 |
125 |
126 |
127 | `,
128 | headers: {
129 | 'content-type': 'text/html;charset=UTF-8'
130 | }
131 | })
132 |
133 | t.true(output.includes(''))
134 | t.true(output.includes(''))
135 | t.true(output.includes(''))
137 | t.true(output.includes(''))
138 | t.true(output.includes(''))
139 |
140 | t.snapshot(prettyHtml(output))
141 | })
142 |
143 | test("don't modify data URIs", t => {
144 | const output = html({
145 | rewriteUrls: true,
146 | url: 'https://example.com',
147 | html: `
148 |
149 |
150 |
151 |
152 |
153 |
154 |
155 |
156 |
157 | `,
158 | headers: {
159 | 'content-type': 'text/html;charset=UTF-8'
160 | }
161 | })
162 |
163 | t.true(
164 | output.includes(
165 | '
'
166 | )
167 | )
168 |
169 | t.snapshot(prettyHtml(output))
170 | })
171 |
172 | test("don't modify undefined attributes", t => {
173 | const output = html({
174 | rewriteUrls: true,
175 | url: 'https://moovility.me',
176 | html: `
177 |
178 |
179 |
180 | Document
181 |
182 |
183 |
184 |
185 | `,
186 | headers: {
187 | 'content-type': 'text/html;charset=UTF-8'
188 | }
189 | })
190 |
191 | t.true(output.includes(""))
192 |
193 | t.snapshot(prettyHtml(output))
194 | })
195 |
--------------------------------------------------------------------------------
/test/html/snapshots/index.js.md:
--------------------------------------------------------------------------------
1 | # Snapshot report for `test/html/index.js`
2 |
3 | The actual snapshot is saved in `index.js.snap`.
4 |
5 | Generated by [AVA](https://avajs.dev).
6 |
7 | ## add minimal html markup
8 |
9 | > Snapshot 1
10 |
11 | `␊
12 | ␊
13 | ␊
14 | kikobeats.com␊
15 | ␊
16 | ␊
17 | ␊
18 | ␊
19 | `
20 |
21 | ## add meta charset
22 |
23 | > Snapshot 1
24 |
25 | `␊
26 | ␊
27 | ␊
28 | kikobeats.com␊
29 | ␊
30 | ␊
31 | ␊
32 | ␊
33 | ␊
34 | `
35 |
36 | ## add doctype
37 |
38 | > Snapshot 1
39 |
40 | `␊
41 | ␊
42 | ␊
43 | kikobeats.com␊
44 | ␊
45 | ␊
46 | ␊
47 | ␊
48 | ␊
49 | ␊
50 | `
51 |
52 | ## add json markup
53 |
54 | > Snapshot 1
55 |
56 | `␊
57 | ␊
58 | ␊
59 | geolocation.microlink.io␊
60 | ␊
61 | ␊
62 | ␊
63 | {"origin":"83.46.149.83","city":"Madrid","alpha2":"ES","alpha3":"ESP","callingCodes":["+34"],"currencies":{"EUR":{"name":"Euro","symbol":"€"}},"eeaMember":true,"euMember":true,"flag":"🇪🇸","languages":{"spa":"Spanish"},"numeric":724,"tld":[".es"],"region":"MD","latitude":"40.4163","longitude":"-3.6934","timezone":"Europe/Madrid","headers":{"accept":"*/*","accept-encoding":"gzip","cdn-loop":"cloudflare","cf-connecting-ip":"83.46.149.83","cf-ipcountry":"ES","cf-ray":"73a29be38cdf37c7-MAD","cf-visitor":"{"scheme":"https"}","connection":"Keep-Alive","host":"geolocation.microlink.io","user-agent":"curl/7.79.1","x-forwarded-for":"172.70.57.171","x-forwarded-host":"geolocation.microlink.io","x-forwarded-proto":"https","x-real-ip":"172.70.57.171","x-vercel-edge-region":"dev","x-vercel-id":"cdg1::x96k9-1660405852783-a0083d276cde","x-vercel-ip-city":"Madrid","x-vercel-ip-country":"ES","x-vercel-ip-country-region":"MD","x-vercel-ip-latitude":"40.4163","x-vercel-ip-longitude":"-3.6934","x-vercel-ip-timezone":"Europe/Madrid","x-vercel-proxied-for":"172.70.57.171"}}
␊
64 | ␊
65 | `
66 |
67 | ## add image markup
68 |
69 | > Snapshot 1
70 |
71 | `␊
72 | ␊
73 | ␊
74 | giphy.gif␊
75 | ␊
76 | ␊
77 | ␊
78 |
␊
79 | `
80 |
81 | ## add audio markup
82 |
83 | > Snapshot 1
84 |
85 | `␊
86 | ␊
87 | ␊
88 | 13.01.mp3␊
89 | ␊
90 | ␊
91 | ␊
92 | ␊
95 | `
96 |
97 | ## add video markup
98 |
99 | > Snapshot 1
100 |
101 | `␊
102 | ␊
103 | ␊
104 | big_buck_bunny_720p_1mb.mp4␊
105 | ␊
106 | ␊
107 | ␊
108 | ␊
111 | `
112 |
113 | ## styles injection
114 |
115 | > Snapshot 1
116 |
117 | `␊
118 | ␊
119 | ␊
120 | ␊
121 | ␊
122 | Document␊
123 | ␊
124 | ␊
125 | ␊
126 | ␊
127 | ␊
128 | ␊
133 | ␊
134 | ␊
135 | ␊
136 | `
137 |
138 | ## scripts injection
139 |
140 | > Snapshot 1
141 |
142 | `␊
143 | ␊
144 | ␊
145 | ␊
146 | ␊
147 | Document␊
148 | ␊
149 | ␊
150 | ␊
151 | ␊
152 | ␊
168 | ␊
169 | ␊
170 | ␊
171 | ␊
172 | `
173 |
174 | ## hide elements
175 |
176 | > Snapshot 1
177 |
178 | `␊
179 | ␊
180 | ␊
181 | ␊
182 | ␊
183 | Document␊
184 | ␊
185 | ␊
186 | ␊
187 | ␊
188 | ␊
193 | ␊
194 | ␊
195 | ␊
196 | `
197 |
198 | ## remove elements
199 |
200 | > Snapshot 1
201 |
202 | `␊
203 | ␊
204 | ␊
205 | ␊
206 | ␊
207 | Document␊
208 | ␊
209 | ␊
210 | ␊
211 | ␊
212 | ␊
217 | ␊
218 | ␊
219 | ␊
220 | `
221 |
--------------------------------------------------------------------------------
/test/html/snapshots/index.js.snap:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/html/snapshots/index.js.snap
--------------------------------------------------------------------------------
/test/html/snapshots/rewrite-css-urls.js.md:
--------------------------------------------------------------------------------
1 | # Snapshot report for `test/html/rewrite-css-urls.js`
2 |
3 | The actual snapshot is saved in `rewrite-css-urls.js.snap`.
4 |
5 | Generated by [AVA](https://avajs.dev).
6 |
7 | ## don't modify html markup
8 |
9 | > Snapshot 1
10 |
11 | `␊
12 | ␊
13 | ␊
14 | ␊
19 | ␊
20 | how-to-apply-oil-plus-2c-to-furniture␊
21 | ␊
22 | ␊
23 | ␊
24 | ␊
25 | ␊
26 | ␊
27 | `
28 |
29 | ## rewrites relative URLs inside stylesheet
30 |
31 | > Snapshot 1
32 |
33 | `␊
34 | ␊
35 | ␊
36 | kikobeats.com␊
37 | ␊
38 | ␊
39 | ␊
40 | ␊
41 | ␊
42 | ␊
43 | ␊
44 | ␊
45 | `
46 |
--------------------------------------------------------------------------------
/test/html/snapshots/rewrite-css-urls.js.snap:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/html/snapshots/rewrite-css-urls.js.snap
--------------------------------------------------------------------------------
/test/html/snapshots/rewrite-urls.js.md:
--------------------------------------------------------------------------------
1 | # Snapshot report for `test/html/rewrite-urls.js`
2 |
3 | The actual snapshot is saved in `rewrite-urls.js.snap`.
4 |
5 | Generated by [AVA](https://avajs.dev).
6 |
7 | ## remove localhost alike URLs
8 |
9 | > Snapshot 1
10 |
11 | `␊
12 | ␊
13 | ␊
14 | kikobeats.com␊
15 | ␊
16 | ␊
17 | ␊
18 | ␊
19 | ␊
20 | ␊
21 | Email␊
22 | FTP Link␊
23 |
␊
24 | ␊
25 | `
26 |
27 | ## rewrites relative root URLs inside html markup
28 |
29 | > Snapshot 1
30 |
31 | `␊
32 | ␊
33 | ␊
34 | ␊
35 | ␊
36 | ␊
37 | ␊
38 | ␊
39 | ␊
40 | ␊
41 | ␊
42 | ␊
43 | browserless, a puppeter-like Node.js library for interacting with Headless production scenarios.␊
44 | ␊
45 | ␊
46 | ␊
47 | ␊
48 | ␊
49 | ␊
50 | ␊
51 | ␊
52 | ␊
53 | ␊
54 | ␊
55 | ␊
56 | ␊
57 | ␊
58 | ␊
59 | ␊
60 | ␊
61 | ␊
62 | ␊
63 | ␊
64 | ␊
65 | ␊
66 | ␊
67 | ␊
68 | ␊
69 | ␊
70 | ␊
71 | ␊
72 | ␊
73 | ␊
74 | ␊
75 | ␊
76 | ␊
77 | ␊
78 | ␊
79 | ␊
80 | ␊
81 | ␊
82 | ␊
83 | ␊
84 | ␊
85 | ␊
86 | ␊
87 | `
88 |
89 | ## rewrites relative URLs inside html markup
90 |
91 | > Snapshot 1
92 |
93 | `␊
94 | ␊
95 | ␊
96 | ␊
97 | moovility.me␊
98 | ␊
99 | ␊
100 | ␊
101 | ␊
102 | ␊
103 | ␊
104 | `
105 |
106 | ## don't modify inline javascript
107 |
108 | > Snapshot 1
109 |
110 | `␊
111 | ␊
112 | ␊
113 | ␊
114 | ␊
115 | column-muralist-honors-african-americans-killed-by-police␊
116 | ␊
117 | ␊
118 | ␊
119 | ␊
120 | Print␊
123 | ␊
124 | `
125 |
126 | ## don't modify non http protocols
127 |
128 | > Snapshot 1
129 |
130 | `␊
131 | ␊
132 | ␊
133 | ␊
134 | ␊
135 | column-muralist-honors-african-americans-killed-by-police␊
136 | ␊
137 | ␊
138 | ␊
139 | ␊
140 | ␊
141 | ␊
142 | ␊
143 | ␊
144 | ␊
145 | ␊
146 | ␊
147 | `
148 |
149 | ## don't modify data URIs
150 |
151 | > Snapshot 1
152 |
153 | `␊
154 | ␊
155 | ␊
156 | ␊
157 | ␊
158 | example.com␊
159 | ␊
160 | ␊
161 | ␊
162 | ␊
163 |
␊
164 | ␊
165 | `
166 |
167 | ## don't modify undefined attributes
168 |
169 | > Snapshot 1
170 |
171 | `␊
172 | ␊
173 | ␊
174 | Document␊
175 | ␊
176 | ␊
177 | ␊
178 | ␊
179 | ␊
180 | ␊
183 | ␊
184 | `
185 |
--------------------------------------------------------------------------------
/test/html/snapshots/rewrite-urls.js.snap:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/html/snapshots/rewrite-urls.js.snap
--------------------------------------------------------------------------------
/test/index.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const PCancelable = require('p-cancelable')
4 | const cheerio = require('cheerio')
5 | const test = require('ava')
6 |
7 | const { initBrowserless, runServer, prettyHtml } = require('./helpers')
8 | const getHTML = require('..')
9 |
10 | const getBrowserless = initBrowserless(test)
11 |
12 | const wait = async (promise, prop) => { // await `promise`, optionally pick one property of the result
13 | const res = await promise
14 | return prop ? res[prop] : res // falsy `prop` → hand back the whole resolved value
15 | }
16 |
17 | test('throw an error if `getBrowserless` is not provided', async t => {
18 | const url = 'https://example.com'
19 | const error = await t.throwsAsync(getHTML(url))
20 | t.is(error.name, 'TypeError')
21 | t.is(
22 | error.message,
23 | "Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`"
24 | )
25 | })
26 |
27 | test('promise is cancelable', async t => {
28 | const url = 'https://example.com'
29 | t.true(getHTML(url, { getBrowserless: () => {} }) instanceof PCancelable)
30 | t.true(
31 | getHTML.getContent(url, 'fetch', {
32 | getBrowserless: () => {}
33 | }) instanceof PCancelable
34 | )
35 | })
36 |
37 | test('reachable URL', async t => {
38 | const url = 'https://example.com'
39 | const [prerenderDisabled, prerenderEnabled] = await Promise.all([
40 | getHTML(url, { prerender: false, getBrowserless }),
41 | getHTML(url, {
42 | prerender: true,
43 | getBrowserless,
44 | puppeteerOpts: { adblock: false }
45 | })
46 | ])
47 |
48 | t.is(
49 | await wait(
50 | getHTML(url, { prerender: false, getBrowserless }),
51 | 'statusCode'
52 | ),
53 | 200
54 | )
55 | t.is(
56 | await wait(
57 | getHTML(url, {
58 | prerender: true,
59 | getBrowserless,
60 | puppeteerOpts: { adblock: false }
61 | }),
62 | 'statusCode'
63 | ),
64 | 200
65 | )
66 |
67 | t.is(prerenderDisabled.statusCode, prerenderEnabled.statusCode)
68 | t.is(prerenderDisabled.statusCode, 200)
69 |
70 | t.true(Object.keys(prerenderDisabled.headers).length > 0)
71 | t.true(Object.keys(prerenderEnabled.headers).length > 0)
72 | t.is(typeof prerenderDisabled.headers, typeof prerenderEnabled.headers)
73 |
74 | t.true(prerenderDisabled.html.length > 0)
75 | t.true(prerenderEnabled.html.length > 0)
76 | t.is(typeof prerenderDisabled.html, typeof prerenderEnabled.html)
77 | })
78 |
79 | test('timeout URL', async t => {
80 | const url = 'https://test-timeout.vercel.app'
81 |
82 | const [prerenderDisabled, prerenderEnabled] = await Promise.all([
83 | getHTML(url, {
84 | prerender: false,
85 | getBrowserless,
86 | gotOpts: { timeout: 1000 }
87 | }),
88 | getHTML(url, {
89 | prerender: true,
90 | getBrowserless,
91 | puppeteerOpts: { timeout: 2000, adblock: false }
92 | })
93 | ])
94 |
95 | t.is(prerenderDisabled.url, prerenderEnabled.url)
96 | t.is(prerenderDisabled.html, prerenderEnabled.html)
97 | t.is(prerenderDisabled.statusCode, prerenderEnabled.statusCode)
98 | t.deepEqual(prerenderDisabled.headers, prerenderEnabled.headers)
99 | })
100 |
101 | test('unreachable URL', async t => {
102 | const url = 'https://notexisturl.dev'
103 |
104 | const [prerenderDisabled, prerenderEnabled] = await Promise.all([
105 | getHTML(url, { prerender: false, getBrowserless }),
106 | getHTML(url, {
107 | prerender: true,
108 | getBrowserless,
109 | puppeteerOpts: { adblock: false }
110 | })
111 | ])
112 |
113 | t.is(prerenderDisabled.url, prerenderEnabled.url)
114 | t.is(prerenderDisabled.html, prerenderEnabled.html)
115 | t.is(prerenderDisabled.statusCode, prerenderEnabled.statusCode)
116 | t.deepEqual(prerenderDisabled.headers, prerenderEnabled.headers)
117 | })
118 |
119 | test('from audio URL', async t => {
120 | const targetUrl =
121 | 'https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav'
122 | const { url, stats, html } = await getHTML(targetUrl, {
123 | getBrowserless,
124 | prerender: false
125 | })
126 |
127 | t.is(stats.mode, 'fetch')
128 | t.is(url, targetUrl)
129 | t.snapshot(prettyHtml(html))
130 | })
131 |
132 | test('from image URL', async t => {
133 | const targetUrl = 'https://kikobeats.com/images/avatar.jpg'
134 | const { url, stats, html } = await getHTML(targetUrl, { getBrowserless })
135 |
136 | t.is(stats.mode, 'fetch')
137 | t.is(url, targetUrl)
138 |
139 | const $ = cheerio.load(html)
140 | $('meta[name="date"]').remove()
141 |
142 | t.snapshot(prettyHtml($.html()))
143 | })
144 |
145 | test('from SVG image URL', async t => {
146 | const targetUrl = 'https://cdn.microlink.io/file-examples/sample.svg'
147 | const { stats } = await getHTML(targetUrl, { getBrowserless })
148 | t.true(stats.timing < 3000)
149 | t.is(stats.mode, 'fetch')
150 | })
151 |
152 | test('from big image URL', async t => {
153 | const targetUrl =
154 | 'https://static.jutarnji.hr/images/live-multimedia/binary/2016/6/17/10/iStock_82744687_XXLARGE.jpg'
155 | const { stats } = await getHTML(targetUrl, { getBrowserless })
156 | t.true(stats.timing < 3000)
157 | t.is(stats.mode, 'fetch')
158 | })
159 |
160 | test('from URL with no content type', async t => {
161 | const targetUrl = await runServer(t, (_, res) => {
162 | res.end('.')
163 | })
164 | const { stats } = await getHTML(targetUrl, {
165 | getBrowserless,
166 | prerender: false
167 | })
168 | t.is(stats.mode, 'fetch')
169 | })
170 |
171 | test('from image URL that returns HTML markup', async t => {
172 | const targetUrl =
173 | 'https://www.europapress.es/chance/gente/%7B%7BrutaFoto%7D%7D%7B%7Bfechor%7D%7D_%7B%7BanchoFoto%7D%7D_%7B%7BaltoFoto%7D%7D%7B%7BversionFoto%7D%7D.jpg'
174 | const { stats } = await getHTML(targetUrl, { getBrowserless })
175 | t.true(stats.timing < 3000)
176 | t.is(stats.mode, 'fetch')
177 | })
178 |
179 | test('from video URL', async t => {
180 | const targetUrl = 'https://cdn.microlink.io/file-examples/sample.mp4'
181 | const { url, stats, html } = await getHTML(targetUrl, {
182 | prerender: false,
183 | getBrowserless
184 | })
185 |
186 | t.is(stats.mode, 'fetch')
187 | t.is(url, targetUrl)
188 | t.snapshot(prettyHtml(html))
189 | })
190 |
191 | test('from bad SSL URL', async t => {
192 | const targetUrl = 'https://self-signed.badssl.com/'
193 | const { url, stats, html } = await getHTML(targetUrl, {
194 | prerender: false,
195 | getBrowserless,
196 | gotOpts: {
197 | https: { rejectUnauthorized: false }
198 | }
199 | })
200 |
201 | t.true(html.includes('background: red'))
202 | t.is(stats.mode, 'fetch')
203 | t.is(url, targetUrl)
204 | t.snapshot(prettyHtml(html))
205 | })
206 |
--------------------------------------------------------------------------------
/test/is-fetch-mode.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const test = require('ava')
4 |
5 | const { isFetchMode } = require('..')
6 |
7 | test('true', t => {
8 | t.true(
9 | isFetchMode(
10 | 'https://www.abc.net.au/news/2023-06-14/idpwd-2023-calling-all-budding-storytellers-with-disability/102388090'
11 | )
12 | )
13 | t.true(
14 | isFetchMode('https://twitter.com/Kikobeats/status/1741205717636264436')
15 | )
16 | })
17 |
--------------------------------------------------------------------------------
/test/mode.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const test = require('ava')
4 |
5 | const getHTML = require('../src')
6 | const { initBrowserless } = require('./helpers')
7 |
8 | const getBrowserless = initBrowserless(test)
9 |
10 | test('`{ prerender: true }`', async t => {
11 | const url = 'https://example.com'
12 | const { stats } = await getHTML(url, { getBrowserless })
13 | t.is(stats.mode, 'prerender')
14 | })
15 |
16 | test('`{ prerender: false }`', async t => {
17 | const url = 'https://example.com'
18 | const { stats } = await getHTML(url, { prerender: false, getBrowserless })
19 | t.is(stats.mode, 'fetch')
20 | })
21 |
22 | test("`{ prerender: 'auto' }`", async t => {
23 | {
24 | const url = 'https://google.com'
25 | const { stats } = await getHTML(url, {
26 | getBrowserless,
27 | puppeteerOpts: { adblock: false }
28 | })
29 | t.is(stats.mode, 'fetch')
30 | }
31 | {
32 | const url = 'https://twitter.com/Kikobeats/status/1741205717636264436'
33 | const { html, stats } = await getHTML(url, {
34 | headers: {
35 | 'user-agent': 'Slackbot 1.0 (+https://api.slack.com/robots)'
36 | },
37 | getBrowserless,
38 | puppeteerOpts: { adblock: false }
39 | })
40 | t.true(html.includes('og:title'))
41 | t.is(stats.mode, 'fetch')
42 | }
43 | })
44 |
45 | test.skip('prerender error fallback into fetch mode', async t => {
46 | const url =
47 | 'https://www.sportsnet.ca/hockey/nhl/leafs-john-tavares-return-new-york-hope-positive/'
48 | const { stats, html } = await getHTML(url, {
49 | prerender: true,
50 | getBrowserless,
51 | puppeteerOpts: { adblock: false }
52 | })
53 | t.true(!!html)
54 | t.is(stats.mode, 'fetch')
55 | })
56 |
--------------------------------------------------------------------------------
/test/pdf.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const cheerio = require('cheerio')
4 | const test = require('ava')
5 |
6 | const { initBrowserless, prettyHtml } = require('./helpers')
7 | const getHTML = require('..')
8 |
9 | const getBrowserless = initBrowserless(test)
10 |
11 | const PDF_OVER_TRESHOLD = 'https://cdn.microlink.io/file-examples/sample.pdf'
12 | const PDF_UNDER_TRESHOLD = 'https://pdfobject.com/pdf/sample.pdf'
13 |
14 | test('disable if `mutool` is not installed', async t => {
15 | const targetUrl = 'https://cdn.microlink.io/file-examples/sample.pdf'
16 | const { url, stats, html } = await getHTML(targetUrl, {
17 | mutool: false,
18 | getBrowserless
19 | })
20 |
21 | const $ = cheerio.load(html)
22 | $('meta[name="date"]').remove()
23 |
24 | t.is(url, targetUrl)
25 | t.snapshot(prettyHtml($.html()))
26 | t.is(stats.mode, 'fetch')
27 | })
28 |
29 | test('turn PDF into HTML markup over the treshold', async t => {
30 | const targetUrl = PDF_OVER_TRESHOLD
31 | const { url, stats, html } = await getHTML(targetUrl, {
32 | getBrowserless
33 | })
34 |
35 | const $ = cheerio.load(html)
36 | t.is(url, targetUrl)
37 | t.is(
38 | $('p').first().text(),
39 | 'Instructions for Adding Your Logo & Address to AAO-HNSF Patient Handouts'
40 | )
41 | t.is(stats.mode, 'fetch')
42 | })
43 |
44 | test('turn PDF into HTML markup under the treshold', async t => {
45 | const targetUrl = PDF_UNDER_TRESHOLD
46 | const { url, stats, html } = await getHTML(targetUrl, {
47 | getBrowserless
48 | })
49 | const $ = cheerio.load(html)
50 | t.is(url, targetUrl)
51 | t.is($('p').eq(1).text(), 'This is a simple PDF file. Fun fun fun.')
52 | t.is(stats.mode, 'fetch')
53 | })
54 |
--------------------------------------------------------------------------------
/test/redirects.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const test = require('ava')
4 |
5 | const { initBrowserless } = require('./helpers')
6 | const getHTML = require('..')
7 |
8 | const getBrowserless = initBrowserless(test)
9 |
10 | ;[true, false].forEach(prerender => {
11 | const mode = prerender ? 'prerender' : 'fetch'
12 |
13 | test(`${mode} » collect redirects`, async t => {
14 | const targetUrl =
15 | 'https://test-redirect-drab.vercel.app/?url=https%3A%2F%2Ftest-redirect-drab.vercel.app%3Furl%3Dhttps%253A%252F%252Ftest-redirect-drab.vercel.app%252F%253Furl%253Dhttps%253A%252F%252Fexample.com'
16 |
17 | const { redirects } = await getHTML(targetUrl, {
18 | prerender,
19 | getBrowserless
20 | })
21 |
22 | t.deepEqual(redirects, [
23 | {
24 | statusCode: 302,
25 | url: 'https://test-redirect-drab.vercel.app/?url=https%3A%2F%2Ftest-redirect-drab.vercel.app%3Furl%3Dhttps%253A%252F%252Ftest-redirect-drab.vercel.app%252F%253Furl%253Dhttps%253A%252F%252Fexample.com'
26 | },
27 | {
28 | statusCode: 302,
29 | url: 'https://test-redirect-drab.vercel.app/?url=https%3A%2F%2Ftest-redirect-drab.vercel.app%2F%3Furl%3Dhttps%3A%2F%2Fexample.com'
30 | },
31 | {
32 | statusCode: 302,
33 | url: 'https://test-redirect-drab.vercel.app/?url=https://example.com'
34 | }
35 | ])
36 | })
37 | })
38 |
--------------------------------------------------------------------------------
/test/snapshots/index.js.md:
--------------------------------------------------------------------------------
1 | # Snapshot report for `test/index.js`
2 |
3 | The actual snapshot is saved in `index.js.snap`.
4 |
5 | Generated by [AVA](https://avajs.dev).
6 |
7 | ## from audio URL
8 |
9 | > Snapshot 1
10 |
11 | `␊
12 | ␊
13 | ␊
14 | embedadapt_100sample.wav␊
15 | ␊
16 | ␊
17 | ␊
18 | ␊
19 | ␊
22 | `
23 |
24 | ## from image URL
25 |
26 | > Snapshot 1
27 |
28 | `␊
29 | ␊
30 | ␊
31 | avatar.jpg␊
32 | ␊
33 | ␊
34 | ␊
35 |
␊
36 | `
37 |
38 | ## from video URL
39 |
40 | > Snapshot 1
41 |
42 | `␊
43 | ␊
44 | ␊
45 | sample.mp4␊
46 | ␊
47 | ␊
48 | ␊
49 | ␊
50 | ␊
53 | `
54 |
55 | ## from bad SSL URL
56 |
57 | > Snapshot 1
58 |
59 | `␊
60 | ␊
61 | ␊
62 | ␊
63 | ␊
64 | ␊
65 | ␊
66 | self-signed.badssl.com␊
67 | ␊
68 | ␊
73 | ␊
74 | ␊
75 | ␊
76 | ␊
77 | ␊
78 | ␊
79 |
␊
80 | self-signed.
badssl.com␊
81 |
␊
82 | ␊
83 | ␊
84 | `
85 |
--------------------------------------------------------------------------------
/test/snapshots/index.js.snap:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/snapshots/index.js.snap
--------------------------------------------------------------------------------
/test/snapshots/pdf.js.md:
--------------------------------------------------------------------------------
1 | # Snapshot report for `test/pdf.js`
2 |
3 | The actual snapshot is saved in `pdf.js.snap`.
4 |
5 | Generated by [AVA](https://avajs.dev).
6 |
7 | ## disable if `mutool` is not installed
8 |
9 | > Snapshot 1
10 |
11 | `␊
12 | ␊
13 | ␊
14 | sample.pdf␊
15 | ␊
16 | ␊
17 | ␊
18 | ␊
19 | `
20 |
--------------------------------------------------------------------------------
/test/snapshots/pdf.js.snap:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/snapshots/pdf.js.snap
--------------------------------------------------------------------------------
/test/url.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const test = require('ava')
4 |
5 | const { initBrowserless, runServer, prettyHtml } = require('./helpers')
6 | const getHTML = require('..')
7 |
8 | const getBrowserless = initBrowserless(test)
9 |
10 | ;[false, true].forEach(prerender => {
11 | const mode = prerender ? 'prerender' : 'fetch'
12 | test(`${mode} » as string`, async t => {
13 | const url = await runServer(t, (_, res) =>
14 | res.end('.')
15 | )
16 | const { html } = await getHTML(url.toString(), {
17 | getBrowserless,
18 | prerender,
19 | puppeteerOpts: { adblock: false, animations: true }
20 | })
21 |
22 | t.is(
23 | prettyHtml(html),
24 | prettyHtml(`
25 |
26 |
27 | .
28 |
29 |
30 |
31 |
32 | `)
33 | )
34 | })
35 |
36 | test(`${mode} » as WHATWG URL object`, async t => {
37 | const url = await runServer(t, (_, res) =>
38 | res.end('.')
39 | )
40 | const { html } = await getHTML(url, {
41 | getBrowserless,
42 | prerender,
43 | puppeteerOpts: { adblock: false, animations: true }
44 | })
45 |
46 | t.is(
47 | prettyHtml(html),
48 | prettyHtml(`
49 |
50 |
51 | .
52 |
53 |
54 |
55 |
56 | `)
57 | )
58 | })
59 | })
60 |
--------------------------------------------------------------------------------
/test/util/get-charset.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const test = require('ava')
4 |
5 | const { getCharset } = require('../../src/util')
6 |
7 | const { createHeaders } = require('../helpers')
8 |
9 | const contentType = createHeaders('content-type')
10 |
11 | test('returns lower case value detected from content-type', t => {
12 | t.is(getCharset(contentType('text/html; charset=UTF-8')), 'utf-8')
13 | t.is(getCharset(contentType('text/html; charset=ISO-8859-1')), 'iso-8859-1')
14 | })
15 |
16 | test('returns undefined when charset is not detected', t => {
17 | t.is(getCharset(contentType('text/html; foo=bar')), undefined)
18 | t.is(getCharset(contentType('text/html')), undefined)
19 | t.is(getCharset(contentType('text/html')), undefined)
20 | t.is(getCharset(contentType('invalid/type')), undefined)
21 | })
22 |
--------------------------------------------------------------------------------
/test/util/get-content-length.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const test = require('ava')
4 |
5 | const { getContentLength } = require('../../src/util')
6 |
7 | const { PDF_SIZE_TRESHOLD } = require('../../src')
8 |
9 | const { createHeaders } = require('../helpers')
10 |
11 | const contentLength = createHeaders('content-length')
12 |
13 | test('parse content length into number', t => {
14 | {
15 | const raw = PDF_SIZE_TRESHOLD - PDF_SIZE_TRESHOLD * 0.25
16 | const input = String(raw)
17 | const length = getContentLength(contentLength(input))
18 | t.is(length, raw)
19 | t.true(length < PDF_SIZE_TRESHOLD)
20 | }
21 | {
22 | const raw = PDF_SIZE_TRESHOLD + PDF_SIZE_TRESHOLD * 0.25
23 | const input = String(raw)
24 | const length = getContentLength(contentLength(input))
25 | t.is(length, raw)
26 | t.false(length < PDF_SIZE_TRESHOLD)
27 | }
28 | })
29 |
30 | test('returns 0 if value is not present', t => {
31 | const length = getContentLength(contentLength())
32 | t.is(length, NaN)
33 | t.false(length > PDF_SIZE_TRESHOLD)
34 | })
35 |
--------------------------------------------------------------------------------
/test/util/get-content-type.js:
--------------------------------------------------------------------------------
1 | 'use strict'
2 |
3 | const test = require('ava')
4 |
5 | const { getContentType } = require('../../src/util')
6 |
7 | const { createHeaders } = require('../helpers')
8 |
9 | const contentType = createHeaders('content-type')
10 |
11 | test('return media type', t => {
12 | t.is(
13 | getContentType(contentType('application/pdf; charset=utf-8')),
14 | 'application/pdf'
15 | )
16 | t.is(
17 | getContentType(contentType('APPLICATION/PDF; charset=utf-8')),
18 | 'application/pdf'
19 | )
20 | t.is(
21 | getContentType(contentType('INVALID/TYPE; charset=utf-8')),
22 | 'invalid/type'
23 | )
24 | })
25 |
--------------------------------------------------------------------------------