├── .editorconfig ├── .gitattributes ├── .github ├── dependabot.yml └── workflows │ ├── main.yml │ └── pull_request.yml ├── .gitignore ├── .npmrc ├── CHANGELOG.md ├── LICENSE ├── README.md ├── benchmark ├── get-content-type │ └── index.js ├── mupdf │ ├── generate.js │ └── index.js └── package.json ├── bin └── index.js ├── package.json ├── scripts └── postinstall ├── src ├── html.js ├── index.js └── util.js └── test ├── auto-domains.js ├── encoding.js ├── fixtures ├── 51242_54045.html ├── browserless.html ├── rp.pl.html └── utf8.with.meta.html ├── helpers.js ├── html ├── get-date.js ├── index.js ├── rewrite-css-urls.js ├── rewrite-html.js ├── rewrite-urls.js └── snapshots │ ├── index.js.md │ ├── index.js.snap │ ├── rewrite-css-urls.js.md │ ├── rewrite-css-urls.js.snap │ ├── rewrite-urls.js.md │ └── rewrite-urls.js.snap ├── index.js ├── is-fetch-mode.js ├── mode.js ├── pdf.js ├── redirects.js ├── snapshots ├── index.js.md ├── index.js.snap ├── pdf.js.md └── pdf.js.snap ├── url.js └── util ├── get-charset.js ├── get-content-length.js └── get-content-type.js /.editorconfig: -------------------------------------------------------------------------------- 1 | # http://editorconfig.org 2 | 3 | root = true 4 | 5 | [*] 6 | indent_style = space 7 | indent_size = 2 8 | end_of_line = lf 9 | charset = utf-8 10 | trim_trailing_whitespace = true 11 | insert_final_newline = true 12 | max_line_length = 80 13 | indent_brace_style = 1TBS 14 | spaces_around_operators = true 15 | quote_type = auto 16 | 17 | [package.json] 18 | indent_style = space 19 | indent_size = 2 20 | 21 | [*.md] 22 | trim_trailing_whitespace = false 23 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto 2 | -------------------------------------------------------------------------------- /.github/dependabot.yml: 
-------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: npm 4 | directory: '/' 5 | schedule: 6 | interval: daily 7 | - package-ecosystem: 'github-actions' 8 | directory: '/' 9 | schedule: 10 | # Check for updates to GitHub Actions every weekday 11 | interval: 'daily' 12 | -------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: main 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | 8 | jobs: 9 | contributors: 10 | if: "${{ github.event.head_commit.message != 'build: contributors' }}" 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v4 15 | with: 16 | fetch-depth: 0 17 | token: ${{ secrets.GITHUB_TOKEN }} 18 | - name: Setup Node.js 19 | uses: actions/setup-node@v4 20 | with: 21 | node-version: lts/* 22 | - name: Contributors 23 | run: | 24 | git config --global user.email ${{ secrets.GIT_EMAIL }} 25 | git config --global user.name ${{ secrets.GIT_USERNAME }} 26 | npm run contributors 27 | - name: Push changes 28 | run: | 29 | git push origin ${{ github.head_ref }} 30 | 31 | release: 32 | if: | 33 | !startsWith(github.event.head_commit.message, 'chore(release):') && 34 | !startsWith(github.event.head_commit.message, 'docs:') && 35 | !startsWith(github.event.head_commit.message, 'ci:') 36 | needs: [contributors] 37 | runs-on: ubuntu-latest 38 | steps: 39 | - name: Checkout 40 | uses: actions/checkout@v4 41 | with: 42 | token: ${{ secrets.GITHUB_TOKEN }} 43 | - name: Setup Node.js 44 | uses: actions/setup-node@v4 45 | with: 46 | node-version: lts/* 47 | - name: Setup PNPM 48 | uses: pnpm/action-setup@v4 49 | with: 50 | version: latest 51 | run_install: true 52 | - name: Install mupdf-tools 53 | run: sudo apt-get install -y mupdf-tools 54 | - name: Test 55 | run: pnpm test 56 | - name: Report 57 | run: npx c8 
report --reporter=text-lcov > coverage/lcov.info 58 | - name: Coverage 59 | uses: coverallsapp/github-action@main 60 | with: 61 | github-token: ${{ secrets.GITHUB_TOKEN }} 62 | - name: Release 63 | env: 64 | GH_TOKEN: ${{ secrets.GH_TOKEN }} 65 | NPM_TOKEN: ${{ secrets.NPM_TOKEN }} 66 | run: | 67 | git config --global user.email ${{ secrets.GIT_EMAIL }} 68 | git config --global user.name ${{ secrets.GIT_USERNAME }} 69 | git pull origin master 70 | pnpm run release 71 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: pull_request 2 | 3 | on: 4 | push: 5 | branches: 6 | - master 7 | pull_request: 8 | branches: 9 | - master 10 | 11 | jobs: 12 | test: 13 | if: github.ref != 'refs/heads/master' 14 | runs-on: ubuntu-latest 15 | steps: 16 | - name: Checkout 17 | uses: actions/checkout@v4 18 | with: 19 | token: ${{ secrets.GITHUB_TOKEN }} 20 | - name: Setup Node.js 21 | uses: actions/setup-node@v4 22 | with: 23 | node-version: lts/* 24 | - name: Setup PNPM 25 | uses: pnpm/action-setup@v4 26 | with: 27 | version: latest 28 | run_install: true 29 | - name: Install mupdf-tools 30 | run: sudo apt-get install -y mupdf-tools 31 | - name: Test 32 | run: pnpm test 33 | - name: Report 34 | run: npx c8 report --reporter=text-lcov > coverage/lcov.info 35 | - name: Coverage 36 | uses: coverallsapp/github-action@main 37 | with: 38 | github-token: ${{ secrets.GITHUB_TOKEN }} 39 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ############################ 2 | # npm 3 | ############################ 4 | node_modules 5 | npm-debug.log 6 | .node_history 7 | yarn.lock 8 | package-lock.json 9 | 10 | ############################ 11 | # tmp, editor & OS files 12 | ############################ 13 | .tmp 14 | 
*.swo 15 | *.swp 16 | *.swn 17 | *.swm 18 | .DS_Store 19 | *# 20 | *~ 21 | .idea 22 | *sublime* 23 | nbproject 24 | src/auto-domains.json 25 | 26 | ############################ 27 | # Tests 28 | ############################ 29 | testApp 30 | coverage 31 | .nyc_output 32 | 33 | ############################ 34 | # Other 35 | ############################ 36 | .envrc 37 | -------------------------------------------------------------------------------- /.npmrc: -------------------------------------------------------------------------------- 1 | audit=false 2 | fund=false 3 | package-lock=false 4 | prefer-dedupe=true 5 | prefer-offline=false 6 | save-prefix=~ 7 | save=false 8 | strict-peer-dependencies=false 9 | unsafe-perm=true 10 | loglevel=error 11 | shamefully-hoist=true 12 | resolution-mode=highest 13 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | All notable changes to this project will be documented in this file. See [standard-version](https://github.com/conventional-changelog/standard-version) for commit guidelines. 
4 | 5 | ### 2.21.5 (2025-05-20) 6 | 7 | ### [2.21.4](https://github.com/microlinkhq/html-get/compare/v2.21.3...v2.21.4) (2025-04-29) 8 | 9 | ### 2.21.3 (2025-04-12) 10 | 11 | ### 2.21.2 (2025-03-28) 12 | 13 | ### 2.21.1 (2025-03-13) 14 | 15 | ## 2.21.0 (2025-03-13) 16 | 17 | 18 | ### Features 19 | 20 | * improve pdf handling ([#215](https://github.com/microlinkhq/html-get/issues/215)) ([a9d9bbf](https://github.com/microlinkhq/html-get/commit/a9d9bbf640155775418f759c2fc7110e5c81ca65)) 21 | 22 | ## [2.20.0](https://github.com/microlinkhq/html-get/compare/v2.18.5...v2.20.0) (2025-03-02) 23 | 24 | 25 | ### Features 26 | 27 | * add `serializeHtml` ([6018775](https://github.com/microlinkhq/html-get/commit/60187753fcd4b636c152f3e6d3091cc0c59d00fb)) 28 | * **html:** better url rewrite ([#213](https://github.com/microlinkhq/html-get/issues/213)) ([e261c9c](https://github.com/microlinkhq/html-get/commit/e261c9cb39972a4cc7791f0b0a7927650f13cb74)) 29 | 30 | ## 2.19.0 (2025-02-28) 31 | 32 | 33 | ### Features 34 | 35 | * **html:** better url rewrite ([#213](https://github.com/microlinkhq/html-get/issues/213)) ([e261c9c](https://github.com/microlinkhq/html-get/commit/e261c9cb39972a4cc7791f0b0a7927650f13cb74)) 36 | 37 | ### 2.18.5 (2025-01-13) 38 | 39 | ### 2.18.4 (2025-01-08) 40 | 41 | ### 2.18.3 (2025-01-08) 42 | 43 | ### 2.18.2 (2025-01-04) 44 | 45 | ### 2.18.1 (2024-12-31) 46 | 47 | ## 2.18.0 (2024-12-05) 48 | 49 | 50 | ### Features 51 | 52 | * add redirects ([#210](https://github.com/microlinkhq/html-get/issues/210)) ([54cfe61](https://github.com/microlinkhq/html-get/commit/54cfe610e6dd3fd463e5af9b9252eabc66b59936)) 53 | 54 | ### 2.17.2 (2024-10-19) 55 | 56 | ### 2.17.1 (2024-10-18) 57 | 58 | ## 2.17.0 (2024-10-18) 59 | 60 | 61 | ### Features 62 | 63 | * add `rewriteHtml` ([#207](https://github.com/microlinkhq/html-get/issues/207)) ([ec99777](https://github.com/microlinkhq/html-get/commit/ec99777a91869c679309f7747573e734ed8e5e73)) 64 | 65 | ### 2.16.11 (2024-09-15) 66 | 67 | 
68 | ### Bug Fixes 69 | 70 | * don't pipe stderr to stdout ([#206](https://github.com/microlinkhq/html-get/issues/206)) ([6b6b4cb](https://github.com/microlinkhq/html-get/commit/6b6b4cbf97791f6495d1c1bc60fe5dadf3dee318)) 71 | 72 | ### 2.16.10 (2024-09-08) 73 | 74 | ### 2.16.9 (2024-08-12) 75 | 76 | ### 2.16.8 (2024-06-20) 77 | 78 | ### 2.16.7 (2024-06-06) 79 | 80 | ### 2.16.6 (2024-05-31) 81 | 82 | ### 2.16.5 (2024-05-31) 83 | 84 | ### 2.16.4 (2024-05-09) 85 | 86 | ### 2.16.3 (2024-03-14) 87 | 88 | 89 | ### Bug Fixes 90 | 91 | * HTML should be a string ([#201](https://github.com/microlinkhq/html-get/issues/201)) ([f62cbb2](https://github.com/microlinkhq/html-get/commit/f62cbb22de64cc81543b871f38f67e8a3ebd8c99)) 92 | 93 | ### 2.16.2 (2024-03-02) 94 | 95 | ### 2.16.1 (2024-02-26) 96 | 97 | ## [2.16.0](https://github.com/microlinkhq/html-get/compare/v2.15.1...v2.16.0) (2024-02-17) 98 | 99 | 100 | ### Features 101 | 102 | * **html:** remove localhost alike URLs ([#193](https://github.com/microlinkhq/html-get/issues/193)) ([3cf2927](https://github.com/microlinkhq/html-get/commit/3cf2927d532a832f1a0736474b6a7412d8f40b94)) 103 | 104 | ### [2.15.1](https://github.com/microlinkhq/html-get/compare/v2.15.0...v2.15.1) (2024-02-08) 105 | 106 | ## [2.15.0](https://github.com/microlinkhq/html-get/compare/v2.14.4...v2.15.0) (2024-01-24) 107 | 108 | 109 | ### Features 110 | 111 | * turn PDF into HTML ([#189](https://github.com/microlinkhq/html-get/issues/189)) ([cbf0835](https://github.com/microlinkhq/html-get/commit/cbf08353440cb55ff37aa1434860eb5db5d65fea)) 112 | 113 | ### 2.14.4 (2024-01-18) 114 | 115 | 116 | ### Bug Fixes 117 | 118 | * content-type is undefined ([#187](https://github.com/microlinkhq/html-get/issues/187)) ([1839b42](https://github.com/microlinkhq/html-get/commit/1839b4246f5111665fc9ba8d78ffa6bb86dfd05c)) 119 | 120 | ### 2.14.3 (2024-01-18) 121 | 122 | 123 | ### Bug Fixes 124 | 125 | * media URL with HTML markup 
([8564529](https://github.com/microlinkhq/html-get/commit/8564529823910c3a18ca40a9109d91ebee7f6e4a)) 126 | 127 | ### 2.14.2 (2024-01-16) 128 | 129 | 130 | ### Bug Fixes 131 | 132 | * avoid to encode media URLs ([#185](https://github.com/microlinkhq/html-get/issues/185)) ([c93c083](https://github.com/microlinkhq/html-get/commit/c93c0839e61704a825ecdc1dcc7f1bc64a64adf6)) 133 | 134 | ### 2.14.1 (2024-01-16) 135 | 136 | ## 2.14.0 (2024-01-06) 137 | 138 | 139 | ### Features 140 | 141 | * don't prerender for Twitter ([62bcafe](https://github.com/microlinkhq/html-get/commit/62bcafef070d5c72c87c4fc26df8b8f958c6ef84)) 142 | 143 | ### 2.13.15 (2024-01-01) 144 | 145 | ### 2.13.14 (2023-12-21) 146 | 147 | ### 2.13.13 (2023-12-18) 148 | 149 | 150 | ### Bug Fixes 151 | 152 | * remove redundant setTimeout ([#181](https://github.com/microlinkhq/html-get/issues/181)) ([f265414](https://github.com/microlinkhq/html-get/commit/f265414442133927b4c6b149909e4442a55c0928)) 153 | 154 | ### 2.13.12 (2023-12-18) 155 | 156 | ### 2.13.11 (2023-12-08) 157 | 158 | ### 2.13.10 (2023-11-24) 159 | 160 | ### 2.13.9 (2023-11-11) 161 | 162 | ### 2.13.8 (2023-10-09) 163 | 164 | ### 2.13.7 (2023-10-08) 165 | 166 | 167 | ### Bug Fixes 168 | 169 | * invoke function ([2c80be4](https://github.com/microlinkhq/html-get/commit/2c80be4325bb8ace57e0709f36ac3affd5cd5cdf)) 170 | 171 | ### 2.13.6 (2023-10-08) 172 | 173 | ### 2.13.5 (2023-10-08) 174 | 175 | ### 2.13.4 (2023-09-19) 176 | 177 | ### 2.13.3 (2023-09-05) 178 | 179 | ### 2.13.2 (2023-08-17) 180 | 181 | ### 2.13.1 (2023-08-10) 182 | 183 | ## 2.13.0 (2023-08-05) 184 | 185 | 186 | ### Features 187 | 188 | * add type markup property for video/audio ([#168](https://github.com/microlinkhq/html-get/issues/168)) ([d8a33ed](https://github.com/microlinkhq/html-get/commit/d8a33ed52cd408cedcbe41c003148d7f76c893f6)) 189 | 190 | ## 2.12.0 (2023-08-05) 191 | 192 | 193 | ### Features 194 | 195 | * better timestamp inference 
([#167](https://github.com/microlinkhq/html-get/issues/167)) ([6973d25](https://github.com/microlinkhq/html-get/commit/6973d25b96d0c8a01104a8981fbeef9e1f4029bc)) 196 | 197 | ### 2.11.6 (2023-07-21) 198 | 199 | 200 | ### Bug Fixes 201 | 202 | * ensure domain is not falsy ([#166](https://github.com/microlinkhq/html-get/issues/166)) ([e42dcf7](https://github.com/microlinkhq/html-get/commit/e42dcf7012c358635c545bb6e15f51392971a1a1)) 203 | 204 | ### 2.11.5 (2023-07-19) 205 | 206 | ### 2.11.4 (2023-07-17) 207 | 208 | ### 2.11.3 (2023-06-19) 209 | 210 | ### 2.11.2 (2023-06-19) 211 | 212 | ### 2.11.1 (2023-06-06) 213 | 214 | ## 2.11.0 (2023-06-03) 215 | 216 | 217 | ### Features 218 | 219 | * make it cancelable ([#163](https://github.com/microlinkhq/html-get/issues/163)) ([c1d4aa8](https://github.com/microlinkhq/html-get/commit/c1d4aa8cbf9b71827f459df5757474449c9311d0)) 220 | 221 | ### 2.10.7 (2023-06-03) 222 | 223 | ### 2.10.6 (2023-05-15) 224 | 225 | 226 | ### Bug Fixes 227 | 228 | * signal-exit export ([#160](https://github.com/microlinkhq/html-get/issues/160)) ([7a73175](https://github.com/microlinkhq/html-get/commit/7a731758ed4d2c2090b9cc7290c4742c65d0c9c9)) 229 | 230 | ### 2.10.5 (2023-05-13) 231 | 232 | ### 2.10.4 (2023-05-02) 233 | 234 | ### [2.10.3](https://github.com/microlinkhq/html-get/compare/v2.10.2...v2.10.3) (2023-03-29) 235 | 236 | ### 2.10.2 (2023-03-28) 237 | 238 | ### 2.10.1 (2023-03-28) 239 | 240 | 241 | ### Bug Fixes 242 | 243 | * pass a URL instance ([#158](https://github.com/microlinkhq/html-get/issues/158)) ([0c51c64](https://github.com/microlinkhq/html-get/commit/0c51c646089f5ace1bed273ee51399cd63348f47)) 244 | 245 | ## 2.10.0 (2023-03-06) 246 | 247 | 248 | ### Features 249 | 250 | * abort images by default ([#157](https://github.com/microlinkhq/html-get/issues/157)) ([7794404](https://github.com/microlinkhq/html-get/commit/7794404be273f44a37b3cb5ce7f40b0002ad85f7)) 251 | 252 | ### 2.9.33 (2023-03-05) 253 | 254 | ### 2.9.32 (2023-03-04) 255 | 256 | 
### 2.9.31 (2023-02-26) 257 | 258 | ### 2.9.30 (2023-02-26) 259 | 260 | ### 2.9.29 (2023-02-07) 261 | 262 | ### 2.9.28 (2023-01-04) 263 | 264 | ### 2.9.27 (2022-12-11) 265 | 266 | ### 2.9.26 (2022-11-28) 267 | 268 | ### 2.9.25 (2022-11-21) 269 | 270 | ### 2.9.24 (2022-11-11) 271 | 272 | ### 2.9.23 (2022-09-26) 273 | 274 | ### 2.9.22 (2022-08-13) 275 | 276 | ### 2.9.21 (2022-06-27) 277 | 278 | ### 2.9.20 (2022-06-14) 279 | 280 | ### 2.9.19 (2022-06-07) 281 | 282 | ### 2.9.18 (2022-05-29) 283 | 284 | ### 2.9.17 (2022-05-29) 285 | 286 | ### 2.9.16 (2022-05-02) 287 | 288 | ### [2.9.16-0](https://github.com/microlinkhq/html-get/compare/v2.9.15...v2.9.16-0) (2022-05-02) 289 | 290 | 291 | ### Bug Fixes 292 | 293 | * ensure to rewrite relative non root URLs ([d8d3996](https://github.com/microlinkhq/html-get/commit/d8d399604a21e6f4b9e647ef87ab5967d23c5684)) 294 | 295 | ### 2.9.15 (2022-05-02) 296 | 297 | 298 | ### Bug Fixes 299 | 300 | * linter ([1f32f3b](https://github.com/microlinkhq/html-get/commit/1f32f3b5cfa06a8d0225bf0a7789ca16fe901591)) 301 | 302 | ### 2.9.14 (2022-04-04) 303 | 304 | ### 2.9.13 (2022-03-28) 305 | 306 | ### 2.9.12 (2022-03-14) 307 | 308 | ### 2.9.11 (2022-02-25) 309 | 310 | ### [2.9.10](https://github.com/microlinkhq/html-get/compare/v2.9.10-beta.0...v2.9.10) (2022-02-10) 311 | 312 | ### [2.9.10-beta.0](https://github.com/microlinkhq/html-get/compare/v2.9.9...v2.9.10-beta.0) (2022-02-06) 313 | 314 | ### 2.9.9 (2022-01-29) 315 | 316 | ### 2.9.8 (2021-12-15) 317 | 318 | ### 2.9.7 (2021-12-01) 319 | 320 | 321 | ### Bug Fixes 322 | 323 | * sort ([7bf5c81](https://github.com/microlinkhq/html-get/commit/7bf5c81380c7f7a4f1d5b7c688da3f9368a25e6d)) 324 | 325 | ### 2.9.6 (2021-10-25) 326 | 327 | ### [2.9.5](https://github.com/microlinkhq/html-get/compare/v2.9.4...v2.9.5) (2021-09-08) 328 | 329 | 330 | ### Bug Fixes 331 | 332 | * ensure response is a buffer ([842ed98](https://github.com/microlinkhq/html-get/commit/842ed98f713be107ee7a4b945d89e60376df6209)) 333 | 
334 | ### [2.9.4](https://github.com/microlinkhq/html-get/compare/v2.9.3...v2.9.4) (2021-09-08) 335 | 336 | ### 2.9.3 (2021-09-04) 337 | 338 | ### 2.9.2 (2021-08-16) 339 | 340 | ### 2.9.1 (2021-08-03) 341 | 342 | ## [2.9.0](https://github.com/microlinkhq/html-get/compare/v2.8.25...v2.9.0) (2021-06-29) 343 | 344 | 345 | ### Features 346 | 347 | * provide getBrowserless is mandatory ([28bed9a](https://github.com/microlinkhq/html-get/commit/28bed9a7563db6e91961a75e4e4826d607be5ed9)) 348 | 349 | 350 | ### Bug Fixes 351 | 352 | * linter ([cac07e3](https://github.com/microlinkhq/html-get/commit/cac07e30ac312ca3a20508e5a221510e974ddfeb)) 353 | 354 | ### [2.8.25](https://github.com/microlinkhq/html-get/compare/v2.8.24...v2.8.25) (2021-06-25) 355 | 356 | ### [2.8.24](https://github.com/microlinkhq/html-get/compare/v2.8.23...v2.8.24) (2021-06-17) 357 | 358 | ### [2.8.23](https://github.com/microlinkhq/html-get/compare/v2.8.22...v2.8.23) (2021-06-15) 359 | 360 | ### [2.8.22](https://github.com/microlinkhq/html-get/compare/v2.8.21...v2.8.22) (2021-05-31) 361 | 362 | ### [2.8.21](https://github.com/microlinkhq/html-get/compare/v2.8.20...v2.8.21) (2021-03-30) 363 | 364 | ### [2.8.20](https://github.com/microlinkhq/html-get/compare/v2.8.19...v2.8.20) (2021-03-10) 365 | 366 | ### [2.8.19](https://github.com/microlinkhq/html-get/compare/v2.8.18...v2.8.19) (2021-02-24) 367 | 368 | ### [2.8.18](https://github.com/microlinkhq/html-get/compare/v2.8.17...v2.8.18) (2021-02-16) 369 | 370 | ### [2.8.17](https://github.com/microlinkhq/html-get/compare/v2.8.16...v2.8.17) (2021-02-08) 371 | 372 | ### [2.8.16](https://github.com/microlinkhq/html-get/compare/v2.8.15...v2.8.16) (2021-01-30) 373 | 374 | ### [2.8.15](https://github.com/microlinkhq/html-get/compare/v2.8.14...v2.8.15) (2021-01-30) 375 | 376 | ### [2.8.14](https://github.com/microlinkhq/html-get/compare/v2.8.13...v2.8.14) (2021-01-25) 377 | 378 | ### [2.8.13](https://github.com/microlinkhq/html-get/compare/v2.8.12...v2.8.13) 
(2021-01-12) 379 | 380 | 381 | ### Bug Fixes 382 | 383 | * ensure reflected timeout is half timeout ([f6ae775](https://github.com/microlinkhq/html-get/commit/f6ae7758f499c869e8b6b5e3a076df4ac37b4166)) 384 | * setup browserless.evaluate timeout properly ([aa24bb5](https://github.com/microlinkhq/html-get/commit/aa24bb53bffb084ddcbadb53e42a9a73017741eb)) 385 | * setup default timeout when is necessary ([9d8b033](https://github.com/microlinkhq/html-get/commit/9d8b033714323261402101475298772b05e41244)) 386 | 387 | ### [2.8.12](https://github.com/microlinkhq/html-get/compare/v2.8.11...v2.8.12) (2020-12-23) 388 | 389 | ### [2.8.11](https://github.com/microlinkhq/html-get/compare/v2.8.10...v2.8.11) (2020-12-17) 390 | 391 | ### [2.8.10](https://github.com/microlinkhq/html-get/compare/v2.8.9...v2.8.10) (2020-11-10) 392 | 393 | ### [2.8.9](https://github.com/microlinkhq/html-get/compare/v2.8.8...v2.8.9) (2020-11-04) 394 | 395 | 396 | ### Bug Fixes 397 | 398 | * linter ([987af59](https://github.com/microlinkhq/html-get/commit/987af59d4022c020d82a2bdeaa4f80a837c8f906)) 399 | 400 | ### [2.8.8](https://github.com/microlinkhq/html-get/compare/v2.8.7...v2.8.8) (2020-10-21) 401 | 402 | ### [2.8.7](https://github.com/microlinkhq/html-get/compare/v2.8.6...v2.8.7) (2020-10-12) 403 | 404 | ### [2.8.6](https://github.com/microlinkhq/html-get/compare/v2.8.5...v2.8.6) (2020-09-19) 405 | 406 | ### [2.8.5](https://github.com/microlinkhq/html-get/compare/v2.8.4...v2.8.5) (2020-09-16) 407 | 408 | ### [2.8.4](https://github.com/microlinkhq/html-get/compare/v2.8.3...v2.8.4) (2020-09-04) 409 | 410 | ### [2.8.3](https://github.com/microlinkhq/html-get/compare/v2.8.2...v2.8.3) (2020-09-03) 411 | 412 | ### [2.8.2](https://github.com/microlinkhq/html-get/compare/v2.8.1...v2.8.2) (2020-08-24) 413 | 414 | ### [2.8.1](https://github.com/microlinkhq/html-get/compare/v2.8.0...v2.8.1) (2020-08-24) 415 | 416 | ## [2.8.0](https://github.com/microlinkhq/html-get/compare/v2.7.32...v2.8.0) (2020-08-24) 417 | 
418 | 419 | ### Features 420 | 421 | * add rewriteUrls parameter ([9cd5095](https://github.com/microlinkhq/html-get/commit/9cd5095fa0729078703c4f356175767ab6fb0ab7)), closes [#92](https://github.com/microlinkhq/html-get/issues/92) 422 | 423 | 424 | ### Bug Fixes 425 | 426 | * linter ([f5e620c](https://github.com/microlinkhq/html-get/commit/f5e620ce553827789dd486b97716a978483be44e)) 427 | 428 | ### [2.7.32](https://github.com/microlinkhq/html-get/compare/v2.7.31...v2.7.32) (2020-08-23) 429 | 430 | 431 | ### Bug Fixes 432 | 433 | * handle invalid URLs ([2725796](https://github.com/microlinkhq/html-get/commit/27257965fc89fd989244328204083cad8308fd25)) 434 | 435 | ### [2.7.31](https://github.com/microlinkhq/html-get/compare/v2.7.30...v2.7.31) (2020-08-23) 436 | 437 | ### [2.7.30](https://github.com/microlinkhq/html-get/compare/v2.7.29...v2.7.30) (2020-08-11) 438 | 439 | ### [2.7.29](https://github.com/microlinkhq/html-get/compare/v2.7.28...v2.7.29) (2020-07-29) 440 | 441 | ### [2.7.28](https://github.com/microlinkhq/html-get/compare/v2.7.27...v2.7.28) (2020-07-22) 442 | 443 | 444 | ### Bug Fixes 445 | 446 | * linter ([6745da7](https://github.com/microlinkhq/html-get/commit/6745da7a973b8e669dff8b58d8c0f1c8719af925)) 447 | 448 | ### [2.7.27](https://github.com/microlinkhq/html-get/compare/v2.7.26...v2.7.27) (2020-07-22) 449 | 450 | ### [2.7.26](https://github.com/microlinkhq/html-get/compare/v2.7.25...v2.7.26) (2020-07-20) 451 | 452 | ### [2.7.25](https://github.com/microlinkhq/html-get/compare/v2.7.24...v2.7.25) (2020-07-20) 453 | 454 | ### [2.7.24](https://github.com/microlinkhq/html-get/compare/v2.7.23...v2.7.24) (2020-07-16) 455 | 456 | ### [2.7.23](https://github.com/microlinkhq/html-get/compare/v2.7.22...v2.7.23) (2020-07-09) 457 | 458 | ### [2.7.22](https://github.com/microlinkhq/html-get/compare/v2.7.21...v2.7.22) (2020-07-08) 459 | 460 | ### [2.7.21](https://github.com/microlinkhq/html-get/compare/v2.7.20...v2.7.21) (2020-07-05) 461 | 462 | ### 
[2.7.20](https://github.com/microlinkhq/html-get/compare/v2.7.19...v2.7.20) (2020-07-04) 463 | 464 | ### [2.7.19](https://github.com/microlinkhq/html-get/compare/v2.7.18...v2.7.19) (2020-06-30) 465 | 466 | 467 | ### Bug Fixes 468 | 469 | * linter ([51a77e7](https://github.com/microlinkhq/html-get/commit/51a77e7335e1d2766ca9157de4a32c166003769b)) 470 | 471 | ### [2.7.18](https://github.com/microlinkhq/html-get/compare/v2.7.17...v2.7.18) (2020-06-30) 472 | 473 | ### [2.7.17](https://github.com/microlinkhq/html-get/compare/v2.7.16...v2.7.17) (2020-06-25) 474 | 475 | ### [2.7.16](https://github.com/microlinkhq/html-get/compare/v2.7.15...v2.7.16) (2020-06-25) 476 | 477 | ### [2.7.15](https://github.com/microlinkhq/html-get/compare/v2.7.14...v2.7.15) (2020-06-11) 478 | 479 | 480 | ### Bug Fixes 481 | 482 | * linter ([59f267b](https://github.com/microlinkhq/html-get/commit/59f267b802ecba4c7e25c32dbcebad827853be87)) 483 | 484 | ### [2.7.14](https://github.com/microlinkhq/html-get/compare/v2.7.13...v2.7.14) (2020-06-10) 485 | 486 | 487 | ### Bug Fixes 488 | 489 | * explicit disable prerender ([72cfdaf](https://github.com/microlinkhq/html-get/commit/72cfdaf2be42c9e77d4c9c2491ce393f1943c711)) 490 | * linter ([2fc84a1](https://github.com/microlinkhq/html-get/commit/2fc84a10e05d4369eaec5e8eec9b62a011a3f4c9)) 491 | 492 | ### [2.7.13](https://github.com/microlinkhq/html-get/compare/v2.7.12...v2.7.13) (2020-06-09) 493 | 494 | ### [2.7.12](https://github.com/microlinkhq/html-get/compare/v2.7.11...v2.7.12) (2020-06-07) 495 | 496 | ### [2.7.11](https://github.com/microlinkhq/html-get/compare/v2.7.10...v2.7.11) (2020-06-07) 497 | 498 | ### [2.7.10](https://github.com/microlinkhq/html-get/compare/v2.7.9...v2.7.10) (2020-06-07) 499 | 500 | 501 | ### Bug Fixes 502 | 503 | * avoid rewrite inline javascript ([b6baa7c](https://github.com/microlinkhq/html-get/commit/b6baa7c4793be75b9e70547876ba0f289110070e)) 504 | 505 | ### 
[2.7.9](https://github.com/microlinkhq/html-get/compare/v2.7.8...v2.7.9) (2020-06-07) 506 | 507 | 508 | ### Bug Fixes 509 | 510 | * use new keyword ([8c7d28f](https://github.com/microlinkhq/html-get/commit/8c7d28fb50f6c6de9e23a08c9ecb698300eaebac)) 511 | 512 | ### [2.7.8](https://github.com/microlinkhq/html-get/compare/v2.7.7...v2.7.8) (2020-06-07) 513 | 514 | ### [2.7.7](https://github.com/microlinkhq/html-get/compare/v2.7.6...v2.7.7) (2020-06-07) 515 | 516 | ### [2.7.6](https://github.com/microlinkhq/html-get/compare/v2.7.5...v2.7.6) (2020-06-06) 517 | 518 | 519 | ### Bug Fixes 520 | 521 | * linter ([7e1d3c1](https://github.com/microlinkhq/html-get/commit/7e1d3c1f01e86a7febc991318d98dd7c07ef82dc)) 522 | 523 | ### [2.7.5](https://github.com/microlinkhq/html-get/compare/v2.7.4...v2.7.5) (2020-06-06) 524 | 525 | ### [2.7.4](https://github.com/microlinkhq/html-get/compare/v2.7.3...v2.7.4) (2020-06-06) 526 | 527 | 528 | ### Bug Fixes 529 | 530 | * ensure to don't rewrite selectors into urls ([a9aecd6](https://github.com/microlinkhq/html-get/commit/a9aecd6d5ef7e204b2fd19821dd25b20ff1a20a7)) 531 | * linter ([89d3777](https://github.com/microlinkhq/html-get/commit/89d3777e82dc8b38cf9e34d050eb58d51634d27b)) 532 | 533 | ### [2.7.3](https://github.com/microlinkhq/html-get/compare/v2.7.2...v2.7.3) (2020-06-03) 534 | 535 | 536 | ### Bug Fixes 537 | 538 | * better way to handle a date ([673cf3c](https://github.com/microlinkhq/html-get/commit/673cf3ca5d41123fa0f096043126c7ab9d3b4a5f)) 539 | 540 | ### [2.7.2](https://github.com/microlinkhq/html-get/compare/v2.7.1...v2.7.2) (2020-06-03) 541 | 542 | 543 | ### Bug Fixes 544 | 545 | * linter ([2f31844](https://github.com/microlinkhq/html-get/commit/2f31844788a7a3de52b1da66ac2d2494420b1f1a)) 546 | 547 | ### [2.7.1](https://github.com/microlinkhq/html-get/compare/v2.7.0...v2.7.1) (2020-06-03) 548 | 549 | ## [2.7.0](https://github.com/microlinkhq/html-get/compare/v2.6.3...v2.7.0) (2020-06-03) 550 | 551 | ### 
[2.6.3](https://github.com/microlinkhq/html-get/compare/v2.6.2...v2.6.3) (2020-06-01) 552 | 553 | 554 | ### Bug Fixes 555 | 556 | * linter ([028bc8e](https://github.com/microlinkhq/html-get/commit/028bc8ea29f4aae3c9744b13997c269831950b61)) 557 | * prevent inject twice ([ec9720a](https://github.com/microlinkhq/html-get/commit/ec9720a7e27b11b2008cbe25a53ca0c8c3d2d165)) 558 | 559 | ### [2.6.2](https://github.com/microlinkhq/html-get/compare/v2.6.1...v2.6.2) (2020-06-01) 560 | 561 | 562 | ### Bug Fixes 563 | 564 | * linter ([10f3945](https://github.com/microlinkhq/html-get/commit/10f3945235105e479fa12900edab6636d404cf0c)) 565 | 566 | ### [2.6.1](https://github.com/microlinkhq/html-get/compare/v2.6.0...v2.6.1) (2020-06-01) 567 | 568 | ## [2.6.0](https://github.com/microlinkhq/html-get/compare/v2.5.0...v2.6.0) (2020-06-01) 569 | 570 | 571 | ### Features 572 | 573 | * rewrite urls inside css ([52ff754](https://github.com/microlinkhq/html-get/commit/52ff75443be91c6b0b2b8277465511d7337ddefa)) 574 | 575 | ## [2.5.0](https://github.com/microlinkhq/html-get/compare/v2.4.18...v2.5.0) (2020-06-01) 576 | 577 | 578 | ### Features 579 | 580 | * remove non necessary he dependency ([d2a33ae](https://github.com/microlinkhq/html-get/commit/d2a33ae9723be5fcae22218d570b1dc5a4f088f0)) 581 | * rewrite urls into absolute ([29efdb9](https://github.com/microlinkhq/html-get/commit/29efdb98e868b6e7b79b8f4438f0799322f29efc)) 582 | 583 | 584 | ### Bug Fixes 585 | 586 | * linter ([820e019](https://github.com/microlinkhq/html-get/commit/820e01936fa8c23c568fec9a0d78b6b71901b0bc)) 587 | 588 | ### [2.4.18](https://github.com/microlinkhq/html-get/compare/v2.4.16...v2.4.18) (2020-05-31) 589 | 590 | ### [2.4.17](https://github.com/microlinkhq/html-get/compare/v2.4.16...v2.4.17) (2020-05-31) 591 | 592 | ### [2.4.16](https://github.com/microlinkhq/html-get/compare/v2.4.15...v2.4.16) (2020-05-30) 593 | 594 | ### [2.4.15](https://github.com/microlinkhq/html-get/compare/v2.4.14...v2.4.15) (2020-05-25) 595 | 
596 | ### [2.4.14](https://github.com/microlinkhq/html-get/compare/v2.4.13...v2.4.14) (2020-05-25) 597 | 598 | ### [2.4.13](https://github.com/microlinkhq/html-get/compare/v2.4.12...v2.4.13) (2020-05-25) 599 | 600 | ### [2.4.12](https://github.com/microlinkhq/html-get/compare/v2.4.11...v2.4.12) (2020-05-19) 601 | 602 | 603 | ### Bug Fixes 604 | 605 | * remove .only ([ef498da](https://github.com/microlinkhq/html-get/commit/ef498da1ab94bd63ced63cb109b79559370a39a5)) 606 | 607 | ### [2.4.11](https://github.com/microlinkhq/html-get/compare/v2.4.10...v2.4.11) (2020-05-18) 608 | 609 | ### [2.4.10](https://github.com/microlinkhq/html-get/compare/v2.4.9...v2.4.10) (2020-05-16) 610 | 611 | 612 | ### Bug Fixes 613 | 614 | * linter ([4a64268](https://github.com/microlinkhq/html-get/commit/4a642682e3550eae881750527dbeb5d237193fac)) 615 | 616 | ### [2.4.9](https://github.com/microlinkhq/html-get/compare/v2.4.8...v2.4.9) (2020-05-10) 617 | 618 | ### [2.4.8](https://github.com/microlinkhq/html-get/compare/v2.4.7...v2.4.8) (2020-05-08) 619 | 620 | 621 | ### Bug Fixes 622 | 623 | * bind req properly ([b9982ff](https://github.com/microlinkhq/html-get/commit/b9982ff0b5e2a909e086a68b45b9e2a2ad727ad8)) 624 | * linter ([a45c748](https://github.com/microlinkhq/html-get/commit/a45c74881fa8e93b0873abd26bd0aaa7d130027b)) 625 | 626 | ### [2.4.7](https://github.com/microlinkhq/html-get/compare/v2.4.6...v2.4.7) (2020-05-04) 627 | 628 | ### [2.4.6](https://github.com/microlinkhq/html-get/compare/v2.4.5...v2.4.6) (2020-04-26) 629 | 630 | 631 | ### Bug Fixes 632 | 633 | * linter ([3ef2d33](https://github.com/microlinkhq/html-get/commit/3ef2d33618f8d241c99c74a52e2993cc6f47089f)) 634 | 635 | ### [2.4.5](https://github.com/microlinkhq/html-get/compare/v2.4.4...v2.4.5) (2020-04-21) 636 | 637 | ### [2.4.4](https://github.com/microlinkhq/html-get/compare/v2.4.3...v2.4.4) (2020-04-16) 638 | 639 | ### [2.4.3](https://github.com/microlinkhq/html-get/compare/v2.4.2...v2.4.3) (2020-03-26) 640 | 641 | ### 
[2.4.2](https://github.com/microlinkhq/html-get/compare/v2.4.1...v2.4.2) (2020-03-12) 642 | 643 | ### [2.4.1](https://github.com/microlinkhq/html-get/compare/v2.4.0...v2.4.1) (2020-03-06) 644 | 645 | ## [2.4.0](https://github.com/microlinkhq/html-get/compare/v2.3.10...v2.4.0) (2020-03-06) 646 | 647 | 648 | ### Features 649 | 650 | * expose status code ([b66c1e0](https://github.com/microlinkhq/html-get/commit/b66c1e0365690f0526540f1e612c5f9b97f6b430)) 651 | 652 | 653 | ### Bug Fixes 654 | 655 | * linter ([b70ae44](https://github.com/microlinkhq/html-get/commit/b70ae44a17e45994b3f1324f2c69aa87a4c56072)) 656 | 657 | ### [2.3.10](https://github.com/microlinkhq/html-get/compare/v2.3.9...v2.3.10) (2020-02-25) 658 | 659 | ### [2.3.9](https://github.com/microlinkhq/html-get/compare/v2.3.8...v2.3.9) (2020-02-23) 660 | 661 | ### [2.3.8](https://github.com/microlinkhq/html-get/compare/v2.3.7...v2.3.8) (2020-02-23) 662 | 663 | 664 | ### Bug Fixes 665 | 666 | * linter ([7aa80e5](https://github.com/microlinkhq/html-get/commit/7aa80e5de99effa89da00bdb824f73e62055744a)) 667 | * setup shell ([25695a6](https://github.com/microlinkhq/html-get/commit/25695a68fc913b5872c85434ec1d93f5da5bae9f)) 668 | * throw an abort error if response is not present ([78fd15f](https://github.com/microlinkhq/html-get/commit/78fd15fa0a0ba3974b65829a6974e97e413bd6db)) 669 | 670 | ### [2.3.7](https://github.com/microlinkhq/html-get/compare/v2.3.6...v2.3.7) (2020-02-21) 671 | 672 | ### [2.3.6](https://github.com/microlinkhq/html-get/compare/v2.3.5...v2.3.6) (2020-02-19) 673 | 674 | ### [2.3.5](https://github.com/microlinkhq/html-get/compare/v2.3.4...v2.3.5) (2020-02-19) 675 | 676 | 677 | ### Bug Fixes 678 | 679 | * cli url ([ee41128](https://github.com/microlinkhq/html-get/commit/ee411282c1fd63ae4b87609852c19d3f529f4907)) 680 | 681 | ### [2.3.4](https://github.com/microlinkhq/html-get/compare/v2.3.3...v2.3.4) (2020-02-14) 682 | 683 | ### 
[2.3.3](https://github.com/microlinkhq/html-get/compare/v2.3.2...v2.3.3) (2020-02-10) 684 | 685 | ### [2.3.2](https://github.com/microlinkhq/html-get/compare/v2.3.1...v2.3.2) (2020-02-07) 686 | 687 | ### [2.3.1](https://github.com/microlinkhq/html-get/compare/v2.3.0...v2.3.1) (2020-02-02) 688 | 689 | 690 | ### Bug Fixes 691 | 692 | * linter ([73e0d17](https://github.com/microlinkhq/html-get/commit/73e0d17dd8ee8db253425521418e7d42e5a6a1c1)) 693 | 694 | ## [2.3.0](https://github.com/microlinkhq/html-get/compare/v2.2.33...v2.3.0) (2020-02-02) 695 | 696 | 697 | ### Features 698 | 699 | * update dependencies ([fb9ea1e](https://github.com/microlinkhq/html-get/commit/fb9ea1e2b4c8389b8e6166f2ad849224a945d61e)) 700 | 701 | ### [2.2.33](https://github.com/microlinkhq/html-get/compare/v2.2.32...v2.2.33) (2020-01-16) 702 | 703 | ### [2.2.32](https://github.com/microlinkhq/html-get/compare/v2.2.31...v2.2.32) (2020-01-08) 704 | 705 | ### [2.2.31](https://github.com/microlinkhq/html-get/compare/v2.2.30...v2.2.31) (2020-01-02) 706 | 707 | ### [2.2.30](https://github.com/microlinkhq/html-get/compare/v2.2.29...v2.2.30) (2019-11-09) 708 | 709 | ### [2.2.29](https://github.com/microlinkhq/html-get/compare/v2.2.28...v2.2.29) (2019-11-01) 710 | 711 | 712 | ### Bug Fixes 713 | 714 | * linter ([a7f40fe](https://github.com/microlinkhq/html-get/commit/a7f40fe39ed195dc71bc3500cd29bc0ca1ddf996)) 715 | 716 | ### [2.2.28](https://github.com/microlinkhq/html-get/compare/v2.2.27...v2.2.28) (2019-10-31) 717 | 718 | ### [2.2.27](https://github.com/microlinkhq/html-get/compare/v2.2.26...v2.2.27) (2019-10-14) 719 | 720 | ### [2.2.26](https://github.com/microlinkhq/html-get/compare/v2.2.25...v2.2.26) (2019-10-03) 721 | 722 | ### [2.2.25](https://github.com/microlinkhq/html-get/compare/v2.2.24...v2.2.25) (2019-10-02) 723 | 724 | 725 | ### Build System 726 | 727 | * round ms ([af68289](https://github.com/microlinkhq/html-get/commit/af68289)) 728 | 729 | 730 | 731 | ### 
[2.2.24](https://github.com/microlinkhq/html-get/compare/v2.2.23...v2.2.24) (2019-09-30) 732 | 733 | 734 | ### Build System 735 | 736 | * add wikipedia ([c08d0a7](https://github.com/microlinkhq/html-get/commit/c08d0a7)) 737 | 738 | 739 | 740 | ### [2.2.23](https://github.com/microlinkhq/html-get/compare/v2.2.22...v2.2.23) (2019-09-25) 741 | 742 | 743 | ### Build System 744 | 745 | * **deps:** update tldts requirement from ~5.4.1 to ~5.5.0 ([0b3f7c0](https://github.com/microlinkhq/html-get/commit/0b3f7c0)) 746 | * **deps:** update tldts requirement from ~5.4.1 to ~5.5.0 ([#57](https://github.com/microlinkhq/html-get/issues/57)) ([cc7dfe8](https://github.com/microlinkhq/html-get/commit/cc7dfe8)) 747 | 748 | 749 | 750 | ### [2.2.22](https://github.com/microlinkhq/html-get/compare/v2.2.21...v2.2.22) (2019-09-23) 751 | 752 | 753 | ### Build System 754 | 755 | * update dependencies ([0593ab3](https://github.com/microlinkhq/html-get/commit/0593ab3)) 756 | 757 | 758 | 759 | ### [2.2.21](https://github.com/microlinkhq/html-get/compare/v2.2.20...v2.2.21) (2019-09-13) 760 | 761 | 762 | ### Build System 763 | 764 | * **deps:** update @metascraper/helpers requirement ([15a4f36](https://github.com/microlinkhq/html-get/commit/15a4f36)) 765 | * **deps:** update @metascraper/helpers requirement from ~5.6.6… ([#56](https://github.com/microlinkhq/html-get/issues/56)) ([d84a9f7](https://github.com/microlinkhq/html-get/commit/d84a9f7)) 766 | 767 | 768 | 769 | ### [2.2.20](https://github.com/microlinkhq/html-get/compare/v2.2.19...v2.2.20) (2019-09-02) 770 | 771 | 772 | ### Build System 773 | 774 | * tweak timeout timing ([657aecc](https://github.com/microlinkhq/html-get/commit/657aecc)) 775 | 776 | 777 | 778 | ### [2.2.19](https://github.com/microlinkhq/html-get/compare/v2.2.18...v2.2.19) (2019-08-29) 779 | 780 | 781 | 782 | ### [2.2.18](https://github.com/microlinkhq/html-get/compare/v2.2.17...v2.2.18) (2019-08-29) 783 | 784 | 785 | 786 | ### 
[2.2.17](https://github.com/microlinkhq/html-get/compare/v2.2.16...v2.2.17) (2019-08-29) 787 | 788 | 789 | 790 | ### [2.2.16](https://github.com/microlinkhq/html-get/compare/v2.2.15...v2.2.16) (2019-08-29) 791 | 792 | 793 | ### Build System 794 | 795 | * update dependencies ([cde658b](https://github.com/microlinkhq/html-get/commit/cde658b)) 796 | 797 | 798 | 799 | ### [2.2.15](https://github.com/microlinkhq/html-get/compare/v2.2.14...v2.2.15) (2019-08-25) 800 | 801 | 802 | ### Build System 803 | 804 | * increment default timeout ([9609527](https://github.com/microlinkhq/html-get/commit/9609527)) 805 | 806 | 807 | 808 | ### [2.2.14](https://github.com/microlinkhq/html-get/compare/v2.2.13...v2.2.14) (2019-08-12) 809 | 810 | 811 | ### Build System 812 | 813 | * **deps:** update reachable-url requirement from ~1.1.8 to ~1.2… ([#54](https://github.com/microlinkhq/html-get/issues/54)) ([dffcb50](https://github.com/microlinkhq/html-get/commit/dffcb50)) 814 | * **deps:** update reachable-url requirement from ~1.1.8 to ~1.2.0 ([1c6d216](https://github.com/microlinkhq/html-get/commit/1c6d216)) 815 | 816 | 817 | 818 | ### [2.2.13](https://github.com/microlinkhq/html-get/compare/v2.2.12...v2.2.13) (2019-08-12) 819 | 820 | 821 | ### Build System 822 | 823 | * set html charset ([1ec92ad](https://github.com/microlinkhq/html-get/commit/1ec92ad)) 824 | 825 | 826 | 827 | ### [2.2.12](https://github.com/microlinkhq/html-get/compare/v2.2.11...v2.2.12) (2019-07-31) 828 | 829 | 830 | ### Build System 831 | 832 | * **deps:** update write-json-file requirement from ~4.1.0 to ~4… ([#53](https://github.com/microlinkhq/html-get/issues/53)) ([f62afa6](https://github.com/microlinkhq/html-get/commit/f62afa6)) 833 | * **deps:** update write-json-file requirement from ~4.1.0 to ~4.2.0 ([452fcfa](https://github.com/microlinkhq/html-get/commit/452fcfa)) 834 | 835 | 836 | 837 | ### [2.2.11](https://github.com/microlinkhq/html-get/compare/v2.2.10...v2.2.11) (2019-07-11) 838 | 839 | 840 | ### Build 
System 841 | 842 | * **deps:** update @metascraper/helpers requirement ([fa8e5c8](https://github.com/microlinkhq/html-get/commit/fa8e5c8)) 843 | * **deps:** update @metascraper/helpers requirement from ~5.5.0… ([#52](https://github.com/microlinkhq/html-get/issues/52)) ([33a2583](https://github.com/microlinkhq/html-get/commit/33a2583)) 844 | 845 | 846 | 847 | ### [2.2.10](https://github.com/microlinkhq/html-get/compare/v2.2.9...v2.2.10) (2019-07-04) 848 | 849 | 850 | ### Build System 851 | 852 | * **deps:** update tldts requirement from ~5.1.0 to ~5.3.0 ([c778854](https://github.com/microlinkhq/html-get/commit/c778854)) 853 | * **deps:** update tldts requirement from ~5.1.0 to ~5.3.0 ([#51](https://github.com/microlinkhq/html-get/issues/51)) ([425dfe5](https://github.com/microlinkhq/html-get/commit/425dfe5)) 854 | 855 | 856 | 857 | ### [2.2.9](https://github.com/microlinkhq/html-get/compare/v2.2.8...v2.2.9) (2019-06-30) 858 | 859 | 860 | 861 | ### [2.2.8](https://github.com/microlinkhq/html-get/compare/v2.2.7...v2.2.8) (2019-06-30) 862 | 863 | 864 | ### Bug Fixes 865 | 866 | * text --> txt ([a9c6f24](https://github.com/microlinkhq/html-get/commit/a9c6f24)) 867 | 868 | 869 | 870 | ### [2.2.7](https://github.com/microlinkhq/html-get/compare/v2.2.6...v2.2.7) (2019-06-21) 871 | 872 | 873 | ### Build System 874 | 875 | * update dependencies ([82e61b8](https://github.com/microlinkhq/html-get/commit/82e61b8)) 876 | 877 | 878 | 879 | ### [2.2.6](https://github.com/microlinkhq/html-get/compare/v2.2.5...v2.2.6) (2019-06-20) 880 | 881 | 882 | ### Build System 883 | 884 | * add more html compatible mime types ([4a2d392](https://github.com/microlinkhq/html-get/commit/4a2d392)) 885 | 886 | 887 | 888 | ### [2.2.5](https://github.com/microlinkhq/html-get/compare/v2.2.4...v2.2.5) (2019-06-19) 889 | 890 | 891 | ### Build System 892 | 893 | * update travis ([92ad2df](https://github.com/microlinkhq/html-get/commit/92ad2df)) 894 | 895 | 896 | 897 | ### 
[2.2.4](https://github.com/microlinkhq/html-get/compare/v2.2.3...v2.2.4) (2019-06-12) 898 | 899 | 900 | ### Build System 901 | 902 | * update dependencies ([f0ff053](https://github.com/microlinkhq/html-get/commit/f0ff053)) 903 | 904 | 905 | 906 | ### [2.2.3](https://github.com/microlinkhq/html-get/compare/v2.2.2...v2.2.3) (2019-06-12) 907 | 908 | 909 | ### Build System 910 | 911 | * update dependencies ([c256817](https://github.com/microlinkhq/html-get/commit/c256817)) 912 | 913 | 914 | 915 | ### [2.2.2](https://github.com/microlinkhq/html-get/compare/v2.2.1...v2.2.2) (2019-06-12) 916 | 917 | 918 | ### Bug Fixes 919 | 920 | * setup default content type ([00615c4](https://github.com/microlinkhq/html-get/commit/00615c4)) 921 | 922 | 923 | ### Tests 924 | 925 | * update ([aaa7107](https://github.com/microlinkhq/html-get/commit/aaa7107)) 926 | * use a valid mp4 video url ([5b1ecfd](https://github.com/microlinkhq/html-get/commit/5b1ecfd)) 927 | 928 | 929 | 930 | ### [2.2.1](https://github.com/microlinkhq/html-get/compare/v2.2.0...v2.2.1) (2019-05-31) 931 | 932 | 933 | ### Build System 934 | 935 | * remove pass host ([43f3af3](https://github.com/microlinkhq/html-get/commit/43f3af3)) 936 | 937 | 938 | 939 | ## [2.2.0](https://github.com/microlinkhq/html-get/compare/v2.1.5...v2.2.0) (2019-05-30) 940 | 941 | 942 | ### Features 943 | 944 | * expose .getDomainWithoutSuffix method ([c1d87da](https://github.com/microlinkhq/html-get/commit/c1d87da)) 945 | 946 | 947 | 948 | ### [2.1.5](https://github.com/microlinkhq/html-get/compare/v2.1.4...v2.1.5) (2019-05-20) 949 | 950 | 951 | ### Build System 952 | 953 | * change git-authors-cli position ([d75aedd](https://github.com/microlinkhq/html-get/commit/d75aedd)) 954 | 955 | 956 | 957 | ### [2.1.4](https://github.com/microlinkhq/html-get/compare/v2.1.3...v2.1.4) (2019-05-17) 958 | 959 | 960 | ### Bug Fixes 961 | 962 | * require URL constructor ([72a4bc3](https://github.com/microlinkhq/html-get/commit/72a4bc3)) 963 | 964 | 965 | 966 | 
### [2.1.3](https://github.com/microlinkhq/html-get/compare/v2.1.2...v2.1.3) (2019-05-17) 967 | 968 | 969 | ### Build System 970 | 971 | * determinate host from reachable url ([4c468d7](https://github.com/microlinkhq/html-get/commit/4c468d7)) 972 | 973 | 974 | 975 | ### [2.1.2](https://github.com/microlinkhq/html-get/compare/v2.1.0...v2.1.2) (2019-05-16) 976 | 977 | 978 | ### Bug Fixes 979 | 980 | * add scripts to files ([4fc3cb8](https://github.com/microlinkhq/html-get/commit/4fc3cb8)) 981 | 982 | 983 | ### Build System 984 | 985 | * short error log ([f08a9ee](https://github.com/microlinkhq/html-get/commit/f08a9ee)) 986 | 987 | 988 | 989 | ### [2.1.1](https://github.com/microlinkhq/html-get/compare/v2.1.0...v2.1.1) (2019-05-16) 990 | 991 | 992 | ### Bug Fixes 993 | 994 | * add scripts to files ([4fc3cb8](https://github.com/microlinkhq/html-get/commit/4fc3cb8)) 995 | 996 | 997 | 998 | 999 | # [2.1.0](https://github.com/microlinkhq/html-get/compare/v2.0.18...v2.1.0) (2019-05-16) 1000 | 1001 | 1002 | ### Features 1003 | 1004 | * Improve domains auto detection ([#50](https://github.com/microlinkhq/html-get/issues/50)) ([782446d](https://github.com/microlinkhq/html-get/commit/782446d)) 1005 | 1006 | 1007 | 1008 | ## [2.0.18](https://github.com/microlinkhq/html-get/compare/v2.0.17...v2.0.18) (2019-05-09) 1009 | 1010 | 1011 | ### Build System 1012 | 1013 | * update copy ([8c6306e](https://github.com/microlinkhq/html-get/commit/8c6306e)) 1014 | 1015 | 1016 | 1017 | ### [2.0.17](https://github.com/Kikobeats/html-get/compare/v2.0.16...v2.0.17) (2019-05-07) 1018 | 1019 | 1020 | ### Bug Fixes 1021 | 1022 | * **package:** update @metascraper/helpers to version 5.1.0 ([7bff17c](https://github.com/Kikobeats/html-get/commit/7bff17c)) 1023 | * **package:** update @metascraper/helpers to version 5.2.0 ([e3eef70](https://github.com/Kikobeats/html-get/commit/e3eef70)) 1024 | * **package:** update time-span to version 3.1.0 
([1001609](https://github.com/Kikobeats/html-get/commit/1001609)) 1025 | * command ([fc18507](https://github.com/Kikobeats/html-get/commit/fc18507)) 1026 | 1027 | 1028 | ### Build System 1029 | 1030 | * add og:type as metadata ([57bd613](https://github.com/Kikobeats/html-get/commit/57bd613)) 1031 | * add og:type as metadata ([#48](https://github.com/Kikobeats/html-get/issues/48)) ([32c45bb](https://github.com/Kikobeats/html-get/commit/32c45bb)) 1032 | * update dependencies ([70ca8f1](https://github.com/Kikobeats/html-get/commit/70ca8f1)) 1033 | * use html detection based on file type ([c5e099e](https://github.com/Kikobeats/html-get/commit/c5e099e)) 1034 | * use html detection based on headers and content ([27d52c6](https://github.com/Kikobeats/html-get/commit/27d52c6)) 1035 | 1036 | 1037 | 1038 | 1039 | ## [2.0.16](https://github.com/microlinkhq/html-get/compare/v2.0.15...v2.0.16) (2019-04-01) 1040 | 1041 | 1042 | 1043 | 1044 | ## [2.0.15](https://github.com/microlinkhq/html-get/compare/v2.0.14...v2.0.15) (2019-03-31) 1045 | 1046 | 1047 | ### Bug Fixes 1048 | 1049 | * **package:** update mem to version 4.3.0 ([a13ff0c](https://github.com/microlinkhq/html-get/commit/a13ff0c)) 1050 | 1051 | 1052 | 1053 | 1054 | ## [2.0.14](https://github.com/microlinkhq/html-get/compare/v2.0.13...v2.0.14) (2019-03-31) 1055 | 1056 | 1057 | ### Bug Fixes 1058 | 1059 | * **package:** update p-cancelable to version 2.0.0 ([1543cf1](https://github.com/microlinkhq/html-get/commit/1543cf1)) 1060 | 1061 | 1062 | 1063 | 1064 | ## [2.0.13](https://github.com/microlinkhq/html-get/compare/v2.0.12...v2.0.13) (2019-03-26) 1065 | 1066 | 1067 | ### Bug Fixes 1068 | 1069 | * don't throw an error on cancel ([98cddef](https://github.com/microlinkhq/html-get/commit/98cddef)) 1070 | 1071 | 1072 | 1073 | 1074 | ## [2.0.12](https://github.com/microlinkhq/html-get/compare/v2.0.10...v2.0.12) (2019-03-17) 1075 | 1076 | 1077 | ### Bug Fixes 1078 | 1079 | * **package:** update 
[@metascraper](https://github.com/metascraper)/helpers to version 5.0.0 ([138879a](https://github.com/microlinkhq/html-get/commit/138879a)) 1080 | * **package:** update mem to version 4.2.0 ([60f8b90](https://github.com/microlinkhq/html-get/commit/60f8b90)) 1081 | * **package:** update p-cancelable to version 1.1.0 ([2631d34](https://github.com/microlinkhq/html-get/commit/2631d34)) 1082 | * **package:** update time-span to version 3.0.0 ([b53deef](https://github.com/microlinkhq/html-get/commit/b53deef)) 1083 | 1084 | 1085 | 1086 | 1087 | ## [2.0.11](https://github.com/microlinkhq/html-get/compare/v2.0.10...v2.0.11) (2019-03-08) 1088 | 1089 | 1090 | ### Bug Fixes 1091 | 1092 | * **package:** update p-cancelable to version 1.1.0 ([2631d34](https://github.com/microlinkhq/html-get/commit/2631d34)) 1093 | * **package:** update time-span to version 3.0.0 ([b53deef](https://github.com/microlinkhq/html-get/commit/b53deef)) 1094 | 1095 | 1096 | 1097 | 1098 | ## [2.0.10](https://github.com/microlinkhq/html-get/compare/v2.0.9...v2.0.10) (2019-01-25) 1099 | 1100 | 1101 | ### Bug Fixes 1102 | 1103 | * remove timeout ([a02046f](https://github.com/microlinkhq/html-get/commit/a02046f)), closes [#13](https://github.com/microlinkhq/html-get/issues/13) 1104 | 1105 | 1106 | 1107 | 1108 | ## [2.0.9](https://github.com/microlinkhq/html-get/compare/v2.0.8...v2.0.9) (2019-01-17) 1109 | 1110 | 1111 | ### Bug Fixes 1112 | 1113 | * **package:** update got to version 9.6.0 ([bcb3b9e](https://github.com/microlinkhq/html-get/commit/bcb3b9e)) 1114 | 1115 | 1116 | 1117 | 1118 | ## [2.0.8](https://github.com/microlinkhq/html-get/compare/v2.0.7...v2.0.8) (2019-01-14) 1119 | 1120 | 1121 | ### Bug Fixes 1122 | 1123 | * add missing dev dependency ([26b51ca](https://github.com/microlinkhq/html-get/commit/26b51ca)) 1124 | 1125 | 1126 | 1127 | 1128 | ## [2.0.7](https://github.com/microlinkhq/html-get/compare/v2.0.6...v2.0.7) (2019-01-10) 1129 | 1130 | 1131 | ### Bug Fixes 1132 | 1133 | * **package:** 
update [@metascraper](https://github.com/metascraper)/helpers to version 4.9.0 ([14bf5f0](https://github.com/microlinkhq/html-get/commit/14bf5f0)) 1134 | 1135 | 1136 | 1137 | 1138 | ## [2.0.6](https://github.com/microlinkhq/html-get/compare/v2.0.5...v2.0.6) (2019-01-07) 1139 | 1140 | 1141 | ### Bug Fixes 1142 | 1143 | * **package:** update browserless to version 5.1.0 ([338d339](https://github.com/microlinkhq/html-get/commit/338d339)) 1144 | 1145 | 1146 | 1147 | 1148 | ## [2.0.5](https://github.com/microlinkhq/html-get/compare/v2.0.4...v2.0.5) (2019-01-07) 1149 | 1150 | 1151 | ### Bug Fixes 1152 | 1153 | * **package:** update browserless to version 5.0.0 ([f946861](https://github.com/microlinkhq/html-get/commit/f946861)) 1154 | 1155 | 1156 | 1157 | 1158 | ## [2.0.4](https://github.com/microlinkhq/html-get/compare/v2.0.3...v2.0.4) (2018-12-19) 1159 | 1160 | 1161 | 1162 | 1163 | ## [2.0.3](https://github.com/microlinkhq/html-get/compare/v2.0.2...v2.0.3) (2018-12-19) 1164 | 1165 | 1166 | ### Bug Fixes 1167 | 1168 | * **package:** update got to version 9.5.0 ([3780bf7](https://github.com/microlinkhq/html-get/commit/3780bf7)) 1169 | 1170 | 1171 | 1172 | 1173 | ## [2.0.2](https://github.com/microlinkhq/html-get/compare/v2.0.1...v2.0.2) (2018-12-17) 1174 | 1175 | 1176 | ### Bug Fixes 1177 | 1178 | * correct html markup ([2c1f925](https://github.com/microlinkhq/html-get/commit/2c1f925)) 1179 | 1180 | 1181 | 1182 | 1183 | ## [2.0.1](https://github.com/microlinkhq/html-get/compare/v2.0.0...v2.0.1) (2018-12-16) 1184 | 1185 | 1186 | ### Bug Fixes 1187 | 1188 | * remove og image from template ([96d139b](https://github.com/microlinkhq/html-get/commit/96d139b)) 1189 | 1190 | 1191 | 1192 | 1193 | # [2.0.0](https://github.com/microlinkhq/html-get/compare/v1.6.5...v2.0.0) (2018-12-16) 1194 | 1195 | 1196 | ### Features 1197 | 1198 | * Ensure to resolve media URLs ([61aee97](https://github.com/microlinkhq/html-get/commit/61aee97)) 1199 | 1200 | 1201 | ### BREAKING CHANGES 1202 | 1203 
| * Rename fetchMode → getMode 1204 | 1205 | 1206 | 1207 | 1208 | ## [1.6.5](https://github.com/microlinkhq/html-get/compare/v1.6.4...v1.6.5) (2018-12-11) 1209 | 1210 | 1211 | ### Bug Fixes 1212 | 1213 | * require url ([6981590](https://github.com/microlinkhq/html-get/commit/6981590)) 1214 | 1215 | 1216 | 1217 | 1218 | ## [1.6.4](https://github.com/microlinkhq/html-get/compare/v1.6.3...v1.6.4) (2018-12-11) 1219 | 1220 | 1221 | 1222 | 1223 | ## [1.6.3](https://github.com/microlinkhq/html-get/compare/v1.6.2...v1.6.3) (2018-12-11) 1224 | 1225 | 1226 | ### Bug Fixes 1227 | 1228 | * **package:** update got to version 9.4.0 ([5a2f3dd](https://github.com/microlinkhq/html-get/commit/5a2f3dd)) 1229 | 1230 | 1231 | 1232 | 1233 | ## [1.6.2](https://github.com/microlinkhq/html-get/compare/v1.6.1...v1.6.2) (2018-12-01) 1234 | 1235 | 1236 | ### Bug Fixes 1237 | 1238 | * increase timeout ([ae81e61](https://github.com/microlinkhq/html-get/commit/ae81e61)) 1239 | 1240 | 1241 | 1242 | 1243 | ## [1.6.1](https://github.com/microlinkhq/html-get/compare/v1.6.0...v1.6.1) (2018-12-01) 1244 | 1245 | 1246 | ### Bug Fixes 1247 | 1248 | * ensure call cancel fn properly ([ee0d9fe](https://github.com/microlinkhq/html-get/commit/ee0d9fe)) 1249 | 1250 | 1251 | 1252 | 1253 | # [1.6.0](https://github.com/microlinkhq/html-get/compare/v1.5.1...v1.6.0) (2018-11-30) 1254 | 1255 | 1256 | ### Features 1257 | 1258 | * better url encoding ([07b3990](https://github.com/microlinkhq/html-get/commit/07b3990)) 1259 | 1260 | 1261 | 1262 | 1263 | ## [1.5.1](https://github.com/microlinkhq/html-get/compare/v1.5.0...v1.5.1) (2018-11-19) 1264 | 1265 | 1266 | ### Bug Fixes 1267 | 1268 | * add bin folder to npm ([f1f3cf3](https://github.com/microlinkhq/html-get/commit/f1f3cf3)) 1269 | 1270 | 1271 | 1272 | 1273 | # [1.5.0](https://github.com/microlinkhq/html-get/compare/v1.4.5...v1.5.0) (2018-11-19) 1274 | 1275 | 1276 | ### Features 1277 | 1278 | * add cli 
([aeff0a4](https://github.com/microlinkhq/html-get/commit/aeff0a4)) 1279 | * ensure url is encoded ([8ae42b3](https://github.com/microlinkhq/html-get/commit/8ae42b3)) 1280 | 1281 | 1282 | 1283 | 1284 | ## [1.4.5](https://github.com/microlinkhq/html-get/compare/v1.4.4...v1.4.5) (2018-11-17) 1285 | 1286 | 1287 | ### Bug Fixes 1288 | 1289 | * ensure to decode HTML entities ([d47fd4e](https://github.com/microlinkhq/html-get/commit/d47fd4e)) 1290 | 1291 | 1292 | 1293 | 1294 | ## [1.4.4](https://github.com/microlinkhq/html-get/compare/v1.4.3...v1.4.4) (2018-11-17) 1295 | 1296 | 1297 | 1298 | 1299 | ## [1.4.3](https://github.com/microlinkhq/html-get/compare/v1.4.2...v1.4.3) (2018-11-15) 1300 | 1301 | 1302 | ### Bug Fixes 1303 | 1304 | * **package:** update browserless to version 4.2.0 ([25085e4](https://github.com/microlinkhq/html-get/commit/25085e4)) 1305 | 1306 | 1307 | 1308 | 1309 | ## [1.4.2](https://github.com/microlinkhq/html-get/compare/v1.4.1...v1.4.2) (2018-10-30) 1310 | 1311 | 1312 | ### Bug Fixes 1313 | 1314 | * **package:** update got to version 9.3.0 ([39d8774](https://github.com/microlinkhq/html-get/commit/39d8774)) 1315 | 1316 | 1317 | 1318 | 1319 | ## 1.4.1 (2018-10-30) 1320 | 1321 | 1322 | 1323 | 1324 | # 1.4.0 (2018-10-24) 1325 | 1326 | 1327 | 1328 | 1329 | ## 1.3.6 (2018-10-23) 1330 | 1331 | 1332 | ### Bug Fixes 1333 | 1334 | * remove youtube from auto domains ([5f20ed0](https://github.com/microlinkhq/html-get/commit/5f20ed0)) 1335 | 1336 | 1337 | 1338 | 1339 | ## 1.3.5 (2018-10-23) 1340 | 1341 | 1342 | 1343 | 1344 | ## 1.3.4 (2018-10-23) 1345 | 1346 | 1347 | 1348 | 1349 | ## 1.3.3 (2018-10-14) 1350 | 1351 | 1352 | 1353 | 1354 | ## 1.3.2 (2018-10-09) 1355 | 1356 | 1357 | 1358 | 1359 | ## 1.3.1 (2018-10-09) 1360 | 1361 | 1362 | 1363 | 1364 | # 1.3.0 (2018-09-25) 1365 | 1366 | 1367 | 1368 | 1369 | # 1.2.0 (2018-09-23) 1370 | 1371 | 1372 | 1373 | 1374 | ## 1.1.3 (2018-09-23) 1375 | 1376 | 1377 | 1378 | 1379 | ## 1.1.2 (2018-09-18) 1380 | 1381 | 1382 | 1383 
| 1384 | ## 1.1.1 (2018-09-16) 1385 | 1386 | 1387 | 1388 | 1389 | # 1.1.0 (2018-09-12) 1390 | 1391 | 1392 | 1393 | 1394 | ## 1.0.11 (2018-09-10) 1395 | 1396 | 1397 | ### Bug Fixes 1398 | 1399 | * **package:** update got to version 9.2.0 ([183ec72](https://github.com/microlinkhq/html-get/commit/183ec72)) 1400 | 1401 | 1402 | 1403 | 1404 | ## 1.0.10 (2018-08-29) 1405 | 1406 | 1407 | ### Bug Fixes 1408 | 1409 | * **package:** update browserless to version 4.1.0 ([bb9b68c](https://github.com/microlinkhq/html-get/commit/bb9b68c)) 1410 | * **package:** update got to version 9.1.0 ([065bd3d](https://github.com/microlinkhq/html-get/commit/065bd3d)) 1411 | 1412 | 1413 | 1414 | 1415 | ## 1.0.9 (2018-08-07) 1416 | 1417 | 1418 | ### Bug Fixes 1419 | 1420 | * **package:** update got to version 9.0.0 ([33e5d52](https://github.com/microlinkhq/html-get/commit/33e5d52)) 1421 | 1422 | 1423 | 1424 | 1425 | ## 1.0.8 (2018-07-23) 1426 | 1427 | 1428 | 1429 | 1430 | ## 1.0.7 (2018-06-30) 1431 | 1432 | 1433 | 1434 | 1435 | ## 1.0.6 (2018-06-29) 1436 | 1437 | 1438 | 1439 | 1440 | ## 1.0.5 (2018-06-29) 1441 | 1442 | 1443 | 1444 | 1445 | ## 1.0.4 (2018-06-29) 1446 | 1447 | 1448 | 1449 | 1450 | # 1.4.0 (2018-10-24) 1451 | 1452 | * Add debug logs ([c419eed](https://github.com/microlinkhq/html-get/commit/c419eed)) 1453 | * Add timeout for ping ([db1913f](https://github.com/microlinkhq/html-get/commit/db1913f)) 1454 | * Add url resolution ([43fa92c](https://github.com/microlinkhq/html-get/commit/43fa92c)) 1455 | * Adjust timeouts ([c5da87e](https://github.com/microlinkhq/html-get/commit/c5da87e)) 1456 | * Release 1.3.7 ([d429c89](https://github.com/microlinkhq/html-get/commit/d429c89)) 1457 | * Update scripts ([24b93f1](https://github.com/microlinkhq/html-get/commit/24b93f1)) 1458 | 1459 | 1460 | 1461 | 1462 | ## 1.3.7 (2018-10-24) 1463 | 1464 | * Add debug logs ([c419eed](https://github.com/microlinkhq/html-get/commit/c419eed)) 1465 | * Add timeout for ping 
([db1913f](https://github.com/microlinkhq/html-get/commit/db1913f)) 1466 | * Add url resolution ([43fa92c](https://github.com/microlinkhq/html-get/commit/43fa92c)) 1467 | * Adjust timeouts ([c5da87e](https://github.com/microlinkhq/html-get/commit/c5da87e)) 1468 | 1469 | 1470 | 1471 | 1472 | ## 1.3.6 (2018-10-23) 1473 | 1474 | * fix: remove youtube from auto domains ([5f20ed0](https://github.com/microlinkhq/html-get/commit/5f20ed0)) 1475 | 1476 | 1477 | 1478 | 1479 | ## 1.3.5 (2018-10-23) 1480 | 1481 | * Migrate linter staged ([3b140f9](https://github.com/microlinkhq/html-get/commit/3b140f9)) 1482 | 1483 | 1484 | 1485 | 1486 | ## 1.3.4 (2018-10-23) 1487 | 1488 | * Increment prerender time to 8s ([342f4ce](https://github.com/microlinkhq/html-get/commit/342f4ce)) 1489 | 1490 | 1491 | 1492 | 1493 | ## 1.3.3 (2018-10-14) 1494 | 1495 | * Ensure always return domain ([07e4184](https://github.com/microlinkhq/html-get/commit/07e4184)) 1496 | 1497 | 1498 | 1499 | 1500 | ## 1.3.2 (2018-10-09) 1501 | 1502 | * Remove Amazon ([99f4481](https://github.com/microlinkhq/html-get/commit/99f4481)) 1503 | 1504 | 1505 | 1506 | 1507 | ## 1.3.1 (2018-10-09) 1508 | 1509 | * Add Amazon as fetch mode ([bebb03c](https://github.com/microlinkhq/html-get/commit/bebb03c)) 1510 | * Update dependencies ([1bae7ef](https://github.com/microlinkhq/html-get/commit/1bae7ef)) 1511 | 1512 | 1513 | 1514 | 1515 | # 1.3.0 (2018-09-25) 1516 | 1517 | * Use timeout for version ([2664d51](https://github.com/microlinkhq/html-get/commit/2664d51)) 1518 | 1519 | 1520 | 1521 | 1522 | # 1.2.0 (2018-09-23) 1523 | 1524 | * Sort domains based on ranking ([d905800](https://github.com/microlinkhq/html-get/commit/d905800)) 1525 | 1526 | 1527 | 1528 | 1529 | ## 1.1.3 (2018-09-23) 1530 | 1531 | * Bind context ([ae0971b](https://github.com/microlinkhq/html-get/commit/ae0971b)) 1532 | 1533 | 1534 | 1535 | 1536 | ## 1.1.2 (2018-09-18) 1537 | 1538 | * Resolve req properly under prerender timeout 
([c63ebe9](https://github.com/microlinkhq/html-get/commit/c63ebe9)) 1539 | 1540 | 1541 | 1542 | 1543 | ## 1.1.1 (2018-09-16) 1544 | 1545 | * Reflect fetch on prerender ([ea2d230](https://github.com/microlinkhq/html-get/commit/ea2d230)) 1546 | 1547 | 1548 | 1549 | 1550 | # 1.1.0 (2018-09-12) 1551 | 1552 | * Add hard timeout ([c0e39b3](https://github.com/microlinkhq/html-get/commit/c0e39b3)) 1553 | 1554 | 1555 | 1556 | 1557 | ## 1.0.11 (2018-09-10) 1558 | 1559 | * Lock standard version ([1ce4e76](https://github.com/microlinkhq/html-get/commit/1ce4e76)) 1560 | * Reflect fetch mode used on the output ([5d926fe](https://github.com/microlinkhq/html-get/commit/5d926fe)) 1561 | * fix(package): update got to version 9.2.0 ([183ec72](https://github.com/microlinkhq/html-get/commit/183ec72)) 1562 | 1563 | 1564 | 1565 | 1566 | ## 1.0.10 (2018-08-29) 1567 | 1568 | * fix(package): update browserless to version 4.1.0 ([bb9b68c](https://github.com/microlinkhq/html-get/commit/bb9b68c)) 1569 | * fix(package): update got to version 9.1.0 ([065bd3d](https://github.com/microlinkhq/html-get/commit/065bd3d)) 1570 | 1571 | 1572 | 1573 | 1574 | ## 1.0.9 (2018-08-07) 1575 | 1576 | * fix(package): update got to version 9.0.0 ([33e5d52](https://github.com/microlinkhq/html-get/commit/33e5d52)) 1577 | 1578 | 1579 | 1580 | 1581 | ## 1.0.8 (2018-07-23) 1582 | 1583 | * Fix cancel fetch ([aff43a4](https://github.com/microlinkhq/html-get/commit/aff43a4)) 1584 | 1585 | 1586 | 1587 | 1588 | ## 1.0.7 (2018-06-30) 1589 | 1590 | * Fix typo ([126a8d2](https://github.com/microlinkhq/html-get/commit/126a8d2)) 1591 | * Update deps ([aa08a96](https://github.com/microlinkhq/html-get/commit/aa08a96)) 1592 | 1593 | 1594 | 1595 | 1596 | ## 1.0.6 (2018-06-29) 1597 | 1598 | * Unify browserless instance ([40aae0e](https://github.com/microlinkhq/html-get/commit/40aae0e)) 1599 | 1600 | 1601 | 1602 | 1603 | ## 1.0.5 (2018-06-29) 1604 | 1605 | * Rename ([a6240ec](https://github.com/microlinkhq/html-get/commit/a6240ec)) 
1606 | 1607 | 1608 | 1609 | 1610 | ## 1.0.4 (2018-06-29) 1611 | 1612 | * Lazy browserless initialization ([47f69da](https://github.com/microlinkhq/html-get/commit/47f69da)) 1613 | * Update ([184ae17](https://github.com/microlinkhq/html-get/commit/184ae17)) 1614 | * Update copy ([af3d011](https://github.com/microlinkhq/html-get/commit/af3d011)) 1615 | * Update README.md ([df3799b](https://github.com/microlinkhq/html-get/commit/df3799b)) 1616 | * docs(readme): add Greenkeeper badge ([b80cd1e](https://github.com/microlinkhq/html-get/commit/b80cd1e)) 1617 | 1618 | 1619 | 1620 | 1621 | ## 1.0.3 (2018-06-28) 1622 | 1623 | * Expose create browserless ([7b487e5](https://github.com/microlinkhq/html-get/commit/7b487e5)) 1624 | 1625 | 1626 | 1627 | 1628 | ## 1.0.2 (2018-06-28) 1629 | 1630 | * Add puppeteer as peer dependency ([48b0ec8](https://github.com/microlinkhq/html-get/commit/48b0ec8)) 1631 | * Update README.md ([4425c3c](https://github.com/microlinkhq/html-get/commit/4425c3c)) 1632 | 1633 | 1634 | 1635 | 1636 | ## 1.0.1 (2018-06-26) 1637 | 1638 | * Add default puppeteer config ([373f8c4](https://github.com/microlinkhq/html-get/commit/373f8c4)) 1639 | * Update README.md ([7661915](https://github.com/microlinkhq/html-get/commit/7661915)) 1640 | * Update README.md ([fd93e9a](https://github.com/microlinkhq/html-get/commit/fd93e9a)) 1641 | 1642 | 1643 | 1644 | 1645 | # 1.0.0 (2018-06-26) 1646 | 1647 | * First commit ([f60248e](https://github.com/microlinkhq/html-get/commit/f60248e)) 1648 | * Rename ([800d199](https://github.com/microlinkhq/html-get/commit/800d199)) 1649 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright © 2019 Microlink (microlink.io) 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to 
deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | microlink logo 3 | microlink logo 4 |
5 |
6 |
7 | 8 | ![Last version](https://img.shields.io/github/tag/microlinkhq/html-get.svg?style=flat-square) 9 | [![Coverage Status](https://img.shields.io/coveralls/microlinkhq/html-get.svg?style=flat-square)](https://coveralls.io/github/microlinkhq/html-get) 10 | [![NPM Status](https://img.shields.io/npm/dm/html-get.svg?style=flat-square)](https://www.npmjs.org/package/html-get) 11 | 12 | > Get the HTML from any website, fine-tuned for correction & speed. 13 | 14 | ## Features 15 | 16 | - Get HTML markup for any URL, including images, video, audio, or pdf. 17 | - Block ad trackers and any unnecessary network subrequests. 18 | - Handle unreachable or timeout URLs gracefully. 19 | - Ensure HTML markup is appropriately encoded. 20 | 21 | **html-get** takes advantage of [puppeteer](https://github.com/GoogleChrome/puppeteer) headless technology when it is needed, such as for client-side apps that need to be prerendered. 22 | 23 | ## Install 24 | 25 | ```bash 26 | $ npm install browserless puppeteer html-get --save 27 | ``` 28 | 29 | ## Usage 30 | 31 | ```js 32 | const createBrowserless = require('browserless') 33 | const getHTML = require('html-get') 34 | 35 | // Spawn Chromium process once 36 | const browserlessFactory = createBrowserless() 37 | 38 | // Kill the process when Node.js exit 39 | process.on('exit', () => { 40 | console.log('closing resources!') 41 | browserlessFactory.close() 42 | }) 43 | 44 | const getContent = async url => { 45 | // create a browser context inside Chromium process 46 | const browserContext = browserlessFactory.createContext() 47 | const getBrowserless = () => browserContext 48 | const result = await getHTML(url, { getBrowserless }) 49 | // close the browser context after it's used 50 | await getBrowserless((browser) => browser.destroyContext()) 51 | return result 52 | } 53 | 54 | getContent('https://example.com') 55 | .then(content => { 56 | console.log(content) 57 | process.exit() 58 | }) 59 | .catch(error => { 60 | console.error(error) 61 | 
process.exit(1) 62 | }) 63 | ``` 64 | 65 | ### Command Line 66 | 67 | ``` 68 | $ npx html-get https://example.com 69 | ``` 70 | 71 | ## API 72 | 73 | ### getHTML(url, [options]) 74 | 75 | #### url 76 | 77 | *Required*
78 | Type: `string` 79 | 80 | The target URL for getting the HTML markup. 81 | 82 | #### options 83 | 84 | ##### encoding 85 | 86 | Type: `string` 87 | Default: `'utf-8'` 88 | 89 | It ensures the HTML markup is encoded using the encoding value provided. 90 | 91 | The value will be passed to [`html-encode`](https://github.com/kikobeats/html-encode) 92 | 93 | ##### getBrowserless 94 | 95 | *Required*
96 | Type: `function` 97 | 98 | A function that should return a [browserless](https://browserless.js.org/) instance to be used for interacting with puppeteer. 99 | 100 | ##### getMode 101 | 102 | Type: `function` 103 | 104 | It determines the strategy to use based on the `url`, the possible values being `'fetch'` or `'prerender'`. 105 | 106 | ##### getTemporalFile 107 | 108 | Type: `function` 109 | 110 | It creates a temporary file. 111 | 112 | ##### gotOpts 113 | 114 | Type: `object` 115 | 116 | It passes a configuration object to [got](https://www.npmjs.com/package/got) under the `'fetch'` strategy. 117 | 118 | ##### headers 119 | 120 | Type: `object` 121 | 122 | Request headers that will be passed to the fetch/prerender process. 123 | 124 | ##### mutool 125 | 126 | Type: `function`|`boolean`
127 | Default: `source code` 128 | 129 | A function that executes the [mutool](https://mupdf.com/) binary for turning PDF files into HTML markup. 130 | 131 | It can be explicitly disabled by passing `false`. 132 | 133 | ##### prerender 134 | 135 | Type: `boolean`|`string`
136 | Default: `'auto'` 137 | 138 | Enable or disable prerendering as mechanism for getting the HTML markup explicitly. 139 | 140 | The value `auto` means that a list of websites that don't need prerendering is used internally. This list is used to speed up the process, using `fetch` mode for these websites. 141 | 142 | See [getMode parameter](#getMode) to know more. 143 | 144 | ##### puppeteerOpts 145 | 146 | Type: `object` 147 | 148 | It passes a configuration object to [puppeteer](https://www.npmjs.com/package/puppeteer) under the `'prerender'` strategy. 149 | 150 | ##### rewriteUrls 151 | 152 | Type: `boolean`
153 | Default: `false` 154 | 155 | When `true`, relative CSS/HTML URLs present in the HTML markup are rewritten into absolute ones. 156 | 157 | ##### rewriteHtml 158 | 159 | Type: `boolean`
160 | Default: `false` 161 | 162 | When `true`, it will rewrite some common mistakes related to HTML meta tags. 163 | 164 | ##### serializeHtml 165 | 166 | It determines how HTML should be serialized before returning. 167 | 168 | It's serialized using `$ => ({ html: $.html() })` by default. 169 | 170 | ## License 171 | 172 | **html-get** © [Microlink](https://microlink.io), released under the [MIT](https://github.com/microlinkhq/html-get/blob/master/LICENSE.md) License.
173 | Authored and maintained by [Kiko Beats](https://kikobeats.com) with help from [contributors](https://github.com/microlinkhq/html-get/contributors). 174 | 175 | > [microlink.io](https://microlink.io) · GitHub [microlinkhq](https://github.com/microlinkhq) · X [@microlinkhq](https://x.com/microlinkhq) 176 | -------------------------------------------------------------------------------- /benchmark/get-content-type/index.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const NullProtoObj = require('null-prototype-object') 4 | const { parse } = require('content-type') 5 | 6 | const parseContentType = contentType => 7 | typeof contentType === 'string' 8 | ? parse(contentType) 9 | : { type: undefined, parameters: {} } 10 | 11 | const createContentTypeFunction = useCache => { 12 | const CACHE = useCache ? new NullProtoObj() : null 13 | 14 | return headers => { 15 | const contentType = headers['content-type'] 16 | if (useCache) { 17 | return ( 18 | CACHE[contentType] || 19 | (CACHE[contentType] = parseContentType(contentType)) 20 | ) 21 | } else { 22 | return parseContentType(contentType) 23 | } 24 | } 25 | } 26 | 27 | // Benchmark function 28 | const benchmark = (iterations, useCache) => { 29 | const headersList = [ 30 | { 'content-type': 'application/json; charset=utf-8' }, 31 | { 'content-type': 'text/html; charset=utf-8' }, 32 | { 'content-type': 'application/xml; charset=utf-8' }, 33 | { 'content-type': 'text/plain; charset=utf-8' }, 34 | { 'content-type': 'application/json' } 35 | ] 36 | 37 | const contentTypeFunc = createContentTypeFunction(useCache) 38 | 39 | console.time(useCache ? 'Benchmark with Cache' : 'Benchmark without Cache') 40 | for (let i = 0; i < iterations; i++) { 41 | for (const headers of headersList) { 42 | contentTypeFunc(headers) 43 | } 44 | } 45 | console.timeEnd(useCache ? 
'Benchmark with Cache' : 'Benchmark without Cache') 46 | } 47 | 48 | // Run the benchmark 49 | const iterations = 100000 50 | benchmark(iterations, false) // Without Cache 51 | benchmark(iterations, true) // With Cache 52 | -------------------------------------------------------------------------------- /benchmark/mupdf/generate.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const { randomBytes } = require('crypto') 4 | const PDFDocument = require('pdfkit') 5 | const bytes = require('bytes-iec') 6 | const path = require('path') 7 | const fs = require('fs') 8 | 9 | function generatePdf (filename, filesize) { 10 | const doc = new PDFDocument() 11 | const filepath = path.join(__dirname, 'fixtures', filename) 12 | const stream = fs.createWriteStream(filepath) 13 | doc.pipe(stream) 14 | 15 | // adjust base64 overheard 16 | const size = bytes.format(Math.floor(filesize * 0.55)) 17 | const randomData = randomBytes(bytes(size)).toString('base64') 18 | 19 | doc.text(randomData, { 20 | width: 410, 21 | align: 'left' 22 | }) 23 | 24 | doc.end() 25 | 26 | stream.on('finish', () => console.log(filename)) 27 | } 28 | 29 | const sizes = [...Array(10).keys()] 30 | .map(index => { 31 | const base = (index + 1) * 100 32 | const filename = bytes.format(base * 1000).toLowerCase() 33 | const filesize = bytes(`${base}KB`) 34 | return { filename, filesize } 35 | }) 36 | .concat([ 37 | { filename: '5mb', filesize: bytes('5MB') }, 38 | { filename: '10mb', filesize: bytes('10MB') }, 39 | { filename: '20mb', filesize: bytes('20MB') } 40 | ]) 41 | 42 | for (const { filename, filesize } of sizes) { 43 | generatePdf(`${filename}.pdf`, filesize) 44 | } 45 | -------------------------------------------------------------------------------- /benchmark/mupdf/index.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const { readFile, readdir } = require('fs/promises') 4 | const { 
defaultMutool } = require('../../src') 5 | const path = require('path') 6 | 7 | const OUTPUT = path.join(__dirname, 'output.pdf') 8 | 9 | class Benchmark { 10 | constructor (title) { 11 | this.title = title 12 | this.testCases = [] 13 | this.results = [] 14 | this.verifications = [] 15 | } 16 | 17 | add (name, fn) { 18 | this.testCases.push({ name, fn }) 19 | return this 20 | } 21 | 22 | verification (fn) { 23 | this.verifications.push(fn) 24 | return this 25 | } 26 | 27 | async run () { 28 | console.log(`\n${this.title}\n`) 29 | 30 | for (const [index, { name, fn }] of this.testCases 31 | .sort(() => Math.random() - 0.5) 32 | .entries()) { 33 | const start = Date.now() 34 | const result = await fn() 35 | for (const verification of this.verifications) { 36 | try { 37 | verification(result) 38 | } catch (error) { 39 | throw new Error(`Verification failed for '${name}': ${error.message}`) 40 | } 41 | } 42 | const duration = Date.now() - start 43 | this.results.push({ name, duration, result }) 44 | console.log(`${index + 1}. ${name}: ${duration}ms`) 45 | } 46 | 47 | const { name, duration } = this.results.reduce( 48 | (prev, curr, idx) => 49 | prev.duration < curr.duration ? 
prev : { ...curr, index: idx }, 50 | { duration: Infinity } 51 | ) 52 | const [fastest, secondFastest] = this.results.sort( 53 | (a, b) => a.duration - b.duration 54 | ) 55 | 56 | const percentageFaster = 57 | ((secondFastest.duration - fastest.duration) / secondFastest.duration) * 58 | 100 59 | 60 | console.log( 61 | `\nFastest: "${name}" with ${duration}ms (${percentageFaster.toFixed( 62 | 2 63 | )}%)` 64 | ) 65 | } 66 | } 67 | 68 | const main = async () => { 69 | const mutool = defaultMutool() 70 | 71 | const fixtures = await readdir(path.join(__dirname, 'fixtures')) 72 | 73 | for (const filename of fixtures) { 74 | const filepath = path.join(__dirname, 'fixtures', filename) 75 | 76 | await new Benchmark(`Benchmarking mutool ${filename}`) 77 | .verification(output => { 78 | if (typeof output !== 'string') { 79 | throw new TypeError(`Expected a string, got ${typeof output}`) 80 | } 81 | }) 82 | .add('write in memory', async () => { 83 | const result = await mutool(filepath) 84 | return result.stdout 85 | }) 86 | .add('write in file, read async', async () => { 87 | await mutool(`-o ${OUTPUT} ${filepath}`) 88 | return readFile(OUTPUT, 'utf-8') 89 | }) 90 | .run() 91 | } 92 | } 93 | 94 | main() 95 | -------------------------------------------------------------------------------- /benchmark/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "@html-get/benchmark", 3 | "version": "0.0.0", 4 | "dependencies": { 5 | "bytes-iec": "~3.1.1", 6 | "pdfkit": "~0.16.0" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /bin/index.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | 'use strict' 4 | 5 | const createBrowserless = require('browserless') 6 | const { URL } = require('url') 7 | const mri = require('mri') 8 | 9 | const getHTML = require('..') 10 | 11 | const browserlessFactory = createBrowserless() 
12 | 13 | const { _: input, debug: isDebug, ...args } = mri(process.argv.slice(2)) 14 | const url = new URL(input).toString() 15 | 16 | const browserContext = browserlessFactory.createContext() 17 | const getBrowserless = () => browserContext 18 | 19 | getHTML(url, { getBrowserless, ...args }) 20 | .then(async ({ html, stats, headers, statusCode }) => { 21 | if (isDebug) { 22 | console.log(` 23 | url: ${url} 24 | html: ${Buffer.from(html).byteLength} bytes (HTTP ${statusCode}) 25 | time: ${stats.timing} (${stats.mode}) 26 | headers: ${ 27 | headers 28 | ? Object.keys(headers).reduce( 29 | (acc, key) => `${acc}${key}=${headers[key]} `, 30 | '' 31 | ) 32 | : '-' 33 | } 34 | `) 35 | } else { 36 | console.log(html) 37 | } 38 | process.exit() 39 | }) 40 | .catch(error => console.error(error) || process.exit(1)) 41 | .finally(async () => { 42 | await getBrowserless(browser => browser.destroyContext()) 43 | browserlessFactory.close() 44 | }) 45 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "html-get", 3 | "description": "Get the HTML from any website, fine-tuned for correction & speed", 4 | "homepage": "https://nicedoc.com/microlinkhq/html-get", 5 | "version": "2.21.5", 6 | "main": "src/index.js", 7 | "bin": { 8 | "html-get": "bin/index.js" 9 | }, 10 | "author": { 11 | "email": "josefrancisco.verdu@gmail.com", 12 | "name": "Kiko Beats", 13 | "url": "https://kikobeats.com" 14 | }, 15 | "contributors": [], 16 | "repository": { 17 | "type": "git", 18 | "url": "git+https://github.com/microlinkhq/html-get.git" 19 | }, 20 | "bugs": { 21 | "url": "https://github.com/microlinkhq/html-get/issues" 22 | }, 23 | "keywords": [ 24 | "audio", 25 | "fetch", 26 | "get", 27 | "got", 28 | "headless", 29 | "html", 30 | "image", 31 | "markup", 32 | "pdf", 33 | "prerender", 34 | "request", 35 | "video" 36 | ], 37 | "dependencies": { 38 | 
"@kikobeats/time-span": "~1.0.5", 39 | "@metascraper/helpers": "~5.46.1", 40 | "cheerio": "~1.0.0", 41 | "content-type": "~1.0.5", 42 | "css-url-regex": "~4.0.0", 43 | "debug-logfmt": "~1.2.3", 44 | "execall": "~2.0.0", 45 | "got": "~11.8.6", 46 | "html-encode": "~2.1.7", 47 | "html-urls": "~2.4.62", 48 | "is-html-content": "~1.0.0", 49 | "is-local-address": "~2.2.0", 50 | "lodash": "~4.17.21", 51 | "mri": "~1.2.0", 52 | "null-prototype-object": "~1.2.0", 53 | "p-cancelable": "~2.1.0", 54 | "p-retry": "~4.6.0", 55 | "tinyspawn": "~1.5.0", 56 | "top-sites": "~1.1.220" 57 | }, 58 | "devDependencies": { 59 | "@commitlint/cli": "latest", 60 | "@commitlint/config-conventional": "latest", 61 | "@ksmithut/prettier-standard": "latest", 62 | "async-listen": "latest", 63 | "ava": "5", 64 | "browserless": "latest", 65 | "c8": "latest", 66 | "ci-publish": "latest", 67 | "finepack": "latest", 68 | "git-authors-cli": "latest", 69 | "github-generate-release": "latest", 70 | "nano-staged": "latest", 71 | "pretty": "latest", 72 | "puppeteer": "latest", 73 | "regex-iso-date": "latest", 74 | "simple-git-hooks": "latest", 75 | "standard": "latest", 76 | "standard-version": "latest" 77 | }, 78 | "engines": { 79 | "node": ">= 10" 80 | }, 81 | "files": [ 82 | "bin", 83 | "scripts", 84 | "src" 85 | ], 86 | "scripts": { 87 | "clean": "rm -rf node_modules", 88 | "contributors": "(npx git-authors-cli && npx finepack && git add package.json && git commit -m 'build: contributors' --no-verify) || true", 89 | "lint": "standard", 90 | "postinstall": "node scripts/postinstall", 91 | "postrelease": "npm run release:tags && npm run release:github && (ci-publish || npm publish --access=public)", 92 | "pretest": "npm run lint", 93 | "release": "standard-version -a", 94 | "release:github": "github-generate-release", 95 | "release:tags": "git push --follow-tags origin HEAD:master", 96 | "test": "c8 ava" 97 | }, 98 | "license": "MIT", 99 | "ava": { 100 | "files": [ 101 | "test/**/*.js", 102 | 
"!test/helpers.js" 103 | ], 104 | "timeout": "2m", 105 | "workerThreads": false 106 | }, 107 | "commitlint": { 108 | "extends": [ 109 | "@commitlint/config-conventional" 110 | ], 111 | "rules": { 112 | "body-max-line-length": [ 113 | 0 114 | ] 115 | } 116 | }, 117 | "nano-staged": { 118 | "*.js": [ 119 | "prettier-standard", 120 | "standard --fix" 121 | ], 122 | "package.json": [ 123 | "finepack" 124 | ] 125 | }, 126 | "pnpm": { 127 | "neverBuiltDependencies": [] 128 | }, 129 | "simple-git-hooks": { 130 | "commit-msg": "npx commitlint --edit", 131 | "pre-commit": "npx nano-staged" 132 | } 133 | } 134 | -------------------------------------------------------------------------------- /scripts/postinstall: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | 'use strict' 4 | 5 | const { compact, reduce, findIndex } = require('lodash') 6 | const { parseUrl } = require('@metascraper/helpers') 7 | const { writeFile } = require('fs').promises 8 | const topsites = require('top-sites') 9 | 10 | const domains = [ 11 | [['domain', 'abc.net.au']], 12 | [['domain', 'x.com']], 13 | [['domainWithoutSuffix', 'apple']], 14 | [['domainWithoutSuffix', 'arxiv']], 15 | [['domainWithoutSuffix', 'bbc']], 16 | [['domainWithoutSuffix', 'blogspot']], 17 | [['domainWithoutSuffix', 'csdn']], 18 | [['domainWithoutSuffix', 'deviantart']], 19 | [['domainWithoutSuffix', 'digg']], 20 | [['domainWithoutSuffix', 'dribbble']], 21 | [['domainWithoutSuffix', 'engadget']], 22 | [['domainWithoutSuffix', 'etsy']], 23 | [['domainWithoutSuffix', 'eventbrite']], 24 | [['domainWithoutSuffix', 'flickr']], 25 | [['domainWithoutSuffix', 'ghost']], 26 | [['domainWithoutSuffix', 'giphy']], 27 | [['domainWithoutSuffix', 'github']], 28 | [['domainWithoutSuffix', 'gitlab']], 29 | [['domainWithoutSuffix', 'google']], 30 | [['domainWithoutSuffix', 'huffingtonpost']], 31 | [['domainWithoutSuffix', 'imdb']], 32 | [['domainWithoutSuffix', 'imgur']], 33 | 
[['domainWithoutSuffix', 'instagram']], 34 | [['domainWithoutSuffix', 'meetup']], 35 | [['domainWithoutSuffix', 'microsoft']], 36 | [['domainWithoutSuffix', 'nytimes']], 37 | [['domainWithoutSuffix', 'pinterest']], 38 | [['domainWithoutSuffix', 'producthunt']], 39 | [['domainWithoutSuffix', 'reddit']], 40 | [['domainWithoutSuffix', 'slideshare']], 41 | [['domainWithoutSuffix', 'soundcloud']], 42 | [['domainWithoutSuffix', 'sourceforge']], 43 | [['domainWithoutSuffix', 'spotify']], 44 | [['domainWithoutSuffix', 'stackoverflow']], 45 | [['domainWithoutSuffix', 'substack']], 46 | [['domainWithoutSuffix', 'techcrunch']], 47 | [['domainWithoutSuffix', 'telegraph']], 48 | [['domainWithoutSuffix', 'theguardian']], 49 | [['domainWithoutSuffix', 'theverge']], 50 | [['domainWithoutSuffix', 'tumblr']], 51 | [['domainWithoutSuffix', 'twitter']], 52 | [['domainWithoutSuffix', 'vimeo']], 53 | [['domainWithoutSuffix', 'wikipedia']], 54 | [['domainWithoutSuffix', 'wordpress']], 55 | [['domainWithoutSuffix', 'ycombinator']], 56 | [['domainWithoutSuffix', 'yelp']], 57 | [['domainWithoutSuffix', 'youtube']], 58 | [['domainWithoutSuffix', 'zoom']] 59 | ] 60 | 61 | const { top, rest } = reduce( 62 | domains, 63 | (acc, conditions) => { 64 | for (const [key, value] of conditions) { 65 | const index = findIndex(topsites, ({ rootDomain }) => { 66 | const parsedUrl = parseUrl(rootDomain) 67 | return parsedUrl[key] === value 68 | }) 69 | 70 | if (index !== -1) { 71 | acc.top[index] = conditions 72 | return acc 73 | } 74 | } 75 | 76 | acc.rest.push(conditions) 77 | return acc 78 | }, 79 | { top: new Array(topsites.length), rest: [] } 80 | ) 81 | 82 | writeFile('./src/auto-domains.json', JSON.stringify(compact(top).concat(rest)), null, 2).catch( 83 | error => console.log(error) 84 | ) 85 | -------------------------------------------------------------------------------- /src/html.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const 
debug = require('debug-logfmt')('html-get:rewrite') 4 | const { get, castArray, forEach } = require('lodash') 5 | const isLocalAddress = require('is-local-address') 6 | const { TAGS: URL_TAGS } = require('html-urls') 7 | const isHTML = require('is-html-content') 8 | const cssUrl = require('css-url-regex') 9 | const execall = require('execall') 10 | const cheerio = require('cheerio') 11 | const { URL } = require('url') 12 | const path = require('path') 13 | 14 | const { 15 | date: toDate, 16 | isMime, 17 | isUrl, 18 | mimeExtension, 19 | parseUrl 20 | } = require('@metascraper/helpers') 21 | 22 | const { getContentType, getCharset } = require('./util') 23 | 24 | const has = el => el.length !== 0 25 | 26 | const upsert = (el, collection, item) => !has(el) && collection.push(item) 27 | 28 | /** 29 | * Infer timestamp from `last-modified`, `date`, or `age` response headers. 30 | */ 31 | const getDate = headers => { 32 | const timestamp = get(headers, 'last-modified') || get(headers, 'date') 33 | return timestamp 34 | ? 
toDate(timestamp) 35 | : toDate(Date.now() - Number(get(headers, 'age')) * 1000) 36 | } 37 | 38 | const addHead = ({ $, url, headers }) => { 39 | const tags = [] 40 | const charset = getCharset(headers) 41 | const date = getDate(headers) 42 | const { domain } = parseUrl(url) 43 | const head = $('head') 44 | 45 | upsert(head.find('title'), tags, `${path.basename(url)}`) 46 | 47 | if (domain) { 48 | upsert( 49 | head.find('meta[property="og:site_name"]'), 50 | tags, 51 | `` 52 | ) 53 | } 54 | 55 | if (date) { 56 | upsert( 57 | head.find('meta[property="article:published_time"]'), 58 | tags, 59 | `` 60 | ) 61 | } 62 | 63 | upsert( 64 | head.find('link[rel="canonical"]'), 65 | tags, 66 | `` 67 | ) 68 | 69 | if (charset) { 70 | upsert(head.find('meta[charset]'), tags, ``) 71 | } 72 | 73 | tags.forEach(tag => head.append(tag)) 74 | } 75 | 76 | const addBody = ({ url, headers, html }) => { 77 | const contentType = getContentType(headers) 78 | let element = '' 79 | 80 | if (isMime(contentType, 'image')) { 81 | element = `` 82 | } else if (isMime(contentType, 'video')) { 83 | element = `` 84 | } else if (isMime(contentType, 'audio')) { 85 | element = `` 86 | } else if (mimeExtension(contentType) === 'json') { 87 | element = `
${html}
` 88 | } 89 | 90 | return `${element}` 91 | } 92 | 93 | const isOpenGraph = (prop = '') => 94 | ['og:', 'fb:', 'al:'].some(prefix => prop.startsWith(prefix)) 95 | 96 | const rewriteMetaTags = ({ $ }) => { 97 | $('meta').each((_, element) => { 98 | const el = $(element) 99 | if (!el.attr('content')) return 100 | 101 | const name = el.attr('name') 102 | const property = el.attr('property') 103 | 104 | // Convert 'name' to 'property' for Open Graph tags if 'property' is not already set correctly 105 | if (property !== name && isOpenGraph(name)) { 106 | el.removeAttr('name').attr('property', name) 107 | debug('og', el.attr()) 108 | // Convert 'property' to 'name' for non-Open Graph tags 109 | } else if (property && !isOpenGraph(property)) { 110 | el.removeAttr('property').attr('name', property) 111 | debug('meta', el.attr()) 112 | } 113 | }) 114 | } 115 | 116 | const rewriteHtmlUrls = ({ $, url }) => { 117 | forEach(URL_TAGS, (tagName, urlAttr) => { 118 | $(tagName.join(',')).each(function () { 119 | const el = $(this) 120 | const attr = el.attr(urlAttr) 121 | if (typeof attr !== 'string') return 122 | try { 123 | const urlObj = new URL(attr, url) 124 | if (!urlObj.protocol.startsWith('http')) return 125 | if (isLocalAddress(urlObj.hostname)) { 126 | el.remove() 127 | } else { 128 | el.attr(urlAttr, urlObj.toString()) 129 | } 130 | } catch (_) {} 131 | }) 132 | }) 133 | } 134 | 135 | const replaceCssUrls = (url, stylesheet) => { 136 | const cssUrls = Array.from(execall(cssUrl(), stylesheet)).reduce( 137 | (acc, match) => { 138 | match.subMatches.forEach(match => acc.add(match)) 139 | return acc 140 | }, 141 | new Set() 142 | ) 143 | 144 | cssUrls.forEach(cssUrl => { 145 | if (cssUrl.startsWith('/')) { 146 | try { 147 | const absoluteUrl = new URL(cssUrl, url).toString() 148 | stylesheet = stylesheet.replaceAll( 149 | `url(${cssUrl})`, 150 | `url(${absoluteUrl})` 151 | ) 152 | } catch (_) {} 153 | } 154 | }) 155 | 156 | return stylesheet 157 | } 158 | 159 | const 
rewriteCssUrls = ({ $, url }) => { 160 | // Process 162 | $('style').each((_, element) => 163 | $(element).html(replaceCssUrls(url, $(element).html())) 164 | ) 165 | 166 | // Process elements with style attributes 167 | // e.g.,
168 | $('[style]').each((_, element) => 169 | $(element).attr('style', replaceCssUrls(url, $(element).attr('style'))) 170 | ) 171 | 172 | return $ 173 | } 174 | 175 | const injectStyle = ({ $, styles }) => 176 | castArray(styles).forEach(style => 177 | $('head').append( 178 | isUrl(style) 179 | ? `` 180 | : `` 181 | ) 182 | ) 183 | 184 | const injectScripts = ({ $, scripts, type }) => 185 | castArray(scripts).forEach(script => 186 | $('head').append( 187 | isUrl(script) 188 | ? `` 189 | : `` 190 | ) 191 | ) 192 | 193 | const addDocType = html => 194 | html.startsWith('${html}` 195 | 196 | module.exports = ({ 197 | html, 198 | url, 199 | headers = {}, 200 | styles, 201 | hide, 202 | remove, 203 | rewriteUrls, 204 | rewriteHtml, 205 | scripts, 206 | modules 207 | }) => { 208 | const content = addDocType( 209 | isHTML(html) ? html : addBody({ url, headers, html }) 210 | ) 211 | 212 | const $ = cheerio.load(content) 213 | 214 | if (rewriteUrls) rewriteHtmlUrls({ $, url }) 215 | 216 | if (rewriteHtml) rewriteMetaTags({ $, url }) 217 | 218 | addHead({ $, url, headers }) 219 | 220 | if (styles) injectStyle({ $, styles }) 221 | 222 | if (hide) { 223 | injectStyle({ 224 | $, 225 | styles: `${castArray(hide).join(', ')} { visibility: hidden !important; }` 226 | }) 227 | } 228 | 229 | if (remove) { 230 | injectStyle({ 231 | $, 232 | styles: `${castArray(remove).join(', ')} { display: none !important; }` 233 | }) 234 | } 235 | 236 | if (scripts) injectScripts({ $, scripts, type: 'text/javascript' }) 237 | if (modules) injectScripts({ $, modules, type: 'module' }) 238 | 239 | return rewriteUrls ? 
rewriteCssUrls({ $, url }) : $ 240 | } 241 | 242 | module.exports.getDate = getDate 243 | -------------------------------------------------------------------------------- /src/index.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const { parseUrl, isMediaUrl, isPdfUrl } = require('@metascraper/helpers') 4 | const { readFile, writeFile } = require('fs/promises') 5 | const timeSpan = require('@kikobeats/time-span')() 6 | const debug = require('debug-logfmt')('html-get') 7 | const { execSync } = require('child_process') 8 | const PCancelable = require('p-cancelable') 9 | const { AbortError } = require('p-retry') 10 | const htmlEncode = require('html-encode') 11 | const crypto = require('crypto') 12 | const $ = require('tinyspawn') 13 | const path = require('path') 14 | const got = require('got') 15 | const os = require('os') 16 | 17 | const { getContentLength, getContentType } = require('./util') 18 | const autoDomains = require('./auto-domains') 19 | const addHtml = require('./html') 20 | 21 | const REQ_TIMEOUT = 8000 22 | 23 | const ABORT_TYPES = ['image', 'stylesheet', 'font'] 24 | 25 | const PDF_SIZE_TRESHOLD = 150 * 1024 // 150kb 26 | 27 | const fetch = PCancelable.fn( 28 | async ( 29 | url, 30 | { 31 | getTemporalFile, 32 | mutool, 33 | reflect = false, 34 | timeout = REQ_TIMEOUT, 35 | toEncode, 36 | ...opts 37 | }, 38 | onCancel 39 | ) => { 40 | const reqTimeout = reflect ? 
timeout / 2 : timeout 41 | 42 | const req = got(url, { 43 | ...opts, 44 | timeout: reqTimeout, 45 | responseType: 'buffer' 46 | }) 47 | 48 | onCancel.shouldReject = false 49 | 50 | onCancel(() => { 51 | debug('fetch:cancel', { url, reflect }) 52 | req.cancel() 53 | }) 54 | 55 | const redirects = [] 56 | req.on('redirect', res => 57 | redirects.push({ statusCode: res.statusCode, url: res.url }) 58 | ) 59 | 60 | try { 61 | const res = await req 62 | 63 | const html = await (async () => { 64 | const contentType = getContentType(res.headers) 65 | 66 | if (mutool && contentType === 'application/pdf') { 67 | const file = getTemporalFile(url, 'pdf') 68 | await writeFile(file.path, res.body) 69 | if (getContentLength(res.headers) > PDF_SIZE_TRESHOLD) { 70 | const ofile = getTemporalFile(`${url}-pdf`, 'pdf') 71 | await mutool(`-o ${ofile.path} ${file.path}`) 72 | return readFile(ofile.path, 'utf-8') 73 | } else { 74 | const { stdout } = await mutool(file.path) 75 | return stdout 76 | } 77 | } 78 | 79 | return contentType === 'text/html' || !isMediaUrl(url) 80 | ? await toEncode(res.body, res.headers['content-type']) 81 | : res.body.toString() 82 | })() 83 | 84 | return { 85 | headers: res.headers, 86 | html, 87 | mode: 'fetch', 88 | url: res.url, 89 | statusCode: res.statusCode, 90 | redirects 91 | } 92 | } catch (error) { 93 | debug('fetch:error', { url, message: error.message || error, reflect }) 94 | return reflect 95 | ? { isRejected: true, error } 96 | : { 97 | url, 98 | html: '', 99 | mode: 'fetch', 100 | headers: error.response ? error.response.headers : {}, 101 | statusCode: error.response ? 
error.response.statusCode : undefined, 102 | redirects 103 | } 104 | } 105 | } 106 | ) 107 | 108 | const prerender = PCancelable.fn( 109 | async ( 110 | url, 111 | { 112 | abortTypes = ABORT_TYPES, 113 | getBrowserless, 114 | gotOpts, 115 | headers, 116 | timeout = REQ_TIMEOUT, 117 | toEncode, 118 | ...opts 119 | }, 120 | onCancel 121 | ) => { 122 | let fetchRes 123 | let data = {} 124 | let isFetchResRejected = false 125 | 126 | onCancel(() => fetchRes.cancel()) 127 | 128 | try { 129 | fetchRes = fetch(url, { 130 | reflect: true, 131 | toEncode, 132 | ...gotOpts, 133 | headers, 134 | timeout 135 | }) 136 | const browserless = await getBrowserless() 137 | 138 | const getPayload = browserless.evaluate( 139 | async (page, response) => { 140 | if (!response) throw new AbortError('empty response') 141 | 142 | return { 143 | headers: response.headers(), 144 | html: await page.content(), 145 | mode: 'prerender', 146 | url: response.url(), 147 | statusCode: response.status(), 148 | redirects: response 149 | .request() 150 | .redirectChain() 151 | .map(req => ({ 152 | statusCode: req.response().status(), 153 | url: req.url() 154 | })) 155 | } 156 | }, 157 | { 158 | timeout, 159 | headers, 160 | abortTypes 161 | } 162 | ) 163 | 164 | const payload = await getPayload(url, opts) 165 | await fetchRes.cancel() 166 | debug('prerender', { url, state: 'success' }) 167 | return payload 168 | } catch (err) { 169 | const { isRejected, ...dataProps } = await fetchRes 170 | 171 | debug('prerender:error', { 172 | url, 173 | isRejected, 174 | error: err.message 175 | }) 176 | 177 | isFetchResRejected = isRejected 178 | data = dataProps 179 | } 180 | 181 | return isFetchResRejected 182 | ? 
{ 183 | headers: data.headers || {}, 184 | html: '', 185 | url, 186 | mode: 'prerender' 187 | } 188 | : data 189 | } 190 | ) 191 | 192 | const modes = { fetch, prerender } 193 | 194 | const isFetchMode = url => { 195 | const parsedUrl = parseUrl(url) 196 | return autoDomains.some(conditions => 197 | conditions.every(([prop, value]) => parsedUrl[prop] === value) 198 | ) 199 | } 200 | 201 | const defaultGetMode = (url, { prerender }) => { 202 | if (prerender === false || isMediaUrl(url) || isPdfUrl(url)) return 'fetch' 203 | if (prerender === true) return 'prerender' 204 | return isFetchMode(url) ? 'fetch' : 'prerender' 205 | } 206 | 207 | const defaultGetTemporalFile = (input, ext) => { 208 | const hash = crypto.createHash('sha256').update(input).digest('hex') 209 | const filepath = path.join( 210 | os.tmpdir(), 211 | ext === undefined ? hash : `${hash}.${ext}` 212 | ) 213 | return { path: filepath } 214 | } 215 | 216 | const defaultMutool = () => 217 | (() => { 218 | try { 219 | const mutoolPath = execSync('which mutool', { 220 | stdio: ['pipe', 'pipe', 'ignore'] 221 | }) 222 | .toString() 223 | .trim() 224 | return (...args) => $(`${mutoolPath} draw -q -F html ${args}`) 225 | } catch (_) {} 226 | })() 227 | 228 | const getContent = PCancelable.fn( 229 | ( 230 | url, 231 | mode, 232 | { 233 | getBrowserless, 234 | getTemporalFile, 235 | gotOpts, 236 | headers, 237 | mutool, 238 | puppeteerOpts, 239 | rewriteUrls, 240 | rewriteHtml, 241 | toEncode 242 | }, 243 | onCancel 244 | ) => { 245 | const isFetchMode = mode === 'fetch' 246 | 247 | const fetchOpts = isFetchMode 248 | ? { headers, toEncode, mutool, getTemporalFile, ...gotOpts } 249 | : { headers, toEncode, getBrowserless, gotOpts, ...puppeteerOpts } 250 | 251 | const promise = modes[mode](url, fetchOpts) 252 | onCancel(() => promise.cancel()) 253 | 254 | return promise.then(content => { 255 | const $ = addHtml({ 256 | ...content, 257 | ...(isFetchMode ? 
puppeteerOpts : undefined), 258 | rewriteUrls, 259 | rewriteHtml 260 | }) 261 | 262 | return { ...content, $ } 263 | }) 264 | } 265 | ) 266 | 267 | module.exports = PCancelable.fn( 268 | async ( 269 | targetUrl, 270 | { 271 | encoding = 'utf-8', 272 | getBrowserless, 273 | getMode = defaultGetMode, 274 | getTemporalFile = defaultGetTemporalFile, 275 | gotOpts, 276 | headers, 277 | mutool = defaultMutool(), 278 | prerender = 'auto', 279 | puppeteerOpts, 280 | rewriteHtml = false, 281 | rewriteUrls = false, 282 | serializeHtml = $ => ({ html: $.html() }) 283 | } = {}, 284 | onCancel 285 | ) => { 286 | if (!getBrowserless && prerender !== false) { 287 | throw TypeError( 288 | "Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`" 289 | ) 290 | } 291 | 292 | const toEncode = htmlEncode(encoding) 293 | const reqMode = getMode(targetUrl, { prerender }) 294 | 295 | const duration = timeSpan() 296 | 297 | const promise = getContent(targetUrl, reqMode, { 298 | getBrowserless, 299 | getTemporalFile, 300 | gotOpts, 301 | headers, 302 | mutool, 303 | puppeteerOpts, 304 | rewriteUrls, 305 | rewriteHtml, 306 | toEncode 307 | }) 308 | 309 | onCancel(() => promise.cancel()) 310 | 311 | const { mode, $, ...payload } = await promise 312 | 313 | return Object.assign(payload, { 314 | ...serializeHtml($), 315 | stats: { mode, timing: duration() } 316 | }) 317 | } 318 | ) 319 | 320 | module.exports.REQ_TIMEOUT = REQ_TIMEOUT 321 | module.exports.ABORT_TYPES = ABORT_TYPES 322 | module.exports.PDF_SIZE_TRESHOLD = PDF_SIZE_TRESHOLD 323 | module.exports.isFetchMode = isFetchMode 324 | module.exports.getContent = getContent 325 | module.exports.defaultMutool = defaultMutool 326 | -------------------------------------------------------------------------------- /src/util.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const NullProtoObj = require('null-prototype-object') 4 | const { parse 
} = require('content-type') 5 | 6 | const CACHE = new NullProtoObj() 7 | 8 | const parseContentType = contentType => 9 | typeof contentType === 'string' 10 | ? parse(contentType) 11 | : { type: undefined, parameters: {} } 12 | 13 | const contentType = headers => { 14 | const contentType = headers['content-type'] 15 | return ( 16 | CACHE[contentType] || (CACHE[contentType] = parseContentType(contentType)) 17 | ) 18 | } 19 | 20 | const getContentType = headers => contentType(headers).type 21 | 22 | const getCharset = headers => 23 | contentType(headers).parameters.charset?.toLowerCase() 24 | 25 | const getContentLength = headers => Number(headers['content-length']) 26 | 27 | module.exports = { 28 | getCharset, 29 | getContentLength, 30 | getContentType 31 | } 32 | -------------------------------------------------------------------------------- /test/auto-domains.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const test = require('ava') 4 | 5 | const autoDomains = require('../src/auto-domains.json') 6 | 7 | test('domains are sorted by popularity', t => { 8 | t.true(['youtube', 'google'].includes(autoDomains[0][0][1])) 9 | }) 10 | -------------------------------------------------------------------------------- /test/encoding.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const test = require('ava') 4 | 5 | const { runFixtureServer, initBrowserless } = require('./helpers') 6 | const getHTML = require('..') 7 | 8 | const getBrowserless = initBrowserless(test) 9 | 10 | ;[false, true].forEach(prerender => { 11 | const mode = prerender ? 
'prerender' : 'fetch' 12 | 13 | test(`${mode} » Shift-JIS`, async t => { 14 | const url = await runFixtureServer(t, '51242_54045.html') 15 | const { html } = await getHTML(url, { prerender, getBrowserless }) 16 | t.true(html.includes('或る日の小せん')) 17 | }) 18 | 19 | test(`${mode} » Windows-1250`, async t => { 20 | const url = await runFixtureServer(t, 'rp.pl.html') 21 | const { html } = await getHTML(url, { prerender, getBrowserless }) 22 | t.true(html.includes('majątków')) 23 | }) 24 | 25 | test(`${mode} » UTF-8`, async t => { 26 | const url = await runFixtureServer(t, 'utf8.with.meta.html') 27 | const { html } = await getHTML(url, { prerender, getBrowserless }) 28 | t.true(html.includes('日本語')) 29 | }) 30 | }) 31 | -------------------------------------------------------------------------------- /test/fixtures/51242_54045.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/fixtures/51242_54045.html -------------------------------------------------------------------------------- /test/fixtures/browserless.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | browserless, a puppeter-like Node.js library for interacting with Headless production scenarios. 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 |
49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /test/fixtures/rp.pl.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/fixtures/rp.pl.html -------------------------------------------------------------------------------- /test/fixtures/utf8.with.meta.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 |

日本語

7 | 8 | 9 | -------------------------------------------------------------------------------- /test/helpers.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const { default: listen } = require('async-listen') 4 | const createBrowserless = require('browserless') 5 | const dateRegex = require('regex-iso-date') 6 | const { createServer } = require('http') 7 | const pretty = require('pretty') 8 | const path = require('path') 9 | const fs = require('fs') 10 | 11 | const createHeaders = name => contentType => ({ 12 | [name]: contentType 13 | }) 14 | 15 | const closeServer = server => 16 | require('util').promisify(server.close.bind(server))() 17 | 18 | const fixture = name => 19 | fs.readFileSync(path.join(__dirname, '/fixtures/', name)) 20 | 21 | const initBrowserless = test => { 22 | const browserlessFactory = createBrowserless() 23 | test.after.always(browserlessFactory.close) 24 | return () => browserlessFactory.createContext() 25 | } 26 | 27 | const runServer = async (t, fn) => { 28 | const server = createServer(fn) 29 | const url = await listen(server) 30 | t.teardown(() => closeServer(server)) 31 | return url 32 | } 33 | 34 | const runFixtureServer = async (t, fixturePath) => 35 | runServer(t, (_, res) => { 36 | res.setHeader('content-type', 'text/html') 37 | res.end(fixture(fixturePath)) 38 | }) 39 | 40 | const prettyHtml = html => 41 | pretty(html, { ocd: true }).replace(dateRegex(), '{DATE}') 42 | 43 | module.exports = { 44 | createHeaders, 45 | initBrowserless, 46 | prettyHtml, 47 | runFixtureServer, 48 | runServer 49 | } 50 | -------------------------------------------------------------------------------- /test/html/get-date.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const test = require('ava') 4 | 5 | const { getDate } = require('../../src/html') 6 | 7 | test('from `last-modified`', t => { 8 | const date = getDate({ 'last-modified': 
'Fri, 04 Aug 2023 21:10:56 GMT' }) 9 | t.is(date, '2023-08-04T21:10:56.000Z') 10 | }) 11 | 12 | test('from `date`', t => { 13 | const date = getDate({ 'last-modified': 'Sat, 05 Aug 2023 09:43:59 GMT' }) 14 | t.is(date, '2023-08-05T09:43:59.000Z') 15 | }) 16 | 17 | test('from `age`', t => { 18 | { 19 | const date = getDate({ age: '1884' }) 20 | t.truthy(date) 21 | } 22 | { 23 | const date = getDate({}) 24 | t.is(date, undefined) 25 | } 26 | }) 27 | -------------------------------------------------------------------------------- /test/html/index.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const cheerio = require('cheerio') 4 | const test = require('ava') 5 | 6 | const { prettyHtml } = require('../helpers') 7 | 8 | const html = (...args) => require('../../src/html')(...args).html() 9 | 10 | test('add minimal html markup', t => { 11 | const output = html({ 12 | url: 'https://kikobeats.com', 13 | html: '', 14 | headers: {} 15 | }) 16 | 17 | t.snapshot(prettyHtml(output)) 18 | }) 19 | 20 | test('add meta charset', t => { 21 | const output = html({ 22 | url: 'https://kikobeats.com', 23 | html: '', 24 | headers: { 'content-type': 'text/html; charset=utf-8' } 25 | }) 26 | 27 | t.snapshot(prettyHtml(output)) 28 | }) 29 | 30 | test('add doctype', t => { 31 | const output = html({ 32 | url: 'https://kikobeats.com', 33 | html: ` 34 | 35 | 36 | kikobeats.com 37 | 38 | 39 | 40 | 41 | 42 | `, 43 | headers: { 'content-type': 'text/html; charset=utf-8' } 44 | }) 45 | 46 | t.snapshot(prettyHtml(output)) 47 | }) 48 | 49 | test('add json markup', t => { 50 | const output = html({ 51 | html: 
'{"origin":"83.46.149.83","city":"Madrid","alpha2":"ES","alpha3":"ESP","callingCodes":["+34"],"currencies":{"EUR":{"name":"Euro","symbol":"€"}},"eeaMember":true,"euMember":true,"flag":"🇪🇸","languages":{"spa":"Spanish"},"numeric":724,"tld":[".es"],"region":"MD","latitude":"40.4163","longitude":"-3.6934","timezone":"Europe/Madrid","headers":{"accept":"*/*","accept-encoding":"gzip","cdn-loop":"cloudflare","cf-connecting-ip":"83.46.149.83","cf-ipcountry":"ES","cf-ray":"73a29be38cdf37c7-MAD","cf-visitor":"{"scheme":"https"}","connection":"Keep-Alive","host":"geolocation.microlink.io","user-agent":"curl/7.79.1","x-forwarded-for":"172.70.57.171","x-forwarded-host":"geolocation.microlink.io","x-forwarded-proto":"https","x-real-ip":"172.70.57.171","x-vercel-edge-region":"dev","x-vercel-id":"cdg1::x96k9-1660405852783-a0083d276cde","x-vercel-ip-city":"Madrid","x-vercel-ip-country":"ES","x-vercel-ip-country-region":"MD","x-vercel-ip-latitude":"40.4163","x-vercel-ip-longitude":"-3.6934","x-vercel-ip-timezone":"Europe/Madrid","x-vercel-proxied-for":"172.70.57.171"}}', 52 | url: 'https://geolocation.microlink.io/', 53 | headers: { 'content-type': 'application/json' } 54 | }) 55 | 56 | t.snapshot(prettyHtml(output)) 57 | }) 58 | 59 | test('add image markup', t => { 60 | const output = html({ 61 | url: 'https://media.giphy.com/media/LqTSLCsIIkCTvQ8X9g/giphy.gif', 62 | headers: { 'content-type': 'image/gif' } 63 | }) 64 | 65 | t.snapshot(prettyHtml(output)) 66 | }) 67 | 68 | test('add audio markup', t => { 69 | const output = html({ 70 | url: 'http://websrvr90va.audiovideoweb.com/va90web25003/companions/Foundations%20of%20Rock/13.01.mp3', 71 | headers: { 'content-type': 'audio/mp3' } 72 | }) 73 | 74 | t.snapshot(prettyHtml(output)) 75 | }) 76 | 77 | test('add video markup', t => { 78 | const output = html({ 79 | url: 'https://sample-videos.com/video123/mp4/720/big_buck_bunny_720p_1mb.mp4', 80 | headers: { 'content-type': 'video/mp4' } 81 | }) 82 | 83 | t.snapshot(prettyHtml(output)) 
84 | }) 85 | 86 | test('styles injection', t => { 87 | const output = html({ 88 | url: 'https://kikobeats.com', 89 | html: ` 90 | 91 | 92 | 93 | 94 | Document 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | `, 103 | styles: [ 104 | 'https://necolas.github.io/normalize.css/8.0.1/normalize.css', 105 | 'body { background: black; }' 106 | ] 107 | }) 108 | 109 | t.true( 110 | output.includes( 111 | '' 112 | ) 113 | ) 114 | 115 | t.true(output.includes('background: black')) 116 | 117 | t.snapshot(prettyHtml(output)) 118 | }) 119 | 120 | test('scripts injection', t => { 121 | const output = html({ 122 | url: 'https://kikobeats.com', 123 | html: ` 124 | 125 | 126 | 127 | 128 | Document 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | `, 137 | scripts: [ 138 | ` 139 | ;(function mutateWindow () { 140 | const iframe = document.createElement('iframe') 141 | iframe.style.display = 'none' 142 | document.body.appendChild(iframe) 143 | 144 | const a = Object.getOwnPropertyNames(iframe.contentWindow) 145 | const b = Object.getOwnPropertyNames(window) 146 | 147 | const diffKeys = b.filter(c => !a.includes(c)) 148 | const diffObj = {} 149 | diffKeys.forEach(key => (diffObj[key] = window[key])) 150 | 151 | console.log('Found', diffKeys.length, 'keys mutates on window') 152 | copy(diffObj) 153 | console.log('Copied to clipboard!') 154 | })()`, 155 | 'https://code.jquery.com/jquery-3.5.1.min.js' 156 | ] 157 | }) 158 | 159 | t.true(output.includes('mutateWindow')) 160 | 161 | t.true( 162 | output.includes( 163 | '' 164 | ) 165 | ) 166 | 167 | t.snapshot(prettyHtml(output)) 168 | }) 169 | 170 | test('hide elements', t => { 171 | const output = html({ 172 | url: 'https://kikobeats.com', 173 | html: ` 174 | 175 | 176 | 177 | 178 | Document 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | `, 187 | hide: '#banner' 188 | }) 189 | 190 | t.true(output.includes('#banner { visibility: hidden !important; }')) 191 | t.snapshot(prettyHtml(output)) 192 | }) 193 | 194 | test('remove elements', t => { 195 | 
const output = html({ 196 | url: 'https://kikobeats.com', 197 | html: ` 198 | 199 | 200 | 201 | 202 | Document 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | `, 211 | remove: '#banner' 212 | }) 213 | 214 | t.true(output.includes('#banner { display: none !important; }')) 215 | t.snapshot(prettyHtml(output)) 216 | }) 217 | 218 | test('add `og:site_name` when is possible', t => { 219 | t.is( 220 | cheerio 221 | .load(html({ url: 'https://1.1.1.1', html: '', headers: {} }))( 222 | 'meta[property="og:site_name"]' 223 | ) 224 | .attr('content'), 225 | undefined 226 | ) 227 | t.is( 228 | cheerio 229 | .load(html({ url: 'https://kikobeats.com', html: '', headers: {} }))( 230 | 'meta[property="og:site_name"]' 231 | ) 232 | .attr('content'), 233 | 'kikobeats.com' 234 | ) 235 | }) 236 | -------------------------------------------------------------------------------- /test/html/rewrite-css-urls.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const execall = require('execall') 4 | const test = require('ava') 5 | 6 | const { prettyHtml } = require('../helpers') 7 | 8 | const html = (...args) => require('../../src/html')(...args).html() 9 | 10 | test("don't modify html markup", t => { 11 | const output = html({ 12 | rewriteUrls: true, 13 | url: 'https://www.rubiomonocoatusa.com/blogs/blog/how-to-apply-oil-plus-2c-to-furniture', 14 | html: ` 15 | 16 | 17 | 18 | 19 | 20 | 21 | `, 22 | headers: { 23 | 'content-type': 'text/html; charset=utf-8' 24 | } 25 | }) 26 | 27 | t.snapshot(prettyHtml(output)) 28 | }) 29 | 30 | test('rewrites relative URLs inside stylesheet', t => { 31 | const output = html({ 32 | rewriteUrls: true, 33 | url: 'https://kikobeats.com', 34 | html: ` 35 | 36 | 37 |
38 |
39 | 40 | 41 | `, 42 | headers: { 43 | 'content-type': 'text/html; charset=utf-8' 44 | } 45 | }) 46 | 47 | const results = execall( 48 | /https:\/\/kikobeats.com\/images\/microlink\.jpg/g, 49 | output 50 | ) 51 | 52 | t.is(results.length, 2) 53 | t.snapshot(prettyHtml(output)) 54 | }) 55 | -------------------------------------------------------------------------------- /test/html/rewrite-html.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const test = require('ava') 4 | const cheerio = require('cheerio') 5 | 6 | const { prettyHtml } = require('../helpers') 7 | 8 | const html = (...args) => require('../../src/html')(...args).html() 9 | 10 | const composeHtml = meta => 11 | prettyHtml(` 12 | 13 | 14 | 15 | kikobeats.com 16 | 17 | 18 | ${meta.join('\n')} 19 | 20 | 21 | `) 22 | 23 | ;['fb', 'al'].forEach(prefix => { 24 | test(`treat '${prefix}:' following 'og:' spec`, t => { 25 | const output = html({ 26 | rewriteHtml: true, 27 | url: 'https://kikobeats.com', 28 | html: composeHtml([ 29 | `` 30 | ]), 31 | headers: { 'content-type': 'text/html; charset=utf-8' } 32 | }) 33 | 34 | const $ = cheerio.load(output) 35 | t.is( 36 | $(`meta[property="${prefix}:ios:url"]`).attr('content'), 37 | 'applinks://docs' 38 | ) 39 | t.is($(`meta[name="${prefix}:ios:url"]`).attr('content'), undefined) 40 | }) 41 | }) 42 | ;['twitter', 'fb', 'al', 'og'].forEach(prefix => { 43 | test(`don't rewrite '${prefix}:' if content is empty`, t => { 44 | const output = html({ 45 | rewriteHtml: true, 46 | url: 'https://kikobeats.com', 47 | html: composeHtml([``]), 48 | headers: { 'content-type': 'text/html; charset=utf-8' } 49 | }) 50 | 51 | const $ = cheerio.load(output) 52 | t.is($(`meta[name="${prefix}:ios:url"]`).attr('content'), '') 53 | t.is($(`meta[property="${prefix}:ios:url"]`).attr('content'), undefined) 54 | }) 55 | }) 56 | 57 | test("don't rewrite meta if content is empty", t => { 58 | const output = html({ 59 | rewriteHtml: 
true, 60 | url: 'https://kikobeats.com', 61 | html: composeHtml(['']), 62 | headers: { 'content-type': 'text/html; charset=utf-8' } 63 | }) 64 | 65 | const $ = cheerio.load(output) 66 | t.is($('meta[property="title"]').attr('content'), '') 67 | t.is($('meta[name="title"]').attr('content'), undefined) 68 | }) 69 | 70 | test('rewrite multiple meta wrong markup', t => { 71 | const output = html({ 72 | rewriteHtml: true, 73 | url: 'https://kikobeats.com', 74 | html: composeHtml([ 75 | '', 76 | '', 77 | '' 78 | ]), 79 | headers: { 'content-type': 'text/html; charset=utf-8' } 80 | }) 81 | 82 | const $ = cheerio.load(output) 83 | t.is($('meta[name="title"]').attr('content'), 'Kiko Beats') 84 | t.is($('meta[property="title"]').attr('content'), undefined) 85 | t.is( 86 | $('meta[name="description"]').attr('content'), 87 | 'Personal website of Kiko Beats' 88 | ) 89 | t.is($('meta[property="description"]').attr('content'), undefined) 90 | t.is( 91 | $('meta[name="image"]').attr('content'), 92 | 'https://kikobeats.com/image.jpg' 93 | ) 94 | t.is($('meta[property="image"]').attr('content'), undefined) 95 | }) 96 | 97 | test("rewrite multiple 'twitter:' wrong markup", t => { 98 | const output = html({ 99 | rewriteHtml: true, 100 | url: 'https://kikobeats.com', 101 | html: composeHtml([ 102 | '', 103 | '', 104 | '' 105 | ]), 106 | headers: { 'content-type': 'text/html; charset=utf-8' } 107 | }) 108 | 109 | const $ = cheerio.load(output) 110 | t.is($('meta[name="twitter:title"]').attr('content'), 'Kiko Beats') 111 | t.is($('meta[property="twitter:title"]').attr('content'), undefined) 112 | t.is( 113 | $('meta[name="twitter:description"]').attr('content'), 114 | 'Personal website of Kiko Beats' 115 | ) 116 | t.is($('meta[property="twitter:description"]').attr('content'), undefined) 117 | t.is( 118 | $('meta[name="twitter:image"]').attr('content'), 119 | 'https://kikobeats.com/image.jpg' 120 | ) 121 | t.is($('meta[property="twitter:image"]').attr('content'), undefined) 122 | }) 123 
| ;['al', 'fb', 'og'].forEach(prefix => { 124 | test(`rewrite multiple '${prefix}' wrong markup`, t => { 125 | const output = html({ 126 | rewriteHtml: true, 127 | url: 'https://kikobeats.com', 128 | html: composeHtml([ 129 | ``, 130 | `` 131 | ]), 132 | headers: { 'content-type': 'text/html; charset=utf-8' } 133 | }) 134 | 135 | const $ = cheerio.load(output) 136 | t.is($(`meta[property="${prefix}:app_id"]`).attr('content'), '1234') 137 | t.is($(`meta[name="${prefix}:app_id"]`).attr('content'), undefined) 138 | t.is($(`meta[property="${prefix}:session_id"]`).attr('content'), '5678') 139 | t.is($(`meta[name="${prefix}:session_id"]`).attr('content'), undefined) 140 | }) 141 | }) 142 | -------------------------------------------------------------------------------- /test/html/rewrite-urls.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const path = require('path') 4 | const test = require('ava') 5 | const fs = require('fs') 6 | 7 | const { prettyHtml } = require('../helpers') 8 | 9 | const html = (...args) => require('../../src/html')(...args).html() 10 | 11 | test('remove localhost alike URLs', t => { 12 | const output = html({ 13 | rewriteUrls: true, 14 | url: 'https://kikobeats.com', 15 | html: ` 16 | 17 | 18 | kikobeats.com 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | Email 29 | FTP Link 30 | Base64 Image 31 | 32 | `, 33 | headers: { 'content-type': 'text/html; charset=utf-8' } 34 | }) 35 | 36 | t.snapshot(prettyHtml(output)) 37 | }) 38 | 39 | test('rewrites relative root URLs inside html markup', t => { 40 | const output = html({ 41 | rewriteUrls: true, 42 | url: 'https://browserless.js.org', 43 | html: fs.readFileSync( 44 | path.resolve(__dirname, '../fixtures/browserless.html'), 45 | 'utf8' 46 | ), 47 | headers: { 48 | 'content-type': 'text/html; charset=utf-8' 49 | } 50 | }) 51 | 52 | t.true(output.includes('https://browserless.js.org/static/main.min.js')) 53 | 
t.true(output.includes('https://unpkg.com/docsify/lib/docsify.min.js')) 54 | 55 | t.snapshot(prettyHtml(output)) 56 | }) 57 | 58 | test('rewrites relative URLs inside html markup', t => { 59 | const output = html({ 60 | rewriteUrls: true, 61 | url: 'https://moovility.me/', 62 | html: ` 63 | 64 | 65 | 66 | 67 | 68 | `, 69 | headers: { 70 | 'content-type': 'text/html; charset=utf-8' 71 | } 72 | }) 73 | 74 | t.true(output.includes('https://moovility.me/img/icons/MOV/icon2-76.png')) 75 | 76 | t.snapshot(prettyHtml(output)) 77 | }) 78 | 79 | test(" don't modify inline javascript", t => { 80 | const output = html({ 81 | rewriteUrls: true, 82 | url: 'https://www.latimes.com/opinion/story/2020-06-07/column-muralist-honors-african-americans-killed-by-police', 83 | html: ` 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | Print 92 | 93 | `, 94 | headers: { 95 | 'content-type': 'text/html;charset=UTF-8' 96 | } 97 | }) 98 | 99 | t.true( 100 | output.includes( 101 | 'Print' 102 | ) 103 | ) 104 | 105 | t.snapshot(prettyHtml(output)) 106 | }) 107 | 108 | test("don't modify non http protocols", t => { 109 | const output = html({ 110 | rewriteUrls: true, 111 | url: 'https://www.latimes.com/opinion/story/2020-06-07/column-muralist-honors-african-americans-killed-by-police', 112 | html: ` 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | `, 128 | headers: { 129 | 'content-type': 'text/html;charset=UTF-8' 130 | } 131 | }) 132 | 133 | t.true(output.includes('')) 134 | t.true(output.includes('')) 135 | t.true(output.includes('')) 137 | t.true(output.includes('')) 138 | t.true(output.includes('')) 139 | 140 | t.snapshot(prettyHtml(output)) 141 | }) 142 | 143 | test("don't modify data URIs", t => { 144 | const output = html({ 145 | rewriteUrls: true, 146 | url: 'https://example.com', 147 | html: ` 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | star 156 | 157 | `, 158 | headers: { 159 | 'content-type': 'text/html;charset=UTF-8' 160 | } 161 | }) 162 | 163 | 
t.true( 164 | output.includes( 165 | 'star' 166 | ) 167 | ) 168 | 169 | t.snapshot(prettyHtml(output)) 170 | }) 171 | 172 | test("don't modify undefined attributes", t => { 173 | const output = html({ 174 | rewriteUrls: true, 175 | url: 'https://moovility.me', 176 | html: ` 177 | 178 | 179 | 180 | Document 181 | 182 | 183 | 184 | 185 | `, 186 | headers: { 187 | 'content-type': 'text/html;charset=UTF-8' 188 | } 189 | }) 190 | 191 | t.true(output.includes("")) 192 | 193 | t.snapshot(prettyHtml(output)) 194 | }) 195 | -------------------------------------------------------------------------------- /test/html/snapshots/index.js.md: -------------------------------------------------------------------------------- 1 | # Snapshot report for `test/html/index.js` 2 | 3 | The actual snapshot is saved in `index.js.snap`. 4 | 5 | Generated by [AVA](https://avajs.dev). 6 | 7 | ## add minimal html markup 8 | 9 | > Snapshot 1 10 | 11 | `␊ 12 | ␊ 13 | ␊ 14 | kikobeats.com␊ 15 | ␊ 16 | ␊ 17 | ␊ 18 | ␊ 19 | ` 20 | 21 | ## add meta charset 22 | 23 | > Snapshot 1 24 | 25 | `␊ 26 | ␊ 27 | ␊ 28 | kikobeats.com␊ 29 | ␊ 30 | ␊ 31 | ␊ 32 | ␊ 33 | ␊ 34 | ` 35 | 36 | ## add doctype 37 | 38 | > Snapshot 1 39 | 40 | `␊ 41 | ␊ 42 | ␊ 43 | kikobeats.com␊ 44 | ␊ 45 | ␊ 46 | ␊ 47 | ␊ 48 | ␊ 49 | ␊ 50 | ` 51 | 52 | ## add json markup 53 | 54 | > Snapshot 1 55 | 56 | `␊ 57 | ␊ 58 | ␊ 59 | geolocation.microlink.io␊ 60 | ␊ 61 | ␊ 62 | ␊ 63 |
{"origin":"83.46.149.83","city":"Madrid","alpha2":"ES","alpha3":"ESP","callingCodes":["+34"],"currencies":{"EUR":{"name":"Euro","symbol":"€"}},"eeaMember":true,"euMember":true,"flag":"🇪🇸","languages":{"spa":"Spanish"},"numeric":724,"tld":[".es"],"region":"MD","latitude":"40.4163","longitude":"-3.6934","timezone":"Europe/Madrid","headers":{"accept":"*/*","accept-encoding":"gzip","cdn-loop":"cloudflare","cf-connecting-ip":"83.46.149.83","cf-ipcountry":"ES","cf-ray":"73a29be38cdf37c7-MAD","cf-visitor":"{"scheme":"https"}","connection":"Keep-Alive","host":"geolocation.microlink.io","user-agent":"curl/7.79.1","x-forwarded-for":"172.70.57.171","x-forwarded-host":"geolocation.microlink.io","x-forwarded-proto":"https","x-real-ip":"172.70.57.171","x-vercel-edge-region":"dev","x-vercel-id":"cdg1::x96k9-1660405852783-a0083d276cde","x-vercel-ip-city":"Madrid","x-vercel-ip-country":"ES","x-vercel-ip-country-region":"MD","x-vercel-ip-latitude":"40.4163","x-vercel-ip-longitude":"-3.6934","x-vercel-ip-timezone":"Europe/Madrid","x-vercel-proxied-for":"172.70.57.171"}}
␊ 64 | ␊ 65 | ` 66 | 67 | ## add image markup 68 | 69 | > Snapshot 1 70 | 71 | `␊ 72 | ␊ 73 | ␊ 74 | giphy.gif␊ 75 | ␊ 76 | ␊ 77 | ␊ 78 | ␊ 79 | ` 80 | 81 | ## add audio markup 82 | 83 | > Snapshot 1 84 | 85 | `␊ 86 | ␊ 87 | ␊ 88 | 13.01.mp3␊ 89 | ␊ 90 | ␊ 91 | ␊ 92 | ␊ 95 | ` 96 | 97 | ## add video markup 98 | 99 | > Snapshot 1 100 | 101 | `␊ 102 | ␊ 103 | ␊ 104 | big_buck_bunny_720p_1mb.mp4␊ 105 | ␊ 106 | ␊ 107 | ␊ 108 | ␊ 111 | ` 112 | 113 | ## styles injection 114 | 115 | > Snapshot 1 116 | 117 | `␊ 118 | ␊ 119 | ␊ 120 | ␊ 121 | ␊ 122 | Document␊ 123 | ␊ 124 | ␊ 125 | ␊ 126 | ␊ 127 | ␊ 128 | ␊ 133 | ␊ 134 | ␊ 135 | ␊ 136 | ` 137 | 138 | ## scripts injection 139 | 140 | > Snapshot 1 141 | 142 | `␊ 143 | ␊ 144 | ␊ 145 | ␊ 146 | ␊ 147 | Document␊ 148 | ␊ 149 | ␊ 150 | ␊ 151 | ␊ 152 | ␊ 168 | ␊ 169 | ␊ 170 | ␊ 171 | ␊ 172 | ` 173 | 174 | ## hide elements 175 | 176 | > Snapshot 1 177 | 178 | `␊ 179 | ␊ 180 | ␊ 181 | ␊ 182 | ␊ 183 | Document␊ 184 | ␊ 185 | ␊ 186 | ␊ 187 | ␊ 188 | ␊ 193 | ␊ 194 | ␊ 195 | ␊ 196 | ` 197 | 198 | ## remove elements 199 | 200 | > Snapshot 1 201 | 202 | `␊ 203 | ␊ 204 | ␊ 205 | ␊ 206 | ␊ 207 | Document␊ 208 | ␊ 209 | ␊ 210 | ␊ 211 | ␊ 212 | ␊ 217 | ␊ 218 | ␊ 219 | ␊ 220 | ` 221 | -------------------------------------------------------------------------------- /test/html/snapshots/index.js.snap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/html/snapshots/index.js.snap -------------------------------------------------------------------------------- /test/html/snapshots/rewrite-css-urls.js.md: -------------------------------------------------------------------------------- 1 | # Snapshot report for `test/html/rewrite-css-urls.js` 2 | 3 | The actual snapshot is saved in `rewrite-css-urls.js.snap`. 4 | 5 | Generated by [AVA](https://avajs.dev). 
6 | 7 | ## don't modify html markup 8 | 9 | > Snapshot 1 10 | 11 | `␊ 12 | ␊ 13 | ␊ 14 | ␊ 19 | ␊ 20 | how-to-apply-oil-plus-2c-to-furniture␊ 21 | ␊ 22 | ␊ 23 | ␊ 24 | ␊ 25 | ␊ 26 | ␊ 27 | ` 28 | 29 | ## rewrites relative URLs inside stylesheet 30 | 31 | > Snapshot 1 32 | 33 | `␊ 34 | ␊ 35 | ␊ 36 | kikobeats.com␊ 37 | ␊ 38 | ␊ 39 | ␊ 40 | ␊ 41 | ␊ 42 |
␊ 43 |
␊ 44 | ␊ 45 | ` 46 | -------------------------------------------------------------------------------- /test/html/snapshots/rewrite-css-urls.js.snap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/html/snapshots/rewrite-css-urls.js.snap -------------------------------------------------------------------------------- /test/html/snapshots/rewrite-urls.js.md: -------------------------------------------------------------------------------- 1 | # Snapshot report for `test/html/rewrite-urls.js` 2 | 3 | The actual snapshot is saved in `rewrite-urls.js.snap`. 4 | 5 | Generated by [AVA](https://avajs.dev). 6 | 7 | ## remove localhost alike URLs 8 | 9 | > Snapshot 1 10 | 11 | `␊ 12 | ␊ 13 | ␊ 14 | kikobeats.com␊ 15 | ␊ 16 | ␊ 17 | ␊ 18 | ␊ 19 | ␊ 20 | ␊ 21 | Email␊ 22 | FTP Link␊ 23 | Base64 Image␊ 24 | ␊ 25 | ` 26 | 27 | ## rewrites relative root URLs inside html markup 28 | 29 | > Snapshot 1 30 | 31 | `␊ 32 | ␊ 33 | ␊ 34 | ␊ 35 | ␊ 36 | ␊ 37 | ␊ 38 | ␊ 39 | ␊ 40 | ␊ 41 | ␊ 42 | ␊ 43 | browserless, a puppeter-like Node.js library for interacting with Headless production scenarios.␊ 44 | ␊ 45 | ␊ 46 | ␊ 47 | ␊ 48 | ␊ 49 | ␊ 50 | ␊ 51 | ␊ 52 | ␊ 53 | ␊ 54 | ␊ 55 | ␊ 56 | ␊ 57 | ␊ 58 | ␊ 59 | ␊ 60 | ␊ 61 | ␊ 62 | ␊ 63 | ␊ 64 | ␊ 65 | ␊ 66 | ␊ 67 | ␊ 68 | ␊ 69 | ␊ 70 | ␊ 71 | ␊ 72 | ␊ 73 | ␊ 74 | ␊ 75 | ␊ 76 | ␊ 77 | ␊ 78 |
␊ 79 | ␊ 80 | ␊ 81 | ␊ 82 | ␊ 83 | ␊ 84 | ␊ 85 | ␊ 86 | ␊ 87 | ` 88 | 89 | ## rewrites relative URLs inside html markup 90 | 91 | > Snapshot 1 92 | 93 | `␊ 94 | ␊ 95 | ␊ 96 | ␊ 97 | moovility.me␊ 98 | ␊ 99 | ␊ 100 | ␊ 101 | ␊ 102 | ␊ 103 | ␊ 104 | ` 105 | 106 | ## don't modify inline javascript 107 | 108 | > Snapshot 1 109 | 110 | `␊ 111 | ␊ 112 | ␊ 113 | ␊ 114 | ␊ 115 | column-muralist-honors-african-americans-killed-by-police␊ 116 | ␊ 117 | ␊ 118 | ␊ 119 | ␊ 120 | ␊ 121 | ␊ 122 | Print␊ 123 | ␊ 124 | ` 125 | 126 | ## don't modify non http protocols 127 | 128 | > Snapshot 1 129 | 130 | `␊ 131 | ␊ 132 | ␊ 133 | ␊ 134 | ␊ 135 | column-muralist-honors-african-americans-killed-by-police␊ 136 | ␊ 137 | ␊ 138 | ␊ 139 | ␊ 140 | ␊ 141 | ␊ 142 | ␊ 143 | ␊ 144 | ␊ 145 | ␊ 146 | ␊ 147 | ` 148 | 149 | ## don't modify data URIs 150 | 151 | > Snapshot 1 152 | 153 | `␊ 154 | ␊ 155 | ␊ 156 | ␊ 157 | ␊ 158 | example.com␊ 159 | ␊ 160 | ␊ 161 | ␊ 162 | ␊ 163 | star␊ 164 | ␊ 165 | ` 166 | 167 | ## don't modify undefined attributes 168 | 169 | > Snapshot 1 170 | 171 | `␊ 172 | ␊ 173 | ␊ 174 | Document␊ 175 | ␊ 176 | ␊ 177 | ␊ 178 | ␊ 179 | ␊ 180 | ␊ 183 | ␊ 184 | ` 185 | -------------------------------------------------------------------------------- /test/html/snapshots/rewrite-urls.js.snap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/html/snapshots/rewrite-urls.js.snap -------------------------------------------------------------------------------- /test/index.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const PCancelable = require('p-cancelable') 4 | const cheerio = require('cheerio') 5 | const test = require('ava') 6 | 7 | const { initBrowserless, runServer, prettyHtml } = require('./helpers') 8 | const getHTML = require('..') 9 | 10 | const getBrowserless = initBrowserless(test) 
11 | 12 | const wait = async (promise, prop) => { 13 | const res = await promise 14 | return prop ? res[prop] : res 15 | } 16 | 17 | test('throw an error if `getBrowserless` is not provided', async t => { 18 | const url = 'https://example.com' 19 | const error = await t.throwsAsync(getHTML(url)) 20 | t.is(error.name, 'TypeError') 21 | t.is( 22 | error.message, 23 | "Need to provide a `getBrowserless` function. Try to pass `getBrowserless: require('browserless')`" 24 | ) 25 | }) 26 | 27 | test('promise is cancelable', async t => { 28 | const url = 'https://example.com' 29 | t.true(getHTML(url, { getBrowserless: () => {} }) instanceof PCancelable) 30 | t.true( 31 | getHTML.getContent(url, 'fetch', { 32 | getBrowserless: () => {} 33 | }) instanceof PCancelable 34 | ) 35 | }) 36 | 37 | test('reachable URL', async t => { 38 | const url = 'https://example.com' 39 | const [prerenderDisabled, prerenderEnabled] = await Promise.all([ 40 | getHTML(url, { prerender: false, getBrowserless }), 41 | getHTML(url, { 42 | prerender: true, 43 | getBrowserless, 44 | puppeteerOpts: { adblock: false } 45 | }) 46 | ]) 47 | 48 | t.is( 49 | await wait( 50 | getHTML(url, { prerender: false, getBrowserless }), 51 | 'statusCode' 52 | ), 53 | 200 54 | ) 55 | t.is( 56 | await wait( 57 | getHTML(url, { 58 | prerender: true, 59 | getBrowserless, 60 | puppeteerOpts: { adblock: false } 61 | }), 62 | 'statusCode' 63 | ), 64 | 200 65 | ) 66 | 67 | t.is(prerenderDisabled.statusCode, prerenderEnabled.statusCode) 68 | t.is(prerenderDisabled.statusCode, 200) 69 | 70 | t.true(Object.keys(prerenderDisabled.headers).length > 0) 71 | t.true(Object.keys(prerenderEnabled.headers).length > 0) 72 | t.is(typeof prerenderDisabled.headers, typeof prerenderEnabled.headers) 73 | 74 | t.true(prerenderDisabled.html.length > 0) 75 | t.true(prerenderEnabled.html.length > 0) 76 | t.is(typeof prerenderDisabled.html, typeof prerenderEnabled.html) 77 | }) 78 | 79 | test('timeout URL', async t => { 80 | const url = 
'https://test-timeout.vercel.app' 81 | 82 | const [prerenderDisabled, prerenderEnabled] = await Promise.all([ 83 | getHTML(url, { 84 | prerender: false, 85 | getBrowserless, 86 | gotOpts: { timeout: 1000 } 87 | }), 88 | getHTML(url, { 89 | prerender: true, 90 | getBrowserless, 91 | puppeteerOpts: { timeout: 2000, adblock: false } 92 | }) 93 | ]) 94 | 95 | t.is(prerenderDisabled.url, prerenderEnabled.url) 96 | t.is(prerenderDisabled.html, prerenderEnabled.html) 97 | t.is(prerenderDisabled.statusCode, prerenderEnabled.statusCode) 98 | t.deepEqual(prerenderDisabled.headers, prerenderEnabled.headers) 99 | }) 100 | 101 | test('unreachable URL', async t => { 102 | const url = 'https://notexisturl.dev' 103 | 104 | const [prerenderDisabled, prerenderEnabled] = await Promise.all([ 105 | getHTML(url, { prerender: false, getBrowserless }), 106 | getHTML(url, { 107 | prerender: true, 108 | getBrowserless, 109 | puppeteerOpts: { adblock: false } 110 | }) 111 | ]) 112 | 113 | t.is(prerenderDisabled.url, prerenderEnabled.url) 114 | t.is(prerenderDisabled.html, prerenderEnabled.html) 115 | t.is(prerenderDisabled.statusCode, prerenderEnabled.statusCode) 116 | t.deepEqual(prerenderDisabled.headers, prerenderEnabled.headers) 117 | }) 118 | 119 | test('from audio URL', async t => { 120 | const targetUrl = 121 | 'https://audiodemos.github.io/vctk_set0/embedadapt_100sample.wav' 122 | const { url, stats, html } = await getHTML(targetUrl, { 123 | getBrowserless, 124 | prerender: false 125 | }) 126 | 127 | t.is(stats.mode, 'fetch') 128 | t.is(url, targetUrl) 129 | t.snapshot(prettyHtml(html)) 130 | }) 131 | 132 | test('from image URL', async t => { 133 | const targetUrl = 'https://kikobeats.com/images/avatar.jpg' 134 | const { url, stats, html } = await getHTML(targetUrl, { getBrowserless }) 135 | 136 | t.is(stats.mode, 'fetch') 137 | t.is(url, targetUrl) 138 | 139 | const $ = cheerio.load(html) 140 | $('meta[name="date"]').remove() 141 | 142 | t.snapshot(prettyHtml($.html())) 143 | }) 144 
| 145 | test('from SVG image URL', async t => { 146 | const targetUrl = 'https://cdn.microlink.io/file-examples/sample.svg' 147 | const { stats } = await getHTML(targetUrl, { getBrowserless }) 148 | t.true(stats.timing < 3000) 149 | t.is(stats.mode, 'fetch') 150 | }) 151 | 152 | test('from big image URL', async t => { 153 | const targetUrl = 154 | 'https://static.jutarnji.hr/images/live-multimedia/binary/2016/6/17/10/iStock_82744687_XXLARGE.jpg' 155 | const { stats } = await getHTML(targetUrl, { getBrowserless }) 156 | t.true(stats.timing < 3000) 157 | t.is(stats.mode, 'fetch') 158 | }) 159 | 160 | test('from URL with no content type', async t => { 161 | const targetUrl = await runServer(t, (_, res) => { 162 | res.end('.') 163 | }) 164 | const { stats } = await getHTML(targetUrl, { 165 | getBrowserless, 166 | prerender: false 167 | }) 168 | t.is(stats.mode, 'fetch') 169 | }) 170 | 171 | test('from image URL that returns HTML markup', async t => { 172 | const targetUrl = 173 | 'https://www.europapress.es/chance/gente/%7B%7BrutaFoto%7D%7D%7B%7Bfechor%7D%7D_%7B%7BanchoFoto%7D%7D_%7B%7BaltoFoto%7D%7D%7B%7BversionFoto%7D%7D.jpg' 174 | const { stats } = await getHTML(targetUrl, { getBrowserless }) 175 | t.true(stats.timing < 3000) 176 | t.is(stats.mode, 'fetch') 177 | }) 178 | 179 | test('from video URL', async t => { 180 | const targetUrl = 'https://cdn.microlink.io/file-examples/sample.mp4' 181 | const { url, stats, html } = await getHTML(targetUrl, { 182 | prerender: false, 183 | getBrowserless 184 | }) 185 | 186 | t.is(stats.mode, 'fetch') 187 | t.is(url, targetUrl) 188 | t.snapshot(prettyHtml(html)) 189 | }) 190 | 191 | test('from bad SSL URL', async t => { 192 | const targetUrl = 'https://self-signed.badssl.com/' 193 | const { url, stats, html } = await getHTML(targetUrl, { 194 | prerender: false, 195 | getBrowserless, 196 | gotOpts: { 197 | https: { rejectUnauthorized: false } 198 | } 199 | }) 200 | 201 | t.true(html.includes('background: red')) 202 | 
t.is(stats.mode, 'fetch') 203 | t.is(url, targetUrl) 204 | t.snapshot(prettyHtml(html)) 205 | }) 206 | -------------------------------------------------------------------------------- /test/is-fetch-mode.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const test = require('ava') 4 | 5 | const { isFetchMode } = require('..') 6 | 7 | test('true', t => { 8 | t.true( 9 | isFetchMode( 10 | 'https://www.abc.net.au/news/2023-06-14/idpwd-2023-calling-all-budding-storytellers-with-disability/102388090' 11 | ) 12 | ) 13 | t.true( 14 | isFetchMode('https://twitter.com/Kikobeats/status/1741205717636264436') 15 | ) 16 | }) 17 | -------------------------------------------------------------------------------- /test/mode.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const test = require('ava') 4 | 5 | const getHTML = require('../src') 6 | const { initBrowserless } = require('./helpers') 7 | 8 | const getBrowserless = initBrowserless(test) 9 | 10 | test('`{ prerender: true }`', async t => { 11 | const url = 'https://example.com' 12 | const { stats } = await getHTML(url, { getBrowserless }) 13 | t.is(stats.mode, 'prerender') 14 | }) 15 | 16 | test('`{ prerender: false }`', async t => { 17 | const url = 'https://example.com' 18 | const { stats } = await getHTML(url, { prerender: false, getBrowserless }) 19 | t.is(stats.mode, 'fetch') 20 | }) 21 | 22 | test("`{ prerender: 'auto' }`", async t => { 23 | { 24 | const url = 'https://google.com' 25 | const { stats } = await getHTML(url, { 26 | getBrowserless, 27 | puppeteerOpts: { adblock: false } 28 | }) 29 | t.is(stats.mode, 'fetch') 30 | } 31 | { 32 | const url = 'https://twitter.com/Kikobeats/status/1741205717636264436' 33 | const { html, stats } = await getHTML(url, { 34 | headers: { 35 | 'user-agent': 'Slackbot 1.0 (+https://api.slack.com/robots)' 36 | }, 37 | getBrowserless, 38 | puppeteerOpts: { adblock: 
false } 39 | }) 40 | t.true(html.includes('og:title')) 41 | t.is(stats.mode, 'fetch') 42 | } 43 | }) 44 | 45 | test.skip('prerender error fallback into fetch mode', async t => { 46 | const url = 47 | 'https://www.sportsnet.ca/hockey/nhl/leafs-john-tavares-return-new-york-hope-positive/' 48 | const { stats, html } = await getHTML(url, { 49 | prerender: true, 50 | getBrowserless, 51 | puppeteerOpts: { adblock: false } 52 | }) 53 | t.true(!!html) 54 | t.is(stats.mode, 'fetch') 55 | }) 56 | -------------------------------------------------------------------------------- /test/pdf.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const cheerio = require('cheerio') 4 | const test = require('ava') 5 | 6 | const { initBrowserless, prettyHtml } = require('./helpers') 7 | const getHTML = require('..') 8 | 9 | const getBrowserless = initBrowserless(test) 10 | 11 | const PDF_OVER_TRESHOLD = 'https://cdn.microlink.io/file-examples/sample.pdf' 12 | const PDF_UNDER_TRESHOLD = 'https://pdfobject.com/pdf/sample.pdf' 13 | 14 | test('disable if `mutool` is not installed', async t => { 15 | const targetUrl = 'https://cdn.microlink.io/file-examples/sample.pdf' 16 | const { url, stats, html } = await getHTML(targetUrl, { 17 | mutool: false, 18 | getBrowserless 19 | }) 20 | 21 | const $ = cheerio.load(html) 22 | $('meta[name="date"]').remove() 23 | 24 | t.is(url, targetUrl) 25 | t.snapshot(prettyHtml($.html())) 26 | t.is(stats.mode, 'fetch') 27 | }) 28 | 29 | test('turn PDF into HTML markup over the treshold', async t => { 30 | const targetUrl = PDF_OVER_TRESHOLD 31 | const { url, stats, html } = await getHTML(targetUrl, { 32 | getBrowserless 33 | }) 34 | 35 | const $ = cheerio.load(html) 36 | t.is(url, targetUrl) 37 | t.is( 38 | $('p').first().text(), 39 | 'Instructions for Adding Your Logo & Address to AAO-HNSF Patient Handouts' 40 | ) 41 | t.is(stats.mode, 'fetch') 42 | }) 43 | 44 | test('turn PDF into HTML markup under the 
treshold', async t => { 45 | const targetUrl = PDF_UNDER_TRESHOLD 46 | const { url, stats, html } = await getHTML(targetUrl, { 47 | getBrowserless 48 | }) 49 | const $ = cheerio.load(html) 50 | t.is(url, targetUrl) 51 | t.is($('p').eq(1).text(), 'This is a simple PDF file. Fun fun fun.') 52 | t.is(stats.mode, 'fetch') 53 | }) 54 | -------------------------------------------------------------------------------- /test/redirects.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const test = require('ava') 4 | 5 | const { initBrowserless } = require('./helpers') 6 | const getHTML = require('..') 7 | 8 | const getBrowserless = initBrowserless(test) 9 | 10 | ;[true, false].forEach(prerender => { 11 | const mode = prerender ? 'prerender' : 'fetch' 12 | 13 | test(`${mode} » collect redirects`, async t => { 14 | const targetUrl = 15 | 'https://test-redirect-drab.vercel.app/?url=https%3A%2F%2Ftest-redirect-drab.vercel.app%3Furl%3Dhttps%253A%252F%252Ftest-redirect-drab.vercel.app%252F%253Furl%253Dhttps%253A%252F%252Fexample.com' 16 | 17 | const { redirects } = await getHTML(targetUrl, { 18 | prerender, 19 | getBrowserless 20 | }) 21 | 22 | t.deepEqual(redirects, [ 23 | { 24 | statusCode: 302, 25 | url: 'https://test-redirect-drab.vercel.app/?url=https%3A%2F%2Ftest-redirect-drab.vercel.app%3Furl%3Dhttps%253A%252F%252Ftest-redirect-drab.vercel.app%252F%253Furl%253Dhttps%253A%252F%252Fexample.com' 26 | }, 27 | { 28 | statusCode: 302, 29 | url: 'https://test-redirect-drab.vercel.app/?url=https%3A%2F%2Ftest-redirect-drab.vercel.app%2F%3Furl%3Dhttps%3A%2F%2Fexample.com' 30 | }, 31 | { 32 | statusCode: 302, 33 | url: 'https://test-redirect-drab.vercel.app/?url=https://example.com' 34 | } 35 | ]) 36 | }) 37 | }) 38 | -------------------------------------------------------------------------------- /test/snapshots/index.js.md: -------------------------------------------------------------------------------- 1 | # Snapshot 
report for `test/index.js` 2 | 3 | The actual snapshot is saved in `index.js.snap`. 4 | 5 | Generated by [AVA](https://avajs.dev). 6 | 7 | ## from audio URL 8 | 9 | > Snapshot 1 10 | 11 | `␊ 12 | ␊ 13 | ␊ 14 | embedadapt_100sample.wav␊ 15 | ␊ 16 | ␊ 17 | ␊ 18 | ␊ 19 | ␊ 22 | ` 23 | 24 | ## from image URL 25 | 26 | > Snapshot 1 27 | 28 | `␊ 29 | ␊ 30 | ␊ 31 | avatar.jpg␊ 32 | ␊ 33 | ␊ 34 | ␊ 35 | ␊ 36 | ` 37 | 38 | ## from video URL 39 | 40 | > Snapshot 1 41 | 42 | `␊ 43 | ␊ 44 | ␊ 45 | sample.mp4␊ 46 | ␊ 47 | ␊ 48 | ␊ 49 | ␊ 50 | ␊ 53 | ` 54 | 55 | ## from bad SSL URL 56 | 57 | > Snapshot 1 58 | 59 | `␊ 60 | ␊ 61 | ␊ 62 | ␊ 63 | ␊ 64 | ␊ 65 | ␊ 66 | self-signed.badssl.com␊ 67 | ␊ 68 | ␊ 73 | ␊ 74 | ␊ 75 | ␊ 76 | ␊ 77 | ␊ 78 |
␊ 79 |

␊ 80 | self-signed.
badssl.com␊ 81 |

␊ 82 |
␊ 83 | ␊ 84 | ` 85 | -------------------------------------------------------------------------------- /test/snapshots/index.js.snap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/snapshots/index.js.snap -------------------------------------------------------------------------------- /test/snapshots/pdf.js.md: -------------------------------------------------------------------------------- 1 | # Snapshot report for `test/pdf.js` 2 | 3 | The actual snapshot is saved in `pdf.js.snap`. 4 | 5 | Generated by [AVA](https://avajs.dev). 6 | 7 | ## disable if `mutool` is not installed 8 | 9 | > Snapshot 1 10 | 11 | `␊ 12 | ␊ 13 | ␊ 14 | sample.pdf␊ 15 | ␊ 16 | ␊ 17 | ␊ 18 | ␊ 19 | ` 20 | -------------------------------------------------------------------------------- /test/snapshots/pdf.js.snap: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microlinkhq/html-get/32ca81a633865e8e236408ec1081d7c0e02b1292/test/snapshots/pdf.js.snap -------------------------------------------------------------------------------- /test/url.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const test = require('ava') 4 | 5 | const { initBrowserless, runServer, prettyHtml } = require('./helpers') 6 | const getHTML = require('..') 7 | 8 | const getBrowserless = initBrowserless(test) 9 | 10 | ;[false, true].forEach(prerender => { 11 | const mode = prerender ? 'prerender' : 'fetch' 12 | test(`${mode} » as string`, async t => { 13 | const url = await runServer(t, (_, res) => 14 | res.end('.') 15 | ) 16 | const { html } = await getHTML(url.toString(), { 17 | getBrowserless, 18 | prerender, 19 | puppeteerOpts: { adblock: false, animations: true } 20 | }) 21 | 22 | t.is( 23 | prettyHtml(html), 24 | prettyHtml(` 25 | 26 | 27 | . 
28 | 29 | 30 | 31 | 32 | `) 33 | ) 34 | }) 35 | 36 | test(`${mode} » as WHATWG URL object`, async t => { 37 | const url = await runServer(t, (_, res) => 38 | res.end('.') 39 | ) 40 | const { html } = await getHTML(url, { 41 | getBrowserless, 42 | prerender, 43 | puppeteerOpts: { adblock: false, animations: true } 44 | }) 45 | 46 | t.is( 47 | prettyHtml(html), 48 | prettyHtml(` 49 | 50 | 51 | . 52 | 53 | 54 | 55 | 56 | `) 57 | ) 58 | }) 59 | }) 60 | -------------------------------------------------------------------------------- /test/util/get-charset.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const test = require('ava') 4 | 5 | const { getCharset } = require('../../src/util') 6 | 7 | const { createHeaders } = require('../helpers') 8 | 9 | const contentType = createHeaders('content-type') 10 | 11 | test('returns lower case value detected from content-type', t => { 12 | t.is(getCharset(contentType('text/html; charset=UTF-8')), 'utf-8') 13 | t.is(getCharset(contentType('text/html; charset=ISO-8859-1')), 'iso-8859-1') 14 | }) 15 | 16 | test('returns undefined when charset is not detected', t => { 17 | t.is(getCharset(contentType('text/html; foo=bar')), undefined) 18 | t.is(getCharset(contentType('text/html')), undefined) 19 | t.is(getCharset(contentType('text/html')), undefined) 20 | t.is(getCharset(contentType('invalid/type')), undefined) 21 | }) 22 | -------------------------------------------------------------------------------- /test/util/get-content-length.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const test = require('ava') 4 | 5 | const { getContentLength } = require('../../src/util') 6 | 7 | const { PDF_SIZE_TRESHOLD } = require('../../src') 8 | 9 | const { createHeaders } = require('../helpers') 10 | 11 | const contentLength = createHeaders('content-length') 12 | 13 | test('parse content length into number', t => { 14 | { 15 | 
const raw = PDF_SIZE_TRESHOLD - PDF_SIZE_TRESHOLD * 0.25 16 | const input = String(raw) 17 | const length = getContentLength(contentLength(input)) 18 | t.is(length, raw) 19 | t.true(length < PDF_SIZE_TRESHOLD) 20 | } 21 | { 22 | const raw = PDF_SIZE_TRESHOLD + PDF_SIZE_TRESHOLD * 0.25 23 | const input = String(raw) 24 | const length = getContentLength(contentLength(input)) 25 | t.is(length, raw) 26 | t.false(length < PDF_SIZE_TRESHOLD) 27 | } 28 | }) 29 | 30 | test('returns 0 if value is not present', t => { 31 | const length = getContentLength(contentLength()) 32 | t.is(length, NaN) 33 | t.false(length > PDF_SIZE_TRESHOLD) 34 | }) 35 | -------------------------------------------------------------------------------- /test/util/get-content-type.js: -------------------------------------------------------------------------------- 1 | 'use strict' 2 | 3 | const test = require('ava') 4 | 5 | const { getContentType } = require('../../src/util') 6 | 7 | const { createHeaders } = require('../helpers') 8 | 9 | const contentType = createHeaders('content-type') 10 | 11 | test('return media type', t => { 12 | t.is( 13 | getContentType(contentType('application/pdf; charset=utf-8')), 14 | 'application/pdf' 15 | ) 16 | t.is( 17 | getContentType(contentType('APPLICATION/PDF; charset=utf-8')), 18 | 'application/pdf' 19 | ) 20 | t.is( 21 | getContentType(contentType('INVALID/TYPE; charset=utf-8')), 22 | 'invalid/type' 23 | ) 24 | }) 25 | --------------------------------------------------------------------------------