├── .codeclimate.yml ├── .eslintrc.yml ├── .github ├── dependabot.yml └── workflows │ ├── codeql.yml │ ├── node.js.yml │ └── stale.yml ├── .gitignore ├── LICENSE ├── README.md ├── lib ├── browserUtils │ ├── .eslintrc.yml │ └── scrollToBottom.js ├── index.js └── logger.js ├── package.json └── test ├── mock ├── index.html └── navigation.html └── puppeteer-plugin.test.js /.codeclimate.yml: -------------------------------------------------------------------------------- 1 | engines: 2 | duplication: 3 | enabled: true 4 | config: 5 | languages: 6 | - javascript 7 | eslint: 8 | enabled: true 9 | channel: "eslint-7" 10 | fixme: 11 | enabled: true 12 | ratings: 13 | paths: 14 | - "**.js" 15 | exclude_paths: 16 | - test/ 17 | - lib/browserUtils/ 18 | -------------------------------------------------------------------------------- /.eslintrc.yml: -------------------------------------------------------------------------------- 1 | extends: "eslint:recommended" 2 | parserOptions: 3 | ecmaVersion: 8 4 | sourceType: "module" 5 | env: 6 | node: true 7 | es6: true 8 | rules: 9 | consistent-return: "error" 10 | curly: "error" 11 | default-case: "error" 12 | dot-notation: "error" 13 | eqeqeq: "error" 14 | no-extend-native: "error" 15 | no-implicit-coercion: "error" 16 | no-loop-func: "error" 17 | no-multi-spaces: "error" 18 | no-throw-literal: "error" 19 | global-require: "error" 20 | no-path-concat: "error" 21 | brace-style: ["error", "1tbs", {allowSingleLine: true}] 22 | camelcase: "error" 23 | consistent-this: ["error", "self"] 24 | indent: ["error", "tab", {SwitchCase: 1}] 25 | linebreak-style: ["error", "unix"] 26 | eol-last: "error" 27 | quotes: ["error", "single"] 28 | semi: "error" 29 | space-infix-ops: "error" 30 | space-unary-ops: "error" 31 | func-names: "warn" 32 | space-before-function-paren: "warn" 33 | no-spaced-func: "warn" 34 | keyword-spacing: "error" 35 | space-before-blocks: "error" 36 | no-console: "error" 37 | 
-------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "npm" 4 | directory: "/" 5 | assignees: 6 | - "s0ph1e" 7 | open-pull-requests-limit: 10 8 | schedule: 9 | interval: "weekly" 10 | - package-ecosystem: "github-actions" 11 | directory: "/" 12 | assignees: 13 | - "aivus" 14 | schedule: 15 | interval: "weekly" 16 | -------------------------------------------------------------------------------- /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | name: "CodeQL" 2 | 3 | on: 4 | push: 5 | branches: ["master"] 6 | pull_request: 7 | branches: ["master"] 8 | schedule: 9 | - cron: '0 1 * * 2' 10 | 11 | jobs: 12 | analyze: 13 | name: Analyze 14 | runs-on: ubuntu-latest 15 | permissions: 16 | actions: read 17 | contents: read 18 | security-events: write 19 | 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | language: [ 'javascript' ] 24 | 25 | steps: 26 | - name: Checkout repository 27 | uses: actions/checkout@v4 28 | 29 | - name: Initialize CodeQL 30 | uses: github/codeql-action/init@v3 31 | with: 32 | languages: ${{ matrix.language }} 33 | 34 | - name: Autobuild 35 | uses: github/codeql-action/autobuild@v3 36 | 37 | - name: Perform CodeQL Analysis 38 | uses: github/codeql-action/analyze@v3 39 | with: 40 | category: "/language:${{matrix.language}}" 41 | -------------------------------------------------------------------------------- /.github/workflows/node.js.yml: -------------------------------------------------------------------------------- 1 | name: Node.js CI 2 | 3 | on: 4 | push: 5 | branches: [ master ] 6 | pull_request: 7 | branches: [ master ] 8 | schedule: 9 | - cron: '17 2 * * *' 10 | workflow_dispatch: ~ 11 | 12 | jobs: 13 | test: 14 | timeout-minutes: 10 15 | runs-on: ${{ matrix.os }} 16 | strategy: 17 | 
fail-fast: false 18 | matrix: 19 | node-version: 20 | - 18 21 | - 20 22 | - 22 23 | - current 24 | os: 25 | - ubuntu-latest 26 | - windows-latest 27 | include: 28 | - node-version: 22 29 | os: macos-latest 30 | 31 | steps: 32 | - uses: actions/checkout@v4 33 | - name: Use Node.js ${{ matrix.node-version }} 34 | uses: actions/setup-node@v4 35 | with: 36 | node-version: ${{ matrix.node-version }} 37 | - name: Disable AppArmor 38 | if: ${{ matrix.os == 'ubuntu-latest' }} 39 | run: echo 0 | sudo tee /proc/sys/kernel/apparmor_restrict_unprivileged_userns 40 | - run: npm i 41 | - run: npm test 42 | - run: npm run eslint 43 | if: ${{ matrix.node-version == '22' && matrix.os == 'ubuntu-latest' }} 44 | - name: Publish codeclimate code coverage 45 | if: ${{ matrix.node-version == '22' && matrix.os == 'ubuntu-latest' }} 46 | uses: paambaati/codeclimate-action@v9.0.0 47 | env: 48 | CC_TEST_REPORTER_ID: 150be11cde8a18d41d37df6c31823d35892fdd1dbf79c969142c6b3033104e46 49 | with: 50 | coverageLocations: | 51 | ${{github.workspace}}/coverage/lcov.info:lcov 52 | -------------------------------------------------------------------------------- /.github/workflows/stale.yml: -------------------------------------------------------------------------------- 1 | # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time. 2 | # 3 | # You can adjust the behavior by modifying this file. 
4 | # For more information, see: 5 | # https://github.com/actions/stale 6 | name: Mark stale issues and pull requests 7 | 8 | on: 9 | workflow_dispatch: ~ 10 | schedule: 11 | - cron: '39 3 * * *' 12 | 13 | jobs: 14 | stale: 15 | runs-on: ubuntu-latest 16 | permissions: 17 | issues: write 18 | 19 | steps: 20 | - uses: actions/stale@v9 21 | with: 22 | repo-token: ${{ secrets.GITHUB_TOKEN }} 23 | days-before-stale: 60 24 | days-before-close: 7 25 | 26 | # Do not stale PRs 27 | days-before-pr-stale: -1 28 | days-before-pr-close: -1 29 | 30 | exempt-issue-labels: 'bug,maybe-later,help wanted' 31 | stale-issue-message: 'This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.' 32 | stale-issue-label: 'wontfix' 33 | # debug-only: true 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | package-lock.json 3 | .idea 4 | coverage 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018-2023 Sofiia Antypenko 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Version](https://img.shields.io/npm/v/website-scraper-puppeteer.svg?style=flat)](https://www.npmjs.org/package/website-scraper-puppeteer) 2 | [![Downloads](https://img.shields.io/npm/dm/website-scraper-puppeteer.svg?style=flat)](https://www.npmjs.org/package/website-scraper-puppeteer) 3 | [![Node.js CI](https://github.com/website-scraper/website-scraper-puppeteer/actions/workflows/node.js.yml/badge.svg)](https://github.com/website-scraper/website-scraper-puppeteer) 4 | [![Test Coverage](https://codeclimate.com/github/website-scraper/website-scraper-puppeteer/badges/coverage.svg)](https://codeclimate.com/github/website-scraper/website-scraper-puppeteer/coverage) 5 | 6 | # website-scraper-puppeteer 7 | Plugin for [website-scraper](https://github.com/website-scraper/node-website-scraper) which returns html for dynamic websites using [puppeteer](https://github.com/puppeteer/puppeteer). 8 | 9 | This module is an Open Source Software maintained by one developer in free time. If you want to thank the author of this module you can use [GitHub Sponsors](https://github.com/sponsors/s0ph1e) or [Patreon](https://www.patreon.com/s0ph1e). 
10 | 11 | ## Requirements 12 | * nodejs version >= 18 13 | * website-scraper version >= 5 14 | 15 | ## Installation 16 | ```sh 17 | npm install website-scraper website-scraper-puppeteer 18 | ``` 19 | 20 | ## Usage 21 | ```javascript 22 | import scrape from 'website-scraper'; 23 | import PuppeteerPlugin from 'website-scraper-puppeteer'; 24 | 25 | await scrape({ 26 | urls: ['https://www.instagram.com/gopro/'], 27 | directory: '/path/to/save', 28 | plugins: [ 29 | new PuppeteerPlugin({ 30 | launchOptions: { headless: "new" }, /* optional */ 31 | gotoOptions: { waitUntil: "networkidle0" }, /* optional */ 32 | scrollToBottom: { timeout: 10000, viewportN: 10 }, /* optional */ 33 | blockNavigation: true, /* optional */ 34 | }) 35 | ] 36 | }); 37 | ``` 38 | Puppeteer plugin constructor accepts the following params: 39 | * `launchOptions` - *(optional)* - puppeteer launch options, can be found in [puppeteer docs](https://github.com/puppeteer/puppeteer/blob/main/docs/api/puppeteer.puppeteerlaunchoptions.md) 40 | * `gotoOptions` - *(optional)* - puppeteer page.goto options, can be found in [puppeteer docs](https://github.com/puppeteer/puppeteer/blob/main/docs/api/puppeteer.frame.goto.md#parameters) 41 | * `scrollToBottom` - *(optional)* - in some cases, the page needs to be scrolled down to render its assets (lazy loading). Because some pages can be really endless, the scrolldown process can be interrupted before reaching the bottom when one or both of the limitations below are reached: 42 | * `timeout` - in milliseconds 43 | * `viewportN` - viewport height multiplier 44 | * `blockNavigation` - *(optional)* - defines whether navigation away from the page is permitted or not. If it is set to true, then the page is locked to the current url and redirects with `location.replace(anotherPage)` will not pass. Defaults to `false` 45 | 46 | ## How it works 47 | It starts Chromium in headless mode, which just opens the page and waits until the page is loaded. 
/**
 * website-scraper plugin that loads every HTML response in a puppeteer page
 * and hands the markup reported by `page.content()` back to the scraper.
 * Non-HTML responses are passed through untouched.
 */
class PuppeteerPlugin {
	/**
	 * @param {object} [options]
	 * @param {object} [options.launchOptions={}] - passed as-is to `puppeteer.launch`
	 * @param {object} [options.gotoOptions={}] - passed as-is to `page.goto`
	 * @param {?{timeout: number, viewportN: number}} [options.scrollToBottom=null] - when set,
	 *   the page is scrolled after loading; both values are forwarded to the in-page scroll helper
	 * @param {boolean} [options.blockNavigation=false] - when truthy, main-frame navigation
	 *   requests to any url other than the scraped one are aborted
	 */
	constructor ({
		launchOptions = {},
		gotoOptions = {},
		scrollToBottom = null,
		blockNavigation = false
	} = {}) {
		this.launchOptions = launchOptions;
		this.gotoOptions = gotoOptions;
		this.scrollToBottom = scrollToBottom;
		this.blockNavigation = blockNavigation;
		this.browser = null;
		this.headers = {};

		logger.info('init plugin', { launchOptions, scrollToBottom, blockNavigation });
	}

	/**
	 * Registers the plugin's handlers for the scraper lifecycle.
	 *
	 * @param {function(string, function): void} registerAction - invoked once per
	 *   action name ('beforeStart', 'beforeRequest', 'afterResponse', 'afterFinish')
	 */
	apply (registerAction) {
		registerAction('beforeStart', async () => {
			this.browser = await puppeteer.launch(this.launchOptions);
		});

		registerAction('beforeRequest', async ({requestOptions}) => {
			// Keep a copy of the latest non-empty request headers so the same
			// headers can be replayed when the page is opened in the browser.
			if (hasValues(requestOptions.headers)) {
				this.headers = Object.assign({}, requestOptions.headers);
			}
			return {requestOptions};
		});

		registerAction('afterResponse', async ({response}) => {
			if (!isHtmlResponse(response)) {
				return response.body;
			}

			const url = response.url;
			const page = await this.browser.newPage();

			try {
				if (hasValues(this.headers)) {
					logger.info('set headers to puppeteer page', this.headers);
					await page.setExtraHTTPHeaders(this.headers);
				}

				if (this.blockNavigation) {
					await blockNavigation(page, url);
				}

				await page.goto(url, this.gotoOptions);

				if (this.scrollToBottom) {
					await scrollToBottom(page, this.scrollToBottom.timeout, this.scrollToBottom.viewportN);
				}

				const content = await page.content();

				// convert utf-8 -> binary string because website-scraper needs binary
				return Buffer.from(content).toString('binary');
			} finally {
				// Close the page on every path: without this a failing goto/scroll/content
				// call would leave the page open until the whole browser is shut down.
				await page.close();
			}
		});

		registerAction('afterFinish', () => this.browser && this.browser.close());
	}
}

/**
 * Tells whether `obj` is a non-null value with at least one own enumerable key.
 *
 * @param {*} obj
 * @returns {boolean} always a real boolean, never the falsy operand itself
 */
function hasValues (obj) {
	return Boolean(obj) && Object.keys(obj).length > 0;
}

/**
 * Tells whether the response declares a `text/html` media type.
 * Parameters after `;` are ignored; the comparison tolerates surrounding
 * whitespace and letter case because media types are case-insensitive.
 *
 * @param {{headers: object}} response
 * @returns {boolean}
 */
function isHtmlResponse (response) {
	const contentType = response.headers['content-type'];
	if (!contentType) {
		return false;
	}
	return contentType.split(';')[0].trim().toLowerCase() === 'text/html';
}

/**
 * Runs the in-page scroll helper inside the given puppeteer page.
 *
 * @param {object} page - puppeteer page
 * @param {number} timeout - forwarded to the in-page helper
 * @param {number} viewportN - forwarded to the in-page helper
 * @returns {Promise<void>}
 */
async function scrollToBottom (page, timeout, viewportN) {
	logger.info(`scroll puppeteer page to bottom ${viewportN} times with timeout = ${timeout}`);

	await page.evaluate(scrollToBottomBrowser, timeout, viewportN);
}

/**
 * Enables request interception on the page and aborts every main-frame
 * navigation request whose url differs from `url`; all other requests continue.
 *
 * @param {object} page - puppeteer page
 * @param {string} url - the only url the main frame may navigate to
 * @returns {Promise<void>}
 */
async function blockNavigation (page, url) {
	logger.info(`block navigation for puppeteer page from url ${url}`);

	page.on('request', req => {
		if (req.isNavigationRequest() && req.frame() === page.mainFrame() && req.url() !== url) {
			req.abort('aborted');
		} else {
			req.continue();
		}
	});
	await page.setRequestInterception(true);
}

export default PuppeteerPlugin;
}, 43 | "author": "Sofiia Antypenko ", 44 | "license": "MIT", 45 | "bugs": { 46 | "url": "https://github.com/website-scraper/website-scraper-puppeteer/issues" 47 | }, 48 | "homepage": "https://github.com/website-scraper/website-scraper-puppeteer#readme", 49 | "files": [ 50 | "lib" 51 | ], 52 | "engines": { 53 | "node": ">=18" 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /test/mock/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Test 6 | 7 | 8 | 9 |
10 |
11 | 12 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /test/mock/navigation.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | Test 6 | 7 | 8 | 9 |
10 | 11 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /test/puppeteer-plugin.test.js: -------------------------------------------------------------------------------- 1 | import { expect } from 'chai'; 2 | import http from 'http'; 3 | import finalhandler from 'finalhandler'; 4 | import serveStatic from 'serve-static'; 5 | import fs from 'fs-extra'; 6 | import scrape from 'website-scraper'; 7 | import PuppeteerPlugin from '../lib/index.js'; 8 | 9 | const directory = './test/tmp'; 10 | const SERVE_WEBSITE_PORT = 4567; 11 | 12 | describe('Puppeteer plugin test', () => { 13 | let result, content, server; 14 | 15 | before('start webserver', () => server = startWebserver(SERVE_WEBSITE_PORT)); 16 | after('stop webserver', () => server.close()) 17 | 18 | describe('Dynamic content', () => { 19 | before('scrape website', async () => { 20 | result = await scrape({ 21 | urls: [`http://localhost:${SERVE_WEBSITE_PORT}`], 22 | directory: directory, 23 | plugins: [ new PuppeteerPlugin({ 24 | scrollToBottom: { timeout: 50, viewportN: 10 } 25 | }) ] 26 | }); 27 | }); 28 | before('get content from file', () => { 29 | content = fs.readFileSync(`${directory}/${result[0].filename}`).toString(); 30 | }); 31 | after('delete dir', () => fs.removeSync(directory)); 32 | 33 | it('should have 1 item in result array', () => { 34 | expect(result.length).eql(1); 35 | }); 36 | 37 | it('should render dymanic website', async () => { 38 | expect(content).to.contain('
Hello world from JS!
'); 39 | }); 40 | 41 | it('should render special characters correctly', async () => { 42 | expect(content).to.contain('
7년 동안 한국에서 살았어요. Слава Україні!
'); 43 | }); 44 | }); 45 | 46 | describe('Block navigation', () => { 47 | before('scrape website', async () => { 48 | result = await scrape({ 49 | urls: [`http://localhost:${SERVE_WEBSITE_PORT}/navigation.html`], 50 | directory: directory, 51 | plugins: [ 52 | new PuppeteerPlugin({ 53 | launchOptions: { headless: "new" }, 54 | blockNavigation: true 55 | }) 56 | ] 57 | }); 58 | }); 59 | before('get content from file', () => { 60 | content = fs.readFileSync(`${directory}/${result[0].filename}`).toString(); 61 | }); 62 | after('delete dir', () => fs.removeSync(directory)); 63 | 64 | it('should render content (and not be redirected)', async () => { 65 | expect(content).to.contain('
Navigation blocked!
/**
 * Starts an HTTP server that serves the static fixtures in ./test/mock,
 * using index.html as the directory index.
 *
 * @param {number} [port=3000] - port handed to `server.listen`
 * @returns {http.Server} the server, already asked to listen on `port`
 */
function startWebserver(port = 3000) {
	const staticHandler = serveStatic('./test/mock', {index: ['index.html']});

	// finalhandler(req, res) is passed as the callback for requests the static handler does not answer.
	const onRequest = (req, res) => {
		staticHandler(req, res, finalhandler(req, res));
	};

	return http.createServer(onRequest).listen(port);
}