├── .all-contributorsrc ├── .github ├── FUNDING.yml └── workflows │ └── main.yml ├── .gitignore ├── .vscode ├── extensions.json └── settings.json ├── LICENSE ├── README.md ├── examples ├── hn-custom-browser.js ├── hn-multiple.js └── hn.js ├── header.png ├── package.json ├── src ├── defaults.ts ├── index.ts ├── puppet-scraper.ts └── types.ts ├── test ├── puppet-scraper.test.ts ├── utils.ts └── variables.ts ├── tsconfig.json ├── tsdx.config.js ├── types └── index.d.ts └── yarn.lock /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "projectName": "puppet-scraper", 3 | "projectOwner": "grikomsn", 4 | "repoType": "github", 5 | "repoHost": "https://github.com", 6 | "files": ["README.md"], 7 | "imageSize": 100, 8 | "commit": true, 9 | "commitConvention": "none", 10 | "contributors": [ 11 | { 12 | "login": "grikomsn", 13 | "name": "Griko Nibras", 14 | "avatar_url": "https://avatars1.githubusercontent.com/u/8220954?v=4", 15 | "profile": "https://griko.id", 16 | "contributions": ["code", "maintenance"] 17 | } 18 | ], 19 | "contributorsPerLine": 7 20 | } 21 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | issuehunt: grikomsn 2 | ko_fi: grikomsn 3 | liberapay: grikomsn 4 | custom: ['https://karyakarsa.com/grikomsn', 'https://saweria.co/grikomsn', 'https://trakteer.id/grikomsn'] 5 | 6 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 7 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 8 | open_collective: # Replace with a single Open Collective username 9 | otechie: # Replace with a single Otechie username 10 | patreon: # Replace with a single Patreon username 11 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 12 | 
-------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | 7 | steps: 8 | - name: Checkout 9 | uses: actions/checkout@v2 10 | 11 | - name: Use Node 12 12 | uses: actions/setup-node@v1 13 | with: 14 | node-version: 12.x 15 | 16 | - name: Use cached node_modules 17 | uses: actions/cache@v1 18 | with: 19 | path: node_modules 20 | key: nodeModules-${{ hashFiles('**/yarn.lock') }} 21 | restore-keys: | 22 | nodeModules- 23 | 24 | - name: Install dependencies 25 | uses: ianwalter/puppeteer@cbdd5c50c8d6b6275cdf46e4ad2b3f7ee61211ce 26 | with: 27 | args: yarn install --frozen-lockfile 28 | env: 29 | CI: true 30 | 31 | - name: Lint 32 | run: yarn lint 33 | env: 34 | CI: true 35 | 36 | - name: Test 37 | uses: ianwalter/puppeteer@cbdd5c50c8d6b6275cdf46e4ad2b3f7ee61211ce 38 | with: 39 | args: yarn test --ci --coverage --maxWorkers=2 40 | env: 41 | CI: true 42 | 43 | - name: Build 44 | run: yarn build 45 | env: 46 | CI: true 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | .DS_Store 3 | dist 4 | node_modules 5 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": ["rbbit.typescript-hero"] 3 | } 4 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "typescriptHero.imports.insertSemicolons": true, 3 | "typescriptHero.imports.organizeOnSave": true, 4 | "typescriptHero.imports.organizeSortsByFirstSpecifier": true 5 | } 6 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Griko Nibras 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | [![puppet-scraper](./header.png)](.) 6 | 7 | ![github release](https://badgen.net/github/release/grikomsn/puppet-scraper?icon=github) 8 | ![npm version](https://badgen.net/npm/v/puppet-scraper?icon=npm) 9 | 10 |
11 | 12 | --- 13 | 14 | - [Brief example](#brief-example) 15 | - [Usage](#usage) 16 | - [Installing dependency](#installing-dependency) 17 | - [Instantiation](#instantiation) 18 | - [Customize options](#customize-options) 19 | - [Scraping single page](#scraping-single-page) 20 | - [Scraping multiple pages](#scraping-multiple-pages) 21 | - [Closing instance](#closing-instance) 22 | - [Access the browser instance](#access-the-browser-instance) 23 | - [Contributing](#contributing) 24 | - [License](#license) 25 | 26 | --- 27 | 28 | **PuppetScraper is an opinionated wrapper library for utilizing [Puppeteer](https://github.com/puppeteer/puppeteer) to scrape pages easily, bootstrapped using [Jared Palmer's tsdx](https://github.com/jaredpalmer/tsdx).** 29 | 30 | Most people create a new scraping project by `require`-ing Puppeteer and creating their own logic to scrape pages, and that logic will get more complicated when trying to use multiple pages. 31 | 32 | PuppetScraper allows you to just pass the URLs to scrape, the function to evaluate (the scraping logic), and how many pages (or tabs) to open at a time. Basically, PuppetScraper abstracts the need to create multiple page instances and retrying the evaluation logic. 33 | 34 | **Version 0.1.0 note**: PuppetScraper was initially made as a project template rather than a wrapper library, but the core logic is still the same with various improvements and without extra libraries, so you can include PuppetScraper in your project easily using `npm` or `yarn`. 
35 | 36 | ## Brief example 37 | 38 | Here's a [basic example](./examples/hn.js) on scraping the entries on [first page Hacker News](https://news.ycombinator.com): 39 | 40 | ```js 41 | // examples/hn.js 42 | 43 | const { PuppetScraper } = require('puppet-scraper'); 44 | 45 | const ps = await PuppetScraper.launch(); 46 | 47 | const data = await ps.scrapeFromUrl({ 48 | url: 'https://news.ycombinator.com', 49 | evaluateFn: () => { 50 | let items = []; 51 | 52 | document.querySelectorAll('.storylink').forEach((node) => { 53 | items.push({ 54 | title: node.innerText, 55 | url: node.href, 56 | }); 57 | }); 58 | 59 | return items; 60 | }, 61 | }); 62 | 63 | console.log({ data }); 64 | 65 | await ps.close(); 66 | ``` 67 | 68 | View more examples on the [`examples` directory](./examples). 69 | 70 | ## Usage 71 | 72 | ### Installing dependency 73 | 74 | Install `puppet-scraper` via `npm` or `yarn`: 75 | 76 | ```console 77 | $ npm install puppet-scraper 78 | --- or --- 79 | $ yarn add puppet-scraper 80 | ``` 81 | 82 | Install peer dependency `puppeteer` or Puppeteer equivalent ([`chrome-aws-lambda`](https://github.com/alixaxel/chrome-aws-lambda), untested): 83 | 84 | ```console 85 | $ npm install puppeteer 86 | --- or --- 87 | $ yarn add puppeteer 88 | ``` 89 | 90 | ### Instantiation 91 | 92 | Create the PuppetScraper instance, either launching a new browser instance, connect or use an existing browser instance: 93 | 94 | ```js 95 | const { PuppetScraper } = require('puppet-scraper'); 96 | const Puppeteer = require('puppeteer'); 97 | 98 | // launches a new browser instance 99 | const instance = await PuppetScraper.launch(); 100 | 101 | // connect to an existing browser instance 102 | const external = await PuppetScraper.connect({ 103 | browserWSEndpoint: 'ws://127.0.0.1:9222/devtools/browser/...', 104 | }); 105 | 106 | // use an existing browser instance 107 | const browser = await Puppeteer.launch(); 108 | const existing = await PuppetScraper.use({ browser }); 109 | ``` 110 | 
111 | ### Customize options 112 | 113 | `launch` and `connect` have the same props as `Puppeteer.launch` and `Puppeteer.connect`, but with an extra `concurrentPages` and `maxEvaluationRetries` property: 114 | 115 | ```js 116 | const { PuppetScraper } = require('puppet-scraper'); 117 | 118 | const instance = await PuppetScraper.launch({ 119 | concurrentPages: 3, 120 | maxEvaluationRetries: 10, 121 | headless: false, 122 | }); 123 | ``` 124 | 125 | `concurrentPages` is for how many pages/tabs will be opened and used for scraping. 126 | 127 | `maxEvaluationRetries` is for how many times the page will try to evaluate the given function on `evaluateFn` (see below), where if the evaluation throws an error, the page will reload and try to re-evaluate again. 128 | 129 | If `concurrentPages` and `maxEvaluationRetries` are not specified, it will use the [default values](./src/defaults.ts): 130 | 131 | ```ts 132 | export const DEFAULT_CONCURRENT_PAGES = 3; 133 | export const DEFAULT_EVALUATION_RETRIES = 10; 134 | ``` 135 | 136 | ### Scraping single page 137 | 138 | As shown in the example above, use `.scrapeFromUrl` and pass an object with the following properties: 139 | 140 | - `url: string`, page URL to be opened 141 | - `evaluateFn: function`, function to evaluate (scraper method) 142 | - `pageOptions: object`, [`Puppeteer.DirectNavigationOptions`](https://github.com/DefinitelyTyped/DefinitelyTyped/blob/master/types/puppeteer/index.d.ts#L551) props to override page behaviors 143 | 144 | ```js 145 | const data = await instance.scrapeFromUrl({ 146 | url: 'https://news.ycombinator.com', 147 | evaluateFn: () => { 148 | let items = []; 149 | 150 | document.querySelectorAll('.storylink').forEach((node) => { 151 | items.push({ 152 | title: node.innerText, 153 | url: node.href, 154 | }); 155 | }); 156 | 157 | return items; 158 | }, 159 | }); 160 | ``` 161 | 162 | `pageOptions` defaults the `waitUntil` property to `networkidle0`, which you can read more on the [API 
documentation](https://pptr.dev/#?product=Puppeteer&version=v3.0.2&show=api-pagegotourl-options). 163 | 164 | ### Scraping multiple pages 165 | 166 | Same as `.scrapeFromUrl` but passes a `urls` property which contains an array of URL `string`s: 167 | 168 | - `urls: string[]`, page URLs to be opened 169 | - `evaluateFn: function`, function to evaluate (scraper method) 170 | - `pageOptions: object`, [`Puppeteer.DirectNavigationOptions`](https://github.com/DefinitelyTyped/DefinitelyTyped/blob/master/types/puppeteer/index.d.ts#L551) props to override page behaviors 171 | 172 | ```js 173 | const urls = Array.from({ length: 5 }).map( 174 | (_, i) => `https://news.ycombinator.com/news?p=${i + 1}`, 175 | ); 176 | 177 | const data = await ps.scrapeFromUrls({ 178 | urls, 179 | evaluateFn: () => { 180 | let items = []; 181 | 182 | document.querySelectorAll('.storylink').forEach((node) => { 183 | items.push({ 184 | title: node.innerText, 185 | url: node.href, 186 | }); 187 | }); 188 | 189 | return items; 190 | }, 191 | }); 192 | ``` 193 | 194 | ### Closing instance 195 | 196 | When there's nothing left to do, don't forget to close the instance, which closes the browser: 197 | 198 | ```js 199 | await instance.close(); 200 | ``` 201 | 202 | ### Access the browser instance 203 | 204 | PuppetScraper also exposes the browser instance if you want to do things manually: 205 | 206 | ```js 207 | const browser = instance.___internal.browser; 208 | ``` 209 | 210 | ## Contributing 211 | 212 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 |

Griko Nibras

💻 🚧
222 | 223 | 224 | 225 | 226 | 227 | This project follows the [all-contributors][all-contributors] specification. 228 | Contributions of any kind welcome! 229 | 230 | ## License 231 | 232 | [MIT License, Copyright (c) 2020 Griko Nibras](./LICENSE) 233 | 234 | [all-contributors]: https://github.com/all-contributors/all-contributors 235 | -------------------------------------------------------------------------------- /examples/hn-custom-browser.js: -------------------------------------------------------------------------------- 1 | const { PuppetScraper } = require('..'); 2 | 3 | async function hnCustomBrowser() { 4 | const ps = await PuppetScraper.launch({ 5 | executablePath: 6 | 'C:\\Program Files (x86)\\Microsoft\\Edge Dev\\Application\\msedge.exe', 7 | headless: false, 8 | }); 9 | 10 | const data = await ps.scrapeFromUrl({ 11 | url: 'https://news.ycombinator.com', 12 | evaluateFn: () => { 13 | let items = []; 14 | 15 | document.querySelectorAll('.storylink').forEach((node) => { 16 | items.push({ 17 | title: node.innerText, 18 | url: node.href, 19 | }); 20 | }); 21 | 22 | return items; 23 | }, 24 | }); 25 | 26 | console.log({ data }); 27 | 28 | await ps.close(); 29 | } 30 | 31 | hnCustomBrowser(); 32 | -------------------------------------------------------------------------------- /examples/hn-multiple.js: -------------------------------------------------------------------------------- 1 | const { PuppetScraper } = require('..'); 2 | 3 | async function hnMultiple() { 4 | const ps = await PuppetScraper.launch({ 5 | concurrentPages: 5, 6 | }); 7 | 8 | const urls = Array.from({ length: 5 }).map( 9 | (_, i) => `https://news.ycombinator.com/news?p=${i + 1}`, 10 | ); 11 | 12 | const data = await ps.scrapeFromUrls({ 13 | urls, 14 | evaluateFn: () => { 15 | let items = []; 16 | 17 | document.querySelectorAll('.storylink').forEach((node) => { 18 | items.push({ 19 | title: node.innerText, 20 | url: node.href, 21 | }); 22 | }); 23 | 24 | return items; 25 | }, 26 | }); 27 | 
28 | console.log({ data }); 29 | 30 | await ps.close(); 31 | } 32 | 33 | hnMultiple(); 34 | -------------------------------------------------------------------------------- /examples/hn.js: -------------------------------------------------------------------------------- 1 | const { PuppetScraper } = require('..'); 2 | 3 | async function hn() { 4 | const ps = await PuppetScraper.launch(); 5 | 6 | const data = await ps.scrapeFromUrl({ 7 | url: 'https://news.ycombinator.com', 8 | evaluateFn: () => { 9 | let items = []; 10 | 11 | document.querySelectorAll('.storylink').forEach((node) => { 12 | items.push({ 13 | title: node.innerText, 14 | url: node.href, 15 | }); 16 | }); 17 | 18 | return items; 19 | }, 20 | }); 21 | 22 | console.log({ data }); 23 | 24 | await ps.close(); 25 | } 26 | 27 | hn(); 28 | -------------------------------------------------------------------------------- /header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grikomsn/puppet-scraper/856b3e25ee263f9d80d0cb3b9bbda4763eb3d4bc/header.png -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "puppet-scraper", 3 | "description": "Scraping using Puppeteer the sane way 🤹🏻‍♂️", 4 | "version": "0.2.0", 5 | "repository": "https://github.com/grikomsn/puppet-scraper.git", 6 | "author": "Griko Nibras ", 7 | "files": [ 8 | "dist", 9 | "src" 10 | ], 11 | "main": "dist/index.js", 12 | "module": "dist/puppet-scraper.esm.js", 13 | "typings": "dist/index.d.ts", 14 | "scripts": { 15 | "prepare": "tsdx build", 16 | "build": "tsdx build", 17 | "lint": "tsdx lint --fix src test types", 18 | "test": "tsdx test", 19 | "watch": "tsdx watch", 20 | "format": "yarn format:examples && yarn format:index", 21 | "format:examples": "prettier --write \"examples/**/*.js\"", 22 | "format:index": "prettier --write 
\"*.{js,json,md}\"", 23 | "contributors:add": "all-contributors add", 24 | "contributors:generate": "all-contributors generate" 25 | }, 26 | "dependencies": { 27 | "promise-retry": "^1.1.1", 28 | "puppeteer-core": "^3.0.3" 29 | }, 30 | "peerDependencies": { 31 | "puppeteer": "^3.0.3" 32 | }, 33 | "devDependencies": { 34 | "@types/promise-retry": "^1.1.3", 35 | "@types/puppeteer": "^2.0.1", 36 | "@types/puppeteer-core": "^2.0.0", 37 | "all-contributors-cli": "^6.14.2", 38 | "husky": "^4.2.5", 39 | "puppeteer": "^3.0.3", 40 | "tsdx": "^0.13.2", 41 | "tslib": "^1.11.2", 42 | "typescript": "^3.8.3" 43 | }, 44 | "engines": { 45 | "node": ">=10" 46 | }, 47 | "husky": { 48 | "hooks": { 49 | "pre-commit": "yarn format && yarn lint && yarn contributors:generate" 50 | } 51 | }, 52 | "prettier": { 53 | "arrowParens": "always", 54 | "printWidth": 80, 55 | "semi": true, 56 | "singleQuote": true, 57 | "trailingComma": "all" 58 | }, 59 | "license": "MIT" 60 | } 61 | -------------------------------------------------------------------------------- /src/defaults.ts: -------------------------------------------------------------------------------- 1 | import { DirectNavigationOptions } from 'puppeteer'; 2 | 3 | export const DEFAULT_CONCURRENT_PAGES = 3; 4 | 5 | export const DEFAULT_EVALUATION_RETRIES = 10; 6 | 7 | export const DEFAULT_PAGE_OPTIONS: DirectNavigationOptions = { 8 | waitUntil: 'networkidle0', 9 | }; 10 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | export * from './defaults'; 2 | export * from './types'; 3 | 4 | export { PuppetScraper } from './puppet-scraper'; 5 | -------------------------------------------------------------------------------- /src/puppet-scraper.ts: -------------------------------------------------------------------------------- 1 | import promiseRetry from 'promise-retry'; 2 | import Puppeteer, { Page } from 
'puppeteer'; 3 | 4 | import { 5 | DEFAULT_CONCURRENT_PAGES, 6 | DEFAULT_EVALUATION_RETRIES, 7 | DEFAULT_PAGE_OPTIONS, 8 | } from './defaults'; 9 | import { 10 | PSBootstrap, 11 | PSConnect, 12 | PSLaunch, 13 | PSUse, 14 | ScrapeFromUrl, 15 | ScrapeFromUrls, 16 | } from './types'; 17 | 18 | const bootstrap: PSBootstrap = async ({ 19 | browser, 20 | concurrentPages = DEFAULT_CONCURRENT_PAGES, 21 | maxEvaluationRetries = DEFAULT_EVALUATION_RETRIES, 22 | } = {}) => { 23 | let pages: Page[] = Array.from({ length: concurrentPages }); 24 | 25 | const scrapeFromUrl: ScrapeFromUrl = async (props) => { 26 | const { url, evaluateFn, pageOptions } = props; 27 | 28 | const mergedPageOptions = { 29 | ...DEFAULT_PAGE_OPTIONS, 30 | ...pageOptions, 31 | }; 32 | 33 | let page = pages[0]; 34 | if (!page) { 35 | page = await browser.newPage(); 36 | } 37 | 38 | return page.goto(url, mergedPageOptions).then(() => 39 | promiseRetry( 40 | async (retry) => { 41 | try { 42 | return page.evaluate(evaluateFn); 43 | } catch (error) { 44 | await page.reload(); 45 | return retry(error); 46 | } 47 | }, 48 | { maxRetryTime: maxEvaluationRetries }, 49 | ), 50 | ); 51 | }; 52 | 53 | const scrapeFromUrls: ScrapeFromUrls = async (props) => { 54 | const { urls, evaluateFn, pageOptions } = props; 55 | 56 | const mergedPageOptions = { 57 | ...DEFAULT_PAGE_OPTIONS, 58 | ...pageOptions, 59 | }; 60 | 61 | pages = await Promise.all( 62 | pages.map(async (page) => { 63 | if (!page) page = await browser.newPage(); 64 | return page; 65 | }), 66 | ); 67 | 68 | let current = 0; 69 | let total = urls.length; 70 | 71 | const results = []; 72 | while (current < total) { 73 | const finishedPages = await Promise.all( 74 | // eslint-disable-next-line no-loop-func 75 | pages.reduce[]>((finishedPages, page) => { 76 | if (current <= total) { 77 | const finishedPage = page 78 | .goto(urls[current++], mergedPageOptions) 79 | .then(() => page); 80 | 81 | return finishedPages.concat(finishedPage); 82 | } 83 | return 
finishedPages; 84 | }, []), 85 | ); 86 | 87 | const evaluatingPages = finishedPages.map((page) => 88 | promiseRetry( 89 | async (retry) => { 90 | try { 91 | return page.evaluate(evaluateFn); 92 | } catch (error) { 93 | await page.reload(); 94 | return retry(error); 95 | } 96 | }, 97 | { maxRetryTime: maxEvaluationRetries }, 98 | ), 99 | ); 100 | 101 | const data = await Promise.all(evaluatingPages); 102 | results.push(...data); 103 | } 104 | 105 | return results; 106 | }; 107 | 108 | const close = () => browser.close(); 109 | 110 | return { 111 | scrapeFromUrl, 112 | scrapeFromUrls, 113 | close, 114 | ___internal: { 115 | browser, 116 | }, 117 | }; 118 | }; 119 | 120 | const connect: PSConnect = async ({ 121 | concurrentPages, 122 | maxEvaluationRetries, 123 | ...opts 124 | } = {}) => { 125 | const browser = await Puppeteer.connect(opts); 126 | return bootstrap({ browser, concurrentPages, maxEvaluationRetries }); 127 | }; 128 | 129 | const launch: PSLaunch = async ({ 130 | concurrentPages, 131 | maxEvaluationRetries, 132 | ...opts 133 | } = {}) => { 134 | const browser = await Puppeteer.launch(opts); 135 | return bootstrap({ browser, concurrentPages, maxEvaluationRetries }); 136 | }; 137 | 138 | const use: PSUse = (opts) => { 139 | if (!opts.browser) { 140 | throw new Error('browser is not defined'); 141 | } 142 | return bootstrap(opts); 143 | }; 144 | 145 | export const PuppetScraper = { connect, launch, use }; 146 | -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Browser, 3 | ConnectOptions, 4 | DirectNavigationOptions, 5 | LaunchOptions, 6 | } from 'puppeteer'; 7 | 8 | // #region ScrapeFromUrl 9 | 10 | export interface ScrapeFromUrlProps { 11 | url: string; 12 | evaluateFn: () => T; 13 | pageOptions?: DirectNavigationOptions; 14 | } 15 | 16 | export type ScrapeFromUrl = ( 17 | props: ScrapeFromUrlProps, 18 | ) => 
Promise ? U : T>; 19 | 20 | // #endregion 21 | 22 | // #region ScrapeFromUrls 23 | 24 | export interface ScrapeFromUrlsProps { 25 | urls: string[]; 26 | evaluateFn: () => T; 27 | pageOptions?: DirectNavigationOptions; 28 | } 29 | 30 | export type ScrapeFromUrls = ( 31 | props: ScrapeFromUrlsProps, 32 | ) => Promise<(T extends PromiseLike ? U : T)[]>; 33 | 34 | // #endregion 35 | 36 | // #region PuppetScraper 37 | 38 | export interface PSInstance { 39 | scrapeFromUrl: ScrapeFromUrl; 40 | scrapeFromUrls: ScrapeFromUrls; 41 | close: () => Promise; 42 | 43 | ___internal: { 44 | browser: Browser; 45 | }; 46 | } 47 | 48 | export type PSBootstrapProps = { 49 | browser?: Browser; 50 | concurrentPages?: number; 51 | maxEvaluationRetries?: number; 52 | }; 53 | 54 | export type PSBootstrap = (props?: PSBootstrapProps) => Promise; 55 | 56 | export type PSLaunchOptions = PSBootstrapProps & LaunchOptions; 57 | 58 | export type PSLaunch = (opts?: PSLaunchOptions) => ReturnType; 59 | 60 | export type PSConnectOptions = PSBootstrapProps & ConnectOptions; 61 | 62 | export type PSConnect = (opts: PSConnectOptions) => ReturnType; 63 | 64 | export type PSUseOptions = PSBootstrapProps & { browser: Browser }; 65 | 66 | export type PSUse = (opts: PSUseOptions) => ReturnType; 67 | 68 | // #endregion 69 | -------------------------------------------------------------------------------- /test/puppet-scraper.test.ts: -------------------------------------------------------------------------------- 1 | import { PSInstance, PuppetScraper } from '..'; 2 | import { sleep } from './utils'; 3 | import { launchOptions, timeout } from './variables'; 4 | 5 | type DataType = { title: string; url: string }; 6 | 7 | describe('launching instance', () => { 8 | it( 9 | 'should launch and close without errors', 10 | () => { 11 | let instance: PSInstance; 12 | 13 | const instantiateAndClose = async () => { 14 | instance = await PuppetScraper.launch(launchOptions); 15 | await instance.close(); 16 | }; 17 | 18 | 
expect(instantiateAndClose).not.toThrow(); 19 | }, 20 | timeout, 21 | ); 22 | }); 23 | 24 | describe('scrape hacker news from single url', () => { 25 | let instance: PSInstance; 26 | 27 | beforeAll(() => { 28 | const creating = PuppetScraper.launch(launchOptions); 29 | return creating.then((created) => (instance = created)); 30 | }, timeout); 31 | 32 | it( 33 | 'should scrape without errors', 34 | () => { 35 | const data = instance 36 | .scrapeFromUrl({ 37 | url: 'https://news.ycombinator.com', 38 | evaluateFn: () => { 39 | let data: DataType[] = []; 40 | 41 | document.querySelectorAll('.storylink').forEach((node) => { 42 | data.push({ 43 | title: (node as HTMLAnchorElement).innerText, 44 | url: (node as HTMLAnchorElement).href, 45 | }); 46 | }); 47 | 48 | return data; 49 | }, 50 | }) 51 | .catch(); 52 | 53 | expect(data).resolves.not.toBeNull(); 54 | expect(data).resolves.toHaveLength(30); 55 | expect(data).resolves.toHaveProperty([0, 'title']); 56 | expect(data).resolves.toHaveProperty([0, 'url']); 57 | 58 | return data; 59 | }, 60 | timeout, 61 | ); 62 | 63 | it( 64 | 'should instantiate one page', 65 | async () => { 66 | let totalPages = 0; 67 | const expectedPages = 2; 68 | 69 | // 2 pages due to 1 is default tab opening 70 | while (totalPages < expectedPages) { 71 | const pages = await instance.___internal.browser.pages(); 72 | totalPages = pages.length; 73 | await sleep(1000); 74 | } 75 | 76 | expect(totalPages).toEqual(expectedPages); 77 | }, 78 | timeout, 79 | ); 80 | 81 | afterAll(() => { 82 | return instance.close(); 83 | }, timeout); 84 | }); 85 | 86 | describe('scrape hacker news from multiple urls', () => { 87 | let instance: PSInstance; 88 | 89 | beforeAll(async () => { 90 | const creating = PuppetScraper.launch(launchOptions); 91 | return creating.then((created) => (instance = created)); 92 | }, timeout); 93 | 94 | it( 95 | 'should scrape without errors', 96 | () => { 97 | const pages = 5; 98 | const urls = Array.from({ length: pages }).map( 99 | 
(_, i) => `https://news.ycombinator.com/news?p=${i + 1}`, 100 | ); 101 | 102 | const data = instance 103 | .scrapeFromUrls({ 104 | urls, 105 | evaluateFn: () => { 106 | let items: DataType[] = []; 107 | 108 | document.querySelectorAll('.storylink').forEach((node) => { 109 | items.push({ 110 | title: (node as HTMLAnchorElement).innerText, 111 | url: (node as HTMLAnchorElement).href, 112 | }); 113 | }); 114 | 115 | return items; 116 | }, 117 | }) 118 | .catch(); 119 | 120 | expect(data).resolves.not.toBeNull(); 121 | expect(data).resolves.toHaveLength(pages); 122 | expect(data).resolves.toHaveProperty([0, 0, 'title']); 123 | expect(data).resolves.toHaveProperty([0, 0, 'url']); 124 | 125 | return data; 126 | }, 127 | timeout, 128 | ); 129 | 130 | it( 131 | 'should instantiate all pages', 132 | async () => { 133 | let totalPages = 0; 134 | const expectedPages = launchOptions.concurrentPages + 1; 135 | 136 | // concurrent pages + 1 due to default tab opening 137 | while (totalPages < expectedPages) { 138 | const pages = await instance.___internal.browser.pages(); 139 | totalPages = pages.length; 140 | await sleep(1000); 141 | } 142 | 143 | expect(totalPages).toEqual(expectedPages); 144 | }, 145 | timeout, 146 | ); 147 | 148 | afterAll(() => { 149 | return instance.close(); 150 | }, timeout); 151 | }); 152 | -------------------------------------------------------------------------------- /test/utils.ts: -------------------------------------------------------------------------------- 1 | export const sleep = (ms: number) => { 2 | return new Promise((resolve) => setTimeout(resolve, ms)); 3 | }; 4 | -------------------------------------------------------------------------------- /test/variables.ts: -------------------------------------------------------------------------------- 1 | import { PSLaunchOptions } from '..'; 2 | 3 | export const launchOptions: PSLaunchOptions = { 4 | ...(process.env.CI 5 | ? 
{ executablePath: 'google-chrome-stable', args: ['--no-sandbox'] } 6 | : {}), 7 | concurrentPages: 5, 8 | }; 9 | 10 | export const timeout = 10000; 11 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": "./", 4 | "declaration": true, 5 | "esModuleInterop": true, 6 | "importHelpers": true, 7 | "jsx": "react", 8 | "lib": ["dom", "esnext"], 9 | "module": "esnext", 10 | "moduleResolution": "node", 11 | "noFallthroughCasesInSwitch": true, 12 | "noImplicitReturns": true, 13 | "noUnusedLocals": true, 14 | "noUnusedParameters": true, 15 | "paths": { 16 | "*": ["node_modules/*"] 17 | }, 18 | "rootDir": "./src", 19 | "sourceMap": true, 20 | "strict": true, 21 | "strictNullChecks": false 22 | }, 23 | "include": ["src", "types"] 24 | } 25 | -------------------------------------------------------------------------------- /tsdx.config.js: -------------------------------------------------------------------------------- 1 | // https://github.com/jaredpalmer/tsdx#customization 2 | 3 | module.exports = { 4 | rollup(config, _options) { 5 | return config; 6 | }, 7 | target: 'node', 8 | }; 9 | -------------------------------------------------------------------------------- /types/index.d.ts: -------------------------------------------------------------------------------- 1 | declare var __DEV__: boolean; 2 | --------------------------------------------------------------------------------