├── .all-contributorsrc ├── .github ├── FUNDING.yml └── workflows │ └── main.yml ├── .gitignore ├── .vscode ├── extensions.json └── settings.json ├── LICENSE ├── README.md ├── examples ├── hn-custom-browser.js ├── hn-multiple.js └── hn.js ├── header.png ├── package.json ├── src ├── defaults.ts ├── index.ts ├── puppet-scraper.ts └── types.ts ├── test ├── puppet-scraper.test.ts ├── utils.ts └── variables.ts ├── tsconfig.json ├── tsdx.config.js ├── types └── index.d.ts └── yarn.lock /.all-contributorsrc: -------------------------------------------------------------------------------- 1 | { 2 | "projectName": "puppet-scraper", 3 | "projectOwner": "grikomsn", 4 | "repoType": "github", 5 | "repoHost": "https://github.com", 6 | "files": ["README.md"], 7 | "imageSize": 100, 8 | "commit": true, 9 | "commitConvention": "none", 10 | "contributors": [ 11 | { 12 | "login": "grikomsn", 13 | "name": "Griko Nibras", 14 | "avatar_url": "https://avatars1.githubusercontent.com/u/8220954?v=4", 15 | "profile": "https://griko.id", 16 | "contributions": ["code", "maintenance"] 17 | } 18 | ], 19 | "contributorsPerLine": 7 20 | } 21 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | issuehunt: grikomsn 2 | ko_fi: grikomsn 3 | liberapay: grikomsn 4 | custom: ['https://karyakarsa.com/grikomsn', 'https://saweria.co/grikomsn', 'https://trakteer.id/grikomsn'] 5 | 6 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 7 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] 8 | open_collective: # Replace with a single Open Collective username 9 | otechie: # Replace with a single Otechie username 10 | patreon: # Replace with a single Patreon username 11 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 12 | 
-------------------------------------------------------------------------------- /.github/workflows/main.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | on: [push] 3 | jobs: 4 | build: 5 | runs-on: ubuntu-latest 6 | 7 | steps: 8 | - name: Checkout 9 | uses: actions/checkout@v2 10 | 11 | - name: Use Node 12 12 | uses: actions/setup-node@v1 13 | with: 14 | node-version: 12.x 15 | 16 | - name: Use cached node_modules 17 | uses: actions/cache@v1 18 | with: 19 | path: node_modules 20 | key: nodeModules-${{ hashFiles('**/yarn.lock') }} 21 | restore-keys: | 22 | nodeModules- 23 | 24 | - name: Install dependencies 25 | uses: ianwalter/puppeteer@cbdd5c50c8d6b6275cdf46e4ad2b3f7ee61211ce 26 | with: 27 | args: yarn install --frozen-lockfile 28 | env: 29 | CI: true 30 | 31 | - name: Lint 32 | run: yarn lint 33 | env: 34 | CI: true 35 | 36 | - name: Test 37 | uses: ianwalter/puppeteer@cbdd5c50c8d6b6275cdf46e4ad2b3f7ee61211ce 38 | with: 39 | args: yarn test --ci --coverage --maxWorkers=2 40 | env: 41 | CI: true 42 | 43 | - name: Build 44 | run: yarn build 45 | env: 46 | CI: true 47 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.log 2 | .DS_Store 3 | dist 4 | node_modules 5 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": ["rbbit.typescript-hero"] 3 | } 4 | -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "typescriptHero.imports.insertSemicolons": true, 3 | "typescriptHero.imports.organizeOnSave": true, 4 | "typescriptHero.imports.organizeSortsByFirstSpecifier": true 5 | } 6 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Griko Nibras 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 |
4 | 5 | [![puppet-scraper](./header.png)](.) 6 | 7 | ![github release](https://badgen.net/github/release/grikomsn/puppet-scraper?icon=github) 8 | ![npm version](https://badgen.net/npm/v/puppet-scraper?icon=npm) 9 | 10 |
11 | 12 | --- 13 | 14 | - [Brief example](#brief-example) 15 | - [Usage](#usage) 16 | - [Installing dependency](#installing-dependency) 17 | - [Instantiation](#instantiation) 18 | - [Customize options](#customize-options) 19 | - [Scraping single page](#scraping-single-page) 20 | - [Scraping multiple pages](#scraping-multiple-pages) 21 | - [Closing instance](#closing-instance) 22 | - [Access the browser instance](#access-the-browser-instance) 23 | - [Contributing](#contributing) 24 | - [License](#license) 25 | 26 | --- 27 | 28 | **PuppetScraper is an opinionated wrapper library for utilizing [Puppeteer](https://github.com/puppeteer/puppeteer) to scrape pages easily, bootstrapped using [Jared Palmer's tsdx](https://github.com/jaredpalmer/tsdx).** 29 | 30 | Most people create a new scraping project by `require`-ing Puppeteer and creating their own logic to scrape pages, and that logic will get more complicated when trying to use multiple pages. 31 | 32 | PuppetScraper allows you to just pass the URLs to scrape, the function to evaluate (the scraping logic), and how many pages (or tabs) to open at a time. Basically, PuppetScraper abstracts the need to create multiple page instances and retrying the evaluation logic. 33 | 34 | **Version 0.1.0 note**: PuppetScraper was initially made as a project template rather than a wrapper library, but the core logic is still the same with various improvements and without extra libraries, so you can include PuppetScraper in your project easily using `npm` or `yarn`. 
35 | 36 | ## Brief example 37 | 38 | Here's a [basic example](./examples/hn.js) on scraping the entries on [first page Hacker News](https://news.ycombinator.com): 39 | 40 | ```js 41 | // examples/hn.js 42 | 43 | const { PuppetScraper } = require('puppet-scraper'); 44 | 45 | const ps = await PuppetScraper.launch(); 46 | 47 | const data = await ps.scrapeFromUrl({ 48 | url: 'https://news.ycombinator.com', 49 | evaluateFn: () => { 50 | let items = []; 51 | 52 | document.querySelectorAll('.storylink').forEach((node) => { 53 | items.push({ 54 | title: node.innerText, 55 | url: node.href, 56 | }); 57 | }); 58 | 59 | return items; 60 | }, 61 | }); 62 | 63 | console.log({ data }); 64 | 65 | await ps.close(); 66 | ``` 67 | 68 | View more examples on the [`examples` directory](./examples). 69 | 70 | ## Usage 71 | 72 | ### Installing dependency 73 | 74 | Install `puppet-scraper` via `npm` or `yarn`: 75 | 76 | ```console 77 | $ npm install puppet-scraper 78 | --- or --- 79 | $ yarn add puppet-scraper 80 | ``` 81 | 82 | Install peer dependency `puppeteer` or Puppeteer equivalent ([`chrome-aws-lambda`](https://github.com/alixaxel/chrome-aws-lambda), untested): 83 | 84 | ```console 85 | $ npm install puppeteer 86 | --- or --- 87 | $ yarn add puppeteer 88 | ``` 89 | 90 | ### Instantiation 91 | 92 | Create the PuppetScraper instance, either launching a new browser instance, connect or use an existing browser instance: 93 | 94 | ```js 95 | const { PuppetScraper } = require('puppet-scraper'); 96 | const Puppeteer = require('puppeteer'); 97 | 98 | // launches a new browser instance 99 | const instance = await PuppetScraper.launch(); 100 | 101 | // connect to an existing browser instance 102 | const external = await PuppetScraper.connect({ 103 | browserWSEndpoint: 'ws://127.0.0.1:9222/devtools/browser/...', 104 | }); 105 | 106 | // use an existing browser instance 107 | const browser = await Puppeteer.launch(); 108 | const existing = await PuppetScraper.use({ browser }); 109 | ``` 110 | 
111 | ### Customize options 112 | 113 | `launch` and `connect` have the same props as `Puppeteer.launch` and `Puppeteer.connect`, but with an extra `concurrentPages` and `maxEvaluationRetries` property: 114 | 115 | ```js 116 | const { PuppetScraper } = require('puppet-scraper'); 117 | 118 | const instance = await PuppetScraper.launch({ 119 | concurrentPages: 3, 120 | maxEvaluationRetries: 10, 121 | headless: false, 122 | }); 123 | ``` 124 | 125 | `concurrentPages` is for how many pages/tabs will be opened and used for scraping. 126 | 127 | `maxEvaluationRetries` is for how many times the page will try to evaluate the given function on `evaluateFn` (see below), where if the evaluation throws an error, the page will reload and try to re-evaluate again. 128 | 129 | If `concurrentPages` and `maxEvaluationRetries` are not specified, it will use the [default values](./src/defaults.ts): 130 | 131 | ```ts 132 | export const DEFAULT_CONCURRENT_PAGES = 3; 133 | export const DEFAULT_EVALUATION_RETRIES = 10; 134 | ``` 135 | 136 | ### Scraping single page 137 | 138 | As shown in the example above, use `.scrapeFromUrl` and pass an object with the following properties: 139 | 140 | - `url: string`, page URL to be opened 141 | - `evaluateFn: function`, function to evaluate (scraper method) 142 | - `pageOptions: object`, [`Puppeteer.DirectNavigationOptions`](https://github.com/DefinitelyTyped/DefinitelyTyped/blob/master/types/puppeteer/index.d.ts#L551) props to override page behaviors 143 | 144 | ```js 145 | const data = await instance.scrapeFromUrl({ 146 | url: 'https://news.ycombinator.com', 147 | evaluateFn: () => { 148 | let items = []; 149 | 150 | document.querySelectorAll('.storylink').forEach((node) => { 151 | items.push({ 152 | title: node.innerText, 153 | url: node.href, 154 | }); 155 | }); 156 | 157 | return items; 158 | }, 159 | }); 160 | ``` 161 | 162 | `pageOptions` defaults the `waitUntil` property to `networkidle0`, which you can read more on the [API 
documentation](https://pptr.dev/#?product=Puppeteer&version=v3.0.2&show=api-pagegotourl-options). 163 | 164 | ### Scraping multiple pages 165 | 166 | Same as `.scrapeFromUrl` but passes a `urls` property which contains an array of URL `string`s: 167 | 168 | - `urls: string[]`, page URLs to be opened 169 | - `evaluateFn: function`, function to evaluate (scraper method) 170 | - `pageOptions: object`, [`Puppeteer.DirectNavigationOptions`](https://github.com/DefinitelyTyped/DefinitelyTyped/blob/master/types/puppeteer/index.d.ts#L551) props to override page behaviors 171 | 172 | ```js 173 | const urls = Array.from({ length: 5 }).map( 174 | (_, i) => `https://news.ycombinator.com/news?p=${i + 1}`, 175 | ); 176 | 177 | const data = await ps.scrapeFromUrls({ 178 | urls, 179 | evaluateFn: () => { 180 | let items = []; 181 | 182 | document.querySelectorAll('.storylink').forEach((node) => { 183 | items.push({ 184 | title: node.innerText, 185 | url: node.href, 186 | }); 187 | }); 188 | 189 | return items; 190 | }, 191 | }); 192 | ``` 193 | 194 | ### Closing instance 195 | 196 | When there's nothing left to do, don't forget to close the instance, which closes the browser: 197 | 198 | ```js 199 | await instance.close(); 200 | ``` 201 | 202 | ### Access the browser instance 203 | 204 | PuppetScraper also exposes the browser instance if you want to do things manually: 205 | 206 | ```js 207 | const browser = instance.___internal.browser; 208 | ``` 209 | 210 | ## Contributing 211 | 212 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)): 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 |

Griko Nibras

💻 🚧
222 | 223 | 224 | 225 | 226 | 227 | This project follows the [all-contributors][all-contributors] specification. 228 | Contributions of any kind welcome! 229 | 230 | ## License 231 | 232 | [MIT License, Copyright (c) 2020 Griko Nibras](./LICENSE) 233 | 234 | [all-contributors]: https://github.com/all-contributors/all-contributors 235 | -------------------------------------------------------------------------------- /examples/hn-custom-browser.js: -------------------------------------------------------------------------------- 1 | const { PuppetScraper } = require('..'); 2 | 3 | async function hnCustomBrowser() { 4 | const ps = await PuppetScraper.launch({ 5 | executablePath: 6 | 'C:\\Program Files (x86)\\Microsoft\\Edge Dev\\Application\\msedge.exe', 7 | headless: false, 8 | }); 9 | 10 | const data = await ps.scrapeFromUrl({ 11 | url: 'https://news.ycombinator.com', 12 | evaluateFn: () => { 13 | let items = []; 14 | 15 | document.querySelectorAll('.storylink').forEach((node) => { 16 | items.push({ 17 | title: node.innerText, 18 | url: node.href, 19 | }); 20 | }); 21 | 22 | return items; 23 | }, 24 | }); 25 | 26 | console.log({ data }); 27 | 28 | await ps.close(); 29 | } 30 | 31 | hnCustomBrowser(); 32 | -------------------------------------------------------------------------------- /examples/hn-multiple.js: -------------------------------------------------------------------------------- 1 | const { PuppetScraper } = require('..'); 2 | 3 | async function hnMultiple() { 4 | const ps = await PuppetScraper.launch({ 5 | concurrentPages: 5, 6 | }); 7 | 8 | const urls = Array.from({ length: 5 }).map( 9 | (_, i) => `https://news.ycombinator.com/news?p=${i + 1}`, 10 | ); 11 | 12 | const data = await ps.scrapeFromUrls({ 13 | urls, 14 | evaluateFn: () => { 15 | let items = []; 16 | 17 | document.querySelectorAll('.storylink').forEach((node) => { 18 | items.push({ 19 | title: node.innerText, 20 | url: node.href, 21 | }); 22 | }); 23 | 24 | return items; 25 | }, 26 | }); 27 | 
28 | console.log({ data }); 29 | 30 | await ps.close(); 31 | } 32 | 33 | hnMultiple(); 34 | -------------------------------------------------------------------------------- /examples/hn.js: -------------------------------------------------------------------------------- 1 | const { PuppetScraper } = require('..'); 2 | 3 | async function hn() { 4 | const ps = await PuppetScraper.launch(); 5 | 6 | const data = await ps.scrapeFromUrl({ 7 | url: 'https://news.ycombinator.com', 8 | evaluateFn: () => { 9 | let items = []; 10 | 11 | document.querySelectorAll('.storylink').forEach((node) => { 12 | items.push({ 13 | title: node.innerText, 14 | url: node.href, 15 | }); 16 | }); 17 | 18 | return items; 19 | }, 20 | }); 21 | 22 | console.log({ data }); 23 | 24 | await ps.close(); 25 | } 26 | 27 | hn(); 28 | -------------------------------------------------------------------------------- /header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/grikomsn/puppet-scraper/856b3e25ee263f9d80d0cb3b9bbda4763eb3d4bc/header.png -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "puppet-scraper", 3 | "description": "Scraping using Puppeteer the sane way 🤹🏻‍♂️", 4 | "version": "0.2.0", 5 | "repository": "https://github.com/grikomsn/puppet-scraper.git", 6 | "author": "Griko Nibras ", 7 | "files": [ 8 | "dist", 9 | "src" 10 | ], 11 | "main": "dist/index.js", 12 | "module": "dist/puppet-scraper.esm.js", 13 | "typings": "dist/index.d.ts", 14 | "scripts": { 15 | "prepare": "tsdx build", 16 | "build": "tsdx build", 17 | "lint": "tsdx lint --fix src test types", 18 | "test": "tsdx test", 19 | "watch": "tsdx watch", 20 | "format": "yarn format:examples && yarn format:index", 21 | "format:examples": "prettier --write \"examples/**/*.js\"", 22 | "format:index": "prettier --write 
\"*.{js,json,md}\"", 23 | "contributors:add": "all-contributors add", 24 | "contributors:generate": "all-contributors generate" 25 | }, 26 | "dependencies": { 27 | "promise-retry": "^1.1.1", 28 | "puppeteer-core": "^3.0.3" 29 | }, 30 | "peerDependencies": { 31 | "puppeteer": "^3.0.3" 32 | }, 33 | "devDependencies": { 34 | "@types/promise-retry": "^1.1.3", 35 | "@types/puppeteer": "^2.0.1", 36 | "@types/puppeteer-core": "^2.0.0", 37 | "all-contributors-cli": "^6.14.2", 38 | "husky": "^4.2.5", 39 | "puppeteer": "^3.0.3", 40 | "tsdx": "^0.13.2", 41 | "tslib": "^1.11.2", 42 | "typescript": "^3.8.3" 43 | }, 44 | "engines": { 45 | "node": ">=10" 46 | }, 47 | "husky": { 48 | "hooks": { 49 | "pre-commit": "yarn format && yarn lint && yarn contributors:generate" 50 | } 51 | }, 52 | "prettier": { 53 | "arrowParens": "always", 54 | "printWidth": 80, 55 | "semi": true, 56 | "singleQuote": true, 57 | "trailingComma": "all" 58 | }, 59 | "license": "MIT" 60 | } 61 | -------------------------------------------------------------------------------- /src/defaults.ts: -------------------------------------------------------------------------------- 1 | import { DirectNavigationOptions } from 'puppeteer'; 2 | 3 | export const DEFAULT_CONCURRENT_PAGES = 3; 4 | 5 | export const DEFAULT_EVALUATION_RETRIES = 10; 6 | 7 | export const DEFAULT_PAGE_OPTIONS: DirectNavigationOptions = { 8 | waitUntil: 'networkidle0', 9 | }; 10 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | export * from './defaults'; 2 | export * from './types'; 3 | 4 | export { PuppetScraper } from './puppet-scraper'; 5 | -------------------------------------------------------------------------------- /src/puppet-scraper.ts: -------------------------------------------------------------------------------- 1 | import promiseRetry from 'promise-retry'; 2 | import Puppeteer, { Page } from 
'puppeteer'; 3 | 4 | import { 5 | DEFAULT_CONCURRENT_PAGES, 6 | DEFAULT_EVALUATION_RETRIES, 7 | DEFAULT_PAGE_OPTIONS, 8 | } from './defaults'; 9 | import { 10 | PSBootstrap, 11 | PSConnect, 12 | PSLaunch, 13 | PSUse, 14 | ScrapeFromUrl, 15 | ScrapeFromUrls, 16 | } from './types'; 17 | 18 | const bootstrap: PSBootstrap = async ({ 19 | browser, 20 | concurrentPages = DEFAULT_CONCURRENT_PAGES, 21 | maxEvaluationRetries = DEFAULT_EVALUATION_RETRIES, 22 | } = {}) => { 23 | let pages: Page[] = Array.from({ length: concurrentPages }); 24 | 25 | const scrapeFromUrl: ScrapeFromUrl = async (props) => { 26 | const { url, evaluateFn, pageOptions } = props; 27 | 28 | const mergedPageOptions = { 29 | ...DEFAULT_PAGE_OPTIONS, 30 | ...pageOptions, 31 | }; 32 | 33 | let page = pages[0]; 34 | if (!page) { 35 | page = await browser.newPage(); 36 | } 37 | 38 | return page.goto(url, mergedPageOptions).then(() => 39 | promiseRetry( 40 | async (retry) => { 41 | try { 42 | return page.evaluate(evaluateFn); 43 | } catch (error) { 44 | await page.reload(); 45 | return retry(error); 46 | } 47 | }, 48 | { maxRetryTime: maxEvaluationRetries }, 49 | ), 50 | ); 51 | }; 52 | 53 | const scrapeFromUrls: ScrapeFromUrls = async (props) => { 54 | const { urls, evaluateFn, pageOptions } = props; 55 | 56 | const mergedPageOptions = { 57 | ...DEFAULT_PAGE_OPTIONS, 58 | ...pageOptions, 59 | }; 60 | 61 | pages = await Promise.all( 62 | pages.map(async (page) => { 63 | if (!page) page = await browser.newPage(); 64 | return page; 65 | }), 66 | ); 67 | 68 | let current = 0; 69 | let total = urls.length; 70 | 71 | const results = []; 72 | while (current < total) { 73 | const finishedPages = await Promise.all( 74 | // eslint-disable-next-line no-loop-func 75 | pages.reduce[]>((finishedPages, page) => { 76 | if (current <= total) { 77 | const finishedPage = page 78 | .goto(urls[current++], mergedPageOptions) 79 | .then(() => page); 80 | 81 | return finishedPages.concat(finishedPage); 82 | } 83 | return 
finishedPages; 84 | }, []), 85 | ); 86 | 87 | const evaluatingPages = finishedPages.map((page) => 88 | promiseRetry( 89 | async (retry) => { 90 | try { 91 | return page.evaluate(evaluateFn); 92 | } catch (error) { 93 | await page.reload(); 94 | return retry(error); 95 | } 96 | }, 97 | { maxRetryTime: maxEvaluationRetries }, 98 | ), 99 | ); 100 | 101 | const data = await Promise.all(evaluatingPages); 102 | results.push(...data); 103 | } 104 | 105 | return results; 106 | }; 107 | 108 | const close = () => browser.close(); 109 | 110 | return { 111 | scrapeFromUrl, 112 | scrapeFromUrls, 113 | close, 114 | ___internal: { 115 | browser, 116 | }, 117 | }; 118 | }; 119 | 120 | const connect: PSConnect = async ({ 121 | concurrentPages, 122 | maxEvaluationRetries, 123 | ...opts 124 | } = {}) => { 125 | const browser = await Puppeteer.connect(opts); 126 | return bootstrap({ browser, concurrentPages, maxEvaluationRetries }); 127 | }; 128 | 129 | const launch: PSLaunch = async ({ 130 | concurrentPages, 131 | maxEvaluationRetries, 132 | ...opts 133 | } = {}) => { 134 | const browser = await Puppeteer.launch(opts); 135 | return bootstrap({ browser, concurrentPages, maxEvaluationRetries }); 136 | }; 137 | 138 | const use: PSUse = (opts) => { 139 | if (!opts.browser) { 140 | throw new Error('browser is not defined'); 141 | } 142 | return bootstrap(opts); 143 | }; 144 | 145 | export const PuppetScraper = { connect, launch, use }; 146 | -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | import { 2 | Browser, 3 | ConnectOptions, 4 | DirectNavigationOptions, 5 | LaunchOptions, 6 | } from 'puppeteer'; 7 | 8 | // #region ScrapeFromUrl 9 | 10 | export interface ScrapeFromUrlProps { 11 | url: string; 12 | evaluateFn: () => T; 13 | pageOptions?: DirectNavigationOptions; 14 | } 15 | 16 | export type ScrapeFromUrl = ( 17 | props: ScrapeFromUrlProps, 18 | ) => 
Promise ? U : T>; 19 | 20 | // #endregion 21 | 22 | // #region ScrapeFromUrls 23 | 24 | export interface ScrapeFromUrlsProps { 25 | urls: string[]; 26 | evaluateFn: () => T; 27 | pageOptions?: DirectNavigationOptions; 28 | } 29 | 30 | export type ScrapeFromUrls = ( 31 | props: ScrapeFromUrlsProps, 32 | ) => Promise<(T extends PromiseLike ? U : T)[]>; 33 | 34 | // #endregion 35 | 36 | // #region PuppetScraper 37 | 38 | export interface PSInstance { 39 | scrapeFromUrl: ScrapeFromUrl; 40 | scrapeFromUrls: ScrapeFromUrls; 41 | close: () => Promise; 42 | 43 | ___internal: { 44 | browser: Browser; 45 | }; 46 | } 47 | 48 | export type PSBootstrapProps = { 49 | browser?: Browser; 50 | concurrentPages?: number; 51 | maxEvaluationRetries?: number; 52 | }; 53 | 54 | export type PSBootstrap = (props?: PSBootstrapProps) => Promise; 55 | 56 | export type PSLaunchOptions = PSBootstrapProps & LaunchOptions; 57 | 58 | export type PSLaunch = (opts?: PSLaunchOptions) => ReturnType; 59 | 60 | export type PSConnectOptions = PSBootstrapProps & ConnectOptions; 61 | 62 | export type PSConnect = (opts: PSConnectOptions) => ReturnType; 63 | 64 | export type PSUseOptions = PSBootstrapProps & { browser: Browser }; 65 | 66 | export type PSUse = (opts: PSUseOptions) => ReturnType; 67 | 68 | // #endregion 69 | -------------------------------------------------------------------------------- /test/puppet-scraper.test.ts: -------------------------------------------------------------------------------- 1 | import { PSInstance, PuppetScraper } from '..'; 2 | import { sleep } from './utils'; 3 | import { launchOptions, timeout } from './variables'; 4 | 5 | type DataType = { title: string; url: string }; 6 | 7 | describe('launching instance', () => { 8 | it( 9 | 'should launch and close without errors', 10 | () => { 11 | let instance: PSInstance; 12 | 13 | const instantiateAndClose = async () => { 14 | instance = await PuppetScraper.launch(launchOptions); 15 | await instance.close(); 16 | }; 17 | 18 | 
expect(instantiateAndClose).not.toThrow(); 19 | }, 20 | timeout, 21 | ); 22 | }); 23 | 24 | describe('scrape hacker news from single url', () => { 25 | let instance: PSInstance; 26 | 27 | beforeAll(() => { 28 | const creating = PuppetScraper.launch(launchOptions); 29 | return creating.then((created) => (instance = created)); 30 | }, timeout); 31 | 32 | it( 33 | 'should scrape without errors', 34 | () => { 35 | const data = instance 36 | .scrapeFromUrl({ 37 | url: 'https://news.ycombinator.com', 38 | evaluateFn: () => { 39 | let data: DataType[] = []; 40 | 41 | document.querySelectorAll('.storylink').forEach((node) => { 42 | data.push({ 43 | title: (node as HTMLAnchorElement).innerText, 44 | url: (node as HTMLAnchorElement).href, 45 | }); 46 | }); 47 | 48 | return data; 49 | }, 50 | }) 51 | .catch(); 52 | 53 | expect(data).resolves.not.toBeNull(); 54 | expect(data).resolves.toHaveLength(30); 55 | expect(data).resolves.toHaveProperty([0, 'title']); 56 | expect(data).resolves.toHaveProperty([0, 'url']); 57 | 58 | return data; 59 | }, 60 | timeout, 61 | ); 62 | 63 | it( 64 | 'should instantiate one page', 65 | async () => { 66 | let totalPages = 0; 67 | const expectedPages = 2; 68 | 69 | // 2 pages due to 1 is default tab opening 70 | while (totalPages < expectedPages) { 71 | const pages = await instance.___internal.browser.pages(); 72 | totalPages = pages.length; 73 | await sleep(1000); 74 | } 75 | 76 | expect(totalPages).toEqual(expectedPages); 77 | }, 78 | timeout, 79 | ); 80 | 81 | afterAll(() => { 82 | return instance.close(); 83 | }, timeout); 84 | }); 85 | 86 | describe('scrape hacker news from multiple urls', () => { 87 | let instance: PSInstance; 88 | 89 | beforeAll(async () => { 90 | const creating = PuppetScraper.launch(launchOptions); 91 | return creating.then((created) => (instance = created)); 92 | }, timeout); 93 | 94 | it( 95 | 'should scrape without errors', 96 | () => { 97 | const pages = 5; 98 | const urls = Array.from({ length: pages }).map( 99 | 
(_, i) => `https://news.ycombinator.com/news?p=${i + 1}`, 100 | ); 101 | 102 | const data = instance 103 | .scrapeFromUrls({ 104 | urls, 105 | evaluateFn: () => { 106 | let items: DataType[] = []; 107 | 108 | document.querySelectorAll('.storylink').forEach((node) => { 109 | items.push({ 110 | title: (node as HTMLAnchorElement).innerText, 111 | url: (node as HTMLAnchorElement).href, 112 | }); 113 | }); 114 | 115 | return items; 116 | }, 117 | }) 118 | .catch(); 119 | 120 | expect(data).resolves.not.toBeNull(); 121 | expect(data).resolves.toHaveLength(pages); 122 | expect(data).resolves.toHaveProperty([0, 0, 'title']); 123 | expect(data).resolves.toHaveProperty([0, 0, 'url']); 124 | 125 | return data; 126 | }, 127 | timeout, 128 | ); 129 | 130 | it( 131 | 'should instantiate all pages', 132 | async () => { 133 | let totalPages = 0; 134 | const expectedPages = launchOptions.concurrentPages + 1; 135 | 136 | // concurrent pages + 1 due to default tab opening 137 | while (totalPages < expectedPages) { 138 | const pages = await instance.___internal.browser.pages(); 139 | totalPages = pages.length; 140 | await sleep(1000); 141 | } 142 | 143 | expect(totalPages).toEqual(expectedPages); 144 | }, 145 | timeout, 146 | ); 147 | 148 | afterAll(() => { 149 | return instance.close(); 150 | }, timeout); 151 | }); 152 | -------------------------------------------------------------------------------- /test/utils.ts: -------------------------------------------------------------------------------- 1 | export const sleep = (ms: number) => { 2 | return new Promise((resolve) => setTimeout(resolve, ms)); 3 | }; 4 | -------------------------------------------------------------------------------- /test/variables.ts: -------------------------------------------------------------------------------- 1 | import { PSLaunchOptions } from '..'; 2 | 3 | export const launchOptions: PSLaunchOptions = { 4 | ...(process.env.CI 5 | ? 
{ executablePath: 'google-chrome-stable', args: ['--no-sandbox'] } 6 | : {}), 7 | concurrentPages: 5, 8 | }; 9 | 10 | export const timeout = 10000; 11 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "baseUrl": "./", 4 | "declaration": true, 5 | "esModuleInterop": true, 6 | "importHelpers": true, 7 | "jsx": "react", 8 | "lib": ["dom", "esnext"], 9 | "module": "esnext", 10 | "moduleResolution": "node", 11 | "noFallthroughCasesInSwitch": true, 12 | "noImplicitReturns": true, 13 | "noUnusedLocals": true, 14 | "noUnusedParameters": true, 15 | "paths": { 16 | "*": ["node_modules/*"] 17 | }, 18 | "rootDir": "./src", 19 | "sourceMap": true, 20 | "strict": true, 21 | "strictNullChecks": false 22 | }, 23 | "include": ["src", "types"] 24 | } 25 | -------------------------------------------------------------------------------- /tsdx.config.js: -------------------------------------------------------------------------------- 1 | // https://github.com/jaredpalmer/tsdx#customization 2 | 3 | module.exports = { 4 | rollup(config, _options) { 5 | return config; 6 | }, 7 | target: 'node', 8 | }; 9 | -------------------------------------------------------------------------------- /types/index.d.ts: -------------------------------------------------------------------------------- 1 | declare var __DEV__: boolean; 2 | --------------------------------------------------------------------------------