├── .all-contributorsrc
├── .github
├── FUNDING.yml
└── workflows
│ └── main.yml
├── .gitignore
├── .vscode
├── extensions.json
└── settings.json
├── LICENSE
├── README.md
├── examples
├── hn-custom-browser.js
├── hn-multiple.js
└── hn.js
├── header.png
├── package.json
├── src
├── defaults.ts
├── index.ts
├── puppet-scraper.ts
└── types.ts
├── test
├── puppet-scraper.test.ts
├── utils.ts
└── variables.ts
├── tsconfig.json
├── tsdx.config.js
├── types
└── index.d.ts
└── yarn.lock
/.all-contributorsrc:
--------------------------------------------------------------------------------
1 | {
2 | "projectName": "puppet-scraper",
3 | "projectOwner": "grikomsn",
4 | "repoType": "github",
5 | "repoHost": "https://github.com",
6 | "files": ["README.md"],
7 | "imageSize": 100,
8 | "commit": true,
9 | "commitConvention": "none",
10 | "contributors": [
11 | {
12 | "login": "grikomsn",
13 | "name": "Griko Nibras",
14 | "avatar_url": "https://avatars1.githubusercontent.com/u/8220954?v=4",
15 | "profile": "https://griko.id",
16 | "contributions": ["code", "maintenance"]
17 | }
18 | ],
19 | "contributorsPerLine": 7
20 | }
21 |
--------------------------------------------------------------------------------
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | issuehunt: grikomsn
2 | ko_fi: grikomsn
3 | liberapay: grikomsn
4 | custom: ['https://karyakarsa.com/grikomsn', 'https://saweria.co/grikomsn', 'https://trakteer.id/grikomsn']
5 |
6 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
7 | github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
8 | open_collective: # Replace with a single Open Collective username
9 | otechie: # Replace with a single Otechie username
10 | patreon: # Replace with a single Patreon username
11 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
12 |
--------------------------------------------------------------------------------
/.github/workflows/main.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 | on: [push]
3 | jobs:
4 | build:
5 | runs-on: ubuntu-latest
6 |
7 | steps:
8 | - name: Checkout
9 | uses: actions/checkout@v2
10 |
11 | - name: Use Node 12
12 | uses: actions/setup-node@v1
13 | with:
14 | node-version: 12.x
15 |
16 | - name: Use cached node_modules
17 | uses: actions/cache@v1
18 | with:
19 | path: node_modules
20 | key: nodeModules-${{ hashFiles('**/yarn.lock') }}
21 | restore-keys: |
22 | nodeModules-
23 |
24 | - name: Install dependencies
25 | uses: ianwalter/puppeteer@cbdd5c50c8d6b6275cdf46e4ad2b3f7ee61211ce
26 | with:
27 | args: yarn install --frozen-lockfile
28 | env:
29 | CI: true
30 |
31 | - name: Lint
32 | run: yarn lint
33 | env:
34 | CI: true
35 |
36 | - name: Test
37 | uses: ianwalter/puppeteer@cbdd5c50c8d6b6275cdf46e4ad2b3f7ee61211ce
38 | with:
39 | args: yarn test --ci --coverage --maxWorkers=2
40 | env:
41 | CI: true
42 |
43 | - name: Build
44 | run: yarn build
45 | env:
46 | CI: true
47 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.log
2 | .DS_Store
3 | dist
4 | node_modules
5 |
--------------------------------------------------------------------------------
/.vscode/extensions.json:
--------------------------------------------------------------------------------
1 | {
2 | "recommendations": ["rbbit.typescript-hero"]
3 | }
4 |
--------------------------------------------------------------------------------
/.vscode/settings.json:
--------------------------------------------------------------------------------
1 | {
2 | "typescriptHero.imports.insertSemicolons": true,
3 | "typescriptHero.imports.organizeOnSave": true,
4 | "typescriptHero.imports.organizeSortsByFirstSpecifier": true
5 | }
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Griko Nibras
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | [](.)
6 |
7 | 
8 | 
9 |
10 |
11 |
12 | ---
13 |
14 | - [Brief example](#brief-example)
15 | - [Usage](#usage)
16 | - [Installing dependency](#installing-dependency)
17 | - [Instantiation](#instantiation)
18 | - [Customize options](#customize-options)
19 | - [Scraping single page](#scraping-single-page)
20 | - [Scraping multiple pages](#scraping-multiple-pages)
21 | - [Closing instance](#closing-instance)
22 | - [Access the browser instance](#access-the-browser-instance)
23 | - [Contributing](#contributing)
24 | - [License](#license)
25 |
26 | ---
27 |
28 | **PuppetScraper is an opinionated wrapper library for utilizing [Puppeteer](https://github.com/puppeteer/puppeteer) to scrape pages easily, bootstrapped using [Jared Palmer's tsdx](https://github.com/jaredpalmer/tsdx).**
29 |
30 | Most people create a new scraping project by `require`-ing Puppeteer and create their own logic to scrape pages, and that logic will get more complicated when trying to use multiple pages.
31 |
32 | PuppetScraper allows you to just pass the URLs to scrape, the function to evaluate (the scraping logic), and how many pages (or tabs) to open at a time. Basically, PuppetScraper abstracts the need to create multiple page instances and retrying the evaluation logic.
33 |
34 | **Version 0.1.0 note**: PuppetScraper was initially made as a project template rather than a wrapper library, but the core logic is still the same with various improvements and without extra libraries, so you can include PuppetScraper in your project easily using `npm` or `yarn`.
35 |
36 | ## Brief example
37 |
38 | Here's a [basic example](./examples/hn.js) on scraping the entries on [first page Hacker News](https://news.ycombinator.com):
39 |
40 | ```js
41 | // examples/hn.js
42 |
43 | const { PuppetScraper } = require('puppet-scraper');
44 |
45 | const ps = await PuppetScraper.launch();
46 |
47 | const data = await ps.scrapeFromUrl({
48 | url: 'https://news.ycombinator.com',
49 | evaluateFn: () => {
50 | let items = [];
51 |
52 | document.querySelectorAll('.storylink').forEach((node) => {
53 | items.push({
54 | title: node.innerText,
55 | url: node.href,
56 | });
57 | });
58 |
59 | return items;
60 | },
61 | });
62 |
63 | console.log({ data });
64 |
65 | await ps.close();
66 | ```
67 |
68 | View more examples on the [`examples` directory](./examples).
69 |
70 | ## Usage
71 |
72 | ### Installing dependency
73 |
74 | Install `puppet-scraper` via `npm` or `yarn`:
75 |
76 | ```console
77 | $ npm install puppet-scraper
78 | --- or ---
79 | $ yarn add puppet-scraper
80 | ```
81 |
82 | Install peer dependency `puppeteer` or Puppeteer equivalent ([`chrome-aws-lambda`](https://github.com/alixaxel/chrome-aws-lambda), untested):
83 |
84 | ```console
85 | $ npm install puppeteer
86 | --- or ---
87 | $ yarn add puppeteer
88 | ```
89 |
90 | ### Instantiation
91 |
92 | Create the PuppetScraper instance, either launching a new browser instance, connect or use an existing browser instance:
93 |
94 | ```js
95 | const { PuppetScraper } = require('puppet-scraper');
96 | const Puppeteer = require('puppeteer');
97 |
98 | // launches a new browser instance
99 | const instance = await PuppetScraper.launch();
100 |
101 | // connect to an existing browser instance
102 | const external = await PuppetScraper.connect({
103 | browserWSEndpoint: 'ws://127.0.0.1:9222/devtools/browser/...',
104 | });
105 |
106 | // use an existing browser instance
107 | const browser = await Puppeteer.launch();
108 | const existing = await PuppetScraper.use({ browser });
109 | ```
110 |
111 | ### Customize options
112 |
113 | `launch` and `connect` have the same props as `Puppeteer.launch` and `Puppeteer.connect`, but with extra `concurrentPages` and `maxEvaluationRetries` properties:
114 |
115 | ```js
116 | const { PuppetScraper } = require('puppet-scraper');
117 |
118 | const instance = await PuppetScraper.launch({
119 | concurrentPages: 3,
120 | maxEvaluationRetries: 10,
121 | headless: false,
122 | });
123 | ```
124 |
125 | `concurrentPages` is for how many pages/tabs will be opened and use for scraping.
126 |
127 | `maxEvaluationRetries` is for how many times the page will try to evaluate the given function on `evaluateFn` (see below), where if the evaluation throws an error, the page will reload and try to re-evaluate again.
128 |
129 | If `concurrentPages` and `maxEvaluationRetries` are not specified, the [default values](./src/defaults.ts) are used:
130 |
131 | ```ts
132 | export const DEFAULT_CONCURRENT_PAGES = 3;
133 | export const DEFAULT_EVALUATION_RETRIES = 10;
134 | ```
135 |
136 | ### Scraping single page
137 |
138 | As shown like the example above, use `.scrapeFromUrl` and pass an object with the following properties:
139 |
140 | - `url: string`, page URL to be opened
141 | - `evaluateFn: function`, function to evaluate (scraper method)
142 | - `pageOptions: object`, [`Puppeteer.DirectNavigationOptions`](https://github.com/DefinitelyTyped/DefinitelyTyped/blob/master/types/puppeteer/index.d.ts#L551) props to override page behaviors
143 |
144 | ```js
145 | const data = await instance.scrapeFromUrl({
146 | url: 'https://news.ycombinator.com',
147 | evaluateFn: () => {
148 | let items = [];
149 |
150 | document.querySelectorAll('.storylink').forEach((node) => {
151 | items.push({
152 | title: node.innerText,
153 | url: node.href,
154 | });
155 | });
156 |
157 | return items;
158 | },
159 | });
160 | ```
161 |
162 | `pageOptions` defaults the `waitUntil` property to `networkidle0`, which you can read more on the [API documentation](https://pptr.dev/#?product=Puppeteer&version=v3.0.2&show=api-pagegotourl-options).
163 |
164 | ### Scraping multiple pages
165 |
166 | Same as `.scrapeFromUrl`, but takes a `urls` property containing an array of URL `string`s:
167 |
168 | - `urls: string[]`, page URLs to be opened
169 | - `evaluateFn: function`, function to evaluate (scraper method)
170 | - `pageOptions: object`, [`Puppeteer.DirectNavigationOptions`](https://github.com/DefinitelyTyped/DefinitelyTyped/blob/master/types/puppeteer/index.d.ts#L551) props to override page behaviors
171 |
172 | ```js
173 | const urls = Array.from({ length: 5 }).map(
174 | (_, i) => `https://news.ycombinator.com/news?p=${i + 1}`,
175 | );
176 |
177 | const data = await ps.scrapeFromUrls({
178 | urls,
179 | evaluateFn: () => {
180 | let items = [];
181 |
182 | document.querySelectorAll('.storylink').forEach((node) => {
183 | items.push({
184 | title: node.innerText,
185 | url: node.href,
186 | });
187 | });
188 |
189 | return items;
190 | },
191 | });
192 | ```
193 |
194 | ### Closing instance
195 |
196 | When there's nothing left to do, don't forget to close the instance, which closes the browser:
197 |
198 | ```js
199 | await instance.close();
200 | ```
201 |
202 | ### Access the browser instance
203 |
204 | PuppetScraper also exposes the browser instance if you want to do things manually:
205 |
206 | ```js
207 | const browser = instance.___internal.browser;
208 | ```
209 |
210 | ## Contributing
211 |
212 | Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/docs/en/emoji-key)):
213 |
214 |
215 |
216 |
217 |
222 |
223 |
224 |
225 |
226 |
227 | This project follows the [all-contributors][all-contributors] specification.
228 | Contributions of any kind welcome!
229 |
230 | ## License
231 |
232 | [MIT License, Copyright (c) 2020 Griko Nibras](./LICENSE)
233 |
234 | [all-contributors]: https://github.com/all-contributors/all-contributors
235 |
--------------------------------------------------------------------------------
/examples/hn-custom-browser.js:
--------------------------------------------------------------------------------
const { PuppetScraper } = require('..');

// Scrape Hacker News story links using a custom (non-bundled) Chromium-based
// browser, here a local Microsoft Edge Dev install, with a visible window.
async function hnCustomBrowser() {
  const ps = await PuppetScraper.launch({
    executablePath:
      'C:\\Program Files (x86)\\Microsoft\\Edge Dev\\Application\\msedge.exe',
    headless: false,
  });

  const data = await ps.scrapeFromUrl({
    url: 'https://news.ycombinator.com',
    // Runs inside the page context: collect title/href of every story link.
    evaluateFn: () =>
      Array.from(document.querySelectorAll('.storylink'), (node) => ({
        title: node.innerText,
        url: node.href,
      })),
  });

  console.log({ data });

  await ps.close();
}

hnCustomBrowser();
32 |
--------------------------------------------------------------------------------
/examples/hn-multiple.js:
--------------------------------------------------------------------------------
const { PuppetScraper } = require('..');

// Scrape the first five Hacker News pages, five tabs at a time.
async function hnMultiple() {
  const ps = await PuppetScraper.launch({
    concurrentPages: 5,
  });

  const urls = Array.from(
    { length: 5 },
    (_, i) => `https://news.ycombinator.com/news?p=${i + 1}`,
  );

  const data = await ps.scrapeFromUrls({
    urls,
    // Runs inside each page context: collect title/href of every story link.
    evaluateFn: () =>
      Array.from(document.querySelectorAll('.storylink'), (node) => ({
        title: node.innerText,
        url: node.href,
      })),
  });

  console.log({ data });

  await ps.close();
}

hnMultiple();
34 |
--------------------------------------------------------------------------------
/examples/hn.js:
--------------------------------------------------------------------------------
const { PuppetScraper } = require('..');

// Minimal example: scrape story links from the Hacker News front page.
async function hn() {
  const ps = await PuppetScraper.launch();

  const data = await ps.scrapeFromUrl({
    url: 'https://news.ycombinator.com',
    // Runs inside the page context: collect title/href of every story link.
    evaluateFn: () =>
      Array.from(document.querySelectorAll('.storylink'), (node) => ({
        title: node.innerText,
        url: node.href,
      })),
  });

  console.log({ data });

  await ps.close();
}

hn();
28 |
--------------------------------------------------------------------------------
/header.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/grikomsn/puppet-scraper/856b3e25ee263f9d80d0cb3b9bbda4763eb3d4bc/header.png
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "puppet-scraper",
3 | "description": "Scraping using Puppeteer the sane way 🤹🏻♂️",
4 | "version": "0.2.0",
5 | "repository": "https://github.com/grikomsn/puppet-scraper.git",
6 | "author": "Griko Nibras ",
7 | "files": [
8 | "dist",
9 | "src"
10 | ],
11 | "main": "dist/index.js",
12 | "module": "dist/puppet-scraper.esm.js",
13 | "typings": "dist/index.d.ts",
14 | "scripts": {
15 | "prepare": "tsdx build",
16 | "build": "tsdx build",
17 | "lint": "tsdx lint --fix src test types",
18 | "test": "tsdx test",
19 | "watch": "tsdx watch",
20 | "format": "yarn format:examples && yarn format:index",
21 | "format:examples": "prettier --write \"examples/**/*.js\"",
22 | "format:index": "prettier --write \"*.{js,json,md}\"",
23 | "contributors:add": "all-contributors add",
24 | "contributors:generate": "all-contributors generate"
25 | },
26 | "dependencies": {
27 | "promise-retry": "^1.1.1",
28 | "puppeteer-core": "^3.0.3"
29 | },
30 | "peerDependencies": {
31 | "puppeteer": "^3.0.3"
32 | },
33 | "devDependencies": {
34 | "@types/promise-retry": "^1.1.3",
35 | "@types/puppeteer": "^2.0.1",
36 | "@types/puppeteer-core": "^2.0.0",
37 | "all-contributors-cli": "^6.14.2",
38 | "husky": "^4.2.5",
39 | "puppeteer": "^3.0.3",
40 | "tsdx": "^0.13.2",
41 | "tslib": "^1.11.2",
42 | "typescript": "^3.8.3"
43 | },
44 | "engines": {
45 | "node": ">=10"
46 | },
47 | "husky": {
48 | "hooks": {
49 | "pre-commit": "yarn format && yarn lint && yarn contributors:generate"
50 | }
51 | },
52 | "prettier": {
53 | "arrowParens": "always",
54 | "printWidth": 80,
55 | "semi": true,
56 | "singleQuote": true,
57 | "trailingComma": "all"
58 | },
59 | "license": "MIT"
60 | }
61 |
--------------------------------------------------------------------------------
/src/defaults.ts:
--------------------------------------------------------------------------------
import { DirectNavigationOptions } from 'puppeteer';

// How many tabs are opened and used for scraping when the caller does not
// pass `concurrentPages`.
export const DEFAULT_CONCURRENT_PAGES = 3;

// How many times a page evaluation is retried (with a page reload in
// between) when the caller does not pass `maxEvaluationRetries`.
export const DEFAULT_EVALUATION_RETRIES = 10;

// Base options merged into every `page.goto()`; callers may override any of
// these via `pageOptions`. `networkidle0` waits until the network is quiet.
export const DEFAULT_PAGE_OPTIONS: DirectNavigationOptions = {
  waitUntil: 'networkidle0',
};
10 |
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
// Public entry point: re-export the default option values, all public
// types, and the PuppetScraper factory object.
export * from './defaults';
export * from './types';

export { PuppetScraper } from './puppet-scraper';
5 |
--------------------------------------------------------------------------------
/src/puppet-scraper.ts:
--------------------------------------------------------------------------------
1 | import promiseRetry from 'promise-retry';
2 | import Puppeteer, { Page } from 'puppeteer';
3 |
4 | import {
5 | DEFAULT_CONCURRENT_PAGES,
6 | DEFAULT_EVALUATION_RETRIES,
7 | DEFAULT_PAGE_OPTIONS,
8 | } from './defaults';
9 | import {
10 | PSBootstrap,
11 | PSConnect,
12 | PSLaunch,
13 | PSUse,
14 | ScrapeFromUrl,
15 | ScrapeFromUrls,
16 | } from './types';
17 |
18 | const bootstrap: PSBootstrap = async ({
19 | browser,
20 | concurrentPages = DEFAULT_CONCURRENT_PAGES,
21 | maxEvaluationRetries = DEFAULT_EVALUATION_RETRIES,
22 | } = {}) => {
23 | let pages: Page[] = Array.from({ length: concurrentPages });
24 |
25 | const scrapeFromUrl: ScrapeFromUrl = async (props) => {
26 | const { url, evaluateFn, pageOptions } = props;
27 |
28 | const mergedPageOptions = {
29 | ...DEFAULT_PAGE_OPTIONS,
30 | ...pageOptions,
31 | };
32 |
33 | let page = pages[0];
34 | if (!page) {
35 | page = await browser.newPage();
36 | }
37 |
38 | return page.goto(url, mergedPageOptions).then(() =>
39 | promiseRetry(
40 | async (retry) => {
41 | try {
42 | return page.evaluate(evaluateFn);
43 | } catch (error) {
44 | await page.reload();
45 | return retry(error);
46 | }
47 | },
48 | { maxRetryTime: maxEvaluationRetries },
49 | ),
50 | );
51 | };
52 |
53 | const scrapeFromUrls: ScrapeFromUrls = async (props) => {
54 | const { urls, evaluateFn, pageOptions } = props;
55 |
56 | const mergedPageOptions = {
57 | ...DEFAULT_PAGE_OPTIONS,
58 | ...pageOptions,
59 | };
60 |
61 | pages = await Promise.all(
62 | pages.map(async (page) => {
63 | if (!page) page = await browser.newPage();
64 | return page;
65 | }),
66 | );
67 |
68 | let current = 0;
69 | let total = urls.length;
70 |
71 | const results = [];
72 | while (current < total) {
73 | const finishedPages = await Promise.all(
74 | // eslint-disable-next-line no-loop-func
75 | pages.reduce[]>((finishedPages, page) => {
76 | if (current <= total) {
77 | const finishedPage = page
78 | .goto(urls[current++], mergedPageOptions)
79 | .then(() => page);
80 |
81 | return finishedPages.concat(finishedPage);
82 | }
83 | return finishedPages;
84 | }, []),
85 | );
86 |
87 | const evaluatingPages = finishedPages.map((page) =>
88 | promiseRetry(
89 | async (retry) => {
90 | try {
91 | return page.evaluate(evaluateFn);
92 | } catch (error) {
93 | await page.reload();
94 | return retry(error);
95 | }
96 | },
97 | { maxRetryTime: maxEvaluationRetries },
98 | ),
99 | );
100 |
101 | const data = await Promise.all(evaluatingPages);
102 | results.push(...data);
103 | }
104 |
105 | return results;
106 | };
107 |
108 | const close = () => browser.close();
109 |
110 | return {
111 | scrapeFromUrl,
112 | scrapeFromUrls,
113 | close,
114 | ___internal: {
115 | browser,
116 | },
117 | };
118 | };
119 |
120 | const connect: PSConnect = async ({
121 | concurrentPages,
122 | maxEvaluationRetries,
123 | ...opts
124 | } = {}) => {
125 | const browser = await Puppeteer.connect(opts);
126 | return bootstrap({ browser, concurrentPages, maxEvaluationRetries });
127 | };
128 |
129 | const launch: PSLaunch = async ({
130 | concurrentPages,
131 | maxEvaluationRetries,
132 | ...opts
133 | } = {}) => {
134 | const browser = await Puppeteer.launch(opts);
135 | return bootstrap({ browser, concurrentPages, maxEvaluationRetries });
136 | };
137 |
138 | const use: PSUse = (opts) => {
139 | if (!opts.browser) {
140 | throw new Error('browser is not defined');
141 | }
142 | return bootstrap(opts);
143 | };
144 |
145 | export const PuppetScraper = { connect, launch, use };
146 |
--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
1 | import {
2 | Browser,
3 | ConnectOptions,
4 | DirectNavigationOptions,
5 | LaunchOptions,
6 | } from 'puppeteer';
7 |
8 | // #region ScrapeFromUrl
9 |
10 | export interface ScrapeFromUrlProps {
11 | url: string;
12 | evaluateFn: () => T;
13 | pageOptions?: DirectNavigationOptions;
14 | }
15 |
16 | export type ScrapeFromUrl = (
17 | props: ScrapeFromUrlProps,
18 | ) => Promise ? U : T>;
19 |
20 | // #endregion
21 |
22 | // #region ScrapeFromUrls
23 |
24 | export interface ScrapeFromUrlsProps {
25 | urls: string[];
26 | evaluateFn: () => T;
27 | pageOptions?: DirectNavigationOptions;
28 | }
29 |
30 | export type ScrapeFromUrls = (
31 | props: ScrapeFromUrlsProps,
32 | ) => Promise<(T extends PromiseLike ? U : T)[]>;
33 |
34 | // #endregion
35 |
36 | // #region PuppetScraper
37 |
38 | export interface PSInstance {
39 | scrapeFromUrl: ScrapeFromUrl;
40 | scrapeFromUrls: ScrapeFromUrls;
41 | close: () => Promise;
42 |
43 | ___internal: {
44 | browser: Browser;
45 | };
46 | }
47 |
48 | export type PSBootstrapProps = {
49 | browser?: Browser;
50 | concurrentPages?: number;
51 | maxEvaluationRetries?: number;
52 | };
53 |
54 | export type PSBootstrap = (props?: PSBootstrapProps) => Promise;
55 |
56 | export type PSLaunchOptions = PSBootstrapProps & LaunchOptions;
57 |
58 | export type PSLaunch = (opts?: PSLaunchOptions) => ReturnType;
59 |
60 | export type PSConnectOptions = PSBootstrapProps & ConnectOptions;
61 |
62 | export type PSConnect = (opts: PSConnectOptions) => ReturnType;
63 |
64 | export type PSUseOptions = PSBootstrapProps & { browser: Browser };
65 |
66 | export type PSUse = (opts: PSUseOptions) => ReturnType;
67 |
68 | // #endregion
69 |
--------------------------------------------------------------------------------
/test/puppet-scraper.test.ts:
--------------------------------------------------------------------------------
1 | import { PSInstance, PuppetScraper } from '..';
2 | import { sleep } from './utils';
3 | import { launchOptions, timeout } from './variables';
4 |
5 | type DataType = { title: string; url: string };
6 |
7 | describe('launching instance', () => {
8 | it(
9 | 'should launch and close without errors',
10 | () => {
11 | let instance: PSInstance;
12 |
13 | const instantiateAndClose = async () => {
14 | instance = await PuppetScraper.launch(launchOptions);
15 | await instance.close();
16 | };
17 |
18 | expect(instantiateAndClose).not.toThrow();
19 | },
20 | timeout,
21 | );
22 | });
23 |
24 | describe('scrape hacker news from single url', () => {
25 | let instance: PSInstance;
26 |
27 | beforeAll(() => {
28 | const creating = PuppetScraper.launch(launchOptions);
29 | return creating.then((created) => (instance = created));
30 | }, timeout);
31 |
32 | it(
33 | 'should scrape without errors',
34 | () => {
35 | const data = instance
36 | .scrapeFromUrl({
37 | url: 'https://news.ycombinator.com',
38 | evaluateFn: () => {
39 | let data: DataType[] = [];
40 |
41 | document.querySelectorAll('.storylink').forEach((node) => {
42 | data.push({
43 | title: (node as HTMLAnchorElement).innerText,
44 | url: (node as HTMLAnchorElement).href,
45 | });
46 | });
47 |
48 | return data;
49 | },
50 | })
51 | .catch();
52 |
53 | expect(data).resolves.not.toBeNull();
54 | expect(data).resolves.toHaveLength(30);
55 | expect(data).resolves.toHaveProperty([0, 'title']);
56 | expect(data).resolves.toHaveProperty([0, 'url']);
57 |
58 | return data;
59 | },
60 | timeout,
61 | );
62 |
63 | it(
64 | 'should instantiate one page',
65 | async () => {
66 | let totalPages = 0;
67 | const expectedPages = 2;
68 |
69 | // 2 pages due to 1 is default tab opening
70 | while (totalPages < expectedPages) {
71 | const pages = await instance.___internal.browser.pages();
72 | totalPages = pages.length;
73 | await sleep(1000);
74 | }
75 |
76 | expect(totalPages).toEqual(expectedPages);
77 | },
78 | timeout,
79 | );
80 |
81 | afterAll(() => {
82 | return instance.close();
83 | }, timeout);
84 | });
85 |
86 | describe('scrape hacker news from multiple urls', () => {
87 | let instance: PSInstance;
88 |
89 | beforeAll(async () => {
90 | const creating = PuppetScraper.launch(launchOptions);
91 | return creating.then((created) => (instance = created));
92 | }, timeout);
93 |
94 | it(
95 | 'should scrape without errors',
96 | () => {
97 | const pages = 5;
98 | const urls = Array.from({ length: pages }).map(
99 | (_, i) => `https://news.ycombinator.com/news?p=${i + 1}`,
100 | );
101 |
102 | const data = instance
103 | .scrapeFromUrls({
104 | urls,
105 | evaluateFn: () => {
106 | let items: DataType[] = [];
107 |
108 | document.querySelectorAll('.storylink').forEach((node) => {
109 | items.push({
110 | title: (node as HTMLAnchorElement).innerText,
111 | url: (node as HTMLAnchorElement).href,
112 | });
113 | });
114 |
115 | return items;
116 | },
117 | })
118 | .catch();
119 |
120 | expect(data).resolves.not.toBeNull();
121 | expect(data).resolves.toHaveLength(pages);
122 | expect(data).resolves.toHaveProperty([0, 0, 'title']);
123 | expect(data).resolves.toHaveProperty([0, 0, 'url']);
124 |
125 | return data;
126 | },
127 | timeout,
128 | );
129 |
130 | it(
131 | 'should instantiate all pages',
132 | async () => {
133 | let totalPages = 0;
134 | const expectedPages = launchOptions.concurrentPages + 1;
135 |
136 | // concurrent pages + 1 due to default tab opening
137 | while (totalPages < expectedPages) {
138 | const pages = await instance.___internal.browser.pages();
139 | totalPages = pages.length;
140 | await sleep(1000);
141 | }
142 |
143 | expect(totalPages).toEqual(expectedPages);
144 | },
145 | timeout,
146 | );
147 |
148 | afterAll(() => {
149 | return instance.close();
150 | }, timeout);
151 | });
152 |
--------------------------------------------------------------------------------
/test/utils.ts:
--------------------------------------------------------------------------------
1 | export const sleep = (ms: number) => {
2 | return new Promise((resolve) => setTimeout(resolve, ms));
3 | };
4 |
--------------------------------------------------------------------------------
/test/variables.ts:
--------------------------------------------------------------------------------
1 | import { PSLaunchOptions } from '..';
2 |
3 | export const launchOptions: PSLaunchOptions = {
4 | ...(process.env.CI
5 | ? { executablePath: 'google-chrome-stable', args: ['--no-sandbox'] }
6 | : {}),
7 | concurrentPages: 5,
8 | };
9 |
10 | export const timeout = 10000;
11 |
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "baseUrl": "./",
4 | "declaration": true,
5 | "esModuleInterop": true,
6 | "importHelpers": true,
7 | "jsx": "react",
8 | "lib": ["dom", "esnext"],
9 | "module": "esnext",
10 | "moduleResolution": "node",
11 | "noFallthroughCasesInSwitch": true,
12 | "noImplicitReturns": true,
13 | "noUnusedLocals": true,
14 | "noUnusedParameters": true,
15 | "paths": {
16 | "*": ["node_modules/*"]
17 | },
18 | "rootDir": "./src",
19 | "sourceMap": true,
20 | "strict": true,
21 | "strictNullChecks": false
22 | },
23 | "include": ["src", "types"]
24 | }
25 |
--------------------------------------------------------------------------------
/tsdx.config.js:
--------------------------------------------------------------------------------
// https://github.com/jaredpalmer/tsdx#customization

module.exports = {
  // Hook for customizing the generated rollup config; intentionally
  // returned unchanged.
  rollup(config, _options) {
    return config;
  },
  // Build for Node rather than the browser — this library drives Puppeteer.
  target: 'node',
};
9 |
--------------------------------------------------------------------------------
/types/index.d.ts:
--------------------------------------------------------------------------------
1 | declare var __DEV__: boolean;
2 |
--------------------------------------------------------------------------------