├── .eslintignore ├── bin ├── run.cmd └── run ├── .prettierrc ├── .prettierignore ├── .gitattributes ├── src ├── pkg │ ├── README.md │ ├── plugins │ │ └── private │ │ │ ├── .gitignore │ │ │ └── README.md │ ├── automation-utils │ │ ├── README.md │ │ ├── session │ │ │ ├── storages │ │ │ │ ├── cookies.ts │ │ │ │ ├── IndexedDB │ │ │ │ │ ├── database-names.ts │ │ │ │ │ ├── index.ts │ │ │ │ │ ├── get.ts │ │ │ │ │ └── set.ts │ │ │ │ ├── sessionStorage.ts │ │ │ │ └── localStorage.ts │ │ │ └── session.ts │ │ ├── appearance.ts │ │ ├── invisible-recaptcha.ts │ │ └── offset.ts │ ├── services │ │ ├── README.md │ │ ├── db.ts │ │ ├── env.ts │ │ ├── util.ts │ │ └── log │ │ │ ├── uncaught.ts │ │ │ └── log.ts │ ├── schemas │ │ ├── cdp.ts │ │ └── pkg-env.ts │ └── browsers │ │ └── browser.ts └── app │ ├── modules │ └── README.md │ ├── schemas │ └── app-env.ts │ ├── cmd │ └── dev.ts │ └── detectors │ ├── proxy-drop.ts │ └── README.md ├── jest.config.js ├── assets └── images │ ├── boilerplate.jpeg │ ├── boilerplate.xcf │ └── boilerplate-readme.jpeg ├── .dockerignore ├── prisma └── schema.prisma ├── .eslintrc.js ├── .gitignore ├── .env ├── package.json ├── README.md └── tsconfig.json /.eslintignore: -------------------------------------------------------------------------------- 1 | .eslintrc.js -------------------------------------------------------------------------------- /bin/run.cmd: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | node "%~dp0\run" %* -------------------------------------------------------------------------------- /.prettierrc: -------------------------------------------------------------------------------- 1 | { 2 | "tabWidth": 2, 3 | "useTabs": false 4 | } 5 | -------------------------------------------------------------------------------- /.prettierignore: -------------------------------------------------------------------------------- 1 | # Ignore artifacts: 2 | build 3 | logs 4 | storage 5 | vendor 
-------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.jpeg filter=lfs diff=lfs merge=lfs -text 2 | *.xcf filter=lfs diff=lfs merge=lfs -text 3 | -------------------------------------------------------------------------------- /src/pkg/README.md: -------------------------------------------------------------------------------- 1 | # Shared Packages 📦 2 | 3 | The code that's shared between your projects should lie here. 4 | -------------------------------------------------------------------------------- /src/pkg/plugins/private/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore every file... 2 | * 3 | 4 | # ... except these 5 | !.gitignore 6 | !README.md -------------------------------------------------------------------------------- /src/pkg/plugins/private/README.md: -------------------------------------------------------------------------------- 1 | # Private plugins ㊙️ 2 | 3 | Put your private plugins here. Beware: by default, they're ignored by git. 
4 | -------------------------------------------------------------------------------- /jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | preset: "ts-jest", 3 | testEnvironment: "node", 4 | testPathIgnorePatterns: ["/lib"], 5 | }; 6 | -------------------------------------------------------------------------------- /bin/run: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | 3 | require('@oclif/command').run() 4 | .then(require('@oclif/command/flush')) 5 | .catch(require('@oclif/errors/handle')) -------------------------------------------------------------------------------- /src/pkg/automation-utils/README.md: -------------------------------------------------------------------------------- 1 | # Automation Utils 🤖 2 | 3 | Modules that are useful for every project. They should get converted to plugins eventually. 4 | -------------------------------------------------------------------------------- /assets/images/boilerplate.jpeg: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:efd9642308b5c900ff6719b8ec08d3de7d7d75e7fc6bfecc24c9b37ecd7e06b1 3 | size 210044 4 | -------------------------------------------------------------------------------- /assets/images/boilerplate.xcf: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7788ba606e567a6211e15a6030ff92758baf197065a32a7b39c1b43c182e1e3c 3 | size 3188660 4 | -------------------------------------------------------------------------------- /assets/images/boilerplate-readme.jpeg: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:567d058b2f304b572d8e6c5447ab3c85f63c828708568661459f991c29f4879e 3 | size 111303 4 | 
-------------------------------------------------------------------------------- /src/pkg/services/README.md: -------------------------------------------------------------------------------- 1 | # Services 2 | 3 | Packages that are used throughout the code. 4 | 5 | Examples: 6 | - Database 7 | - Logger 8 | - Environment variables accessor -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | app 2 | core 3 | node_modules 4 | test 5 | debug.log 6 | .dockerignore 7 | .eslintrc.json 8 | .prettierignore 9 | .prettierrc.json 10 | docker-compose.yml 11 | Dockerfile -------------------------------------------------------------------------------- /src/app/modules/README.md: -------------------------------------------------------------------------------- 1 | # Modules 🧩 2 | 3 | Building blocks of your app should lie here. 4 | 5 | Examples: 6 | 7 | - login 8 | - filling out a form 9 | - completing a CAPTCHA 10 | -------------------------------------------------------------------------------- /src/pkg/services/db.ts: -------------------------------------------------------------------------------- 1 | import { PrismaClient } from "@prisma/client"; 2 | 3 | /** 4 | * Create a single Prisma connection for the application. 
5 | */ 6 | export const db: PrismaClient = new PrismaClient(); 7 | -------------------------------------------------------------------------------- /src/pkg/services/env.ts: -------------------------------------------------------------------------------- 1 | import dotenv from "dotenv"; 2 | import { GlobalEnvSchema } from "../schemas/pkg-env"; 3 | 4 | /** Initialize Environment Variables **/ 5 | dotenv.config(); 6 | 7 | /** Default Export **/ 8 | export default GlobalEnvSchema.parse(process.env); 9 | -------------------------------------------------------------------------------- /src/pkg/schemas/cdp.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | 3 | /** 4 | * @see https://chromedevtools.github.io/devtools-protocol/tot/IndexedDB/#method-requestDatabaseNames 5 | */ 6 | export const CDPIndexedDBDatabaseNames = z.object({ 7 | databaseNames: z.array(z.string()), 8 | }); 9 | -------------------------------------------------------------------------------- /src/app/schemas/app-env.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | 3 | /** 4 | * Variables that should be included in .env. 
5 | * @example see `pkg/schemas/pkg-env.ts` 6 | */ 7 | export const AppEnvSchema = z.object({ 8 | // define here things that should get included in the .env file by the user 9 | }); 10 | -------------------------------------------------------------------------------- /src/pkg/automation-utils/session/storages/cookies.ts: -------------------------------------------------------------------------------- 1 | import { Page } from "puppeteer"; 2 | 3 | export async function getCookies(page: Page) { 4 | return JSON.stringify(await page.cookies()); 5 | } 6 | 7 | export async function setCookies(page: Page, cookies: string) { 8 | await page.setCookie(...JSON.parse(cookies)); 9 | } 10 | -------------------------------------------------------------------------------- /src/app/cmd/dev.ts: -------------------------------------------------------------------------------- 1 | import { Command } from "@oclif/command"; 2 | 3 | export default class Hello extends Command { 4 | static description = "start here ;)"; 5 | 6 | static examples = []; 7 | 8 | static flags = {}; 9 | 10 | static args = [{ name: "file" }]; 11 | 12 | async run() { 13 | const { args, flags } = this.parse(Hello); 14 | 15 | console.log(args, flags); 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /prisma/schema.prisma: -------------------------------------------------------------------------------- 1 | // This is your Prisma schema file, 2 | // learn more about it in the docs: https://pris.ly/d/prisma-schema 3 | 4 | datasource db { 5 | provider = "sqlite" 6 | url = env("DATABASE_URL") 7 | } 8 | 9 | generator client { 10 | provider = "prisma-client-js" 11 | } 12 | 13 | model User { 14 | id Int @id @default(autoincrement()) 15 | email String @unique 16 | name String? 
17 | } 18 | -------------------------------------------------------------------------------- /.eslintrc.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | env: { 3 | es2021: true, 4 | node: true, 5 | }, 6 | extends: ["eslint:recommended", "plugin:@typescript-eslint/recommended"], 7 | parser: "@typescript-eslint/parser", 8 | parserOptions: { 9 | ecmaVersion: 12, 10 | sourceType: "module", 11 | project: "tsconfig.json", 12 | }, 13 | plugins: ["@typescript-eslint"], 14 | rules: { 15 | "@typescript-eslint/no-floating-promises": "error", 16 | }, 17 | }; 18 | -------------------------------------------------------------------------------- /src/pkg/schemas/pkg-env.ts: -------------------------------------------------------------------------------- 1 | import { z } from "zod"; 2 | import { AppEnvSchema } from "../../app/schemas/app-env"; 3 | 4 | export const DefaultEnvSchema = z.object({ 5 | DATABASE_URL: z.string().optional(), 6 | TOKEN_2CAPTCHA: z.string().optional(), 7 | GOOGLE_APPLICATION_CREDENTIALS: z.string().optional(), 8 | PUSHBULLET_APIKEY: z.string().optional(), 9 | // TODO: restrict to possible log levels 10 | LOG_LEVEL: z.string(), 11 | PROJECT_NAME: z.string(), 12 | CHROME_PATH: z.string().optional(), 13 | }); 14 | 15 | export const GlobalEnvSchema = DefaultEnvSchema.merge(AppEnvSchema); 16 | -------------------------------------------------------------------------------- /src/pkg/automation-utils/session/storages/IndexedDB/database-names.ts: -------------------------------------------------------------------------------- 1 | import { Page } from "puppeteer"; 2 | import { CDPIndexedDBDatabaseNames } from "../../../../schemas/cdp"; 3 | 4 | export async function getDatabaseNames( 5 | page: Page, 6 | securityOrigin: string 7 | ): Promise { 8 | const session = await page.target().createCDPSession(); 9 | 10 | const dbNames = CDPIndexedDBDatabaseNames.parse( 11 | await 
session.send("IndexedDB.requestDatabaseNames", { 12 | securityOrigin, 13 | }) 14 | ); 15 | 16 | await session.detach(); 17 | 18 | return dbNames.databaseNames; 19 | } 20 | -------------------------------------------------------------------------------- /src/pkg/automation-utils/session/storages/sessionStorage.ts: -------------------------------------------------------------------------------- 1 | import { Page } from "puppeteer"; 2 | 3 | export async function getSessionStorage(page: Page): Promise { 4 | return page.evaluate(() => 5 | JSON.stringify(Object.assign({}, window.sessionStorage)) 6 | ); 7 | } 8 | 9 | export async function setSessionStorage( 10 | page: Page, 11 | sessionStorage: string 12 | ): Promise { 13 | await page.evaluate((sessionStorage: string) => { 14 | for (const [key, val] of Object.entries(JSON.parse(sessionStorage))) { 15 | window.sessionStorage.setItem(key, val as string); 16 | } 17 | }, sessionStorage); 18 | } 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Environment 2 | # .env # note: you should gitignore your .env 3 | 4 | # JS 5 | yarn-error.log 6 | 7 | # Vendors 8 | node_modules/ 9 | 10 | # Database 11 | *.sqlite 12 | *.db 13 | *.db-journal 14 | 15 | # Transient Data 16 | storage 17 | .session-data* 18 | .session-data 19 | 20 | # TypeScript 21 | run.js 22 | *.tsbuildinfo 23 | lib/**/* 24 | 25 | # Temp Files 26 | cache/ 27 | *.zip 28 | 29 | # Runtime data 30 | pids 31 | *.pid 32 | *.seed 33 | *.pid.lock 34 | 35 | # Editors 36 | .idea/ 37 | .vscode/ 38 | .eslintcache 39 | 40 | # Logs 41 | *.log 42 | npm-debug.log* 43 | yarn-debug.log* 44 | yarn-error.log* 45 | *.dump 46 | 47 | # OS metadata 48 | .DS_Store 49 | Thumbs.db -------------------------------------------------------------------------------- /src/pkg/automation-utils/appearance.ts: 
-------------------------------------------------------------------------------- 1 | import { Page } from "puppeteer"; 2 | 3 | /** 4 | * Execute once something appears on the page. 5 | * @param sel the element that will appear 6 | * @param timeout the timeout by which the target element should've appeared 7 | * @param cb the function that will be called once the element appears 8 | * @returns 9 | */ 10 | export async function doOnAppareance( 11 | page: Page, 12 | sel: string, 13 | timeout: number, 14 | cb: (page: Page) => Promise 15 | ): Promise { 16 | try { 17 | await page.waitForSelector(sel, { 18 | timeout, 19 | }); 20 | } catch { 21 | return; 22 | } 23 | 24 | return cb(page); 25 | } 26 | -------------------------------------------------------------------------------- /src/pkg/automation-utils/session/storages/localStorage.ts: -------------------------------------------------------------------------------- 1 | import { Page } from "puppeteer"; 2 | 3 | export async function getLocalStorage(page: Page): Promise { 4 | // STEALTH: use isolated worlds 5 | const localStorage = await page.evaluate(() => 6 | Object.assign({}, window.localStorage) 7 | ); 8 | return JSON.stringify(localStorage); 9 | } 10 | 11 | export async function setLocalStorage( 12 | page: Page, 13 | localStorage: string 14 | ): Promise { 15 | // STEALTH: use isolated worlds 16 | await page.evaluate((localStorage: string) => { 17 | for (const [key, val] of Object.entries(JSON.parse(localStorage))) { 18 | window.localStorage.setItem(key, val as string); 19 | } 20 | }, localStorage); 21 | } 22 | -------------------------------------------------------------------------------- /src/app/detectors/proxy-drop.ts: -------------------------------------------------------------------------------- 1 | import pEvent from "p-event"; 2 | import { Page } from "puppeteer"; 3 | 4 | export class ProxyDropError extends Error { 5 | constructor(message: string) { 6 | super(message); 7 | this.name = "ProxyDropError"; 8 | } 
9 | } 10 | 11 | export async function catchProxyDrop(page: Page): Promise { 12 | await (await page.target().createCDPSession()).send("Page.enable"); 13 | 14 | await pEvent(page, "requestfailed", { 15 | filter: (req: any) => { 16 | return [ 17 | "TUNNEL_CONNECTION_FAILED", 18 | "PROXY_CONNECTION_FAILED", 19 | "NETWORK_CHANGED", 20 | ].some((error) => req.failure().errorText.includes(error)); 21 | }, 22 | }); 23 | 24 | throw new ProxyDropError("Proxy drop detected."); 25 | } 26 | -------------------------------------------------------------------------------- /src/pkg/automation-utils/invisible-recaptcha.ts: -------------------------------------------------------------------------------- 1 | import { Page } from "puppeteer"; 2 | import { log } from "../services/log/log"; 3 | import { doOnAppareance } from "./appearance"; 4 | 5 | /** 6 | * Solve an invisible recaptcha challenge. 7 | * @param sel the recaptcha box selector 8 | * @param timeout the timeout by which the captcha box should've appeared 9 | */ 10 | export async function solveInvisibleRecaptcha(page: Page, timeout: number) { 11 | log.silly("invisible recaptcha solver called"); 12 | 13 | return doOnAppareance( 14 | page, 15 | '[title="recaptcha challenge"]', 16 | timeout, 17 | async (page) => { 18 | log.debug("captcha appeared; solving it using provider..."); 19 | await page.solveRecaptchas(); 20 | log.debug("recaptcha solved"); 21 | } 22 | ); 23 | } 24 | -------------------------------------------------------------------------------- /src/app/detectors/README.md: -------------------------------------------------------------------------------- 1 | # Failure detectors 📡 2 | 3 | Failure detectors are functions that detect errors while navigating and throw them. 4 | 5 | Examples: 6 | 7 | - Login errors 8 | - Networking issues 9 | - Anti-bot rate-limiting 10 | 11 | By default, failure detectors should not time out. 12 | They also should never return. (To ensure that, use the `Promise<never>` return type.) 
13 | 14 | To use failure detectors: 15 | 16 | ```typescript 17 | const browser = await newBrowser() 18 | const page = await browser.newPage() 19 | 20 | // Making a Frankenstein promise that will throw if one of our detectors is angry 21 | await Promise.race([ 22 | bot.start(page), 23 | catchProxyDrop(page), 24 | detectDatadomeBlock(page), 25 | detectCloudfrontBlock(page) 26 | ]) 27 | ``` 28 | 29 | ## See also 30 | 31 | - [p-event](https://yarnpkg.com/package/p-event): listen to events like they're promises. 32 | -------------------------------------------------------------------------------- /src/pkg/services/util.ts: -------------------------------------------------------------------------------- 1 | /** 2 | * An expression that will throw the given error message. 3 | * Useful when you want to assert that a value is neither null nor undefined. 4 | * 5 | * Example: 6 | * 7 | * ``` 8 | * // will throw because the function returns null 9 | * returnsNull("argument") ?? throwExpression("the function returned undefined !") 10 | * 11 | * // will NOT throw because the function returns neither null nor undefined 12 | * returnsSomething("hi") ?? 
throwExpression("the function returned undefined !") 13 | * ``` 14 | * 15 | * [stackoverflow source](https://stackoverflow.com/a/65666402/4564097) 16 | */ 17 | export function throwExpression(errorMessage: string): never { 18 | throw new Error(errorMessage); 19 | } 20 | 21 | export function random(min: number, max: number): number { 22 | min = Math.ceil(min); 23 | max = Math.floor(max); 24 | return Math.floor(Math.random() * (max - min + 1)) + min; 25 | } 26 | -------------------------------------------------------------------------------- /.env: -------------------------------------------------------------------------------- 1 | ################################## WARNING ####################################### 2 | # You should add this file's path to .gitignore before pushing your code # 3 | ################################## INFO ######################################### 4 | # This file defines the environment variables that will be used by the program # 5 | # You should rename this file to .env to be understood by your program # 6 | # You can access these values using `env.YOUR_VALUE` # 7 | ################################################################################## 8 | 9 | # Used for logging 10 | PROJECT_NAME="puppeteer-boiler" 11 | 12 | # The database URL that will be used by Prisma (the underlying ORM, that handles databases operations) 13 | DATABASE_URL="file:./database.db" 14 | 15 | # Your 2CAPTCHA Token for recaptcha solving 16 | TOKEN_2CAPTCHA=XXXXX 17 | 18 | # GCP Service account credentials path 19 | # If not set, logs won't be sent to Google. 
20 | GOOGLE_APPLICATION_CREDENTIALS="resources/google/gcp-creds.json" 21 | 22 | # Pushbullet TOKEN to receive notifications on your phone 23 | PUSHBULLET_APIKEY= 24 | 25 | # Log level 26 | LOG_LEVEL=silly -------------------------------------------------------------------------------- /src/pkg/automation-utils/offset.ts: -------------------------------------------------------------------------------- 1 | import { ElementHandle, Page } from "puppeteer"; 2 | 3 | /** 4 | * @returns the page's height 5 | */ 6 | export async function getPageHeight(page: Page): Promise { 7 | return page.evaluate(() => window.innerHeight); 8 | } 9 | 10 | /** 11 | * @returns the distance of the given el from the top of the page 12 | */ 13 | export async function getElOffsetRelativeToPage( 14 | el: ElementHandle 15 | ): Promise { 16 | return await el.evaluate((el) => { 17 | const bodyRect = document.body.getBoundingClientRect(), 18 | elemRect = el.getBoundingClientRect(), 19 | offset = elemRect.top - bodyRect.top; 20 | return offset; 21 | }); 22 | } 23 | 24 | export async function getElOffsetRelativeToWindow( 25 | el: ElementHandle 26 | ): Promise { 27 | return await el.evaluate((el) => { 28 | const elemRect = el.getBoundingClientRect(); 29 | 30 | return elemRect.y; 31 | }); 32 | } 33 | 34 | /** 35 | * @param sel a scrollable element (defaults to html) 36 | * @returns the scrollTop of the given element 37 | */ 38 | export async function getScrollTop(page: Page, sel = "html"): Promise { 39 | return await page.evaluate( 40 | (sel: string) => document.querySelector(sel)?.scrollTop || -1, 41 | sel 42 | ); 43 | } 44 | -------------------------------------------------------------------------------- /src/pkg/automation-utils/session/storages/IndexedDB/index.ts: -------------------------------------------------------------------------------- 1 | import { Page } from "puppeteer"; 2 | import { z } from "zod"; 3 | import { getDatabaseNames } from "./database-names"; 4 | import { getIndexedDB } from 
"./get"; 5 | import { setIndexedDB } from "./set"; 6 | 7 | export const ExportedIndexedDBDatabase = z.object({ 8 | name: z.string(), 9 | data: z.any(), 10 | securityOrigin: z.string(), 11 | }); 12 | 13 | export type ExportedIndexedDBDatabase = z.infer< 14 | typeof ExportedIndexedDBDatabase 15 | >; 16 | 17 | /** 18 | * @param securityOrigin get this from the "application > IndexedDB" developper panel 19 | * @returns databases 20 | */ 21 | export async function getAllIndexedDB(page: Page, securityOrigin: string) { 22 | const dbNames = await getDatabaseNames(page, securityOrigin); 23 | 24 | const databases: ExportedIndexedDBDatabase[] = []; 25 | 26 | for (const db of dbNames) { 27 | databases.push({ 28 | name: db, 29 | data: await getIndexedDB(page, db), 30 | securityOrigin, 31 | }); 32 | } 33 | 34 | return databases; 35 | } 36 | 37 | export async function setAllIndexedDB( 38 | page: Page, 39 | databases: ExportedIndexedDBDatabase[] 40 | ) { 41 | for (const db of databases) { 42 | if (!page.url().includes(db.securityOrigin)) { 43 | await page.goto(db.securityOrigin); 44 | } 45 | 46 | await setIndexedDB(page, db.data, db.name); 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/pkg/services/log/uncaught.ts: -------------------------------------------------------------------------------- 1 | import delay from "delay"; 2 | import { log } from "./log"; 3 | 4 | export class UnhandledRejectionError extends Error { 5 | constructor(message: string) { 6 | super(message); 7 | this.name = "UnhandledRejectionError"; 8 | } 9 | } 10 | 11 | export function registerUncaughtListeners() { 12 | // use this to report fatal errors to GCP 13 | process.on("uncaughtException", async (err) => { 14 | /** 15 | * errors in this format will be picked up by GCP's error reporting tool 16 | */ 17 | log.error("fatal error: " + err.message, { 18 | message: err.message, 19 | stack: err.stack, 20 | }); 21 | 22 | await delay(5000); // avoid 
restart-burns 23 | process.exit(1); 24 | }); 25 | 26 | process.on("unhandledRejection", (reason: any, promise) => { 27 | const isSessionClosedError = `${reason}`.includes( 28 | "Session closed. Most likely the page has been closed." 29 | ); 30 | const isTargetClosedError = `${reason}`.includes("Target closed."); 31 | 32 | const metadata = { 33 | reason: reason?.stack || reason, 34 | promise, 35 | }; 36 | 37 | if (isTargetClosedError) { 38 | log.warn("Target closed error", metadata); 39 | return; 40 | } else if (isSessionClosedError) { 41 | log.warn("Session closed error", metadata); 42 | return; 43 | } 44 | 45 | const err = new UnhandledRejectionError( 46 | "Unhandled Promise rejection. Infos: " + JSON.stringify(metadata) 47 | ); 48 | log.error("fatal error: " + err, { 49 | name: err.name, 50 | metadata, 51 | }); 52 | throw err; 53 | }); 54 | } 55 | -------------------------------------------------------------------------------- /src/pkg/automation-utils/session/session.ts: -------------------------------------------------------------------------------- 1 | import { Page } from "puppeteer"; 2 | import * as z from "zod"; 3 | import { log } from "../../services/log/log"; 4 | import { getCookies, setCookies } from "./storages/cookies"; 5 | import { 6 | ExportedIndexedDBDatabase, 7 | getAllIndexedDB, 8 | setAllIndexedDB, 9 | } from "./storages/IndexedDB"; 10 | import { getLocalStorage, setLocalStorage } from "./storages/localStorage"; 11 | import { 12 | getSessionStorage, 13 | setSessionStorage, 14 | } from "./storages/sessionStorage"; 15 | 16 | // note: sessionStorage is exported and restored as well, even though the browser removes its content each time the tab is closed 17 | 18 | export const SessionData = z.object({ 19 | localStorage: z.string(), 20 | sessionStorage: z.string(), 21 | indexedDBDatabases: z.array(ExportedIndexedDBDatabase), 22 | cookies: z.string(), 23 | }); 24 | export type SessionData = z.infer; 25 | 26 | /** 27 | * 28 | * @param securityOrigin to get this: Dev Tools > 
Application > IndexedDB > properties of a database > security origin 29 | * @returns 30 | */ 31 | export async function getSessionData( 32 | page: Page, 33 | securityOrigin: string 34 | ): Promise { 35 | log.silly("extracting session data"); 36 | return { 37 | localStorage: await getLocalStorage(page), 38 | cookies: await getCookies(page), 39 | sessionStorage: await getSessionStorage(page), 40 | indexedDBDatabases: await getAllIndexedDB(page, securityOrigin), 41 | }; 42 | } 43 | 44 | export async function setSessionData(page: Page, sessionData: SessionData) { 45 | log.silly("restoring session data"); 46 | await Promise.all([ 47 | setCookies(page, sessionData.cookies), 48 | setLocalStorage(page, sessionData.localStorage), 49 | setSessionStorage(page, sessionData.sessionStorage), 50 | setAllIndexedDB(page, sessionData.indexedDBDatabases), 51 | ]); 52 | } 53 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "puppeteer-extra-boilerplate", 3 | "scripts": { 4 | "dev": "yarn tsc && ./bin/run dev", 5 | "cmd": "yarn tsc && ./bin/run ", 6 | "test": "yarn jest --watch", 7 | "db:init": "yarn prisma init", 8 | "db:migrate": "yarn prisma migrate dev", 9 | "db:refresh-client": "yarn prisma generate", 10 | "upgrade": "yarn upgrade-interactive", 11 | "postpack": "rm -f oclif.manifest.json", 12 | "posttest": "eslint . 
--ext .ts --config .eslintrc", 13 | "prepack": "rm -rf lib && tsc -b && oclif-dev manifest && oclif-dev readme", 14 | "version": "oclif-dev readme && git add README.md" 15 | }, 16 | "version": "1.0.0", 17 | "description": "A batteries included boilerplate for puppeteer-extra.", 18 | "author": "clouedoc", 19 | "license": "MIT", 20 | "devDependencies": { 21 | "@oclif/dev-cli": "^1.26.0", 22 | "@oclif/test": "^1.2.8", 23 | "@types/jest": "^26.0.22", 24 | "@types/node": "^14.14.35", 25 | "@typescript-eslint/eslint-plugin": "^4.22.0", 26 | "@typescript-eslint/parser": "^4.22.0", 27 | "eslint": "^7.24.0", 28 | "jest": "^26.6.3", 29 | "prisma": "^2.19.0", 30 | "ts-jest": "^26.5.4", 31 | "ts-node": "^9.1.1", 32 | "typescript": "^4.2.4" 33 | }, 34 | "dependencies": { 35 | "@google-cloud/logging-winston": "^4.0.4", 36 | "@oclif/command": "^1.8.0", 37 | "@oclif/config": "^1.17.0", 38 | "@oclif/plugin-help": "^3.2.2", 39 | "@prisma/client": "^2.20.1", 40 | "chalk": "^4.1.0", 41 | "delay": "^5.0.0", 42 | "dotenv": "^8.2.0", 43 | "p-event": "^4.2.0", 44 | "puppeteer": "^5", 45 | "puppeteer-extra": "^3.1.18", 46 | "puppeteer-extra-plugin-recaptcha": "^3.3.7", 47 | "puppeteer-extra-plugin-stealth": "^2.7.6", 48 | "puppeteer-extra-plugin-timezone": "^1.0.3", 49 | "recaptcha": "^1.2.1", 50 | "winston": "^3.3.3" 51 | }, 52 | "oclif": { 53 | "commands": "./src/app/cmd", 54 | "bin": "puppeteer-boiler", 55 | "plugins": [ 56 | "@oclif/plugin-help" 57 | ] 58 | }, 59 | "files": [ 60 | "/bin", 61 | "/lib" 62 | ] 63 | } 64 | -------------------------------------------------------------------------------- /src/pkg/automation-utils/session/storages/IndexedDB/get.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | import { Page } from "puppeteer"; 3 | 4 | // TODO: make this TypeScript-compliant 5 | // STEALTH: isolated worlds 6 | export async function getIndexedDB( 7 | page: Page, 8 | dbName: string 9 | ): Promise<{ [x: string]: any 
}> { 10 | return (await page.evaluate((dbName: string) => { 11 | return new Promise((resolve, reject) => { 12 | const request = window.indexedDB.open(dbName); // note: this is also originally opened without the version property 13 | 14 | request.onsuccess = (e: any) => { 15 | const idbDatabase = e.target.result; 16 | const exportObject = {}; 17 | if (idbDatabase.objectStoreNames.length === 0) 18 | resolve(JSON.stringify(exportObject)); 19 | else { 20 | const transaction = idbDatabase.transaction( 21 | idbDatabase.objectStoreNames, 22 | "readonly" 23 | ); 24 | 25 | transaction.addEventListener("error", reject); 26 | 27 | for (const storeName of idbDatabase.objectStoreNames) { 28 | const allObjects = {}; 29 | transaction 30 | .objectStore(storeName) 31 | .openCursor() 32 | .addEventListener("success", (event: any) => { 33 | const cursor = event.target.result; 34 | if (cursor) { 35 | // Cursor holds value, put it into store data 36 | // @ts-expect-error 37 | allObjects[cursor.key] = cursor.value; 38 | cursor.continue(); 39 | } else { 40 | // No more values, store is done 41 | // @ts-expect-error 42 | exportObject[storeName] = allObjects; 43 | 44 | // Last store was handled 45 | if ( 46 | idbDatabase.objectStoreNames.length === 47 | Object.keys(exportObject).length 48 | ) { 49 | resolve(exportObject); 50 | } 51 | } 52 | }); 53 | } 54 | } 55 | }; 56 | 57 | request.onerror = (err) => { 58 | console.error(err); 59 | }; 60 | }); 61 | }, dbName)) as object; 62 | } 63 | -------------------------------------------------------------------------------- /src/pkg/browsers/browser.ts: -------------------------------------------------------------------------------- 1 | import os from "os"; 2 | import puppeteer from "puppeteer-extra"; 3 | import RecaptchaPlugin from "puppeteer-extra-plugin-recaptcha"; 4 | import StealthPlugin from "puppeteer-extra-plugin-stealth"; 5 | import TimezonePlugin from "puppeteer-extra-plugin-timezone"; 6 | import env from "../services/env"; 7 | import { 
log } from "../services/log/log"; 8 | 9 | puppeteer.use(TimezonePlugin()); 10 | 11 | export function getExecutablePath() { 12 | if (env.CHROME_PATH) { 13 | return env.CHROME_PATH; 14 | } 15 | 16 | switch (os.platform()) { 17 | case "darwin": 18 | return "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"; 19 | case "win32": 20 | return "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe"; 21 | case "linux": 22 | return "/usr/bin/google-chrome"; 23 | default: 24 | log.warn( 25 | `Couldn't find Google Chrome for the current platform. (os = ${os.platform()})` 26 | ); 27 | return ""; 28 | } 29 | } 30 | 31 | /** 32 | * create a new stealth browser 33 | * boilerplate note: you will want to change this part 34 | * 1. making it headless / headful, depending on your target 35 | * 2. modifying Stealth options 36 | * 3. and so on... 37 | */ 38 | export function newBrowser(proxy?: string) { 39 | // enable stealth 40 | puppeteer.use( 41 | StealthPlugin({ 42 | /* modifiy stealth settings here */ 43 | }) 44 | ); 45 | 46 | // enable recaptcha usage 47 | if (env.TOKEN_2CAPTCHA) { 48 | puppeteer.use( 49 | RecaptchaPlugin({ 50 | provider: { 51 | id: "2captcha", 52 | token: process.env.TOKEN_2CAPTCHA, 53 | }, 54 | visualFeedback: true, 55 | throwOnError: true, 56 | }) 57 | ); 58 | } 59 | 60 | const args = [ 61 | // sane defaults from prescience's foundation 62 | "--no-sandbox", 63 | "--disable-setuid-sandbox", 64 | "--disable-sync", 65 | "--ignore-certificate-errors", 66 | ]; 67 | 68 | if (proxy) { 69 | args.push("--proxy-server=" + proxy); 70 | } else { 71 | log.warn("created a browser without a proxy"); 72 | } 73 | 74 | const browser = puppeteer.launch({ 75 | args, 76 | headless: false, 77 | executablePath: getExecutablePath(), 78 | }); 79 | 80 | return browser; 81 | } 82 | -------------------------------------------------------------------------------- /src/pkg/services/log/log.ts: 
--------------------------------------------------------------------------------
import { LoggingWinston as GoogleLoggingWinston } from "@google-cloud/logging-winston";
import chalk from "chalk";
import os from "os";
import winston from "winston";
import env from "../env";
import { registerUncaughtListeners } from "./uncaught";

// Keys that are never echoed in the trailing JSON blob of a console line:
// the winston fields already rendered by the printf template, plus the
// `defaultMeta` entries attached to every record by the logger.
const HIDDEN_LOG_KEYS = new Set([
  // default
  "message",
  "level",
  "splat",
  "label",
  // stack-related
  "hostname",
  "loggerCreationDate",
  "project",
]);

/**
 * Simple helper for stringifying all remaining
 * properties.
 *
 * @param info the log record handed to the printf formatter
 * @returns an empty string when nothing is left after dropping the hidden
 *          keys, otherwise a newline followed by the grey-coloured JSON of
 *          the remaining properties.
 */
function rest(info: Record<string, unknown>) {
  const data = Object.fromEntries(
    Object.entries(info).filter(([key]) => !HIDDEN_LOG_KEYS.has(key))
  );

  if (Object.keys(data).length === 0) {
    return "";
  }

  return chalk.grey(
    // JSON.stringify skips symbol-keyed properties, which removes all the
    // `[Symbol(...)]` things from the output
    `\n${JSON.stringify(data)}`
  );
}

// Console output is colourised and human readable; the file transport keeps
// everything down to the "silly" level.
const transports: winston.transport[] = [
  new winston.transports.Console({
    format: winston.format.combine(
      winston.format.splat(),
      winston.format.colorize(),
      winston.format.printf(
        (info) => `[${info.level}] ${info.message}${rest(info)}`
      )
    ),
  }),
  new winston.transports.File({
    filename: "logs/silly.log",
    level: "silly",
  }),
];

// send logs to GCP if we have a service account available
if (env.GOOGLE_APPLICATION_CREDENTIALS) {
  transports.push(
    new GoogleLoggingWinston({
      level: "silly",
    })
  );
}

if (env.PUSHBULLET_APIKEY) {
  transports.push(
    // @ts-expect-error It thinks that it isn't here (because it is deprecated) while it's in actually present!
69 | new winston.transports.Pushbullet({ 70 | apikey: env.PUSHBULLET_APIKEY, 71 | level: "warn", 72 | title: "Puppeteer Notifcation", 73 | devices: "", // '' means all devices 74 | }) 75 | ); 76 | } 77 | 78 | export const log = winston.createLogger({ 79 | level: env.LOG_LEVEL || "silly", 80 | transports, 81 | defaultMeta: { 82 | hostname: os.hostname(), 83 | loggerCreationDate: new Date(), 84 | project: env.PROJECT_NAME, 85 | }, 86 | }); 87 | 88 | registerUncaughtListeners(); 89 | -------------------------------------------------------------------------------- /src/pkg/automation-utils/session/storages/IndexedDB/set.ts: -------------------------------------------------------------------------------- 1 | /* eslint-disable */ 2 | import { Page } from "puppeteer"; 3 | 4 | // STEALTH: isolated worlds 5 | // TODO: make this TypeScript-compliant 6 | export async function setIndexedDB(page: Page, data: object, dbName: string) { 7 | await page.evaluate( 8 | (data: any, dbName: string) => { 9 | return new Promise((resolve, reject) => { 10 | function initDB() { 11 | var request = window.indexedDB.open(dbName, 1); 12 | request.onsuccess = function () {}; 13 | request.onupgradeneeded = function () {}; 14 | request.onerror = function (event: any) { 15 | throw new Error(event.target.errorCode); 16 | }; 17 | } 18 | 19 | initDB(); // opens the DB at version 1, to allow us trigerring a version change 20 | // to inject our scraped database 21 | 22 | // the list of required stores, that we must create or which must exist 23 | const requiredStoreNames = Object.keys(data); 24 | 25 | const request = window.indexedDB.open(dbName, 2); // note: open with version 2 for some reason 26 | 27 | // create the object stores 28 | request.onupgradeneeded = (e: any) => { 29 | const db: IDBDatabase = e.target.result; 30 | 31 | for (const storeName of requiredStoreNames) { 32 | try { 33 | db.createObjectStore(storeName); 34 | } catch (err) { 35 | // don't mind name conflicts 36 | if (err.name !== 
"ConstraintError") { 37 | // but mind other kind of errors... 38 | throw err; 39 | } 40 | } 41 | } 42 | }; 43 | 44 | request.onsuccess = (e: any) => { 45 | const idbDatabase: IDBDatabase = e.target.result; 46 | 47 | // open a transaction with all our created stores 48 | const transaction = idbDatabase.transaction( 49 | requiredStoreNames, 50 | "readwrite" 51 | ); 52 | 53 | transaction.addEventListener("error", reject); 54 | 55 | for (const storeName of requiredStoreNames) { 56 | for (const [key, value] of Object.entries(data[storeName])) { 57 | transaction.objectStore(storeName).put(value, key); 58 | } 59 | // @ts-expect-error 60 | request.oncomplete = resolve(); 61 | } 62 | resolve(); 63 | }; 64 | 65 | request.onerror = (err) => { 66 | throw new Error(err.toString()); 67 | }; 68 | }); 69 | }, 70 | // @ts-expect-error 71 | data, 72 | dbName 73 | ); 74 | } 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # puppeteer-boiler 2 | 3 | ## Archived 4 | 5 | You should migrate to my more recent template: [here](https://github.com/clouedoc/typescript-boilerplate). 6 | This repository is still a great learning resource for beginners! 7 | 8 | ![cool boilerplate image](assets/images/boilerplate-readme.jpeg) 9 | 10 | ## Introduction 11 | 12 | With NodeJS and TypeScript, you can structure your project how you want. However, this can be a bit intimidating for beginners. 13 | 14 | The puppeteer-extra-boilerplate is a good learning source for scraping beginners and allows advanced users to have a batteries-included template ready. 15 | 16 | This is a WIP boilerplate. You might want to check prescience's one: . 17 | 18 | ### If you are a beginner 19 | 20 | Then welcome to scraping ! 
Make sure to [join the Scraping Enthusiasts discord server](https://discord.gg/QDbpFyenhA) 21 | 22 | You can use this project to help you get started in architecting your own projects. 23 | 24 | ### If you are already experienced 25 | 26 | Then you probably will want to compare your stack with mine. 27 | 28 | ## Important setup 29 | 30 | You need to rename `.env.example` to a plain `.env`. 31 | 32 | ## Tools used 33 | 34 | ### Prisma (database) 35 | 36 | To access and edit the database, you'll need to set up a local PostgreSQL instance. Then, you can edit the connection string in `.env` 37 | 38 | [Prisma's TypeScript getting started guide](https://www.prisma.io/docs/getting-started/quickstart-typescript) 39 | 40 | ### puppeteer-extra 41 | 42 | This package is used to extend the base functionalities of puppeteer. 43 | 44 | It is required to use puppeteer-extra plugins. 45 | 46 | ### GCP Logging 47 | 48 | **Setup:** 49 | 50 | 1. Create a GCP project 51 | 2. [Enable the Cloud Logging API](https://console.cloud.google.com/marketplace/product/google/logging.googleapis.com) 52 | 3. [Create a service account](https://console.cloud.google.com/apis/api/logging.googleapis.com/credentials?folder=true&organizationId=true) 53 | - required roles: 54 | - Logging > Logs Writer 55 | - Monitoring > Monitoring Metric Writer 56 | - source: 57 | 4. Download the service account's credentials in JSON format 58 | 5. Add the key to the root of the project and rename it to "gcp-creds.json" 59 | 60 | ## Contributing 61 | 62 | **Rules:** 63 | 64 | 1. You need to respect the code format. If you are using VSCode, install the Prettier extension, which should automatically pick up the .prettierrc file. 65 | 2. All contributions are accepted. Documentation, code, etc... 66 | 67 | ### What's next 68 | 69 | - [ ] Add stealth utils (jitter, stealth mouse movements) 70 | - [ ] Add stealth measures where there are `// STEALTH` comments. These measures should be activated thanks to an env variable. 
It would add delay, so not great for development. 71 | 72 | ## Troubleshooting 73 | 74 | ### My project is slow to use since VSCode updated ! 75 | 76 | Check VSCode's and TypeScript's issues. 77 | 78 | See: https://github.com/microsoft/TypeScript/issues/43249 79 | 80 | ## License 81 | 82 | This package is licensed under the MIT license. 83 | 84 | ## Contact 85 | 86 | Feel free to raise an issue. 87 | -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | /* Visit https://aka.ms/tsconfig.json to read more about this file */ 4 | 5 | /* Basic Options */ 6 | // "incremental": true, /* Enable incremental compilation */ 7 | "target": "esnext" /* Specify ECMAScript target version: 'ES3' (default), 'ES5', 'ES2015', 'ES2016', 'ES2017', 'ES2018', 'ES2019', 'ES2020', or 'ESNEXT'. */, 8 | "module": "esnext" /* Specify module code generation: 'none', 'commonjs', 'amd', 'system', 'umd', 'es2015', 'es2020', or 'ESNext'. */, 9 | // "lib": [], /* Specify library files to be included in the compilation. */ 10 | // "allowJs": true, /* Allow javascript files to be compiled. */ 11 | // "checkJs": true, /* Report errors in .js files. */ 12 | // "jsx": "preserve", /* Specify JSX code generation: 'preserve', 'react-native', 'react', 'react-jsx' or 'react-jsxdev'. */ 13 | // "declaration": true, /* Generates corresponding '.d.ts' file. */ 14 | // "declarationMap": true, /* Generates a sourcemap for each corresponding '.d.ts' file. */ 15 | 16 | // for cleaner stacktraces 17 | "sourceMap": true /* Generates corresponding '.map' file. */, 18 | // "outFile": "./", /* Concatenate and emit output to single file. */ 19 | "outDir": "./lib" /* Redirect output structure to the directory. */, 20 | // "rootDir": "./", /* Specify the root directory of input files. Use to control the output directory structure with --outDir. 
*/ 21 | // "composite": true, /* Enable project compilation */ 22 | // "tsBuildInfoFile": "./", /* Specify file to store incremental compilation information */ 23 | // "removeComments": true, /* Do not emit comments to output. */ 24 | // "noEmit": true, /* Do not emit outputs. */ 25 | // "importHelpers": true, /* Import emit helpers from 'tslib'. */ 26 | // "downlevelIteration": true, /* Provide full support for iterables in 'for-of', spread, and destructuring when targeting 'ES5' or 'ES3'. */ 27 | // "isolatedModules": true, /* Transpile each file as a separate module (similar to 'ts.transpileModule'). */ 28 | 29 | /* Strict Type-Checking Options */ 30 | "strict": true /* Enable all strict type-checking options. */, 31 | // "noImplicitAny": true, /* Raise error on expressions and declarations with an implied 'any' type. */ 32 | // "strictNullChecks": true, /* Enable strict null checks. */ 33 | // "strictFunctionTypes": true, /* Enable strict checking of function types. */ 34 | // "strictBindCallApply": true, /* Enable strict 'bind', 'call', and 'apply' methods on functions. */ 35 | // "strictPropertyInitialization": true, /* Enable strict checking of property initialization in classes. */ 36 | // "noImplicitThis": true, /* Raise error on 'this' expressions with an implied 'any' type. */ 37 | // "alwaysStrict": true, /* Parse in strict mode and emit "use strict" for each source file. */ 38 | 39 | /* Additional Checks */ 40 | "noUnusedLocals": true /* Report errors on unused locals. */, 41 | "noUnusedParameters": true /* Report errors on unused parameters. */, 42 | "noImplicitReturns": true /* Report error when not all code paths in function return a value. */, 43 | "noFallthroughCasesInSwitch": true /* Report errors for fallthrough cases in switch statement. 
*/, 44 | // "noUncheckedIndexedAccess": true, /* Include 'undefined' in index signature results */ 45 | // "noPropertyAccessFromIndexSignature": true, /* Require undeclared properties from index signatures to use element accesses. */ 46 | 47 | /* Module Resolution Options */ 48 | "moduleResolution": "node" /* Specify module resolution strategy: 'node' (Node.js) or 'classic' (TypeScript pre-1.6). */, 49 | // "baseUrl": "./", /* Base directory to resolve non-absolute module names. */ 50 | // "paths": {}, /* A series of entries which re-map imports to lookup locations relative to the 'baseUrl'. */ 51 | // "rootDirs": [], /* List of root folders whose combined content represents the structure of the project at runtime. */ 52 | // "typeRoots": [], /* List of folders to include type definitions from. */ 53 | // "types": [], /* Type declaration files to be included in compilation. */ 54 | "allowSyntheticDefaultImports": true /* Allow default imports from modules with no default export. This does not affect code emit, just typechecking. */, 55 | "esModuleInterop": true /* Enables emit interoperability between CommonJS and ES Modules via creation of namespace objects for all imports. Implies 'allowSyntheticDefaultImports'. */, 56 | // "preserveSymlinks": true, /* Do not resolve the real path of symlinks. */ 57 | // "allowUmdGlobalAccess": true, /* Allow accessing UMD globals from modules. */ 58 | 59 | /* Source Map Options */ 60 | // "sourceRoot": "", /* Specify the location where debugger should locate TypeScript files instead of source locations. */ 61 | // "mapRoot": "", /* Specify the location where debugger should locate map files instead of generated locations. */ 62 | // "inlineSourceMap": true, /* Emit a single file with source maps instead of having a separate file. */ 63 | // "inlineSources": true, /* Emit the source alongside the sourcemaps within a single file; requires '--inlineSourceMap' or '--sourceMap' to be set. 
*/ 64 | 65 | /* Experimental Options */ 66 | // "experimentalDecorators": true /* Enables experimental support for ES7 decorators. */, 67 | // "emitDecoratorMetadata": true /* Enables experimental support for emitting type metadata for decorators. */, 68 | 69 | /* Advanced Options */ 70 | "skipLibCheck": true /* Skip type checking of declaration files. */, 71 | "forceConsistentCasingInFileNames": true /* Disallow inconsistently-cased references to the same file. */ 72 | } 73 | } 74 | --------------------------------------------------------------------------------