├── _config.yml ├── media ├── google-dom.jpg ├── logo_text.png ├── sample_code.png ├── ycombinator.png ├── sample_result.png └── carbon-config.json ├── .gitignore ├── tsconfig.json ├── examples ├── google.ts └── amazon.ts ├── package.json ├── src ├── index.ts ├── types.ts └── validate.ts └── readme.md /_config.yml: -------------------------------------------------------------------------------- 1 | theme: jekyll-theme-cayman -------------------------------------------------------------------------------- /media/google-dom.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capturr/scraper/HEAD/media/google-dom.jpg -------------------------------------------------------------------------------- /media/logo_text.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capturr/scraper/HEAD/media/logo_text.png -------------------------------------------------------------------------------- /media/sample_code.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capturr/scraper/HEAD/media/sample_code.png -------------------------------------------------------------------------------- /media/ycombinator.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capturr/scraper/HEAD/media/ycombinator.png -------------------------------------------------------------------------------- /media/sample_result.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/capturr/scraper/HEAD/media/sample_result.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /node_modules 2 | /package-lock.json 3 | /bin 4 | /crawlers/debug 5 | /.vscode 
6 | /*.code-workspace 7 | /test.ts -------------------------------------------------------------------------------- /tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "esModuleInterop": true, 4 | "moduleResolution": "node", 5 | "outDir": "./bin", 6 | "skipLibCheck": true, 7 | "allowJs": true, 8 | "declaration": true, 9 | "downlevelIteration": true 10 | }, 11 | "include": [ 12 | "./src" 13 | ], 14 | "exclude": [ 15 | "node_modules" 16 | ] 17 | } -------------------------------------------------------------------------------- /media/carbon-config.json: -------------------------------------------------------------------------------- 1 | {"paddingVertical":"0px","paddingHorizontal":"0px","backgroundImage":null,"backgroundImageSelection":null,"backgroundMode":"color","backgroundColor":"rgba(255,255,255,0)","dropShadow":false,"dropShadowOffsetY":"20px","dropShadowBlurRadius":"68px","theme":"night-owl","windowTheme":"none","language":"auto","fontFamily":"Hack","fontSize":"14px","lineHeight":"133%","windowControls":true,"widthAdjustment":true,"lineNumbers":false,"firstLineNumber":1,"exportSize":"2x","watermark":false,"squaredImage":false,"hiddenCharacters":false,"name":"","width":680} -------------------------------------------------------------------------------- /examples/google.ts: -------------------------------------------------------------------------------- 1 | import Scraper, { $, TExtractedPrice } from '../src'; 2 | const page = new Scraper('API_KEY'); 3 | 4 | type TGoogleResults = { 5 | price: TExtractedPrice, 6 | results: { 7 | url: string, 8 | title: string 9 | }[] 10 | } 11 | 12 | // Scrape Google search results for "bitcoin" 13 | page.get("https://www.google.com/search?q=bitcoin", { device: "desktop" }, { 14 | // Extract the current bitcoin price 15 | price: $("#search .obcontainer .card-section > div:eq(1)").filter("price"), 16 | // For each Google search result 17 | results: 
$("h2:contains('Web results') + div").each({ 18 | // We retrieve the URL 19 | url: $("a[href]").attr("href").filter("url"), 20 | // ... And the title text 21 | title: $("h3") 22 | }) 23 | }).then( data => { 24 | 25 | console.dir(data, { depth: null }); 26 | 27 | }); -------------------------------------------------------------------------------- /examples/amazon.ts: -------------------------------------------------------------------------------- 1 | import Scraper, { $, TExtractedPrice } from '../src'; 2 | const page = new Scraper('API_KEY'); 3 | 4 | type TReview = { 5 | author: string, 6 | title: string 7 | } 8 | 9 | type TAmazonResults = { 10 | title: string, 11 | price: TExtractedPrice, 12 | image: string, 13 | reviews: { 14 | rating?: string, 15 | list: TReview[] 16 | } 17 | } 18 | 19 | // Scrape Amazon search results for "bitcoin" 20 | page.get("https://www.amazon.com/dp/B08L76BSZ5", { device: 'mobile', withHeaders: true }, { 21 | 22 | title: $("#title"), 23 | price: $("#corePrice_feature_div .a-offscreen:first").filter("price"), 24 | image: $("#main-image").attr("src").filter("url"), 25 | 26 | reviews: { 27 | rating: $(".cr-widget-Acr [data-hook='average-stars-rating-text']").optional(), 28 | list: $("#cm-cr-dp-aw-review-list > [data-hook='mobley-review-content']").each({ 29 | author: $(".a-profile-name"), 30 | title: $("[data-hook='review-title']") 31 | }) 32 | } 33 | 34 | }).then( data => { 35 | 36 | console.dir(data, { depth: null }); 37 | 38 | }); -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "scrapingapi", 3 | "author": "Gaetan Le Gac ", 4 | "description": "One API to scrape All the Web.", 5 | "keywords": [ 6 | "scraping", 7 | "crawler", 8 | "crawling", 9 | "crawl", 10 | "captcha", 11 | "bot", 12 | "robot", 13 | "proxy", 14 | "spider", 15 | "scraper", 16 | "web", 17 | "html", 18 | "extract", 19 | "data" 20 | 
], 21 | "version": "0.3.1", 22 | "license": "MIT", 23 | "private": false, 24 | "main": "bin/index.js", 25 | "files": [ 26 | "bin" 27 | ], 28 | "repository": { 29 | "type": "git", 30 | "url": "git://github.com/scrapingapi/scraper.git" 31 | }, 32 | "scripts": { 33 | "build": "tsc", 34 | "watch": "tsc -w", 35 | "prepare": "npm run build" 36 | }, 37 | "dependencies": { 38 | "get-root-domain": "^0.0.1", 39 | "request": "^2.88.2", 40 | "validator": "^13.7.0" 41 | }, 42 | "devDependencies": { 43 | "@types/node": "^16.11.9", 44 | "@types/request": "^2.48.8", 45 | "@types/validator": "^13.7.0", 46 | "typescript": "^4.3.5" 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /src/index.ts: -------------------------------------------------------------------------------- 1 | /*---------------------------------- 2 | - DEPENDANCES 3 | ----------------------------------*/ 4 | 5 | // Node.js 6 | import request from 'request'; 7 | 8 | // Internal 9 | import validate from './validate'; 10 | export { default as validate } from './validate'; 11 | 12 | /*---------------------------------- 13 | - TYPE 14 | ----------------------------------*/ 15 | 16 | import { 17 | 18 | TGlobalOptions, 19 | TAdapter, 20 | 21 | TRequestWithExtractors, 22 | TRequestWithBody, 23 | TScrapeResult, 24 | 25 | TExtractor, 26 | ValueExtractor 27 | } from './types'; 28 | 29 | export type { TExtractedPrice } from './types'; 30 | 31 | type TOptions = Omit; 32 | 33 | /*---------------------------------- 34 | - VARIOUS DELCARATIONS 35 | ----------------------------------*/ 36 | 37 | const local = process.argv.includes('-local'); 38 | 39 | class ApiError extends Error { 40 | public constructor( public code: number, message: string ) { 41 | super(message); 42 | } 43 | } 44 | 45 | const defaultAdapter: TAdapter = (options) => new Promise((resolve, reject) => request({ 46 | ...options, 47 | json: true 48 | }, (error, response) => { 49 | 50 | if (response && 
response.statusCode !== 200) 51 | error = new ApiError( response.statusCode, response.body ); 52 | 53 | if (error) { 54 | reject(error); 55 | return; 56 | } 57 | 58 | resolve(response.body); 59 | 60 | })); 61 | 62 | /*---------------------------------- 63 | - SCRAPER 64 | ----------------------------------*/ 65 | export default class Scraper { 66 | 67 | public constructor( public apiKey: string, private options: TGlobalOptions = {} ) {} 68 | 69 | public scrape( requests: TRequestWithExtractors[] ): Promise[]> { 70 | const sendRequest = this.options.adapter || defaultAdapter; 71 | return sendRequest({ 72 | method: 'POST', 73 | url: local ? 'http://localhost:3011/v0' : 'https://scrapingapi.io/v0', 74 | headers: { 75 | 'content-type': 'application/json', 76 | 'accepted': 'application/json', 77 | 'Authorization': this.apiKey, 78 | }, 79 | body: { 80 | requests: validate(requests) 81 | }, 82 | }); 83 | } 84 | 85 | public get( 86 | url: string, 87 | options?: TOptions, 88 | extract?: TExtractor | ValueExtractor 89 | ): Promise> { 90 | return this.scrape([{ method: 'GET', url, extract, ...options }]).then( res => res[0] ); 91 | } 92 | 93 | public post( 94 | url: string, 95 | body: TRequestWithBody["body"], 96 | bodyType: TRequestWithBody["bodyType"], 97 | options?: TOptions, 98 | extract?: TExtractor | ValueExtractor 99 | ): Promise> { 100 | return this.scrape([{ method: 'POST', url, extract, body, bodyType, ...options }]).then( res => res[0] ); 101 | } 102 | 103 | } 104 | 105 | export const $ = (selector: string) => new ValueExtractor(selector) 106 | 107 | /*module.exports = (apiKey: string) => ({ 108 | page: new Scraper(apiKey), 109 | $: $ 110 | })*/ -------------------------------------------------------------------------------- /src/types.ts: -------------------------------------------------------------------------------- 1 | /*---------------------------------- 2 | - CONST 3 | ----------------------------------*/ 4 | 5 | export const allowedMethods = ["GET", "POST"] 
as const; 6 | export const bodyTypes = ["form", "json"] as const; 7 | export const dataFilters = ["url", 'price'] as const; 8 | export const devices = ['desktop', 'tablet', 'mobile'] as const; 9 | 10 | /*---------------------------------- 11 | - GLOBAL CONFIGURATION TYPES 12 | ----------------------------------*/ 13 | 14 | export type TAdapter = (options: { 15 | method: HttpMethod, 16 | url: string, 17 | headers: {[k: string]: string}, 18 | body: { 19 | requests: TRequestWithExtractors[] 20 | } 21 | }) => Promise; 22 | 23 | export type TGlobalOptions = { 24 | adapter?: TAdapter 25 | } 26 | 27 | /*---------------------------------- 28 | - REQUEST CONFIGURATION TYPES 29 | ----------------------------------*/ 30 | 31 | export type TBasicRequest = { 32 | url: string, 33 | method?: HttpMethod, 34 | cookies?: string, 35 | device?: typeof devices[number] 36 | } 37 | 38 | export type HttpMethod = typeof allowedMethods[number]; 39 | 40 | export type TRequestWithBody = TBasicRequest & { 41 | body: { [key: string]: any }, 42 | bodyType: typeof bodyTypes[number] 43 | } 44 | 45 | export type TRequest = TBasicRequest | TRequestWithBody; 46 | 47 | export type TRequestWithExtractors = TRequest & { 48 | extract?: TExtractor, 49 | withBody?: boolean, 50 | withHeaders?: boolean, 51 | } 52 | 53 | /*---------------------------------- 54 | - SCRAPER 55 | ----------------------------------*/ 56 | 57 | export type TExtractor = TItemsExtractor | TValueExtractor | ValueExtractor | TItemsIterator; 58 | 59 | export type TSelector = "this" | string; 60 | export type TAttribute = "text" | "html" | string; 61 | export type TFilter = typeof dataFilters[number]; 62 | 63 | export type TValueExtractor = { 64 | select: TSelector, 65 | attr?: TAttribute, 66 | required?: boolean, 67 | filters?: TFilter[] 68 | } 69 | 70 | export type TItemsIterator = { 71 | $foreach?: string, 72 | items: { 73 | [name: string]: TExtractor 74 | } 75 | } 76 | 77 | export type TItemsExtractor = { 78 | [name: string]: 
TExtractor 79 | } 80 | 81 | export class ValueExtractor { 82 | 83 | public options: TValueExtractor; 84 | 85 | public constructor( select: TSelector ) { 86 | this.options = { 87 | select 88 | } 89 | } 90 | 91 | public attr( attribute: TAttribute ) { 92 | this.options.attr = attribute; 93 | return this; 94 | } 95 | 96 | public each( values: { [name: string]: TExtractor } ): TItemsIterator { 97 | return { $foreach: this.options.select, items: values }; 98 | } 99 | 100 | public filter( ...filterNames: TFilter[] ) { 101 | 102 | if (this.options.filters === undefined) 103 | this.options.filters = filterNames; 104 | else 105 | for (const filterName of filterNames) { 106 | if (this.options.filters.includes( filterName )) 107 | throw new Error(`The ${this.filter} filter has already be set for this selector.`); 108 | else 109 | this.options.filters.push(filterName); 110 | } 111 | 112 | return this; 113 | } 114 | 115 | public optional( isOptional: boolean = true ) { 116 | this.options.required = !isOptional; 117 | return this; 118 | } 119 | 120 | public html() { 121 | this.options.attr = 'html'; 122 | return this; 123 | } 124 | 125 | public text() { 126 | this.options.attr = 'html'; 127 | return this; 128 | } 129 | 130 | } 131 | 132 | /*---------------------------------- 133 | - RESPONSE 134 | ----------------------------------*/ 135 | export type TExtractedPrice = { 136 | amount: number, 137 | currency: string 138 | } 139 | 140 | export type TScrapeResult = { 141 | url: string, 142 | status: number, 143 | headers?: { [key: string]: string }, 144 | body?: string, 145 | data?: TData, 146 | time: number, 147 | bandwidth: number, 148 | } -------------------------------------------------------------------------------- /src/validate.ts: -------------------------------------------------------------------------------- 1 | /*---------------------------------- 2 | - DEPENDENCIES 3 | ----------------------------------*/ 4 | 5 | // Npm 6 | import getRootDomain from 'get-root-domain'; 7 
| import isURL from 'validator/lib/isURL'; 8 | 9 | // Interval 10 | import { 11 | /* const */allowedMethods, bodyTypes, dataFilters, 12 | /* types */TRequestWithExtractors, TExtractor , TValueExtractor, ValueExtractor 13 | } from './types'; 14 | 15 | /*---------------------------------- 16 | - TYPES 17 | ----------------------------------*/ 18 | 19 | type TObjetDonnees = {[k: string]: any}; 20 | 21 | class BadRequest extends Error {} 22 | 23 | /*---------------------------------- 24 | - CONFIG 25 | ----------------------------------*/ 26 | 27 | const reqPerCall = 3; 28 | 29 | /*---------------------------------- 30 | - METHODS 31 | ----------------------------------*/ 32 | 33 | export default (requests: TRequestWithExtractors | TObjetDonnees): TRequestWithExtractors[] => { 34 | 35 | // Type Check 36 | if (!Array.isArray( requests )) 37 | throw new BadRequest("requests must be an array. Provided: " + typeof requests); 38 | 39 | // Requests number / call 40 | const reqCount = requests.length; 41 | if (reqCount === 0) 42 | throw new BadRequest("You must provide at least one request."); 43 | if (reqCount > reqPerCall) 44 | throw new BadRequest("You can't send more than " + reqPerCall + " requests per call (" + reqCount + " given)."); 45 | 46 | // Check every request 47 | const domains: {[domain: string]: true} = {}; 48 | for (let iReq = 0; iReq < reqCount; iReq++) { 49 | 50 | const req = requests[iReq]; 51 | 52 | // Type 53 | if (typeof req !== "object" || req === null) 54 | throw new BadRequest("requests must be an array of requests object, but requests[" + iReq + "] is an " + typeof req); 55 | 56 | // Method 57 | if (req.method === undefined) 58 | req.method = 'GET'; 59 | else if(!allowedMethods.includes(req.method)) 60 | throw new BadRequest("Only the following HTTP methods are currently allowed: " + allowedMethods.join(', ')); 61 | 62 | // URL 63 | if (typeof req.url !== "string" || !isURL(req.url, { 64 | require_protocol: true, 65 | require_valid_protocol: true, 
66 | protocols: ['http', 'https'], 67 | require_host: true, 68 | require_port: false, 69 | allow_protocol_relative_urls: false, 70 | })) 71 | throw new BadRequest("The url parameter must be a valid URL string: protocol (http or https) + domain + path (optional)"); 72 | 73 | // Unique domain 74 | if (reqCount !== 1) { 75 | 76 | const domain = getRootDomain(req.url); 77 | if (domains[domain] === true) 78 | throw new BadRequest("When you send multiple requests in one call, each requets must point to different domain names. However, you're sending 2 requests to " + domain + "."); 79 | 80 | domains[domain] = true; 81 | 82 | } 83 | 84 | // Cookies 85 | if (req.cookies !== undefined) { 86 | 87 | // Type 88 | if (typeof req.cookies !== 'string') 89 | throw new BadRequest("The cookie parameter must be a string. Example: user=Bob; age=28;"); 90 | 91 | } 92 | 93 | // body 94 | if (req.body !== undefined) { 95 | 96 | // Bodytype 97 | if (req.bodyType === undefined) 98 | throw new BadRequest("The bodyType parameter must be provided when the body parameter is specified."); 99 | if (!bodyTypes.includes(req.bodyType)) 100 | throw new BadRequest("Invalid value for the bodyType parameter. 
Allowed values: " + bodyTypes.join(', ')); 101 | 102 | // Type 103 | if (typeof req.body !== 'object' || req.body.constructor.name !== 'Object') 104 | throw new BadRequest("The body parameter must be an object."); 105 | } 106 | 107 | if (req.extract !== undefined) 108 | req.extract = validateExtractors(req.extract, 'extract'); 109 | 110 | if (req.withBody !== undefined && typeof req.withBody !== "boolean") 111 | throw new BadRequest(`The withBody parameter must be a boolean.`); 112 | 113 | if (req.withHeaders !== undefined && typeof req.withHeaders !== "boolean") 114 | throw new BadRequest(`The withHeaders parameter must be a boolean.`); 115 | 116 | } 117 | 118 | return requests as TRequestWithExtractors[]; 119 | 120 | } 121 | 122 | export const isValueExtractor = (extract: TValueExtractor | TObjetDonnees): extract is TValueExtractor => 123 | ('select' in extract) 124 | 125 | const validateExtractors = (extract: ValueExtractor | TValueExtractor | TObjetDonnees, path: string): TExtractor => { 126 | 127 | if (!extract || Array.isArray(extract) || typeof extract !== 'object') 128 | throw new BadRequest("The " + path + " option must be an object (" + typeof extract + " given)."); 129 | 130 | if (extract instanceof ValueExtractor) 131 | extract = extract.options; 132 | 133 | // The two required options in TValueExtractor 134 | if (isValueExtractor(extract)) { 135 | 136 | /* 137 | extract: { 138 | select: "h4", 139 | attr: "text", 140 | required: true, 141 | filters: ["title"] 142 | } 143 | */ 144 | 145 | const { 146 | select, 147 | attr = 'text', 148 | required = true, 149 | filters = [] 150 | } = extract; 151 | 152 | if (typeof select !== 'string') 153 | throw new BadRequest(`The ${path}.select option must be a string.`); 154 | 155 | if (typeof attr !== 'string') 156 | throw new BadRequest(`The ${path}.attr option must be a string.`); 157 | 158 | if (typeof required !== 'boolean') 159 | throw new BadRequest(`The ${path}.required option must be a boolean.`); 160 | 161 | 
if (!Array.isArray( filters )) 162 | throw new BadRequest(`The ${path}.filters option must be an array of strings.`); 163 | 164 | for (const filter of filters) 165 | if (!dataFilters.includes( filter )) 166 | throw new BadRequest("The filter \"" + filter + "\" you gave in " + path + " does not exists. Possible filters: " + dataFilters.join(', ')); 167 | 168 | } else { 169 | 170 | /* 171 | extract: { 172 | "$foreach": ".sh-dgr__content", 173 | "name": { 174 | select: "h4", 175 | attr: "text", 176 | required: true, 177 | filters: ["title"] 178 | } 179 | ... 180 | } 181 | */ 182 | 183 | if (Object.keys(extract).length === 0) 184 | throw new BadRequest("The " + path + " parameter must contain at least one entry."); 185 | 186 | // For each extractor 187 | for (const name in extract) { 188 | 189 | // Foreach 190 | if (name !== '$foreach') 191 | extract[name] = validateExtractors(extract[name], path + '.' + name); 192 | else if (typeof extract[name] !== "string") 193 | throw new BadRequest("When specified, the " + path + ".$foreach parameter must be a CSS selector string."); 194 | 195 | 196 | } 197 | } 198 | 199 | return extract; 200 | 201 | } -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | -------------- 2 | # This project has been moved to [datasaucer/api](https://github.com/datasaucer/api). 3 | ------ 4 |

5 | 6 | ScrapingAPI Logo 7 | 8 |

9 | 10 |

One powerful API to scrape all the web

11 | 12 |

13 | Easily scrape data from any website, without worrying about captchas and bot detection mechanisms. 14 |

15 | 16 | 17 |
18 | 19 | ![version](https://img.shields.io/github/package-json/v/scrapingapi/scraper) 20 | [![npm](https://img.shields.io/npm/dm/scrapingapi)](https://www.npmjs.com/package/scrapingapi) 21 | [![discord](https://img.shields.io/discord/956821594372714546?label=Discord)](https://discord.gg/m7KWXcBaBu) 22 |          23 | 24 | Tweet 26 | 27 | Share on LinkedIn 29 | 30 | Share on Hacker News 32 | 33 |
34 | 35 |

36 | Website • 37 | Discord • 38 | ⭐ Give a Star 39 |

40 | 41 |

42 | 43 | How does ScraperAPI works 44 | How does ScraperAPI works 45 | 46 |

47 | 48 | ## Features 49 | 50 | * No captcha, no bot detection. Websites will see you as a human. 51 | * Integrated [**data extraction**](#extractors): 52 | * Easily extract data with CSS / jQuery-like selectors 53 | * Use filters to get ultra-clean data: url, price, ... 54 | * Iterate through items (ex: search results, products list, articles, ...) 55 | * **Bulk requests**: Up to 3 per call 56 | * Post json / form-encoded body 57 | * Set request device, headers and cookies 58 | * Returns **response body, headers, final URL & status code** 59 | * Typescript typings 60 | 61 | ----------- 62 | 63 |

64 | Do you like this project ? Please let me know, 65 | ⭐ Give it a Star :) 66 |

67 | 68 | ------------ 69 | 70 | ## Get started in 5 minutes chrono 71 | 72 | 1. **Install** the package 73 | ```console 74 | npm install --save scrapingapi 75 | ``` 76 | If you're a Yarn guy: 77 | ```console 78 | yarn add --save scrapingapi 79 | ``` 80 | 81 | 2. Create your free [API Key](https://scrapingapi.io/?utm_source=github&utm_medium=readme&utm_campaign=getstarted) 82 | 83 | 3. Make your first request (example below 👇) 84 | 85 | ## Simple Usage Example 86 | 87 | Here is an example of scraping **current Bitcoin price + search results** from Google Search. 88 | 89 | ```javascript 90 | import Scraper, { $ } from 'scrapingapi'; 91 | const page = new Scraper('API_KEY'); 92 | 93 | // Scrape Google search results for "bitcoin" 94 | page.get("https://www.google.com/search?q=bitcoin", { device: "desktop" }, { 95 | // Extract the current bitcoin price 96 | price: $("#search .obcontainer .card-section > div:eq(1)").filter("price"), 97 | // For each Google search result 98 | results: $("h2:contains('Web results') + div").each({ 99 | // We retrieve the URL 100 | url: $("a[href]").attr("href").filter("url"), 101 | // ... And the title text 102 | title: $("h3") 103 | }) 104 | }).then( data => { 105 | 106 | console.log("Here are the results:", data); 107 | 108 | }); 109 | ``` 110 | 111 | The `Scraper.get` method sends a **GET request** to the provided URL, and automatically extract the data you asked: the price and the results. 112 | 113 | ![Google Search Example](media/google-dom.jpg "Google Search Example") 114 | 115 | In the data parameter, you will get a [TScrapeResult](src/types.ts#L107) object, containing the scraping results. 
116 | 117 | ```json 118 | { 119 | "url": "https://www.google.com/search?q=bitcoin", 120 | "status": 200, 121 | "time": 2.930, 122 | "bandwidth": 26.33, 123 | "data": { 124 | "price": { 125 | "amount": 49805.02, 126 | "currency": "EUR" 127 | }, 128 | "results": [{ 129 | "url": "https://bitcoin.org/", 130 | "title": "Bitcoin - Open source P2P money" 131 | }, { 132 | "url": "https://coinmarketcap.com/currencies/bitcoin/", 133 | "title": "Bitcoin price today, BTC to USD live, marketcap and chart" 134 | }, { 135 | "url": "https://www.bitcoin.com/", 136 | "title": "Bitcoin.com | Buy BTC, ETH & BCH | Wallet, news, markets ..." 137 | }, { 138 | "url": "https://en.wikipedia.org/wiki/Bitcoin", 139 | "title": "Bitcoin - Wikipedia" 140 | }] 141 | } 142 | } 143 | ``` 144 | 145 | ### Use Typescript 146 | 147 | Take advantage of the power of typescript by typing your response data: 148 | 149 | ```typescript 150 | import Scraper, { $, TExtractedPrice } from '../src'; 151 | const page = new Scraper('API_KEY'); 152 | 153 | type BitcoinGoogleResults = { 154 | // Metadata generated by the price filter 155 | price: TExtractedPrice, 156 | // An array containing an informations object for each Google search result 157 | results: { 158 | url: string, 159 | title: string 160 | }[] 161 | } 162 | 163 | page.get("https://www.google.com/search?q=bitcoin").then( ... ); 164 | ``` 165 | 166 | ----------- 167 | 168 |

169 | Do you like this project ? Please let me know, 170 | ⭐ Give it a Star :) 171 |

172 | 173 | ------------ 174 | 175 | # Documentation / Guide 176 | 177 | Let's consider we want to scrape an Amazon product page to retrieve the following info: 178 | 179 | * Product info 180 | * Title 181 | * Current price 182 | * Image URL 183 | * Reviews 184 | * Average rating 185 | * List of reviews 186 | 187 | Ready? Let's start step by step: 188 | 189 | 1. [Make the **Request**](#request) 190 | - [**Method**: GET, POST](#request-methods) 191 | - [**Options**: device, cookies, body, withBody, withHeaders](#request-options) 192 | 2. [**Extract** your data](#extractors) 193 | - [Simple values](#value-extractor) 194 | - [Filters and Validators](#item-extractor) 195 | - [Optional values](#item-extractor) 196 | 3. [**Iterate** through lists](#response) 197 | 4. [Handle the **Response**](#response) 198 | 5. [Another **Example**](#another-example) 199 | 200 | ## 1. Make the Request 201 | 202 | ### 1.1 Request Methods 203 | 204 | This SDK provides one method per supported HTTP method: 205 | 206 | * GET: [See the definition](src/index.ts#66) 207 | ```typescript 208 | page.get( url, options, extractor ); 209 | ``` 210 | * POST: [See the definition](src/index.ts#74) 211 | ```typescript 212 | page.post( url, body, bodyType, options, extractor ); 213 | ``` 214 | * Bulk requests: 215 | With the `scrape` method, you can also send up to **3 requests per call** if each of them points to different domain names. 216 | [See the definition](src/index.ts#38) 217 | ```typescript 218 | page.scrape( requests ); 219 | ``` 220 | 221 |
Show Example 222 |

223 | 224 | For our example, we only need to make a get request. 225 | 226 | ```typescript 227 | page.get( "https://www.amazon.com/dp/B08L76BSZ5", , ); 228 | ``` 229 | 230 |

231 |
232 | 233 | ### 1.2 Request Options 234 | 235 | ```typescript 236 | page.get( url, options, extractor ); 237 | ^^^^^^^ 238 | ``` 239 | 240 | Depending on your needs, you can change some settings for your request: 241 | 242 | * **device** (string): Which user-agent do you want to use for your request: `desktop`, `mobile` or `tablet` 243 | ```json 244 | { "device": "mobile" } 245 | ``` 246 | * **cookies** (string): The cookie string you want to pass to the request. Example: 247 | ```json 248 | { "cookies": "sessionId=34; userId=87;" } 249 | ``` 250 | * **withBody** (boolean): If you want to get the page HTML in the response. Default: `false` 251 | ```json 252 | { "withBody": true } 253 | ``` 254 | * **withHeaders** (boolean): If you want to retrieve the response headers. Default: `false` 255 | ```json 256 | { "withHeaders": true } 257 | ``` 258 | 259 | For POST requests only: 260 | 261 | * **body** (object): The data to send in your POST request. Must be combined with bodyType. 262 | ```json 263 | { "body": { "name": "bob", "age": 25 } } 264 | ``` 265 | * **bodyType** (string): In which format do you want to POST your data: `form` or `json` 266 | ```json 267 | { "bodyType": "form" } 268 | ``` 269 | 270 | #### Practical Example 271 | 272 |
Show the Example 273 |

274 | 275 | Here, we will simulate a mobile device, because the mobile version of Amazon is easier to scrape given that there are less elements on the page. We will also retrieve the response headers. 276 | 277 | ```typescript 278 | page.get("https://www.amazon.com/dp/B08L76BSZ5", { device: 'mobile', withHeaders: true }, ); 279 | ``` 280 | 281 |

282 |
283 | 284 | ## 2. Extract your data 285 | 286 | We're now at the most interesting part: how to extract & filter values, and how to iterate items. 287 | 288 | ```typescript 289 | page.get( url, options, extractor ); 290 | ^^^^^^^^^ 291 | ``` 292 | 293 | ### 2.1 Extract a value 294 | 295 | Let's start with the basics: extract a single information from the webpage. 296 | 297 | Extractors are simple javascript objects, were you can associate a `key` (the name of your data) to a `value selector`. 298 | The following example will extract the text content of the element that matches given selector: 299 | 300 | ```typescript 301 | { 302 | : $( ) 303 | } 304 | ``` 305 | 306 | Here you have two elements: 307 | 308 | 1. The **Key**: You can choose any name for the key, but it should not: 309 | 310 | * Start by a `$` 311 | * Be a reserved key: `select` is the one and only reserved key for the moment 312 | 313 | 2. The **Selector** of the element which contains the information you want to extract. 314 | To create a value selector will use the `$()` function. If you've already used jQuery, it should look a bit familiar :) 315 | And for the attribute you put in the `$()` function, it's a CSS-like / jQuery-like selector that matches the element you want to extract the value. 316 |
Show examples 317 |

318 | 319 | - `$("h3")`: Simply matches all `h3` elements 320 | - Matches: 321 | ```html 322 |

This is a title

323 | ``` 324 | - Do not matches because it's not a `h3` element: 325 | ```html 326 |

Hello

327 | ``` 328 | - `$("a.myLink[href]")`: Matches `a` elements having the class `myLink`, and where the `href` attribute is defined 329 | - Matches: 330 | ```html 331 | Link Text 332 | ``` 333 | - Do not matches, because it doesn't contains the `myLink` class 334 | ```html 335 | Link Text 336 | ``` 337 | - `$("h2:contains('Scraping API') + div")`: Matches `div` elements that are next to `h2` elements where the content is equal to `Scraping API` 338 | - Matches: 339 | ```html 340 |

Scraping API

341 |
is cool
342 | ``` 343 | - Do not matches, because the `div` element is not next to the `h2` element 344 | ```html 345 |

Scraping API

346 |

is maybe not

347 |
well configured
348 | ``` 349 | Don't hesitate to go deeper by checking theses references: 350 | * [CSS selectors](https://www.w3schools.com/cssref/css_selectors.asp) 351 | * [jQuery selectors](https://www.w3schools.com/jquery/jquery_ref_selectors.asp) 352 | 353 |

354 |
355 | 356 | But instead of extracting the text content of the element, you can also extract the HTML content. 357 | For that, simply use the `.html()` method: 358 | 359 | ```typescript 360 | { 361 | : $( ).html() 362 | ^^^^^^^ 363 | } 364 | ``` 365 | 366 | It's also possible to extract any other [HTML attributes](https://www.w3schools.com/tags/ref_attributes.asp): `href`, `class`, `src`, etc ... 367 | 368 | ```typescript 369 | { 370 | : $( ).attr( ) 371 | ^^^^^^^^^^^^^^^^^^^ 372 | } 373 | ``` 374 | 375 | #### Practical Example 376 | 377 |
Show the Example 378 |

379 | 380 | Let's start by extracting the product info: 381 | 382 | * Title 383 | * Current price 384 | * Image URL 385 | * Rating 386 | 387 | ```typescript 388 | page.get("https://www.amazon.com/dp/B08L76BSZ5", { device: 'mobile', withHeaders: true }, { 389 | 390 | title: $("#title"), 391 | price: $("#corePrice_feature_div .a-offscreen:first"), 392 | image: $("#main-image").attr("src"), 393 | reviews: { 394 | rating: $(".cr-widget-Acr [data-hook='average-stars-rating-text']") 395 | } 396 | 397 | }); 398 | ``` 399 | 400 | Pretty easy, isn't it ? 🙂 401 | With this code, you will get the following data: 402 | 403 | ```json 404 | { 405 | "title": "sportbull Unisex 3D Printed Graphics Novelty Casual Short Sleeve T-Shirts Tees", 406 | "price": "$9.99", 407 | "image": "https://m.media-amazon.com/images/I/71c3pFtZywL._AC_AC_SY350_QL65_.jpg", 408 | "reviews": { 409 | "rating": "4.4 out of 5" 410 | } 411 | } 412 | ``` 413 | 414 | That's cool, but here we have two problems: 415 | 416 | * The price is a string, and we need to parse it if we want to separate the price amount from the currency. 417 | In a perfect world, we could simply make a 418 | ```typescript 419 | const amount = parseFloat( data.price.substring(1) ); 420 | ``` 421 | to get the amount. 422 | Yes, but depending on any factors, he price format could vary in `9.99 USD`, `9.99 dollars incl. taxes`, `$9.99 USD free shipping`, etc ... 423 | In addition, what warranties you that the price element systematically contains a price ? For some reaosns, we could have another random value. 424 | We want to build something strong, so we need to solve this issue. 425 | * Same issue with the image URL, we need to filter and validate it to be sure we have an URL in the correct form. 426 | 427 | That's a great transition to see how you can filter the data you've extracted. 428 | 429 | 430 |

431 |
432 | 433 | ## 2.2. Filter the data 434 | 435 | To ensure that the data we've extracted matches what we're expecting, we can specify filters for each selector: 436 | 437 | ```typescript 438 | $( ).attr( ).filter( ) 439 | ^^^^^^^^^^^^^^^^^^^^^^^^ 440 | ``` 441 | 442 | For the moment, we only support two filters: 443 | 444 | * **url**: Checks if the value is a URL. If the URL is relative, it will be transformed into an absolute URL. 445 | * **price**: Powered by the [price-extract](https://github.com/scrapingapi/price-extract) package, this filter ensures that the value expresses a price, autodetects the currency ISO code, and separates the amount from the currency. 446 | It will give you an object with the price info: 447 | ```json 448 | { "amount": 9.99, "currency": "USD" } 449 | ``` 450 | 451 | 💡 **If you want me to add another filter, please don't hesitate to share your proposal by [submitting an issue](https://github.com/scrapingapi/price-extract/issues).** Thank you ! 452 | 453 |
Show the Example 454 |

455 | 456 | To come back on our Amazon example, we will simply add filters on the `price` and `image` data: 457 | 458 | ```typescript 459 | page.get("https://www.amazon.com/dp/B08L76BSZ5", { device: 'mobile', withHeaders: true }, { 460 | 461 | title: $("#title"), 462 | price: $("#corePrice_feature_div .a-offscreen:first").filter("price"), 463 | ^^^^^^^^^^^^^^^^ 464 | image: $("#main-image").attr("src").filter("url"), 465 | ^^^^^^^^^^^^^^ 466 | reviews: { 467 | rating: $(".cr-widget-Acr [data-hook='average-stars-rating-text']") 468 | } 469 | }); 470 | ``` 471 | 472 | By running this code, you will get the following data: 473 | 474 | ```json 475 | { 476 | "title": "sportbull Unisex 3D Printed Graphics Novelty Casual Short Sleeve T-Shirts Tees", 477 | "price": { "amount": 9.99, "currency": "USD" }, 478 | "image": "https://m.media-amazon.com/images/I/71c3pFtZywL._AC_AC_SY350_QL65_.jpg", 479 | "reviews": { 480 | "rating": "4.4 out of 5" 481 | } 482 | } 483 | ``` 484 | 485 | We get clean price data, and we're certain that `image` is an URL. 486 | 487 |

488 |
489 | 490 | ## 2.3. Optional values 491 | 492 | By default, all values you will select are required. That means we absolutely want this value to be present in the item, otherwise this item will be excluded from the response. 493 | 494 | But you can of course make a value optional: 495 | 496 | ```typescript 497 | $( ).attr( ).optional() 498 | ^^^^^^^^^^^ 499 | ``` 500 | 501 | When a value is optional and it has not been found on the scraped page, you will get an undefined value. 502 | 503 | Here are the reasons why a value could not be found: 504 | 505 | * The selector does not match any element in the page 506 | * The attribute you want to retrieve does not exist 507 | * The value is empty 508 | * The filter has rejected the value 509 | 510 | ### Practical Example 511 | 512 |
Show the Example 513 |

514 | 515 | Let's consider that the average rating is not necessarily present on the page. 516 | Even if we're not able to get this info, we still want to retrieve all the other values. 517 | So, we have to make the `reviews.rating` data optional. 518 | 519 | ```typescript 520 | page.get("https://www.amazon.com/dp/B08L76BSZ5", { device: 'mobile', withHeaders: true }, { 521 | 522 | title: $("#title"), 523 | price: $("#corePrice_feature_div .a-offscreen:first").filter("price"), 524 | image: $("#main-image").attr("src").filter("url"), 525 | reviews: { 526 | rating: $(".cr-widget-Acr [data-hook='average-stars-rating-text']").optional() 527 | ^^^^^^^^^^^ 528 | } 529 | 530 | }); 531 | ``` 532 | 533 | If `reviews.rating` has not been found, you will get the following data: 534 | 535 | ```json 536 | { 537 | "title": "sportbull Unisex 3D Printed Graphics Novelty Casual Short Sleeve T-Shirts Tees", 538 | "price": { "amount": 9.99, "currency": "USD" }, 539 | "image": "https://m.media-amazon.com/images/I/71c3pFtZywL._AC_AC_SY350_QL65_.jpg", 540 | "reviews": {} 541 | } 542 | ``` 543 | 544 |

545 |
546 | 547 | ### And next ? 548 | 549 | Now, you know how to extract high quality data from webpages. 550 | But what if we want to extract lists, like search results, product lists, blog articles, etc ... ? 551 | That's the next topic 👇 552 | 553 | ## 3. Iterate through lists 554 | 555 | The scrapingapi SDK allows you to extract every item that matches a selector. 556 | Again, it's highly inspired by the jQuery API: 557 | 558 | ```typescript 559 | $( ).each( ); 560 | ``` 561 | 562 | Firstly, you have to provide the **items selector** which will match all the DOM elements you want to iterate. 563 | Then, you specify the values you want to extract for each element that will be iterated, like we've seen previously. 564 | 565 | 💡 All the selectors you provide to extract the `values` will be executed inside the `items selector`. 566 | 567 | ### The `this` selector 568 | 569 | In the `values`, if the selector is `"this"`, it will make reference to the items selector. 570 | 571 | For example: 572 | 573 | ```typescript 574 | $("> ul.tags > li").each({ 575 | text: $("this") 576 | }) 577 | ``` 578 | 579 | The `text` value will be the text content of every `> ul.tags > li` element. 580 | 581 | ### Practical Example 582 | 583 |
Show the Example 584 |

585 | 586 | We can now extract every review item: 587 | 588 | ```typescript 589 | page.get("https://www.amazon.com/dp/B08L76BSZ5", { device: 'mobile', withHeaders: true }, { 590 | 591 | title: $("#title"), 592 | price: $("#corePrice_feature_div .a-offscreen:first").filter("price"), 593 | image: $("#main-image").attr("src").filter("url"), 594 | reviews: { 595 | rating: $(".cr-widget-Acr [data-hook='average-stars-rating-text']").optional(), 596 | list: $("#cm-cr-dp-aw-review-list > [data-hook='mobley-review-content']").each({ 597 | author: $(".a-profile-name"), 598 | title: $("[data-hook='review-title']") 599 | }) 600 | } 601 | }); 602 | ``` 603 | 604 | You will get the following result: 605 | 606 | ```json 607 | { 608 | "title": "sportbull Unisex 3D Printed Graphics Novelty Casual Short Sleeve T-Shirts Tees", 609 | "price": { "amount": 9.99, "currency": "USD" }, 610 | "image": "https://m.media-amazon.com/images/I/71c3pFtZywL._AC_AC_SY350_QL65_.jpg", 611 | "reviews": { 612 | "rating": "4.4 out of 5", 613 | "list": [ 614 | { "author": "Jon", "title": "Great shirt; very trippy" }, 615 | { "author": "LK", "title": "Birthday Gift for Bartender Son" }, 616 | { "author": "Yessica suazo", "title": "Decepcionada del producto que recibi" }, 617 | { "author": "Avid Reader", "title": "Worth it." }, 618 | { "author": "Nancy K", "title": "Husband loves it!" }, 619 | { "author": "Mychelle", "title": "Gets Noticed" }, 620 | { "author": "Suzy M. Lewis", "title": "Wish I had bought a size up" }, 621 | { "author": "Devann Shultz", "title": "Great buy!" } 622 | ] 623 | } 624 | } 625 | ``` 626 | 627 |

628 |
629 | 630 | ## 5. Handle the Response 631 | 632 | Every time you launch a request, you will receive a response following this format: 633 | 634 | ```typescript 635 | type Response = { 636 | // The final URL after all the redirections 637 | url: string, 638 | // The scraped page status code 639 | status: number, 640 | // The scraped page headers (must provide the withHeaders option) 641 | headers?: { [key: string]: string }, 642 | // The page HTML (when the withBody option is true) 643 | body?: string, 644 | // The extracted data, if you provided extractors 645 | data?: object, 646 | // The time, in seconds, your request took to be resolved from our server 647 | // The communication delays between your app and our servers are ignored 648 | time: number, 649 | // The used bandwidth, in kb 650 | bandwidth: number } 651 | ``` 652 | 653 | ### Optimize the response time 654 | 655 | Every option uses additional CPU resources, slows down communication inside scrapingapi's network, and increases your response size. 656 | 657 | That's why it's better to use as few options as possible to make your responses faster. 658 | 659 | ----------- 660 | 661 |

662 | Do you like this project ? Please let me know, 663 | ⭐ Give it a Star :) 664 |

665 | 666 | ------------ 667 | 668 | ## Another example 669 | 670 | ![https://wallpapercave.com/wp/wp4014371.jpg](https://wallpapercave.com/wp/wp4014371.jpg) 671 | 672 | Consider that `http://example.com/products` responds with a webpage containing the following HTML: 673 | 674 | ```html 675 |

Space Cat Holograms to motive you programming

676 |

Free shipping to all the Milky Way.

677 |
678 | 679 |
680 | 681 |

Sandwich cat lost on a burger rocket

682 | 123.45 $ 683 |
    684 |
  • sandwich
  • 685 |
  • burger
  • 686 |
  • rocket
  • 687 |
688 |
689 | 690 |
691 | 692 |

Aliens can't sleep because of this cute DJ

693 |
    694 |
  • aliens
  • 695 |
  • sleep
  • 696 |
  • cute
  • 697 |
  • dj
  • 698 |
  • music
  • 699 |
700 |
701 | 702 |
703 | 704 |

Travelling at the speed of light with a radioactive spaceship

705 |

706 | Warning: Contains Plutonium. 707 |

708 | 456.78 $ 709 |
    710 |
  • pizza
  • 711 |
  • slice
  • 712 |
  • spaceship
  • 713 |
714 |
715 | 716 |
717 | 718 |

Gentleman dropped his litter into a black hole

719 |

720 | Since he found this calm planet. 721 |

722 | undefined 723 |
    724 |
  • luxury
  • 725 |
  • litter
  • 726 |
727 |
728 |
729 | ``` 730 | 731 | Let's extract the product list: 732 | 733 | ```typescript 734 | type Product = { 735 | name: string, 736 | image: string, 737 | price: { amount: number, currency: string }, 738 | tags: { text: string }[], 739 | description?: string 740 | } 741 | 742 | scraper.get("http://example.com/products", {}, $("#products > article.product").each({ 743 | 744 | name: $("> h3"), 745 | image: $("> img").attr("src").filter("url"), 746 | price: $("> .price").filter("price"), 747 | tags: $("> ul.tags > li").each({ 748 | text: $("this") 749 | }), 750 | description: $("> .details").optional() 751 | 752 | })); 753 | ``` 754 | 755 | Here is the response: 756 | 757 | ```json 758 | { 759 | "url": "http://example.com/products", 760 | "status": 200, 761 | "data": [{ 762 | "name": "Sandwich cat lost on a burger rocket", 763 | "image": "https://wallpapercave.com/wp/wp4014371.jpg", 764 | "price": { "amount": 123.45, "currency": "USD" }, 765 | "tags": [ 766 | { "text": "sandwich" }, 767 | { "text": "burger" }, 768 | { "text": "rocket" } 769 | ] 770 | },{ 771 | "name": "Travelling at the speed of light with a radioactive spaceship", 772 | "image": "https://wallpapercave.com/wp/wp4575192.jpg", 773 | "price": { "amount": 456.78, "currency": "USD" }, 774 | "tags": [ 775 | { "text": "pizza" }, 776 | { "text": "slice" }, 777 | { "text": "spaceship" } 778 | ] 779 | }] 780 | } 781 | ``` 782 | 783 | Did you notice ? In the request, the `price` data has been marked as required. But for two HTML elements we've iterated with the `.each()` instruction, **the extractor wasn't able to extract the price**. 784 | 785 | * `Aliens can't sleep because of this cute DJ` doesn't contain any element that matches `> .price` 786 | * `Gentleman dropped his litter into a black hole` contains a `.price` element, but the content text doesn't represent a price 787 | 788 | ----------- 789 | 790 | # About 791 | 792 | ## Need any additional information or help ? 
793 | 794 | * Check whether a related issue [has already been created](https://github.com/scrapingapi/scraper/issues) 795 | * If not, feel free to [create a new issue](https://github.com/scrapingapi/scraper/issues/new) 796 | * For more personal questions, or for professional inquiries: 797 |
798 | Send me an email 799 | 800 | `contact@gaetan-legac.fr` 801 |
802 | 803 | ## Credits 804 | 805 | For any complaint about abused kittens that has been sent to the deep space, see it with [WallpaperCave](https://wallpapercave.com/space-cat-wallpapers). 806 | --------------------------------------------------------------------------------