├── _config.yml
├── media
├── google-dom.jpg
├── logo_text.png
├── sample_code.png
├── ycombinator.png
├── sample_result.png
└── carbon-config.json
├── .gitignore
├── tsconfig.json
├── examples
├── google.ts
└── amazon.ts
├── package.json
├── src
├── index.ts
├── types.ts
└── validate.ts
└── readme.md
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
--------------------------------------------------------------------------------
/media/google-dom.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capturr/scraper/HEAD/media/google-dom.jpg
--------------------------------------------------------------------------------
/media/logo_text.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capturr/scraper/HEAD/media/logo_text.png
--------------------------------------------------------------------------------
/media/sample_code.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capturr/scraper/HEAD/media/sample_code.png
--------------------------------------------------------------------------------
/media/ycombinator.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capturr/scraper/HEAD/media/ycombinator.png
--------------------------------------------------------------------------------
/media/sample_result.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/capturr/scraper/HEAD/media/sample_result.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /node_modules
2 | /package-lock.json
3 | /bin
4 | /crawlers/debug
5 | /.vscode
6 | /*.code-workspace
7 | /test.ts
--------------------------------------------------------------------------------
/tsconfig.json:
--------------------------------------------------------------------------------
1 | {
2 | "compilerOptions": {
3 | "esModuleInterop": true,
4 | "moduleResolution": "node",
5 | "outDir": "./bin",
6 | "skipLibCheck": true,
7 | "allowJs": true,
8 | "declaration": true,
9 | "downlevelIteration": true
10 | },
11 | "include": [
12 | "./src"
13 | ],
14 | "exclude": [
15 | "node_modules"
16 | ]
17 | }
--------------------------------------------------------------------------------
/media/carbon-config.json:
--------------------------------------------------------------------------------
1 | {"paddingVertical":"0px","paddingHorizontal":"0px","backgroundImage":null,"backgroundImageSelection":null,"backgroundMode":"color","backgroundColor":"rgba(255,255,255,0)","dropShadow":false,"dropShadowOffsetY":"20px","dropShadowBlurRadius":"68px","theme":"night-owl","windowTheme":"none","language":"auto","fontFamily":"Hack","fontSize":"14px","lineHeight":"133%","windowControls":true,"widthAdjustment":true,"lineNumbers":false,"firstLineNumber":1,"exportSize":"2x","watermark":false,"squaredImage":false,"hiddenCharacters":false,"name":"","width":680}
--------------------------------------------------------------------------------
/examples/google.ts:
--------------------------------------------------------------------------------
1 | import Scraper, { $, TExtractedPrice } from '../src';
2 | const page = new Scraper('API_KEY');
3 |
4 | type TGoogleResults = {
5 | price: TExtractedPrice,
6 | results: {
7 | url: string,
8 | title: string
9 | }[]
10 | }
11 |
12 | // Scrape Google search results for "bitcoin"
13 | page.get("https://www.google.com/search?q=bitcoin", { device: "desktop" }, {
14 | // Extract the current bitcoin price
15 | price: $("#search .obcontainer .card-section > div:eq(1)").filter("price"),
16 | // For each Google search result
17 | results: $("h2:contains('Web results') + div").each({
18 | // We retrieve the URL
19 | url: $("a[href]").attr("href").filter("url"),
20 | // ... And the title text
21 | title: $("h3")
22 | })
23 | }).then( data => {
24 |
25 | console.dir(data, { depth: null });
26 |
27 | });
--------------------------------------------------------------------------------
/examples/amazon.ts:
--------------------------------------------------------------------------------
1 | import Scraper, { $, TExtractedPrice } from '../src';
2 | const page = new Scraper('API_KEY');
3 |
4 | type TReview = {
5 | author: string,
6 | title: string
7 | }
8 |
9 | type TAmazonResults = {
10 | title: string,
11 | price: TExtractedPrice,
12 | image: string,
13 | reviews: {
14 | rating?: string,
15 | list: TReview[]
16 | }
17 | }
18 |
19 | // Scrape Amazon search results for "bitcoin"
20 | page.get("https://www.amazon.com/dp/B08L76BSZ5", { device: 'mobile', withHeaders: true }, {
21 |
22 | title: $("#title"),
23 | price: $("#corePrice_feature_div .a-offscreen:first").filter("price"),
24 | image: $("#main-image").attr("src").filter("url"),
25 |
26 | reviews: {
27 | rating: $(".cr-widget-Acr [data-hook='average-stars-rating-text']").optional(),
28 | list: $("#cm-cr-dp-aw-review-list > [data-hook='mobley-review-content']").each({
29 | author: $(".a-profile-name"),
30 | title: $("[data-hook='review-title']")
31 | })
32 | }
33 |
34 | }).then( data => {
35 |
36 | console.dir(data, { depth: null });
37 |
38 | });
--------------------------------------------------------------------------------
/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "scrapingapi",
3 | "author": "Gaetan Le Gac ",
4 | "description": "One API to scrape All the Web.",
5 | "keywords": [
6 | "scraping",
7 | "crawler",
8 | "crawling",
9 | "crawl",
10 | "captcha",
11 | "bot",
12 | "robot",
13 | "proxy",
14 | "spider",
15 | "scraper",
16 | "web",
17 | "html",
18 | "extract",
19 | "data"
20 | ],
21 | "version": "0.3.1",
22 | "license": "MIT",
23 | "private": false,
24 | "main": "bin/index.js",
25 | "files": [
26 | "bin"
27 | ],
28 | "repository": {
29 | "type": "git",
30 | "url": "git://github.com/scrapingapi/scraper.git"
31 | },
32 | "scripts": {
33 | "build": "tsc",
34 | "watch": "tsc -w",
35 | "prepare": "npm run build"
36 | },
37 | "dependencies": {
38 | "get-root-domain": "^0.0.1",
39 | "request": "^2.88.2",
40 | "validator": "^13.7.0"
41 | },
42 | "devDependencies": {
43 | "@types/node": "^16.11.9",
44 | "@types/request": "^2.48.8",
45 | "@types/validator": "^13.7.0",
46 | "typescript": "^4.3.5"
47 | }
48 | }
49 |
--------------------------------------------------------------------------------
/src/index.ts:
--------------------------------------------------------------------------------
1 | /*----------------------------------
2 | - DEPENDENCIES
3 | ----------------------------------*/
4 |
5 | // Node.js
6 | import request from 'request';
7 |
8 | // Internal
9 | import validate from './validate';
10 | export { default as validate } from './validate';
11 |
12 | /*----------------------------------
13 | - TYPE
14 | ----------------------------------*/
15 |
16 | import {
17 |
18 | TGlobalOptions,
19 | TAdapter,
20 |
21 | TRequestWithExtractors,
22 | TRequestWithBody,
23 | TScrapeResult,
24 |
25 | TExtractor,
26 | ValueExtractor
27 | } from './types';
28 |
29 | export type { TExtractedPrice } from './types';
30 |
31 | type TOptions = Omit;
32 |
33 | /*----------------------------------
34 | - VARIOUS DECLARATIONS
35 | ----------------------------------*/
36 |
37 | const local = process.argv.includes('-local');
38 |
39 | class ApiError extends Error {
40 | public constructor( public code: number, message: string ) {
41 | super(message);
42 | }
43 | }
44 |
/**
 * Default HTTP transport, built on the `request` package.
 * Sends the API call and resolves with the parsed JSON response body;
 * rejects with an ApiError when the API answers with a non-200 status.
 */
const defaultAdapter: TAdapter = (options) => new Promise((resolve, reject) => request({
    ...options,
    json: true // parse the response body as JSON (and serialize the request body)
}, (error, response) => {

    // A non-200 status becomes an ApiError carrying the API's message body
    if (response && response.statusCode !== 200)
        error = new ApiError( response.statusCode, response.body );

    if (error) {
        reject(error);
        return;
    }

    resolve(response.body);

}));
61 |
62 | /*----------------------------------
63 | - SCRAPER
64 | ----------------------------------*/
65 | export default class Scraper {
66 |
67 | public constructor( public apiKey: string, private options: TGlobalOptions = {} ) {}
68 |
69 | public scrape( requests: TRequestWithExtractors[] ): Promise[]> {
70 | const sendRequest = this.options.adapter || defaultAdapter;
71 | return sendRequest({
72 | method: 'POST',
73 | url: local ? 'http://localhost:3011/v0' : 'https://scrapingapi.io/v0',
74 | headers: {
75 | 'content-type': 'application/json',
76 | 'accepted': 'application/json',
77 | 'Authorization': this.apiKey,
78 | },
79 | body: {
80 | requests: validate(requests)
81 | },
82 | });
83 | }
84 |
85 | public get(
86 | url: string,
87 | options?: TOptions,
88 | extract?: TExtractor | ValueExtractor
89 | ): Promise> {
90 | return this.scrape([{ method: 'GET', url, extract, ...options }]).then( res => res[0] );
91 | }
92 |
93 | public post(
94 | url: string,
95 | body: TRequestWithBody["body"],
96 | bodyType: TRequestWithBody["bodyType"],
97 | options?: TOptions,
98 | extract?: TExtractor | ValueExtractor
99 | ): Promise> {
100 | return this.scrape([{ method: 'POST', url, extract, body, bodyType, ...options }]).then( res => res[0] );
101 | }
102 |
103 | }
104 |
105 | export const $ = (selector: string) => new ValueExtractor(selector)
106 |
107 | /*module.exports = (apiKey: string) => ({
108 | page: new Scraper(apiKey),
109 | $: $
110 | })*/
--------------------------------------------------------------------------------
/src/types.ts:
--------------------------------------------------------------------------------
1 | /*----------------------------------
2 | - CONST
3 | ----------------------------------*/
4 |
5 | export const allowedMethods = ["GET", "POST"] as const;
6 | export const bodyTypes = ["form", "json"] as const;
7 | export const dataFilters = ["url", 'price'] as const;
8 | export const devices = ['desktop', 'tablet', 'mobile'] as const;
9 |
10 | /*----------------------------------
11 | - GLOBAL CONFIGURATION TYPES
12 | ----------------------------------*/
13 |
14 | export type TAdapter = (options: {
15 | method: HttpMethod,
16 | url: string,
17 | headers: {[k: string]: string},
18 | body: {
19 | requests: TRequestWithExtractors[]
20 | }
21 | }) => Promise;
22 |
23 | export type TGlobalOptions = {
24 | adapter?: TAdapter
25 | }
26 |
27 | /*----------------------------------
28 | - REQUEST CONFIGURATION TYPES
29 | ----------------------------------*/
30 |
31 | export type TBasicRequest = {
32 | url: string,
33 | method?: HttpMethod,
34 | cookies?: string,
35 | device?: typeof devices[number]
36 | }
37 |
38 | export type HttpMethod = typeof allowedMethods[number];
39 |
40 | export type TRequestWithBody = TBasicRequest & {
41 | body: { [key: string]: any },
42 | bodyType: typeof bodyTypes[number]
43 | }
44 |
45 | export type TRequest = TBasicRequest | TRequestWithBody;
46 |
47 | export type TRequestWithExtractors = TRequest & {
48 | extract?: TExtractor,
49 | withBody?: boolean,
50 | withHeaders?: boolean,
51 | }
52 |
53 | /*----------------------------------
54 | - SCRAPER
55 | ----------------------------------*/
56 |
57 | export type TExtractor = TItemsExtractor | TValueExtractor | ValueExtractor | TItemsIterator;
58 |
59 | export type TSelector = "this" | string;
60 | export type TAttribute = "text" | "html" | string;
61 | export type TFilter = typeof dataFilters[number];
62 |
63 | export type TValueExtractor = {
64 | select: TSelector,
65 | attr?: TAttribute,
66 | required?: boolean,
67 | filters?: TFilter[]
68 | }
69 |
70 | export type TItemsIterator = {
71 | $foreach?: string,
72 | items: {
73 | [name: string]: TExtractor
74 | }
75 | }
76 |
77 | export type TItemsExtractor = {
78 | [name: string]: TExtractor
79 | }
80 |
81 | export class ValueExtractor {
82 |
83 | public options: TValueExtractor;
84 |
85 | public constructor( select: TSelector ) {
86 | this.options = {
87 | select
88 | }
89 | }
90 |
91 | public attr( attribute: TAttribute ) {
92 | this.options.attr = attribute;
93 | return this;
94 | }
95 |
96 | public each( values: { [name: string]: TExtractor } ): TItemsIterator {
97 | return { $foreach: this.options.select, items: values };
98 | }
99 |
100 | public filter( ...filterNames: TFilter[] ) {
101 |
102 | if (this.options.filters === undefined)
103 | this.options.filters = filterNames;
104 | else
105 | for (const filterName of filterNames) {
106 | if (this.options.filters.includes( filterName ))
107 | throw new Error(`The ${this.filter} filter has already be set for this selector.`);
108 | else
109 | this.options.filters.push(filterName);
110 | }
111 |
112 | return this;
113 | }
114 |
115 | public optional( isOptional: boolean = true ) {
116 | this.options.required = !isOptional;
117 | return this;
118 | }
119 |
120 | public html() {
121 | this.options.attr = 'html';
122 | return this;
123 | }
124 |
125 | public text() {
126 | this.options.attr = 'html';
127 | return this;
128 | }
129 |
130 | }
131 |
132 | /*----------------------------------
133 | - RESPONSE
134 | ----------------------------------*/
135 | export type TExtractedPrice = {
136 | amount: number,
137 | currency: string
138 | }
139 |
140 | export type TScrapeResult = {
141 | url: string,
142 | status: number,
143 | headers?: { [key: string]: string },
144 | body?: string,
145 | data?: TData,
146 | time: number,
147 | bandwidth: number,
148 | }
--------------------------------------------------------------------------------
/src/validate.ts:
--------------------------------------------------------------------------------
1 | /*----------------------------------
2 | - DEPENDENCIES
3 | ----------------------------------*/
4 |
5 | // Npm
6 | import getRootDomain from 'get-root-domain';
7 | import isURL from 'validator/lib/isURL';
8 |
9 | // Internal
10 | import {
11 | /* const */allowedMethods, bodyTypes, dataFilters,
12 | /* types */TRequestWithExtractors, TExtractor , TValueExtractor, ValueExtractor
13 | } from './types';
14 |
15 | /*----------------------------------
16 | - TYPES
17 | ----------------------------------*/
18 |
19 | type TObjetDonnees = {[k: string]: any};
20 |
21 | class BadRequest extends Error {}
22 |
23 | /*----------------------------------
24 | - CONFIG
25 | ----------------------------------*/
26 |
27 | const reqPerCall = 3;
28 |
29 | /*----------------------------------
30 | - METHODS
31 | ----------------------------------*/
32 |
33 | export default (requests: TRequestWithExtractors | TObjetDonnees): TRequestWithExtractors[] => {
34 |
35 | // Type Check
36 | if (!Array.isArray( requests ))
37 | throw new BadRequest("requests must be an array. Provided: " + typeof requests);
38 |
39 | // Requests number / call
40 | const reqCount = requests.length;
41 | if (reqCount === 0)
42 | throw new BadRequest("You must provide at least one request.");
43 | if (reqCount > reqPerCall)
44 | throw new BadRequest("You can't send more than " + reqPerCall + " requests per call (" + reqCount + " given).");
45 |
46 | // Check every request
47 | const domains: {[domain: string]: true} = {};
48 | for (let iReq = 0; iReq < reqCount; iReq++) {
49 |
50 | const req = requests[iReq];
51 |
52 | // Type
53 | if (typeof req !== "object" || req === null)
54 | throw new BadRequest("requests must be an array of requests object, but requests[" + iReq + "] is an " + typeof req);
55 |
56 | // Method
57 | if (req.method === undefined)
58 | req.method = 'GET';
59 | else if(!allowedMethods.includes(req.method))
60 | throw new BadRequest("Only the following HTTP methods are currently allowed: " + allowedMethods.join(', '));
61 |
62 | // URL
63 | if (typeof req.url !== "string" || !isURL(req.url, {
64 | require_protocol: true,
65 | require_valid_protocol: true,
66 | protocols: ['http', 'https'],
67 | require_host: true,
68 | require_port: false,
69 | allow_protocol_relative_urls: false,
70 | }))
71 | throw new BadRequest("The url parameter must be a valid URL string: protocol (http or https) + domain + path (optional)");
72 |
73 | // Unique domain
74 | if (reqCount !== 1) {
75 |
76 | const domain = getRootDomain(req.url);
77 | if (domains[domain] === true)
78 | throw new BadRequest("When you send multiple requests in one call, each requets must point to different domain names. However, you're sending 2 requests to " + domain + ".");
79 |
80 | domains[domain] = true;
81 |
82 | }
83 |
84 | // Cookies
85 | if (req.cookies !== undefined) {
86 |
87 | // Type
88 | if (typeof req.cookies !== 'string')
89 | throw new BadRequest("The cookie parameter must be a string. Example: user=Bob; age=28;");
90 |
91 | }
92 |
93 | // body
94 | if (req.body !== undefined) {
95 |
96 | // Bodytype
97 | if (req.bodyType === undefined)
98 | throw new BadRequest("The bodyType parameter must be provided when the body parameter is specified.");
99 | if (!bodyTypes.includes(req.bodyType))
100 | throw new BadRequest("Invalid value for the bodyType parameter. Allowed values: " + bodyTypes.join(', '));
101 |
102 | // Type
103 | if (typeof req.body !== 'object' || req.body.constructor.name !== 'Object')
104 | throw new BadRequest("The body parameter must be an object.");
105 | }
106 |
107 | if (req.extract !== undefined)
108 | req.extract = validateExtractors(req.extract, 'extract');
109 |
110 | if (req.withBody !== undefined && typeof req.withBody !== "boolean")
111 | throw new BadRequest(`The withBody parameter must be a boolean.`);
112 |
113 | if (req.withHeaders !== undefined && typeof req.withHeaders !== "boolean")
114 | throw new BadRequest(`The withHeaders parameter must be a boolean.`);
115 |
116 | }
117 |
118 | return requests as TRequestWithExtractors[];
119 |
120 | }
121 |
122 | export const isValueExtractor = (extract: TValueExtractor | TObjetDonnees): extract is TValueExtractor =>
123 | ('select' in extract)
124 |
/**
 * Recursively validates one extractor node.
 * A node is either a value extractor ({ select, attr?, required?, filters? }),
 * or a map of named sub-extractors, optionally with a $foreach iterator selector.
 * ValueExtractor builder instances are unwrapped into their plain options object.
 * @param extract The extractor node to validate.
 * @param path Dotted path of the node, used in error messages.
 * @returns The normalized extractor node.
 * @throws BadRequest when any option has the wrong type or an unknown filter is used.
 */
const validateExtractors = (extract: ValueExtractor | TValueExtractor | TObjetDonnees, path: string): TExtractor => {

    if (!extract || Array.isArray(extract) || typeof extract !== 'object')
        throw new BadRequest("The " + path + " option must be an object (" + typeof extract + " given).");

    // Unwrap the fluent builder into its plain options object
    if (extract instanceof ValueExtractor)
        extract = extract.options;

    // The two required options in TValueExtractor
    if (isValueExtractor(extract)) {

        /*
            extract: {
                select: "h4",
                attr: "text",
                required: true,
                filters: ["title"]
            }
        */

        // Apply the documented defaults before checking types
        const {
            select,
            attr = 'text',
            required = true,
            filters = []
        } = extract;

        if (typeof select !== 'string')
            throw new BadRequest(`The ${path}.select option must be a string.`);

        if (typeof attr !== 'string')
            throw new BadRequest(`The ${path}.attr option must be a string.`);

        if (typeof required !== 'boolean')
            throw new BadRequest(`The ${path}.required option must be a boolean.`);

        if (!Array.isArray( filters ))
            throw new BadRequest(`The ${path}.filters option must be an array of strings.`);

        // Every filter must be one of the supported dataFilters
        for (const filter of filters)
            if (!dataFilters.includes( filter ))
                throw new BadRequest("The filter \"" + filter + "\" you gave in " + path + " does not exists. Possible filters: " + dataFilters.join(', '));

    } else {

        /*
            extract: {
                "$foreach": ".sh-dgr__content",
                "name": {
                    select: "h4",
                    attr: "text",
                    required: true,
                    filters: ["title"]
                }
                ...
            }
        */

        if (Object.keys(extract).length === 0)
            throw new BadRequest("The " + path + " parameter must contain at least one entry.");

        // For each extractor: recurse into named children; $foreach stays a plain selector string
        for (const name in extract) {

            // Foreach
            if (name !== '$foreach')
                extract[name] = validateExtractors(extract[name], path + '.' + name);
            else if (typeof extract[name] !== "string")
                throw new BadRequest("When specified, the " + path + ".$foreach parameter must be a CSS selector string.");


        }
    }

    return extract;

}
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | --------------
2 | # This project has been moved to [datasaucer/api](https://github.com/datasaucer/api).
3 | ------
4 |
5 |
6 |
7 |
8 |
9 |
10 | One powerful API to scrape all the web
11 |
12 |
13 | Easily scrape data from any website, without worrying about captchas and bot detection mechanisms.
14 |
15 |
16 |
17 |
18 |
19 | 
20 | [](https://www.npmjs.com/package/scrapingapi)
21 | [](https://discord.gg/m7KWXcBaBu)
22 |
23 |
24 | 
26 |
27 | 
29 |
30 | 
32 |
33 |
34 |
35 |
36 | Website •
37 | Discord •
38 | ⭐ Give a Star
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 | ## Features
49 |
50 | * No captcha, no bot detection. Websites will see you as a human.
51 | * Integrated [**data extraction**](#extractors):
52 | * Easily extract data with CSS / jQuery-like selectors
53 | * Use filters to get ultra-clean data: url, price, ...
54 | * Iterate through items (ex: search results, products list, articles, ...)
55 | * **Bulk requests**: Up to 3 per call
56 | * Post json / form-encoded body
57 | * Set request device, headers and cookies
58 | * Returns **response body, headers, final URL & status code**
59 | * Typescript typings
60 |
61 | -----------
62 |
63 |
64 | Do you like this project ? Please let me know,
65 | ⭐ Give it a Star :)
66 |
67 |
68 | ------------
69 |
70 | ## Get started in 5 minutes
71 |
72 | 1. **Install** the package
73 | ```console
74 | npm install --save scrapingapi
75 | ```
76 | If you're a Yarn guy:
77 | ```console
78 | yarn add scrapingapi
79 | ```
80 |
81 | 2. Create your free [API Key](https://scrapingapi.io/?utm_source=github&utm_medium=readme&utm_campaign=getstarted)
82 |
83 | 3. Make your first request (example below 👇)
84 |
85 | ## Simple Usage Example
86 |
87 | Here is an example of scraping **current Bitcoin price + search results** from Google Search.
88 |
89 | ```javascript
90 | import Scraper, { $ } from 'scrapingapi';
91 | const page = new Scraper('API_KEY');
92 |
93 | // Scrape Google search results for "bitcoin"
94 | page.get("https://www.google.com/search?q=bitcoin", { device: "desktop" }, {
95 | // Extract the current bitcoin price
96 | price: $("#search .obcontainer .card-section > div:eq(1)").filter("price"),
97 | // For each Google search result
98 | results: $("h2:contains('Web results') + div").each({
99 | // We retrieve the URL
100 | url: $("a[href]").attr("href").filter("url"),
101 | // ... And the title text
102 | title: $("h3")
103 | })
104 | }).then( data => {
105 |
106 | console.log("Here are the results:", data);
107 |
108 | });
109 | ```
110 |
111 | The `Scraper.get` method sends a **GET request** to the provided URL, and automatically extract the data you asked: the price and the results.
112 |
113 | 
114 |
115 | In the data parameter, you will get a [TScrapeResult](src/types.ts#L107) object, containing the scraping results.
116 |
117 | ```json
118 | {
119 | "url": "https://www.google.com/search?q=bitcoin",
120 | "status": 200,
121 | "time": 2.930,
122 | "bandwidth": 26.33,
123 | "data": {
124 | "price": {
125 | "amount": 49805.02,
126 | "currency": "EUR"
127 | },
128 | "results": [{
129 | "url": "https://bitcoin.org/",
130 | "title": "Bitcoin - Open source P2P money"
131 | }, {
132 | "url": "https://coinmarketcap.com/currencies/bitcoin/",
133 | "title": "Bitcoin price today, BTC to USD live, marketcap and chart"
134 | }, {
135 | "url": "https://www.bitcoin.com/",
136 | "title": "Bitcoin.com | Buy BTC, ETH & BCH | Wallet, news, markets ..."
137 | }, {
138 | "url": "https://en.wikipedia.org/wiki/Bitcoin",
139 | "title": "Bitcoin - Wikipedia"
140 | }]
141 | }
142 | }
143 | ```
144 |
145 | ### Use Typescript
146 |
147 | Take advantage of the power of typescript by typing your response data:
148 |
149 | ```typescript
150 | import Scraper, { $, TExtractedPrice } from '../src';
151 | const page = new Scraper('API_KEY');
152 |
153 | type BitcoinGoogleResults = {
154 | // Metadata generated by the price filter
155 | price: TExtractedPrice,
156 | // An array containing an informations object for each Google search result
157 | results: {
158 | url: string,
159 | title: string
160 | }[]
161 | }
162 |
163 | page.get("https://www.google.com/search?q=bitcoin").then( ... );
164 | ```
165 |
166 | -----------
167 |
168 |
169 | Do you like this project ? Please let me know,
170 | ⭐ Give it a Star :)
171 |
172 |
173 | ------------
174 |
175 | # Documentation / Guide
176 |
177 | Let's consider we want to scrape an Amazon product page to retrieve the following info:
178 |
179 | * Product info
180 | * Title
181 | * Current price
182 | * Image URL
183 | * Reviews
184 | * Average rating
185 | * List of reviews
186 |
187 | Ready ? Let's start step by step:
188 |
189 | 1. [Make the **Request**](#request)
190 | - [**Method**: GET, POST](#request-methods)
191 | - [**Options**: device, cookies, body, withBody, withHeaders](#request-options)
192 | 2. [**Extract** your data](#extractors)
193 | - [Simple values](#value-extractor)
194 | - [Filters & Validators](#item-extractor)
195 | - [Optional values](#item-extractor)
196 | 3. [**Iterate** through lists](#response)
197 | 4. [Handle the **Response**](#response)
198 | 5. [Another **Example**](#another-example)
199 |
200 | ## 1. Make the Request
201 |
202 | ### 1.1 Request Methods
203 |
204 | This SDK provides one method per supported HTTP method:
205 |
206 | * GET: [See the definition](src/index.ts#66)
207 | ```typescript
208 | page.get( url, options, extractor );
209 | ```
210 | * POST: [See the definition](src/index.ts#74)
211 | ```typescript
212 | page.post( url, body, bodyType, options, extractor );
213 | ```
214 | * Bulk requests:
215 | With the `scrape` method, You can also send up to **3 requests per call** if each of them points to different domain names.
216 | [See the definition](src/index.ts#38)
217 | ```typescript
218 | page.scrape( requests );
219 | ```
220 |
221 | Show Example
222 |
223 |
224 | For our example, we only need to make a get request.
225 |
226 | ```typescript
227 | page.get( "https://www.amazon.com/dp/B08L76BSZ5", , );
228 | ```
229 |
230 |
231 |
232 |
233 | ### 1.2 Request Options
234 |
235 | ```typescript
236 | page.get( url, options, extractor );
237 | ^^^^^^^
238 | ```
239 |
240 | Depending on your needs, you can change some settings for your request:
241 |
242 | * **device** (string): Which user-agent do you want to use for your request: `desktop`, `mobile` or `tablet`
243 | ```json
244 | { "device": "mobile" }
245 | ```
246 | * **cookies** (string): The cookie string you want to pass to the request. Example:
247 | ```json
248 | { "cookies": "sessionId=34; userId=87;" }
249 | ```
250 | * **withBody** (boolean): If you want to get the page HTML in the response. Default: `false`
251 | ```json
252 | { "withBody": true }
253 | ```
254 | * **withHeaders** (boolean): If you want to retrieve the response headers. Default: `false`
255 | ```json
256 | { "withHeaders": true }
257 | ```
258 |
259 | For POST requests only:
260 |
261 | * **body** (object): The data to send in your POST request. Must be combined with bodyType.
262 | ```json
263 | { "body": { "name": "bob", "age": 25 } }
264 | ```
265 | * **bodyType** (string): In which format do you want to POST your data: `form` or `json`
266 | ```json
267 | { "bodyType": "form" }
268 | ```
269 |
270 | #### Practical Example
271 |
272 | Show the Example
273 |
274 |
275 | Here, we will simulate a mobile device, because the mobile version of Amazon is easier to scrape given that there are less elements on the page. We will also retrieve the response headers.
276 |
277 | ```typescript
278 | page.get("https://www.amazon.com/dp/B08L76BSZ5", { device: 'mobile', withHeaders: true }, );
279 | ```
280 |
281 |
282 |
283 |
284 | ## 2. Extract your data
285 |
286 | We're now at the most interesting part: how to extract & filter values, and how to iterate items.
287 |
288 | ```typescript
289 | page.get( url, options, extractor );
290 | ^^^^^^^^^
291 | ```
292 |
293 | ### 2.1 Extract a value
294 |
295 | Let's start with the basics: extract a single information from the webpage.
296 |
297 | Extractors are simple javascript objects, where you can associate a `key` (the name of your data) to a `value selector`.
298 | The following example will extract the text content of the element that matches given selector:
299 |
300 | ```typescript
301 | {
302 | : $( )
303 | }
304 | ```
305 |
306 | Here you have two elements:
307 |
308 | 1. The **Key**: You can choose any name for the key, but it should not:
309 |
310 | * Start by a `$`
311 | * Be a reserved key: `select` is the one and only reserved key for the moment
312 |
313 | 2. The **Selector** of the element which contains the information you want to extract.
314 | To create a value selector will use the `$()` function. If you've already used jQuery, it should look a bit familiar :)
315 | And for the attribute you put in the `$()` function, it's a CSS-like / jQuery-like selector that matches the element you want to extract the value.
316 | Show examples
317 |
318 |
319 | - `$("h3")`: Simply matches all `h3` elements
320 | - Matches:
321 | ```html
322 |
This is a title
323 | ```
324 | - Does not match because it's not a `h3` element:
325 | ```html
326 | Hello
327 | ```
328 | - `$("a.myLink[href]")`: Matches `a` elements having the class `myLink`, and where the `href` attribute is defined
329 | - Matches:
330 | ```html
331 | Link Text
332 | ```
333 | - Does not match, because it doesn't contain the `myLink` class
334 | ```html
335 | Link Text
336 | ```
337 | - `$("h2:contains('Scraping API') + div")`: Matches `div` elements that are next to `h2` elements where the content is equal to `Scraping API`
338 | - Matches:
339 | ```html
340 | Scraping API
341 | is cool
342 | ```
343 | - Does not match, because the `div` element is not next to the `h2` element
344 | ```html
345 | Scraping API
346 | is maybe not
347 | well configured
348 | ```
349 | Don't hesitate to go deeper by checking these references:
350 | * [CSS selectors](https://www.w3schools.com/cssref/css_selectors.asp)
351 | * [jQuery selectors](https://www.w3schools.com/jquery/jquery_ref_selectors.asp)
352 |
353 |
354 |
355 |
356 | But instead of extracting the text content of the element, you can also extract the HTML content.
357 | For that, simply use the `.html()` method:
358 |
359 | ```typescript
360 | {
361 | : $( ).html()
362 | ^^^^^^^
363 | }
364 | ```
365 |
366 | It's also possible to extract any other [HTML attributes](https://www.w3schools.com/tags/ref_attributes.asp): `href`, `class`, `src`, etc ...
367 |
368 | ```typescript
369 | {
370 | : $( ).attr( )
371 | ^^^^^^^^^^^^^^^^^^^
372 | }
373 | ```
374 |
375 | #### Practical Example
376 |
377 | Show the Example
378 |
379 |
380 | Let's start by extracting the product info:
381 |
382 | * Title
383 | * Current price
384 | * Image URL
385 | * Rating
386 |
387 | ```typescript
388 | page.get("https://www.amazon.com/dp/B08L76BSZ5", { device: 'mobile', withHeaders: true }, {
389 |
390 | title: $("#title"),
391 | price: $("#corePrice_feature_div .a-offscreen:first"),
392 | image: $("#main-image").attr("src"),
393 | reviews: {
394 | rating: $(".cr-widget-Acr [data-hook='average-stars-rating-text']")
395 | }
396 |
397 | });
398 | ```
399 |
400 | Pretty easy, isn't it ? 🙂
401 | With this code, you will get the following data:
402 |
403 | ```json
404 | {
405 | "title": "sportbull Unisex 3D Printed Graphics Novelty Casual Short Sleeve T-Shirts Tees",
406 | "price": "$9.99",
407 | "image": "https://m.media-amazon.com/images/I/71c3pFtZywL._AC_AC_SY350_QL65_.jpg",
408 | "reviews": {
409 | "rating": "4.4 out of 5"
410 | }
411 | }
412 | ```
413 |
414 | That's cool, but here we have two problems:
415 |
416 | * The price is a string, and we need to parse it if we want to separate the price amount from the currency.
417 | In a perfect world, we could simply make a
418 | ```typescript
419 | const amount = parseFloat( data.price.substring(1) );
420 | ```
421 | to get the amount.
422 | Yes, but depending on many factors, the price format could vary: `9.99 USD`, `9.99 dollars incl. taxes`, `$9.99 USD free shipping`, etc ...
423 | In addition, what guarantees that the price element systematically contains a price? For some reason, we could have another random value.
424 | We want to build something strong, so we need to solve this issue.
425 | * Same issue with the image URL, we need to filter and validate it to be sure we have a URL in the correct form.
426 |
427 | That's a great transition to see how you can filter the data you've extracted.
428 |
429 |
430 |
431 |
432 |
433 | ## 2.2. Filter the data
434 |
435 | To ensure that the data we've extracted matches with what we're expecting, we can specify filters for each selector:
436 |
437 | ```typescript
438 | $( ).attr( ).filter( )
439 | ^^^^^^^^^^^^^^^^^^^^^^^^
440 | ```
441 |
442 | For the moment, we only support two filters:
443 |
444 | * **url**: Checks if the value is a URL. If the URL is relative, it will be transformed into an absolute URL.
445 | * **price**: Powered by the [price-extract](https://github.com/scrapingapi/price-extract) package, this filter ensures that the value expresses a price, autodetects the currency ISO code, and separates the amount from the currency.
446 | It will give you an object with the price info:
447 | ```json
448 | { "amount": 9.99, "currency": "USD" }
449 | ```
450 |
451 | 💡 **If you want me to add another filter, please don't hesitate to share your proposal by [submitting an issue](https://github.com/scrapingapi/price-extract/issues).** Thank you!
452 |
453 | Show the Example
454 |
455 |
456 | To come back on our Amazon example, we will simply add filters on the `price` and `image` data:
457 |
458 | ```typescript
459 | page.get("https://www.amazon.com/dp/B08L76BSZ5", { device: 'mobile', withHeaders: true }, {
460 |
461 | title: $("#title"),
462 | price: $("#corePrice_feature_div .a-offscreen:first").filter("price"),
463 | ^^^^^^^^^^^^^^^^
464 | image: $("#main-image").attr("src").filter("url"),
465 | ^^^^^^^^^^^^^^
466 | reviews: {
467 | rating: $(".cr-widget-Acr [data-hook='average-stars-rating-text']")
468 | }
469 | });
470 | ```
471 |
472 | By running this code, you will get the following data:
473 |
474 | ```json
475 | {
476 | "title": "sportbull Unisex 3D Printed Graphics Novelty Casual Short Sleeve T-Shirts Tees",
477 | "price": { "amount": 9.99, "currency": "USD" },
478 | "image": "https://m.media-amazon.com/images/I/71c3pFtZywL._AC_AC_SY350_QL65_.jpg",
479 | "reviews": {
480 | "rating": "4.4 out of 5"
481 | }
482 | }
483 | ```
484 |
485 | We get clean price data, and we're certain that `image` is a URL.
486 |
487 |
488 |
489 |
490 | ## 2.3. Optional values
491 |
492 | By default, all values you will select are required. That means we absolutely want this value to be present in the item, otherwise, this item will be excluded from the response.
493 |
494 | But you can of course make a value optional:
495 |
496 | ```typescript
497 | $( ).attr( ).optional()
498 | ^^^^^^^^^^^
499 | ```
500 |
501 | When a value is optional and it has not been found on the scraped page, you will get an undefined value.
502 |
503 | Here are the reasons why a value could not be found:
504 |
505 | * The selector does not match any element in the page
506 | * The attribute you want to retrieve does not exist
507 | * The value is empty
508 | * The filter has rejected the value
509 |
510 | ### Practical Example
511 |
512 | Show the Example
513 |
514 |
515 | Let's consider that the average rating is not necessarily present on the page.
516 | Even if we're not able to get this info, we still want to retrieve all the other values.
517 | So, we have to make the `reviews.rating` data optional.
518 |
519 | ```typescript
520 | page.get("https://www.amazon.com/dp/B08L76BSZ5", { device: 'mobile', withHeaders: true }, {
521 |
522 | title: $("#title"),
523 | price: $("#corePrice_feature_div .a-offscreen:first").filter("price"),
524 | image: $("#main-image").attr("src").filter("url"),
525 | reviews: {
526 | rating: $(".cr-widget-Acr [data-hook='average-stars-rating-text']").optional()
527 | ^^^^^^^^^^^
528 | }
529 |
530 | });
531 | ```
532 |
533 | If `reviews.rating` has not been found, you will get the following data:
534 |
535 | ```json
536 | {
537 | "title": "sportbull Unisex 3D Printed Graphics Novelty Casual Short Sleeve T-Shirts Tees",
538 | "price": { "amount": 9.99, "currency": "USD" },
539 | "image": "https://m.media-amazon.com/images/I/71c3pFtZywL._AC_AC_SY350_QL65_.jpg",
540 | "reviews": {}
541 | }
542 | ```
543 |
544 |
545 |
546 |
547 | ### And next ?
548 |
549 | Now, you know how to extract high quality data from webpages.
550 | But what if we want to extract lists, like search results, products lists, blog articles, etc ... ?
551 | That's the next topic 👇
552 |
553 | ## 3. Iterate through lists
554 |
555 | The scrapingapi SDK allows you to extract every item that matches a selector.
556 | Again, it's highly inspired by the jQuery API:
557 |
558 | ```typescript
559 | $( ).each( );
560 | ```
561 |
562 | Firstly, you have to provide the **items selector** which will match all the DOM elements you want to iterate.
563 | Then, you specify the values you want to extract for each element that will be iterated, like we've seen previously.
564 |
565 | 💡 All the selectors you provide to extract the `values` will be executed inside the `items selector`.
566 |
567 | ### The `this` selector
568 |
569 | In the `values`, if the selector is `"this"`, it will make reference to the items selector.
570 |
571 | For example:
572 |
573 | ```typescript
574 | $("> ul.tags > li").each({
575 | text: $("this")
576 | })
577 | ```
578 |
579 | The `text` value will be the text content of every `> ul.tags > li` element.
580 |
581 | ### Practical Example
582 |
583 | Show the Example
584 |
585 |
586 | We can now extract every review item:
587 |
588 | ```typescript
589 | page.get("https://www.amazon.com/dp/B08L76BSZ5", { device: 'mobile', withHeaders: true }, {
590 |
591 | title: $("#title"),
592 | price: $("#corePrice_feature_div .a-offscreen:first").filter("price"),
593 | image: $("#main-image").attr("src").filter("url"),
594 | reviews: {
595 | rating: $(".cr-widget-Acr [data-hook='average-stars-rating-text']").optional(),
596 | list: $("#cm-cr-dp-aw-review-list > [data-hook='mobley-review-content']").each({
597 | author: $(".a-profile-name"),
598 | title: $("[data-hook='review-title']")
599 | })
600 | }
601 | });
602 | ```
603 |
604 | You will get the following result:
605 |
606 | ```json
607 | {
608 | "title": "sportbull Unisex 3D Printed Graphics Novelty Casual Short Sleeve T-Shirts Tees",
609 | "price": { "amount": 9.99, "currency": "USD" },
610 | "image": "https://m.media-amazon.com/images/I/71c3pFtZywL._AC_AC_SY350_QL65_.jpg",
611 | "reviews": {
612 | "rating": "4.4 out of 5",
613 | "list": [
614 | { "author": "Jon", "title": "Great shirt; very trippy" },
615 | { "author": "LK", "title": "Birthday Gift for Bartender Son" },
616 | { "author": "Yessica suazo", "title": "Decepcionada del producto que recibi" },
617 | { "author": "Avid Reader", "title": "Worth it." },
618 | { "author": "Nancy K", "title": "Husband loves it!" },
619 | { "author": "Mychelle", "title": "Gets Noticed" },
620 | { "author": "Suzy M. Lewis", "title": "Wish I had bought a size up" },
621 | { "author": "Devann Shultz", "title": "Great buy!" }
622 | ]
623 | }
624 | }
625 | ```
626 |
627 |
628 |
629 |
630 | ## 5. Handle the Response
631 |
632 | Every time you launch a request, you will receive a response following this format:
633 |
634 | ```typescript
635 | type Response = {
636 | // The final URL after all the redirections
637 | url: string,
638 | // The scraped page status code
639 | status: number,
640 | // The scraped page headers (must provide the withHeaders option)
641 | headers?: { [key: string]: string },
642 | // The page HTML (when the withBody option is true)
643 | body?: string,
644 | // The extracted data, if you provided extractors
645 | data?: object;
646 | // The time, in seconds, your request took to be resolved from our server
647 | // The communication delays between your app and our servers are ignored
648 | time: number,
649 | // The used bandwidth, in kb
650 | bandwidth: number,
651 | }
652 | ```
652 |
653 | ### Optimize the response time
654 |
655 | Every option uses additional CPU resources, slows down communication inside scrapingapi's network, and increases your response size.
656 |
657 | That's why it's better to use as few options as possible to make your responses faster.
658 |
659 | -----------
660 |
661 |
662 | Do you like this project ? Please let me know,
663 | ⭐ Give it a Star :)
664 |
665 |
666 | ------------
667 |
668 | ## Another example
669 |
670 | 
671 |
672 | Consider that `http://example.com/products` responds with a webpage containing the following HTML:
673 |
674 | ```html
675 | Space Cat Holograms to motive you programming
676 | Free shipping to all the Milky Way.
677 |
729 | ```
730 |
731 | Let's extract the product list:
732 |
733 | ```typescript
734 | type Product = {
735 | name: string,
736 | image: string,
737 | price: { amount: number, currency: string },
738 | tags: { text: string }[],
739 | description?: string
740 | }
741 |
742 | scraper.get("http://example.com/products", {}, $("#products > article.product").each({
743 |
744 |     name: $("> h3"),
745 | image: $("> img").attr("src").filter("url"),
746 | price: $("> .price").filter("price"),
747 | tags: $("> ul.tags > li").each({
748 | text: $("this")
749 | }),
750 | description: $("> .details").optional()
751 |
752 | }));
753 | ```
754 |
755 | Here is the response:
756 |
757 | ```json
758 | {
759 | "url": "http://example.com/products",
760 | "status": 200,
761 | "data": [{
762 | "name": "Sandwich cat lost on a burger rocket",
763 | "image": "https://wallpapercave.com/wp/wp4014371.jpg",
764 | "price": { "amount": 123.45, "currency": "USD" },
765 | "tags": [
766 | { "text": "sandwich" },
767 | { "text": "burger" },
768 | { "text": "rocket" }
769 | ]
770 | },{
771 | "name": "Gentlemen can't find his litter anymore",
772 | "image": "https://wallpapercave.com/wp/wp4575192.jpg",
773 | "price": { "amount": 456.78, "currency": "USD" },
774 | "tags": [
775 | { "text": "pizza" },
776 | { "text": "slice" },
777 | { "text": "spaceship" }
778 | ]
779 | }]
780 | }
781 | ```
782 |
783 | Did you notice? In the request, the `price` data has been marked as required, but for two HTML elements we've iterated with the `.each()` instruction, **the extractor wasn't able to extract the price**.
784 |
785 | * `Aliens can't sleep because of this cute DJ` doesn't contain any element that matches `> .price`
786 | * `Gentleman dropped his litter into a black hole` contains a `.price` element, but the content text doesn't represent a price
787 |
788 | -----------
789 |
790 | # About
791 |
792 | ## Need any additional information or help ?
793 |
794 | * Search if a related issue [has not been created before](https://github.com/scrapingapi/scraper/issues)
795 | * If not, feel free to [create a new issue](https://github.com/scrapingapi/scraper/issues/new)
796 | * For more personal questions, or for professional inquiries:
797 |
798 | Send me an email
799 |
800 | `contact@gaetan-legac.fr`
801 |
802 |
803 | ## Credits
804 |
805 | For any complaint about abused kittens that have been sent to deep space, see it with [WallpaperCave](https://wallpapercave.com/space-cat-wallpapers).
806 |
--------------------------------------------------------------------------------