├── .gitignore ├── LICENSE ├── NodeJS ├── NodeJS │ ├── api_endpoint_example.js │ ├── proxy_port_example.js │ └── sdk_example.js └── NodeJS_Puppeteer │ ├── package.json │ └── proxy_port_example.js ├── Python ├── Python_Requests_Beautifulsoup │ ├── api_endpoint_example.py │ ├── proxy_port_example.py │ └── sdk_example.py ├── Python_Scrapy │ ├── Python_Scrapy │ │ ├── __init__.py │ │ ├── items.py │ │ ├── middlewares.py │ │ ├── pipelines.py │ │ ├── settings.py │ │ └── spiders │ │ │ ├── __init__.py │ │ │ ├── api_endpoint_example.py │ │ │ ├── proxy_port_example.py │ │ │ └── sdk_example.py │ └── scrapy.cfg └── Python_Selenium │ ├── api_endpoint_example.py │ └── proxy_port_example.py └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # NodeJS 7 | node_modules/ 8 | package-lock.json 9 | 10 | # C extensions 11 | *.so 12 | 13 | # Distribution / packaging 14 | .Python 15 | build/ 16 | develop-eggs/ 17 | dist/ 18 | downloads/ 19 | eggs/ 20 | .eggs/ 21 | lib/ 22 | lib64/ 23 | parts/ 24 | sdist/ 25 | var/ 26 | wheels/ 27 | pip-wheel-metadata/ 28 | share/python-wheels/ 29 | *.egg-info/ 30 | .installed.cfg 31 | *.egg 32 | MANIFEST 33 | 34 | # PyInstaller 35 | # Usually these files are written by a python script from a template 36 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 37 | *.manifest 38 | *.spec 39 | 40 | # Installer logs 41 | pip-log.txt 42 | pip-delete-this-directory.txt 43 | 44 | # Unit test / coverage reports 45 | htmlcov/ 46 | .tox/ 47 | .nox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | *.py,cover 55 | .hypothesis/ 56 | .pytest_cache/ 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Scraper API 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NodeJS/NodeJS/api_endpoint_example.js: -------------------------------------------------------------------------------- 1 | const rp = require('promise-request-retry'); 2 | const cheerio = require("cheerio"); 3 | 4 | /* 5 | SCRAPER SETTINGS 6 | 7 | You need to define the following values below: 8 | 9 | - API_KEY --> Find this on your dashboard, or signup here to create a 10 | free account here https://dashboard.scraperapi.com/signup 11 | 12 | - NUM_CONCURRENT_THREADS --> Set this equal to the number of concurrent threads available 13 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 14 | Startup Plan (25 threads), Business Plan (50 threads), 15 | Enterprise Plan (up to 5,000 threads). 16 | 17 | - NUM_RETRIES --> We recommend setting this to 5 retries. For most sites 18 | 95% of your requests will be successful on the first try, 19 | and 99% after 3 retries. 
20 | 21 | */ 22 | 23 | 24 | const API_KEY = 'INSERT_API_KEY_HERE'; 25 | const NUM_CONCURRENT_THREADS = 5; 26 | const NUM_RETRIES = 5; 27 | 28 | 29 | // Example list of URLs to scrape 30 | const urlsToScrape = [ 31 | 'http://quotes.toscrape.com/page/1/', 32 | 'http://quotes.toscrape.com/page/2/', 33 | 'http://quotes.toscrape.com/page/3/', 34 | 'http://quotes.toscrape.com/page/4/', 35 | 'http://quotes.toscrape.com/page/5/', 36 | 'http://quotes.toscrape.com/page/6/', 37 | 'http://quotes.toscrape.com/page/7/', 38 | 'http://quotes.toscrape.com/page/8/', 39 | 'http://quotes.toscrape.com/page/9/' 40 | ]; 41 | 42 | let freeThreads = NUM_CONCURRENT_THREADS; 43 | let responsePromises = [] 44 | 45 | // Store scraped data in this list 46 | let scrapedData = []; 47 | 48 | 49 | const wait = ms => new Promise(resolve => setTimeout(() => resolve(true), ms)); 50 | 51 | 52 | const checkFreeThreads = (availableThreads, maxThreads) => { 53 | /* 54 | Function that returns True or False depending on if there is a concurrent thread 55 | free or not. Used to manage the scrapers concurrency. 56 | */ 57 | if(0 < availableThreads && availableThreads <= maxThreads){ 58 | return true 59 | } else { 60 | return false 61 | } 62 | } 63 | 64 | 65 | const makeConcurrentRequest = async (inputUrl) => { 66 | /* 67 | Function that makes a request with the request-promise-retry library, while 68 | also incremeneting/decrementing the available number of concurrent threads 69 | available to the scraper. 70 | */ 71 | freeThreads-- 72 | try { 73 | options = { 74 | uri: `http://api.scraperapi.com/`, 75 | qs: { 76 | 'api_key': API_KEY, 77 | 'url': inputUrl 78 | }, 79 | retry : NUM_RETRIES, 80 | verbose_logging : false, 81 | accepted: [ 200, 404, 403 ], 82 | delay: 5000, 83 | factor: 2, 84 | resolveWithFullResponse: true 85 | } 86 | const response = await rp(options); 87 | freeThreads++ 88 | return response 89 | } catch (e) { 90 | freeThreads++ 91 | return e 92 | } 93 | } 94 | 95 | 96 | 97 | 98 | (async () => { 99 | /* 100 | MAIN SCRAPER SCRIPT 101 | While there are still urls left to scrape, it will make requests and 102 | parse the response whilst ensuring the scraper doesn't exceed the 103 | number of concurrent threads available in the Scraper API plan. 104 | */ 105 | 106 | while(urlsToScrape.length > 0){ 107 | 108 | if(checkFreeThreads(freeThreads, NUM_CONCURRENT_THREADS)){ 109 | 110 | // take URL from the list of URLs to scrape 111 | url = urlsToScrape.shift() 112 | 113 | try { 114 | // make request and return promise 115 | response = makeConcurrentRequest(url) 116 | 117 | // log promise so we can make sure all promises resolved before exiting scraper 118 | responsePromises.push(response) 119 | 120 | // once response is recieved then parse the data from the page 121 | response.then(fullResponse => { 122 | 123 | // before parsing, check to see if response is valid. 
124 | if(fullResponse.statusCode == 200){ 125 | 126 | // load html with cheerio 127 | let $ = cheerio.load(fullResponse.body); 128 | 129 | // find all quotes sections 130 | let quotes_sections = $('div.quote') 131 | 132 | // loop through the quotes sections and extract data 133 | quotes_sections.each((index, element) => { 134 | quote = $(element).find('span.text').text() 135 | author = $(element).find('small.author').text() 136 | 137 | // add scraped data to scrapedData array 138 | scrapedData.push({ 139 | 'quote': quote, 140 | 'author': author 141 | }) 142 | 143 | }); 144 | 145 | } else { 146 | // if the response status code isn't 200, then log the message 147 | console.log(fullResponse.message) 148 | } 149 | 150 | }).catch(error => { 151 | console.log(error) 152 | }) 153 | 154 | } catch (error){ 155 | console.log(error) 156 | } 157 | 158 | } 159 | // if no freeThreads available then wait for 200ms before retrying. 160 | await wait(200); 161 | 162 | } // end of while loop 163 | 164 | 165 | // don't output scraped data until all promises have been resolved 166 | Promise.all(responsePromises).then(() => { 167 | console.log('scrapedData: ', scrapedData); 168 | }); 169 | 170 | 171 | })(); 172 | 173 | 174 | 175 | 176 | -------------------------------------------------------------------------------- /NodeJS/NodeJS/proxy_port_example.js: -------------------------------------------------------------------------------- 1 | const rp = require('promise-request-retry'); 2 | const cheerio = require("cheerio"); 3 | 4 | /* 5 | SCRAPER SETTINGS 6 | 7 | You need to define the following values below: 8 | 9 | - API_KEY --> Find this on your dashboard, or signup here to create a 10 | free account here https://dashboard.scraperapi.com/signup 11 | 12 | - NUM_CONCURRENT_THREADS --> Set this equal to the number of concurrent threads available 13 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 14 | Startup Plan (25 threads), Business Plan (50 threads), 15 | Enterprise Plan (up to 5,000 threads). 16 | 17 | - NUM_RETRIES --> We recommend setting this to 5 retries. For most sites 18 | 95% of your requests will be successful on the first try, 19 | and 99% after 3 retries. 20 | 21 | */ 22 | 23 | 24 | const API_KEY = 'INSERT_API_KEY_HERE'; 25 | const NUM_CONCURRENT_THREADS = 5; 26 | const NUM_RETRIES = 5; 27 | 28 | 29 | // Example list of URLs to scrape 30 | const urlsToScrape = [ 31 | 'http://quotes.toscrape.com/page/1/', 32 | 'http://quotes.toscrape.com/page/2/', 33 | 'http://quotes.toscrape.com/page/3/', 34 | 'http://quotes.toscrape.com/page/4/', 35 | 'http://quotes.toscrape.com/page/5/', 36 | 'http://quotes.toscrape.com/page/6/', 37 | 'http://quotes.toscrape.com/page/7/', 38 | 'http://quotes.toscrape.com/page/8/', 39 | 'http://quotes.toscrape.com/page/9/' 40 | ]; 41 | 42 | 43 | let freeThreads = NUM_CONCURRENT_THREADS; 44 | let responsePromises = [] 45 | 46 | // Store scraped data in this list 47 | let scrapedData = []; 48 | 49 | 50 | const wait = ms => new Promise(resolve => setTimeout(() => resolve(true), ms)); 51 | 52 | 53 | const checkFreeThreads = (availableThreads, maxThreads) => { 54 | /* 55 | Function that returns True or False depending on if there is a concurrent thread 56 | free or not. Used to manage the scrapers concurrency. 
57 | */ 58 | if(0 < availableThreads && availableThreads <= maxThreads){ 59 | return true 60 | } else { 61 | return false 62 | } 63 | } 64 | 65 | 66 | const makeConcurrentRequest = async (inputUrl) => { 67 | /* 68 | Function that makes a request with the request-promise-retry library, while 69 | also incremeneting/decrementing the available number of concurrent threads 70 | available to the scraper. 71 | */ 72 | freeThreads-- 73 | try { 74 | options = { 75 | uri: inputUrl, 76 | proxy:`http://scraperapi:${API_KEY}@proxy-server.scraperapi.com:8001`, 77 | retry : NUM_RETRIES, 78 | verbose_logging : false, 79 | accepted: [ 200, 404, 403 ], 80 | delay: 5000, 81 | factor: 2, 82 | resolveWithFullResponse: true 83 | } 84 | const response = await rp(options); 85 | freeThreads++ 86 | return response 87 | } catch (e) { 88 | freeThreads++ 89 | return e 90 | } 91 | } 92 | 93 | 94 | 95 | 96 | (async () => { 97 | /* 98 | MAIN SCRAPER SCRIPT 99 | While there are still urls left to scrape, it will make requests and 100 | parse the response whilst ensuring the scraper doesn't exceed the 101 | number of concurrent threads available in the Scraper API plan. 102 | */ 103 | 104 | while(urlsToScrape.length > 0){ 105 | 106 | if(checkFreeThreads(freeThreads, NUM_CONCURRENT_THREADS)){ 107 | 108 | // take URL from the list of URLs to scrape 109 | url = urlsToScrape.shift() 110 | 111 | try { 112 | // make request and return promise 113 | response = makeConcurrentRequest(url) 114 | 115 | // log promise so we can make sure all promises resolved before exiting scraper 116 | responsePromises.push(response) 117 | 118 | // once response is recieved then parse the data from the page 119 | response.then(fullResponse => { 120 | 121 | // before parsing, check to see if response is valid. 122 | if(fullResponse.statusCode == 200){ 123 | 124 | // load html with cheerio 125 | let $ = cheerio.load(fullResponse.body); 126 | 127 | // find all quotes sections 128 | let quotes_sections = $('div.quote') 129 | 130 | // loop through the quotes sections and extract data 131 | quotes_sections.each((index, element) => { 132 | quote = $(element).find('span.text').text() 133 | author = $(element).find('small.author').text() 134 | 135 | // add scraped data to scrapedData array 136 | scrapedData.push({ 137 | 'quote': quote, 138 | 'author': author 139 | }) 140 | 141 | }); 142 | 143 | } else { 144 | // if the response status code isn't 200, then log the message 145 | console.log(fullResponse.message) 146 | } 147 | 148 | }).catch(error => { 149 | console.log(error) 150 | }) 151 | 152 | } catch (error){ 153 | console.log(error) 154 | } 155 | 156 | } 157 | // if no freeThreads available then wait for 200ms before retrying. 
158 | await wait(200); 159 | 160 | } // end of while loop 161 | 162 | 163 | // don't output scraped data until all promises have been resolved 164 | Promise.all(responsePromises).then(() => { 165 | console.log('scrapedData: ', scrapedData); 166 | }); 167 | 168 | 169 | })(); 170 | 171 | 172 | 173 | 174 | -------------------------------------------------------------------------------- /NodeJS/NodeJS/sdk_example.js: -------------------------------------------------------------------------------- 1 | const cheerio = require("cheerio"); 2 | 3 | /* 4 | SCRAPER SETTINGS 5 | 6 | You need to define the following values below: 7 | 8 | - API_KEY --> Find this on your dashboard, or signup here to create a 9 | free account here https://dashboard.scraperapi.com/signup 10 | 11 | - NUM_CONCURRENT_THREADS --> Set this equal to the number of concurrent threads available 12 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 13 | Startup Plan (25 threads), Business Plan (50 threads), 14 | Enterprise Plan (up to 5,000 threads). 15 | 16 | */ 17 | 18 | 19 | const API_KEY = 'INSERT_API_KEY_HERE'; 20 | const NUM_CONCURRENT_THREADS = 5; 21 | 22 | const scraperapiClient = require('scraperapi-sdk')(API_KEY) 23 | 24 | // Example list of URLs to scrape 25 | const urlsToScrape = [ 26 | 'http://quotes.toscrape.com/page/1/', 27 | 'http://quotes.toscrape.com/page/2/', 28 | 'http://quotes.toscrape.com/page/3/', 29 | 'http://quotes.toscrape.com/page/4/', 30 | 'http://quotes.toscrape.com/page/5/', 31 | 'http://quotes.toscrape.com/page/6/', 32 | 'http://quotes.toscrape.com/page/7/', 33 | 'http://quotes.toscrape.com/page/8/', 34 | 'http://quotes.toscrape.com/page/9/' 35 | ]; 36 | 37 | 38 | let freeThreads = NUM_CONCURRENT_THREADS; 39 | let responsePromises = [] 40 | 41 | // Store scraped data in this list 42 | let scrapedData = []; 43 | 44 | 45 | const wait = ms => new Promise(resolve => setTimeout(() => resolve(true), ms)); 46 | 47 | 48 | const checkFreeThreads = (availableThreads, maxThreads) => { 49 | /* 50 | Function that returns True or False depending on if there is a concurrent thread 51 | free or not. Used to manage the scrapers concurrency. 52 | */ 53 | if(0 < availableThreads && availableThreads <= maxThreads){ 54 | return true 55 | } else { 56 | return false 57 | } 58 | } 59 | 60 | 61 | const makeConcurrentRequest = async (inputUrl) => { 62 | /* 63 | Function that makes a request with the ScraperAPI SDK, while 64 | also incremeneting/decrementing the available number of concurrent threads 65 | available to the scraper. 66 | */ 67 | freeThreads-- 68 | try { 69 | const response = await scraperapiClient.get(inputUrl); 70 | freeThreads++ 71 | return response 72 | } catch (e) { 73 | freeThreads++ 74 | return e 75 | } 76 | } 77 | 78 | 79 | 80 | 81 | (async () => { 82 | /* 83 | MAIN SCRAPER SCRIPT 84 | While there are still urls left to scrape, it will make requests and 85 | parse the response whilst ensuring the scraper doesn't exceed the 86 | number of concurrent threads available in the Scraper API plan. 
87 | */ 88 | 89 | while(urlsToScrape.length > 0){ 90 | 91 | if(checkFreeThreads(freeThreads, NUM_CONCURRENT_THREADS)){ 92 | 93 | // take URL from the list of URLs to scrape 94 | url = urlsToScrape.shift() 95 | 96 | try { 97 | // make request and return promise 98 | response = makeConcurrentRequest(url) 99 | 100 | // log promise so we can make sure all promises resolved before exiting scraper 101 | responsePromises.push(response) 102 | 103 | // once response is recieved then parse the data from the page 104 | response.then(htmlResponse => { 105 | 106 | // load html with cheerio 107 | let $ = cheerio.load(htmlResponse); 108 | 109 | // find all quotes sections 110 | let quotes_sections = $('div.quote') 111 | 112 | // loop through the quotes sections and extract data 113 | quotes_sections.each((index, element) => { 114 | quote = $(element).find('span.text').text() 115 | author = $(element).find('small.author').text() 116 | 117 | // add scraped data to scrapedData array 118 | scrapedData.push({ 119 | 'quote': quote, 120 | 'author': author 121 | }) 122 | 123 | }); 124 | 125 | 126 | }).catch(error => { 127 | console.log(error) 128 | }) 129 | 130 | } catch (error){ 131 | console.log(error) 132 | } 133 | 134 | } 135 | // if no freeThreads available then wait for 200ms before retrying. 136 | await wait(200); 137 | 138 | } // end of while loop 139 | 140 | 141 | // don't output scraped data until all promises have been resolved 142 | Promise.all(responsePromises).then(() => { 143 | console.log('scrapedData: ', scrapedData); 144 | }); 145 | 146 | 147 | })(); 148 | 149 | 150 | 151 | 152 | -------------------------------------------------------------------------------- /NodeJS/NodeJS_Puppeteer/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "nodejspuppeteer", 3 | "version": "1.0.0", 4 | "description": "", 5 | "main": "index.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "author": "", 10 | "license": "ISC", 11 | "dependencies": { 12 | "cheerio": "^1.0.0-rc.9", 13 | "puppeteer": "^9.1.1" 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /NodeJS/NodeJS_Puppeteer/proxy_port_example.js: -------------------------------------------------------------------------------- 1 | const puppeteer = require('puppeteer'); 2 | const cheerio = require('cheerio'); 3 | 4 | /* 5 | SCRAPER SETTINGS 6 | 7 | You need to define the following values below: 8 | 9 | - API_KEY --> Find this on your dashboard, or signup here to create a 10 | free account here https://dashboard.scraperapi.com/signup 11 | 12 | */ 13 | 14 | 15 | // ScraperAPI proxy configuration 16 | PROXY_USERNAME = 'scraperapi'; 17 | PROXY_PASSWORD = 'API_KEY'; // <-- enter your API_Key here 18 | PROXY_SERVER = 'proxy-server.scraperapi.com'; 19 | PROXY_SERVER_PORT = '8001'; 20 | 21 | // where scraped data will be stored 22 | let scraped_quotes = []; 23 | 24 | (async () => { 25 | const browser = await puppeteer.launch({ 26 | ignoreHTTPSErrors: true, 27 | args: [ 28 | `--proxy-server=http://${PROXY_SERVER}:${PROXY_SERVER_PORT}` 29 | ] 30 | }); 31 | const page = await browser.newPage(); 32 | await page.authenticate({ 33 | username: PROXY_USERNAME, 34 | password: PROXY_PASSWORD, 35 | }); 36 | 37 | 38 | try { 39 | await page.goto('http://quotes.toscrape.com/page/1/', {timeout: 180000}); 40 | let bodyHTML = await page.evaluate(() => document.body.innerHTML); 41 | let $ = cheerio.load(bodyHTML); 42 | 43 | // find all 
quotes sections 44 | let quotes_sections = $('div.quote') 45 | 46 | // loop through the quotes sections and extract data 47 | quotes_sections.each((index, element) => { 48 | quote = $(element).find('span.text').text() 49 | author = $(element).find('small.author').text() 50 | 51 | // add scraped data to scraped_quotes array 52 | scraped_quotes.push({ 53 | 'quote': quote, 54 | 'author': author 55 | }) 56 | 57 | }); 58 | 59 | } catch(err) { 60 | console.log(err); 61 | } 62 | 63 | await browser.close(); 64 | console.log(scraped_quotes) 65 | })(); -------------------------------------------------------------------------------- /Python/Python_Requests_Beautifulsoup/api_endpoint_example.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import concurrent.futures 4 | import csv 5 | from urllib.parse import urlencode 6 | 7 | 8 | """ 9 | SCRAPER SETTINGS 10 | 11 | You need to define the following values below: 12 | 13 | - API_KEY --> Find this on your dashboard, or signup here to create a 14 | free account here https://dashboard.scraperapi.com/signup 15 | 16 | - NUM_RETRIES --> We recommend setting this to 5 retries. For most sites 17 | 95% of your requests will be successful on the first try, 18 | and 99% after 3 retries. 19 | 20 | - NUM_THREADS --> Set this equal to the number of concurrent threads available 21 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 22 | Startup Plan (25 threads), Business Plan (50 threads), 23 | Enterprise Plan (up to 5,000 threads). 24 | 25 | """ 26 | API_KEY = 'INSERT_API_KEY_HERE' 27 | NUM_RETRIES = 3 28 | NUM_THREADS = 5 29 | 30 | 31 | ## Example list of urls to scrape 32 | list_of_urls = [ 33 | 'http://quotes.toscrape.com/page/1/', 34 | 'http://quotes.toscrape.com/page/2/', 35 | ] 36 | 37 | 38 | ## we will store the scraped data in this list 39 | scraped_quotes = [] 40 | 41 | def scrape_url(url): 42 | """ 43 | SEND REQUESTS TO SCRAPER API AND PARSE DATA FROM THE HTML RESPONSE 44 | 45 | INPUT/OUTPUT: Takes a single url as input, and appends the scraped data to the "scraped_quotes" list. 46 | METHOD: Takes the input url, requests it via scraperapi and keeps retrying the request until it gets a 47 | successful response (200 or 404 status code) or up to the number of retries you define in NUM_RETRIES. 48 | If it did yield a successful response then it parses the data from the HTML response and adds it to the 49 | "scraped_quotes" list. You can easily reconfigure this to store the scraped data in a database. 50 | """ 51 | 52 | params = {'api_key': API_KEY, 'url': url} 53 | 54 | # send request to scraperapi, and automatically retry failed requests 55 | for _ in range(NUM_RETRIES): 56 | try: 57 | response = requests.get('http://api.scraperapi.com/', params=urlencode(params)) 58 | if response.status_code in [200, 404]: 59 | ## escape for loop if the API returns a successful response 60 | break 61 | except requests.exceptions.ConnectionError: 62 | response = '' 63 | 64 | 65 | ## parse data if 200 status code (successful response) 66 | if response.status_code == 200: 67 | 68 | """ 69 | Insert the parsing code for your use case here... 
70 | """ 71 | 72 | ## Example: parse data with beautifulsoup 73 | html_response = response.text 74 | soup = BeautifulSoup(html_response, "html.parser") 75 | quotes_sections = soup.find_all('div', class_="quote") 76 | 77 | ## loop through each quotes section and extract the quote and author 78 | for quote_block in quotes_sections: 79 | quote = quote_block.find('span', class_='text').text 80 | author = quote_block.find('small', class_='author').text 81 | 82 | ## add scraped data to "scraped_quotes" list 83 | scraped_quotes.append({ 84 | 'quote': quote, 85 | 'author': author 86 | }) 87 | 88 | 89 | """ 90 | CONFIGURE CONCURRENT THREADS 91 | 92 | Create thread pools up to the NUM_THREADS you define above and splits the urls you 93 | want to scrape amongst these threads until complete. Takes as input: 94 | 95 | - max_workers --> the maximum number of threads it will create. Here we set it to the 96 | value we defined in NUM_THREADS. 97 | 98 | - function to execute --> the first input to the executor.map() function is the function 99 | we want to execute in each thread. Here we input the "scrape_url(url)"" 100 | function which accepts a single url as input. 101 | 102 | - input list --> the second input to the executor.map() function is the data we want to 103 | be split amongst the threads created. Here we input the "list_of_urls" we 104 | want to scrape. 105 | 106 | """ 107 | with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: 108 | executor.map(scrape_url, list_of_urls) 109 | 110 | 111 | print(scraped_quotes) -------------------------------------------------------------------------------- /Python/Python_Requests_Beautifulsoup/proxy_port_example.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from bs4 import BeautifulSoup 3 | import concurrent.futures 4 | import csv 5 | import urllib.parse 6 | 7 | 8 | """ 9 | SCRAPER SETTINGS 10 | 11 | You need to define the following values below: 12 | 13 | - API_KEY --> Find this on your dashboard, or signup here to create a 14 | free account here https://dashboard.scraperapi.com/signup 15 | 16 | - NUM_RETRIES --> We recommend setting this to 5 retries. For most sites 17 | 95% of your requests will be successful on the first try, 18 | and 99% after 3 retries. 19 | 20 | - NUM_THREADS --> Set this equal to the number of concurrent threads available 21 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 22 | Startup Plan (25 threads), Business Plan (50 threads), 23 | Enterprise Plan (up to 5,000 threads). 24 | 25 | """ 26 | API_KEY = 'INSERT_API_KEY_HERE' 27 | NUM_RETRIES = 3 28 | NUM_THREADS = 5 29 | 30 | 31 | ## Example list of urls to scrape 32 | list_of_urls = [ 33 | 'http://quotes.toscrape.com/page/1/', 34 | 'http://quotes.toscrape.com/page/2/', 35 | ] 36 | 37 | ## Tell scraper to use Scraper API as the proxy 38 | proxies = { 39 | 'http': f'http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001', 40 | } 41 | 42 | ## we will store the scraped data in this list 43 | scraped_quotes = [] 44 | 45 | def scrape_url(url): 46 | """ 47 | SEND REQUESTS TO SCRAPER API AND PARSE DATA FROM THE HTML RESPONSE 48 | 49 | INPUT/OUTPUT: Takes a single url as input, and appends the scraped data to the "scraped_quotes" list. 50 | METHOD: Takes the input url, requests it via scraperapi and keeps retrying the request until it gets a 51 | successful response (200 or 404 status code) or up to the number of retries you define in NUM_RETRIES. 
52 | If it did yield a successful response then it parses the data from the HTML response and adds it to the 53 | "scraped_quotes" list. You can easily reconfigure this to store the scraped data in a database. 54 | """ 55 | 56 | params = {'api_key': API_KEY, 'url': url} 57 | 58 | # send request to scraperapi, and automatically retry failed requests 59 | for _ in range(NUM_RETRIES): 60 | try: 61 | response = requests.get(url, proxies=proxies, verify=False) 62 | if response.status_code in [200, 404]: 63 | ## escape for loop if the API returns a successful response 64 | break 65 | except requests.exceptions.ConnectionError: 66 | response = '' 67 | 68 | 69 | ## parse data if 200 status code (successful response) 70 | if response.status_code == 200: 71 | 72 | """ 73 | Insert the parsing code for your use case here... 74 | """ 75 | 76 | ## Example: parse data with beautifulsoup 77 | html_response = response.text 78 | soup = BeautifulSoup(html_response, "html.parser") 79 | quotes_sections = soup.find_all('div', class_="quote") 80 | 81 | ## loop through each quotes section and extract the quote and author 82 | for quote_block in quotes_sections: 83 | quote = quote_block.find('span', class_='text').text 84 | author = quote_block.find('small', class_='author').text 85 | 86 | ## add scraped data to "scraped_quotes" list 87 | scraped_quotes.append({ 88 | 'quote': quote, 89 | 'author': author 90 | }) 91 | 92 | 93 | """ 94 | CONFIGURE CONCURRENT THREADS 95 | 96 | Create thread pools up to the NUM_THREADS you define above and splits the urls you 97 | want to scrape amongst these threads until complete. Takes as input: 98 | 99 | - max_workers --> the maximum number of threads it will create. Here we set it to the 100 | value we defined in NUM_THREADS. 101 | 102 | - function to execute --> the first input to the executor.map() function is the function 103 | we want to execute in each thread. Here we input the "scrape_url(url)"" 104 | function which accepts a single url as input. 105 | 106 | - input list --> the second input to the executor.map() function is the data we want to 107 | be split amongst the threads created. Here we input the "list_of_urls" we 108 | want to scrape. 109 | 110 | """ 111 | with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: 112 | executor.map(scrape_url, list_of_urls) 113 | 114 | 115 | print(scraped_quotes) -------------------------------------------------------------------------------- /Python/Python_Requests_Beautifulsoup/sdk_example.py: -------------------------------------------------------------------------------- 1 | from bs4 import BeautifulSoup 2 | import concurrent.futures 3 | import csv 4 | import urllib.parse 5 | from scraper_api import ScraperAPIClient 6 | 7 | 8 | """ 9 | SCRAPER SETTINGS 10 | 11 | You need to define the following values below: 12 | 13 | - API_KEY --> Find this on your dashboard, or signup here to create a 14 | free account here https://dashboard.scraperapi.com/signup 15 | 16 | - NUM_RETRIES --> We recommend setting this to 5 retries. For most sites 17 | 95% of your requests will be successful on the first try, 18 | and 99% after 3 retries. 19 | 20 | - NUM_THREADS --> Set this equal to the number of concurrent threads available 21 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 22 | Startup Plan (25 threads), Business Plan (50 threads), 23 | Enterprise Plan (up to 5,000 threads). 
24 | 25 | """ 26 | API_KEY = 'INSERT_API_KEY_HERE' 27 | NUM_RETRIES = 3 28 | NUM_THREADS = 5 29 | 30 | client = ScraperAPIClient(API_KEY) 31 | 32 | ## Example list of urls to scrape 33 | list_of_urls = [ 34 | 'http://quotes.toscrape.com/page/1/', 35 | 'http://quotes.toscrape.com/page/2/', 36 | ] 37 | 38 | 39 | ## we will store the scraped data in this list 40 | scraped_quotes = [] 41 | 42 | def scrape_url(url): 43 | """ 44 | SEND REQUESTS TO SCRAPER API AND PARSE DATA FROM THE HTML RESPONSE 45 | 46 | INPUT/OUTPUT: Takes a single url as input, and appends the scraped data to the "scraped_quotes" list. 47 | METHOD: Takes the input url, requests it via scraperapi and keeps retrying the request until it gets a 48 | successful response (200 or 404 status code) or up to the number of retries you define in NUM_RETRIES. 49 | If it did yield a successful response then it parses the data from the HTML response and adds it to the 50 | "scraped_quotes" list. You can easily reconfigure this to store the scraped data in a database. 51 | """ 52 | 53 | response = client.get(url=url, retry=NUM_RETRIES) 54 | 55 | ## parse data if 200 status code (successful response) 56 | if response.status_code == 200: 57 | 58 | """ 59 | Insert the parsing code for your use case here... 60 | """ 61 | 62 | ## Example: parse data with beautifulsoup 63 | html_response = response.text 64 | soup = BeautifulSoup(html_response, "html.parser") 65 | quotes_sections = soup.find_all('div', class_="quote") 66 | 67 | ## loop through each quotes section and extract the quote and author 68 | for quote_block in quotes_sections: 69 | quote = quote_block.find('span', class_='text').text 70 | author = quote_block.find('small', class_='author').text 71 | 72 | ## add scraped data to "scraped_quotes" list 73 | scraped_quotes.append({ 74 | 'quote': quote, 75 | 'author': author 76 | }) 77 | 78 | 79 | """ 80 | CONFIGURE CONCURRENT THREADS 81 | 82 | Create thread pools up to the NUM_THREADS you define above and splits the urls you 83 | want to scrape amongst these threads until complete. Takes as input: 84 | 85 | - max_workers --> the maximum number of threads it will create. Here we set it to the 86 | value we defined in NUM_THREADS. 87 | 88 | - function to execute --> the first input to the executor.map() function is the function 89 | we want to execute in each thread. Here we input the "scrape_url(url)"" 90 | function which accepts a single url as input. 91 | 92 | - input list --> the second input to the executor.map() function is the data we want to 93 | be split amongst the threads created. Here we input the "list_of_urls" we 94 | want to scrape. 
95 | 96 | """ 97 | with concurrent.futures.ThreadPoolExecutor(max_workers=NUM_THREADS) as executor: 98 | executor.map(scrape_url, list_of_urls) 99 | 100 | 101 | print(scraped_quotes) -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/scraperapi/scraperapi-code-examples/3f7b5d3945fb8a97579fd62a8c0062c15e658193/Python/Python_Scrapy/Python_Scrapy/__init__.py -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/items.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your scraped items 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class PythonScrapyItem(scrapy.Item): 10 | # define the fields for your item here like: 11 | # name = scrapy.Field() 12 | pass 13 | -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/middlewares.py: -------------------------------------------------------------------------------- 1 | # Define here the models for your spider middleware 2 | # 3 | # See documentation in: 4 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 5 | 6 | from scrapy import signals 7 | 8 | # useful for handling different item types with a single interface 9 | from itemadapter import is_item, ItemAdapter 10 | 11 | 12 | class PythonScrapySpiderMiddleware: 13 | # Not all methods need to be defined. If a method is not defined, 14 | # scrapy acts as if the spider middleware does not modify the 15 | # passed objects. 16 | 17 | @classmethod 18 | def from_crawler(cls, crawler): 19 | # This method is used by Scrapy to create your spiders. 20 | s = cls() 21 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 22 | return s 23 | 24 | def process_spider_input(self, response, spider): 25 | # Called for each response that goes through the spider 26 | # middleware and into the spider. 27 | 28 | # Should return None or raise an exception. 29 | return None 30 | 31 | def process_spider_output(self, response, result, spider): 32 | # Called with the results returned from the Spider, after 33 | # it has processed the response. 34 | 35 | # Must return an iterable of Request, or item objects. 36 | for i in result: 37 | yield i 38 | 39 | def process_spider_exception(self, response, exception, spider): 40 | # Called when a spider or process_spider_input() method 41 | # (from other spider middleware) raises an exception. 42 | 43 | # Should return either None or an iterable of Request or item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 52 | for r in start_requests: 53 | yield r 54 | 55 | def spider_opened(self, spider): 56 | spider.logger.info('Spider opened: %s' % spider.name) 57 | 58 | 59 | class PythonScrapyDownloaderMiddleware: 60 | # Not all methods need to be defined. If a method is not defined, 61 | # scrapy acts as if the downloader middleware does not modify the 62 | # passed objects. 
63 | 64 | @classmethod 65 | def from_crawler(cls, crawler): 66 | # This method is used by Scrapy to create your spiders. 67 | s = cls() 68 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 69 | return s 70 | 71 | def process_request(self, request, spider): 72 | # Called for each request that goes through the downloader 73 | # middleware. 74 | 75 | # Must either: 76 | # - return None: continue processing this request 77 | # - or return a Response object 78 | # - or return a Request object 79 | # - or raise IgnoreRequest: process_exception() methods of 80 | # installed downloader middleware will be called 81 | return None 82 | 83 | def process_response(self, request, response, spider): 84 | # Called with the response returned from the downloader. 85 | 86 | # Must either; 87 | # - return a Response object 88 | # - return a Request object 89 | # - or raise IgnoreRequest 90 | return response 91 | 92 | def process_exception(self, request, exception, spider): 93 | # Called when a download handler or a process_request() 94 | # (from other downloader middleware) raises an exception. 95 | 96 | # Must either: 97 | # - return None: continue processing this exception 98 | # - return a Response object: stops process_exception() chain 99 | # - return a Request object: stops process_exception() chain 100 | pass 101 | 102 | def spider_opened(self, spider): 103 | spider.logger.info('Spider opened: %s' % spider.name) 104 | -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/pipelines.py: -------------------------------------------------------------------------------- 1 | # Define your item pipelines here 2 | # 3 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 4 | # See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html 5 | 6 | 7 | # useful for handling different item types with a single interface 8 | from itemadapter import ItemAdapter 9 | 10 | 11 | class PythonScrapyPipeline: 12 | def process_item(self, item, spider): 13 | return item 14 | -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/settings.py: -------------------------------------------------------------------------------- 1 | # Scrapy settings for Python_Scrapy project 2 | # 3 | # For simplicity, this file contains only settings considered important or 4 | # commonly used. 
You can find more settings consulting the documentation: 5 | # 6 | # https://docs.scrapy.org/en/latest/topics/settings.html 7 | # https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 8 | # https://docs.scrapy.org/en/latest/topics/spider-middleware.html 9 | 10 | BOT_NAME = 'Python_Scrapy' 11 | 12 | SPIDER_MODULES = ['Python_Scrapy.spiders'] 13 | NEWSPIDER_MODULE = 'Python_Scrapy.spiders' 14 | 15 | 16 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 17 | #USER_AGENT = 'Python_Scrapy (+http://www.yourdomain.com)' 18 | 19 | # Obey robots.txt rules 20 | ROBOTSTXT_OBEY = False 21 | 22 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 23 | #CONCURRENT_REQUESTS = 32 24 | 25 | # Configure a delay for requests for the same website (default: 0) 26 | # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay 27 | # See also autothrottle settings and docs 28 | #DOWNLOAD_DELAY = 3 29 | # The download delay setting will honor only one of: 30 | #CONCURRENT_REQUESTS_PER_DOMAIN = 16 31 | #CONCURRENT_REQUESTS_PER_IP = 16 32 | 33 | # Disable cookies (enabled by default) 34 | #COOKIES_ENABLED = False 35 | 36 | # Disable Telnet Console (enabled by default) 37 | #TELNETCONSOLE_ENABLED = False 38 | 39 | # Override the default request headers: 40 | #DEFAULT_REQUEST_HEADERS = { 41 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 42 | # 'Accept-Language': 'en', 43 | #} 44 | 45 | # Enable or disable spider middlewares 46 | # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html 47 | #SPIDER_MIDDLEWARES = { 48 | # 'Python_Scrapy.middlewares.PythonScrapySpiderMiddleware': 543, 49 | #} 50 | 51 | # Enable or disable downloader middlewares 52 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html 53 | #DOWNLOADER_MIDDLEWARES = { 54 | # 'Python_Scrapy.middlewares.PythonScrapyDownloaderMiddleware': 543, 55 | #} 56 | 57 | # Enable or disable extensions 58 | # See https://docs.scrapy.org/en/latest/topics/extensions.html 59 | #EXTENSIONS = { 60 | # 'scrapy.extensions.telnet.TelnetConsole': None, 61 | #} 62 | 63 | # Configure item pipelines 64 | # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html 65 | #ITEM_PIPELINES = { 66 | # 'Python_Scrapy.pipelines.PythonScrapyPipeline': 300, 67 | #} 68 | 69 | # Enable and configure the AutoThrottle extension (disabled by default) 70 | # See https://docs.scrapy.org/en/latest/topics/autothrottle.html 71 | #AUTOTHROTTLE_ENABLED = True 72 | # The initial download delay 73 | #AUTOTHROTTLE_START_DELAY = 5 74 | # The maximum download delay to be set in case of high latencies 75 | #AUTOTHROTTLE_MAX_DELAY = 60 76 | # The average number of requests Scrapy should be sending in parallel to 77 | # each remote server 78 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 79 | # Enable showing throttling stats for every response received: 80 | #AUTOTHROTTLE_DEBUG = False 81 | 82 | # Enable and configure HTTP caching (disabled by default) 83 | # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 84 | #HTTPCACHE_ENABLED = True 85 | #HTTPCACHE_EXPIRATION_SECS = 0 86 | #HTTPCACHE_DIR = 'httpcache' 87 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 88 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 89 | -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/spiders/__init__.py: 
-------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 5 | -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/spiders/api_endpoint_example.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from urllib.parse import urlencode 3 | 4 | """ 5 | SCRAPER SETTINGS 6 | 7 | You need to define the following values below: 8 | 9 | - API_KEY --> Find this on your dashboard, or signup here to create a 10 | free account here https://dashboard.scraperapi.com/signup 11 | 12 | To use this script you need to modify a couple settings in the settings.py file: 13 | 14 | - CONCURRENT_REQUESTS --> Set this equal to the number of concurrent threads available 15 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 16 | Startup Plan (25 threads), Business Plan (50 threads), 17 | Enterprise Plan (up to 5,000 threads). 18 | 19 | - RETRY_TIMES --> We recommend setting this to 5 retries. For most sites 20 | 95% of your requests will be successful on the first try, 21 | and 99% after 3 retries. 22 | 23 | - ROBOTSTXT_OBEY --> Set this to FALSE as otherwise Scrapy won't run. 24 | 25 | - DOWNLOAD_DELAY & RANDOMIZE_DOWNLOAD_DELAY --> Make sure these have been commented out as you 26 | don't need them when using Scraper API. 27 | 28 | 29 | """ 30 | 31 | API_KEY = 'YOUR_API_KEY' 32 | 33 | def get_scraperapi_url(url): 34 | """ 35 | Converts url into API request for Scraper API. 36 | """ 37 | payload = {'api_key': API_KEY, 'url': url} 38 | proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload) 39 | return proxy_url 40 | 41 | class QuotesSpider(scrapy.Spider): 42 | name = "api_endpoint_spider" 43 | 44 | def start_requests(self): 45 | urls = [ 46 | 'http://quotes.toscrape.com/page/1/', 47 | 'http://quotes.toscrape.com/page/2/', 48 | ] 49 | for url in urls: 50 | yield scrapy.Request(url=get_scraperapi_url(url), callback=self.parse) 51 | 52 | def parse(self, response): 53 | """ 54 | Insert the parsing code for your use case here... 55 | """ 56 | for quote in response.css('div.quote'): 57 | yield { 58 | 'text': quote.css('span.text::text').get(), 59 | 'author': quote.css('small.author::text').get(), 60 | 'tags': quote.css('div.tags a.tag::text').getall(), 61 | } -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/spiders/proxy_port_example.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | 3 | """ 4 | SCRAPER SETTINGS 5 | 6 | You need to define the following values below: 7 | 8 | - API_KEY --> Find this on your dashboard, or signup here to create a 9 | free account here https://dashboard.scraperapi.com/signup 10 | 11 | To use this script you need to modify a couple settings in the settings.py file: 12 | 13 | - CONCURRENT_REQUESTS --> Set this equal to the number of concurrent threads available 14 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 15 | Startup Plan (25 threads), Business Plan (50 threads), 16 | Enterprise Plan (up to 5,000 threads). 17 | 18 | - RETRY_TIMES --> We recommend setting this to 5 retries. For most sites 19 | 95% of your requests will be successful on the first try, 20 | and 99% after 3 retries. 
21 | 22 | - ROBOTSTXT_OBEY --> Set this to FALSE as otherwise Scrapy won't run. 23 | 24 | - DOWNLOAD_DELAY & RANDOMIZE_DOWNLOAD_DELAY --> Make sure these have been commented out as you 25 | don't need them when using Scraper API. 26 | 27 | 28 | """ 29 | 30 | API_KEY = 'YOUR_API_KEY' 31 | 32 | 33 | class QuotesSpider(scrapy.Spider): 34 | name = "proxy_port_spider" 35 | 36 | def start_requests(self): 37 | meta = { 38 | "proxy": f"http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001" 39 | } 40 | 41 | urls = [ 42 | 'http://quotes.toscrape.com/page/1/', 43 | 'http://quotes.toscrape.com/page/2/', 44 | ] 45 | for url in urls: 46 | yield scrapy.Request(url=url, callback=self.parse, meta=meta) 47 | 48 | def parse(self, response): 49 | """ 50 | Insert the parsing code for your use case here... 51 | """ 52 | for quote in response.css('div.quote'): 53 | yield { 54 | 'text': quote.css('span.text::text').get(), 55 | 'author': quote.css('small.author::text').get(), 56 | 'tags': quote.css('div.tags a.tag::text').getall(), 57 | } -------------------------------------------------------------------------------- /Python/Python_Scrapy/Python_Scrapy/spiders/sdk_example.py: -------------------------------------------------------------------------------- 1 | import scrapy 2 | from scraper_api import ScraperAPIClient 3 | 4 | """ 5 | SCRAPER SETTINGS 6 | 7 | You need to define the following values below: 8 | 9 | - API_KEY --> Find this on your dashboard, or signup here to create a 10 | free account here https://dashboard.scraperapi.com/signup 11 | 12 | To use this script you need to modify a couple settings in the settings.py file: 13 | 14 | - CONCURRENT_REQUESTS --> Set this equal to the number of concurrent threads available 15 | in your plan. For reference: Free Plan (5 threads), Hobby Plan (10 threads), 16 | Startup Plan (25 threads), Business Plan (50 threads), 17 | Enterprise Plan (up to 5,000 threads). 18 | 19 | - RETRY_TIMES --> We recommend setting this to 5 retries. For most sites 20 | 95% of your requests will be successful on the first try, 21 | and 99% after 3 retries. 22 | 23 | - ROBOTSTXT_OBEY --> Set this to FALSE as otherwise Scrapy won't run. 24 | 25 | - DOWNLOAD_DELAY & RANDOMIZE_DOWNLOAD_DELAY --> Make sure these have been commented out as you 26 | don't need them when using Scraper API. 27 | 28 | 29 | """ 30 | 31 | API_KEY = 'YOUR_API_KEY' 32 | 33 | client = ScraperAPIClient(API_KEY) 34 | 35 | class QuotesSpider(scrapy.Spider): 36 | name = "sdk_spider" 37 | 38 | def start_requests(self): 39 | urls = [ 40 | 'http://quotes.toscrape.com/page/1/', 41 | 'http://quotes.toscrape.com/page/2/', 42 | ] 43 | for url in urls: 44 | yield scrapy.Request(client.scrapyGet(url=url), callback=self.parse) 45 | 46 | def parse(self, response): 47 | """ 48 | Insert the parsing code for your use case here... 
49 | """ 50 | for quote in response.css('div.quote'): 51 | yield { 52 | 'text': quote.css('span.text::text').get(), 53 | 'author': quote.css('small.author::text').get(), 54 | 'tags': quote.css('div.tags a.tag::text').getall(), 55 | } -------------------------------------------------------------------------------- /Python/Python_Scrapy/scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html 5 | 6 | [settings] 7 | default = Python_Scrapy.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = Python_Scrapy 12 | -------------------------------------------------------------------------------- /Python/Python_Selenium/api_endpoint_example.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | from webdriver_manager.chrome import ChromeDriverManager 3 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities 4 | from bs4 import BeautifulSoup 5 | from urllib.parse import urlencode 6 | import json 7 | 8 | 9 | """ 10 | 11 | IMPORTANT: THIS IS NOT THE RECOMMENDED APPROACH, WE RECOMMEND YOU USE THE PROXY PORT 12 | 13 | ------- 14 | 15 | SCRAPER SETTINGS 16 | 17 | You need to define the following values below: 18 | 19 | - API_KEY --> Find this on your dashboard, or signup here to create a 20 | free account here https://dashboard.scraperapi.com/signup 21 | 22 | - RETRY_TIMES --> We recommend setting this to 2-3 retries, in case a request fails. 23 | For most sites 95% of your requests will be successful on the first try, 24 | and 99% after 3 retries. 25 | 26 | """ 27 | 28 | API_KEY = 'YOUR_API_KEY' 29 | NUM_RETRIES = 2 30 | 31 | ## we will store the scraped data in this list 32 | scraped_quotes = [] 33 | 34 | ## urls to scrape 35 | url_list = [ 36 | 'http://quotes.toscrape.com/page/1/', 37 | 'http://quotes.toscrape.com/page/2/', 38 | ] 39 | 40 | 41 | def get_scraperapi_url(url): 42 | """ 43 | Converts url into API request for Scraper API. 44 | """ 45 | payload = {'api_key': API_KEY, 'url': url} 46 | proxy_url = 'http://api.scraperapi.com/?' + urlencode(payload) 47 | return proxy_url 48 | 49 | 50 | def status_code_first_request(performance_log): 51 | """ 52 | Selenium makes it hard to get the status code of each request, 53 | so this function takes the Selenium performance logs as an input 54 | and returns the status code of the first response. 55 | """ 56 | for line in performance_log: 57 | try: 58 | json_log = json.loads(line['message']) 59 | if json_log['message']['method'] == 'Network.responseReceived': 60 | return json_log['message']['params']['response']['status'] 61 | except: 62 | pass 63 | return json.loads(response_recieved[0]['message'])['message']['params']['response']['status'] 64 | 65 | 66 | 67 | ## optional --> define Selenium options 68 | option = webdriver.ChromeOptions() 69 | option.add_argument('--headless') ## --> comment out to see the browser launch. 
 70 | option.add_argument('--no-sandbox')
 71 | option.add_argument('--disable-dev-shm-usage')
 72 | 
 73 | ## enable Selenium logging
 74 | caps = DesiredCapabilities.CHROME
 75 | caps['goog:loggingPrefs'] = {'performance': 'ALL'}
 76 | 
 77 | 
 78 | ## set up Selenium Chrome driver
 79 | driver = webdriver.Chrome(ChromeDriverManager().install(),
 80 |                           options=option,
 81 |                           desired_capabilities=caps)
 82 | 
 83 | for url in url_list:
 84 | 
 85 |     for _ in range(NUM_RETRIES):
 86 |         try:
 87 |             driver.get(get_scraperapi_url(url))
 88 |             performance_log = driver.get_log('performance')
 89 |             status_code = status_code_first_request(performance_log)
 90 |             if status_code in [200, 404]:
 91 |                 ## escape for loop if the API returns a successful response
 92 |                 break
 93 |         except requests.exceptions.ConnectionError:
 94 |             driver.close()
 95 | 
 96 | 
 97 |     if status_code == 200:
 98 |         ## feed HTML response into BeautifulSoup
 99 |         html_response = driver.page_source
100 |         soup = BeautifulSoup(html_response, "html.parser")
101 | 
102 |         ## find all quotes sections
103 |         quotes_sections = soup.find_all('div', class_="quote")
104 | 
105 |         ## loop through each quotes section and extract the quote and author
106 |         for quote_block in quotes_sections:
107 |             quote = quote_block.find('span', class_='text').text
108 |             author = quote_block.find('small', class_='author').text
109 | 
110 |             ## add scraped data to "scraped_quotes" list
111 |             scraped_quotes.append({
112 |                 'quote': quote,
113 |                 'author': author
114 |             })
115 | 
116 | 
117 | print(scraped_quotes)
118 | 
119 | 
--------------------------------------------------------------------------------
/Python/Python_Selenium/proxy_port_example.py:
--------------------------------------------------------------------------------
  1 | from seleniumwire import webdriver
  2 | from webdriver_manager.chrome import ChromeDriverManager
  3 | from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
  4 | from selenium.webdriver.common.by import By
  5 | from bs4 import BeautifulSoup
  6 | import json
  7 | import requests  ## needed for the ConnectionError handler in the retry loop below
  8 | """
  9 | SCRAPER SETTINGS
 10 | 
 11 | You need to define the following values below:
 12 | 
 13 | - API_KEY --> Find this on your dashboard, or sign up for a free
 14 |               account here: https://dashboard.scraperapi.com/signup
 15 | 
 16 | - NUM_RETRIES --> We recommend setting this to 2-3 retries, in case a request fails.
 17 |                   For most sites 95% of your requests will be successful on the first try,
 18 |                   and 99% after 3 retries.
 19 | 
 20 | """
 21 | 
 22 | API_KEY = 'YOUR_API_KEY'
 23 | NUM_RETRIES = 2
 24 | 
 25 | proxy_options = {
 26 |     'proxy': {
 27 |         'http': f'http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001',
 28 |         'https': f'http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001',
 29 |         'no_proxy': 'localhost,127.0.0.1'
 30 |     }
 31 | }
 32 | 
 33 | 
 34 | ## we will store the scraped data in this list
 35 | scraped_quotes = []
 36 | 
 37 | ## urls to scrape
 38 | url_list = [
 39 |     'http://quotes.toscrape.com/page/1/',
 40 |     'http://quotes.toscrape.com/page/2/',
 41 | ]
 42 | 
 43 | 
 44 | def status_code_first_request(performance_log):
 45 |     """
 46 |     Selenium makes it hard to get the status code of each request,
 47 |     so this function takes the Selenium performance logs as an input
 48 |     and returns the status code of the first response.
49 | """ 50 | for line in performance_log: 51 | try: 52 | json_log = json.loads(line['message']) 53 | if json_log['message']['method'] == 'Network.responseReceived': 54 | return json_log['message']['params']['response']['status'] 55 | except: 56 | pass 57 | return json.loads(response_recieved[0]['message'])['message']['params']['response']['status'] 58 | 59 | 60 | 61 | ## optional --> define Selenium options 62 | option = webdriver.ChromeOptions() 63 | option.add_argument('--headless') ## --> comment out to see the browser launch. 64 | option.add_argument('--no-sandbox') 65 | option.add_argument('--disable-dev-sh-usage') 66 | 67 | ## enable Selenium logging 68 | caps = DesiredCapabilities.CHROME 69 | caps['goog:loggingPrefs'] = {'performance': 'ALL'} 70 | 71 | 72 | ## set up Selenium Chrome driver 73 | driver = webdriver.Chrome(ChromeDriverManager().install(), 74 | options=option, 75 | desired_capabilities=caps, 76 | seleniumwire_options=proxy_options) 77 | 78 | for url in url_list: 79 | 80 | for _ in range(NUM_RETRIES): 81 | try: 82 | driver.get(url) 83 | performance_log = driver.get_log('performance') 84 | status_code = status_code_first_request(performance_log) 85 | if status_code in [200, 404]: 86 | ## escape for loop if the API returns a successful response 87 | break 88 | except requests.exceptions.ConnectionError: 89 | driver.close() 90 | 91 | 92 | if status_code == 200: 93 | ## feed HTML response into BeautifulSoup 94 | html_response = driver.page_source 95 | soup = BeautifulSoup(html_response, "html.parser") 96 | 97 | ## find all quotes sections 98 | quotes_sections = soup.find_all('div', class_="quote") 99 | 100 | ## loop through each quotes section and extract the quote and author 101 | for quote_block in quotes_sections: 102 | quote = quote_block.find('span', class_='text').text 103 | author = quote_block.find('small', class_='author').text 104 | 105 | ## add scraped data to "scraped_quotes" list 106 | scraped_quotes.append({ 107 | 'quote': quote, 108 | 'author': author 109 | }) 110 | 111 | ## example --> click on the link for the next page 112 | link = driver.find_element_by_link_text("Next →") 113 | link.click() 114 | 115 | print(scraped_quotes) 116 | 117 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # scraperapi-code-examples 2 | 3 | There are 3 ways in which you can send integrate your scrapers with Scraper API: 4 | 5 | 1. API endpoint: `http://api.scraperapi.com/?api_key=YOUR_API_KEY&url=http://httpbin.org/ip` 6 | 2. One of our SDK's, currently available for: [Python](https://pypi.org/project/scraperapi-sdk/), [NodeJS](https://www.npmjs.com/package/scraperapi-sdk), [PHP](https://packagist.org/packages/scraperapi/sdk), [Ruby](https://rubygems.org/gems/scraperapi), [Java](https://github.com/scraperapi/scraperapi-java-sdk). 7 | 3. Proxy Port: `http://scraperapi:YOUR_API_KEY@proxy-server.scraperapi.com:8001` 8 | 9 | All three options have the same functionality and performance, they just offer you the flexibility to integrate Scraper API in the way that is easiest for you. 10 | 11 | 12 | Code Examples 13 | ------ 14 | This repo contains basic code examples showing you how to integrate ScaperAPI using each of the three integration options: 15 | 16 | #### Python 17 | 1. [Python Requests and Beautifulsoup](https://github.com/scraperapi/scraperapi-code-examples/tree/main/Python/Python_Requests_Beautifulsoup) 18 | 2. 
[Python Scrapy](https://github.com/scraperapi/scraperapi-code-examples/tree/main/Python/Python_Scrapy) 19 | 3. [Python Selenium](https://github.com/scraperapi/scraperapi-code-examples/tree/main/Python/Python_Selenium) 20 | 21 | #### NodeJS 22 | 1. [NodeJS and Cheerio](https://github.com/scraperapi/scraperapi-code-examples/tree/main/NodeJS/NodeJS) 23 | 2. [NodeJS Puppeteer](https://github.com/scraperapi/scraperapi-code-examples/tree/main/NodeJS/NodeJS_Puppeteer) 24 | 25 | The full Scraper API documentation can be found [here](https://www.scraperapi.com/documentation). 26 | --------------------------------------------------------------------------------
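#### Quick integration sketch (illustrative)

The snippet below is not one of the repository's example files; it is a minimal sketch showing the same request sent through each of the three integration options listed in the README above. It assumes only the `requests` package is installed (plus, for option 2, the Python `scraperapi-sdk`) and uses a placeholder API key.

```python
import requests

API_KEY = "YOUR_API_KEY"            # placeholder, replace with your own key
target_url = "http://httpbin.org/ip"

# 1. API endpoint: pass the key and the target url as query parameters.
response = requests.get(
    "http://api.scraperapi.com/",
    params={"api_key": API_KEY, "url": target_url},
)
print(response.status_code, response.text)

# 2. Python SDK: the scraperapi-sdk package wraps the same endpoint.
# from scraper_api import ScraperAPIClient
# client = ScraperAPIClient(API_KEY)
# print(client.get(target_url).text)

# 3. Proxy port: send an ordinary request through the ScraperAPI proxy.
proxies = {
    "http": f"http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001",
    "https": f"http://scraperapi:{API_KEY}@proxy-server.scraperapi.com:8001",
}
response = requests.get(target_url, proxies=proxies, verify=False)
print(response.status_code, response.text)
```

Each option returns the target page's response body, which you can then parse with BeautifulSoup, Scrapy selectors, or Cheerio exactly as the example scripts in this repository do. The SDK call is left commented out so the sketch runs with `requests` alone.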